/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/machparam.h>
#include <sys/x86_archext.h>
#include <sys/systm.h>
#include <sys/mach_mmu.h>

#include <sys/multiboot.h>

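/*
 * These symbols are provided by other parts of dboot: mb_header is the
 * multiboot header and the two routines are small cpuid helpers.
 */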
extern multiboot_header_t mb_header;
extern int have_cpuid(void);
extern uint32_t get_cpuid_edx(uint32_t *eax);

#include <sys/inttypes.h>
#include <sys/bootinfo.h>
#include <sys/boot_console.h>

#include "dboot_printf.h"
#include "dboot_xboot.h"
#include "dboot_elfload.h"

/*
 * This file contains code that runs to transition us from either a multiboot
 * compliant loader (32 bit non-paging) or Xen domain loader to regular kernel
 * execution. Its task is to set up the kernel memory image and page tables.
 *
 * The code executes as:
 *	- 32 bits under GRUB (for 32 or 64 bit Solaris)
 *	- 32 bit program for Xen 32 bit
 *	- 64 bit program for Xen 64 bit (at least that's my assumption for now)
 *
 * Under Xen, we must create mappings for any memory beyond the initial
 * start of day allocation (such as the kernel itself).
 *
 * When not under Xen, the mapping between maddr_t and paddr_t is 1:1.
 * Since paging is not yet enabled, all such memory is directly accessible.
 */

/*
 * Standard bits used in PTE (page level) and PTP (internal levels)
 */
x86pte_t ptp_bits = PT_VALID | PT_REF | PT_USER | PT_WRITABLE;
x86pte_t pte_bits = PT_VALID | PT_REF | PT_MOD | PT_NOCONSIST | PT_WRITABLE;

/*
 * This is the target address (physical) where the kernel text and data
 * nucleus pages will be unpacked. On Xen this is actually a virtual address.
 */
paddr_t ktext_phys;
uint32_t ksize = 2 * FOUR_MEG;	/* kernel nucleus is 8Meg */

static uint64_t target_kernel_text;	/* value to use for KERNEL_TEXT */

/*
 * The stack is set up in assembler before entering startup_kernel()
 */
char stack_space[STACK_SIZE];

/*
 * Used to track physical memory allocation
 */
static paddr_t next_avail_addr = 0;

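/*
 * Multiboot information structure passed in from the boot loader.
 */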
multiboot_info_t *mb_info;

/*
 * This contains information passed to the kernel
 */
struct xboot_info boot_info[2];	/* extra space to fix alignment for amd64 */
struct xboot_info *bi;

/*
 * Page table and memory stuff.
 */
static uint64_t max_mem;			/* maximum memory address */

/*
 * Information about processor MMU
 */
int amd64_support = 0;
int largepage_support = 0;
int pae_support = 0;
int pge_support = 0;
int NX_support = 0;

/*
 * Low 32 bits of kernel entry address passed back to assembler.
 * When running a 64 bit kernel, the high 32 bits are 0xffffffff.
 */
uint32_t entry_addr_low;

/*
 * Memlists for the kernel. We shouldn't need a lot of these.
 */
#define	MAX_MEMLIST (50)
struct boot_memlist memlists[MAX_MEMLIST];
uint_t memlists_used = 0;
struct boot_memlist pcimemlists[MAX_MEMLIST];
uint_t pcimemlists_used = 0;

#define	MAX_MODULES (10)
struct boot_modules modules[MAX_MODULES];
uint_t modules_used = 0;

/*
 * Debugging macros
 */
uint_t prom_debug = 0;
uint_t map_debug = 0;

/*
 * The Xen/Grub specific code builds the initial memlists. This code does
 * sort/merge/link for final use.
 */
static void
sort_physinstall(void)
{
	int i;
	int j;
	struct boot_memlist tmp;

	/*
	 * Now sort the memlists, in case they weren't in order.
	 * Yeah, this is a bubble sort; small, simple and easy to get right.
	 */
	DBG_MSG("Sorting phys-installed list\n");
	for (j = memlists_used - 1; j > 0; --j) {
		for (i = 0; i < j; ++i) {
			if (memlists[i].addr < memlists[i + 1].addr)
				continue;
			tmp = memlists[i];
			memlists[i] = memlists[i + 1];
			memlists[i + 1] = tmp;
		}
	}

	/*
	 * Merge any memlists that don't have holes between them.
	 */
	for (i = 0; i < memlists_used - 1; ++i) {
		if (memlists[i].addr + memlists[i].size != memlists[i + 1].addr)
			continue;

		if (prom_debug)
			dboot_printf(
			    "merging mem segs %" PRIx64 "...%" PRIx64
			    " w/ %" PRIx64 "...%" PRIx64 "\n",
			    memlists[i].addr,
			    memlists[i].addr + memlists[i].size,
			    memlists[i + 1].addr,
			    memlists[i + 1].addr + memlists[i + 1].size);

		memlists[i].size += memlists[i + 1].size;
		for (j = i + 1; j < memlists_used - 1; ++j)
			memlists[j] = memlists[j + 1];
		--memlists_used;
		DBG(memlists_used);
		--i;	/* after merging we need to reexamine, so do this */
	}

	if (prom_debug) {
		dboot_printf("\nFinal memlists:\n");
		for (i = 0; i < memlists_used; ++i) {
			dboot_printf("\t%d: addr=%" PRIx64 " size=%"
			    PRIx64 "\n", i, memlists[i].addr, memlists[i].size);
		}
	}

	/*
	 * link together the memlists with native size pointers
	 */
	memlists[0].next = 0;
	memlists[0].prev = 0;
	for (i = 1; i < memlists_used; ++i) {
		memlists[i].prev = (native_ptr_t)(uintptr_t)(memlists + i - 1);
		memlists[i].next = 0;
		memlists[i - 1].next = (native_ptr_t)(uintptr_t)(memlists + i);
	}
	bi->bi_phys_install = (native_ptr_t)memlists;
	DBG(bi->bi_phys_install);
}

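/*
 * Return the page table entry at the given index, reading it in the
 * width (32 or 64 bit) that matches the page table format in use.
 */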
x86pte_t
get_pteval(paddr_t table, uint_t index)
{
	if (pae_support)
		return (((x86pte_t *)(uintptr_t)table)[index]);
	return (((x86pte32_t *)(uintptr_t)table)[index]);
}

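/*
 * Install a page table entry. For a 32 bit PAE kernel the top level
 * (the 4 entry PDPT) is cached by the processor, so %cr3 is reloaded
 * whenever one of its entries changes.
 */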
/*ARGSUSED*/
void
set_pteval(paddr_t table, uint_t index, uint_t level, x86pte_t pteval)
{
	uintptr_t tab_addr = (uintptr_t)table;

	if (pae_support)
		((x86pte_t *)tab_addr)[index] = pteval;
	else
		((x86pte32_t *)tab_addr)[index] = (x86pte32_t)pteval;
	if (level == top_level && level == 2)
		reload_cr3();
}

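/*
 * Allocate a new (zeroed) page table page and fill in the PTP entry that
 * will point at it. For the 32 bit PAE top level only PT_VALID is used,
 * since PDPT entries reserve the other permission bits.
 */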
paddr_t
make_ptable(x86pte_t *pteval, uint_t level)
{
	paddr_t new_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);

	if (level == top_level && level == 2)
		*pteval = pa_to_ma((uintptr_t)new_table) | PT_VALID;
	else
		*pteval = pa_to_ma((uintptr_t)new_table) | ptp_bits;

	if (map_debug)
		dboot_printf("new page table lvl=%d paddr=0x%lx ptp=0x%"
		    PRIx64 "\n", level, (ulong_t)new_table, *pteval);
	return (new_table);
}

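/*
 * Return a pointer to the page table entry at the given index. Paging is
 * not yet enabled here, so the table's physical address can be dereferenced
 * directly.
 */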
x86pte_t *
map_pte(paddr_t table, uint_t index)
{
	return ((x86pte_t *)(uintptr_t)(table + index * pte_size));
}

#if 0	/* useful if debugging */
/*
 * dump out the contents of page tables...
 */
static void
dump_tables(void)
{
	uint_t save_index[4];	/* for recursion */
	char *save_table[4];	/* for recursion */
	uint_t	l;
	uint64_t va;
	uint64_t pgsize;
	int index;
	int i;
	x86pte_t pteval;
	char *table;
	static char *tablist = "\t\t\t";
	char *tabs = tablist + 3 - top_level;
	paddr_t pa, pa1;

	dboot_printf("Finished pagetables:\n");
	table = (char *)top_page_table;
	l = top_level;
	va = 0;
	for (index = 0; index < ptes_per_table; ++index) {
		pgsize = 1ull << shift_amt[l];
		if (pae_support)
			pteval = ((x86pte_t *)table)[index];
		else
			pteval = ((x86pte32_t *)table)[index];
		if (pteval == 0)
			goto next_entry;

		dboot_printf("%s %lx[0x%x] = %" PRIx64 ", va=%" PRIx64,
		    tabs + l, table, index, (uint64_t)pteval, va);
		pa = ma_to_pa(pteval & MMU_PAGEMASK);
		dboot_printf(" physaddr=%" PRIx64 "\n", pa);

		/*
		 * Don't try to walk hypervisor private pagetables
		 */
		if ((l > 1 || (l == 1 && (pteval & PT_PAGESIZE) == 0))) {
			save_table[l] = table;
			save_index[l] = index;
			--l;
			index = -1;
			table = (char *)(uintptr_t)
			    ma_to_pa(pteval & MMU_PAGEMASK);
			goto recursion;
		}

		/*
		 * shorten dump for consecutive mappings
		 */
		for (i = 1; index + i < ptes_per_table; ++i) {
			if (pae_support)
				pteval = ((x86pte_t *)table)[index + i];
			else
				pteval = ((x86pte32_t *)table)[index + i];
			if (pteval == 0)
				break;
			pa1 = ma_to_pa(pteval & MMU_PAGEMASK);
			if (pa1 != pa + i * pgsize)
				break;
		}
		if (i > 2) {
			dboot_printf("%s...\n", tabs + l);
			va += pgsize * (i - 2);
			index += i - 2;
		}
next_entry:
		va += pgsize;
		if (l == 3 && index == 256)	/* VA hole */
			va = 0xffff800000000000ull;
recursion:
		;
	}
	if (l < top_level) {
		++l;
		index = save_index[l];
		table = save_table[l];
		goto recursion;
	}
}
#endif

/*
 * Add a mapping for the physical page at the given virtual address.
 */
static void
map_pa_at_va(paddr_t pa, native_ptr_t va, uint_t level)
{
	x86pte_t *ptep;
	x86pte_t pteval;

	pteval = pa_to_ma(pa) | pte_bits;
	if (level > 0)
		pteval |= PT_PAGESIZE;
	if (va >= target_kernel_text && pge_support)
		pteval |= PT_GLOBAL;

	if (map_debug && pa != va)
		dboot_printf("mapping pa=0x%" PRIx64 " va=0x%" PRIx64
		    " pte=0x%" PRIx64 " l=%d\n",
		    (uint64_t)pa, (uint64_t)va, pteval, level);

	/*
	 * Find the pte that will map this address. This creates any
	 * missing intermediate level page tables
	 */
	ptep = find_pte(va, NULL, level, 0);

	/*
	 * On Xen we must use hypervisor calls to modify the PTE, since
	 * paging is active. On real hardware we just write to the pagetables
	 * which aren't in use yet.
	 */
	if (va < 1024 * 1024)
		pteval |= PT_NOCACHE;		/* for video RAM */
	if (pae_support)
		*ptep = pteval;
	else
		*((x86pte32_t *)ptep) = (x86pte32_t)pteval;
}

/*
 * During memory allocation, find the highest address not used yet.
 */
static void
check_higher(paddr_t a)
{
	if (a < next_avail_addr)
		return;
	next_avail_addr = RNDUP(a + 1, MMU_PAGESIZE);
	DBG(next_avail_addr);
}

/*
 * This is called to remove start..end from the
 * possible range of PCI addresses.
 */
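/* bounds of the physical address range initially assumed usable for PCI */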
const uint64_t pci_lo_limit = 0x00100000ul;
const uint64_t pci_hi_limit = 0xfff00000ul;
static void
exclude_from_pci(uint64_t start, uint64_t end)
{
	int i;
	int j;
	struct boot_memlist *ml;

	for (i = 0; i < pcimemlists_used; ++i) {
		ml = &pcimemlists[i];

		/* delete the entire range? */
		if (start <= ml->addr && ml->addr + ml->size <= end) {
			--pcimemlists_used;
			for (j = i; j < pcimemlists_used; ++j)
				pcimemlists[j] = pcimemlists[j + 1];
			--i;	/* to revisit the new one at this index */
		}

		/* split a range? */
		else if (ml->addr < start && end < ml->addr + ml->size) {

			++pcimemlists_used;
			if (pcimemlists_used > MAX_MEMLIST)
				dboot_panic("too many pcimemlists");

			for (j = pcimemlists_used - 1; j > i; --j)
				pcimemlists[j] = pcimemlists[j - 1];
			ml->size = start - ml->addr;

			++ml;
			ml->size = (ml->addr + ml->size) - end;
			ml->addr = end;
			++i;	/* skip on to next one */
		}

		/* cut memory off the start? */
		else if (ml->addr < end && end < ml->addr + ml->size) {
			ml->size -= end - ml->addr;
			ml->addr = end;
		}

		/* cut memory off the end? */
		else if (ml->addr <= start && start < ml->addr + ml->size) {
			ml->size = start - ml->addr;
		}
	}
}

/*
 * Walk through the module information finding the last used address.
 * The first available address will become the top level page table.
 *
 * We then build the phys_install memlist from the multiboot information.
 */
static void
init_mem_alloc(void)
{
	mb_memory_map_t *mmap;
	mb_module_t *mod;
	uint64_t start;
	uint64_t end;
	uint64_t page_offset = MMU_PAGEOFFSET;	/* needs to be 64 bits */
	extern char _end[];
	int i;

	DBG_MSG("Entered init_mem_alloc()\n");
	DBG((uintptr_t)mb_info);

	/*
	 * search the modules to find the last used address
	 * we'll build the module list while we're walking through here
	 */
	DBG_MSG("\nFinding Modules\n");
	check_higher((paddr_t)&_end);
	for (mod = (mb_module_t *)(mb_info->mods_addr), i = 0;
	    i < mb_info->mods_count;
	    ++mod, ++i) {
		if (prom_debug) {
			dboot_printf("\tmodule #%d: %s at: 0x%lx, len 0x%lx\n",
			    i, (char *)(mod->mod_name),
			    (ulong_t)mod->mod_start, (ulong_t)mod->mod_end);
		}
		modules[i].bm_addr = mod->mod_start;
		modules[i].bm_size = mod->mod_end;

		check_higher(mod->mod_end);
	}
	bi->bi_modules = (native_ptr_t)modules;
	DBG(bi->bi_modules);
	bi->bi_module_cnt = mb_info->mods_count;
	DBG(bi->bi_module_cnt);

	/*
	 * start out by assuming PCI can use all physical addresses
	 */
	pcimemlists[0].addr = pci_lo_limit;
	pcimemlists[0].size = pci_hi_limit - pci_lo_limit;
	pcimemlists_used = 1;

	/*
	 * Walk through the memory map from multiboot and build our memlist
	 * structures. Note these will have native format pointers.
	 */
	DBG_MSG("\nFinding Memory Map\n");
	DBG(mb_info->flags);
	max_mem = 0;
	if (mb_info->flags & 0x40) {
		DBG(mb_info->mmap_addr);
		DBG(mb_info->mmap_length);
		check_higher(mb_info->mmap_addr + mb_info->mmap_length);

		for (mmap = (mb_memory_map_t *)mb_info->mmap_addr;
		    (uint32_t)mmap < mb_info->mmap_addr + mb_info->mmap_length;
		    mmap = (mb_memory_map_t *)((uint32_t)mmap + mmap->size
		    + sizeof (mmap->size))) {

			start = ((uint64_t)mmap->base_addr_high << 32) +
			    mmap->base_addr_low;
			end = start + ((uint64_t)mmap->length_high << 32) +
			    mmap->length_low;

			if (prom_debug)
				dboot_printf("\ttype: %d %" PRIx64 "..%"
				    PRIx64 "\n", mmap->type, start, end);

			/*
			 * page align start and end
			 */
			start = (start + page_offset) & ~page_offset;
			end &= ~page_offset;
			if (end <= start)
				continue;

			exclude_from_pci(start, end);

			/*
			 * only type 1 is usable RAM
			 */
			if (mmap->type != 1)
				continue;

			if (end > max_mem)
				max_mem = end;

			memlists[memlists_used].addr = start;
			memlists[memlists_used].size = end - start;
			++memlists_used;
			if (memlists_used > MAX_MEMLIST)
				dboot_panic("too many memlists");
		}
	} else if (mb_info->flags & 0x01) {
		DBG(mb_info->mem_lower);
		memlists[memlists_used].addr = 0;
		memlists[memlists_used].size = mb_info->mem_lower * 1024;
		++memlists_used;
		DBG(mb_info->mem_upper);
		memlists[memlists_used].addr = 1024 * 1024;
		memlists[memlists_used].size = mb_info->mem_upper * 1024;
		++memlists_used;
		exclude_from_pci(memlists[0].addr,
		    memlists[0].addr + memlists[0].size);
		exclude_from_pci(memlists[1].addr,
		    memlists[1].addr + memlists[1].size);
	} else {
		dboot_panic("No memory info from boot loader!!!\n");
	}

	check_higher(bi->bi_cmdline);

	/*
	 * finish processing the physinstall list
	 */
	sort_physinstall();

	/*
	 * Finish off the pcimemlist
	 */
	if (prom_debug) {
		for (i = 0; i < pcimemlists_used; ++i) {
			dboot_printf("pcimemlist entry 0x%" PRIx64 "..0x%"
			    PRIx64 "\n", pcimemlists[i].addr,
			    pcimemlists[i].addr + pcimemlists[i].size);
		}
	}
	pcimemlists[0].next = 0;
	pcimemlists[0].prev = 0;
	for (i = 1; i < pcimemlists_used; ++i) {
		pcimemlists[i].prev =
		    (native_ptr_t)(uintptr_t)(pcimemlists + i - 1);
		pcimemlists[i].next = 0;
		pcimemlists[i - 1].next =
		    (native_ptr_t)(uintptr_t)(pcimemlists + i);
	}
	bi->bi_pcimem = (native_ptr_t)pcimemlists;
	DBG(bi->bi_pcimem);
}

/*
 * Simple memory allocator, allocates aligned physical memory.
 * Note that startup_kernel() only allocates memory, never frees.
 * Memory usage just grows in an upward direction.
 */
static void *
do_mem_alloc(uint32_t size, uint32_t align)
{
	uint_t i;
	uint64_t best;
	uint64_t start;
	uint64_t end;

	/*
	 * make sure size is a multiple of pagesize
	 */
	size = RNDUP(size, MMU_PAGESIZE);
	next_avail_addr = RNDUP(next_avail_addr, align);

	/*
	 * a really large bootarchive that causes you to run out of memory
	 * may cause this to blow up
	 */
	/* LINTED E_UNEXPECTED_UINT_PROMOTION */
	best = (uint64_t)-size;
	for (i = 0; i < memlists_used; ++i) {
		start = memlists[i].addr;
		end = start + memlists[i].size;

		/*
		 * did we find the desired address?
		 */
		if (start <= next_avail_addr && next_avail_addr + size <= end) {
			best = next_avail_addr;
			goto done;
		}

		/*
		 * if not is this address the best so far?
		 */
		if (start > next_avail_addr && start < best &&
		    RNDUP(start, align) + size <= end)
			best = RNDUP(start, align);
	}

	/*
	 * We didn't find exactly the address we wanted, due to going off the
	 * end of a memory region. Return the best found memory address.
	 */
done:
	next_avail_addr = best + size;
	(void) memset((void *)(uintptr_t)best, 0, size);
	return ((void *)(uintptr_t)best);
}

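/*
 * Allocate memory aligned on a page boundary.
 */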
void *
mem_alloc(uint32_t size)
{
	return (do_mem_alloc(size, MMU_PAGESIZE));
}


/*
 * Build page tables to map all of memory used so far as well as the kernel.
 */
static void
build_page_tables(void)
{
	uint32_t psize;
	uint32_t level;
	uint32_t off;
	uint32_t i;
	uint64_t start;
	uint64_t end;
	uint64_t next_mapping;

	/*
	 * If we're not using Xen, we need to create the top level pagetable.
	 */
	top_page_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
	DBG((uintptr_t)top_page_table);

	/*
	 * Determine if we'll use large mappings for kernel, then map it.
	 */
	if (largepage_support) {
		psize = lpagesize;
		level = 1;
	} else {
		psize = MMU_PAGESIZE;
		level = 0;
	}

	DBG_MSG("Mapping kernel\n");
	DBG(ktext_phys);
	DBG(target_kernel_text);
	DBG(ksize);
	DBG(psize);
	for (off = 0; off < ksize; off += psize)
		map_pa_at_va(ktext_phys + off, target_kernel_text + off, level);

	/*
	 * The kernel will need a 1 page window to work with page tables
	 */
	bi->bi_pt_window = (uintptr_t)mem_alloc(MMU_PAGESIZE);
	DBG(bi->bi_pt_window);
	bi->bi_pte_to_pt_window =
	    (uintptr_t)find_pte(bi->bi_pt_window, NULL, 0, 0);
	DBG(bi->bi_pte_to_pt_window);

	/*
	 * Under multiboot we need 1:1 mappings for all of low memory, which
	 * includes our pagetables. The following code works because our
	 * simple memory allocator only grows usage in an upwards direction.
	 *
	 * We map *all* possible addresses below 1 Meg, since things like
	 * the video RAM are down there.
	 *
	 * Skip memory between 1M and _start; this acts as a reserve
	 * of memory usable for DMA.
	 */
	next_mapping = (uintptr_t)_start & MMU_PAGEMASK;
	if (map_debug)
		dboot_printf("1:1 map pa=0..1Meg\n");
	for (start = 0; start < 1024 * 1024; start += MMU_PAGESIZE)
		map_pa_at_va(start, start, 0);

	for (i = 0; i < memlists_used; ++i) {
		start = memlists[i].addr;
		if (start < next_mapping)
			start = next_mapping;

		end = start + memlists[i].size;

		if (map_debug)
			dboot_printf("1:1 map pa=%" PRIx64 "..%" PRIx64 "\n",
			    start, end);
		while (start < end && start < next_avail_addr) {
			map_pa_at_va(start, start, 0);
			start += MMU_PAGESIZE;
		}
	}

	DBG_MSG("\nPage tables constructed\n");
}

#define	NO_MULTIBOOT	\
"multiboot is no longer used to boot the Solaris Operating System.\n\
The grub entry should be changed to:\n\
kernel$ /platform/i86pc/kernel/$ISADIR/unix\n\
module$ /platform/i86pc/$ISADIR/boot_archive\n\
See http://www.sun.com/msg/SUNOS-8000-AK for details.\n"

/*
 * startup_kernel has a pretty simple job. It builds pagetables which reflect
 * 1:1 mappings for all memory in use. It then also adds mappings for
 * the kernel nucleus at the virtual address target_kernel_text, using large
 * page mappings. The page table pages are also accessible at 1:1 mapped
 * virtual addresses.
 */
/*ARGSUSED*/
void
startup_kernel(void)
{
	char *cmdline;
	uintptr_t addr;

	/*
	 * At this point we are executing in 32 bit protected mode with
	 * paging disabled.
	 */
	cmdline = (char *)mb_info->cmdline;
	prom_debug = (strstr(cmdline, "prom_debug") != NULL);
	map_debug = (strstr(cmdline, "map_debug") != NULL);
	bcons_init(cmdline);
	DBG_MSG("\n\nSolaris prekernel set: ");
	DBG_MSG(cmdline);
	DBG_MSG("\n");

	if (strstr(cmdline, "multiboot") != NULL) {
		dboot_panic(NO_MULTIBOOT);
	}

	/*
	 * boot info must be 16 byte aligned for 64 bit kernel ABI
	 */
	addr = (uintptr_t)boot_info;
	addr = (addr + 0xf) & ~0xf;
	bi = (struct xboot_info *)addr;
	DBG((uintptr_t)bi);
	bi->bi_cmdline = (native_ptr_t)(uintptr_t)cmdline;

	/*
	 * Need correct target_kernel_text value
	 */
#if defined(_BOOT_TARGET_amd64)
	target_kernel_text = KERNEL_TEXT_amd64;
#else
	target_kernel_text = KERNEL_TEXT_i386;
#endif
	DBG(target_kernel_text);

	/*
	 * use cpuid to determine which MMU features to enable
	 */
	if (have_cpuid()) {
		uint32_t eax, edx;

		eax = 1;
		edx = get_cpuid_edx(&eax);
		if (edx & CPUID_INTC_EDX_PSE)
			largepage_support = 1;
		if (edx & CPUID_INTC_EDX_PGE)
			pge_support = 1;
		if (edx & CPUID_INTC_EDX_PAE)
			pae_support = 1;

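		/*
		 * Check the extended cpuid leaves: 0x80000000 returns the
		 * highest supported extended function in %eax; leaf
		 * 0x80000001 %edx carries the long mode (LM) and no-execute
		 * (NX) feature bits.
		 */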
		eax = 0x80000000;
		edx = get_cpuid_edx(&eax);
		if (eax >= 0x80000001) {
			eax = 0x80000001;
			edx = get_cpuid_edx(&eax);
			if (edx & CPUID_AMD_EDX_LM)
				amd64_support = 1;
			if (edx & CPUID_AMD_EDX_NX)
				NX_support = 1;
		}
	} else {
		dboot_printf("cpuid not supported\n");
	}

#if defined(_BOOT_TARGET_amd64)
	if (amd64_support == 0)
		dboot_panic("long mode not supported, rebooting\n");
	else if (pae_support == 0)
		dboot_panic("long mode, but no PAE; rebooting\n");
#endif

	/*
	 * initialize our memory allocator
	 */
	init_mem_alloc();

	/*
	 * configure mmu information
	 */
#if !defined(_BOOT_TARGET_amd64)
	if (pae_support && (max_mem > FOUR_GIG || NX_support)) {
#endif
		shift_amt = shift_amt_pae;
		ptes_per_table = 512;
		pte_size = 8;
		lpagesize = TWO_MEG;
#if defined(_BOOT_TARGET_amd64)
		top_level = 3;
#else
		top_level = 2;
#endif
#if !defined(_BOOT_TARGET_amd64)
	} else {
		pae_support = 0;
		NX_support = 0;
		shift_amt = shift_amt_nopae;
		ptes_per_table = 1024;
		pte_size = 4;
		lpagesize = FOUR_MEG;
		top_level = 1;
	}
#endif

	DBG(pge_support);
	DBG(NX_support);
	DBG(largepage_support);
	DBG(amd64_support);
	DBG(top_level);
	DBG(pte_size);
	DBG(ptes_per_table);
	DBG(lpagesize);

	ktext_phys = FOUR_MEG;		/* from UNIX Mapfile */

#if defined(_BOOT_TARGET_amd64)
	/*
	 * For grub, copy kernel bits from the ELF64 file to final place.
	 */
	DBG_MSG("\nAllocating nucleus pages.\n");
	ktext_phys = (uintptr_t)do_mem_alloc(ksize, FOUR_MEG);
	if (ktext_phys == 0)
		dboot_panic("failed to allocate aligned kernel memory\n");
	if (dboot_elfload64(mb_header.load_addr) != 0)
		dboot_panic("failed to parse kernel ELF image, rebooting\n");

#endif
	DBG(ktext_phys);

	/*
	 * Allocate page tables.
	 */
	build_page_tables();

	/*
	 * return to assembly code to switch to running kernel
	 */
	entry_addr_low = (uint32_t)target_kernel_text;
	DBG(entry_addr_low);
	bi->bi_use_largepage = largepage_support;
	bi->bi_use_pae = pae_support;
	bi->bi_use_pge = pge_support;
	bi->bi_use_nx = NX_support;
	bi->bi_next_paddr = next_avail_addr;
	DBG(bi->bi_next_paddr);
	bi->bi_next_vaddr = (uintptr_t)next_avail_addr;
	DBG(bi->bi_next_vaddr);
	bi->bi_mb_info = (uintptr_t)mb_info;
	bi->bi_top_page_table = (uintptr_t)top_page_table;

	bi->bi_kseg_size = FOUR_MEG;
	DBG(bi->bi_kseg_size);

#if 0		/* useful if debugging initial page tables */
	if (prom_debug)
		dump_tables();
#endif

	DBG_MSG("\n\n*** DBOOT DONE -- back to asm to jump to kernel\n\n");
}