1 /*- 2 * Copyright (c) 1998 Michael Smith <msmith@freebsd.org> 3 * Copyright (c) 2014 The FreeBSD Foundation 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 26 */ 27 28 #include <sys/cdefs.h> 29 #define __ELF_WORD_SIZE 64 30 #include <sys/param.h> 31 #include <sys/exec.h> 32 #include <sys/linker.h> 33 #include <string.h> 34 #include <machine/elf.h> 35 #include <stand.h> 36 #include <vm/vm.h> 37 #include <vm/pmap.h> 38 39 #ifdef EFI 40 #include <efi.h> 41 #include <efilib.h> 42 #else 43 #include "host_syscall.h" 44 #endif 45 46 #include "bootstrap.h" 47 #include "kboot.h" 48 49 #include "platform/acfreebsd.h" 50 #include "acconfig.h" 51 #define ACPI_SYSTEM_XFACE 52 #include "actypes.h" 53 #include "actbl.h" 54 55 #ifdef EFI 56 #include "loader_efi.h" 57 58 static EFI_GUID acpi_guid = ACPI_TABLE_GUID; 59 static EFI_GUID acpi20_guid = ACPI_20_TABLE_GUID; 60 #endif 61 62 #ifdef EFI 63 #define LOADER_PAGE_SIZE EFI_PAGE_SIZE 64 #else 65 #define LOADER_PAGE_SIZE PAGE_SIZE 66 #endif 67 68 extern int bi_load(char *args, vm_offset_t *modulep, vm_offset_t *kernendp, 69 bool exit_bs); 70 71 static int elf64_exec(struct preloaded_file *amp); 72 static int elf64_obj_exec(struct preloaded_file *amp); 73 74 static struct file_format amd64_elf = { 75 .l_load = elf64_loadfile, 76 .l_exec = elf64_exec, 77 }; 78 static struct file_format amd64_elf_obj = { 79 .l_load = elf64_obj_loadfile, 80 .l_exec = elf64_obj_exec, 81 }; 82 83 #ifdef EFI 84 extern struct file_format multiboot2; 85 extern struct file_format multiboot2_obj; 86 #endif 87 88 struct file_format *file_formats[] = { 89 #ifdef EFI 90 &multiboot2, 91 &multiboot2_obj, 92 #endif 93 &amd64_elf, 94 &amd64_elf_obj, 95 NULL 96 }; 97 98 #ifndef EFI 99 /* 100 * We create the stack that we want. We have the address of the page tables 101 * we make on top (so we pop that off and set %cr3). We have the entry point 102 * to the kernel (which retq pops off) This leaves the stack that the btext 103 * wants: offset 4 is modulep and offset8 is kernend, with the filler bytes 104 * to keep this aligned. This makes the trampoline very simple. 105 */ 106 struct trampoline_data { 107 uint64_t pt4; // Page table address to pop 108 uint64_t entry; // return address to jump to kernel 109 uint32_t fill1; // 0 110 uint32_t modulep; // 4 module metadata 111 uint32_t kernend; // 8 kernel end 112 uint32_t fill2; // 12 113 }; 114 _Static_assert(sizeof(struct trampoline_data) == 32, "Bad size for trampoline data"); 115 #endif 116 117 static pml4_entry_t *PT4; 118 static pdp_entry_t *PT3_l, *PT3_u; 119 static pd_entry_t *PT2_l0, *PT2_l1, *PT2_l2, *PT2_l3, *PT2_u0, *PT2_u1; 120 121 #ifdef EFI 122 static pdp_entry_t *PT3; 123 static pd_entry_t *PT2; 124 125 extern EFI_PHYSICAL_ADDRESS staging; 126 127 static void (*trampoline)(uint64_t stack, void *copy_finish, uint64_t kernend, 128 uint64_t modulep, pml4_entry_t *pagetable, uint64_t entry); 129 #endif 130 131 extern uintptr_t tramp; 132 extern uint32_t tramp_size; 133 #ifndef EFI 134 extern uint32_t tramp_data_offset; 135 #endif 136 137 /* 138 * There is an ELF kernel and one or more ELF modules loaded. 139 * We wish to start executing the kernel image, so make such 140 * preparations as are required, and do so. 141 */ 142 static int 143 elf64_exec(struct preloaded_file *fp) 144 { 145 struct file_metadata *md; 146 Elf_Ehdr *ehdr; 147 vm_offset_t modulep, kernend; 148 int err, i; 149 char buf[24]; 150 #ifdef EFI 151 ACPI_TABLE_RSDP *rsdp = NULL; 152 int revision; 153 int copy_auto; 154 vm_offset_t trampstack, trampcode; 155 #else 156 vm_offset_t rsdp = 0; 157 void *trampcode; 158 int nseg; 159 void *kseg; 160 vm_offset_t trampolinebase; 161 uint64_t *trampoline; 162 struct trampoline_data *trampoline_data; 163 vm_offset_t staging; 164 int error; 165 #endif 166 167 #ifdef EFI 168 copy_auto = copy_staging == COPY_STAGING_AUTO; 169 if (copy_auto) 170 copy_staging = fp->f_kernphys_relocatable ? 171 COPY_STAGING_DISABLE : COPY_STAGING_ENABLE; 172 #else 173 /* 174 * Figure out where to put it. 175 * 176 * Linux does not allow to do kexec_load into any part of memory. Ask 177 * arch_loadaddr to resolve the first available chunk of physical memory 178 * where loading is possible (load_addr). 179 * 180 * The kernel is loaded at the 'base' address in continguous physical 181 * pages (using 2MB super pages). The first such page is unused by the 182 * kernel and serves as a good place to put not only the trampoline, but 183 * the page table pages that the trampoline needs to setup the proper 184 * kernel starting environment. 185 */ 186 staging = trampolinebase = kboot_get_phys_load_segment(); 187 trampolinebase += 1ULL << 20; /* Copy trampoline to base + 1MB, kernel will wind up at 2MB */ 188 printf("Load address at %#jx\n", (uintmax_t)trampolinebase); 189 printf("Relocation offset is %#jx\n", (uintmax_t)elf64_relocation_offset); 190 #endif 191 192 /* 193 * Report the RSDP to the kernel. While this can be found with 194 * a BIOS boot, the RSDP may be elsewhere when booted from UEFI. 195 */ 196 #ifdef EFI 197 rsdp = efi_get_table(&acpi20_guid); 198 if (rsdp == NULL) { 199 rsdp = efi_get_table(&acpi_guid); 200 } 201 #else 202 rsdp = acpi_rsdp(); 203 #endif 204 if (rsdp != 0) { 205 sprintf(buf, "0x%016llx", (unsigned long long)rsdp); 206 setenv("acpi.rsdp", buf, 1); 207 } 208 if ((md = file_findmetadata(fp, MODINFOMD_ELFHDR)) == NULL) 209 return (EFTYPE); 210 ehdr = (Elf_Ehdr *)&(md->md_data); 211 212 #ifdef EFI 213 trampcode = copy_staging == COPY_STAGING_ENABLE ? 214 (vm_offset_t)0x0000000040000000 /* 1G */ : 215 (vm_offset_t)0x0000000100000000; /* 4G */; 216 err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 1, 217 (EFI_PHYSICAL_ADDRESS *)&trampcode); 218 if (EFI_ERROR(err)) { 219 printf("Unable to allocate trampoline\n"); 220 if (copy_auto) 221 copy_staging = COPY_STAGING_AUTO; 222 return (ENOMEM); 223 } 224 trampstack = trampcode + LOADER_PAGE_SIZE - 8; 225 #else 226 // XXX Question: why not just use malloc? 227 trampcode = host_getmem(LOADER_PAGE_SIZE); 228 if (trampcode == NULL) { 229 printf("Unable to allocate trampoline\n"); 230 return (ENOMEM); 231 } 232 #endif 233 bzero((void *)trampcode, LOADER_PAGE_SIZE); 234 bcopy((void *)&tramp, (void *)trampcode, tramp_size); 235 trampoline = (void *)trampcode; 236 237 #ifdef EFI 238 if (copy_staging == COPY_STAGING_ENABLE) { 239 PT4 = (pml4_entry_t *)0x0000000040000000; /* 1G */ 240 err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 3, 241 (EFI_PHYSICAL_ADDRESS *)&PT4); 242 if (EFI_ERROR(err)) { 243 printf("Unable to allocate trampoline page table\n"); 244 BS->FreePages(trampcode, 1); 245 if (copy_auto) 246 copy_staging = COPY_STAGING_AUTO; 247 return (ENOMEM); 248 } 249 bzero(PT4, 3 * LOADER_PAGE_SIZE); 250 PT3 = &PT4[512]; 251 PT2 = &PT3[512]; 252 253 /* 254 * This is kinda brutal, but every single 1GB VM 255 * memory segment points to the same first 1GB of 256 * physical memory. But it is more than adequate. 257 */ 258 for (i = 0; i < NPTEPG; i++) { 259 /* 260 * Each slot of the L4 pages points to the 261 * same L3 page. 262 */ 263 PT4[i] = (pml4_entry_t)PT3; 264 PT4[i] |= PG_V | PG_RW; 265 266 /* 267 * Each slot of the L3 pages points to the 268 * same L2 page. 269 */ 270 PT3[i] = (pdp_entry_t)PT2; 271 PT3[i] |= PG_V | PG_RW; 272 273 /* 274 * The L2 page slots are mapped with 2MB pages for 1GB. 275 */ 276 PT2[i] = (pd_entry_t)i * (2 * 1024 * 1024); 277 PT2[i] |= PG_V | PG_RW | PG_PS; 278 } 279 } else { 280 PT4 = (pml4_entry_t *)0x0000000100000000; /* 4G */ 281 err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 9, 282 (EFI_PHYSICAL_ADDRESS *)&PT4); 283 if (EFI_ERROR(err)) { 284 printf("Unable to allocate trampoline page table\n"); 285 BS->FreePages(trampcode, 9); 286 if (copy_auto) 287 copy_staging = COPY_STAGING_AUTO; 288 return (ENOMEM); 289 } 290 bzero(PT4, 9 * LOADER_PAGE_SIZE); 291 292 PT3_l = &PT4[NPML4EPG * 1]; 293 PT3_u = &PT4[NPML4EPG * 2]; 294 PT2_l0 = &PT4[NPML4EPG * 3]; 295 PT2_l1 = &PT4[NPML4EPG * 4]; 296 PT2_l2 = &PT4[NPML4EPG * 5]; 297 PT2_l3 = &PT4[NPML4EPG * 6]; 298 PT2_u0 = &PT4[NPML4EPG * 7]; 299 PT2_u1 = &PT4[NPML4EPG * 8]; 300 301 /* 1:1 mapping of lower 4G */ 302 PT4[0] = (pml4_entry_t)PT3_l | PG_V | PG_RW; 303 PT3_l[0] = (pdp_entry_t)PT2_l0 | PG_V | PG_RW; 304 PT3_l[1] = (pdp_entry_t)PT2_l1 | PG_V | PG_RW; 305 PT3_l[2] = (pdp_entry_t)PT2_l2 | PG_V | PG_RW; 306 PT3_l[3] = (pdp_entry_t)PT2_l3 | PG_V | PG_RW; 307 for (i = 0; i < 4 * NPDEPG; i++) { 308 PT2_l0[i] = ((pd_entry_t)i << PDRSHIFT) | PG_V | 309 PG_RW | PG_PS; 310 } 311 312 /* mapping of kernel 2G below top */ 313 PT4[NPML4EPG - 1] = (pml4_entry_t)PT3_u | PG_V | PG_RW; 314 PT3_u[NPDPEPG - 2] = (pdp_entry_t)PT2_u0 | PG_V | PG_RW; 315 PT3_u[NPDPEPG - 1] = (pdp_entry_t)PT2_u1 | PG_V | PG_RW; 316 /* compat mapping of phys @0 */ 317 PT2_u0[0] = PG_PS | PG_V | PG_RW; 318 /* this maps past staging area */ 319 for (i = 1; i < 2 * NPDEPG; i++) { 320 PT2_u0[i] = ((pd_entry_t)staging + 321 ((pd_entry_t)i - 1) * NBPDR) | 322 PG_V | PG_RW | PG_PS; 323 } 324 } 325 #else 326 { 327 vm_offset_t pabase, pa_pt3_l, pa_pt3_u, pa_pt2_l0, pa_pt2_l1, pa_pt2_l2, pa_pt2_l3, pa_pt2_u0, pa_pt2_u1; 328 329 /* We'll find a place for these later */ 330 PT4 = (pml4_entry_t *)host_getmem(9 * LOADER_PAGE_SIZE); 331 bzero(PT4, 9 * LOADER_PAGE_SIZE); 332 333 PT3_l = &PT4[NPML4EPG * 1]; 334 PT3_u = &PT4[NPML4EPG * 2]; 335 PT2_l0 = &PT4[NPML4EPG * 3]; 336 PT2_l1 = &PT4[NPML4EPG * 4]; 337 PT2_l2 = &PT4[NPML4EPG * 5]; 338 PT2_l3 = &PT4[NPML4EPG * 6]; 339 PT2_u0 = &PT4[NPML4EPG * 7]; 340 PT2_u1 = &PT4[NPML4EPG * 8]; 341 342 pabase = trampolinebase + LOADER_PAGE_SIZE; 343 pa_pt3_l = pabase + LOADER_PAGE_SIZE * 1; 344 pa_pt3_u = pabase + LOADER_PAGE_SIZE * 2; 345 pa_pt2_l0 = pabase + LOADER_PAGE_SIZE * 3; 346 pa_pt2_l1 = pabase + LOADER_PAGE_SIZE * 4; 347 pa_pt2_l2 = pabase + LOADER_PAGE_SIZE * 5; 348 pa_pt2_l3 = pabase + LOADER_PAGE_SIZE * 6; 349 pa_pt2_u0 = pabase + LOADER_PAGE_SIZE * 7; 350 pa_pt2_u1 = pabase + LOADER_PAGE_SIZE * 8; 351 352 /* 1:1 mapping of lower 4G */ 353 PT4[0] = (pml4_entry_t)pa_pt3_l | PG_V | PG_RW; 354 PT3_l[0] = (pdp_entry_t)pa_pt2_l0 | PG_V | PG_RW; 355 PT3_l[1] = (pdp_entry_t)pa_pt2_l1 | PG_V | PG_RW; 356 PT3_l[2] = (pdp_entry_t)pa_pt2_l2 | PG_V | PG_RW; 357 PT3_l[3] = (pdp_entry_t)pa_pt2_l3 | PG_V | PG_RW; 358 for (i = 0; i < 4 * NPDEPG; i++) { /* we overflow PT2_l0 into _l1, etc */ 359 PT2_l0[i] = ((pd_entry_t)i << PDRSHIFT) | PG_V | 360 PG_RW | PG_PS; 361 } 362 363 /* mapping of kernel 2G below top */ 364 PT4[NPML4EPG - 1] = (pml4_entry_t)pa_pt3_u | PG_V | PG_RW; 365 PT3_u[NPDPEPG - 2] = (pdp_entry_t)pa_pt2_u0 | PG_V | PG_RW; 366 PT3_u[NPDPEPG - 1] = (pdp_entry_t)pa_pt2_u1 | PG_V | PG_RW; 367 /* compat mapping of phys @0 */ 368 PT2_u0[0] = PG_PS | PG_V | PG_RW; 369 /* this maps past staging area */ 370 /* 371 * Kernel uses the KERNSTART (== KERNBASE + 2MB) entry to figure 372 * out where we loaded the kernel. This is PT2_u0[1] (since 373 * these map 2MB pages. So the PA that this maps has to be 374 * kboot's staging + 2MB. For UEFI we do 'i - 1' since we load 375 * the kernel right at staging (and assume the first address we 376 * load is 2MB in efi_copyin). However for kboot, staging + 1 * 377 * NBPDR == staging + 2MB which is where the kernel starts. Our 378 * trampoline need not be mapped into the kernel space since we 379 * execute PA==VA for that, and the trampoline can just go away 380 * once the kernel is called. 381 * 382 * Staging should likely be as low as possible, though, because 383 * all the 'early' allocations are at kernend (which the kernel 384 * calls physfree). 385 */ 386 for (i = 1; i < 2 * NPDEPG; i++) { /* we overflow PT2_u0 into _u1 */ 387 PT2_u0[i] = ((pd_entry_t)staging + 388 ((pd_entry_t)i) * NBPDR) | 389 PG_V | PG_RW | PG_PS; 390 if (i < 10) printf("Mapping %d to %#lx staging %#lx\n", i, PT2_u0[i], staging); 391 } 392 } 393 #endif 394 395 #ifdef EFI 396 printf("staging %#lx (%scopying) tramp %p PT4 %p\n", 397 staging, copy_staging == COPY_STAGING_ENABLE ? "" : "not ", 398 trampoline, PT4); 399 #else 400 printf("staging %#lx tramp %p PT4 %p\n", staging, (void *)trampolinebase, 401 (void *)trampolinebase + LOADER_PAGE_SIZE); 402 #endif 403 printf("Start @ 0x%lx ...\n", ehdr->e_entry); 404 405 #ifdef EFI 406 efi_time_fini(); 407 #endif 408 err = bi_load(fp->f_args, &modulep, &kernend, true); 409 if (err != 0) { 410 #ifdef EFI 411 efi_time_init(); 412 if (copy_auto) 413 copy_staging = COPY_STAGING_AUTO; 414 #endif 415 return (err); 416 } 417 418 dev_cleanup(); 419 420 #ifdef EFI 421 trampoline(trampstack, copy_staging == COPY_STAGING_ENABLE ? 422 efi_copy_finish : efi_copy_finish_nop, kernend, modulep, 423 PT4, ehdr->e_entry); 424 #else 425 trampoline_data = (void *)trampoline + tramp_data_offset; 426 trampoline_data->entry = ehdr->e_entry; 427 trampoline_data->pt4 = trampolinebase + LOADER_PAGE_SIZE; 428 /* 429 * So we compute the VA of the module data by modulep + KERNBASE.... 430 * need to make sure that that address is mapped right. We calculate 431 * the start of available memory to allocate via kernend (which is 432 * calculated with a phyaddr of "kernend + PA(PT_u0[1])"), so we better 433 * make sure we're not overwriting the last 2MB of the kernel :). 434 */ 435 trampoline_data->modulep = modulep; /* Offset from KERNBASE */ 436 trampoline_data->kernend = kernend; /* Offset from the load address */ 437 trampoline_data->fill1 = trampoline_data->fill2 = 0; 438 printf("Modulep = %lx kernend %lx\n", modulep, kernend); 439 /* NOTE: when copyting in, it's relative to the start of our 'area' not an abs addr */ 440 /* Copy the trampoline to the ksegs */ 441 archsw.arch_copyin((void *)trampcode, trampolinebase - staging, tramp_size); 442 /* Copy the page table to the ksegs */ 443 archsw.arch_copyin(PT4, trampoline_data->pt4 - staging, 9 * LOADER_PAGE_SIZE); 444 445 kboot_kseg_get(&nseg, &kseg); 446 error = host_kexec_load(trampolinebase, nseg, kseg, HOST_KEXEC_ARCH_X86_64); 447 if (error != 0) 448 panic("kexec_load returned error: %d", error); 449 host_reboot(HOST_REBOOT_MAGIC1, HOST_REBOOT_MAGIC2, HOST_REBOOT_CMD_KEXEC, 0); 450 #endif 451 452 panic("exec returned"); 453 } 454 455 static int 456 elf64_obj_exec(struct preloaded_file *fp) 457 { 458 459 return (EFTYPE); 460 } 461