1 /*- 2 * Copyright (c) 1998 Michael Smith <msmith@freebsd.org> 3 * Copyright (c) 2014 The FreeBSD Foundation 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 26 */ 27 28 #define __ELF_WORD_SIZE 64 29 #include <sys/param.h> 30 #include <sys/exec.h> 31 #include <sys/linker.h> 32 #include <string.h> 33 #include <machine/elf.h> 34 #include <stand.h> 35 #include <vm/vm.h> 36 #include <vm/pmap.h> 37 38 #ifdef EFI 39 #include <efi.h> 40 #include <efilib.h> 41 #else 42 #include "host_syscall.h" 43 #endif 44 45 #include "bootstrap.h" 46 #include "kboot.h" 47 48 #include "platform/acfreebsd.h" 49 #include "acconfig.h" 50 #define ACPI_SYSTEM_XFACE 51 #include "actypes.h" 52 #include "actbl.h" 53 54 #ifdef EFI 55 #include "loader_efi.h" 56 57 static EFI_GUID acpi_guid = ACPI_TABLE_GUID; 58 static EFI_GUID acpi20_guid = ACPI_20_TABLE_GUID; 59 #endif 60 61 #ifdef EFI 62 #define LOADER_PAGE_SIZE EFI_PAGE_SIZE 63 #else 64 #define LOADER_PAGE_SIZE PAGE_SIZE 65 #endif 66 67 extern int bi_load(char *args, vm_offset_t *modulep, vm_offset_t *kernendp, 68 bool exit_bs); 69 70 static int elf64_exec(struct preloaded_file *amp); 71 static int elf64_obj_exec(struct preloaded_file *amp); 72 73 static struct file_format amd64_elf = { 74 .l_load = elf64_loadfile, 75 .l_exec = elf64_exec, 76 }; 77 static struct file_format amd64_elf_obj = { 78 .l_load = elf64_obj_loadfile, 79 .l_exec = elf64_obj_exec, 80 }; 81 82 #ifdef EFI 83 extern struct file_format multiboot2; 84 extern struct file_format multiboot2_obj; 85 #endif 86 87 struct file_format *file_formats[] = { 88 #ifdef EFI 89 &multiboot2, 90 &multiboot2_obj, 91 #endif 92 &amd64_elf, 93 &amd64_elf_obj, 94 NULL 95 }; 96 97 #ifndef EFI 98 /* 99 * We create the stack that we want. We have the address of the page tables 100 * we make on top (so we pop that off and set %cr3). We have the entry point 101 * to the kernel (which retq pops off) This leaves the stack that the btext 102 * wants: offset 4 is modulep and offset8 is kernend, with the filler bytes 103 * to keep this aligned. This makes the trampoline very simple. 104 */ 105 struct trampoline_data { 106 uint64_t pt4; // Page table address to pop 107 uint64_t entry; // return address to jump to kernel 108 uint32_t fill1; // 0 109 uint32_t modulep; // 4 module metadata 110 uint32_t kernend; // 8 kernel end 111 uint32_t fill2; // 12 112 }; 113 _Static_assert(sizeof(struct trampoline_data) == 32, "Bad size for trampoline data"); 114 #endif 115 116 static pml4_entry_t *PT4; 117 static pdp_entry_t *PT3_l, *PT3_u; 118 static pd_entry_t *PT2_l0, *PT2_l1, *PT2_l2, *PT2_l3, *PT2_u0, *PT2_u1; 119 120 #ifdef EFI 121 static pdp_entry_t *PT3; 122 static pd_entry_t *PT2; 123 124 extern EFI_PHYSICAL_ADDRESS staging; 125 126 static void (*trampoline)(uint64_t stack, void *copy_finish, uint64_t kernend, 127 uint64_t modulep, pml4_entry_t *pagetable, uint64_t entry); 128 #endif 129 130 extern uintptr_t tramp; 131 extern uint32_t tramp_size; 132 #ifndef EFI 133 extern uint32_t tramp_data_offset; 134 #endif 135 136 /* 137 * There is an ELF kernel and one or more ELF modules loaded. 138 * We wish to start executing the kernel image, so make such 139 * preparations as are required, and do so. 140 */ 141 static int 142 elf64_exec(struct preloaded_file *fp) 143 { 144 struct file_metadata *md; 145 Elf_Ehdr *ehdr; 146 vm_offset_t modulep, kernend; 147 int err, i; 148 char buf[24]; 149 #ifdef EFI 150 ACPI_TABLE_RSDP *rsdp = NULL; 151 int revision; 152 int copy_auto; 153 vm_offset_t trampstack, trampcode; 154 #else 155 vm_offset_t rsdp = 0; 156 void *trampcode; 157 int nseg; 158 void *kseg; 159 vm_offset_t trampolinebase; 160 uint64_t *trampoline; 161 struct trampoline_data *trampoline_data; 162 vm_offset_t staging; 163 int error; 164 #endif 165 166 #ifdef EFI 167 copy_auto = copy_staging == COPY_STAGING_AUTO; 168 if (copy_auto) 169 copy_staging = fp->f_kernphys_relocatable ? 170 COPY_STAGING_DISABLE : COPY_STAGING_ENABLE; 171 #else 172 /* 173 * Figure out where to put it. 174 * 175 * Linux does not allow to do kexec_load into any part of memory. Ask 176 * arch_loadaddr to resolve the first available chunk of physical memory 177 * where loading is possible (load_addr). 178 * 179 * The kernel is loaded at the 'base' address in continguous physical 180 * pages (using 2MB super pages). The first such page is unused by the 181 * kernel and serves as a good place to put not only the trampoline, but 182 * the page table pages that the trampoline needs to setup the proper 183 * kernel starting environment. 184 */ 185 staging = trampolinebase = kboot_get_phys_load_segment(); 186 trampolinebase += 1ULL << 20; /* Copy trampoline to base + 1MB, kernel will wind up at 2MB */ 187 printf("Load address at %#jx\n", (uintmax_t)trampolinebase); 188 printf("Relocation offset is %#jx\n", (uintmax_t)elf64_relocation_offset); 189 #endif 190 191 /* 192 * Report the RSDP to the kernel. While this can be found with 193 * a BIOS boot, the RSDP may be elsewhere when booted from UEFI. 194 */ 195 #ifdef EFI 196 rsdp = efi_get_table(&acpi20_guid); 197 if (rsdp == NULL) { 198 rsdp = efi_get_table(&acpi_guid); 199 } 200 #else 201 rsdp = acpi_rsdp(); 202 #endif 203 if (rsdp != 0) { 204 sprintf(buf, "0x%016llx", (unsigned long long)rsdp); 205 setenv("acpi.rsdp", buf, 1); 206 } 207 if ((md = file_findmetadata(fp, MODINFOMD_ELFHDR)) == NULL) 208 return (EFTYPE); 209 ehdr = (Elf_Ehdr *)&(md->md_data); 210 211 #ifdef EFI 212 trampcode = copy_staging == COPY_STAGING_ENABLE ? 213 (vm_offset_t)0x0000000040000000 /* 1G */ : 214 (vm_offset_t)0x0000000100000000; /* 4G */; 215 err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 1, 216 (EFI_PHYSICAL_ADDRESS *)&trampcode); 217 if (EFI_ERROR(err)) { 218 printf("Unable to allocate trampoline\n"); 219 if (copy_auto) 220 copy_staging = COPY_STAGING_AUTO; 221 return (ENOMEM); 222 } 223 trampstack = trampcode + LOADER_PAGE_SIZE - 8; 224 #else 225 // XXX Question: why not just use malloc? 226 trampcode = host_getmem(LOADER_PAGE_SIZE); 227 if (trampcode == NULL) { 228 printf("Unable to allocate trampoline\n"); 229 return (ENOMEM); 230 } 231 #endif 232 bzero((void *)trampcode, LOADER_PAGE_SIZE); 233 bcopy((void *)&tramp, (void *)trampcode, tramp_size); 234 trampoline = (void *)trampcode; 235 236 #ifdef EFI 237 if (copy_staging == COPY_STAGING_ENABLE) { 238 PT4 = (pml4_entry_t *)0x0000000040000000; /* 1G */ 239 err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 3, 240 (EFI_PHYSICAL_ADDRESS *)&PT4); 241 if (EFI_ERROR(err)) { 242 printf("Unable to allocate trampoline page table\n"); 243 BS->FreePages(trampcode, 1); 244 if (copy_auto) 245 copy_staging = COPY_STAGING_AUTO; 246 return (ENOMEM); 247 } 248 bzero(PT4, 3 * LOADER_PAGE_SIZE); 249 PT3 = &PT4[512]; 250 PT2 = &PT3[512]; 251 252 /* 253 * This is kinda brutal, but every single 1GB VM 254 * memory segment points to the same first 1GB of 255 * physical memory. But it is more than adequate. 256 */ 257 for (i = 0; i < NPTEPG; i++) { 258 /* 259 * Each slot of the L4 pages points to the 260 * same L3 page. 261 */ 262 PT4[i] = (pml4_entry_t)PT3; 263 PT4[i] |= PG_V | PG_RW; 264 265 /* 266 * Each slot of the L3 pages points to the 267 * same L2 page. 268 */ 269 PT3[i] = (pdp_entry_t)PT2; 270 PT3[i] |= PG_V | PG_RW; 271 272 /* 273 * The L2 page slots are mapped with 2MB pages for 1GB. 274 */ 275 PT2[i] = (pd_entry_t)i * (2 * 1024 * 1024); 276 PT2[i] |= PG_V | PG_RW | PG_PS; 277 } 278 } else { 279 PT4 = (pml4_entry_t *)0x0000000100000000; /* 4G */ 280 err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 9, 281 (EFI_PHYSICAL_ADDRESS *)&PT4); 282 if (EFI_ERROR(err)) { 283 printf("Unable to allocate trampoline page table\n"); 284 BS->FreePages(trampcode, 9); 285 if (copy_auto) 286 copy_staging = COPY_STAGING_AUTO; 287 return (ENOMEM); 288 } 289 bzero(PT4, 9 * LOADER_PAGE_SIZE); 290 291 PT3_l = &PT4[NPML4EPG * 1]; 292 PT3_u = &PT4[NPML4EPG * 2]; 293 PT2_l0 = &PT4[NPML4EPG * 3]; 294 PT2_l1 = &PT4[NPML4EPG * 4]; 295 PT2_l2 = &PT4[NPML4EPG * 5]; 296 PT2_l3 = &PT4[NPML4EPG * 6]; 297 PT2_u0 = &PT4[NPML4EPG * 7]; 298 PT2_u1 = &PT4[NPML4EPG * 8]; 299 300 /* 1:1 mapping of lower 4G */ 301 PT4[0] = (pml4_entry_t)PT3_l | PG_V | PG_RW; 302 PT3_l[0] = (pdp_entry_t)PT2_l0 | PG_V | PG_RW; 303 PT3_l[1] = (pdp_entry_t)PT2_l1 | PG_V | PG_RW; 304 PT3_l[2] = (pdp_entry_t)PT2_l2 | PG_V | PG_RW; 305 PT3_l[3] = (pdp_entry_t)PT2_l3 | PG_V | PG_RW; 306 for (i = 0; i < 4 * NPDEPG; i++) { 307 PT2_l0[i] = ((pd_entry_t)i << PDRSHIFT) | PG_V | 308 PG_RW | PG_PS; 309 } 310 311 /* mapping of kernel 2G below top */ 312 PT4[NPML4EPG - 1] = (pml4_entry_t)PT3_u | PG_V | PG_RW; 313 PT3_u[NPDPEPG - 2] = (pdp_entry_t)PT2_u0 | PG_V | PG_RW; 314 PT3_u[NPDPEPG - 1] = (pdp_entry_t)PT2_u1 | PG_V | PG_RW; 315 /* compat mapping of phys @0 */ 316 PT2_u0[0] = PG_PS | PG_V | PG_RW; 317 /* this maps past staging area */ 318 for (i = 1; i < 2 * NPDEPG; i++) { 319 PT2_u0[i] = ((pd_entry_t)staging + 320 ((pd_entry_t)i - 1) * NBPDR) | 321 PG_V | PG_RW | PG_PS; 322 } 323 } 324 #else 325 { 326 vm_offset_t pabase, pa_pt3_l, pa_pt3_u, pa_pt2_l0, pa_pt2_l1, pa_pt2_l2, pa_pt2_l3, pa_pt2_u0, pa_pt2_u1; 327 328 /* We'll find a place for these later */ 329 PT4 = (pml4_entry_t *)host_getmem(9 * LOADER_PAGE_SIZE); 330 bzero(PT4, 9 * LOADER_PAGE_SIZE); 331 332 PT3_l = &PT4[NPML4EPG * 1]; 333 PT3_u = &PT4[NPML4EPG * 2]; 334 PT2_l0 = &PT4[NPML4EPG * 3]; 335 PT2_l1 = &PT4[NPML4EPG * 4]; 336 PT2_l2 = &PT4[NPML4EPG * 5]; 337 PT2_l3 = &PT4[NPML4EPG * 6]; 338 PT2_u0 = &PT4[NPML4EPG * 7]; 339 PT2_u1 = &PT4[NPML4EPG * 8]; 340 341 pabase = trampolinebase + LOADER_PAGE_SIZE; 342 pa_pt3_l = pabase + LOADER_PAGE_SIZE * 1; 343 pa_pt3_u = pabase + LOADER_PAGE_SIZE * 2; 344 pa_pt2_l0 = pabase + LOADER_PAGE_SIZE * 3; 345 pa_pt2_l1 = pabase + LOADER_PAGE_SIZE * 4; 346 pa_pt2_l2 = pabase + LOADER_PAGE_SIZE * 5; 347 pa_pt2_l3 = pabase + LOADER_PAGE_SIZE * 6; 348 pa_pt2_u0 = pabase + LOADER_PAGE_SIZE * 7; 349 pa_pt2_u1 = pabase + LOADER_PAGE_SIZE * 8; 350 351 /* 1:1 mapping of lower 4G */ 352 PT4[0] = (pml4_entry_t)pa_pt3_l | PG_V | PG_RW; 353 PT3_l[0] = (pdp_entry_t)pa_pt2_l0 | PG_V | PG_RW; 354 PT3_l[1] = (pdp_entry_t)pa_pt2_l1 | PG_V | PG_RW; 355 PT3_l[2] = (pdp_entry_t)pa_pt2_l2 | PG_V | PG_RW; 356 PT3_l[3] = (pdp_entry_t)pa_pt2_l3 | PG_V | PG_RW; 357 for (i = 0; i < 4 * NPDEPG; i++) { /* we overflow PT2_l0 into _l1, etc */ 358 PT2_l0[i] = ((pd_entry_t)i << PDRSHIFT) | PG_V | 359 PG_RW | PG_PS; 360 } 361 362 /* mapping of kernel 2G below top */ 363 PT4[NPML4EPG - 1] = (pml4_entry_t)pa_pt3_u | PG_V | PG_RW; 364 PT3_u[NPDPEPG - 2] = (pdp_entry_t)pa_pt2_u0 | PG_V | PG_RW; 365 PT3_u[NPDPEPG - 1] = (pdp_entry_t)pa_pt2_u1 | PG_V | PG_RW; 366 /* compat mapping of phys @0 */ 367 PT2_u0[0] = PG_PS | PG_V | PG_RW; 368 /* this maps past staging area */ 369 /* 370 * Kernel uses the KERNSTART (== KERNBASE + 2MB) entry to figure 371 * out where we loaded the kernel. This is PT2_u0[1] (since 372 * these map 2MB pages. So the PA that this maps has to be 373 * kboot's staging + 2MB. For UEFI we do 'i - 1' since we load 374 * the kernel right at staging (and assume the first address we 375 * load is 2MB in efi_copyin). However for kboot, staging + 1 * 376 * NBPDR == staging + 2MB which is where the kernel starts. Our 377 * trampoline need not be mapped into the kernel space since we 378 * execute PA==VA for that, and the trampoline can just go away 379 * once the kernel is called. 380 * 381 * Staging should likely be as low as possible, though, because 382 * all the 'early' allocations are at kernend (which the kernel 383 * calls physfree). 384 */ 385 for (i = 1; i < 2 * NPDEPG; i++) { /* we overflow PT2_u0 into _u1 */ 386 PT2_u0[i] = ((pd_entry_t)staging + 387 ((pd_entry_t)i) * NBPDR) | 388 PG_V | PG_RW | PG_PS; 389 if (i < 10) printf("Mapping %d to %#lx staging %#lx\n", i, PT2_u0[i], staging); 390 } 391 } 392 #endif 393 394 #ifdef EFI 395 printf("staging %#lx (%scopying) tramp %p PT4 %p\n", 396 staging, copy_staging == COPY_STAGING_ENABLE ? "" : "not ", 397 trampoline, PT4); 398 #else 399 printf("staging %#lx tramp %p PT4 %p\n", staging, (void *)trampolinebase, 400 (void *)trampolinebase + LOADER_PAGE_SIZE); 401 #endif 402 printf("Start @ 0x%lx ...\n", ehdr->e_entry); 403 404 #ifdef EFI 405 efi_time_fini(); 406 #endif 407 err = bi_load(fp->f_args, &modulep, &kernend, true); 408 if (err != 0) { 409 #ifdef EFI 410 efi_time_init(); 411 if (copy_auto) 412 copy_staging = COPY_STAGING_AUTO; 413 #endif 414 return (err); 415 } 416 417 dev_cleanup(); 418 419 #ifdef EFI 420 trampoline(trampstack, copy_staging == COPY_STAGING_ENABLE ? 421 efi_copy_finish : efi_copy_finish_nop, kernend, modulep, 422 PT4, ehdr->e_entry); 423 #else 424 trampoline_data = (void *)trampoline + tramp_data_offset; 425 trampoline_data->entry = ehdr->e_entry; 426 trampoline_data->pt4 = trampolinebase + LOADER_PAGE_SIZE; 427 /* 428 * So we compute the VA of the module data by modulep + KERNBASE.... 429 * need to make sure that that address is mapped right. We calculate 430 * the start of available memory to allocate via kernend (which is 431 * calculated with a phyaddr of "kernend + PA(PT_u0[1])"), so we better 432 * make sure we're not overwriting the last 2MB of the kernel :). 433 */ 434 trampoline_data->modulep = modulep; /* Offset from KERNBASE */ 435 trampoline_data->kernend = kernend; /* Offset from the load address */ 436 trampoline_data->fill1 = trampoline_data->fill2 = 0; 437 printf("Modulep = %lx kernend %lx\n", modulep, kernend); 438 /* NOTE: when copyting in, it's relative to the start of our 'area' not an abs addr */ 439 /* Copy the trampoline to the ksegs */ 440 archsw.arch_copyin((void *)trampcode, trampolinebase - staging, tramp_size); 441 /* Copy the page table to the ksegs */ 442 archsw.arch_copyin(PT4, trampoline_data->pt4 - staging, 9 * LOADER_PAGE_SIZE); 443 444 kboot_kseg_get(&nseg, &kseg); 445 error = host_kexec_load(trampolinebase, nseg, kseg, HOST_KEXEC_ARCH_X86_64); 446 if (error != 0) 447 panic("kexec_load returned error: %d", error); 448 host_reboot(HOST_REBOOT_MAGIC1, HOST_REBOOT_MAGIC2, HOST_REBOOT_CMD_KEXEC, 0); 449 #endif 450 451 panic("exec returned"); 452 } 453 454 static int 455 elf64_obj_exec(struct preloaded_file *fp) 456 { 457 458 return (EFTYPE); 459 } 460