1 /*-
2 * Copyright (c) 1998 Michael Smith <msmith@freebsd.org>
3 * Copyright (c) 2014 The FreeBSD Foundation
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27
28 #define __ELF_WORD_SIZE 64
29 #include <sys/param.h>
30 #include <sys/exec.h>
31 #include <sys/linker.h>
32 #include <string.h>
33 #include <machine/elf.h>
34 #include <stand.h>
35 #include <vm/vm.h>
36 #include <vm/pmap.h>
37
38 #ifdef EFI
39 #include <efi.h>
40 #include <efilib.h>
41 #else
42 #include "host_syscall.h"
43 #endif
44
45 #include "bootstrap.h"
46 #include "kboot.h"
47
48 #include "platform/acfreebsd.h"
49 #include "acconfig.h"
50 #define ACPI_SYSTEM_XFACE
51 #include "actypes.h"
52 #include "actbl.h"
53
54 #ifdef EFI
55 #include "loader_efi.h"
56
57 static EFI_GUID acpi_guid = ACPI_TABLE_GUID;
58 static EFI_GUID acpi20_guid = ACPI_20_TABLE_GUID;
59 #endif
60
61 #ifdef EFI
62 #define LOADER_PAGE_SIZE EFI_PAGE_SIZE
63 #else
64 #define LOADER_PAGE_SIZE PAGE_SIZE
65 #endif
66
67 extern int bi_load(char *args, vm_offset_t *modulep, vm_offset_t *kernendp,
68 bool exit_bs);
69
70 static int elf64_exec(struct preloaded_file *amp);
71 static int elf64_obj_exec(struct preloaded_file *amp);
72
73 static struct file_format amd64_elf = {
74 .l_load = elf64_loadfile,
75 .l_exec = elf64_exec,
76 };
77 static struct file_format amd64_elf_obj = {
78 .l_load = elf64_obj_loadfile,
79 .l_exec = elf64_obj_exec,
80 };
81
82 #ifdef EFI
83 extern struct file_format multiboot2;
84 extern struct file_format multiboot2_obj;
85 #endif
86
87 struct file_format *file_formats[] = {
88 #ifdef EFI
89 &multiboot2,
90 &multiboot2_obj,
91 #endif
92 &amd64_elf,
93 &amd64_elf_obj,
94 NULL
95 };
96
97 #ifndef EFI
/*
 * Stack image that we build for the trampoline.  The address of the page
 * tables we make sits on top, so the trampoline pops it off and sets %cr3.
 * Next is the entry point of the kernel, which retq pops off.  That leaves
 * the stack that btext wants: modulep at offset 4 and kernend at offset 8,
 * with filler words to keep everything aligned.  This makes the trampoline
 * very simple.
 */
struct trampoline_data {
	uint64_t	pt4;		/* page table address to pop */
	uint64_t	entry;		/* return address to jump to kernel */
	uint32_t	fill1;		/* offset 0: filler */
	uint32_t	modulep;	/* offset 4: module metadata */
	uint32_t	kernend;	/* offset 8: kernel end */
	uint32_t	fill2;		/* offset 12: filler */
};
_Static_assert(sizeof(struct trampoline_data) == 32,
    "Bad size for trampoline data");
114 #endif
115
116 static pml4_entry_t *PT4;
117 static pdp_entry_t *PT3_l, *PT3_u;
118 static pd_entry_t *PT2_l0, *PT2_l1, *PT2_l2, *PT2_l3, *PT2_u0, *PT2_u1;
119
120 #ifdef EFI
121 static pdp_entry_t *PT3;
122 static pd_entry_t *PT2;
123
124 extern EFI_PHYSICAL_ADDRESS staging;
125
126 static void (*trampoline)(uint64_t stack, void *copy_finish, uint64_t kernend,
127 uint64_t modulep, pml4_entry_t *pagetable, uint64_t entry);
128 #endif
129
130 extern uintptr_t tramp;
131 extern uint32_t tramp_size;
132 #ifndef EFI
133 extern uint32_t tramp_data_offset;
134 #endif
135
136 /*
137 * There is an ELF kernel and one or more ELF modules loaded.
138 * We wish to start executing the kernel image, so make such
139 * preparations as are required, and do so.
140 */
141 static int
elf64_exec(struct preloaded_file * fp)142 elf64_exec(struct preloaded_file *fp)
143 {
144 struct file_metadata *md;
145 Elf_Ehdr *ehdr;
146 vm_offset_t modulep, kernend;
147 int err, i;
148 char buf[24];
149 #ifdef EFI
150 ACPI_TABLE_RSDP *rsdp = NULL;
151 int revision;
152 int copy_auto;
153 vm_offset_t trampstack, trampcode;
154 #else
155 vm_offset_t rsdp = 0;
156 void *trampcode;
157 int nseg;
158 void *kseg;
159 vm_offset_t trampolinebase;
160 uint64_t *trampoline;
161 struct trampoline_data *trampoline_data;
162 vm_offset_t staging;
163 int error;
164 #endif
165
166 #ifdef EFI
167 copy_auto = copy_staging == COPY_STAGING_AUTO;
168 if (copy_auto)
169 copy_staging = fp->f_kernphys_relocatable ?
170 COPY_STAGING_DISABLE : COPY_STAGING_ENABLE;
171 #else
172 /*
173 * Figure out where to put it.
174 *
175 * Linux does not allow to do kexec_load into any part of memory. Ask
176 * arch_loadaddr to resolve the first available chunk of physical memory
177 * where loading is possible (load_addr).
178 *
179 * The kernel is loaded at the 'base' address in continguous physical
180 * pages (using 2MB super pages). The first such page is unused by the
181 * kernel and serves as a good place to put not only the trampoline, but
182 * the page table pages that the trampoline needs to setup the proper
183 * kernel starting environment.
184 */
185 staging = trampolinebase = kboot_get_phys_load_segment();
186 trampolinebase += 1ULL << 20; /* Copy trampoline to base + 1MB, kernel will wind up at 2MB */
187 printf("Load address at %#jx\n", (uintmax_t)trampolinebase);
188 printf("Relocation offset is %#jx\n", (uintmax_t)elf64_relocation_offset);
189 #endif
190
191 /*
192 * Report the RSDP to the kernel. While this can be found with
193 * a BIOS boot, the RSDP may be elsewhere when booted from UEFI.
194 */
195 #ifdef EFI
196 rsdp = efi_get_table(&acpi20_guid);
197 if (rsdp == NULL) {
198 rsdp = efi_get_table(&acpi_guid);
199 }
200 #else
201 rsdp = acpi_rsdp();
202 #endif
203 if (rsdp != 0) {
204 sprintf(buf, "0x%016llx", (unsigned long long)rsdp);
205 setenv("acpi.rsdp", buf, 1);
206 }
207 if ((md = file_findmetadata(fp, MODINFOMD_ELFHDR)) == NULL)
208 return (EFTYPE);
209 ehdr = (Elf_Ehdr *)&(md->md_data);
210
211 #ifdef EFI
212 trampcode = copy_staging == COPY_STAGING_ENABLE ?
213 (vm_offset_t)0x0000000040000000 /* 1G */ :
214 (vm_offset_t)0x0000000100000000; /* 4G */;
215 err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 1,
216 (EFI_PHYSICAL_ADDRESS *)&trampcode);
217 if (EFI_ERROR(err)) {
218 printf("Unable to allocate trampoline\n");
219 if (copy_auto)
220 copy_staging = COPY_STAGING_AUTO;
221 return (ENOMEM);
222 }
223 trampstack = trampcode + LOADER_PAGE_SIZE - 8;
224 #else
225 // XXX Question: why not just use malloc?
226 trampcode = host_getmem(LOADER_PAGE_SIZE);
227 if (trampcode == NULL) {
228 printf("Unable to allocate trampoline\n");
229 return (ENOMEM);
230 }
231 #endif
232 bzero((void *)trampcode, LOADER_PAGE_SIZE);
233 bcopy((void *)&tramp, (void *)trampcode, tramp_size);
234 trampoline = (void *)trampcode;
235
236 #ifdef EFI
237 if (copy_staging == COPY_STAGING_ENABLE) {
238 PT4 = (pml4_entry_t *)0x0000000040000000; /* 1G */
239 err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 3,
240 (EFI_PHYSICAL_ADDRESS *)&PT4);
241 if (EFI_ERROR(err)) {
242 printf("Unable to allocate trampoline page table\n");
243 BS->FreePages(trampcode, 1);
244 if (copy_auto)
245 copy_staging = COPY_STAGING_AUTO;
246 return (ENOMEM);
247 }
248 bzero(PT4, 3 * LOADER_PAGE_SIZE);
249 PT3 = &PT4[512];
250 PT2 = &PT3[512];
251
252 /*
253 * This is kinda brutal, but every single 1GB VM
254 * memory segment points to the same first 1GB of
255 * physical memory. But it is more than adequate.
256 */
257 for (i = 0; i < NPTEPG; i++) {
258 /*
259 * Each slot of the L4 pages points to the
260 * same L3 page.
261 */
262 PT4[i] = (pml4_entry_t)PT3;
263 PT4[i] |= PG_V | PG_RW;
264
265 /*
266 * Each slot of the L3 pages points to the
267 * same L2 page.
268 */
269 PT3[i] = (pdp_entry_t)PT2;
270 PT3[i] |= PG_V | PG_RW;
271
272 /*
273 * The L2 page slots are mapped with 2MB pages for 1GB.
274 */
275 PT2[i] = (pd_entry_t)i * (2 * 1024 * 1024);
276 PT2[i] |= PG_V | PG_RW | PG_PS;
277 }
278 } else {
279 PT4 = (pml4_entry_t *)0x0000000100000000; /* 4G */
280 err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 9,
281 (EFI_PHYSICAL_ADDRESS *)&PT4);
282 if (EFI_ERROR(err)) {
283 printf("Unable to allocate trampoline page table\n");
284 BS->FreePages(trampcode, 9);
285 if (copy_auto)
286 copy_staging = COPY_STAGING_AUTO;
287 return (ENOMEM);
288 }
289 bzero(PT4, 9 * LOADER_PAGE_SIZE);
290
291 PT3_l = &PT4[NPML4EPG * 1];
292 PT3_u = &PT4[NPML4EPG * 2];
293 PT2_l0 = &PT4[NPML4EPG * 3];
294 PT2_l1 = &PT4[NPML4EPG * 4];
295 PT2_l2 = &PT4[NPML4EPG * 5];
296 PT2_l3 = &PT4[NPML4EPG * 6];
297 PT2_u0 = &PT4[NPML4EPG * 7];
298 PT2_u1 = &PT4[NPML4EPG * 8];
299
300 /* 1:1 mapping of lower 4G */
301 PT4[0] = (pml4_entry_t)PT3_l | PG_V | PG_RW;
302 PT3_l[0] = (pdp_entry_t)PT2_l0 | PG_V | PG_RW;
303 PT3_l[1] = (pdp_entry_t)PT2_l1 | PG_V | PG_RW;
304 PT3_l[2] = (pdp_entry_t)PT2_l2 | PG_V | PG_RW;
305 PT3_l[3] = (pdp_entry_t)PT2_l3 | PG_V | PG_RW;
306 for (i = 0; i < 4 * NPDEPG; i++) {
307 PT2_l0[i] = ((pd_entry_t)i << PDRSHIFT) | PG_V |
308 PG_RW | PG_PS;
309 }
310
311 /* mapping of kernel 2G below top */
312 PT4[NPML4EPG - 1] = (pml4_entry_t)PT3_u | PG_V | PG_RW;
313 PT3_u[NPDPEPG - 2] = (pdp_entry_t)PT2_u0 | PG_V | PG_RW;
314 PT3_u[NPDPEPG - 1] = (pdp_entry_t)PT2_u1 | PG_V | PG_RW;
315 /* compat mapping of phys @0 */
316 PT2_u0[0] = PG_PS | PG_V | PG_RW;
317 /* this maps past staging area */
318 for (i = 1; i < 2 * NPDEPG; i++) {
319 PT2_u0[i] = ((pd_entry_t)staging +
320 ((pd_entry_t)i - 1) * NBPDR) |
321 PG_V | PG_RW | PG_PS;
322 }
323 }
324 #else
325 {
326 vm_offset_t pabase, pa_pt3_l, pa_pt3_u, pa_pt2_l0, pa_pt2_l1, pa_pt2_l2, pa_pt2_l3, pa_pt2_u0, pa_pt2_u1;
327
328 /* We'll find a place for these later */
329 PT4 = (pml4_entry_t *)host_getmem(9 * LOADER_PAGE_SIZE);
330 bzero(PT4, 9 * LOADER_PAGE_SIZE);
331
332 PT3_l = &PT4[NPML4EPG * 1];
333 PT3_u = &PT4[NPML4EPG * 2];
334 PT2_l0 = &PT4[NPML4EPG * 3];
335 PT2_l1 = &PT4[NPML4EPG * 4];
336 PT2_l2 = &PT4[NPML4EPG * 5];
337 PT2_l3 = &PT4[NPML4EPG * 6];
338 PT2_u0 = &PT4[NPML4EPG * 7];
339 PT2_u1 = &PT4[NPML4EPG * 8];
340
341 pabase = trampolinebase + LOADER_PAGE_SIZE;
342 pa_pt3_l = pabase + LOADER_PAGE_SIZE * 1;
343 pa_pt3_u = pabase + LOADER_PAGE_SIZE * 2;
344 pa_pt2_l0 = pabase + LOADER_PAGE_SIZE * 3;
345 pa_pt2_l1 = pabase + LOADER_PAGE_SIZE * 4;
346 pa_pt2_l2 = pabase + LOADER_PAGE_SIZE * 5;
347 pa_pt2_l3 = pabase + LOADER_PAGE_SIZE * 6;
348 pa_pt2_u0 = pabase + LOADER_PAGE_SIZE * 7;
349 pa_pt2_u1 = pabase + LOADER_PAGE_SIZE * 8;
350
351 /* 1:1 mapping of lower 4G */
352 PT4[0] = (pml4_entry_t)pa_pt3_l | PG_V | PG_RW;
353 PT3_l[0] = (pdp_entry_t)pa_pt2_l0 | PG_V | PG_RW;
354 PT3_l[1] = (pdp_entry_t)pa_pt2_l1 | PG_V | PG_RW;
355 PT3_l[2] = (pdp_entry_t)pa_pt2_l2 | PG_V | PG_RW;
356 PT3_l[3] = (pdp_entry_t)pa_pt2_l3 | PG_V | PG_RW;
357 for (i = 0; i < 4 * NPDEPG; i++) { /* we overflow PT2_l0 into _l1, etc */
358 PT2_l0[i] = ((pd_entry_t)i << PDRSHIFT) | PG_V |
359 PG_RW | PG_PS;
360 }
361
362 /* mapping of kernel 2G below top */
363 PT4[NPML4EPG - 1] = (pml4_entry_t)pa_pt3_u | PG_V | PG_RW;
364 PT3_u[NPDPEPG - 2] = (pdp_entry_t)pa_pt2_u0 | PG_V | PG_RW;
365 PT3_u[NPDPEPG - 1] = (pdp_entry_t)pa_pt2_u1 | PG_V | PG_RW;
366 /* compat mapping of phys @0 */
367 PT2_u0[0] = PG_PS | PG_V | PG_RW;
368 /* this maps past staging area */
369 /*
370 * Kernel uses the KERNSTART (== KERNBASE + 2MB) entry to figure
371 * out where we loaded the kernel. This is PT2_u0[1] (since
372 * these map 2MB pages. So the PA that this maps has to be
373 * kboot's staging + 2MB. For UEFI we do 'i - 1' since we load
374 * the kernel right at staging (and assume the first address we
375 * load is 2MB in efi_copyin). However for kboot, staging + 1 *
376 * NBPDR == staging + 2MB which is where the kernel starts. Our
377 * trampoline need not be mapped into the kernel space since we
378 * execute PA==VA for that, and the trampoline can just go away
379 * once the kernel is called.
380 *
381 * Staging should likely be as low as possible, though, because
382 * all the 'early' allocations are at kernend (which the kernel
383 * calls physfree).
384 */
385 for (i = 1; i < 2 * NPDEPG; i++) { /* we overflow PT2_u0 into _u1 */
386 PT2_u0[i] = ((pd_entry_t)staging +
387 ((pd_entry_t)i) * NBPDR) |
388 PG_V | PG_RW | PG_PS;
389 if (i < 10) printf("Mapping %d to %#lx staging %#lx\n", i, PT2_u0[i], staging);
390 }
391 }
392 #endif
393
394 #ifdef EFI
395 printf("staging %#lx (%scopying) tramp %p PT4 %p\n",
396 staging, copy_staging == COPY_STAGING_ENABLE ? "" : "not ",
397 trampoline, PT4);
398 #else
399 printf("staging %#lx tramp %p PT4 %p\n", staging, (void *)trampolinebase,
400 (void *)trampolinebase + LOADER_PAGE_SIZE);
401 #endif
402 printf("Start @ 0x%lx ...\n", ehdr->e_entry);
403
404 #ifdef EFI
405 efi_time_fini();
406 #endif
407 err = bi_load(fp->f_args, &modulep, &kernend, true);
408 if (err != 0) {
409 #ifdef EFI
410 efi_time_init();
411 if (copy_auto)
412 copy_staging = COPY_STAGING_AUTO;
413 #endif
414 return (err);
415 }
416
417 dev_cleanup();
418
419 #ifdef EFI
420 trampoline(trampstack, copy_staging == COPY_STAGING_ENABLE ?
421 efi_copy_finish : efi_copy_finish_nop, kernend, modulep,
422 PT4, ehdr->e_entry);
423 #else
424 trampoline_data = (void *)trampoline + tramp_data_offset;
425 trampoline_data->entry = ehdr->e_entry;
426 trampoline_data->pt4 = trampolinebase + LOADER_PAGE_SIZE;
427 /*
428 * So we compute the VA of the module data by modulep + KERNBASE....
429 * need to make sure that that address is mapped right. We calculate
430 * the start of available memory to allocate via kernend (which is
431 * calculated with a phyaddr of "kernend + PA(PT_u0[1])"), so we better
432 * make sure we're not overwriting the last 2MB of the kernel :).
433 */
434 trampoline_data->modulep = modulep; /* Offset from KERNBASE */
435 trampoline_data->kernend = kernend; /* Offset from the load address */
436 trampoline_data->fill1 = trampoline_data->fill2 = 0;
437 printf("Modulep = %lx kernend %lx\n", modulep, kernend);
438 /* NOTE: when copyting in, it's relative to the start of our 'area' not an abs addr */
439 /* Copy the trampoline to the ksegs */
440 archsw.arch_copyin((void *)trampcode, trampolinebase - staging, tramp_size);
441 /* Copy the page table to the ksegs */
442 archsw.arch_copyin(PT4, trampoline_data->pt4 - staging, 9 * LOADER_PAGE_SIZE);
443
444 kboot_kseg_get(&nseg, &kseg);
445 error = host_kexec_load(trampolinebase, nseg, kseg, HOST_KEXEC_ARCH_X86_64);
446 if (error != 0)
447 panic("kexec_load returned error: %d", error);
448 host_reboot(HOST_REBOOT_MAGIC1, HOST_REBOOT_MAGIC2, HOST_REBOOT_CMD_KEXEC, 0);
449 #endif
450
451 panic("exec returned");
452 }
453
454 static int
elf64_obj_exec(struct preloaded_file * fp)455 elf64_obj_exec(struct preloaded_file *fp)
456 {
457
458 return (EFTYPE);
459 }
460