/*-
 * Copyright (c) 1998 Michael Smith <msmith@freebsd.org>
 * Copyright (c) 2014 The FreeBSD Foundation
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#define __ELF_WORD_SIZE 64
#include <sys/param.h>
#include <sys/exec.h>
#include <sys/linker.h>
#include <string.h>
#include <machine/elf.h>
#include <stand.h>
#include <vm/vm.h>
#include <vm/pmap.h>

#ifdef EFI
#include <efi.h>
#include <efilib.h>
#else
#include "host_syscall.h"
#endif

#include "bootstrap.h"
#include "kboot.h"

#include "platform/acfreebsd.h"
#include "acconfig.h"
#define ACPI_SYSTEM_XFACE
#include "actypes.h"
#include "actbl.h"

#ifdef EFI
#include "loader_efi.h"

static EFI_GUID acpi_guid = ACPI_TABLE_GUID;
static EFI_GUID acpi20_guid = ACPI_20_TABLE_GUID;
#endif

#ifdef EFI
#define LOADER_PAGE_SIZE EFI_PAGE_SIZE
#else
#define LOADER_PAGE_SIZE PAGE_SIZE
#endif

extern int bi_load(char *args, vm_offset_t *modulep, vm_offset_t *kernendp,
    bool exit_bs);

static int	elf64_exec(struct preloaded_file *amp);
static int	elf64_obj_exec(struct preloaded_file *amp);

static struct file_format amd64_elf = {
	.l_load = elf64_loadfile,
	.l_exec = elf64_exec,
};
static struct file_format amd64_elf_obj = {
	.l_load = elf64_obj_loadfile,
	.l_exec = elf64_obj_exec,
};

#ifdef EFI
extern struct file_format multiboot2;
extern struct file_format multiboot2_obj;
#endif

struct file_format *file_formats[] = {
#ifdef EFI
	&multiboot2,
	&multiboot2_obj,
#endif
	&amd64_elf,
	&amd64_elf_obj,
	NULL
};

#ifndef	EFI
/*
 * We create the stack that we want. We have the address of the page tables
 * we make on top (so we pop that off and set %cr3). We have the entry point
 * to the kernel (which retq pops off). This leaves the stack that btext
 * wants: offset 4 is modulep and offset 8 is kernend, with the filler bytes
 * to keep this aligned. This makes the trampoline very simple.
 */
struct trampoline_data {
	uint64_t	pt4;			// Page table address to pop
	uint64_t	entry;			// return address to jump to kernel
	uint32_t	fill1;			// 0
	uint32_t	modulep;		// 4 module metadata
	uint32_t	kernend;		// 8 kernel end
	uint32_t	fill2;			// 12
};
_Static_assert(sizeof(struct trampoline_data) == 32, "Bad size for trampoline data");
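
/*
 * Illustrative sketch only (the real code lives in the trampoline assembly
 * source): once %rsp points at a struct trampoline_data, the trampoline can
 * consume the layout above with just
 *
 *	popq	%rax		// pt4: new page table root
 *	movq	%rax, %cr3	// switch to the new page tables
 *	retq			// "return" into the kernel entry point,
 *				// leaving fill1/modulep/kernend/fill2 on
 *				// the stack where btext expects them
 */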
#endif

static pml4_entry_t *PT4;
static pdp_entry_t *PT3_l, *PT3_u;
static pd_entry_t *PT2_l0, *PT2_l1, *PT2_l2, *PT2_l3, *PT2_u0, *PT2_u1;

#ifdef EFI
static pdp_entry_t *PT3;
static pd_entry_t *PT2;

extern EFI_PHYSICAL_ADDRESS staging;

static void (*trampoline)(uint64_t stack, void *copy_finish, uint64_t kernend,
    uint64_t modulep, pml4_entry_t *pagetable, uint64_t entry);
#endif

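/*
 * The trampoline blob itself: tramp/tramp_size (and, for kboot,
 * tramp_data_offset) are presumably provided by the accompanying trampoline
 * assembly source; we copy the blob into place below.
 */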
extern uintptr_t tramp;
extern uint32_t tramp_size;
#ifndef EFI
extern uint32_t tramp_data_offset;
#endif

/*
 * There is an ELF kernel and one or more ELF modules loaded.
 * We wish to start executing the kernel image, so make such
 * preparations as are required, and do so.
 */
static int
elf64_exec(struct preloaded_file *fp)
{
	struct file_metadata	*md;
	Elf_Ehdr 		*ehdr;
	vm_offset_t		modulep, kernend;
	int			err, i;
	char			buf[24];
#ifdef EFI
	ACPI_TABLE_RSDP		*rsdp = NULL;
	int			revision;
	int			copy_auto;
	vm_offset_t		trampstack, trampcode;
#else
	vm_offset_t		rsdp = 0;
	void			*trampcode;
	int			nseg;
	void			*kseg;
	vm_offset_t		trampolinebase;
	uint64_t		*trampoline;
	struct trampoline_data	*trampoline_data;
	vm_offset_t		staging;
	int			error;
#endif

#ifdef EFI
	copy_auto = copy_staging == COPY_STAGING_AUTO;
	if (copy_auto)
		copy_staging = fp->f_kernphys_relocatable ?
		    COPY_STAGING_DISABLE : COPY_STAGING_ENABLE;
#else
	/*
	 * Figure out where to put it.
	 *
	 * Linux does not allow kexec_load to place the new image at an
	 * arbitrary physical address. Ask arch_loadaddr to resolve the first
	 * available chunk of physical memory where loading is possible
	 * (load_addr).
	 *
	 * The kernel is loaded at the 'base' address in contiguous physical
	 * pages (using 2MB super pages). The first such page is unused by the
	 * kernel and serves as a good place to put not only the trampoline,
	 * but the page table pages that the trampoline needs to set up the
	 * proper kernel starting environment.
	 */
	staging = trampolinebase = kboot_get_phys_load_segment();
	trampolinebase += 1ULL << 20;	/* Copy trampoline to base + 1MB, kernel will wind up at 2MB */
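	/*
	 * The resulting physical layout, following the comments above:
	 *
	 *	base + 0MB:	unused by the kernel
	 *	base + 1MB:	trampoline, then the page table pages
	 *	base + 2MB:	start of the kernel proper
	 */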
	printf("Load address at %#jx\n", (uintmax_t)trampolinebase);
	printf("Relocation offset is %#jx\n", (uintmax_t)elf64_relocation_offset);
#endif

	/*
	 * Report the RSDP to the kernel. While this can be found with
	 * a BIOS boot, the RSDP may be elsewhere when booted from UEFI.
	 */
#ifdef EFI
	rsdp = efi_get_table(&acpi20_guid);
	if (rsdp == NULL) {
		rsdp = efi_get_table(&acpi_guid);
	}
#else
	rsdp = acpi_rsdp();
#endif
	if (rsdp != 0) {
		sprintf(buf, "0x%016llx", (unsigned long long)rsdp);
		setenv("acpi.rsdp", buf, 1);
	}
	if ((md = file_findmetadata(fp, MODINFOMD_ELFHDR)) == NULL)
		return (EFTYPE);
	ehdr = (Elf_Ehdr *)&(md->md_data);

#ifdef EFI
	trampcode = copy_staging == COPY_STAGING_ENABLE ?
	    (vm_offset_t)0x0000000040000000 /* 1G */ :
	    (vm_offset_t)0x0000000100000000; /* 4G */
	err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 1,
	    (EFI_PHYSICAL_ADDRESS *)&trampcode);
	if (EFI_ERROR(err)) {
		printf("Unable to allocate trampoline\n");
		if (copy_auto)
			copy_staging = COPY_STAGING_AUTO;
		return (ENOMEM);
	}
	/* Stack starts at the last 8-byte slot of the trampoline page */
	trampstack = trampcode + LOADER_PAGE_SIZE - 8;
#else
	// XXX Question: why not just use malloc?
	trampcode = host_getmem(LOADER_PAGE_SIZE);
	if (trampcode == NULL) {
		printf("Unable to allocate trampoline\n");
		return (ENOMEM);
	}
#endif
	bzero((void *)trampcode, LOADER_PAGE_SIZE);
	bcopy((void *)&tramp, (void *)trampcode, tramp_size);
	trampoline = (void *)trampcode;

#ifdef EFI
	if (copy_staging == COPY_STAGING_ENABLE) {
		PT4 = (pml4_entry_t *)0x0000000040000000; /* 1G */
		err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 3,
		    (EFI_PHYSICAL_ADDRESS *)&PT4);
		if (EFI_ERROR(err)) {
			printf("Unable to allocate trampoline page table\n");
			BS->FreePages(trampcode, 1);
			if (copy_auto)
				copy_staging = COPY_STAGING_AUTO;
			return (ENOMEM);
		}
		bzero(PT4, 3 * LOADER_PAGE_SIZE);
		PT3 = &PT4[512];
		PT2 = &PT3[512];

		/*
		 * This is kinda brutal, but every single 1GB VM
		 * memory segment points to the same first 1GB of
		 * physical memory.  But it is more than adequate.
		 */
		for (i = 0; i < NPTEPG; i++) {
			/*
			 * Each slot of the L4 pages points to the
			 * same L3 page.
			 */
			PT4[i] = (pml4_entry_t)PT3;
			PT4[i] |= PG_V | PG_RW;

			/*
			 * Each slot of the L3 pages points to the
			 * same L2 page.
			 */
			PT3[i] = (pdp_entry_t)PT2;
			PT3[i] |= PG_V | PG_RW;

			/*
			 * The L2 page slots are mapped with 2MB pages for 1GB.
			 */
			PT2[i] = (pd_entry_t)i * (2 * 1024 * 1024);
			PT2[i] |= PG_V | PG_RW | PG_PS;
		}
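
		/*
		 * With this aliasing, any virtual address resolves to its
		 * low 30 bits as a physical address: VA 0 and KERNBASE
		 * (0xffffffff80000000) both land on PA 0, for example.
		 */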
	} else {
		PT4 = (pml4_entry_t *)0x0000000100000000; /* 4G */
		err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 9,
		    (EFI_PHYSICAL_ADDRESS *)&PT4);
		if (EFI_ERROR(err)) {
			printf("Unable to allocate trampoline page table\n");
			BS->FreePages(trampcode, 1);	/* trampcode is one page */
			if (copy_auto)
				copy_staging = COPY_STAGING_AUTO;
			return (ENOMEM);
		}
		bzero(PT4, 9 * LOADER_PAGE_SIZE);

		PT3_l = &PT4[NPML4EPG * 1];
		PT3_u = &PT4[NPML4EPG * 2];
		PT2_l0 = &PT4[NPML4EPG * 3];
		PT2_l1 = &PT4[NPML4EPG * 4];
		PT2_l2 = &PT4[NPML4EPG * 5];
		PT2_l3 = &PT4[NPML4EPG * 6];
		PT2_u0 = &PT4[NPML4EPG * 7];
		PT2_u1 = &PT4[NPML4EPG * 8];

		/* 1:1 mapping of lower 4G */
		PT4[0] = (pml4_entry_t)PT3_l | PG_V | PG_RW;
		PT3_l[0] = (pdp_entry_t)PT2_l0 | PG_V | PG_RW;
		PT3_l[1] = (pdp_entry_t)PT2_l1 | PG_V | PG_RW;
		PT3_l[2] = (pdp_entry_t)PT2_l2 | PG_V | PG_RW;
		PT3_l[3] = (pdp_entry_t)PT2_l3 | PG_V | PG_RW;
		for (i = 0; i < 4 * NPDEPG; i++) {	/* we overflow PT2_l0 into _l1, etc */
			PT2_l0[i] = ((pd_entry_t)i << PDRSHIFT) | PG_V |
			    PG_RW | PG_PS;
		}

		/* mapping of kernel 2G below top */
		PT4[NPML4EPG - 1] = (pml4_entry_t)PT3_u | PG_V | PG_RW;
		PT3_u[NPDPEPG - 2] = (pdp_entry_t)PT2_u0 | PG_V | PG_RW;
		PT3_u[NPDPEPG - 1] = (pdp_entry_t)PT2_u1 | PG_V | PG_RW;
		/* compat mapping of phys @0 */
		PT2_u0[0] = PG_PS | PG_V | PG_RW;
		/* this maps past staging area */
		for (i = 1; i < 2 * NPDEPG; i++) {	/* we overflow PT2_u0 into _u1 */
			PT2_u0[i] = ((pd_entry_t)staging +
			    ((pd_entry_t)i - 1) * NBPDR) |
			    PG_V | PG_RW | PG_PS;
		}
	}
#else
	{
		vm_offset_t pabase, pa_pt3_l, pa_pt3_u, pa_pt2_l0, pa_pt2_l1,
		    pa_pt2_l2, pa_pt2_l3, pa_pt2_u0, pa_pt2_u1;

		/* We'll find a place for these later */
		PT4 = (pml4_entry_t *)host_getmem(9 * LOADER_PAGE_SIZE);
		bzero(PT4, 9 * LOADER_PAGE_SIZE);

		PT3_l = &PT4[NPML4EPG * 1];
		PT3_u = &PT4[NPML4EPG * 2];
		PT2_l0 = &PT4[NPML4EPG * 3];
		PT2_l1 = &PT4[NPML4EPG * 4];
		PT2_l2 = &PT4[NPML4EPG * 5];
		PT2_l3 = &PT4[NPML4EPG * 6];
		PT2_u0 = &PT4[NPML4EPG * 7];
		PT2_u1 = &PT4[NPML4EPG * 8];

		pabase = trampolinebase + LOADER_PAGE_SIZE;
		pa_pt3_l = pabase + LOADER_PAGE_SIZE * 1;
		pa_pt3_u = pabase + LOADER_PAGE_SIZE * 2;
		pa_pt2_l0 = pabase + LOADER_PAGE_SIZE * 3;
		pa_pt2_l1 = pabase + LOADER_PAGE_SIZE * 4;
		pa_pt2_l2 = pabase + LOADER_PAGE_SIZE * 5;
		pa_pt2_l3 = pabase + LOADER_PAGE_SIZE * 6;
		pa_pt2_u0 = pabase + LOADER_PAGE_SIZE * 7;
		pa_pt2_u1 = pabase + LOADER_PAGE_SIZE * 8;

		/* 1:1 mapping of lower 4G */
		PT4[0] = (pml4_entry_t)pa_pt3_l | PG_V | PG_RW;
		PT3_l[0] = (pdp_entry_t)pa_pt2_l0 | PG_V | PG_RW;
		PT3_l[1] = (pdp_entry_t)pa_pt2_l1 | PG_V | PG_RW;
		PT3_l[2] = (pdp_entry_t)pa_pt2_l2 | PG_V | PG_RW;
		PT3_l[3] = (pdp_entry_t)pa_pt2_l3 | PG_V | PG_RW;
		for (i = 0; i < 4 * NPDEPG; i++) {	/* we overflow PT2_l0 into _l1, etc */
			PT2_l0[i] = ((pd_entry_t)i << PDRSHIFT) | PG_V |
			    PG_RW | PG_PS;
		}

		/* mapping of kernel 2G below top */
		PT4[NPML4EPG - 1] = (pml4_entry_t)pa_pt3_u | PG_V | PG_RW;
		PT3_u[NPDPEPG - 2] = (pdp_entry_t)pa_pt2_u0 | PG_V | PG_RW;
		PT3_u[NPDPEPG - 1] = (pdp_entry_t)pa_pt2_u1 | PG_V | PG_RW;
		/* compat mapping of phys @0 */
		PT2_u0[0] = PG_PS | PG_V | PG_RW;
		/* this maps past staging area */
		/*
		 * The kernel uses the KERNSTART (== KERNBASE + 2MB) entry to
		 * figure out where we loaded the kernel. That entry is
		 * PT2_u0[1] (since these slots map 2MB pages), so the PA
		 * that it maps has to be kboot's staging + 2MB. For UEFI we
		 * use 'i - 1' since we load the kernel right at staging (and
		 * assume the first address we load is 2MB in efi_copyin).
		 * For kboot, however, staging + 1 * NBPDR == staging + 2MB,
		 * which is where the kernel starts. Our trampoline need not
		 * be mapped into the kernel space since we execute PA==VA
		 * for that, and the trampoline can just go away once the
		 * kernel is called.
		 *
		 * Staging should likely be as low as possible, though,
		 * because all the 'early' allocations are at kernend (which
		 * the kernel calls physfree).
		 */
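		/*
		 * Worked example with an assumed staging of 0x4000000 (64MB):
		 * PT2_u0[1] then maps KERNBASE + 2MB to PA
		 * 0x4000000 + 1 * NBPDR == 0x4200000, i.e. exactly where the
		 * 2MB-aligned kernel text begins within the staging area.
		 */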
		for (i = 1; i < 2 * NPDEPG; i++) {	/* we overflow PT2_u0 into _u1 */
			PT2_u0[i] = ((pd_entry_t)staging +
			    ((pd_entry_t)i) * NBPDR) |
			    PG_V | PG_RW | PG_PS;
			if (i < 10)
				printf("Mapping %d to %#lx staging %#lx\n",
				    i, PT2_u0[i], staging);
		}
	}
#endif

#ifdef EFI
	printf("staging %#lx (%scopying) tramp %p PT4 %p\n",
	    staging, copy_staging == COPY_STAGING_ENABLE ? "" : "not ",
	    trampoline, PT4);
#else
	printf("staging %#lx tramp %p PT4 %p\n", staging,
	    (void *)trampolinebase,
	    (void *)(trampolinebase + LOADER_PAGE_SIZE));
#endif
	printf("Start @ 0x%lx ...\n", ehdr->e_entry);

#ifdef EFI
	efi_time_fini();
#endif
	err = bi_load(fp->f_args, &modulep, &kernend, true);
	if (err != 0) {
#ifdef EFI
		efi_time_init();
		if (copy_auto)
			copy_staging = COPY_STAGING_AUTO;
#endif
		return (err);
	}

	dev_cleanup();

#ifdef EFI
	trampoline(trampstack, copy_staging == COPY_STAGING_ENABLE ?
	    efi_copy_finish : efi_copy_finish_nop, kernend, modulep,
	    PT4, ehdr->e_entry);
#else
	trampoline_data = (void *)trampoline + tramp_data_offset;
	trampoline_data->entry = ehdr->e_entry;
	trampoline_data->pt4 = trampolinebase + LOADER_PAGE_SIZE;
	/*
	 * The kernel computes the VA of the module data as modulep +
	 * KERNBASE, so we need to make sure that address is mapped
	 * correctly. We calculate the start of available memory to allocate
	 * via kernend (which is calculated with a physaddr of "kernend +
	 * PA(PT2_u0[1])"), so we'd better make sure we're not overwriting
	 * the last 2MB of the kernel :).
	 */
	trampoline_data->modulep = modulep;	/* Offset from KERNBASE */
	trampoline_data->kernend = kernend;	/* Offset from the load address */
	trampoline_data->fill1 = trampoline_data->fill2 = 0;
	printf("Modulep = %lx kernend %lx\n", modulep, kernend);
	/*
	 * NOTE: when copying in, the destination is relative to the start of
	 * our 'area', not an absolute address.
	 */
	/* Copy the trampoline to the ksegs */
	archsw.arch_copyin((void *)trampcode, trampolinebase - staging, tramp_size);
	/* Copy the page table to the ksegs */
	archsw.arch_copyin(PT4, trampoline_data->pt4 - staging, 9 * LOADER_PAGE_SIZE);

	kboot_kseg_get(&nseg, &kseg);
	error = host_kexec_load(trampolinebase, nseg, kseg, HOST_KEXEC_ARCH_X86_64);
	if (error != 0)
		panic("kexec_load returned error: %d", error);
	host_reboot(HOST_REBOOT_MAGIC1, HOST_REBOOT_MAGIC2, HOST_REBOOT_CMD_KEXEC, 0);
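	/*
	 * The HOST_REBOOT_CMD_KEXEC reboot above hands control to the image
	 * we just staged with host_kexec_load; on success it never returns,
	 * so the panic below fires only if something went wrong.
	 */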
#endif

	panic("exec returned");
}

static int
elf64_obj_exec(struct preloaded_file *fp)
{

	return (EFTYPE);
}