/*-
 * Copyright (c) 1998 Michael Smith <msmith@freebsd.org>
 * Copyright (c) 2014 The FreeBSD Foundation
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#define __ELF_WORD_SIZE 64
#include <sys/param.h>
#include <sys/exec.h>
#include <sys/linker.h>
#include <string.h>
#include <machine/elf.h>
#include <stand.h>
#include <vm/vm.h>
#include <vm/pmap.h>

#ifdef EFI
#include <efi.h>
#include <efilib.h>
#else
#include "host_syscall.h"
#endif

#include "bootstrap.h"
#include "kboot.h"

#include "platform/acfreebsd.h"
#include "acconfig.h"
#define ACPI_SYSTEM_XFACE
#include "actypes.h"
#include "actbl.h"

#ifdef EFI
#include "loader_efi.h"

static EFI_GUID acpi_guid = ACPI_TABLE_GUID;
static EFI_GUID acpi20_guid = ACPI_20_TABLE_GUID;
#endif

#ifdef EFI
#define LOADER_PAGE_SIZE EFI_PAGE_SIZE
#else
#define LOADER_PAGE_SIZE PAGE_SIZE
#endif
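
/*
 * On amd64 both EFI_PAGE_SIZE and PAGE_SIZE are 4096 bytes; the separate
 * macro just picks up the constant from the right set of headers.
 */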

extern int bi_load(char *args, vm_offset_t *modulep, vm_offset_t *kernendp,
    bool exit_bs);
static int	elf64_exec(struct preloaded_file *fp);
static int	elf64_obj_exec(struct preloaded_file *fp);

static struct file_format amd64_elf = {
	.l_load = elf64_loadfile,
	.l_exec = elf64_exec,
};
static struct file_format amd64_elf_obj = {
	.l_load = elf64_obj_loadfile,
	.l_exec = elf64_obj_exec,
};

#ifdef EFI
extern struct file_format multiboot2;
extern struct file_format multiboot2_obj;
#endif

struct file_format *file_formats[] = {
#ifdef EFI
	&multiboot2,
	&multiboot2_obj,
#endif
	&amd64_elf,
	&amd64_elf_obj,
	NULL
};

#ifndef	EFI
/*
 * We create the stack that we want. The address of the page tables we
 * build is on top (the trampoline pops it off and loads %cr3 with it).
 * Next is the kernel entry point (which retq pops off to jump into the
 * kernel). That leaves the stack that btext expects: offset 4 is
 * modulep and offset 8 is kernend, with filler words to keep everything
 * aligned. This keeps the trampoline itself very simple.
 */
struct trampoline_data {
	uint64_t	pt4;			// Page table address to pop
	uint64_t	entry;			// return address to jump to kernel
	uint32_t	fill1;			// 0
	uint32_t	modulep;		// 4 module metadata
	uint32_t	kernend;		// 8 kernel end
	uint32_t	fill2;			// 12
};
_Static_assert(sizeof(struct trampoline_data) == 32, "Bad size for trampoline data");
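
/*
 * Rough sketch of the trampoline this layout implies (for illustration
 * only; the real code lives in the trampoline assembly source):
 *
 *	popq	%rax		# pt4: new page table root
 *	movq	%rax, %cr3	# switch to the kernel's page tables
 *	retq			# pops entry, jumping into the kernel
 *
 * leaving fill1/modulep/kernend/fill2 on the stack for btext to consume.
 */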
#endif

static pml4_entry_t *PT4;
static pdp_entry_t *PT3_l, *PT3_u;
static pd_entry_t *PT2_l0, *PT2_l1, *PT2_l2, *PT2_l3, *PT2_u0, *PT2_u1;

#ifdef EFI
static pdp_entry_t *PT3;
static pd_entry_t *PT2;

extern EFI_PHYSICAL_ADDRESS staging;

static void (*trampoline)(uint64_t stack, void *copy_finish, uint64_t kernend,
    uint64_t modulep, pml4_entry_t *pagetable, uint64_t entry);
#endif

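/*
 * Trampoline text and size, plus (for kboot) the offset of struct
 * trampoline_data within it. These symbols are assumed to be provided
 * by the trampoline assembly stub.
 */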
extern uintptr_t tramp;
extern uint32_t tramp_size;
#ifndef EFI
extern uint32_t tramp_data_offset;
#endif

/*
 * There is an ELF kernel and one or more ELF modules loaded.
 * We wish to start executing the kernel image, so make such
 * preparations as are required, and do so.
 */
static int
elf64_exec(struct preloaded_file *fp)
{
	struct file_metadata	*md;
	Elf_Ehdr		*ehdr;
	vm_offset_t		modulep, kernend;
	int			err, i;
	char			buf[24];
#ifdef EFI
	ACPI_TABLE_RSDP		*rsdp = NULL;
	int			copy_auto;
	vm_offset_t		trampstack, trampcode;
#else
	vm_offset_t		rsdp = 0;
	void			*trampcode;
	int			nseg;
	void			*kseg;
	vm_offset_t		trampolinebase;
	uint64_t		*trampoline;
	struct trampoline_data	*trampoline_data;
	vm_offset_t		staging;
	int			error;
#endif

#ifdef EFI
	copy_auto = copy_staging == COPY_STAGING_AUTO;
	if (copy_auto)
		copy_staging = fp->f_kernphys_relocatable ?
		    COPY_STAGING_DISABLE : COPY_STAGING_ENABLE;
#else
	/*
	 * Figure out where to put it.
	 *
	 * Linux does not allow us to kexec_load into arbitrary memory. Ask
	 * arch_loadaddr to resolve the first available chunk of physical
	 * memory where loading is possible (load_addr).
	 *
	 * The kernel is loaded at the 'base' address in contiguous physical
	 * pages (using 2MB super pages). The first such page is unused by the
	 * kernel and serves as a good place to put not only the trampoline,
	 * but the page table pages that the trampoline needs to set up the
	 * proper kernel starting environment.
	 */
	staging = trampolinebase = kboot_get_phys_load_segment();
	trampolinebase += 1ULL << 20;	/* Copy trampoline to base + 1MB; kernel will wind up at 2MB */
	printf("Load address at %#jx\n", (uintmax_t)trampolinebase);
	printf("Relocation offset is %#jx\n", (uintmax_t)elf64_relocation_offset);
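
	/*
	 * Resulting physical layout (sketch):
	 *	base + 1MB		trampoline (one page)
	 *	base + 1MB + 4KB	nine page table pages, PT4 first
	 *	base + 2MB		kernel text (staging + 2MB)
	 */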
#endif

	/*
	 * Report the RSDP to the kernel. While this can be found with
	 * a BIOS boot, the RSDP may be elsewhere when booted from UEFI.
	 */
#ifdef EFI
	rsdp = efi_get_table(&acpi20_guid);
	if (rsdp == NULL) {
		rsdp = efi_get_table(&acpi_guid);
	}
#else
	rsdp = acpi_rsdp();
#endif
	if (rsdp != 0) {
		sprintf(buf, "0x%016llx", (unsigned long long)rsdp);
		setenv("acpi.rsdp", buf, 1);
	}

	if ((md = file_findmetadata(fp, MODINFOMD_ELFHDR)) == NULL)
		return (EFTYPE);
	ehdr = (Elf_Ehdr *)&(md->md_data);

#ifdef EFI
	trampcode = copy_staging == COPY_STAGING_ENABLE ?
	    (vm_offset_t)0x0000000040000000 /* 1G */ :
	    (vm_offset_t)0x0000000100000000; /* 4G */
	err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 1,
	    (EFI_PHYSICAL_ADDRESS *)&trampcode);
	if (EFI_ERROR(err)) {
		printf("Unable to allocate trampoline\n");
		if (copy_auto)
			copy_staging = COPY_STAGING_AUTO;
		return (ENOMEM);
	}
	trampstack = trampcode + LOADER_PAGE_SIZE - 8;
#else
	// XXX Question: why not just use malloc?
	trampcode = host_getmem(LOADER_PAGE_SIZE);
	if (trampcode == NULL) {
		printf("Unable to allocate trampoline\n");
		return (ENOMEM);
	}
#endif
	bzero((void *)trampcode, LOADER_PAGE_SIZE);
	bcopy((void *)&tramp, (void *)trampcode, tramp_size);
	trampoline = (void *)trampcode;

#ifdef EFI
	if (copy_staging == COPY_STAGING_ENABLE) {
		PT4 = (pml4_entry_t *)0x0000000040000000; /* 1G */
		err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 3,
		    (EFI_PHYSICAL_ADDRESS *)&PT4);
		if (EFI_ERROR(err)) {
			printf("Unable to allocate trampoline page table\n");
			BS->FreePages(trampcode, 1);
			if (copy_auto)
				copy_staging = COPY_STAGING_AUTO;
			return (ENOMEM);
		}
		bzero(PT4, 3 * LOADER_PAGE_SIZE);
		PT3 = &PT4[512];
		PT2 = &PT3[512];

		/*
		 * This is kinda brutal, but every single 1GB VM
		 * memory segment points to the same first 1GB of
		 * physical memory.  But it is more than adequate.
		 */
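		/*
		 * Worked example: VA 0xffffffff80201000 (2MB + 4KB above
		 * KERNBASE) walks L4 slot 511 -> PT3, L3 slot 510 -> PT2,
		 * L2 slot 1 -> the 2MB page at PA 0x200000. Every VA
		 * resolves to VA % 1GB, matching the kernel copied to 2MB.
		 */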
		for (i = 0; i < NPTEPG; i++) {
			/*
			 * Each slot of the L4 pages points to the
			 * same L3 page.
			 */
			PT4[i] = (pml4_entry_t)PT3;
			PT4[i] |= PG_V | PG_RW;

			/*
			 * Each slot of the L3 pages points to the
			 * same L2 page.
			 */
			PT3[i] = (pdp_entry_t)PT2;
			PT3[i] |= PG_V | PG_RW;

			/*
			 * The L2 page slots are mapped with 2MB pages for 1GB.
			 */
			PT2[i] = (pd_entry_t)i * (2 * 1024 * 1024);
			PT2[i] |= PG_V | PG_RW | PG_PS;
		}
	} else {
		PT4 = (pml4_entry_t *)0x0000000100000000; /* 4G */
		err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 9,
		    (EFI_PHYSICAL_ADDRESS *)&PT4);
		if (EFI_ERROR(err)) {
			printf("Unable to allocate trampoline page table\n");
			BS->FreePages(trampcode, 1);
			if (copy_auto)
				copy_staging = COPY_STAGING_AUTO;
			return (ENOMEM);
		}
		bzero(PT4, 9 * LOADER_PAGE_SIZE);

		PT3_l = &PT4[NPML4EPG * 1];
		PT3_u = &PT4[NPML4EPG * 2];
		PT2_l0 = &PT4[NPML4EPG * 3];
		PT2_l1 = &PT4[NPML4EPG * 4];
		PT2_l2 = &PT4[NPML4EPG * 5];
		PT2_l3 = &PT4[NPML4EPG * 6];
		PT2_u0 = &PT4[NPML4EPG * 7];
		PT2_u1 = &PT4[NPML4EPG * 8];

		/* 1:1 mapping of lower 4G */
		PT4[0] = (pml4_entry_t)PT3_l | PG_V | PG_RW;
		PT3_l[0] = (pdp_entry_t)PT2_l0 | PG_V | PG_RW;
		PT3_l[1] = (pdp_entry_t)PT2_l1 | PG_V | PG_RW;
		PT3_l[2] = (pdp_entry_t)PT2_l2 | PG_V | PG_RW;
		PT3_l[3] = (pdp_entry_t)PT2_l3 | PG_V | PG_RW;
		/* The four PT2_l pages are contiguous, so index past PT2_l0 */
		for (i = 0; i < 4 * NPDEPG; i++) {
			PT2_l0[i] = ((pd_entry_t)i << PDRSHIFT) | PG_V |
			    PG_RW | PG_PS;
		}

		/* mapping of kernel 2G below top */
		PT4[NPML4EPG - 1] = (pml4_entry_t)PT3_u | PG_V | PG_RW;
		PT3_u[NPDPEPG - 2] = (pdp_entry_t)PT2_u0 | PG_V | PG_RW;
		PT3_u[NPDPEPG - 1] = (pdp_entry_t)PT2_u1 | PG_V | PG_RW;
		/* compat mapping of phys @0 */
		PT2_u0[0] = PG_PS | PG_V | PG_RW;
		/* this maps past the staging area; PT2_u0 overflows into PT2_u1 */
		for (i = 1; i < 2 * NPDEPG; i++) {
			PT2_u0[i] = ((pd_entry_t)staging +
			    ((pd_entry_t)i - 1) * NBPDR) |
			    PG_V | PG_RW | PG_PS;
		}
	}
#else
	{
		vm_offset_t pabase, pa_pt3_l, pa_pt3_u;
		vm_offset_t pa_pt2_l0, pa_pt2_l1, pa_pt2_l2, pa_pt2_l3;
		vm_offset_t pa_pt2_u0, pa_pt2_u1;

		/* We'll find a place for these later */
		PT4 = (pml4_entry_t *)host_getmem(9 * LOADER_PAGE_SIZE);
		bzero(PT4, 9 * LOADER_PAGE_SIZE);

		PT3_l = &PT4[NPML4EPG * 1];
		PT3_u = &PT4[NPML4EPG * 2];
		PT2_l0 = &PT4[NPML4EPG * 3];
		PT2_l1 = &PT4[NPML4EPG * 4];
		PT2_l2 = &PT4[NPML4EPG * 5];
		PT2_l3 = &PT4[NPML4EPG * 6];
		PT2_u0 = &PT4[NPML4EPG * 7];
		PT2_u1 = &PT4[NPML4EPG * 8];

		pabase = trampolinebase + LOADER_PAGE_SIZE;
		pa_pt3_l = pabase + LOADER_PAGE_SIZE * 1;
		pa_pt3_u = pabase + LOADER_PAGE_SIZE * 2;
		pa_pt2_l0 = pabase + LOADER_PAGE_SIZE * 3;
		pa_pt2_l1 = pabase + LOADER_PAGE_SIZE * 4;
		pa_pt2_l2 = pabase + LOADER_PAGE_SIZE * 5;
		pa_pt2_l3 = pabase + LOADER_PAGE_SIZE * 6;
		pa_pt2_u0 = pabase + LOADER_PAGE_SIZE * 7;
		pa_pt2_u1 = pabase + LOADER_PAGE_SIZE * 8;

		/* 1:1 mapping of lower 4G */
		PT4[0] = (pml4_entry_t)pa_pt3_l | PG_V | PG_RW;
		PT3_l[0] = (pdp_entry_t)pa_pt2_l0 | PG_V | PG_RW;
		PT3_l[1] = (pdp_entry_t)pa_pt2_l1 | PG_V | PG_RW;
		PT3_l[2] = (pdp_entry_t)pa_pt2_l2 | PG_V | PG_RW;
		PT3_l[3] = (pdp_entry_t)pa_pt2_l3 | PG_V | PG_RW;
		for (i = 0; i < 4 * NPDEPG; i++) {	/* we overflow PT2_l0 into _l1, etc */
			PT2_l0[i] = ((pd_entry_t)i << PDRSHIFT) | PG_V |
			    PG_RW | PG_PS;
		}

		/* mapping of kernel 2G below top */
		PT4[NPML4EPG - 1] = (pml4_entry_t)pa_pt3_u | PG_V | PG_RW;
		PT3_u[NPDPEPG - 2] = (pdp_entry_t)pa_pt2_u0 | PG_V | PG_RW;
		PT3_u[NPDPEPG - 1] = (pdp_entry_t)pa_pt2_u1 | PG_V | PG_RW;
		/* compat mapping of phys @0 */
		PT2_u0[0] = PG_PS | PG_V | PG_RW;
		/*
		 * This maps the staging area and beyond. The kernel uses the
		 * KERNSTART (== KERNBASE + 2MB) entry to figure out where we
		 * loaded it. That entry is PT2_u0[1] (since these map 2MB
		 * pages), so the PA it maps has to be kboot's staging + 2MB.
		 * For UEFI we use 'i - 1' since we load the kernel right at
		 * staging (and efi_copyin assumes the first address we load
		 * is 2MB). For kboot, however, staging + 1 * NBPDR ==
		 * staging + 2MB, which is where the kernel starts, so we use
		 * 'i'. Our trampoline need not be mapped into the kernel's
		 * space since we execute it PA==VA, and the trampoline can
		 * just go away once the kernel is called.
		 *
		 * Staging should likely be as low as possible, though, because
		 * all the 'early' allocations are at kernend (which the kernel
		 * calls physfree).
		 */
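		/*
		 * Worked example (hypothetical numbers): with staging at
		 * 16MB, PT2_u0[1] maps KERNSTART to 16MB + 2MB = 18MB,
		 * exactly where arch_copyin placed the kernel below.
		 */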
		for (i = 1; i < 2 * NPDEPG; i++) {	/* we overflow PT2_u0 into _u1 */
			PT2_u0[i] = ((pd_entry_t)staging +
			    ((pd_entry_t)i) * NBPDR) |
			    PG_V | PG_RW | PG_PS;
			if (i < 10)
				printf("Mapping %d to %#lx staging %#lx\n",
				    i, PT2_u0[i], staging);
		}
	}
#endif

#ifdef EFI
	printf("staging %#lx (%scopying) tramp %p PT4 %p\n",
	    staging, copy_staging == COPY_STAGING_ENABLE ? "" : "not ",
	    trampoline, PT4);
#else
	printf("staging %#lx tramp %p PT4 %p\n", staging,
	    (void *)trampolinebase,
	    (void *)(trampolinebase + LOADER_PAGE_SIZE));
#endif
	printf("Start @ 0x%lx ...\n", ehdr->e_entry);

#ifdef EFI
	efi_time_fini();
#endif
	err = bi_load(fp->f_args, &modulep, &kernend, true);
	if (err != 0) {
#ifdef EFI
		efi_time_init();
		if (copy_auto)
			copy_staging = COPY_STAGING_AUTO;
#endif
		return (err);
	}

	dev_cleanup();

#ifdef EFI
	trampoline(trampstack, copy_staging == COPY_STAGING_ENABLE ?
	    efi_copy_finish : efi_copy_finish_nop, kernend, modulep,
	    PT4, ehdr->e_entry);
#else
	trampoline_data = (void *)trampoline + tramp_data_offset;
	trampoline_data->entry = ehdr->e_entry;
	trampoline_data->pt4 = trampolinebase + LOADER_PAGE_SIZE;
	/*
	 * We compute the VA of the module data as modulep + KERNBASE, so we
	 * need to make sure that address is mapped correctly. We calculate
	 * the start of available memory to allocate via kernend (which is
	 * calculated with a physaddr of "kernend + PA(PT2_u0[1])"), so we
	 * had better not be overwriting the last 2MB of the kernel :).
	 */
	trampoline_data->modulep = modulep;	/* Offset from KERNBASE */
	trampoline_data->kernend = kernend;	/* Offset from the load address */
	trampoline_data->fill1 = trampoline_data->fill2 = 0;
	printf("Modulep = %lx kernend %lx\n", modulep, kernend);
	/*
	 * NOTE: when copying in, addresses are relative to the start of our
	 * 'area', not absolute physical addresses.
	 */
	/* Copy the trampoline to the ksegs */
	archsw.arch_copyin((void *)trampcode, trampolinebase - staging,
	    tramp_size);
	/* Copy the page table to the ksegs */
	archsw.arch_copyin(PT4, trampoline_data->pt4 - staging,
	    9 * LOADER_PAGE_SIZE);

	kboot_kseg_get(&nseg, &kseg);
	error = host_kexec_load(trampolinebase, nseg, kseg,
	    HOST_KEXEC_ARCH_X86_64);
	if (error != 0)
		panic("kexec_load returned error: %d", error);
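	/*
	 * Hand control to the host kernel: on success this reboots into
	 * the kexec image we just loaded and does not return.
	 */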
	host_reboot(HOST_REBOOT_MAGIC1, HOST_REBOOT_MAGIC2,
	    HOST_REBOOT_CMD_KEXEC, 0);
#endif

	panic("exec returned");
}

static int
elf64_obj_exec(struct preloaded_file *fp)
{

	return (EFTYPE);
}