1 /*
2  * Copyright 2014-2019 Advanced Micro Devices, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  */
23 
24 #include "ac_rtld.h"
25 
26 #include <gelf.h>
27 #include <libelf.h>
28 #include <stdarg.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32 
33 #include "ac_binary.h"
34 #include "ac_gpu_info.h"
35 #include "util/u_dynarray.h"
36 #include "util/u_math.h"
37 
38 // Old distributions may not have this enum constant
39 #define MY_EM_AMDGPU 224
40 
41 #ifndef STT_AMDGPU_LDS
42 #define STT_AMDGPU_LDS 13 // this is deprecated -- remove
43 #endif
44 
45 #ifndef SHN_AMDGPU_LDS
46 #define SHN_AMDGPU_LDS 0xff00
47 #endif
48 
49 #ifndef R_AMDGPU_NONE
50 #define R_AMDGPU_NONE 0
51 #define R_AMDGPU_ABS32_LO 1
52 #define R_AMDGPU_ABS32_HI 2
53 #define R_AMDGPU_ABS64 3
54 #define R_AMDGPU_REL32 4
55 #define R_AMDGPU_REL64 5
56 #define R_AMDGPU_ABS32 6
57 #define R_AMDGPU_GOTPCREL 7
58 #define R_AMDGPU_GOTPCREL32_LO 8
59 #define R_AMDGPU_GOTPCREL32_HI 9
60 #define R_AMDGPU_REL32_LO 10
61 #define R_AMDGPU_REL32_HI 11
62 #define R_AMDGPU_RELATIVE64 13
63 #endif
64 
65 /* For the UMR disassembler. */
66 #define DEBUGGER_END_OF_CODE_MARKER	0xbf9f0000 /* invalid instruction */
67 #define DEBUGGER_NUM_MARKERS		5
68 
/* Per-section layout info gathered while placing a part's ELF sections
 * into the final read/execute (rx) memory image. */
struct ac_rtld_section {
	bool is_rx : 1;          /* part of the read/execute image (SHF_ALLOC, non-NOTE) */
	bool is_pasted_text : 1; /* a ".text" section concatenated with other parts' .text */
	uint64_t offset;         /* byte offset of this section within the rx image */
	const char *name;        /* section name from the ELF section header string table */
};
75 
/* One shader part: its parsed ELF image plus per-section layout data. */
struct ac_rtld_part {
	Elf *elf;                         /* libelf handle; owned, freed in ac_rtld_close */
	struct ac_rtld_section *sections; /* indexed by ELF section index */
	unsigned num_sections;
};
81 
/* Format a message with the given va_list and print it to stderr with an
 * "ac_rtld error:" prefix. Degrades gracefully if formatting fails. */
static void report_erroraf(const char *fmt, va_list va)
{
	char *msg = NULL;

	if (vasprintf(&msg, fmt, va) >= 0) {
		fprintf(stderr, "ac_rtld error: %s\n", msg);
		free(msg);
	} else {
		/* vasprintf leaves msg undefined on failure; report that instead. */
		fprintf(stderr, "ac_rtld error: %s\n", "(vasprintf failed)");
	}
}
94 
/* printf-style error reporting; output goes to stderr via report_erroraf. */
static void report_errorf(const char *fmt, ...) PRINTFLIKE(1, 2);

static void report_errorf(const char *fmt, ...)
{
	va_list va;
	va_start(va, fmt);
	report_erroraf(fmt, va);
	va_end(va);
}
104 
/* Like report_errorf, but additionally prints the pending libelf error
 * message, for failures coming out of libelf calls. */
static void report_elf_errorf(const char *fmt, ...) PRINTFLIKE(1, 2);

static void report_elf_errorf(const char *fmt, ...)
{
	va_list va;
	va_start(va, fmt);
	report_erroraf(fmt, va);
	va_end(va);

	fprintf(stderr, "ELF error: %s\n", elf_errmsg(elf_errno()));
}
116 
117 /**
118  * Find a symbol in a dynarray of struct ac_rtld_symbol by \p name and shader
119  * \p part_idx.
120  */
find_symbol(const struct util_dynarray * symbols,const char * name,unsigned part_idx)121 static const struct ac_rtld_symbol *find_symbol(const struct util_dynarray *symbols,
122 						const char *name, unsigned part_idx)
123 {
124 	util_dynarray_foreach(symbols, struct ac_rtld_symbol, symbol) {
125 		if ((symbol->part_idx == ~0u || symbol->part_idx == part_idx) &&
126 		    !strcmp(name, symbol->name))
127 			return symbol;
128 	}
129 	return 0;
130 }
131 
compare_symbol_by_align(const void * lhsp,const void * rhsp)132 static int compare_symbol_by_align(const void *lhsp, const void *rhsp)
133 {
134 	const struct ac_rtld_symbol *lhs = lhsp;
135 	const struct ac_rtld_symbol *rhs = rhsp;
136 	if (rhs->align > lhs->align)
137 		return 1;
138 	if (rhs->align < lhs->align)
139 		return -1;
140 	return 0;
141 }
142 
143 /**
144  * Sort the given symbol list by decreasing alignment and assign offsets.
145  */
layout_symbols(struct ac_rtld_symbol * symbols,unsigned num_symbols,uint64_t * ptotal_size)146 static bool layout_symbols(struct ac_rtld_symbol *symbols, unsigned num_symbols,
147 			   uint64_t *ptotal_size)
148 {
149 	qsort(symbols, num_symbols, sizeof(*symbols), compare_symbol_by_align);
150 
151 	uint64_t total_size = *ptotal_size;
152 
153 	for (unsigned i = 0; i < num_symbols; ++i) {
154 		struct ac_rtld_symbol *s = &symbols[i];
155 		assert(util_is_power_of_two_nonzero(s->align));
156 
157 		total_size = align64(total_size, s->align);
158 		s->offset = total_size;
159 
160 		if (total_size + s->size < total_size) {
161 			report_errorf("%s: size overflow", __FUNCTION__);
162 			return false;
163 		}
164 
165 		total_size += s->size;
166 	}
167 
168 	*ptotal_size = total_size;
169 	return true;
170 }
171 
172 /**
173  * Read LDS symbols from the given \p section of the ELF of \p part and append
174  * them to the LDS symbols list.
175  *
176  * Shared LDS symbols are filtered out.
177  */
read_private_lds_symbols(struct ac_rtld_binary * binary,unsigned part_idx,Elf_Scn * section,uint32_t * lds_end_align)178 static bool read_private_lds_symbols(struct ac_rtld_binary *binary,
179 				     unsigned part_idx,
180 				     Elf_Scn *section,
181 				     uint32_t *lds_end_align)
182 {
183 #define report_if(cond) \
184 	do { \
185 		if ((cond)) { \
186 			report_errorf(#cond); \
187 			return false; \
188 		} \
189 	} while (false)
190 #define report_elf_if(cond) \
191 	do { \
192 		if ((cond)) { \
193 			report_elf_errorf(#cond); \
194 			return false; \
195 		} \
196 	} while (false)
197 
198 	struct ac_rtld_part *part = &binary->parts[part_idx];
199 	Elf64_Shdr *shdr = elf64_getshdr(section);
200 	uint32_t strtabidx = shdr->sh_link;
201 	Elf_Data *symbols_data = elf_getdata(section, NULL);
202 	report_elf_if(!symbols_data);
203 
204 	const Elf64_Sym *symbol = symbols_data->d_buf;
205 	size_t num_symbols = symbols_data->d_size / sizeof(Elf64_Sym);
206 
207 	for (size_t j = 0; j < num_symbols; ++j, ++symbol) {
208 		struct ac_rtld_symbol s = {};
209 
210 		if (ELF64_ST_TYPE(symbol->st_info) == STT_AMDGPU_LDS) {
211 			/* old-style LDS symbols from initial prototype -- remove eventually */
212 			s.align = MIN2(1u << (symbol->st_other >> 3), 1u << 16);
213 		} else if (symbol->st_shndx == SHN_AMDGPU_LDS) {
214 			s.align = MIN2(symbol->st_value, 1u << 16);
215 			report_if(!util_is_power_of_two_nonzero(s.align));
216 		} else
217 			continue;
218 
219 		report_if(symbol->st_size > 1u << 29);
220 
221 		s.name = elf_strptr(part->elf, strtabidx, symbol->st_name);
222 		s.size = symbol->st_size;
223 		s.part_idx = part_idx;
224 
225 		if (!strcmp(s.name, "__lds_end")) {
226 			report_elf_if(s.size != 0);
227 			*lds_end_align = MAX2(*lds_end_align, s.align);
228 			continue;
229 		}
230 
231 		const struct ac_rtld_symbol *shared =
232 			find_symbol(&binary->lds_symbols, s.name, part_idx);
233 		if (shared) {
234 			report_elf_if(s.align > shared->align);
235 			report_elf_if(s.size > shared->size);
236 			continue;
237 		}
238 
239 		util_dynarray_append(&binary->lds_symbols, struct ac_rtld_symbol, s);
240 	}
241 
242 	return true;
243 
244 #undef report_if
245 #undef report_elf_if
246 }
247 
/**
 * Open a binary consisting of one or more shader parts.
 *
 * \param binary the uninitialized struct
 * \param i binary opening parameters
 *
 * On failure, any partially-initialized state is released via ac_rtld_close.
 */
bool ac_rtld_open(struct ac_rtld_binary *binary,
		  struct ac_rtld_open_info i)
{
	/* One of the libelf implementations
	 * (http://www.mr511.de/software/english.htm) requires calling
	 * elf_version() before elf_memory().
	 */
	elf_version(EV_CURRENT);

	memset(binary, 0, sizeof(*binary));
	memcpy(&binary->options, &i.options, sizeof(binary->options));
	binary->wave_size = i.wave_size;
	binary->num_parts = i.num_parts;
	binary->parts = calloc(sizeof(*binary->parts), i.num_parts);
	if (!binary->parts)
		return false;

	uint64_t pasted_text_size = 0;
	uint64_t rx_align = 1;
	uint64_t rx_size = 0;
	uint64_t exec_size = 0;

	/* Error-check helpers; on failure they jump to the cleanup path. */
#define report_if(cond) \
	do { \
		if ((cond)) { \
			report_errorf(#cond); \
			goto fail; \
		} \
	} while (false)
#define report_elf_if(cond) \
	do { \
		if ((cond)) { \
			report_elf_errorf(#cond); \
			goto fail; \
		} \
	} while (false)

	/* Copy and layout shared LDS symbols. */
	if (i.num_shared_lds_symbols) {
		if (!util_dynarray_resize(&binary->lds_symbols, struct ac_rtld_symbol,
					  i.num_shared_lds_symbols))
			goto fail;

		memcpy(binary->lds_symbols.data, i.shared_lds_symbols, binary->lds_symbols.size);
	}

	/* Mark copied symbols as shared (part_idx == ~0u matches any part). */
	util_dynarray_foreach(&binary->lds_symbols, struct ac_rtld_symbol, symbol)
		symbol->part_idx = ~0u;

	/* LDS limit depends on chip generation and shader stage. */
	unsigned max_lds_size = 64 * 1024;

	if (i.info->chip_class == GFX6 ||
	    (i.shader_type != MESA_SHADER_COMPUTE &&
	     i.shader_type != MESA_SHADER_FRAGMENT))
		max_lds_size = 32 * 1024;

	uint64_t shared_lds_size = 0;
	if (!layout_symbols(binary->lds_symbols.data, i.num_shared_lds_symbols, &shared_lds_size))
		goto fail;

	if (shared_lds_size > max_lds_size) {
		fprintf(stderr, "ac_rtld error(1): too much LDS (used = %u, max = %u)\n",
			(unsigned)shared_lds_size, max_lds_size);
		goto fail;
	}
	binary->lds_size = shared_lds_size;

	/* First pass over all parts: open ELFs, pre-determine the placement of
	 * sections in the memory image, and collect and layout private LDS symbols. */
	uint32_t lds_end_align = 0;

	if (binary->options.halt_at_entry)
		/* Reserve 4 bytes for the s_sethalt instruction written at upload time. */
		pasted_text_size += 4;

	for (unsigned part_idx = 0; part_idx < i.num_parts; ++part_idx) {
		struct ac_rtld_part *part = &binary->parts[part_idx];
		unsigned part_lds_symbols_begin =
			util_dynarray_num_elements(&binary->lds_symbols, struct ac_rtld_symbol);

		part->elf = elf_memory((char *)i.elf_ptrs[part_idx], i.elf_sizes[part_idx]);
		report_elf_if(!part->elf);

		const Elf64_Ehdr *ehdr = elf64_getehdr(part->elf);
		report_elf_if(!ehdr);
		report_if(ehdr->e_machine != MY_EM_AMDGPU);

		size_t section_str_index;
		size_t num_shdrs;
		report_elf_if(elf_getshdrstrndx(part->elf, &section_str_index) < 0);
		report_elf_if(elf_getshdrnum(part->elf, &num_shdrs) < 0);

		part->num_sections = num_shdrs;
		part->sections = calloc(sizeof(*part->sections), num_shdrs);
		report_if(!part->sections);

		Elf_Scn *section = NULL;
		while ((section = elf_nextscn(part->elf, section))) {
			Elf64_Shdr *shdr = elf64_getshdr(section);
			struct ac_rtld_section *s = &part->sections[elf_ndxscn(section)];
			s->name = elf_strptr(part->elf, section_str_index, shdr->sh_name);
			report_elf_if(!s->name);

			/* Cannot actually handle linked objects yet */
			report_elf_if(shdr->sh_addr != 0);

			/* Alignment must be 0 or a power of two */
			report_elf_if(shdr->sh_addralign & (shdr->sh_addralign - 1));
			uint64_t sh_align = MAX2(shdr->sh_addralign, 1);

			if (shdr->sh_flags & SHF_ALLOC &&
			    shdr->sh_type != SHT_NOTE) {
				/* Only read/execute sections are supported. */
				report_if(shdr->sh_flags & SHF_WRITE);

				s->is_rx = true;

				if (shdr->sh_flags & SHF_EXECINSTR) {
					/* Instruction streams must be dword-aligned in size. */
					report_elf_if(shdr->sh_size & 3);

					if (!strcmp(s->name, ".text"))
						s->is_pasted_text = true;

					exec_size += shdr->sh_size;
				}

				if (s->is_pasted_text) {
					/* .text sections are concatenated across parts. */
					s->offset = pasted_text_size;
					pasted_text_size += shdr->sh_size;
				} else {
					/* Other rx sections go after the pasted text;
					 * their final offset is fixed up in the second pass. */
					rx_align = align(rx_align, sh_align);
					rx_size = align(rx_size, sh_align);
					s->offset = rx_size;
					rx_size += shdr->sh_size;
				}
			} else if (shdr->sh_type == SHT_SYMTAB) {
				if (!read_private_lds_symbols(binary, part_idx, section, &lds_end_align))
					goto fail;
			}
		}

		/* Lay out this part's private LDS symbols on top of the shared ones. */
		uint64_t part_lds_size = shared_lds_size;
		if (!layout_symbols(
			util_dynarray_element(&binary->lds_symbols, struct ac_rtld_symbol, part_lds_symbols_begin),
			util_dynarray_num_elements(&binary->lds_symbols, struct ac_rtld_symbol) - part_lds_symbols_begin,
			&part_lds_size))
			goto fail;
		binary->lds_size = MAX2(binary->lds_size, part_lds_size);
	}

	/* Reserve space for the UMR disassembler end-of-code markers. */
	binary->rx_end_markers = pasted_text_size;
	pasted_text_size += 4 * DEBUGGER_NUM_MARKERS;

	/* __lds_end is a special symbol that points at the end of the memory
	 * occupied by other LDS symbols. Its alignment is taken as the
	 * maximum of its alignment over all shader parts where it occurs.
	 */
	if (lds_end_align) {
		binary->lds_size = align(binary->lds_size, lds_end_align);

		struct ac_rtld_symbol *lds_end =
			util_dynarray_grow(&binary->lds_symbols, struct ac_rtld_symbol, 1);
		lds_end->name = "__lds_end";
		lds_end->size = 0;
		lds_end->align = lds_end_align;
		lds_end->offset = binary->lds_size;
		lds_end->part_idx = ~0u;
	}

	if (binary->lds_size > max_lds_size) {
		fprintf(stderr, "ac_rtld error(2): too much LDS (used = %u, max = %u)\n",
			(unsigned)binary->lds_size, max_lds_size);
		goto fail;
	}

	/* Second pass: Adjust offsets of non-pasted text sections. */
	binary->rx_size = pasted_text_size;
	binary->rx_size = align(binary->rx_size, rx_align);

	for (unsigned part_idx = 0; part_idx < i.num_parts; ++part_idx) {
		struct ac_rtld_part *part = &binary->parts[part_idx];
		size_t num_shdrs;
		elf_getshdrnum(part->elf, &num_shdrs);

		for (unsigned j = 0; j < num_shdrs; ++j) {
			struct ac_rtld_section *s = &part->sections[j];
			if (s->is_rx && !s->is_pasted_text)
				s->offset += binary->rx_size;
		}
	}

	binary->rx_size += rx_size;
	binary->exec_size = exec_size;

	if (i.info->chip_class >= GFX10) {
		/* In gfx10, the SQ fetches up to 3 cache lines of 16 dwords
		 * ahead of the PC, configurable by SH_MEM_CONFIG and
		 * S_INST_PREFETCH. This can cause two issues:
		 *
		 * (1) Crossing a page boundary to an unmapped page. The logic
		 *     does not distinguish between a required fetch and a "mere"
		 *     prefetch and will fault.
		 *
		 * (2) Prefetching instructions that will be changed for a
		 *     different shader.
		 *
		 * (2) is not currently an issue because we flush the I$ at IB
		 * boundaries, but (1) needs to be addressed. Due to buffer
		 * suballocation, we just play it safe.
		 */
		binary->rx_size = align(binary->rx_size + 3 * 64, 64);
	}

	return true;

#undef report_if
#undef report_elf_if

fail:
	ac_rtld_close(binary);
	return false;
}
474 
ac_rtld_close(struct ac_rtld_binary * binary)475 void ac_rtld_close(struct ac_rtld_binary *binary)
476 {
477 	for (unsigned i = 0; i < binary->num_parts; ++i) {
478 		struct ac_rtld_part *part = &binary->parts[i];
479 		free(part->sections);
480 		elf_end(part->elf);
481 	}
482 
483 	util_dynarray_fini(&binary->lds_symbols);
484 	free(binary->parts);
485 	binary->parts = NULL;
486 	binary->num_parts = 0;
487 }
488 
get_section_by_name(struct ac_rtld_part * part,const char * name,const char ** data,size_t * nbytes)489 static bool get_section_by_name(struct ac_rtld_part *part, const char *name,
490 				const char **data, size_t *nbytes)
491 {
492 	for (unsigned i = 0; i < part->num_sections; ++i) {
493 		struct ac_rtld_section *s = &part->sections[i];
494 		if (s->name && !strcmp(name, s->name)) {
495 			Elf_Scn *target_scn = elf_getscn(part->elf, i);
496 			Elf_Data *target_data = elf_getdata(target_scn, NULL);
497 			if (!target_data) {
498 				report_elf_errorf("ac_rtld: get_section_by_name: elf_getdata");
499 				return false;
500 			}
501 
502 			*data = target_data->d_buf;
503 			*nbytes = target_data->d_size;
504 			return true;
505 		}
506 	}
507 	return false;
508 }
509 
/* Public wrapper around get_section_by_name; only valid for binaries
 * consisting of exactly one part. */
bool ac_rtld_get_section_by_name(struct ac_rtld_binary *binary, const char *name,
				 const char **data, size_t *nbytes)
{
	assert(binary->num_parts == 1);
	return get_section_by_name(&binary->parts[0], name, data, nbytes);
}
516 
/* Parse the ".AMDGPU.config" section of every part and merge the results
 * into \p config. Register counts and scratch/LDS sizes are combined by
 * taking the maximum; some fields are only valid for single-part use (see
 * the asserts below). \p config is expected to be zero-initialized by the
 * caller -- TODO confirm; the spi_ps_input/rsrc asserts rely on it. */
bool ac_rtld_read_config(const struct radeon_info *info,
			 struct ac_rtld_binary *binary,
			 struct ac_shader_config *config)
{
	for (unsigned i = 0; i < binary->num_parts; ++i) {
		struct ac_rtld_part *part = &binary->parts[i];
		const char *config_data;
		size_t config_nbytes;

		if (!get_section_by_name(part, ".AMDGPU.config",
					 &config_data, &config_nbytes))
			return false;

		/* TODO: be precise about scratch use? */
		struct ac_shader_config c = {};
		ac_parse_shader_binary_config(config_data, config_nbytes,
					      binary->wave_size, true, info, &c);

		/* Resource usage is the maximum over all parts. */
		config->num_sgprs = MAX2(config->num_sgprs, c.num_sgprs);
		config->num_vgprs = MAX2(config->num_vgprs, c.num_vgprs);
		config->spilled_sgprs = MAX2(config->spilled_sgprs, c.spilled_sgprs);
		config->spilled_vgprs = MAX2(config->spilled_vgprs, c.spilled_vgprs);
		config->scratch_bytes_per_wave = MAX2(config->scratch_bytes_per_wave,
						      c.scratch_bytes_per_wave);

		/* All parts must agree on the float mode. */
		assert(i == 0 || config->float_mode == c.float_mode);
		config->float_mode = c.float_mode;

		/* SPI_PS_INPUT_ENA/ADDR can't be combined. Only the value from
		 * the main shader part is used. */
		assert(config->spi_ps_input_ena == 0 &&
		       config->spi_ps_input_addr == 0);
		config->spi_ps_input_ena = c.spi_ps_input_ena;
		config->spi_ps_input_addr = c.spi_ps_input_addr;

		/* TODO: consistently use LDS symbols for this */
		config->lds_size = MAX2(config->lds_size, c.lds_size);

		/* TODO: Should we combine these somehow? It's currently only
		 * used for radeonsi's compute, where multiple parts aren't used. */
		assert(config->rsrc1 == 0 && config->rsrc2 == 0);
		config->rsrc1 = c.rsrc1;
		config->rsrc2 = c.rsrc2;
	}

	return true;
}
564 
/**
 * Resolve the value (address or LDS offset) of \p sym for relocation.
 *
 * Resolution order for undefined / LDS symbols: the binary's LDS symbol
 * list first, then the caller-provided get_external_symbol callback.
 * Symbols defined in a section of the part resolve to rx_va + section
 * offset + st_value.
 *
 * \return false (with an error report) if the symbol cannot be resolved
 */
static bool resolve_symbol(const struct ac_rtld_upload_info *u,
			   unsigned part_idx, const Elf64_Sym *sym,
			   const char *name, uint64_t *value)
{
	/* TODO: properly disentangle the undef and the LDS cases once
	 * STT_AMDGPU_LDS is retired. */
	if (sym->st_shndx == SHN_UNDEF || sym->st_shndx == SHN_AMDGPU_LDS) {
		const struct ac_rtld_symbol *lds_sym =
			find_symbol(&u->binary->lds_symbols, name, part_idx);

		if (lds_sym) {
			/* LDS symbols resolve to their offset within LDS. */
			*value = lds_sym->offset;
			return true;
		}

		/* TODO: resolve from other parts */

		if (u->get_external_symbol(u->cb_data, name, value))
			return true;

		report_errorf("symbol %s: unknown", name);
		return false;
	}

	/* Defined symbol: validate its section and compute its VA. */
	struct ac_rtld_part *part = &u->binary->parts[part_idx];
	if (sym->st_shndx >= part->num_sections) {
		report_errorf("symbol %s: section out of bounds", name);
		return false;
	}

	struct ac_rtld_section *s = &part->sections[sym->st_shndx];
	if (!s->is_rx) {
		report_errorf("symbol %s: bad section", name);
		return false;
	}

	uint64_t section_base = u->rx_va + s->offset;

	*value = section_base + sym->st_value;
	return true;
}
606 
/**
 * Apply all relocations of the SHT_REL section \p reloc_shdr to the copy of
 * its target section in the upload buffer (u->rx_ptr).
 *
 * Addends are read from the original ELF data (not the destination, which
 * may be write-only VRAM); results are written little-endian to the upload
 * buffer.
 */
static bool apply_relocs(const struct ac_rtld_upload_info *u,
			 unsigned part_idx, const Elf64_Shdr *reloc_shdr,
			 const Elf_Data *reloc_data)
{
#define report_if(cond) \
	do { \
		if ((cond)) { \
			report_errorf(#cond); \
			return false; \
		} \
	} while (false)
#define report_elf_if(cond) \
	do { \
		if ((cond)) { \
			report_elf_errorf(#cond); \
			return false; \
		} \
	} while (false)

	struct ac_rtld_part *part = &u->binary->parts[part_idx];
	/* sh_info of a REL section is the index of the section it patches. */
	Elf_Scn *target_scn = elf_getscn(part->elf, reloc_shdr->sh_info);
	report_elf_if(!target_scn);

	Elf_Data *target_data = elf_getdata(target_scn, NULL);
	report_elf_if(!target_data);

	/* sh_link of a REL section is the index of the associated symtab. */
	Elf_Scn *symbols_scn = elf_getscn(part->elf, reloc_shdr->sh_link);
	report_elf_if(!symbols_scn);

	Elf64_Shdr *symbols_shdr = elf64_getshdr(symbols_scn);
	report_elf_if(!symbols_shdr);
	uint32_t strtabidx = symbols_shdr->sh_link;

	Elf_Data *symbols_data = elf_getdata(symbols_scn, NULL);
	report_elf_if(!symbols_data);

	const Elf64_Sym *symbols = symbols_data->d_buf;
	size_t num_symbols = symbols_data->d_size / sizeof(Elf64_Sym);

	struct ac_rtld_section *s = &part->sections[reloc_shdr->sh_info];
	report_if(!s->is_rx);

	/* orig_base: read-side (ELF); dst_base: write-side (upload buffer);
	 * va_base: the GPU virtual address of the target section. */
	const char *orig_base = target_data->d_buf;
	char *dst_base = u->rx_ptr + s->offset;
	uint64_t va_base = u->rx_va + s->offset;

	Elf64_Rel *rel = reloc_data->d_buf;
	size_t num_relocs = reloc_data->d_size / sizeof(*rel);
	for (size_t i = 0; i < num_relocs; ++i, ++rel) {
		size_t r_sym = ELF64_R_SYM(rel->r_info);
		unsigned r_type = ELF64_R_TYPE(rel->r_info);

		const char *orig_ptr = orig_base + rel->r_offset;
		char *dst_ptr = dst_base + rel->r_offset;
		uint64_t va = va_base + rel->r_offset;

		uint64_t symbol;
		uint64_t addend;

		if (r_sym == STN_UNDEF) {
			symbol = 0;
		} else {
			report_elf_if(r_sym >= num_symbols);

			const Elf64_Sym *sym = &symbols[r_sym];
			const char *symbol_name =
				elf_strptr(part->elf, strtabidx, sym->st_name);
			report_elf_if(!symbol_name);

			if (!resolve_symbol(u, part_idx, sym, symbol_name, &symbol))
				return false;
		}

		/* TODO: Should we also support .rela sections, where the
		 * addend is part of the relocation record? */

		/* Load the addend from the ELF instead of the destination,
		 * because the destination may be in VRAM. */
		switch (r_type) {
		case R_AMDGPU_ABS32:
		case R_AMDGPU_ABS32_LO:
		case R_AMDGPU_ABS32_HI:
		case R_AMDGPU_REL32:
		case R_AMDGPU_REL32_LO:
		case R_AMDGPU_REL32_HI:
			addend = *(const uint32_t *)orig_ptr;
			break;
		case R_AMDGPU_ABS64:
		case R_AMDGPU_REL64:
			addend = *(const uint64_t *)orig_ptr;
			break;
		default:
			report_errorf("unsupported r_type == %u", r_type);
			return false;
		}

		uint64_t abs = symbol + addend;

		switch (r_type) {
		case R_AMDGPU_ABS32:
			assert((uint32_t)abs == abs);
			/* fallthrough: ABS32 writes the same bits as ABS32_LO */
		case R_AMDGPU_ABS32_LO:
			*(uint32_t *)dst_ptr = util_cpu_to_le32(abs);
			break;
		case R_AMDGPU_ABS32_HI:
			*(uint32_t *)dst_ptr = util_cpu_to_le32(abs >> 32);
			break;
		case R_AMDGPU_ABS64:
			*(uint64_t *)dst_ptr = util_cpu_to_le64(abs);
			break;
		case R_AMDGPU_REL32:
			assert((int64_t)(int32_t)(abs - va) == (int64_t)(abs - va));
			/* fallthrough: REL32 writes the same bits as REL32_LO */
		case R_AMDGPU_REL32_LO:
			*(uint32_t *)dst_ptr = util_cpu_to_le32(abs - va);
			break;
		case R_AMDGPU_REL32_HI:
			*(uint32_t *)dst_ptr = util_cpu_to_le32((abs - va) >> 32);
			break;
		case R_AMDGPU_REL64:
			*(uint64_t *)dst_ptr = util_cpu_to_le64(abs - va);
			break;
		default:
			unreachable("bad r_type");
		}
	}

	return true;

#undef report_if
#undef report_elf_if
}
738 
739 /**
740  * Upload the binary or binaries to the provided GPU buffers, including
741  * relocations.
742  */
ac_rtld_upload(struct ac_rtld_upload_info * u)743 bool ac_rtld_upload(struct ac_rtld_upload_info *u)
744 {
745 #define report_if(cond) \
746 	do { \
747 		if ((cond)) { \
748 			report_errorf(#cond); \
749 			return false; \
750 		} \
751 	} while (false)
752 #define report_elf_if(cond) \
753 	do { \
754 		if ((cond)) { \
755 			report_errorf(#cond); \
756 			return false; \
757 		} \
758 	} while (false)
759 
760 	if (u->binary->options.halt_at_entry) {
761 		/* s_sethalt 1 */
762 		*(uint32_t *)u->rx_ptr = util_cpu_to_le32(0xbf8d0001);
763 	}
764 
765 	/* First pass: upload raw section data and lay out private LDS symbols. */
766 	for (unsigned i = 0; i < u->binary->num_parts; ++i) {
767 		struct ac_rtld_part *part = &u->binary->parts[i];
768 
769 		Elf_Scn *section = NULL;
770 		while ((section = elf_nextscn(part->elf, section))) {
771 			Elf64_Shdr *shdr = elf64_getshdr(section);
772 			struct ac_rtld_section *s = &part->sections[elf_ndxscn(section)];
773 
774 			if (!s->is_rx)
775 				continue;
776 
777 			report_if(shdr->sh_type != SHT_PROGBITS);
778 
779 			Elf_Data *data = elf_getdata(section, NULL);
780 			report_elf_if(!data || data->d_size != shdr->sh_size);
781 			memcpy(u->rx_ptr + s->offset, data->d_buf, shdr->sh_size);
782 		}
783 	}
784 
785 	if (u->binary->rx_end_markers) {
786 		uint32_t *dst = (uint32_t *)(u->rx_ptr + u->binary->rx_end_markers);
787 		for (unsigned i = 0; i < DEBUGGER_NUM_MARKERS; ++i)
788 			*dst++ = util_cpu_to_le32(DEBUGGER_END_OF_CODE_MARKER);
789 	}
790 
791 	/* Second pass: handle relocations, overwriting uploaded data where
792 	 * appropriate. */
793 	for (unsigned i = 0; i < u->binary->num_parts; ++i) {
794 		struct ac_rtld_part *part = &u->binary->parts[i];
795 		Elf_Scn *section = NULL;
796 		while ((section = elf_nextscn(part->elf, section))) {
797 			Elf64_Shdr *shdr = elf64_getshdr(section);
798 			if (shdr->sh_type == SHT_REL) {
799 				Elf_Data *relocs = elf_getdata(section, NULL);
800 				report_elf_if(!relocs || relocs->d_size != shdr->sh_size);
801 				if (!apply_relocs(u, i, shdr, relocs))
802 					return false;
803 			} else if (shdr->sh_type == SHT_RELA) {
804 				report_errorf("SHT_RELA not supported");
805 				return false;
806 			}
807 		}
808 	}
809 
810 	return true;
811 
812 #undef report_if
813 #undef report_elf_if
814 }
815