xref: /freebsd/sys/cddl/dev/kinst/amd64/kinst_isa.c (revision 4b9d6057)
1 /*
2  * SPDX-License-Identifier: CDDL 1.0
3  *
4  * Copyright (c) 2022 Christos Margiolis <christos@FreeBSD.org>
5  * Copyright (c) 2022 Mark Johnston <markj@FreeBSD.org>
6  * Copyright (c) 2023 The FreeBSD Foundation
7  *
8  * Portions of this software were developed by Christos Margiolis
9  * <christos@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
10  */
11 
12 #include <sys/param.h>
13 #include <sys/pcpu.h>
14 
15 #include <machine/cpufunc.h>
16 #include <machine/md_var.h>
17 
18 #include <sys/dtrace.h>
19 #include <cddl/dev/dtrace/dtrace_cddl.h>
20 #include <dis_tables.h>
21 
22 #include "kinst.h"
23 
24 #define KINST_PUSHL_RBP		0x55
25 #define KINST_STI		0xfb
26 #define KINST_POPF		0x9d
27 
28 #define KINST_MODRM_MOD(b)	(((b) & 0xc0) >> 6)
29 #define KINST_MODRM_REG(b)	(((b) & 0x38) >> 3)
30 #define KINST_MODRM_RM(b)	((b) & 0x07)
31 
32 #define KINST_SIB_SCALE(s)	(((s) & 0xc0) >> 6)
33 #define KINST_SIB_INDEX(s)	(((s) & 0x38) >> 3)
34 #define KINST_SIB_BASE(s)	(((s) & 0x07) >> 0)
35 
36 #define KINST_REX_W(r)		(((r) & 0x08) >> 3)
37 #define KINST_REX_R(r)		(((r) & 0x04) >> 2)
38 #define KINST_REX_X(r)		(((r) & 0x02) >> 1)
39 #define KINST_REX_B(r)		(((r) & 0x01) >> 0)
40 
41 #define KINST_F_CALL		0x0001	/* instruction is a "call" */
42 #define KINST_F_DIRECT_CALL	0x0002	/* instruction is a direct call */
43 #define KINST_F_RIPREL		0x0004	/* instruction is position-dependent */
44 #define KINST_F_JMP		0x0008	/* instruction is a %rip-relative jmp */
45 #define KINST_F_MOD_DIRECT	0x0010	/* operand is not a memory address */
46 
47 /*
48  * Per-CPU trampolines used when the interrupted thread is executing with
49  * interrupts disabled.  If an interrupt is raised while executing a trampoline,
50  * the interrupt thread cannot safely overwrite its trampoline if it hits a
51  * kinst probe while executing the interrupt handler.
52  */
53 DPCPU_DEFINE_STATIC(uint8_t *, intr_tramp);
54 
55 /*
56  * Map ModR/M register bits to a trapframe offset.
57  */
/*
 * Map ModR/M register bits to a trapframe offset.
 *
 * Returns the slot index (in units of register_t) of the saved register
 * within struct trapframe, so callers can fetch a register's value by
 * indexing the frame as an array (see kinst_regval()).  Indices 8-15
 * correspond to r8-r15 and are selected when a REX prefix extends the
 * register field.  Panics on an index outside 0-15.
 */
static int
kinst_regoff(int reg)
{
#define	_MATCH_REG(i, reg)			\
	case i:					\
		return (offsetof(struct trapframe, tf_ ## reg) / \
		    sizeof(register_t))
	switch (reg) {
	_MATCH_REG( 0, rax);
	_MATCH_REG( 1, rcx);
	_MATCH_REG( 2, rdx);
	_MATCH_REG( 3, rbx);
	_MATCH_REG( 4, rsp); /* SIB when mod != 3 */
	_MATCH_REG( 5, rbp);
	_MATCH_REG( 6, rsi);
	_MATCH_REG( 7, rdi);
	_MATCH_REG( 8, r8); /* REX.R is set */
	_MATCH_REG( 9, r9);
	_MATCH_REG(10, r10);
	_MATCH_REG(11, r11);
	_MATCH_REG(12, r12);
	_MATCH_REG(13, r13);
	_MATCH_REG(14, r14);
	_MATCH_REG(15, r15);
	}
#undef _MATCH_REG
	panic("%s: unhandled register index %d", __func__, reg);
}
86 
87 /*
88  * Obtain the specified register's value.
89  */
90 static uint64_t
91 kinst_regval(struct trapframe *frame, int reg)
92 {
93 	if (reg == -1)
94 		return (0);
95 	return (((register_t *)frame)[kinst_regoff(reg)]);
96 }
97 
98 static uint32_t
99 kinst_riprel_disp(struct kinst_probe *kp, void *dst)
100 {
101 	return ((uint32_t)((intptr_t)kp->kp_patchpoint + kp->kp_md.disp -
102 	    (intptr_t)dst));
103 }
104 
105 static void
106 kinst_trampoline_populate(struct kinst_probe *kp, uint8_t *tramp)
107 {
108 	uint8_t *instr;
109 	uint32_t disp;
110 	int ilen;
111 
112 	ilen = kp->kp_md.tinstlen;
113 
114 	kinst_memcpy(tramp, kp->kp_md.template, ilen);
115 	if ((kp->kp_md.flags & KINST_F_RIPREL) != 0) {
116 		disp = kinst_riprel_disp(kp, tramp);
117 		kinst_memcpy(&tramp[kp->kp_md.dispoff], &disp, sizeof(uint32_t));
118 	}
119 
120 	/*
121 	 * The following position-independent jmp takes us back to the
122 	 * original code.  It is encoded as "jmp *0(%rip)" (six bytes),
123 	 * followed by the absolute address of the instruction following
124 	 * the one that was traced (eight bytes).
125 	 */
126 	tramp[ilen + 0] = 0xff;
127 	tramp[ilen + 1] = 0x25;
128 	tramp[ilen + 2] = 0x00;
129 	tramp[ilen + 3] = 0x00;
130 	tramp[ilen + 4] = 0x00;
131 	tramp[ilen + 5] = 0x00;
132 	instr = kp->kp_patchpoint + kp->kp_md.instlen;
133 	kinst_memcpy(&tramp[ilen + 6], &instr, sizeof(uintptr_t));
134 }
135 
/*
 * Breakpoint handler: called (via dtrace_invop) when a traced instruction's
 * patch byte traps.  Fires the probe, then arranges for the original
 * instruction's effect to happen, either by emulating it (calls) or by
 * redirecting control to a trampoline containing a copy of it.
 *
 * Returns 0 if "addr" does not correspond to a kinst probe, otherwise
 * DTRACE_INVOP_CALL or DTRACE_INVOP_NOP to tell the trap code how to
 * resume.  "scratch" points at stack space reserved by
 * dtrace_invop_start() — NOTE(review): used here to push the emulated
 * call's return address; confirm against dtrace_invop_start().
 */
int
kinst_invop(uintptr_t addr, struct trapframe *frame, uintptr_t scratch)
{
	solaris_cpu_t *cpu;
	uintptr_t *stack, retaddr;
	struct kinst_probe *kp;
	struct kinst_probe_md *kpmd;
	uint8_t *tramp;

	stack = (uintptr_t *)frame->tf_rsp;
	cpu = &solaris_cpu[curcpu];

	/* Find the probe whose patch point raised this trap. */
	LIST_FOREACH(kp, KINST_GETPROBE(addr), kp_hashnext) {
		if ((uintptr_t)kp->kp_patchpoint == addr)
			break;
	}
	if (kp == NULL)
		return (0);

	/*
	 * Report the address of the breakpoint for the benefit of consumers
	 * fetching register values with regs[].
	 */
	frame->tf_rip--;

	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
	cpu->cpu_dtrace_caller = stack[0];
	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR);
	dtrace_probe(kp->kp_id, 0, 0, 0, 0, 0);
	cpu->cpu_dtrace_caller = 0;

	kpmd = &kp->kp_md;
	if ((kpmd->flags & KINST_F_CALL) != 0) {
		/*
		 * dtrace_invop_start() reserves space on the stack to
		 * store the return address of the call instruction.
		 */
		retaddr = (uintptr_t)(kp->kp_patchpoint + kpmd->instlen);
		*(uintptr_t *)scratch = retaddr;

		if ((kpmd->flags & KINST_F_DIRECT_CALL) != 0) {
			/* Direct call: target = next instruction + disp. */
			frame->tf_rip = (uintptr_t)(kp->kp_patchpoint +
			    kpmd->disp + kpmd->instlen);
		} else {
			register_t rval;

			if (kpmd->reg1 == -1 && kpmd->reg2 == -1) {
				/* rip-relative */
				rval = frame->tf_rip + kpmd->instlen;
			} else {
				/* indirect: base + (index << scale) */
				rval = kinst_regval(frame, kpmd->reg1) +
				    (kinst_regval(frame, kpmd->reg2) <<
				    kpmd->scale);
			}

			if ((kpmd->flags & KINST_F_MOD_DIRECT) != 0) {
				/* Register operand: target is the value. */
				frame->tf_rip = rval + kpmd->disp;
			} else {
				/* Memory operand: load target from memory. */
				frame->tf_rip =
				    *(uintptr_t *)(rval + kpmd->disp);
			}
		}
		return (DTRACE_INVOP_CALL);
	} else {
		/*
		 * Pick the per-CPU trampoline when interrupts are disabled,
		 * otherwise the interrupted thread's private one (see the
		 * comment above intr_tramp).
		 */
		if ((frame->tf_rflags & PSL_I) == 0)
			tramp = DPCPU_GET(intr_tramp);
		else
			tramp = curthread->t_kinst_tramp;
		if (tramp == NULL) {
			/*
			 * A trampoline allocation failed, so this probe is
			 * effectively disabled.  Restore the original
			 * instruction.
			 *
			 * We can't safely print anything here, but the
			 * trampoline allocator should have left a breadcrumb in
			 * the dmesg.
			 */
			kinst_patch_tracepoint(kp, kp->kp_savedval);
			frame->tf_rip = (register_t)kp->kp_patchpoint;
		} else {
			kinst_trampoline_populate(kp, tramp);
			frame->tf_rip = (register_t)tramp;
		}
		return (DTRACE_INVOP_NOP);
	}
}
224 
/*
 * Write "val" (a patch byte or the saved original byte) to the probe's
 * patch point.  Interrupts are disabled and kernel text write protection
 * is lifted for the duration of the single-byte store; both are restored
 * before returning.
 */
void
kinst_patch_tracepoint(struct kinst_probe *kp, kinst_patchval_t val)
{
	register_t reg;
	int oldwp;

	reg = intr_disable();
	oldwp = disable_wp();
	*kp->kp_patchpoint = val;
	restore_wp(oldwp);
	intr_restore(reg);
}
237 
238 static void
239 kinst_set_disp8(struct kinst_probe *kp, uint8_t byte)
240 {
241 	kp->kp_md.disp = (int64_t)(int8_t)byte;
242 }
243 
244 static void
245 kinst_set_disp32(struct kinst_probe *kp, uint8_t *bytes)
246 {
247 	int32_t disp32;
248 
249 	memcpy(&disp32, bytes, sizeof(disp32));
250 	kp->kp_md.disp = (int64_t)disp32;
251 }
252 
253 /*
254  * Set up all of the state needed to faithfully execute a probed instruction.
255  *
256  * In the simple case, we copy the instruction unmodified to a per-thread
257  * trampoline, wherein it is followed by a jump back to the original code.
258  * - Instructions can have %rip as an operand:
259  *   - with %rip-relative addressing encoded in ModR/M, or
260  *   - implicitly as a part of the instruction definition (jmp, call).
261  * - Call instructions (which may be %rip-relative) need to push the correct
262  *   return address onto the stack.
263  *
264  * Call instructions are simple enough to be emulated in software, so we simply
265  * do not use the trampoline mechanism in that case.  kinst_invop() will compute
266  * the branch target using the address info computed here (register operands and
267  * displacement).
268  *
269  * %rip-relative operands encoded using the ModR/M byte always use a 32-bit
270  * displacement; when populating the trampoline the displacement is adjusted to
271  * be relative to the trampoline address.  Trampolines are always allocated
272  * above KERNBASE for this reason.
273  *
274  * For other %rip-relative operands (just jumps) we take the same approach.
275  * Instructions which specify an 8-bit displacement must be rewritten to use a
276  * 32-bit displacement.
277  */
/*
 * Disassemble the instruction at *instr and record in kp->kp_md everything
 * needed to execute it faithfully (see the block comment above).  On return,
 * *instr has been advanced past the instruction by the disassembler.
 * Returns 0 on success or EINVAL for undecodable/untraceable instructions.
 */
static int
kinst_instr_dissect(struct kinst_probe *kp, uint8_t **instr)
{
	struct kinst_probe_md *kpmd;
	dis86_t d86;
	uint8_t *bytes, modrm, rex;
	int dispoff, i, ilen, opcidx;

	kpmd = &kp->kp_md;

	d86.d86_data = instr;
	d86.d86_get_byte = dtrace_dis_get_byte;
	d86.d86_check_func = NULL;
	if (dtrace_disx86(&d86, SIZE64) != 0) {
		KINST_LOG("failed to disassemble instruction at: %p", *instr);
		return (EINVAL);
	}
	bytes = d86.d86_bytes;
	kpmd->instlen = kpmd->tinstlen = d86.d86_len;

	/*
	 * Skip over prefixes, save REX.
	 */
	rex = 0;
	for (i = 0; i < kpmd->instlen; i++) {
		switch (bytes[i]) {
		case 0xf0 ... 0xf3:
			/* group 1 */
			continue;
		case 0x26:
		case 0x2e:
		case 0x36:
		case 0x3e:
		case 0x64:
		case 0x65:
			/* group 2 */
			continue;
		case 0x66:
			/* group 3 */
			continue;
		case 0x67:
			/* group 4 */
			continue;
		case 0x40 ... 0x4f:
			/* REX */
			rex = bytes[i];
			continue;
		}
		/* First non-prefix byte: the opcode. */
		break;
	}
	KASSERT(i < kpmd->instlen,
	    ("%s: failed to disassemble instruction at %p", __func__, bytes));
	opcidx = i;

	/*
	 * Identify instructions of interest by opcode: calls and jumps.
	 * Extract displacements.
	 */
	dispoff = -1;
	switch (bytes[opcidx]) {
	case 0x0f:
		switch (bytes[opcidx + 1]) {
		case 0x80 ... 0x8f:
			/* conditional jmp near */
			kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL;
			dispoff = opcidx + 2;
			kinst_set_disp32(kp, &bytes[dispoff]);
			break;
		}
		break;
	case 0xe3:
		/*
		 * There is no straightforward way to translate this instruction
		 * to use a 32-bit displacement.  Fortunately, it is rarely
		 * used.
		 */
		return (EINVAL);
	case 0x70 ... 0x7f:
		/* conditional jmp short */
		kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL;
		dispoff = opcidx + 1;
		kinst_set_disp8(kp, bytes[dispoff]);
		break;
	case 0xe9:
		/* unconditional jmp near */
		kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL;
		dispoff = opcidx + 1;
		kinst_set_disp32(kp, &bytes[dispoff]);
		break;
	case 0xeb:
		/* unconditional jmp short */
		kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL;
		dispoff = opcidx + 1;
		kinst_set_disp8(kp, bytes[dispoff]);
		break;
	case 0xe8:
	case 0x9a:
		/* direct call */
		kpmd->flags |= KINST_F_CALL | KINST_F_DIRECT_CALL;
		dispoff = opcidx + 1;
		kinst_set_disp32(kp, &bytes[dispoff]);
		break;
	case 0xff:
		/* Group 5: the ModR/M reg field selects the operation. */
		KASSERT(d86.d86_got_modrm,
		    ("no ModR/M byte for instr at %p", *instr - kpmd->instlen));
		switch (KINST_MODRM_REG(bytes[d86.d86_rmindex])) {
		case 0x02:
		case 0x03:
			/* indirect call */
			kpmd->flags |= KINST_F_CALL;
			break;
		case 0x04:
		case 0x05:
			/* indirect jump */
			kpmd->flags |= KINST_F_JMP;
			break;
		}
	}

	/*
	 * If there's a ModR/M byte, we need to check it to see if the operand
	 * is %rip-relative, and rewrite the displacement if so.  If not, we
	 * might still have to extract operand info if this is a call
	 * instruction.
	 */
	if (d86.d86_got_modrm) {
		uint8_t mod, rm, sib;

		kpmd->reg1 = kpmd->reg2 = -1;

		modrm = bytes[d86.d86_rmindex];
		mod = KINST_MODRM_MOD(modrm);
		rm = KINST_MODRM_RM(modrm);
		if (mod == 0 && rm == 5) {
			/* mod=0, rm=5 encodes disp32(%rip) in 64-bit mode. */
			kpmd->flags |= KINST_F_RIPREL;
			dispoff = d86.d86_rmindex + 1;
			kinst_set_disp32(kp, &bytes[dispoff]);
		} else if ((kpmd->flags & KINST_F_CALL) != 0) {
			bool havesib;

			/* rm=4 with a memory operand means a SIB byte follows. */
			havesib = (mod != 3 && rm == 4);
			dispoff = d86.d86_rmindex + (havesib ? 2 : 1);
			if (mod == 1)
				kinst_set_disp8(kp, bytes[dispoff]);
			else if (mod == 2)
				kinst_set_disp32(kp, &bytes[dispoff]);
			else if (mod == 3)
				kpmd->flags |= KINST_F_MOD_DIRECT;

			if (havesib) {
				sib = bytes[d86.d86_rmindex + 1];
				if (KINST_SIB_BASE(sib) != 5) {
					kpmd->reg1 = KINST_SIB_BASE(sib) |
					    (KINST_REX_B(rex) << 3);
				}
				kpmd->scale = KINST_SIB_SCALE(sib);
				kpmd->reg2 = KINST_SIB_INDEX(sib) |
				    (KINST_REX_X(rex) << 3);
			} else {
				kpmd->reg1 = rm | (KINST_REX_B(rex) << 3);
			}
		}
	}

	/*
	 * Calls are emulated in software; once operands are decoded we have
	 * nothing else to do.
	 */
	if ((kpmd->flags & KINST_F_CALL) != 0)
		return (0);

	/*
	 * Allocate and populate an instruction trampoline template.
	 *
	 * Position-independent instructions can simply be copied, but
	 * position-dependent instructions require some surgery: jump
	 * instructions with an 8-bit displacement need to be converted to use a
	 * 32-bit displacement, and the adjusted displacement needs to be
	 * computed.
	 */
	ilen = kpmd->instlen;
	if ((kpmd->flags & KINST_F_RIPREL) != 0) {
		if ((kpmd->flags & KINST_F_JMP) == 0 ||
		    bytes[opcidx] == 0x0f ||
		    bytes[opcidx] == 0xe9 ||
		    bytes[opcidx] == 0xff) {
			/*
			 * Already uses a 32-bit displacement: copy the bytes
			 * around the displacement hole, which is filled in at
			 * trampoline-populate time.
			 */
			memcpy(kpmd->template, bytes, dispoff);
			memcpy(&kpmd->template[dispoff + 4],
			    &bytes[dispoff + 4], ilen - (dispoff + 4));
			kpmd->dispoff = dispoff;
		} else if (bytes[opcidx] == 0xeb) {
			/* Rewrite "jmp short" as "jmp near" (0xe9). */
			memcpy(kpmd->template, bytes, opcidx);
			kpmd->template[opcidx] = 0xe9;
			kpmd->dispoff = opcidx + 1;

			/* Instruction length changes from 2 to 5. */
			kpmd->tinstlen = 5;
			kpmd->disp -= 3;
		} else if (bytes[opcidx] >= 0x70 && bytes[opcidx] <= 0x7f)  {
			/* Rewrite "jcc short" as the two-byte "0f 8x" form. */
			memcpy(kpmd->template, bytes, opcidx);
			kpmd->template[opcidx] = 0x0f;
			kpmd->template[opcidx + 1] = bytes[opcidx] + 0x10;
			kpmd->dispoff = opcidx + 2;

			/* Instruction length changes from 2 to 6. */
			kpmd->tinstlen = 6;
			kpmd->disp -= 4;
		} else {
			panic("unhandled opcode %#x", bytes[opcidx]);
		}
	} else {
		memcpy(kpmd->template, bytes, ilen);
	}

	return (0);
}
494 
495 int
496 kinst_make_probe(linker_file_t lf, int symindx, linker_symval_t *symval,
497     void *opaque)
498 {
499 	struct kinst_probe *kp;
500 	dtrace_kinst_probedesc_t *pd;
501 	const char *func;
502 	int error, instrsize, n, off;
503 	uint8_t *instr, *limit, *tmp;
504 	bool push_found;
505 
506 	pd = opaque;
507 	func = symval->name;
508 	if (kinst_excluded(func))
509 		return (0);
510 	if (strcmp(func, pd->kpd_func) != 0)
511 		return (0);
512 
513 	instr = (uint8_t *)symval->value;
514 	limit = (uint8_t *)symval->value + symval->size;
515 	if (instr >= limit)
516 		return (0);
517 
518 	/*
519 	 * Refuse to instrument functions lacking the usual frame pointer
520 	 * manipulations since they might correspond to exception handlers.
521 	 */
522 	tmp = instr;
523 	push_found = false;
524 	while (tmp < limit) {
525 		/*
526 		 * Checking for 'pop %rbp' as well makes the filtering too
527 		 * strict as it would skip functions that never return (e.g.,
528 		 * vnlru_proc()).
529 		 */
530 		if (*tmp == KINST_PUSHL_RBP) {
531 			push_found = true;
532 			break;
533 		}
534 		tmp += dtrace_instr_size(tmp);
535 	}
536 	if (!push_found)
537 		return (0);
538 
539 	n = 0;
540 	while (instr < limit) {
541 		instrsize = dtrace_instr_size(instr);
542 		off = (int)(instr - (uint8_t *)symval->value);
543 		if (pd->kpd_off != -1 && off != pd->kpd_off) {
544 			instr += instrsize;
545 			continue;
546 		}
547 
548 		/*
549 		 * Check for instructions which may enable interrupts.  Such
550 		 * instructions are tricky to trace since it is unclear whether
551 		 * to use the per-thread or per-CPU trampolines.  Since they are
552 		 * rare, we don't bother to implement special handling for them.
553 		 *
554 		 * If the caller specified an offset, return an error, otherwise
555 		 * silently ignore the instruction so that it remains possible
556 		 * to enable all instructions in a function.
557 		 */
558 		if (instrsize == 1 &&
559 		    (instr[0] == KINST_POPF || instr[0] == KINST_STI)) {
560 			if (pd->kpd_off != -1)
561 				return (EINVAL);
562 			instr += instrsize;
563 			continue;
564 		}
565 
566 		/*
567 		 * Prevent separate dtrace(1) instances from creating copies of
568 		 * the same probe.
569 		 */
570 		LIST_FOREACH(kp, KINST_GETPROBE(instr), kp_hashnext) {
571 			if (strcmp(kp->kp_func, func) == 0 &&
572 			    strtol(kp->kp_name, NULL, 10) == off)
573 				return (0);
574 		}
575 		if (++n > KINST_PROBETAB_MAX) {
576 			KINST_LOG("probe list full: %d entries", n);
577 			return (ENOMEM);
578 		}
579 		kp = malloc(sizeof(struct kinst_probe), M_KINST,
580 		    M_WAITOK | M_ZERO);
581 		kp->kp_func = func;
582 		snprintf(kp->kp_name, sizeof(kp->kp_name), "%d", off);
583 		kp->kp_savedval = *instr;
584 		kp->kp_patchval = KINST_PATCHVAL;
585 		kp->kp_patchpoint = instr;
586 
587 		error = kinst_instr_dissect(kp, &instr);
588 		if (error != 0)
589 			return (error);
590 
591 		kinst_probe_create(kp, lf);
592 	}
593 
594 	return (0);
595 }
596 
597 int
598 kinst_md_init(void)
599 {
600 	uint8_t *tramp;
601 	int cpu;
602 
603 	CPU_FOREACH(cpu) {
604 		tramp = kinst_trampoline_alloc(M_WAITOK);
605 		if (tramp == NULL)
606 			return (ENOMEM);
607 		DPCPU_ID_SET(cpu, intr_tramp, tramp);
608 	}
609 
610 	return (0);
611 }
612 
613 void
614 kinst_md_deinit(void)
615 {
616 	uint8_t *tramp;
617 	int cpu;
618 
619 	CPU_FOREACH(cpu) {
620 		tramp = DPCPU_ID_GET(cpu, intr_tramp);
621 		if (tramp != NULL) {
622 			kinst_trampoline_dealloc(tramp);
623 			DPCPU_ID_SET(cpu, intr_tramp, NULL);
624 		}
625 	}
626 }
627 
628 /*
629  * Exclude machine-dependent functions that are not safe-to-trace.
630  */
bool
kinst_md_excluded(const char *name)
{
	/* No amd64-specific exclusions at present. */
	return (false);
}
636