/*	$NetBSD: dtrace_subr.c,v 1.5 2010/04/23 11:39:53 ahoka Exp $	*/

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 *
 * $FreeBSD: src/sys/cddl/dev/dtrace/i386/dtrace_subr.c,v 1.3.2.1 2009/08/03 08:13:06 kensmith Exp $
 *
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/kmem.h>
#include <sys/xcall.h>
#include <sys/cpu.h>
#include <sys/cpuvar.h>
#include <sys/dtrace_impl.h>
#include <sys/dtrace_bsd.h>
#include <machine/cpu.h>
#include <machine/clock.h>
#include <machine/frame.h>
#include <uvm/uvm_pglist.h>
#include <uvm/uvm_prot.h>
#include <uvm/uvm_pmap.h>

#include <x86/include/cpu_counter.h>

extern uintptr_t	kernelbase;
extern uintptr_t	dtrace_in_probe_addr;
extern int		dtrace_in_probe;

int dtrace_invop(uintptr_t, uintptr_t *, uintptr_t);

typedef struct dtrace_invop_hdlr {
	int (*dtih_func)(uintptr_t, uintptr_t *, uintptr_t);
	struct dtrace_invop_hdlr *dtih_next;
} dtrace_invop_hdlr_t;

dtrace_invop_hdlr_t *dtrace_invop_hdlr;

void dtrace_gethrtime_init(void *arg);

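/*
 * Invoked from the invalid-opcode trap handler when a breakpoint
 * instruction planted by a DTrace provider fires.  Walk the list of
 * registered handlers and give each a chance to claim the trap; the
 * first non-zero return value is handed back to the trap handler, and
 * zero means no handler recognized the address.
 */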
int
dtrace_invop(uintptr_t addr, uintptr_t *stack, uintptr_t eax)
{
	dtrace_invop_hdlr_t *hdlr;
	int rval;

	for (hdlr = dtrace_invop_hdlr; hdlr != NULL; hdlr = hdlr->dtih_next)
		if ((rval = hdlr->dtih_func(addr, stack, eax)) != 0)
			return (rval);

	return (0);
}

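/*
 * Register an invalid-opcode handler; new handlers are pushed onto the
 * head of the list (used, e.g., by providers that plant breakpoint
 * instructions in kernel text).
 */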
void
dtrace_invop_add(int (*func)(uintptr_t, uintptr_t *, uintptr_t))
{
	dtrace_invop_hdlr_t *hdlr;

	hdlr = kmem_alloc(sizeof (dtrace_invop_hdlr_t), KM_SLEEP);
	hdlr->dtih_func = func;
	hdlr->dtih_next = dtrace_invop_hdlr;
	dtrace_invop_hdlr = hdlr;
}

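/*
 * Unregister a previously added handler; it is a fatal error to remove
 * a handler that was never registered.
 */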
void
dtrace_invop_remove(int (*func)(uintptr_t, uintptr_t *, uintptr_t))
{
	dtrace_invop_hdlr_t *hdlr = dtrace_invop_hdlr, *prev = NULL;

	for (;;) {
		if (hdlr == NULL)
			panic("attempt to remove non-existent invop handler");

		if (hdlr->dtih_func == func)
			break;

		prev = hdlr;
		hdlr = hdlr->dtih_next;
	}

	if (prev == NULL) {
		ASSERT(dtrace_invop_hdlr == hdlr);
		dtrace_invop_hdlr = hdlr->dtih_next;
	} else {
		ASSERT(dtrace_invop_hdlr != hdlr);
		prev->dtih_next = hdlr->dtih_next;
	}

	kmem_free(hdlr, sizeof (dtrace_invop_hdlr_t));
}

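/*
 * Report the address ranges that probes must never dereference.  On
 * i386 everything below the kernel is user space, so the single toxic
 * range is [0, kernelbase).
 */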
void
dtrace_toxic_ranges(void (*func)(uintptr_t base, uintptr_t limit))
{
	(*func)(0, kernelbase);
}

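/*
 * Trampoline used to adapt a dtrace_xcall_t to the xcall(9) callback
 * signature.
 */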
static void
xcall_func(void *arg0, void *arg1)
{
	dtrace_xcall_t func = arg0;

	(*func)(arg1);
}

void
dtrace_xcall(processorid_t cpu, dtrace_xcall_t func, void *arg)
{
	uint64_t where;

	if (cpu == DTRACE_CPUALL) {
		where = xc_broadcast(0, xcall_func, func, arg);
	} else {
		struct cpu_info *cinfo = cpu_lookup(cpu);

		KASSERT(cinfo != NULL);
		where = xc_unicast(0, xcall_func, func, arg, cinfo);
	}
	xc_wait(where);

	/*
	 * XXX Q. Do we really need the other CPUs to wait also?
	 * (see solaris:xc_sync())
	 */
}

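/*
 * dtrace_sync() is an empty cross-call: by the time every CPU has run
 * the (empty) function, every CPU has also passed through a point where
 * it was not in probe context, which is the barrier the framework needs.
 */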
static void
dtrace_sync_func(void)
{
}

void
dtrace_sync(void)
{
	dtrace_xcall(DTRACE_CPUALL, (dtrace_xcall_t)dtrace_sync_func, NULL);
}

#ifdef notyet
int (*dtrace_fasttrap_probe_ptr)(struct regs *);
int (*dtrace_pid_probe_ptr)(struct regs *);
int (*dtrace_return_probe_ptr)(struct regs *);

void
dtrace_user_probe(struct regs *rp, caddr_t addr, processorid_t cpuid)
{
	krwlock_t *rwp;
	proc_t *p = curproc;
	extern void trap(struct regs *, caddr_t, processorid_t);

	if (USERMODE(rp->r_cs) || (rp->r_ps & PS_VM)) {
		if (curthread->t_cred != p->p_cred) {
			cred_t *oldcred = curthread->t_cred;
			/*
			 * DTrace accesses t_cred in probe context.  t_cred
			 * must always be either NULL, or point to a valid,
			 * allocated cred structure.
			 */
			curthread->t_cred = crgetcred();
			crfree(oldcred);
		}
	}

	if (rp->r_trapno == T_DTRACE_RET) {
		uint8_t step = curthread->t_dtrace_step;
		uint8_t ret = curthread->t_dtrace_ret;
		uintptr_t npc = curthread->t_dtrace_npc;

		if (curthread->t_dtrace_ast) {
			aston(curthread);
			curthread->t_sig_check = 1;
		}

		/*
		 * Clear all user tracing flags.
		 */
		curthread->t_dtrace_ft = 0;

		/*
		 * If we weren't expecting to take a return probe trap, kill
		 * the process as though it had just executed an unassigned
		 * trap instruction.
		 */
		if (step == 0) {
			tsignal(curthread, SIGILL);
			return;
		}

		/*
		 * If we hit this trap unrelated to a return probe, we're
		 * just here to reset the AST flag since we deferred a signal
		 * until after we logically single-stepped the instruction we
		 * copied out.
		 */
		if (ret == 0) {
			rp->r_pc = npc;
			return;
		}

		/*
		 * We need to wait until after we've called the
		 * dtrace_return_probe_ptr function pointer to set %pc.
		 */
		rwp = &CPU->cpu_ft_lock;
		rw_enter(rwp, RW_READER);
		if (dtrace_return_probe_ptr != NULL)
			(void) (*dtrace_return_probe_ptr)(rp);
		rw_exit(rwp);
		rp->r_pc = npc;

	} else if (rp->r_trapno == T_DTRACE_PROBE) {
		rwp = &CPU->cpu_ft_lock;
		rw_enter(rwp, RW_READER);
		if (dtrace_fasttrap_probe_ptr != NULL)
			(void) (*dtrace_fasttrap_probe_ptr)(rp);
		rw_exit(rwp);

	} else if (rp->r_trapno == T_BPTFLT) {
		uint8_t instr;
		rwp = &CPU->cpu_ft_lock;

		/*
		 * The DTrace fasttrap provider uses the breakpoint trap
		 * (int 3). We let DTrace take the first crack at handling
		 * this trap; if it's not a probe that DTrace knows about,
		 * we call into the trap() routine to handle it like a
		 * breakpoint placed by a conventional debugger.
		 */
		rw_enter(rwp, RW_READER);
		if (dtrace_pid_probe_ptr != NULL &&
		    (*dtrace_pid_probe_ptr)(rp) == 0) {
			rw_exit(rwp);
			return;
		}
		rw_exit(rwp);

		/*
		 * If the instruction that caused the breakpoint trap doesn't
		 * look like an int 3 anymore, it may be that this tracepoint
		 * was removed just after the user thread executed it. In
		 * that case, return to user land to retry the instruction.
		 */
		if (fuword8((void *)(rp->r_pc - 1), &instr) == 0 &&
		    instr != FASTTRAP_INSTR) {
			rp->r_pc--;
			return;
		}

		trap(rp, addr, cpuid);

	} else {
		trap(rp, addr, cpuid);
	}
}

void
dtrace_safe_synchronous_signal(void)
{
	kthread_t *t = curthread;
	struct regs *rp = lwptoregs(ttolwp(t));
	size_t isz = t->t_dtrace_npc - t->t_dtrace_pc;

	ASSERT(t->t_dtrace_on);

	/*
	 * If we're not in the range of scratch addresses, we're not actually
	 * tracing user instructions so turn off the flags. If the instruction
	 * we copied out caused a synchronous trap, reset the pc back to its
	 * original value and turn off the flags.
	 */
	if (rp->r_pc < t->t_dtrace_scrpc ||
	    rp->r_pc > t->t_dtrace_astpc + isz) {
		t->t_dtrace_ft = 0;
	} else if (rp->r_pc == t->t_dtrace_scrpc ||
	    rp->r_pc == t->t_dtrace_astpc) {
		rp->r_pc = t->t_dtrace_pc;
		t->t_dtrace_ft = 0;
	}
}

int
dtrace_safe_defer_signal(void)
{
	kthread_t *t = curthread;
	struct regs *rp = lwptoregs(ttolwp(t));
	size_t isz = t->t_dtrace_npc - t->t_dtrace_pc;

	ASSERT(t->t_dtrace_on);

	/*
	 * If we're not in the range of scratch addresses, we're not actually
	 * tracing user instructions so turn off the flags.
	 */
	if (rp->r_pc < t->t_dtrace_scrpc ||
	    rp->r_pc > t->t_dtrace_astpc + isz) {
		t->t_dtrace_ft = 0;
		return (0);
	}

	/*
	 * If we've executed the original instruction, but haven't performed
	 * the jmp back to t->t_dtrace_npc or the clean up of any registers
	 * used to emulate %rip-relative instructions in 64-bit mode, do that
	 * here and take the signal right away. We detect this condition by
	 * seeing if the program counter is in the range [scrpc + isz, astpc).
	 */
	if (t->t_dtrace_astpc - rp->r_pc <
	    t->t_dtrace_astpc - t->t_dtrace_scrpc - isz) {
#ifdef __amd64
		/*
		 * If there is a scratch register and we're on the
		 * instruction immediately after the modified instruction,
		 * restore the value of that scratch register.
		 */
		if (t->t_dtrace_reg != 0 &&
		    rp->r_pc == t->t_dtrace_scrpc + isz) {
			switch (t->t_dtrace_reg) {
			case REG_RAX:
				rp->r_rax = t->t_dtrace_regv;
				break;
			case REG_RCX:
				rp->r_rcx = t->t_dtrace_regv;
				break;
			case REG_R8:
				rp->r_r8 = t->t_dtrace_regv;
				break;
			case REG_R9:
				rp->r_r9 = t->t_dtrace_regv;
				break;
			}
		}
#endif
		rp->r_pc = t->t_dtrace_npc;
		t->t_dtrace_ft = 0;
		return (0);
	}

	/*
	 * Otherwise, make sure we'll return to the kernel after executing
	 * the copied out instruction and defer the signal.
	 */
	if (!t->t_dtrace_step) {
		ASSERT(rp->r_pc < t->t_dtrace_astpc);
		rp->r_pc += t->t_dtrace_astpc - t->t_dtrace_scrpc;
		t->t_dtrace_step = 1;
	}

	t->t_dtrace_ast = 1;

	return (1);
}
#endif

#if 0
static int64_t	tgt_cpu_tsc;
static int64_t	hst_cpu_tsc;
#endif
static int64_t	tsc_skew[MAXCPUS];
static uint64_t	nsec_scale;

/* See below for the explanation of this macro. */
#define SCALE_SHIFT	28

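/*
 * Read the timestamp counter.  The "=A" constraint returns the 64-bit
 * result in the edx:eax register pair, which is where the i386 rdtsc
 * instruction leaves it.
 */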
static __inline uint64_t
dtrace_rdtsc(void)
{
	uint64_t rv;

	__asm __volatile("rdtsc" : "=A" (rv));
	return (rv);
}

#if 0
static void
dtrace_gethrtime_init_sync(void *arg)
{
#ifdef CHECK_SYNC
	/*
	 * Delay this function from returning on one
	 * of the CPUs to check that the synchronisation
	 * works.
	 */
	uintptr_t cpu = (uintptr_t) arg;

	if (cpu == curcpu) {
		int i;
		for (i = 0; i < 1000000000; i++)
			tgt_cpu_tsc = dtrace_rdtsc();
		tgt_cpu_tsc = 0;
	}
#endif
}
#endif

#if 0
static void
dtrace_gethrtime_init_cpu(void *arg)
{
	uintptr_t cpu = (uintptr_t) arg;

	if (cpu == cpu_number())
		tgt_cpu_tsc = dtrace_rdtsc();
	else
		hst_cpu_tsc = dtrace_rdtsc();
}
#endif

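/*
 * Compute nsec_scale and collect the per-CPU TSC skews used by
 * dtrace_gethrtime(); must run before the first probe can fire.
 */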
void
dtrace_gethrtime_init(void *arg)
{
	uint64_t tsc_f;
	CPU_INFO_ITERATOR cpuind;
	struct cpu_info *cinfo = curcpu();
	cpuid_t cur_cpuid = cpu_number();	/* current cpu id */

	/*
	 * Get the TSC frequency known at this moment.
	 * This should be constant if the TSC is invariant.
	 * Otherwise the tick->time conversion will be inaccurate, but
	 * it will preserve the monotonic property of the TSC.
	 */
	tsc_f = cpu_frequency(cinfo);

	/*
	 * The following line checks that nsec_scale calculated below
	 * doesn't overflow a 32-bit unsigned integer, so that it can be
	 * multiplied by another 32-bit integer without overflowing 64 bits.
	 * Thus the minimum supported TSC frequency is 62.5MHz.
	 */
	KASSERT(tsc_f > (NANOSEC >> (32 - SCALE_SHIFT)));

	/*
	 * We scale up the NANOSEC/tsc_f ratio to preserve as much precision
	 * as possible.
	 * The 2^28 factor was chosen quite arbitrarily from practical
	 * considerations:
	 * - it supports TSC frequencies as low as 62.5MHz (see above);
	 * - it provides quite good precision (e < 0.01%) up to THz
	 *   (terahertz) values.
	 */
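	/*
	 * Worked example (illustrative numbers, not a measured value):
	 * for tsc_f = 2GHz, nsec_scale = (10^9 << 28) / (2 * 10^9) = 2^27,
	 * so dtrace_gethrtime() computes (ticks * 2^27) >> 28 = ticks / 2,
	 * i.e. one nanosecond for every two TSC ticks, as expected.
	 */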
	nsec_scale = ((uint64_t)NANOSEC << SCALE_SHIFT) / tsc_f;

	/* The current CPU is the reference one. */
	tsc_skew[cur_cpuid] = 0;

	for (CPU_INFO_FOREACH(cpuind, cinfo)) {
		/* use skew relative to cpu 0 */
		tsc_skew[cpu_index(cinfo)] = cinfo->ci_data.cpu_cc_skew;
	}

	/* Already handled in x86/tsc.c for ci_data.cpu_cc_skew */
#if 0
	for (i = 0; i <= mp_maxid; i++) {
		if (i == curcpu)
			continue;

		if (pcpu_find(i) == NULL)
			continue;

		map = 0;
		map |= (1 << curcpu);
		map |= (1 << i);

		smp_rendezvous_cpus(map, dtrace_gethrtime_init_sync,
		    dtrace_gethrtime_init_cpu,
		    smp_no_rendevous_barrier, (void *)(uintptr_t) i);

		tsc_skew[i] = tgt_cpu_tsc - hst_cpu_tsc;
	}
#endif
}

/*
 * DTrace needs a high resolution time function which can be
 * called from a probe context and is guaranteed not to have
 * been instrumented with probes itself.
 *
 * Returns nanoseconds since boot.
 */
uint64_t
dtrace_gethrtime(void)
{
	uint64_t tsc;
	uint32_t lo;
	uint32_t hi;

	/*
	 * We split the TSC value into lower and higher 32-bit halves and
	 * scale each separately with nsec_scale, then scale the results
	 * down by 2^28 (see the nsec_scale calculation), taking into
	 * account the 32-bit shift of the higher half, and finally add.
	 */
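	/*
	 * In other words, the expression below computes
	 * (tsc * nsec_scale) >> SCALE_SHIFT without a 64x64-bit overflow:
	 * the high half contributes hi * 2^32 ticks, and
	 * ((hi * 2^32) * nsec_scale) >> 28 == (hi * nsec_scale) << 4.
	 */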
	tsc = dtrace_rdtsc() + tsc_skew[cpu_number()];
	lo = tsc;
	hi = tsc >> 32;
	return (((lo * nsec_scale) >> SCALE_SHIFT) +
	    ((hi * nsec_scale) << (32 - SCALE_SHIFT)));
}

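/*
 * Wall-clock counterpart of dtrace_gethrtime(); not implemented yet,
 * so it just logs the call and returns zero.
 */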
uint64_t
dtrace_gethrestime(void)
{
	printf("%s(%d): XXX\n", __func__, __LINE__);
	return (0);
}

/* Function to handle DTrace traps during probes. See i386/i386/trap.c */
int
dtrace_trap(struct trapframe *frame, u_int type)
{
	cpuid_t cpuid = cpu_number();	/* current cpu id */

	/*
	 * A trap can occur while DTrace executes a probe. Before
	 * executing the probe, DTrace blocks re-scheduling and sets
	 * a flag in its per-cpu flags to indicate that it doesn't
	 * want to fault. On returning from the probe, the no-fault
	 * flag is cleared and finally re-scheduling is enabled.
	 *
	 * Check if DTrace has enabled 'no-fault' mode:
	 */
	if ((cpu_core[cpuid].cpuc_dtrace_flags & CPU_DTRACE_NOFAULT) != 0) {
		/*
		 * There are only a couple of trap types that are expected.
		 * All the rest will be handled in the usual way.
		 */
		switch (type) {
		/* General protection fault. */
		case T_PROTFLT:
			/* Flag an illegal operation. */
			cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;

			/*
			 * Offset the instruction pointer to the instruction
			 * following the one causing the fault.
			 */
			frame->tf_eip += dtrace_instr_size((u_char *) frame->tf_eip);
			return (1);
		/* Page fault. */
		case T_PAGEFLT:
			/* Flag a bad address. */
			cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_BADADDR;
			cpu_core[cpuid].cpuc_dtrace_illval = rcr2();

			/*
			 * Offset the instruction pointer to the instruction
			 * following the one causing the fault.
			 */
			frame->tf_eip += dtrace_instr_size((u_char *) frame->tf_eip);
			return (1);
		default:
			/* Handle all other traps in the usual way. */
			break;
		}
	}

	/* Handle the trap in the usual way. */
	return (0);
}