xref: /illumos-gate/usr/src/uts/common/os/kcpc.c (revision 06e1a714)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 
23 /*
24  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
25  * Use is subject to license terms.
26  */
27 
28 #pragma ident	"%Z%%M%	%I%	%E% SMI"
29 
30 #include <sys/param.h>
31 #include <sys/thread.h>
32 #include <sys/cpuvar.h>
33 #include <sys/inttypes.h>
34 #include <sys/cmn_err.h>
35 #include <sys/time.h>
36 #include <sys/mutex.h>
37 #include <sys/systm.h>
38 #include <sys/kcpc.h>
39 #include <sys/cpc_impl.h>
40 #include <sys/cpc_pcbe.h>
41 #include <sys/atomic.h>
42 #include <sys/sunddi.h>
43 #include <sys/modctl.h>
44 #include <sys/sdt.h>
45 #if defined(__x86)
46 #include <asm/clock.h>
47 #endif
48 
49 kmutex_t	kcpc_ctx_llock[CPC_HASH_BUCKETS];	/* protects ctx_list */
50 kcpc_ctx_t	*kcpc_ctx_list[CPC_HASH_BUCKETS];	/* head of list */
51 
52 
53 krwlock_t	kcpc_cpuctx_lock;	/* lock for 'kcpc_cpuctx' below */
54 int		kcpc_cpuctx;		/* number of cpu-specific contexts */
55 
56 int kcpc_counts_include_idle = 1; /* Project Private /etc/system variable */
57 
58 /*
59  * These are set when a PCBE module is loaded.
60  */
61 uint_t		cpc_ncounters = 0;
62 pcbe_ops_t	*pcbe_ops = NULL;
63 
64 /*
65  * Statistics on (mis)behavior
66  */
67 static uint32_t kcpc_intrctx_count;    /* # overflows in an interrupt handler */
68 static uint32_t kcpc_nullctx_count;    /* # overflows in a thread with no ctx */
69 
70 /*
71  * Is misbehaviour (overflow in a thread with no context) fatal?
72  */
73 #ifdef DEBUG
74 static int kcpc_nullctx_panic = 1;
75 #else
76 static int kcpc_nullctx_panic = 0;
77 #endif
78 
79 static void kcpc_lwp_create(kthread_t *t, kthread_t *ct);
80 static void kcpc_restore(kcpc_ctx_t *ctx);
81 static void kcpc_save(kcpc_ctx_t *ctx);
82 static void kcpc_free(kcpc_ctx_t *ctx, int isexec);
83 static int kcpc_configure_reqs(kcpc_ctx_t *ctx, kcpc_set_t *set, int *subcode);
84 static void kcpc_free_configs(kcpc_set_t *set);
85 static kcpc_ctx_t *kcpc_ctx_alloc(void);
86 static void kcpc_ctx_clone(kcpc_ctx_t *ctx, kcpc_ctx_t *cctx);
87 static void kcpc_ctx_free(kcpc_ctx_t *ctx);
88 static int kcpc_assign_reqs(kcpc_set_t *set, kcpc_ctx_t *ctx);
89 static int kcpc_tryassign(kcpc_set_t *set, int starting_req, int *scratch);
90 static kcpc_set_t *kcpc_dup_set(kcpc_set_t *set);
91 
92 void
93 kcpc_register_pcbe(pcbe_ops_t *ops)
94 {
95 	pcbe_ops = ops;
96 	cpc_ncounters = pcbe_ops->pcbe_ncounters();
97 }
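/*
 * For illustration, a hypothetical backend would be registered roughly as
 * follows once its module is installed; "example_pcbe_ops" is a made-up name
 * and the contents of the ops vector are backend-specific:
 *
 *	static pcbe_ops_t example_pcbe_ops = {
 *		...
 *	};
 *
 *	kcpc_register_pcbe(&example_pcbe_ops);
 *
 * After registration, pcbe_ops points at the backend's ops vector and
 * cpc_ncounters holds the count returned by its pcbe_ncounters() entry point.
 */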
98 
99 int
100 kcpc_bind_cpu(kcpc_set_t *set, processorid_t cpuid, int *subcode)
101 {
102 	cpu_t		*cp;
103 	kcpc_ctx_t	*ctx;
104 	int		error;
105 
106 	ctx = kcpc_ctx_alloc();
107 
108 	if (kcpc_assign_reqs(set, ctx) != 0) {
109 		kcpc_ctx_free(ctx);
110 		*subcode = CPC_RESOURCE_UNAVAIL;
111 		return (EINVAL);
112 	}
113 
114 	ctx->kc_cpuid = cpuid;
115 	ctx->kc_thread = curthread;
116 
117 	set->ks_data = kmem_zalloc(set->ks_nreqs * sizeof (uint64_t), KM_SLEEP);
118 
119 	if ((error = kcpc_configure_reqs(ctx, set, subcode)) != 0) {
120 		kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
121 		kcpc_ctx_free(ctx);
122 		return (error);
123 	}
124 
125 	set->ks_ctx = ctx;
126 	ctx->kc_set = set;
127 
128 	/*
129 	 * We must hold cpu_lock to prevent DR, offlining, or unbinding while
130 	 * we are manipulating the cpu_t and programming the hardware, else
131 	 * the cpu_t could go away while we're looking at it.
132 	 */
133 	mutex_enter(&cpu_lock);
134 	cp = cpu_get(cpuid);
135 
136 	if (cp == NULL)
137 		/*
138 		 * The CPU could have been DRd out while we were getting set up.
139 		 */
140 		goto unbound;
141 
142 	mutex_enter(&cp->cpu_cpc_ctxlock);
143 
144 	if (cp->cpu_cpc_ctx != NULL) {
145 		/*
146 		 * If this CPU already has a bound set, return an error.
147 		 */
148 		mutex_exit(&cp->cpu_cpc_ctxlock);
149 		goto unbound;
150 	}
151 
152 	if (curthread->t_bind_cpu != cpuid) {
153 		mutex_exit(&cp->cpu_cpc_ctxlock);
154 		goto unbound;
155 	}
156 	cp->cpu_cpc_ctx = ctx;
157 
158 	/*
159 	 * Kernel preemption must be disabled while fiddling with the hardware
160 	 * registers to prevent partial updates.
161 	 */
162 	kpreempt_disable();
163 	ctx->kc_rawtick = KCPC_GET_TICK();
164 	pcbe_ops->pcbe_program(ctx);
165 	kpreempt_enable();
166 
167 	mutex_exit(&cp->cpu_cpc_ctxlock);
168 	mutex_exit(&cpu_lock);
169 
170 	return (0);
171 
172 unbound:
173 	mutex_exit(&cpu_lock);
174 	set->ks_ctx = NULL;
175 	kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
176 	kcpc_ctx_free(ctx);
177 	return (EAGAIN);
178 }
179 
180 int
181 kcpc_bind_thread(kcpc_set_t *set, kthread_t *t, int *subcode)
182 {
183 	kcpc_ctx_t	*ctx;
184 	int		error;
185 
186 	/*
187 	 * Only one set is allowed per context, and only one context per
188 	 * thread, so ensure this thread does not already have a context.
189 	 */
190 
191 	if (t->t_cpc_ctx != NULL)
192 		return (EEXIST);
193 
194 	ctx = kcpc_ctx_alloc();
195 
196 	/*
197 	 * The context must begin life frozen until it has been properly
198 	 * programmed onto the hardware. This prevents the context ops from
199 	 * worrying about it until we're ready.
200 	 */
201 	ctx->kc_flags |= KCPC_CTX_FREEZE;
202 	ctx->kc_hrtime = gethrtime();
203 
204 	if (kcpc_assign_reqs(set, ctx) != 0) {
205 		kcpc_ctx_free(ctx);
206 		*subcode = CPC_RESOURCE_UNAVAIL;
207 		return (EINVAL);
208 	}
209 
210 	ctx->kc_cpuid = -1;
211 	if (set->ks_flags & CPC_BIND_LWP_INHERIT)
212 		ctx->kc_flags |= KCPC_CTX_LWPINHERIT;
213 	ctx->kc_thread = t;
214 	t->t_cpc_ctx = ctx;
215 	/*
216 	 * Permit threads to look at their own hardware counters from userland.
217 	 */
218 	ctx->kc_flags |= KCPC_CTX_NONPRIV;
219 
220 	/*
221 	 * Create the data store for this set.
222 	 */
223 	set->ks_data = kmem_alloc(set->ks_nreqs * sizeof (uint64_t), KM_SLEEP);
224 
225 	if ((error = kcpc_configure_reqs(ctx, set, subcode)) != 0) {
226 		kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
227 		kcpc_ctx_free(ctx);
228 		t->t_cpc_ctx = NULL;
229 		return (error);
230 	}
231 
232 	set->ks_ctx = ctx;
233 	ctx->kc_set = set;
234 
235 	/*
236 	 * Add a device context to the subject thread.
237 	 */
238 	installctx(t, ctx, kcpc_save, kcpc_restore, NULL,
239 	    kcpc_lwp_create, NULL, kcpc_free);
240 
241 	/*
242 	 * Ask the backend to program the hardware.
243 	 */
244 	if (t == curthread) {
245 		kpreempt_disable();
246 		ctx->kc_rawtick = KCPC_GET_TICK();
247 		atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
248 		pcbe_ops->pcbe_program(ctx);
249 		kpreempt_enable();
250 	} else
251 		/*
252 		 * Since we are the agent LWP, we know the victim LWP is stopped
253 		 * until we're done here; no need to worry about preemption or
254 		 * migration here. We still use an atomic op to clear the flag
255 		 * to ensure the flags are always self-consistent; they can
256 		 * still be accessed from, for instance, another CPU doing a
257 		 * kcpc_invalidate_all().
258 		 */
259 		atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
260 
261 
262 	return (0);
263 }
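/*
 * A rough sketch of how a caller might drive a thread-bound set; the set
 * itself is normally built, and t_cpc_set maintained, by the cpc driver code
 * layered above this file, and "ubuf", "uhr" and "utick" stand in for user
 * addresses supplied by that caller:
 *
 *	int subcode;
 *
 *	if (kcpc_bind_thread(set, curthread, &subcode) == 0) {
 *		curthread->t_cpc_set = set;
 *		...
 *		(void) kcpc_sample(set, ubuf, uhr, utick);
 *		...
 *		(void) kcpc_unbind(set);
 *	}
 */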
264 
265 /*
266  * Walk through each request in the set and ask the PCBE to configure a
267  * corresponding counter.
268  */
269 static int
270 kcpc_configure_reqs(kcpc_ctx_t *ctx, kcpc_set_t *set, int *subcode)
271 {
272 	int		i;
273 	int		ret;
274 	kcpc_request_t	*rp;
275 
276 	for (i = 0; i < set->ks_nreqs; i++) {
277 		int n;
278 		rp = &set->ks_req[i];
279 
280 		n = rp->kr_picnum;
281 
282 		ASSERT(n >= 0 && n < cpc_ncounters);
283 
284 		ASSERT(ctx->kc_pics[n].kp_req == NULL);
285 
286 		if (rp->kr_flags & CPC_OVF_NOTIFY_EMT) {
287 			if ((pcbe_ops->pcbe_caps & CPC_CAP_OVERFLOW_INTERRUPT)
288 			    == 0) {
289 				*subcode = -1;
290 				return (ENOTSUP);
291 			}
292 			/*
293 			 * If any of the counters have requested overflow
294 			 * notification, we flag the context as being one that
295 			 * cares about overflow.
296 			 */
297 			ctx->kc_flags |= KCPC_CTX_SIGOVF;
298 		}
299 
300 		rp->kr_config = NULL;
301 		if ((ret = pcbe_ops->pcbe_configure(n, rp->kr_event,
302 		    rp->kr_preset, rp->kr_flags, rp->kr_nattrs, rp->kr_attr,
303 		    &(rp->kr_config), (void *)ctx)) != 0) {
304 			kcpc_free_configs(set);
305 			*subcode = ret;
306 			if (ret == CPC_ATTR_REQUIRES_PRIVILEGE)
307 				return (EACCES);
308 			return (EINVAL);
309 		}
310 
311 		ctx->kc_pics[n].kp_req = rp;
312 		rp->kr_picp = &ctx->kc_pics[n];
313 		rp->kr_data = set->ks_data + rp->kr_index;
314 		*rp->kr_data = rp->kr_preset;
315 	}
316 
317 	return (0);
318 }
319 
320 static void
321 kcpc_free_configs(kcpc_set_t *set)
322 {
323 	int i;
324 
325 	for (i = 0; i < set->ks_nreqs; i++)
326 		if (set->ks_req[i].kr_config != NULL)
327 			pcbe_ops->pcbe_free(set->ks_req[i].kr_config);
328 }
329 
330 /*
331  * buf points to a user address and the data should be copied out to that
332  * address in the current process.
333  */
334 int
335 kcpc_sample(kcpc_set_t *set, uint64_t *buf, hrtime_t *hrtime, uint64_t *tick)
336 {
337 	kcpc_ctx_t	*ctx = set->ks_ctx;
338 	uint64_t	curtick = KCPC_GET_TICK();
339 
340 	if (ctx == NULL)
341 		return (EINVAL);
342 	else if (ctx->kc_flags & KCPC_CTX_INVALID)
343 		return (EAGAIN);
344 
345 	if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0) {
346 		/*
347 		 * Kernel preemption must be disabled while reading the
348 		 * hardware regs, and if this is a CPU-bound context, while
349 		 * checking the CPU binding of the current thread.
350 		 */
351 		kpreempt_disable();
352 
353 		if (ctx->kc_cpuid != -1) {
354 			if (curthread->t_bind_cpu != ctx->kc_cpuid) {
355 				kpreempt_enable();
356 				return (EAGAIN);
357 			}
358 		}
359 
360 		if (ctx->kc_thread == curthread) {
361 			ctx->kc_hrtime = gethrtime();
362 			pcbe_ops->pcbe_sample(ctx);
363 			ctx->kc_vtick += curtick - ctx->kc_rawtick;
364 			ctx->kc_rawtick = curtick;
365 		}
366 
367 		kpreempt_enable();
368 	}
369 
370 	if (copyout(set->ks_data, buf,
371 	    set->ks_nreqs * sizeof (uint64_t)) == -1)
372 		return (EFAULT);
373 	if (copyout(&ctx->kc_hrtime, hrtime, sizeof (uint64_t)) == -1)
374 		return (EFAULT);
375 	if (copyout(&ctx->kc_vtick, tick, sizeof (uint64_t)) == -1)
376 		return (EFAULT);
377 
378 	return (0);
379 }
380 
381 /*
382  * Stop the counters on the CPU this context is bound to.
383  */
384 static void
385 kcpc_stop_hw(kcpc_ctx_t *ctx)
386 {
387 	cpu_t *cp;
388 
389 	ASSERT((ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED))
390 	    == KCPC_CTX_INVALID);
391 
392 	kpreempt_disable();
393 
394 	cp = cpu_get(ctx->kc_cpuid);
395 	ASSERT(cp != NULL);
396 
397 	if (cp == CPU) {
398 		pcbe_ops->pcbe_allstop();
399 		atomic_or_uint(&ctx->kc_flags,
400 		    KCPC_CTX_INVALID_STOPPED);
401 	} else
402 		kcpc_remote_stop(cp);
403 	kpreempt_enable();
404 }
405 
406 int
407 kcpc_unbind(kcpc_set_t *set)
408 {
409 	kcpc_ctx_t	*ctx = set->ks_ctx;
410 	kthread_t	*t;
411 
412 	if (ctx == NULL)
413 		return (EINVAL);
414 
415 	atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID);
416 
417 	if (ctx->kc_cpuid == -1) {
418 		t = ctx->kc_thread;
419 		/*
420 		 * The context is thread-bound and therefore has a device
421 		 * context.  It will be freed via removectx() calling
422 		 * freectx() calling kcpc_free().
423 		 */
424 		if (t == curthread &&
425 		    (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0) {
426 			kpreempt_disable();
427 			pcbe_ops->pcbe_allstop();
428 			atomic_or_uint(&ctx->kc_flags,
429 			    KCPC_CTX_INVALID_STOPPED);
430 			kpreempt_enable();
431 		}
432 #ifdef DEBUG
433 		if (removectx(t, ctx, kcpc_save, kcpc_restore, NULL,
434 		    kcpc_lwp_create, NULL, kcpc_free) == 0)
435 			panic("kcpc_unbind: context %p not present on "
436 			    "thread %p", (void *)ctx, (void *)t);
437 #else
438 		(void) removectx(t, ctx, kcpc_save, kcpc_restore, NULL,
439 		    kcpc_lwp_create, NULL, kcpc_free);
440 #endif /* DEBUG */
441 		t->t_cpc_set = NULL;
442 		t->t_cpc_ctx = NULL;
443 	} else {
444 		/*
445 		 * If we are unbinding a CPU-bound set from a remote CPU, the
446 		 * native CPU's idle thread could be in the midst of programming
447 		 * this context onto the CPU. We grab the context's lock here to
448 		 * ensure that the idle thread is done with it. When we release
449 		 * the lock, the CPU no longer has a context and the idle thread
450 		 * will move on.
451 		 *
452 		 * cpu_lock must be held to prevent the CPU from being DR'd out
453 		 * while we disassociate the context from the cpu_t.
454 		 */
455 		cpu_t *cp;
456 		mutex_enter(&cpu_lock);
457 		cp = cpu_get(ctx->kc_cpuid);
458 		if (cp != NULL) {
459 			/*
460 			 * The CPU may have been DR'd out of the system.
461 			 */
462 			mutex_enter(&cp->cpu_cpc_ctxlock);
463 			if ((ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0)
464 				kcpc_stop_hw(ctx);
465 			ASSERT(ctx->kc_flags & KCPC_CTX_INVALID_STOPPED);
466 			cp->cpu_cpc_ctx = NULL;
467 			mutex_exit(&cp->cpu_cpc_ctxlock);
468 		}
469 		mutex_exit(&cpu_lock);
470 		if (ctx->kc_thread == curthread) {
471 			kcpc_free(ctx, 0);
472 			curthread->t_cpc_set = NULL;
473 		}
474 	}
475 
476 	return (0);
477 }
478 
479 int
480 kcpc_preset(kcpc_set_t *set, int index, uint64_t preset)
481 {
482 	int i;
483 
484 	ASSERT(set != NULL);
485 	ASSERT(set->ks_ctx != NULL);
486 	ASSERT(set->ks_ctx->kc_thread == curthread);
487 	ASSERT(set->ks_ctx->kc_cpuid == -1);
488 
489 	if (index < 0 || index >= set->ks_nreqs)
490 		return (EINVAL);
491 
492 	for (i = 0; i < set->ks_nreqs; i++)
493 		if (set->ks_req[i].kr_index == index)
494 			break;
495 	ASSERT(i != set->ks_nreqs);
496 
497 	set->ks_req[i].kr_preset = preset;
498 	return (0);
499 }
500 
501 int
502 kcpc_restart(kcpc_set_t *set)
503 {
504 	kcpc_ctx_t	*ctx = set->ks_ctx;
505 	int		i;
506 
507 	ASSERT(ctx != NULL);
508 	ASSERT(ctx->kc_thread == curthread);
509 	ASSERT(ctx->kc_cpuid == -1);
510 
511 	kpreempt_disable();
512 
513 	/*
514 	 * If the user is doing this on a running set, make sure the counters
515 	 * are stopped first.
516 	 */
517 	if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0)
518 		pcbe_ops->pcbe_allstop();
519 
520 	for (i = 0; i < set->ks_nreqs; i++) {
521 		*(set->ks_req[i].kr_data) = set->ks_req[i].kr_preset;
522 		pcbe_ops->pcbe_configure(0, NULL, set->ks_req[i].kr_preset,
523 		    0, 0, NULL, &set->ks_req[i].kr_config, NULL);
524 	}
525 
526 	/*
527 	 * Ask the backend to program the hardware.
528 	 */
529 	ctx->kc_rawtick = KCPC_GET_TICK();
530 	atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
531 	pcbe_ops->pcbe_program(ctx);
532 	kpreempt_enable();
533 
534 	return (0);
535 }
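/*
 * A rough sketch of how a thread-bound consumer might re-run its set from a
 * known starting value using the two routines above; request index 0 and the
 * preset value of 0 are arbitrary examples:
 *
 *	(void) kcpc_preset(set, 0, 0ULL);
 *	(void) kcpc_restart(set);
 *
 * Both routines assert that the set is bound to curthread and is not
 * CPU-bound.
 */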
536 
537 /*
538  * Caller must hold kcpc_cpuctx_lock.
539  */
540 int
541 kcpc_enable(kthread_t *t, int cmd, int enable)
542 {
543 	kcpc_ctx_t	*ctx = t->t_cpc_ctx;
544 	kcpc_set_t	*set = t->t_cpc_set;
545 	kcpc_set_t	*newset;
546 	int		i;
547 	int		flag;
548 	int		err;
549 
550 	ASSERT(RW_READ_HELD(&kcpc_cpuctx_lock));
551 
552 	if (ctx == NULL) {
553 		/*
554 		 * This thread has a set but no context; it must be a
555 		 * CPU-bound set.
556 		 */
557 		ASSERT(t->t_cpc_set != NULL);
558 		ASSERT(t->t_cpc_set->ks_ctx->kc_cpuid != -1);
559 		return (EINVAL);
560 	} else if (ctx->kc_flags & KCPC_CTX_INVALID)
561 		return (EAGAIN);
562 
563 	if (cmd == CPC_ENABLE) {
564 		if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0)
565 			return (EINVAL);
566 		kpreempt_disable();
567 		atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
568 		kcpc_restore(ctx);
569 		kpreempt_enable();
570 	} else if (cmd == CPC_DISABLE) {
571 		if (ctx->kc_flags & KCPC_CTX_FREEZE)
572 			return (EINVAL);
573 		kpreempt_disable();
574 		kcpc_save(ctx);
575 		atomic_or_uint(&ctx->kc_flags, KCPC_CTX_FREEZE);
576 		kpreempt_enable();
577 	} else if (cmd == CPC_USR_EVENTS || cmd == CPC_SYS_EVENTS) {
578 		/*
579 		 * Strategy for usr/sys: stop counters and update set's presets
580 		 * with current counter values, unbind, update requests with
581 		 * new config, then re-bind.
582 		 */
583 		flag = (cmd == CPC_USR_EVENTS) ?
584 		    CPC_COUNT_USER : CPC_COUNT_SYSTEM;
585 
586 		kpreempt_disable();
587 		atomic_or_uint(&ctx->kc_flags,
588 		    KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED);
589 		pcbe_ops->pcbe_allstop();
590 		kpreempt_enable();
591 		for (i = 0; i < set->ks_nreqs; i++) {
592 			set->ks_req[i].kr_preset = *(set->ks_req[i].kr_data);
593 			if (enable)
594 				set->ks_req[i].kr_flags |= flag;
595 			else
596 				set->ks_req[i].kr_flags &= ~flag;
597 		}
598 		newset = kcpc_dup_set(set);
599 		if (kcpc_unbind(set) != 0)
600 			return (EINVAL);
601 		t->t_cpc_set = newset;
602 		if (kcpc_bind_thread(newset, t, &err) != 0) {
603 			t->t_cpc_set = NULL;
604 			kcpc_free_set(newset);
605 			return (EINVAL);
606 		}
607 	} else
608 		return (EINVAL);
609 
610 	return (0);
611 }
612 
613 /*
614  * Provide PCBEs with a way of obtaining the configs of every counter which will
615  * be programmed together.
616  *
617  * If current is NULL, provide the first config.
618  *
619  * If data != NULL, caller wants to know where the data store associated with
620  * the config we return is located.
621  */
622 void *
623 kcpc_next_config(void *token, void *current, uint64_t **data)
624 {
625 	int		i;
626 	kcpc_pic_t	*pic;
627 	kcpc_ctx_t *ctx = (kcpc_ctx_t *)token;
628 
629 	if (current == NULL) {
630 		/*
631 		 * Client would like the first config, which may not be in
632 		 * counter 0; we need to search through the counters for the
633 		 * first config.
634 		 */
635 		for (i = 0; i < cpc_ncounters; i++)
636 			if (ctx->kc_pics[i].kp_req != NULL)
637 				break;
638 		/*
639 		 * There are no counters configured for the given context.
640 		 */
641 		if (i == cpc_ncounters)
642 			return (NULL);
643 	} else {
644 		/*
645 		 * There surely is a faster way to do this.
646 		 */
647 		for (i = 0; i < cpc_ncounters; i++) {
648 			pic = &ctx->kc_pics[i];
649 
650 			if (pic->kp_req != NULL &&
651 			    current == pic->kp_req->kr_config)
652 				break;
653 		}
654 
655 		/*
656 		 * We found the current config at picnum i. Now search for the
657 		 * next configured PIC.
658 		 */
659 		for (i++; i < cpc_ncounters; i++) {
660 			pic = &ctx->kc_pics[i];
661 			if (pic->kp_req != NULL)
662 				break;
663 		}
664 
665 		if (i == cpc_ncounters)
666 			return (NULL);
667 	}
668 
669 	if (data != NULL) {
670 		*data = ctx->kc_pics[i].kp_req->kr_data;
671 	}
672 
673 	return (ctx->kc_pics[i].kp_req->kr_config);
674 }
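/*
 * A rough sketch of how a backend might walk every configuration for a
 * context using the interface above; "token" is the context pointer handed to
 * pcbe_configure(), and example_program_one() stands in for whatever
 * backend-specific programming is required:
 *
 *	uint64_t	*data;
 *	void		*cfg;
 *
 *	for (cfg = kcpc_next_config(token, NULL, &data); cfg != NULL;
 *	    cfg = kcpc_next_config(token, cfg, &data))
 *		example_program_one(cfg, data);
 */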
675 
676 
677 static kcpc_ctx_t *
678 kcpc_ctx_alloc(void)
679 {
680 	kcpc_ctx_t	*ctx;
681 	long		hash;
682 
683 	ctx = (kcpc_ctx_t *)kmem_alloc(sizeof (kcpc_ctx_t), KM_SLEEP);
684 
685 	hash = CPC_HASH_CTX(ctx);
686 	mutex_enter(&kcpc_ctx_llock[hash]);
687 	ctx->kc_next = kcpc_ctx_list[hash];
688 	kcpc_ctx_list[hash] = ctx;
689 	mutex_exit(&kcpc_ctx_llock[hash]);
690 
691 	ctx->kc_pics = (kcpc_pic_t *)kmem_zalloc(sizeof (kcpc_pic_t) *
692 	    cpc_ncounters, KM_SLEEP);
693 
694 	ctx->kc_flags = 0;
695 	ctx->kc_vtick = 0;
696 	ctx->kc_rawtick = 0;
697 	ctx->kc_cpuid = -1;
698 
699 	return (ctx);
700 }
701 
702 /*
703  * Copy set from ctx to the child context, cctx, if it has CPC_BIND_LWP_INHERIT
704  * in the flags.
705  */
706 static void
707 kcpc_ctx_clone(kcpc_ctx_t *ctx, kcpc_ctx_t *cctx)
708 {
709 	kcpc_set_t	*ks = ctx->kc_set, *cks;
710 	int		i, j;
711 	int		code;
712 
713 	ASSERT(ks != NULL);
714 
715 	if ((ks->ks_flags & CPC_BIND_LWP_INHERIT) == 0)
716 		return;
717 
718 	cks = kmem_alloc(sizeof (*cks), KM_SLEEP);
719 	cctx->kc_set = cks;
720 	cks->ks_flags = ks->ks_flags;
721 	cks->ks_nreqs = ks->ks_nreqs;
722 	cks->ks_req = kmem_alloc(cks->ks_nreqs *
723 	    sizeof (kcpc_request_t), KM_SLEEP);
724 	cks->ks_data = kmem_alloc(cks->ks_nreqs * sizeof (uint64_t),
725 	    KM_SLEEP);
726 	cks->ks_ctx = cctx;
727 
728 	for (i = 0; i < cks->ks_nreqs; i++) {
729 		cks->ks_req[i].kr_index = ks->ks_req[i].kr_index;
730 		cks->ks_req[i].kr_picnum = ks->ks_req[i].kr_picnum;
731 		(void) strncpy(cks->ks_req[i].kr_event,
732 		    ks->ks_req[i].kr_event, CPC_MAX_EVENT_LEN);
733 		cks->ks_req[i].kr_preset = ks->ks_req[i].kr_preset;
734 		cks->ks_req[i].kr_flags = ks->ks_req[i].kr_flags;
735 		cks->ks_req[i].kr_nattrs = ks->ks_req[i].kr_nattrs;
736 		if (ks->ks_req[i].kr_nattrs > 0) {
737 			cks->ks_req[i].kr_attr =
738 			    kmem_alloc(ks->ks_req[i].kr_nattrs *
739 				sizeof (kcpc_attr_t), KM_SLEEP);
740 		}
741 		for (j = 0; j < ks->ks_req[i].kr_nattrs; j++) {
742 			(void) strncpy(cks->ks_req[i].kr_attr[j].ka_name,
743 			    ks->ks_req[i].kr_attr[j].ka_name,
744 			    CPC_MAX_ATTR_LEN);
745 			cks->ks_req[i].kr_attr[j].ka_val =
746 			    ks->ks_req[i].kr_attr[j].ka_val;
747 		}
748 	}
749 	if (kcpc_configure_reqs(cctx, cks, &code) != 0)
750 		panic("kcpc_ctx_clone: configure of context %p with set %p "
751 		    "failed with subcode %d", (void *)cctx, (void *)cks, code);
752 }
753 
754 
755 static void
756 kcpc_ctx_free(kcpc_ctx_t *ctx)
757 {
758 	kcpc_ctx_t	**loc;
759 	long		hash = CPC_HASH_CTX(ctx);
760 
761 	mutex_enter(&kcpc_ctx_llock[hash]);
762 	loc = &kcpc_ctx_list[hash];
763 	ASSERT(*loc != NULL);
764 	while (*loc != ctx)
765 		loc = &(*loc)->kc_next;
766 	*loc = ctx->kc_next;
767 	mutex_exit(&kcpc_ctx_llock[hash]);
768 
769 	kmem_free(ctx->kc_pics, cpc_ncounters * sizeof (kcpc_pic_t));
770 	kmem_free(ctx, sizeof (*ctx));
771 }
772 
773 /*
774  * Generic interrupt handler used on hardware that generates
775  * overflow interrupts.
776  *
777  * Note: executed at high-level interrupt context!
778  */
779 /*ARGSUSED*/
780 kcpc_ctx_t *
781 kcpc_overflow_intr(caddr_t arg, uint64_t bitmap)
782 {
783 	kcpc_ctx_t	*ctx;
784 	kthread_t	*t = curthread;
785 	int		i;
786 
787 	/*
788 	 * On both x86 and UltraSPARC, we may deliver the high-level
789 	 * interrupt in kernel mode, just after we've started to run an
790 	 * interrupt thread.  (That's because the hardware helpfully
791 	 * delivers the overflow interrupt some random number of cycles
792 	 * after the instruction that caused the overflow by which time
793 	 * we're in some part of the kernel, not necessarily running on
794 	 * the right thread).
795 	 *
796 	 * Check for this case here -- find the pinned thread
797 	 * that was running when the interrupt went off.
798 	 */
799 	if (t->t_flag & T_INTR_THREAD) {
800 		klwp_t *lwp;
801 
802 		atomic_add_32(&kcpc_intrctx_count, 1);
803 
804 		/*
805 		 * Note that t_lwp is always set to point at the underlying
806 		 * thread, thus this will work in the presence of nested
807 		 * interrupts.
808 		 */
809 		ctx = NULL;
810 		if ((lwp = t->t_lwp) != NULL) {
811 			t = lwptot(lwp);
812 			ctx = t->t_cpc_ctx;
813 		}
814 	} else
815 		ctx = t->t_cpc_ctx;
816 
817 	if (ctx == NULL) {
818 		/*
819 		 * This can easily happen if we're using the counters in
820 		 * "shared" mode, for example, and an overflow interrupt
821 		 * occurs while we are running cpustat.  In that case, the
822 		 * bound thread that has the context that belongs to this
823 		 * CPU is almost certainly sleeping (if it was running on
824 		 * the CPU we'd have found it above), and the actual
825 		 * interrupted thread has no knowledge of performance counters!
826 		 */
827 		ctx = curthread->t_cpu->cpu_cpc_ctx;
828 		if (ctx != NULL) {
829 			/*
830 			 * Return the bound context for this CPU to
831 			 * the interrupt handler so that it can synchronously
832 			 * sample the hardware counters and restart them.
833 			 */
834 			return (ctx);
835 		}
836 
837 		/*
838 		 * As long as the overflow interrupt really is delivered early
839 		 * enough after trapping into the kernel to avoid switching
840 		 * threads, we must always be able to find the cpc context,
841 		 * or something went terribly wrong, i.e. we ended up
842 		 * running a passivated interrupt thread or a kernel
843 		 * thread, or we interrupted idle, all of which are Very Bad.
844 		 */
845 		if (kcpc_nullctx_panic)
846 			panic("null cpc context, thread %p", (void *)t);
847 		atomic_add_32(&kcpc_nullctx_count, 1);
848 	} else if ((ctx->kc_flags & KCPC_CTX_INVALID) == 0) {
849 		/*
850 		 * Schedule an ast to sample the counters, which will
851 		 * propagate any overflow into the virtualized performance
852 		 * counter(s), and may deliver a signal.
853 		 */
854 		ttolwp(t)->lwp_pcb.pcb_flags |= CPC_OVERFLOW;
855 		/*
856 		 * If a counter has overflowed which was counting on behalf of
857 		 * a request which specified CPC_OVF_NOTIFY_EMT, send the
858 		 * process a signal.
859 		 */
860 		for (i = 0; i < cpc_ncounters; i++) {
861 			if (ctx->kc_pics[i].kp_req != NULL &&
862 			    bitmap & (1 << i) &&
863 			    ctx->kc_pics[i].kp_req->kr_flags &
864 			    CPC_OVF_NOTIFY_EMT) {
865 				/*
866 				 * A signal has been requested for this PIC, so
867 				 * freeze the context. The interrupt handler
868 				 * has already stopped the counter hardware.
869 				 */
870 				atomic_or_uint(&ctx->kc_flags, KCPC_CTX_FREEZE);
871 				atomic_or_uint(&ctx->kc_pics[i].kp_flags,
872 				    KCPC_PIC_OVERFLOWED);
873 			}
874 		}
875 		aston(t);
876 	}
877 	return (NULL);
878 }
879 
880 /*
881  * The current thread context had an overflow interrupt; we're
882  * executing here in high-level interrupt context.
883  */
884 /*ARGSUSED*/
885 uint_t
886 kcpc_hw_overflow_intr(caddr_t arg1, caddr_t arg2)
887 {
888 	kcpc_ctx_t	*ctx;
889 	uint64_t	bitmap;
890 
891 	if (pcbe_ops == NULL ||
892 	    (bitmap = pcbe_ops->pcbe_overflow_bitmap()) == 0)
893 		return (DDI_INTR_UNCLAIMED);
894 
895 	/*
896 	 * Prevent any further interrupts.
897 	 */
898 	pcbe_ops->pcbe_allstop();
899 
900 	/*
901 	 * Invoke the "generic" handler.
902 	 *
903 	 * If the interrupt has occurred in the context of an lwp owning
904 	 * the counters, then the handler posts an AST to the lwp to
905 	 * trigger the actual sampling, and optionally deliver a signal or
906 	 * restart the counters, on the way out of the kernel using
907 	 * kcpc_hw_overflow_ast() (see below).
908 	 *
909 	 * On the other hand, if the handler returns the context to us
910 	 * directly, then it means that there are no other threads in
911 	 * the middle of updating it, no AST has been posted, and so we
912 	 * should sample the counters here, and restart them with no
913 	 * further fuss.
914 	 */
915 	if ((ctx = kcpc_overflow_intr(arg1, bitmap)) != NULL) {
916 		uint64_t curtick = KCPC_GET_TICK();
917 
918 		ctx->kc_hrtime = gethrtime_waitfree();
919 		ctx->kc_vtick += curtick - ctx->kc_rawtick;
920 		ctx->kc_rawtick = curtick;
921 		pcbe_ops->pcbe_sample(ctx);
922 		pcbe_ops->pcbe_program(ctx);
923 	}
924 
925 	return (DDI_INTR_CLAIMED);
926 }
927 
928 /*
929  * Called from trap() when processing the ast posted by the high-level
930  * interrupt handler.
931  */
932 int
933 kcpc_overflow_ast()
934 {
935 	kcpc_ctx_t	*ctx = curthread->t_cpc_ctx;
936 	int		i;
937 	int		found = 0;
938 	uint64_t	curtick = KCPC_GET_TICK();
939 
940 	ASSERT(ctx != NULL);	/* Beware of interrupt skid. */
941 
942 	/*
943 	 * An overflow happened: sample the context to ensure that
944 	 * the overflow is propagated into the upper bits of the
945 	 * virtualized 64-bit counter(s).
946 	 */
947 	kpreempt_disable();
948 	ctx->kc_hrtime = gethrtime_waitfree();
949 	pcbe_ops->pcbe_sample(ctx);
950 	kpreempt_enable();
951 
952 	ctx->kc_vtick += curtick - ctx->kc_rawtick;
953 
954 	/*
955 	 * The interrupt handler has marked any pics with KCPC_PIC_OVERFLOWED
956 	 * if that pic generated an overflow and if the request it was counting
957 	 * on behalf of had CPC_OVF_NOTIFY_EMT specified. We go through all
958 	 * pics in the context and clear the KCPC_PIC_OVERFLOWED flags. If we
959 	 * found any overflowed pics, keep the context frozen and return true
960 	 * (thus causing a signal to be sent).
961 	 */
962 	for (i = 0; i < cpc_ncounters; i++) {
963 		if (ctx->kc_pics[i].kp_flags & KCPC_PIC_OVERFLOWED) {
964 			atomic_and_uint(&ctx->kc_pics[i].kp_flags,
965 			    ~KCPC_PIC_OVERFLOWED);
966 			found = 1;
967 		}
968 	}
969 	if (found)
970 		return (1);
971 
972 	/*
973 	 * Otherwise, re-enable the counters and continue life as before.
974 	 */
975 	kpreempt_disable();
976 	atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
977 	pcbe_ops->pcbe_program(ctx);
978 	kpreempt_enable();
979 	return (0);
980 }
981 
982 /*
983  * Called when switching away from current thread.
984  */
985 static void
986 kcpc_save(kcpc_ctx_t *ctx)
987 {
988 	if (ctx->kc_flags & KCPC_CTX_INVALID) {
989 		if (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)
990 			return;
991 		/*
992 		 * This context has been invalidated but the counters have not
993 		 * been stopped. Stop them here and mark the context stopped.
994 		 */
995 		pcbe_ops->pcbe_allstop();
996 		atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID_STOPPED);
997 		return;
998 	}
999 
1000 	pcbe_ops->pcbe_allstop();
1001 	if (ctx->kc_flags & KCPC_CTX_FREEZE)
1002 		return;
1003 
1004 	/*
1005 	 * Need to sample for all reqs into each req's current mpic.
1006 	 */
1007 	ctx->kc_hrtime = gethrtime();
1008 	ctx->kc_vtick += KCPC_GET_TICK() - ctx->kc_rawtick;
1009 	pcbe_ops->pcbe_sample(ctx);
1010 }
1011 
1012 static void
1013 kcpc_restore(kcpc_ctx_t *ctx)
1014 {
1015 	if ((ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED)) ==
1016 	    KCPC_CTX_INVALID)
1017 		/*
1018 		 * The context is invalidated but has not been marked stopped.
1019 		 * We mark it as such here because we will not start the
1020 		 * counters during this context switch.
1021 		 */
1022 		atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID_STOPPED);
1023 
1024 
1025 	if (ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_FREEZE))
1026 		return;
1027 
1028 	/*
1029 	 * While programming the hardware, the counters should be stopped. We
1030 	 * don't do an explicit pcbe_allstop() here because they should have
1031 	 * been stopped already by the last consumer.
1032 	 */
1033 	ctx->kc_rawtick = KCPC_GET_TICK();
1034 	pcbe_ops->pcbe_program(ctx);
1035 }
1036 
1037 /*
1038  * If kcpc_counts_include_idle is set to 0 by the sys admin, we add the
1039  * following context operators to the idle thread on each CPU. They stop the
1040  * counters when the idle thread is switched on, and they start them again when
1041  * it is switched off.
1042  */
1043 
1044 /*ARGSUSED*/
1045 void
1046 kcpc_idle_save(struct cpu *cp)
1047 {
1048 	/*
1049 	 * The idle thread shouldn't be run anywhere else.
1050 	 */
1051 	ASSERT(CPU == cp);
1052 
1053 	/*
1054 	 * We must hold the CPU's context lock to ensure the context isn't freed
1055 	 * while we're looking at it.
1056 	 */
1057 	mutex_enter(&cp->cpu_cpc_ctxlock);
1058 
1059 	if ((cp->cpu_cpc_ctx == NULL) ||
1060 	    (cp->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID)) {
1061 		mutex_exit(&cp->cpu_cpc_ctxlock);
1062 		return;
1063 	}
1064 
1065 	pcbe_ops->pcbe_program(cp->cpu_cpc_ctx);
1066 	mutex_exit(&cp->cpu_cpc_ctxlock);
1067 }
1068 
1069 void
1070 kcpc_idle_restore(struct cpu *cp)
1071 {
1072 	/*
1073 	 * The idle thread shouldn't be run anywhere else.
1074 	 */
1075 	ASSERT(CPU == cp);
1076 
1077 	/*
1078 	 * We must hold the CPU's context lock to ensure the context isn't freed
1079 	 * while we're looking at it.
1080 	 */
1081 	mutex_enter(&cp->cpu_cpc_ctxlock);
1082 
1083 	if ((cp->cpu_cpc_ctx == NULL) ||
1084 	    (cp->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID)) {
1085 		mutex_exit(&cp->cpu_cpc_ctxlock);
1086 		return;
1087 	}
1088 
1089 	pcbe_ops->pcbe_allstop();
1090 	mutex_exit(&cp->cpu_cpc_ctxlock);
1091 }
1092 
1093 /*ARGSUSED*/
1094 static void
1095 kcpc_lwp_create(kthread_t *t, kthread_t *ct)
1096 {
1097 	kcpc_ctx_t	*ctx = t->t_cpc_ctx, *cctx;
1098 	int		i;
1099 
1100 	if (ctx == NULL || (ctx->kc_flags & KCPC_CTX_LWPINHERIT) == 0)
1101 		return;
1102 
1103 	rw_enter(&kcpc_cpuctx_lock, RW_READER);
1104 	if (ctx->kc_flags & KCPC_CTX_INVALID) {
1105 		rw_exit(&kcpc_cpuctx_lock);
1106 		return;
1107 	}
1108 	cctx = kcpc_ctx_alloc();
1109 	kcpc_ctx_clone(ctx, cctx);
1110 	rw_exit(&kcpc_cpuctx_lock);
1111 
1112 	cctx->kc_flags = ctx->kc_flags;
1113 	cctx->kc_thread = ct;
1114 	cctx->kc_cpuid = -1;
1115 	ct->t_cpc_set = cctx->kc_set;
1116 	ct->t_cpc_ctx = cctx;
1117 
1118 	if (cctx->kc_flags & KCPC_CTX_SIGOVF) {
1119 		kcpc_set_t *ks = cctx->kc_set;
1120 		/*
1121 		 * Our contract with the user requires us to immediately send an
1122 		 * overflow signal to all children if we have the LWPINHERIT
1123 		 * and SIGOVF flags set. In addition, all counters should be
1124 		 * set to UINT64_MAX, and their pic's overflow flag turned on
1125 		 * so that our trap() processing knows to send a signal.
1126 		 */
1127 		atomic_or_uint(&cctx->kc_flags, KCPC_CTX_FREEZE);
1128 		for (i = 0; i < ks->ks_nreqs; i++) {
1129 			kcpc_request_t *kr = &ks->ks_req[i];
1130 
1131 			if (kr->kr_flags & CPC_OVF_NOTIFY_EMT) {
1132 				*(kr->kr_data) = UINT64_MAX;
1133 				kr->kr_picp->kp_flags |= KCPC_PIC_OVERFLOWED;
1134 			}
1135 		}
1136 		ttolwp(ct)->lwp_pcb.pcb_flags |= CPC_OVERFLOW;
1137 		aston(ct);
1138 	}
1139 
1140 	installctx(ct, cctx, kcpc_save, kcpc_restore,
1141 	    NULL, kcpc_lwp_create, NULL, kcpc_free);
1142 }
1143 
1144 /*
1145  * Counter Stoppage Theory
1146  *
1147  * The counters may need to be stopped properly at the following occasions:
1148  *
1149  * 1) An LWP exits.
1150  * 2) A thread exits.
1151  * 3) An LWP performs an exec().
1152  * 4) A bound set is unbound.
1153  *
1154  * In addition to stopping the counters, the CPC context (a kcpc_ctx_t) may need
1155  * to be freed as well.
1156  *
1157  * Case 1: kcpc_passivate(), called via lwp_exit(), stops the counters. Later on
1158  * when the thread is freed, kcpc_free(), called by freectx(), frees the
1159  * context.
1160  *
1161  * Case 2: same as case 1 except kcpc_passivate is called from thread_exit().
1162  *
1163  * Case 3: kcpc_free(), called via freectx() via exec(), recognizes that it has
1164  * been called from exec. It stops the counters _and_ frees the context.
1165  *
1166  * Case 4: kcpc_unbind() stops the hardware _and_ frees the context.
1167  *
1168  * CPU-bound counters are always stopped via kcpc_unbind().
1169  */
1170 
1171 /*
1172  * We're being called to delete the context; we ensure that all associated data
1173  * structures are freed, and that the hardware is passivated if this is an exec.
1174  */
1175 
1176 /*ARGSUSED*/
1177 static void
1178 kcpc_free(kcpc_ctx_t *ctx, int isexec)
1179 {
1180 	int		i;
1181 	kcpc_set_t	*set = ctx->kc_set;
1182 
1183 	ASSERT(set != NULL);
1184 
1185 	atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID);
1186 
1187 	if (isexec) {
1188 		/*
1189 		 * This thread is execing, and after the exec it should not have
1190 		 * any performance counter context. Stop the counters properly
1191 		 * here so the system isn't surprised by an overflow interrupt
1192 		 * later.
1193 		 */
1194 		if (ctx->kc_cpuid != -1) {
1195 			cpu_t *cp;
1196 			/*
1197 			 * CPU-bound context; stop the appropriate CPU's ctrs.
1198 			 * Hold cpu_lock while examining the CPU to ensure it
1199 			 * doesn't go away.
1200 			 */
1201 			mutex_enter(&cpu_lock);
1202 			cp = cpu_get(ctx->kc_cpuid);
1203 			/*
1204 			 * The CPU could have been DR'd out, so only stop the
1205 			 * CPU and clear its context pointer if the CPU still
1206 			 * exists.
1207 			 */
1208 			if (cp != NULL) {
1209 				mutex_enter(&cp->cpu_cpc_ctxlock);
1210 				kcpc_stop_hw(ctx);
1211 				cp->cpu_cpc_ctx = NULL;
1212 				mutex_exit(&cp->cpu_cpc_ctxlock);
1213 			}
1214 			mutex_exit(&cpu_lock);
1215 			ASSERT(curthread->t_cpc_ctx == NULL);
1216 		} else {
1217 			/*
1218 			 * Thread-bound context; stop _this_ CPU's counters.
1219 			 */
1220 			kpreempt_disable();
1221 			pcbe_ops->pcbe_allstop();
1222 			atomic_or_uint(&ctx->kc_flags,
1223 			    KCPC_CTX_INVALID_STOPPED);
1224 			kpreempt_enable();
1225 			curthread->t_cpc_ctx = NULL;
1226 		}
1227 
1228 		/*
1229 		 * Since we are being called from an exec and we know that
1230 		 * exec is not permitted via the agent thread, we should clean
1231 		 * up this thread's CPC state completely, and not leave dangling
1232 		 * CPC pointers behind.
1233 		 */
1234 		ASSERT(ctx->kc_thread == curthread);
1235 		curthread->t_cpc_set = NULL;
1236 	}
1237 
1238 	/*
1239 	 * Walk through each request in this context's set and free the PCBE's
1240 	 * configuration if it exists.
1241 	 */
1242 	for (i = 0; i < set->ks_nreqs; i++) {
1243 		if (set->ks_req[i].kr_config != NULL)
1244 			pcbe_ops->pcbe_free(set->ks_req[i].kr_config);
1245 	}
1246 
1247 	kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
1248 	kcpc_ctx_free(ctx);
1249 	kcpc_free_set(set);
1250 }
1251 
1252 /*
1253  * Free the memory associated with a request set.
1254  */
1255 void
1256 kcpc_free_set(kcpc_set_t *set)
1257 {
1258 	int		i;
1259 	kcpc_request_t	*req;
1260 
1261 	ASSERT(set->ks_req != NULL);
1262 
1263 	for (i = 0; i < set->ks_nreqs; i++) {
1264 		req = &set->ks_req[i];
1265 
1266 		if (req->kr_nattrs != 0) {
1267 			kmem_free(req->kr_attr,
1268 			    req->kr_nattrs * sizeof (kcpc_attr_t));
1269 		}
1270 	}
1271 
1272 	kmem_free(set->ks_req, sizeof (kcpc_request_t) * set->ks_nreqs);
1273 	kmem_free(set, sizeof (kcpc_set_t));
1274 }
1275 
1276 /*
1277  * Grab every existing context and mark it as invalid.
1278  */
1279 void
1280 kcpc_invalidate_all(void)
1281 {
1282 	kcpc_ctx_t *ctx;
1283 	long hash;
1284 
1285 	for (hash = 0; hash < CPC_HASH_BUCKETS; hash++) {
1286 		mutex_enter(&kcpc_ctx_llock[hash]);
1287 		for (ctx = kcpc_ctx_list[hash]; ctx; ctx = ctx->kc_next)
1288 			atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID);
1289 		mutex_exit(&kcpc_ctx_llock[hash]);
1290 	}
1291 }
1292 
1293 /*
1294  * Called from lwp_exit() and thread_exit()
1295  */
1296 void
1297 kcpc_passivate(void)
1298 {
1299 	kcpc_ctx_t *ctx = curthread->t_cpc_ctx;
1300 	kcpc_set_t *set = curthread->t_cpc_set;
1301 
1302 	if (set == NULL)
1303 		return;
1304 
1305 	/*
1306 	 * We're cleaning up after this thread; ensure there are no dangling
1307 	 * CPC pointers left behind. The context and set will be freed by
1308 	 * freectx() in the case of an LWP-bound set, and by kcpc_unbind() in
1309 	 * the case of a CPU-bound set.
1310 	 */
1311 	curthread->t_cpc_ctx = NULL;
1312 
1313 	if (ctx == NULL) {
1314 		/*
1315 		 * This thread has a set but no context; it must be a CPU-bound
1316 		 * set. The hardware will be stopped via kcpc_unbind() when the
1317 		 * process exits and closes its file descriptors with
1318 		 * kcpc_close(). Our only job here is to clean up this thread's
1319 		 * state; the set will be freed with the unbind().
1320 		 */
1321 		(void) kcpc_unbind(set);
1322 		/*
1323 		 * Unbinding a set belonging to the current thread should clear
1324 		 * its set pointer.
1325 		 */
1326 		ASSERT(curthread->t_cpc_set == NULL);
1327 		return;
1328 	}
1329 
1330 	curthread->t_cpc_set = NULL;
1331 
1332 	/*
1333 	 * This thread/LWP is exiting but context switches will continue to
1334 	 * happen for a bit as the exit proceeds.  Kernel preemption must be
1335 	 * disabled here to prevent a race between checking or setting the
1336 	 * INVALID_STOPPED flag here and kcpc_restore() setting the flag during
1337 	 * a context switch.
1338 	 */
1339 
1340 	kpreempt_disable();
1341 	if ((ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0) {
1342 		pcbe_ops->pcbe_allstop();
1343 		atomic_or_uint(&ctx->kc_flags,
1344 		    KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED);
1345 	}
1346 	kpreempt_enable();
1347 }
1348 
1349 /*
1350  * Assign the requests in the given set to the PICs in the context.
1351  * Returns 0 if successful, -1 on failure.
1352  */
1353 /*ARGSUSED*/
1354 static int
1355 kcpc_assign_reqs(kcpc_set_t *set, kcpc_ctx_t *ctx)
1356 {
1357 	int i;
1358 	int *picnum_save;
1359 
1360 	ASSERT(set->ks_nreqs <= cpc_ncounters);
1361 
1362 	/*
1363 	 * Provide kcpc_tryassign() with scratch space to avoid doing an
1364 	 * alloc/free with every invocation.
1365 	 */
1366 	picnum_save = kmem_alloc(set->ks_nreqs * sizeof (int), KM_SLEEP);
1367 	/*
1368 	 * kcpc_tryassign() blindly walks through each request in the set,
1369 	 * seeing if a counter can count its event. If yes, it assigns that
1370 	 * counter. However, that counter may have been the only capable counter
1371 	 * for _another_ request's event. The solution is to try every possible
1372 	 * request first. Note that this does not cover all solutions, as
1373 	 * that would require all unique orderings of requests, an n^n operation
1374 	 * which would be unacceptable for architectures with many counters.
1375 	 */
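	/*
	 * As a hypothetical example: if counter 0 were the only counter able
	 * to count event A while event B could be counted anywhere, a pass
	 * beginning with the B request might claim counter 0 for B and leave
	 * the A request unassignable, whereas a pass beginning with the A
	 * request succeeds.
	 */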
1376 	for (i = 0; i < set->ks_nreqs; i++)
1377 		if (kcpc_tryassign(set, i, picnum_save) == 0)
1378 			break;
1379 
1380 	kmem_free(picnum_save, set->ks_nreqs * sizeof (int));
1381 	if (i == set->ks_nreqs)
1382 		return (-1);
1383 	return (0);
1384 }
1385 
1386 static int
1387 kcpc_tryassign(kcpc_set_t *set, int starting_req, int *scratch)
1388 {
1389 	int		i;
1390 	int		j;
1391 	uint64_t	bitmap = 0, resmap = 0;
1392 	uint64_t	ctrmap;
1393 
1394 	/*
1395 	 * We are attempting to assign the reqs to pics, but we may fail. If we
1396 	 * fail, we need to restore the state of the requests to what it was
1397 	 * when we found it, as some reqs may have been explicitly assigned to
1398 	 * a specific PIC beforehand. We do this by snapshotting the assignments
1399 	 * now and restoring from it later if we fail.
1400 	 *
1401 	 * Also we note here which counters have already been claimed by
1402 	 * requests with explicit counter assignments.
1403 	 */
1404 	for (i = 0; i < set->ks_nreqs; i++) {
1405 		scratch[i] = set->ks_req[i].kr_picnum;
1406 		if (set->ks_req[i].kr_picnum != -1)
1407 			resmap |= (1 << set->ks_req[i].kr_picnum);
1408 	}
1409 
1410 	/*
1411 	 * Walk through requests assigning them to the first PIC that is
1412 	 * capable.
1413 	 */
1414 	i = starting_req;
1415 	do {
1416 		if (set->ks_req[i].kr_picnum != -1) {
1417 			ASSERT((bitmap & (1 << set->ks_req[i].kr_picnum)) == 0);
1418 			bitmap |= (1 << set->ks_req[i].kr_picnum);
1419 			if (++i == set->ks_nreqs)
1420 				i = 0;
1421 			continue;
1422 		}
1423 
1424 		ctrmap = pcbe_ops->pcbe_event_coverage(set->ks_req[i].kr_event);
1425 		for (j = 0; j < cpc_ncounters; j++) {
1426 			if (ctrmap & (1 << j) && (bitmap & (1 << j)) == 0 &&
1427 			    (resmap & (1 << j)) == 0) {
1428 				/*
1429 				 * We can assign this counter because:
1430 				 *
1431 				 * 1. It can count the event (ctrmap)
1432 				 * 2. It hasn't been assigned yet (bitmap)
1433 				 * 3. It wasn't reserved by a request (resmap)
1434 				 */
1435 				bitmap |= (1 << j);
1436 				break;
1437 			}
1438 		}
1439 		if (j == cpc_ncounters) {
1440 			for (i = 0; i < set->ks_nreqs; i++)
1441 				set->ks_req[i].kr_picnum = scratch[i];
1442 			return (-1);
1443 		}
1444 		set->ks_req[i].kr_picnum = j;
1445 
1446 		if (++i == set->ks_nreqs)
1447 			i = 0;
1448 	} while (i != starting_req);
1449 
1450 	return (0);
1451 }
1452 
1453 kcpc_set_t *
1454 kcpc_dup_set(kcpc_set_t *set)
1455 {
1456 	kcpc_set_t	*new;
1457 	int		i;
1458 	int		j;
1459 
1460 	new = kmem_alloc(sizeof (*new), KM_SLEEP);
1461 	new->ks_flags = set->ks_flags;
1462 	new->ks_nreqs = set->ks_nreqs;
1463 	new->ks_req = kmem_alloc(set->ks_nreqs * sizeof (kcpc_request_t),
1464 	    KM_SLEEP);
1465 	new->ks_data = NULL;
1466 	new->ks_ctx = NULL;
1467 
1468 	for (i = 0; i < new->ks_nreqs; i++) {
1469 		new->ks_req[i].kr_config = NULL;
1470 		new->ks_req[i].kr_index = set->ks_req[i].kr_index;
1471 		new->ks_req[i].kr_picnum = set->ks_req[i].kr_picnum;
1472 		new->ks_req[i].kr_picp = NULL;
1473 		new->ks_req[i].kr_data = NULL;
1474 		(void) strncpy(new->ks_req[i].kr_event, set->ks_req[i].kr_event,
1475 		    CPC_MAX_EVENT_LEN);
1476 		new->ks_req[i].kr_preset = set->ks_req[i].kr_preset;
1477 		new->ks_req[i].kr_flags = set->ks_req[i].kr_flags;
1478 		new->ks_req[i].kr_nattrs = set->ks_req[i].kr_nattrs;
1479 		new->ks_req[i].kr_attr = kmem_alloc(new->ks_req[i].kr_nattrs *
1480 		    sizeof (kcpc_attr_t), KM_SLEEP);
1481 		for (j = 0; j < new->ks_req[i].kr_nattrs; j++) {
1482 			new->ks_req[i].kr_attr[j].ka_val =
1483 			    set->ks_req[i].kr_attr[j].ka_val;
1484 			(void) strncpy(new->ks_req[i].kr_attr[j].ka_name,
1485 			    set->ks_req[i].kr_attr[j].ka_name,
1486 			    CPC_MAX_ATTR_LEN);
1487 		}
1488 	}
1489 
1490 	return (new);
1491 }
1492 
1493 int
1494 kcpc_allow_nonpriv(void *token)
1495 {
1496 	return (((kcpc_ctx_t *)token)->kc_flags & KCPC_CTX_NONPRIV);
1497 }
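/*
 * A rough sketch of how a backend might use the predicate above from its
 * pcbe_configure() entry point, which receives the context as its opaque
 * token (see kcpc_configure_reqs()); "pic_flags" and EXAMPLE_USR_READ are
 * stand-ins for whatever hardware-specific state and enable bit the backend
 * uses:
 *
 *	if (kcpc_allow_nonpriv(token))
 *		pic_flags |= EXAMPLE_USR_READ;
 */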
1498 
1499 void
1500 kcpc_invalidate(kthread_t *t)
1501 {
1502 	kcpc_ctx_t *ctx = t->t_cpc_ctx;
1503 
1504 	if (ctx != NULL)
1505 		atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID);
1506 }
1507 
1508 /*
1509  * Given a PCBE ID, attempt to load a matching PCBE module. The prefix and
1510  * numeric IDs given are used to construct candidate module names, starting
1511  * with the most specific, "pcbe.<prefix>.<first>.<second>.<third>", and
1512  * ending with the least specific, "pcbe.<prefix>".
1513  *
1514  * Returns 0 if a PCBE was successfully loaded and -1 upon error.
1515  */
1516 int
1517 kcpc_pcbe_tryload(const char *prefix, uint_t first, uint_t second, uint_t third)
1518 {
1519 	uint_t s[3];
1520 
1521 	s[0] = first;
1522 	s[1] = second;
1523 	s[2] = third;
1524 
1525 	return (modload_qualified("pcbe",
1526 	    "pcbe", prefix, ".", s, 3) < 0 ? -1 : 0);
1527 }
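/*
 * For illustration, a hypothetical call such as
 * kcpc_pcbe_tryload("GenuineIntel", 6, 15, 2) would try these module names,
 * from most to least specific:
 *
 *	pcbe.GenuineIntel.6.15.2
 *	pcbe.GenuineIntel.6.15
 *	pcbe.GenuineIntel.6
 *	pcbe.GenuineIntel
 */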
1528