/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright 2019 Joyent, Inc.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/


#include <sys/types.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/signal.h>
#include <sys/user.h>
#include <sys/systm.h>
#include <sys/sysinfo.h>
#include <sys/var.h>
#include <sys/errno.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/inline.h>
#include <sys/disp.h>
#include <sys/class.h>
#include <sys/bitmap.h>
#include <sys/kmem.h>
#include <sys/cpuvar.h>
#include <sys/vtrace.h>
#include <sys/tnf.h>
#include <sys/cpupart.h>
#include <sys/lgrp.h>
#include <sys/pg.h>
#include <sys/cmt.h>
#include <sys/bitset.h>
#include <sys/schedctl.h>
#include <sys/atomic.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>
#include <sys/archsystm.h>
#include <sys/smt.h>

#include <vm/as.h>

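/*
 * Kinds of thread binding that the dispatcher must honor: a hard binding
 * to a specific CPU, a binding to a CPU partition, and the implicit
 * binding of an interrupt thread to the CPU it is servicing.
 */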
#define	BOUND_CPU	0x1
#define	BOUND_PARTITION	0x2
#define	BOUND_INTR	0x4

/* Dispatch queue allocation structure and functions */
struct disp_queue_info {
	disp_t	*dp;
	dispq_t *olddispq;
	dispq_t *newdispq;
	ulong_t	*olddqactmap;
	ulong_t	*newdqactmap;
	int	oldnglobpris;
};
static void	disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
    disp_t *dp);
static void	disp_dq_assign(struct disp_queue_info *dptr, int numpris);
static void	disp_dq_free(struct disp_queue_info *dptr);
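/*
 * Queue resizing happens in three phases so that no allocation is done
 * while CPUs are paused: disp_dq_alloc() preallocates the new arrays
 * (and may sleep), disp_dq_assign() swaps them in while the CPUs are
 * stopped, and disp_dq_free() releases the old arrays once the CPUs are
 * running again.
 */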

/* platform-specific routine to call when processor is idle */
static void	generic_idle_cpu();
void		(*idle_cpu)() = generic_idle_cpu;

/* routines invoked when a CPU enters/exits the idle loop */
static void	idle_enter();
static void	idle_exit();

/* platform-specific routine to call when thread is enqueued */
static void	generic_enq_thread(cpu_t *, int);
void		(*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;

pri_t	kpreemptpri;		/* priority where kernel preemption applies */
pri_t	upreemptpri = 0;	/* priority where normal preemption applies */
pri_t	intr_pri;		/* interrupt thread priority base level */

#define	KPQPRI	-1		/* pri where cpu affinity is dropped for kpq */
pri_t	kpqpri = KPQPRI;	/* can be set in /etc/system */
disp_t	cpu0_disp;		/* boot CPU's dispatch queue */
disp_lock_t	swapped_lock;	/* lock swapped threads and swap queue */
int	nswapped;		/* total number of swapped threads */
void	disp_swapped_enq(kthread_t *tp);
static void	disp_swapped_setrun(kthread_t *tp);
static void	cpu_resched(cpu_t *cp, pri_t tpri);

/*
 * If this is set, only interrupt threads will cause kernel preemptions.
 * This is done by changing the value of kpreemptpri.  kpreemptpri
 * will either be the max sysclass pri or the min interrupt pri.
 */
int	only_intr_kpreempt;

extern void set_idle_cpu(int cpun);
extern void unset_idle_cpu(int cpun);
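/*
 * setkpdq() places a thread on its partition's kernel preemption queue;
 * "borf" selects the back (SETKP_BACK) or front (SETKP_FRONT) of that queue.
 */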
static void setkpdq(kthread_t *tp, int borf);
#define	SETKP_BACK	0
#define	SETKP_FRONT	1
/*
 * Parameter that determines how recently a thread must have run
 * on the CPU to be considered loosely-bound to that CPU to reduce
 * cold cache effects.  The interval is in clock ticks (units of
 * 1/hz of a second).
 */
#define	RECHOOSE_INTERVAL 3
int	rechoose_interval = RECHOOSE_INTERVAL;
/*
 * Parameter that determines how long (in nanoseconds) a thread must
 * have been sitting on a run queue before it can be stolen by another
 * CPU, to reduce migrations.
 *
 * nosteal_nsec should be set by platform code, via
 * cmp_set_nosteal_interval(), to an appropriate value.  It is set to
 * NOSTEAL_UNINITIALIZED here to indicate that it is uninitialized.
 * Setting nosteal_nsec to 0 effectively disables the nosteal 'protection'.
 */
#define	NOSTEAL_UNINITIALIZED	(-1)
hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED;
extern void cmp_set_nosteal_interval(void);

id_t	defaultcid;	/* system "default" class; see dispadmin(1M) */

disp_lock_t	transition_lock;	/* lock on transitioning threads */
disp_lock_t	stop_lock;		/* lock on stopped threads */

static void	cpu_dispqalloc(int numpris);

/*
 * This gets returned by disp_getwork/disp_getbest if we couldn't steal
 * a thread because it had been sitting on its run queue for a very short
 * period of time.
 */
#define	T_DONTSTEAL	(kthread_t *)(-1) /* returned by disp_getwork/getbest */

static kthread_t	*disp_getwork(cpu_t *to);
static kthread_t	*disp_getbest(disp_t *from);
static kthread_t	*disp_ratify(kthread_t *tp, disp_t *kpq);

void	swtch_to(kthread_t *);

/*
 * dispatcher and scheduler initialization
 */

/*
 * disp_setup - Common code to calculate and allocate dispatcher
 *		variables and structures based on the maximum priority.
 */
static void
disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
{
	pri_t	newnglobpris;

	ASSERT(MUTEX_HELD(&cpu_lock));

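	/*
	 * Reserve LOCK_LEVEL extra global priorities above the highest
	 * class priority; interrupt threads run at intr_pri + pil, so
	 * these levels are set aside for them.
	 */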
	newnglobpris = maxglobpri + 1 + LOCK_LEVEL;

	if (newnglobpris > oldnglobpris) {
		/*
		 * Allocate new kp queues for each CPU partition.
		 */
		cpupart_kpqalloc(newnglobpris);

		/*
		 * Allocate new dispatch queues for each CPU.
		 */
		cpu_dispqalloc(newnglobpris);

		/*
		 * compute new interrupt thread base priority
		 */
		intr_pri = maxglobpri;
		if (only_intr_kpreempt) {
			kpreemptpri = intr_pri + 1;
			if (kpqpri == KPQPRI)
				kpqpri = kpreemptpri;
		}
		v.v_nglobpris = newnglobpris;
	}
}

/*
 * dispinit - Called to initialize all loaded classes and the
 *	      dispatcher framework.
 */
void
dispinit(void)
{
	id_t	cid;
	pri_t	maxglobpri;
	pri_t	cl_maxglobpri;

	maxglobpri = -1;

	/*
	 * Initialize transition lock, which will always be set.
	 */
	DISP_LOCK_INIT(&transition_lock);
	disp_lock_enter_high(&transition_lock);
	DISP_LOCK_INIT(&stop_lock);

	mutex_enter(&cpu_lock);
	CPU->cpu_disp->disp_maxrunpri = -1;
	CPU->cpu_disp->disp_max_unbound_pri = -1;

	/*
	 * Initialize the default CPU partition.
	 */
	cpupart_initialize_default();
	/*
	 * Call the class specific initialization functions for
	 * all pre-installed schedulers.
	 *
	 * We pass the size of a class specific parameter
	 * buffer to each of the initialization functions
	 * to try to catch problems with backward compatibility
	 * of class modules.
	 *
	 * For example, a new class module running on an old system
	 * which didn't provide sufficiently large parameter buffers
	 * would be bad news. Class initialization modules can check for
	 * this and take action if they detect a problem.
	 */

	for (cid = 0; cid < nclass; cid++) {
		sclass_t	*sc;

		sc = &sclass[cid];
		if (SCHED_INSTALLED(sc)) {
			cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
			    &sc->cl_funcs);
			if (cl_maxglobpri > maxglobpri)
				maxglobpri = cl_maxglobpri;
		}
	}

	/*
	 * Historically, kpreemptpri was set to v_maxsyspri + 1 -- which is
	 * to say, maxclsyspri + 1.  However, over time, the system has used
	 * more and more asynchronous kernel threads, with an increasing number
	 * of these doing work on direct behalf of higher-level software (e.g.,
	 * network processing).  This has led to potential priority inversions:
	 * threads doing low-priority lengthy kernel work can effectively
	 * delay kernel-level processing of higher-priority data.  To minimize
	 * such inversions, we set kpreemptpri to be v_maxsyspri; anything in
	 * the kernel that runs at maxclsyspri will therefore induce kernel
	 * preemption, and this priority should be used if/when an asynchronous
	 * thread (or, as is often the case, task queue) is performing a task
	 * on behalf of higher-level software (or any task that is otherwise
	 * latency-sensitive).
	 */
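	/*
	 * As a sketch of that usage (a hypothetical consumer, not part of
	 * this file), a latency-sensitive subsystem would create its task
	 * queue at maxclsyspri so that dispatching its work induces kernel
	 * preemption:
	 *
	 *	tq = taskq_create("latency_tq", 1, maxclsyspri, 1, INT_MAX,
	 *	    TASKQ_PREPOPULATE);
	 */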
	kpreemptpri = (pri_t)v.v_maxsyspri;
	if (kpqpri == KPQPRI)
		kpqpri = kpreemptpri;

	ASSERT(maxglobpri >= 0);
	disp_setup(maxglobpri, 0);

	mutex_exit(&cpu_lock);

	/*
	 * Platform specific sticky scheduler setup.
	 */
	if (nosteal_nsec == NOSTEAL_UNINITIALIZED)
		cmp_set_nosteal_interval();

	/*
	 * Get the default class ID; this may be later modified via
	 * dispadmin(1M).  This will load the class (normally TS) and that will
	 * call disp_add(), which is why we had to drop cpu_lock first.
	 */
	if (getcid(defaultclass, &defaultcid) != 0) {
		cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
		    defaultclass);
	}
}

/*
 * disp_add - Called with class pointer to initialize the dispatcher
 *	      for a newly loaded class.
 */
void
disp_add(sclass_t *clp)
{
	pri_t	maxglobpri;
	pri_t	cl_maxglobpri;

	mutex_enter(&cpu_lock);
	/*
	 * Initialize the scheduler class.
	 */
	maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
	cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
	if (cl_maxglobpri > maxglobpri)
		maxglobpri = cl_maxglobpri;

	/*
	 * Save old queue information.  Since we're initializing a
	 * new scheduling class which has just been loaded, the
	 * size of the dispq may have changed.  We need to handle
	 * that here.
	 */
	disp_setup(maxglobpri, v.v_nglobpris);

	mutex_exit(&cpu_lock);
}


/*
 * For each CPU, allocate new dispatch queues
 * with the stated number of priorities.
 */
static void
cpu_dispqalloc(int numpris)
{
	cpu_t	*cpup;
	struct disp_queue_info	*disp_mem;
	int i, num;

	ASSERT(MUTEX_HELD(&cpu_lock));

	disp_mem = kmem_zalloc(NCPU *
	    sizeof (struct disp_queue_info), KM_SLEEP);

	/*
	 * This routine must allocate all of the memory before stopping
	 * the cpus because it must not sleep in kmem_alloc while the
	 * CPUs are stopped.  Locks they hold will not be freed until they
	 * are restarted.
	 */
	i = 0;
	cpup = cpu_list;
	do {
		disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
		i++;
		cpup = cpup->cpu_next;
	} while (cpup != cpu_list);
	num = i;

	pause_cpus(NULL, NULL);
	for (i = 0; i < num; i++)
		disp_dq_assign(&disp_mem[i], numpris);
	start_cpus();

	/*
	 * All of the memory must be freed after starting the cpus because
	 * we cannot risk sleeping in kmem_free while the cpus are stopped.
	 */
	for (i = 0; i < num; i++)
		disp_dq_free(&disp_mem[i]);

	kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
}

static void
disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t *dp)
{
	dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
	dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
	    sizeof (long), KM_SLEEP);
	dptr->dp = dp;
}

static void
disp_dq_assign(struct disp_queue_info *dptr, int numpris)
{
	disp_t	*dp;

	dp = dptr->dp;
	dptr->olddispq = dp->disp_q;
	dptr->olddqactmap = dp->disp_qactmap;
	dptr->oldnglobpris = dp->disp_npri;

	ASSERT(dptr->oldnglobpris < numpris);

	if (dptr->olddispq != NULL) {
		/*
		 * Use kcopy because bcopy is platform-specific
		 * and could block while we might have paused the cpus.
		 */
		(void) kcopy(dptr->olddispq, dptr->newdispq,
		    dptr->oldnglobpris * sizeof (dispq_t));
		(void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
		    sizeof (long));
	}
	dp->disp_q = dptr->newdispq;
	dp->disp_qactmap = dptr->newdqactmap;
	dp->disp_q_limit = &dptr->newdispq[numpris];
	dp->disp_npri = numpris;
}

static void
disp_dq_free(struct disp_queue_info *dptr)
{
	if (dptr->olddispq != NULL)
		kmem_free(dptr->olddispq,
		    dptr->oldnglobpris * sizeof (dispq_t));
	if (dptr->olddqactmap != NULL)
		kmem_free(dptr->olddqactmap,
		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
}

/*
 * For a newly created CPU, initialize the dispatch queue.
 * This is called before the CPU is known through cpu[] or on any lists.
 */
void
disp_cpu_init(cpu_t *cp)
{
	disp_t	*dp;
	dispq_t	*newdispq;
	ulong_t	*newdqactmap;

	ASSERT(MUTEX_HELD(&cpu_lock));	/* protect dispatcher queue sizes */

	if (cp == cpu0_disp.disp_cpu)
		dp = &cpu0_disp;
	else
		dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
	bzero(dp, sizeof (disp_t));
	cp->cpu_disp = dp;
	dp->disp_cpu = cp;
	dp->disp_maxrunpri = -1;
	dp->disp_max_unbound_pri = -1;
	DISP_LOCK_INIT(&cp->cpu_thread_lock);
	/*
	 * Allocate memory for the dispatcher queue headers
	 * and the active queue bitmap.
	 */
	newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
	newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
	    sizeof (long), KM_SLEEP);
	dp->disp_q = newdispq;
	dp->disp_qactmap = newdqactmap;
	dp->disp_q_limit = &newdispq[v.v_nglobpris];
	dp->disp_npri = v.v_nglobpris;
}

void
disp_cpu_fini(cpu_t *cp)
{
	ASSERT(MUTEX_HELD(&cpu_lock));

	disp_kp_free(cp->cpu_disp);
	if (cp->cpu_disp != &cpu0_disp)
		kmem_free(cp->cpu_disp, sizeof (disp_t));
}

/*
 * Allocate new, larger kpreempt dispatch queue to replace the old one.
 */
void
disp_kp_alloc(disp_t *dq, pri_t npri)
{
	struct disp_queue_info	mem_info;

	if (npri > dq->disp_npri) {
		/*
		 * Allocate memory for the new array.
		 */
		disp_dq_alloc(&mem_info, npri, dq);

		/*
		 * We need to copy the old structures to the new
		 * and free the old.
		 */
		disp_dq_assign(&mem_info, npri);
		disp_dq_free(&mem_info);
	}
}

/*
 * Free dispatch queue.
 * Used for the kpreempt queues for a removed CPU partition and
 * for the per-CPU queues of deleted CPUs.
 */
void
disp_kp_free(disp_t *dq)
{
	struct disp_queue_info	mem_info;

	mem_info.olddispq = dq->disp_q;
	mem_info.olddqactmap = dq->disp_qactmap;
	mem_info.oldnglobpris = dq->disp_npri;
	disp_dq_free(&mem_info);
}

/*
 * End dispatcher and scheduler initialization.
 */

/*
 * See if there's anything to do other than remain idle.
 * Return non-zero if there is.
 *
 * This function must be called with high spl, or with
 * kernel preemption disabled to prevent the partition's
 * active cpu list from changing while being traversed.
 *
 * This is essentially a simpler version of disp_getwork()
 * to be called by CPUs preparing to "halt".
 */
int
disp_anywork(void)
{
	cpu_t		*cp = CPU;
	cpu_t		*ocp;
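	/*
	 * disp_nrunnable is read through a volatile pointer so that the
	 * check is re-performed on each pass over the partition's CPUs
	 * below, rather than being hoisted out of the loop.
	 */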
	volatile int	*local_nrunnable = &cp->cpu_disp->disp_nrunnable;

	if (!(cp->cpu_flags & CPU_OFFLINE)) {
		if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
			return (1);

		for (ocp = cp->cpu_next_part; ocp != cp;
		    ocp = ocp->cpu_next_part) {
			ASSERT(CPU_ACTIVE(ocp));

			/*
			 * Something has appeared on the local run queue.
			 */
			if (*local_nrunnable > 0)
				return (1);
			/*
			 * If we encounter another idle CPU that will
			 * soon be trolling around through disp_anywork(),
			 * terminate our walk here and let this other CPU
			 * patrol the next part of the list.
			 */
			if (ocp->cpu_dispatch_pri == -1 &&
			    (ocp->cpu_disp_flags & CPU_DISP_HALTED) == 0)
				return (0);
			/*
			 * Work can be taken from another CPU if:
			 *	- There is unbound work on the run queue
			 *	- That work isn't a thread undergoing a
			 *	  context switch on an otherwise empty queue
			 *	- The CPU isn't running the idle loop.
			 */
			if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
			    !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
			    ocp->cpu_disp->disp_nrunnable == 1) &&
			    ocp->cpu_dispatch_pri != -1)
				return (1);
		}
	}
	return (0);
}

/*
 * Called when CPU enters the idle loop
 */
static void
idle_enter()
{
	cpu_t		*cp = CPU;

	new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
	CPU_STATS_ADDQ(cp, sys, idlethread, 1);
	set_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
}

/*
 * Called when CPU exits the idle loop
 */
static void
idle_exit()
{
	cpu_t		*cp = CPU;

	new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
	unset_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
}

/*
 * Idle loop.
 */
void
idle()
{
	struct cpu	*cp = CPU;		/* pointer to this CPU */
	kthread_t	*t;			/* taken thread */

	idle_enter();

	/*
	 * Uniprocessor version of idle loop.
	 * Do this until notified that we're on an actual multiprocessor.
	 */
	while (ncpus == 1) {
		if (cp->cpu_disp->disp_nrunnable == 0) {
			(*idle_cpu)();
			continue;
		}
		idle_exit();
		swtch();

		idle_enter(); /* returned from swtch */
	}

	/*
	 * Multiprocessor idle loop.
	 */
	for (;;) {
		/*
		 * If CPU is completely quiesced by p_online(2), just wait
		 * here with minimal bus traffic until put online.
		 */
		while (cp->cpu_flags & CPU_QUIESCED)
			(*idle_cpu)();

		if (cp->cpu_disp->disp_nrunnable != 0) {
			idle_exit();
			swtch();
		} else {
			if (cp->cpu_flags & CPU_OFFLINE)
				continue;
			if ((t = disp_getwork(cp)) == NULL) {
				if (cp->cpu_chosen_level != -1) {
					disp_t *dp = cp->cpu_disp;
					disp_t *kpq;

					disp_lock_enter(&dp->disp_lock);
					/*
					 * Set kpq under lock to prevent
					 * migration between partitions.
					 */
					kpq = &cp->cpu_part->cp_kp_queue;
					if (kpq->disp_maxrunpri == -1)
						cp->cpu_chosen_level = -1;
					disp_lock_exit(&dp->disp_lock);
				}
				(*idle_cpu)();
				continue;
			}
			/*
			 * If there was a thread but we couldn't steal
			 * it, then keep trying.
			 */
			if (t == T_DONTSTEAL)
				continue;
			idle_exit();
			swtch_to(t);
		}
		idle_enter(); /* returned from swtch/swtch_to */
	}
}


/*
 * Preempt the currently running thread in favor of the highest
 * priority thread.  The class of the current thread controls
 * where it goes on the dispatcher queues. If panicking, turn
 * preemption off.
 */
void
preempt()
{
	kthread_t	*t = curthread;
	klwp_t		*lwp = ttolwp(curthread);

	if (panicstr)
		return;

	TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");

	thread_lock(t);

	if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
		/*
		 * This thread has already been chosen to be run on
		 * another CPU. Clear kprunrun on this CPU since we're
		 * already headed for swtch().
		 */
		CPU->cpu_kprunrun = 0;
		thread_unlock_nopreempt(t);
		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
	} else {
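		/*
		 * Charge the lwp with an involuntary context switch and
		 * let the thread's scheduling class place it back on a
		 * run queue before we switch away.
		 */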
		if (lwp != NULL)
			lwp->lwp_ru.nivcsw++;
		CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
		THREAD_TRANSITION(t);
		CL_PREEMPT(t);
		DTRACE_SCHED(preempt);
		thread_unlock_nopreempt(t);

		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");

		swtch();		/* clears CPU->cpu_runrun via disp() */
	}
}

extern kthread_t *thread_unpin();

/*
 * disp() - find the highest priority thread for this processor to run, and
 * set it in TS_ONPROC state so that resume() can be called to run it.
 */
static kthread_t *
disp()
{
	cpu_t		*cpup;
	disp_t		*dp;
	kthread_t	*tp;
	dispq_t		*dq;
	int		maxrunword;
	pri_t		pri;
	disp_t		*kpq;

	TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");

	cpup = CPU;
	/*
	 * Find the highest priority loaded, runnable thread.
	 */
	dp = cpup->cpu_disp;

reschedule:
	/*
	 * If there is more important work on the global queue with a better
	 * priority than the maximum on this CPU, take it now.
	 */
	kpq = &cpup->cpu_part->cp_kp_queue;
	while ((pri = kpq->disp_maxrunpri) >= 0 &&
	    pri >= dp->disp_maxrunpri &&
	    (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
	    (tp = disp_getbest(kpq)) != NULL) {
		if (disp_ratify(tp, kpq) != NULL) {
			TRACE_1(TR_FAC_DISP, TR_DISP_END,
			    "disp_end:tid %p", tp);
			return (tp);
		}
	}

	disp_lock_enter(&dp->disp_lock);
	pri = dp->disp_maxrunpri;

	/*
	 * If there is nothing to run, look at what's runnable on other queues.
	 * Choose the idle thread if the CPU is quiesced.
	 * Note that CPUs that have the CPU_OFFLINE flag set can still run
	 * interrupt threads, which will be the only threads on the CPU's own
	 * queue, but cannot run threads from other queues.
	 */
	if (pri == -1) {
		if (!(cpup->cpu_flags & CPU_OFFLINE)) {
			disp_lock_exit(&dp->disp_lock);
			if ((tp = disp_getwork(cpup)) == NULL ||
			    tp == T_DONTSTEAL) {
				tp = cpup->cpu_idle_thread;
				(void) splhigh();
				THREAD_ONPROC(tp, cpup);
				cpup->cpu_dispthread = tp;
				cpup->cpu_dispatch_pri = -1;
				cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
				cpup->cpu_chosen_level = -1;
			}
		} else {
			disp_lock_exit_high(&dp->disp_lock);
			tp = cpup->cpu_idle_thread;
			THREAD_ONPROC(tp, cpup);
			cpup->cpu_dispthread = tp;
			cpup->cpu_dispatch_pri = -1;
			cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
			cpup->cpu_chosen_level = -1;
		}
		TRACE_1(TR_FAC_DISP, TR_DISP_END,
		    "disp_end:tid %p", tp);
		return (tp);
	}

	dq = &dp->disp_q[pri];
	tp = dq->dq_first;

	ASSERT(tp != NULL);
	ASSERT(tp->t_schedflag & TS_LOAD);	/* thread must be swapped in */

	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);

	/*
	 * Found it so remove it from queue.
	 */
	dp->disp_nrunnable--;
	dq->dq_sruncnt--;
	if ((dq->dq_first = tp->t_link) == NULL) {
		ulong_t	*dqactmap = dp->disp_qactmap;

		ASSERT(dq->dq_sruncnt == 0);
		dq->dq_last = NULL;

		/*
		 * The queue is empty, so the corresponding bit needs to be
		 * turned off in dqactmap.  If nrunnable != 0, we just took
		 * the last runnable thread off the highest queue, so
		 * recompute disp_maxrunpri.
		 */
		maxrunword = pri >> BT_ULSHIFT;
		dqactmap[maxrunword] &= ~BT_BIW(pri);

		if (dp->disp_nrunnable == 0) {
			dp->disp_max_unbound_pri = -1;
			dp->disp_maxrunpri = -1;
		} else {
			int ipri;

			ipri = bt_gethighbit(dqactmap, maxrunword);
			dp->disp_maxrunpri = ipri;
			if (ipri < dp->disp_max_unbound_pri)
				dp->disp_max_unbound_pri = ipri;
		}
	} else {
		tp->t_link = NULL;
	}

	/*
	 * Set TS_DONT_SWAP flag to prevent another processor from swapping
	 * out this thread before we have a chance to run it.
	 * While running, it is protected against swapping by t_lock.
	 */
	tp->t_schedflag |= TS_DONT_SWAP;
	cpup->cpu_dispthread = tp;		/* protected by spl only */
	cpup->cpu_dispatch_pri = pri;
	ASSERT(pri == DISP_PRIO(tp));
	thread_onproc(tp, cpup);		/* set t_state to TS_ONPROC */
	disp_lock_exit_high(&dp->disp_lock);	/* drop run queue lock */

	ASSERT(tp != NULL);
	TRACE_1(TR_FAC_DISP, TR_DISP_END,
	    "disp_end:tid %p", tp);

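	/*
	 * Ratify the selection: if a better choice appeared (e.g., on the
	 * kernel preemption queue) while we were dequeueing, give this
	 * thread back and pick again.
	 */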
	if (disp_ratify(tp, kpq) == NULL)
		goto reschedule;

	return (tp);
}

/*
 * swtch()
 *	Find best runnable thread and run it.
 *	Called with the current thread already switched to a new state,
 *	on a sleep queue, run queue, stopped, and not zombied.
 *	May be called at any spl level less than or equal to LOCK_LEVEL.
 *	Always drops spl to the base level (spl0()).
 */
void
swtch()
{
	kthread_t	*t = curthread;
	kthread_t	*next;
	cpu_t		*cp;

	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");

	if (t->t_flag & T_INTR_THREAD)
		cpu_intr_swtch_enter(t);

	if (t->t_intr != NULL) {
		/*
		 * We are an interrupt thread.  Set up and return
		 * the interrupted thread to be resumed.
		 */
		(void) splhigh();	/* block other scheduler action */
		cp = CPU;		/* now protected against migration */
		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */
		CPU_STATS_ADDQ(cp, sys, pswitch, 1);
		CPU_STATS_ADDQ(cp, sys, intrblk, 1);
		next = thread_unpin();
		TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
		resume_from_intr(next);
	} else {
#ifdef	DEBUG
		if (t->t_state == TS_ONPROC &&
		    t->t_disp_queue->disp_cpu == CPU &&
		    t->t_preempt == 0) {
			thread_lock(t);
			ASSERT(t->t_state != TS_ONPROC ||
			    t->t_disp_queue->disp_cpu != CPU ||
			    t->t_preempt != 0);	/* cannot migrate */
			thread_unlock_nopreempt(t);
		}
#endif	/* DEBUG */
		cp = CPU;
		next = disp();		/* returns with spl high */
		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */

		/* OK to steal anything left on run queue */
		cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;

		if (next != t) {
			hrtime_t now;

			now = gethrtime_unscaled();
			pg_ev_thread_swtch(cp, now, t, next);

			/*
			 * If t was previously in the TS_ONPROC state,
			 * setfrontdq and setbackdq won't have set its t_waitrq.
			 * Since we now finally know that we're switching away
			 * from this thread, set its t_waitrq if it is on a run
			 * queue.
			 */
			if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
				t->t_waitrq = now;
			}

			/*
			 * restore mstate of thread that we are switching to
			 */
			restore_mstate(next);

			CPU_STATS_ADDQ(cp, sys, pswitch, 1);
			cp->cpu_last_swtch = t->t_disp_time = ddi_get_lbolt();
			TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");

			if (dtrace_vtime_active)
				dtrace_vtime_switch(next);

			resume(next);
			/*
			 * The TR_RESUME_END and TR_SWTCH_END trace points
			 * appear at the end of resume(), because we may not
			 * return here
			 */
		} else {
			if (t->t_flag & T_INTR_THREAD)
				cpu_intr_swtch_exit(t);
			/*
			 * Threads that enqueue themselves on a run queue defer
			 * setting t_waitrq. It is then either set in swtch()
			 * when the CPU is actually yielded, or not at all if
			 * the thread remains on the CPU.
			 * There is, however, a window between where the thread
			 * placed itself on a run queue, and where it selects
			 * itself in disp(), where a third party (e.g. clock()
			 * doing tick processing) may have re-enqueued this
			 * thread, setting t_waitrq in the process. We detect
			 * this race by noticing that despite switching to
			 * ourself, our t_waitrq has been set, and should be
			 * cleared.
			 */
9651dbbbf76SSudheer A 			if (t->t_waitrq != 0)
9661dbbbf76SSudheer A 				t->t_waitrq = 0;
9677c478bd9Sstevel@tonic-gate 
9680e751525SEric Saxe 			pg_ev_thread_remain(cp, t);
9690e751525SEric Saxe 
9707c478bd9Sstevel@tonic-gate 			DTRACE_SCHED(remain__cpu);
9717c478bd9Sstevel@tonic-gate 			TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
9727c478bd9Sstevel@tonic-gate 			(void) spl0();
9737c478bd9Sstevel@tonic-gate 		}
9747c478bd9Sstevel@tonic-gate 	}
9757c478bd9Sstevel@tonic-gate }
9767c478bd9Sstevel@tonic-gate 
9777c478bd9Sstevel@tonic-gate /*
9787c478bd9Sstevel@tonic-gate  * swtch_from_zombie()
9797c478bd9Sstevel@tonic-gate  *	Special case of swtch(), which allows checks for TS_ZOMB to be
9807c478bd9Sstevel@tonic-gate  *	eliminated from normal resume.
9817c478bd9Sstevel@tonic-gate  *	Find best runnable thread and run it.
9827c478bd9Sstevel@tonic-gate  *	Called with the current thread zombied.
9837c478bd9Sstevel@tonic-gate  *	Zombies cannot migrate, so CPU references are safe.
9847c478bd9Sstevel@tonic-gate  */
9857c478bd9Sstevel@tonic-gate void
9867c478bd9Sstevel@tonic-gate swtch_from_zombie()
9877c478bd9Sstevel@tonic-gate {
9887c478bd9Sstevel@tonic-gate 	kthread_t	*next;
9897c478bd9Sstevel@tonic-gate 	cpu_t		*cpu = CPU;
9907c478bd9Sstevel@tonic-gate 
9917c478bd9Sstevel@tonic-gate 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
9927c478bd9Sstevel@tonic-gate 
9937c478bd9Sstevel@tonic-gate 	ASSERT(curthread->t_state == TS_ZOMB);
9947c478bd9Sstevel@tonic-gate 
9957c478bd9Sstevel@tonic-gate 	next = disp();			/* returns with spl high */
9967c478bd9Sstevel@tonic-gate 	ASSERT(CPU_ON_INTR(CPU) == 0);	/* not called with PIL > 10 */
9977c478bd9Sstevel@tonic-gate 	CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
9987c478bd9Sstevel@tonic-gate 	ASSERT(next != curthread);
9997c478bd9Sstevel@tonic-gate 	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
10007c478bd9Sstevel@tonic-gate 
10010e751525SEric Saxe 	pg_ev_thread_swtch(cpu, gethrtime_unscaled(), curthread, next);
10027c478bd9Sstevel@tonic-gate 
1003f2bd4627Sjohansen 	restore_mstate(next);
1004f2bd4627Sjohansen 
10057c478bd9Sstevel@tonic-gate 	if (dtrace_vtime_active)
10067c478bd9Sstevel@tonic-gate 		dtrace_vtime_switch(next);
10077c478bd9Sstevel@tonic-gate 
10087c478bd9Sstevel@tonic-gate 	resume_from_zombie(next);
10097c478bd9Sstevel@tonic-gate 	/*
10107c478bd9Sstevel@tonic-gate 	 * The TR_RESUME_END and TR_SWTCH_END trace points
10117c478bd9Sstevel@tonic-gate 	 * appear at the end of resume(), because we certainly will not
10127c478bd9Sstevel@tonic-gate 	 * return here
10137c478bd9Sstevel@tonic-gate 	 */
10147c478bd9Sstevel@tonic-gate }
10157c478bd9Sstevel@tonic-gate 
10167c478bd9Sstevel@tonic-gate #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
10177c478bd9Sstevel@tonic-gate 
1018057452c6Sjj209869 /*
1019057452c6Sjj209869  * search_disp_queues()
1020057452c6Sjj209869  *	Search the given dispatch queues for thread tp.
1021057452c6Sjj209869  *	Return 1 if tp is found, otherwise return 0.
1022057452c6Sjj209869  */
1023057452c6Sjj209869 static int
1024057452c6Sjj209869 search_disp_queues(disp_t *dp, kthread_t *tp)
1025057452c6Sjj209869 {
10267c478bd9Sstevel@tonic-gate 	dispq_t		*dq;
10277c478bd9Sstevel@tonic-gate 	dispq_t		*eq;
10287c478bd9Sstevel@tonic-gate 
10297c478bd9Sstevel@tonic-gate 	disp_lock_enter_high(&dp->disp_lock);
1030057452c6Sjj209869 
10317c478bd9Sstevel@tonic-gate 	for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
10327c478bd9Sstevel@tonic-gate 		kthread_t	*rp;
10337c478bd9Sstevel@tonic-gate 
1034057452c6Sjj209869 		ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1035057452c6Sjj209869 
10367c478bd9Sstevel@tonic-gate 		for (rp = dq->dq_first; rp; rp = rp->t_link)
10377c478bd9Sstevel@tonic-gate 			if (tp == rp) {
10387c478bd9Sstevel@tonic-gate 				disp_lock_exit_high(&dp->disp_lock);
10397c478bd9Sstevel@tonic-gate 				return (1);
10407c478bd9Sstevel@tonic-gate 			}
10417c478bd9Sstevel@tonic-gate 	}
10427c478bd9Sstevel@tonic-gate 	disp_lock_exit_high(&dp->disp_lock);
1043057452c6Sjj209869 
10447c478bd9Sstevel@tonic-gate 	return (0);
1045057452c6Sjj209869 }
1046057452c6Sjj209869 
1047057452c6Sjj209869 /*
1048057452c6Sjj209869  * thread_on_queue()
1049057452c6Sjj209869  *	Search all per-CPU dispatch queues and all partition-wide kpreempt
1050057452c6Sjj209869  *	queues for thread tp. Return 1 if tp is found, otherwise return 0.
1051057452c6Sjj209869  */
1052057452c6Sjj209869 static int
1053057452c6Sjj209869 thread_on_queue(kthread_t *tp)
1054057452c6Sjj209869 {
1055057452c6Sjj209869 	cpu_t		*cp;
1056057452c6Sjj209869 	struct cpupart	*part;
1057057452c6Sjj209869 
1058057452c6Sjj209869 	ASSERT(getpil() >= DISP_LEVEL);
1059057452c6Sjj209869 
1060057452c6Sjj209869 	/*
1061057452c6Sjj209869 	 * Search the per-CPU dispatch queues for tp.
1062057452c6Sjj209869 	 */
1063057452c6Sjj209869 	cp = CPU;
1064057452c6Sjj209869 	do {
1065057452c6Sjj209869 		if (search_disp_queues(cp->cpu_disp, tp))
1066057452c6Sjj209869 			return (1);
1067057452c6Sjj209869 	} while ((cp = cp->cpu_next_onln) != CPU);
1068057452c6Sjj209869 
1069057452c6Sjj209869 	/*
1070057452c6Sjj209869 	 * Search the partition-wide kpreempt queues for tp.
1071057452c6Sjj209869 	 */
1072057452c6Sjj209869 	part = CPU->cpu_part;
1073057452c6Sjj209869 	do {
1074057452c6Sjj209869 		if (search_disp_queues(&part->cp_kp_queue, tp))
1075057452c6Sjj209869 			return (1);
1076057452c6Sjj209869 	} while ((part = part->cp_next) != CPU->cpu_part);
1077057452c6Sjj209869 
1078057452c6Sjj209869 	return (0);
1079057452c6Sjj209869 }
1080057452c6Sjj209869 
10817c478bd9Sstevel@tonic-gate #else
10827c478bd9Sstevel@tonic-gate 
10837c478bd9Sstevel@tonic-gate #define	thread_on_queue(tp)	0	/* ASSERT must be !thread_on_queue */
10847c478bd9Sstevel@tonic-gate 
10857c478bd9Sstevel@tonic-gate #endif  /* DEBUG */
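
/*
 * (Note: in non-DEBUG builds the stub above evaluates to 0, so the
 * dispatcher's ASSERT(!thread_on_queue(tp)) checks are trivially true and
 * compile away; the helper is only meaningful when used in that negated
 * form.)
 */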
10867c478bd9Sstevel@tonic-gate 
10877c478bd9Sstevel@tonic-gate /*
10887c478bd9Sstevel@tonic-gate  * Like swtch(), but switch to a specified thread taken from another CPU.
10897c478bd9Sstevel@tonic-gate  *	Called with spl high.
10907c478bd9Sstevel@tonic-gate  */
10917c478bd9Sstevel@tonic-gate void
10927c478bd9Sstevel@tonic-gate swtch_to(kthread_t *next)
10937c478bd9Sstevel@tonic-gate {
10947c478bd9Sstevel@tonic-gate 	cpu_t			*cp = CPU;
10950e751525SEric Saxe 	hrtime_t		now;
10967c478bd9Sstevel@tonic-gate 
10977c478bd9Sstevel@tonic-gate 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
10987c478bd9Sstevel@tonic-gate 
10997c478bd9Sstevel@tonic-gate 	/*
11007c478bd9Sstevel@tonic-gate 	 * Update context switch statistics.
11017c478bd9Sstevel@tonic-gate 	 */
11027c478bd9Sstevel@tonic-gate 	CPU_STATS_ADDQ(cp, sys, pswitch, 1);
11037c478bd9Sstevel@tonic-gate 
11047c478bd9Sstevel@tonic-gate 	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
11057c478bd9Sstevel@tonic-gate 
11060e751525SEric Saxe 	now = gethrtime_unscaled();
11070e751525SEric Saxe 	pg_ev_thread_swtch(cp, now, curthread, next);
11087c478bd9Sstevel@tonic-gate 
11097c478bd9Sstevel@tonic-gate 	/* OK to steal anything left on run queue */
11107c478bd9Sstevel@tonic-gate 	cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
11117c478bd9Sstevel@tonic-gate 
11127c478bd9Sstevel@tonic-gate 	/* record last execution time */
1113d3d50737SRafael Vanoni 	cp->cpu_last_swtch = curthread->t_disp_time = ddi_get_lbolt();
11147c478bd9Sstevel@tonic-gate 
1115f2bd4627Sjohansen 	/*
1116f2bd4627Sjohansen 	 * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq
1117f2bd4627Sjohansen 	 * won't have set its t_waitrq.  Since we now finally know that we're
1118f2bd4627Sjohansen 	 * switching away from this thread, set its t_waitrq if it is on a run
1119f2bd4627Sjohansen 	 * queue.
1120f2bd4627Sjohansen 	 */
1121f2bd4627Sjohansen 	if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
11220e751525SEric Saxe 		curthread->t_waitrq = now;
1123f2bd4627Sjohansen 	}
1124f2bd4627Sjohansen 
1125f2bd4627Sjohansen 	/* restore next thread to previously running microstate */
1126f2bd4627Sjohansen 	restore_mstate(next);
1127f2bd4627Sjohansen 
11287c478bd9Sstevel@tonic-gate 	if (dtrace_vtime_active)
11297c478bd9Sstevel@tonic-gate 		dtrace_vtime_switch(next);
11307c478bd9Sstevel@tonic-gate 
11317c478bd9Sstevel@tonic-gate 	resume(next);
11327c478bd9Sstevel@tonic-gate 	/*
11337c478bd9Sstevel@tonic-gate 	 * The TR_RESUME_END and TR_SWTCH_END trace points
11347c478bd9Sstevel@tonic-gate 	 * appear at the end of resume(), because we may not
11357c478bd9Sstevel@tonic-gate 	 * return here
11367c478bd9Sstevel@tonic-gate 	 */
11377c478bd9Sstevel@tonic-gate }
11387c478bd9Sstevel@tonic-gate 
11397c478bd9Sstevel@tonic-gate static void
11407c478bd9Sstevel@tonic-gate cpu_resched(cpu_t *cp, pri_t tpri)
11417c478bd9Sstevel@tonic-gate {
11427c478bd9Sstevel@tonic-gate 	int	call_poke_cpu = 0;
11437c478bd9Sstevel@tonic-gate 	pri_t   cpupri = cp->cpu_dispatch_pri;
11447c478bd9Sstevel@tonic-gate 
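	/*
	 * Two preemption levels are requested below: a priority at or
	 * above upreemptpri sets cpu_runrun (user-level preemption,
	 * honored via an AST on the next return to user level), while a
	 * priority at or above kpreemptpri also sets cpu_kprunrun
	 * (kernel preemption).  In either case a remote CPU is poked so
	 * that it notices promptly.
	 */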
1145455e370cSJohn Levon 	if (cpupri != CPU_IDLE_PRI && cpupri < tpri) {
11467c478bd9Sstevel@tonic-gate 		TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
11477c478bd9Sstevel@tonic-gate 		    "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
11487c478bd9Sstevel@tonic-gate 		if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
11497c478bd9Sstevel@tonic-gate 			cp->cpu_runrun = 1;
11507c478bd9Sstevel@tonic-gate 			aston(cp->cpu_dispthread);
11517c478bd9Sstevel@tonic-gate 			if (tpri < kpreemptpri && cp != CPU)
11527c478bd9Sstevel@tonic-gate 				call_poke_cpu = 1;
11537c478bd9Sstevel@tonic-gate 		}
11547c478bd9Sstevel@tonic-gate 		if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
11557c478bd9Sstevel@tonic-gate 			cp->cpu_kprunrun = 1;
11567c478bd9Sstevel@tonic-gate 			if (cp != CPU)
11577c478bd9Sstevel@tonic-gate 				call_poke_cpu = 1;
11587c478bd9Sstevel@tonic-gate 		}
11597c478bd9Sstevel@tonic-gate 	}
11607c478bd9Sstevel@tonic-gate 
11617c478bd9Sstevel@tonic-gate 	/*
11627c478bd9Sstevel@tonic-gate 	 * Propagate cpu_runrun and cpu_kprunrun to global visibility.
11637c478bd9Sstevel@tonic-gate 	 */
11647c478bd9Sstevel@tonic-gate 	membar_enter();
11657c478bd9Sstevel@tonic-gate 
11667c478bd9Sstevel@tonic-gate 	if (call_poke_cpu)
11677c478bd9Sstevel@tonic-gate 		poke_cpu(cp->cpu_id);
11687c478bd9Sstevel@tonic-gate }
11697c478bd9Sstevel@tonic-gate 
11707c478bd9Sstevel@tonic-gate /*
11717c478bd9Sstevel@tonic-gate  * setbackdq() keeps runqs balanced such that the difference in length
11727c478bd9Sstevel@tonic-gate  * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
11737c478bd9Sstevel@tonic-gate  * For threads with priorities below RUNQ_MATCH_PRI levels, the runq's lengths
11747c478bd9Sstevel@tonic-gate  * For threads with priorities below RUNQ_MATCH_PRI, the runq lengths
11757c478bd9Sstevel@tonic-gate  * must match.  When the per-thread TS_RUNQMATCH flag is set, setbackdq() will
11767c478bd9Sstevel@tonic-gate  */
11777c478bd9Sstevel@tonic-gate #define	RUNQ_MATCH_PRI	16	/* pri below which queue lengths must match */
11787c478bd9Sstevel@tonic-gate #define	RUNQ_MAX_DIFF	2	/* maximum runq length difference */
11797c478bd9Sstevel@tonic-gate #define	RUNQ_LEN(cp, pri)	((cp)->cpu_disp->disp_q[pri].dq_sruncnt)
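
/*
 * Illustrative sketch (not part of the original source): the balance rule
 * above reduced to a standalone predicate.  The parameter names are
 * hypothetical stand-ins for RUNQ_LEN() and the TS_RUNQMATCH test.
 */
static int
ex_prefer_alternate_runq(pri_t tpri, int runqmatch, int chosen_qlen,
    int alt_qlen)
{
	int qlen = chosen_qlen;

	/* High-priority threads tolerate a difference of RUNQ_MAX_DIFF. */
	if (tpri >= RUNQ_MATCH_PRI && runqmatch == 0)
		qlen -= RUNQ_MAX_DIFF;

	/* Move only when the alternate queue is strictly shorter. */
	return (qlen > 0 && alt_qlen < qlen);
}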
11807c478bd9Sstevel@tonic-gate 
11817c478bd9Sstevel@tonic-gate /*
11826890d023SEric Saxe  * Macro that evaluates to true if it is likely that the thread has cache
11836890d023SEric Saxe  * warmth. This is based on the amount of time that has elapsed since the
11846890d023SEric Saxe  * thread last ran. If that amount of time is less than "rechoose_interval"
11856890d023SEric Saxe  * ticks, then we decide that the thread has enough cache warmth to warrant
11866890d023SEric Saxe  * some affinity for t->t_cpu.
11876890d023SEric Saxe  */
11886890d023SEric Saxe #define	THREAD_HAS_CACHE_WARMTH(thread)	\
11896890d023SEric Saxe 	((thread == curthread) ||	\
1190d3d50737SRafael Vanoni 	((ddi_get_lbolt() - thread->t_disp_time) <= rechoose_interval))
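
/*
 * Illustrative sketch (not part of the original source): the warmth test
 * above as a standalone function; now_ticks and last_disp_ticks are
 * hypothetical stand-ins for ddi_get_lbolt() and t_disp_time.
 */
static int
ex_has_cache_warmth(long now_ticks, long last_disp_ticks,
    long rechoose_ticks, int is_curthread)
{
	/* curthread is still on-CPU, so its working set is warm. */
	return (is_curthread ||
	    (now_ticks - last_disp_ticks) <= rechoose_ticks);
}
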
11916890d023SEric Saxe /*
11927c478bd9Sstevel@tonic-gate  * Put the specified thread on the back of the dispatcher
11937c478bd9Sstevel@tonic-gate  * queue corresponding to its current priority.
11947c478bd9Sstevel@tonic-gate  *
11957c478bd9Sstevel@tonic-gate  * Called with the thread in transition, onproc or stopped state
11967c478bd9Sstevel@tonic-gate  * and locked (transition implies locked) and at high spl.
11977c478bd9Sstevel@tonic-gate  * Returns with the thread in TS_RUN state and still locked.
11987c478bd9Sstevel@tonic-gate  */
11997c478bd9Sstevel@tonic-gate void
12007c478bd9Sstevel@tonic-gate setbackdq(kthread_t *tp)
12017c478bd9Sstevel@tonic-gate {
12027c478bd9Sstevel@tonic-gate 	dispq_t	*dq;
12037c478bd9Sstevel@tonic-gate 	disp_t		*dp;
12047c478bd9Sstevel@tonic-gate 	cpu_t		*cp;
12057c478bd9Sstevel@tonic-gate 	pri_t		tpri;
12067c478bd9Sstevel@tonic-gate 	int		bound;
12076890d023SEric Saxe 	boolean_t	self;
12087c478bd9Sstevel@tonic-gate 
12097c478bd9Sstevel@tonic-gate 	ASSERT(THREAD_LOCK_HELD(tp));
12107c478bd9Sstevel@tonic-gate 	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
12117c478bd9Sstevel@tonic-gate 	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */
12127c478bd9Sstevel@tonic-gate 
12137c478bd9Sstevel@tonic-gate 	/*
12147c478bd9Sstevel@tonic-gate 	 * If the thread is "swapped" or on the swap queue, don't
12157c478bd9Sstevel@tonic-gate 	 * queue it, but wake sched.
12167c478bd9Sstevel@tonic-gate 	 */
12177c478bd9Sstevel@tonic-gate 	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
12187c478bd9Sstevel@tonic-gate 		disp_swapped_setrun(tp);
12197c478bd9Sstevel@tonic-gate 		return;
12207c478bd9Sstevel@tonic-gate 	}
12217c478bd9Sstevel@tonic-gate 
12226890d023SEric Saxe 	self = (tp == curthread);
12236890d023SEric Saxe 
1224abd41583Sgd209917 	if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1225abd41583Sgd209917 		bound = 1;
1226abd41583Sgd209917 	else
1227abd41583Sgd209917 		bound = 0;
1228abd41583Sgd209917 
12297c478bd9Sstevel@tonic-gate 	tpri = DISP_PRIO(tp);
12307c478bd9Sstevel@tonic-gate 	if (ncpus == 1)
12317c478bd9Sstevel@tonic-gate 		cp = tp->t_cpu;
1232abd41583Sgd209917 	else if (!bound) {
12337c478bd9Sstevel@tonic-gate 		if (tpri >= kpqpri) {
12347c478bd9Sstevel@tonic-gate 			setkpdq(tp, SETKP_BACK);
12357c478bd9Sstevel@tonic-gate 			return;
12367c478bd9Sstevel@tonic-gate 		}
12376890d023SEric Saxe 
12387c478bd9Sstevel@tonic-gate 		/*
12396890d023SEric Saxe 		 * We'll generally let this thread continue to run where
12406890d023SEric Saxe 		 * it last ran...but will consider migration if:
1241455e370cSJohn Levon 		 * - The thread probably doesn't have much cache warmth.
1242c3377ee9SJohn Levon 		 * - SMT exclusion would prefer us to run elsewhere
12436890d023SEric Saxe 		 * - The CPU where it last ran is the target of an offline
12446890d023SEric Saxe 		 *   request.
1245455e370cSJohn Levon 		 * - The thread last ran outside its home lgroup.
12467c478bd9Sstevel@tonic-gate 		 */
12476890d023SEric Saxe 		if ((!THREAD_HAS_CACHE_WARMTH(tp)) ||
1248c3377ee9SJohn Levon 		    !smt_should_run(tp, tp->t_cpu) ||
1249455e370cSJohn Levon 		    (tp->t_cpu == cpu_inmotion) ||
1250455e370cSJohn Levon 		    !LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
1251455e370cSJohn Levon 			cp = disp_lowpri_cpu(tp->t_cpu, tp, tpri);
12526890d023SEric Saxe 		} else {
12536890d023SEric Saxe 			cp = tp->t_cpu;
12546890d023SEric Saxe 		}
12557c478bd9Sstevel@tonic-gate 
12567c478bd9Sstevel@tonic-gate 		if (tp->t_cpupart == cp->cpu_part) {
12577c478bd9Sstevel@tonic-gate 			int	qlen;
12587c478bd9Sstevel@tonic-gate 
12597c478bd9Sstevel@tonic-gate 			/*
1260fb2f18f8Sesaxe 			 * Perform any CMT load balancing
12617c478bd9Sstevel@tonic-gate 			 */
1262fb2f18f8Sesaxe 			cp = cmt_balance(tp, cp);
12637c478bd9Sstevel@tonic-gate 
12647c478bd9Sstevel@tonic-gate 			/*
12657c478bd9Sstevel@tonic-gate 			 * Balance across the run queues
12667c478bd9Sstevel@tonic-gate 			 */
12677c478bd9Sstevel@tonic-gate 			qlen = RUNQ_LEN(cp, tpri);
12687c478bd9Sstevel@tonic-gate 			if (tpri >= RUNQ_MATCH_PRI &&
12697c478bd9Sstevel@tonic-gate 			    !(tp->t_schedflag & TS_RUNQMATCH))
12707c478bd9Sstevel@tonic-gate 				qlen -= RUNQ_MAX_DIFF;
12717c478bd9Sstevel@tonic-gate 			if (qlen > 0) {
1272685679f7Sakolb 				cpu_t *newcp;
12737c478bd9Sstevel@tonic-gate 
1274685679f7Sakolb 				if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
1275685679f7Sakolb 					newcp = cp->cpu_next_part;
1276685679f7Sakolb 				} else if ((newcp = cp->cpu_next_lpl) == cp) {
1277685679f7Sakolb 					newcp = cp->cpu_next_part;
12787c478bd9Sstevel@tonic-gate 				}
1279685679f7Sakolb 
1280c3377ee9SJohn Levon 				if (smt_should_run(tp, newcp) &&
1281455e370cSJohn Levon 				    RUNQ_LEN(newcp, tpri) < qlen) {
1282685679f7Sakolb 					DTRACE_PROBE3(runq__balance,
1283685679f7Sakolb 					    kthread_t *, tp,
1284685679f7Sakolb 					    cpu_t *, cp, cpu_t *, newcp);
1285685679f7Sakolb 					cp = newcp;
1286685679f7Sakolb 				}
12877c478bd9Sstevel@tonic-gate 			}
12887c478bd9Sstevel@tonic-gate 		} else {
12897c478bd9Sstevel@tonic-gate 			/*
12907c478bd9Sstevel@tonic-gate 			 * Migrate to a cpu in the new partition.
12917c478bd9Sstevel@tonic-gate 			 */
1292455e370cSJohn Levon 			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist, tp,
1293455e370cSJohn Levon 			    tp->t_pri);
12947c478bd9Sstevel@tonic-gate 		}
12957c478bd9Sstevel@tonic-gate 		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
12967c478bd9Sstevel@tonic-gate 	} else {
12977c478bd9Sstevel@tonic-gate 		/*
12987c478bd9Sstevel@tonic-gate 		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
12997c478bd9Sstevel@tonic-gate 		 * a short time until weak binding that existed when the
13007c478bd9Sstevel@tonic-gate 		 * strong binding was established has dropped) so we must
13017c478bd9Sstevel@tonic-gate 		 * favour weak binding over strong.
13027c478bd9Sstevel@tonic-gate 		 */
13037c478bd9Sstevel@tonic-gate 		cp = tp->t_weakbound_cpu ?
13047c478bd9Sstevel@tonic-gate 		    tp->t_weakbound_cpu : tp->t_bound_cpu;
13057c478bd9Sstevel@tonic-gate 	}
1306f2bd4627Sjohansen 	/*
1307f2bd4627Sjohansen 	 * A thread that is ONPROC may be temporarily placed on the run queue
1308f2bd4627Sjohansen 	 * but then chosen to run again by disp.  If the thread we're placing on
1309f2bd4627Sjohansen 	 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1310f2bd4627Sjohansen 	 * replacement process is actually scheduled in swtch().  In this
1311f2bd4627Sjohansen 	 * situation, curthread is the only thread that could be in the ONPROC
1312f2bd4627Sjohansen 	 * state.
1313f2bd4627Sjohansen 	 */
13146890d023SEric Saxe 	if ((!self) && (tp->t_waitrq == 0)) {
1315f2bd4627Sjohansen 		hrtime_t curtime;
1316f2bd4627Sjohansen 
1317f2bd4627Sjohansen 		curtime = gethrtime_unscaled();
1318f2bd4627Sjohansen 		(void) cpu_update_pct(tp, curtime);
1319f2bd4627Sjohansen 		tp->t_waitrq = curtime;
1320f2bd4627Sjohansen 	} else {
1321f2bd4627Sjohansen 		(void) cpu_update_pct(tp, gethrtime_unscaled());
1322f2bd4627Sjohansen 	}
1323f2bd4627Sjohansen 
13247c478bd9Sstevel@tonic-gate 	dp = cp->cpu_disp;
13257c478bd9Sstevel@tonic-gate 	disp_lock_enter_high(&dp->disp_lock);
13267c478bd9Sstevel@tonic-gate 
13277c478bd9Sstevel@tonic-gate 	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
13287c478bd9Sstevel@tonic-gate 	TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
13297c478bd9Sstevel@tonic-gate 	    tpri, cp, tp);
13307c478bd9Sstevel@tonic-gate 
13317c478bd9Sstevel@tonic-gate #ifndef NPROBE
13327c478bd9Sstevel@tonic-gate 	/* Kernel probe */
13337c478bd9Sstevel@tonic-gate 	if (tnf_tracing_active)
13347c478bd9Sstevel@tonic-gate 		tnf_thread_queue(tp, cp, tpri);
13357c478bd9Sstevel@tonic-gate #endif /* NPROBE */
13367c478bd9Sstevel@tonic-gate 
13377c478bd9Sstevel@tonic-gate 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
13387c478bd9Sstevel@tonic-gate 
13397c478bd9Sstevel@tonic-gate 	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
13407c478bd9Sstevel@tonic-gate 	tp->t_disp_queue = dp;
13417c478bd9Sstevel@tonic-gate 	tp->t_link = NULL;
13427c478bd9Sstevel@tonic-gate 
13437c478bd9Sstevel@tonic-gate 	dq = &dp->disp_q[tpri];
13447c478bd9Sstevel@tonic-gate 	dp->disp_nrunnable++;
1345685679f7Sakolb 	if (!bound)
1346685679f7Sakolb 		dp->disp_steal = 0;
13477c478bd9Sstevel@tonic-gate 	membar_enter();
13487c478bd9Sstevel@tonic-gate 
13497c478bd9Sstevel@tonic-gate 	if (dq->dq_sruncnt++ != 0) {
13507c478bd9Sstevel@tonic-gate 		ASSERT(dq->dq_first != NULL);
13517c478bd9Sstevel@tonic-gate 		dq->dq_last->t_link = tp;
13527c478bd9Sstevel@tonic-gate 		dq->dq_last = tp;
13537c478bd9Sstevel@tonic-gate 	} else {
13547c478bd9Sstevel@tonic-gate 		ASSERT(dq->dq_first == NULL);
13557c478bd9Sstevel@tonic-gate 		ASSERT(dq->dq_last == NULL);
13567c478bd9Sstevel@tonic-gate 		dq->dq_first = dq->dq_last = tp;
13577c478bd9Sstevel@tonic-gate 		BT_SET(dp->disp_qactmap, tpri);
13587c478bd9Sstevel@tonic-gate 		if (tpri > dp->disp_maxrunpri) {
13597c478bd9Sstevel@tonic-gate 			dp->disp_maxrunpri = tpri;
13607c478bd9Sstevel@tonic-gate 			membar_enter();
13617c478bd9Sstevel@tonic-gate 			cpu_resched(cp, tpri);
13627c478bd9Sstevel@tonic-gate 		}
13637c478bd9Sstevel@tonic-gate 	}
13647c478bd9Sstevel@tonic-gate 
13657c478bd9Sstevel@tonic-gate 	if (!bound && tpri > dp->disp_max_unbound_pri) {
13666890d023SEric Saxe 		if (self && dp->disp_max_unbound_pri == -1 && cp == CPU) {
13677c478bd9Sstevel@tonic-gate 			/*
13687c478bd9Sstevel@tonic-gate 			 * If there are no other unbound threads on the
13697c478bd9Sstevel@tonic-gate 			 * run queue, don't allow other CPUs to steal
13707c478bd9Sstevel@tonic-gate 			 * this thread while we are in the middle of a
13717c478bd9Sstevel@tonic-gate 			 * context switch. We may just switch to it
13727c478bd9Sstevel@tonic-gate 			 * again right away. CPU_DISP_DONTSTEAL is cleared
13737c478bd9Sstevel@tonic-gate 			 * in swtch and swtch_to.
13747c478bd9Sstevel@tonic-gate 			 */
13757c478bd9Sstevel@tonic-gate 			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
13767c478bd9Sstevel@tonic-gate 		}
13777c478bd9Sstevel@tonic-gate 		dp->disp_max_unbound_pri = tpri;
13787c478bd9Sstevel@tonic-gate 	}
13797c478bd9Sstevel@tonic-gate 	(*disp_enq_thread)(cp, bound);
13807c478bd9Sstevel@tonic-gate }
13817c478bd9Sstevel@tonic-gate 
13827c478bd9Sstevel@tonic-gate /*
13837c478bd9Sstevel@tonic-gate  * Put the specified thread on the front of the dispatcher
13847c478bd9Sstevel@tonic-gate  * queue corresponding to its current priority.
13857c478bd9Sstevel@tonic-gate  *
13867c478bd9Sstevel@tonic-gate  * Called with the thread in transition, onproc or stopped state
13877c478bd9Sstevel@tonic-gate  * and locked (transition implies locked) and at high spl.
13887c478bd9Sstevel@tonic-gate  * Returns with the thread in TS_RUN state and still locked.
13897c478bd9Sstevel@tonic-gate  */
13907c478bd9Sstevel@tonic-gate void
13917c478bd9Sstevel@tonic-gate setfrontdq(kthread_t *tp)
13927c478bd9Sstevel@tonic-gate {
13937c478bd9Sstevel@tonic-gate 	disp_t		*dp;
13947c478bd9Sstevel@tonic-gate 	dispq_t		*dq;
13957c478bd9Sstevel@tonic-gate 	cpu_t		*cp;
13967c478bd9Sstevel@tonic-gate 	pri_t		tpri;
13977c478bd9Sstevel@tonic-gate 	int		bound;
13987c478bd9Sstevel@tonic-gate 
13997c478bd9Sstevel@tonic-gate 	ASSERT(THREAD_LOCK_HELD(tp));
14007c478bd9Sstevel@tonic-gate 	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
14017c478bd9Sstevel@tonic-gate 	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */
14027c478bd9Sstevel@tonic-gate 
14037c478bd9Sstevel@tonic-gate 	/*
14047c478bd9Sstevel@tonic-gate 	 * If the thread is "swapped" or on the swap queue, don't
14057c478bd9Sstevel@tonic-gate 	 * queue it, but wake sched.
14067c478bd9Sstevel@tonic-gate 	 */
14077c478bd9Sstevel@tonic-gate 	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
14087c478bd9Sstevel@tonic-gate 		disp_swapped_setrun(tp);
14097c478bd9Sstevel@tonic-gate 		return;
14107c478bd9Sstevel@tonic-gate 	}
14117c478bd9Sstevel@tonic-gate 
1412abd41583Sgd209917 	if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1413abd41583Sgd209917 		bound = 1;
1414abd41583Sgd209917 	else
1415abd41583Sgd209917 		bound = 0;
1416abd41583Sgd209917 
14177c478bd9Sstevel@tonic-gate 	tpri = DISP_PRIO(tp);
14187c478bd9Sstevel@tonic-gate 	if (ncpus == 1)
14197c478bd9Sstevel@tonic-gate 		cp = tp->t_cpu;
1420abd41583Sgd209917 	else if (!bound) {
14217c478bd9Sstevel@tonic-gate 		if (tpri >= kpqpri) {
14227c478bd9Sstevel@tonic-gate 			setkpdq(tp, SETKP_FRONT);
14237c478bd9Sstevel@tonic-gate 			return;
14247c478bd9Sstevel@tonic-gate 		}
14257c478bd9Sstevel@tonic-gate 		cp = tp->t_cpu;
14267c478bd9Sstevel@tonic-gate 		if (tp->t_cpupart == cp->cpu_part) {
14277c478bd9Sstevel@tonic-gate 			/*
14286890d023SEric Saxe 			 * We'll generally let this thread continue to run
14296890d023SEric Saxe 			 * where it last ran, but will consider migration if:
1430455e370cSJohn Levon 			 * - The thread last ran outside its home lgroup.
14316890d023SEric Saxe 			 * - The CPU where it last ran is the target of an
14326890d023SEric Saxe 			 *   offline request (a thread_nomigrate() on the
14336890d023SEric Saxe 			 *   in-motion CPU relies on this when forcing a preempt).
14346890d023SEric Saxe 			 * - The thread isn't the highest priority thread where
14356890d023SEric Saxe 			 *   it last ran, and it is considered not likely to
14366890d023SEric Saxe 			 *   have significant cache warmth.
14377c478bd9Sstevel@tonic-gate 			 */
1438455e370cSJohn Levon 			if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp) ||
1439455e370cSJohn Levon 			    cp == cpu_inmotion ||
1440455e370cSJohn Levon 			    (tpri < cp->cpu_disp->disp_maxrunpri &&
1441455e370cSJohn Levon 			    !THREAD_HAS_CACHE_WARMTH(tp))) {
1442455e370cSJohn Levon 				cp = disp_lowpri_cpu(tp->t_cpu, tp, tpri);
14436890d023SEric Saxe 			}
14447c478bd9Sstevel@tonic-gate 		} else {
14457c478bd9Sstevel@tonic-gate 			/*
14467c478bd9Sstevel@tonic-gate 			 * Migrate to a cpu in the new partition.
14477c478bd9Sstevel@tonic-gate 			 */
14487c478bd9Sstevel@tonic-gate 			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1449455e370cSJohn Levon 			    tp, tp->t_pri);
14507c478bd9Sstevel@tonic-gate 		}
14517c478bd9Sstevel@tonic-gate 		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
14527c478bd9Sstevel@tonic-gate 	} else {
14537c478bd9Sstevel@tonic-gate 		/*
14547c478bd9Sstevel@tonic-gate 		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
14557c478bd9Sstevel@tonic-gate 		 * a short time until weak binding that existed when the
14567c478bd9Sstevel@tonic-gate 		 * strong binding was established has dropped) so we must
14577c478bd9Sstevel@tonic-gate 		 * favour weak binding over strong.
14587c478bd9Sstevel@tonic-gate 		 */
14597c478bd9Sstevel@tonic-gate 		cp = tp->t_weakbound_cpu ?
14607c478bd9Sstevel@tonic-gate 		    tp->t_weakbound_cpu : tp->t_bound_cpu;
14617c478bd9Sstevel@tonic-gate 	}
1462f2bd4627Sjohansen 
1463f2bd4627Sjohansen 	/*
1464f2bd4627Sjohansen 	 * A thread that is ONPROC may be temporarily placed on the run queue
1465f2bd4627Sjohansen 	 * but then chosen to run again by disp.  If the thread we're placing on
1466f2bd4627Sjohansen 	 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1467f2bd4627Sjohansen 	 * replacement process is actually scheduled in swtch().  In this
1468f2bd4627Sjohansen 	 * situation, curthread is the only thread that could be in the ONPROC
1469f2bd4627Sjohansen 	 * state.
1470f2bd4627Sjohansen 	 */
1471f2bd4627Sjohansen 	if ((tp != curthread) && (tp->t_waitrq == 0)) {
1472f2bd4627Sjohansen 		hrtime_t curtime;
1473f2bd4627Sjohansen 
1474f2bd4627Sjohansen 		curtime = gethrtime_unscaled();
1475f2bd4627Sjohansen 		(void) cpu_update_pct(tp, curtime);
1476f2bd4627Sjohansen 		tp->t_waitrq = curtime;
1477f2bd4627Sjohansen 	} else {
1478f2bd4627Sjohansen 		(void) cpu_update_pct(tp, gethrtime_unscaled());
1479f2bd4627Sjohansen 	}
1480f2bd4627Sjohansen 
14817c478bd9Sstevel@tonic-gate 	dp = cp->cpu_disp;
14827c478bd9Sstevel@tonic-gate 	disp_lock_enter_high(&dp->disp_lock);
14837c478bd9Sstevel@tonic-gate 
14847c478bd9Sstevel@tonic-gate 	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
14857c478bd9Sstevel@tonic-gate 	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);
14867c478bd9Sstevel@tonic-gate 
14877c478bd9Sstevel@tonic-gate #ifndef NPROBE
14887c478bd9Sstevel@tonic-gate 	/* Kernel probe */
14897c478bd9Sstevel@tonic-gate 	if (tnf_tracing_active)
14907c478bd9Sstevel@tonic-gate 		tnf_thread_queue(tp, cp, tpri);
14917c478bd9Sstevel@tonic-gate #endif /* NPROBE */
14927c478bd9Sstevel@tonic-gate 
14937c478bd9Sstevel@tonic-gate 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
14947c478bd9Sstevel@tonic-gate 
14957c478bd9Sstevel@tonic-gate 	THREAD_RUN(tp, &dp->disp_lock);		/* set TS_RUN state and lock */
14967c478bd9Sstevel@tonic-gate 	tp->t_disp_queue = dp;
14977c478bd9Sstevel@tonic-gate 
14987c478bd9Sstevel@tonic-gate 	dq = &dp->disp_q[tpri];
14997c478bd9Sstevel@tonic-gate 	dp->disp_nrunnable++;
1500685679f7Sakolb 	if (!bound)
1501685679f7Sakolb 		dp->disp_steal = 0;
15027c478bd9Sstevel@tonic-gate 	membar_enter();
15037c478bd9Sstevel@tonic-gate 
15047c478bd9Sstevel@tonic-gate 	if (dq->dq_sruncnt++ != 0) {
15057c478bd9Sstevel@tonic-gate 		ASSERT(dq->dq_last != NULL);
15067c478bd9Sstevel@tonic-gate 		tp->t_link = dq->dq_first;
15077c478bd9Sstevel@tonic-gate 		dq->dq_first = tp;
15087c478bd9Sstevel@tonic-gate 	} else {
15097c478bd9Sstevel@tonic-gate 		ASSERT(dq->dq_last == NULL);
15107c478bd9Sstevel@tonic-gate 		ASSERT(dq->dq_first == NULL);
15117c478bd9Sstevel@tonic-gate 		tp->t_link = NULL;
15127c478bd9Sstevel@tonic-gate 		dq->dq_first = dq->dq_last = tp;
15137c478bd9Sstevel@tonic-gate 		BT_SET(dp->disp_qactmap, tpri);
15147c478bd9Sstevel@tonic-gate 		if (tpri > dp->disp_maxrunpri) {
15157c478bd9Sstevel@tonic-gate 			dp->disp_maxrunpri = tpri;
15167c478bd9Sstevel@tonic-gate 			membar_enter();
15177c478bd9Sstevel@tonic-gate 			cpu_resched(cp, tpri);
15187c478bd9Sstevel@tonic-gate 		}
15197c478bd9Sstevel@tonic-gate 	}
15207c478bd9Sstevel@tonic-gate 
15217c478bd9Sstevel@tonic-gate 	if (!bound && tpri > dp->disp_max_unbound_pri) {
15227c478bd9Sstevel@tonic-gate 		if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
15237c478bd9Sstevel@tonic-gate 		    cp == CPU) {
15247c478bd9Sstevel@tonic-gate 			/*
15257c478bd9Sstevel@tonic-gate 			 * If there are no other unbound threads on the
15267c478bd9Sstevel@tonic-gate 			 * run queue, don't allow other CPUs to steal
15277c478bd9Sstevel@tonic-gate 			 * this thread while we are in the middle of a
15287c478bd9Sstevel@tonic-gate 			 * context switch. We may just switch to it
15297c478bd9Sstevel@tonic-gate 			 * again right away. CPU_DISP_DONTSTEAL is cleared
15307c478bd9Sstevel@tonic-gate 			 * in swtch and swtch_to.
15317c478bd9Sstevel@tonic-gate 			 */
15327c478bd9Sstevel@tonic-gate 			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
15337c478bd9Sstevel@tonic-gate 		}
15347c478bd9Sstevel@tonic-gate 		dp->disp_max_unbound_pri = tpri;
15357c478bd9Sstevel@tonic-gate 	}
15367c478bd9Sstevel@tonic-gate 	(*disp_enq_thread)(cp, bound);
15377c478bd9Sstevel@tonic-gate }
15387c478bd9Sstevel@tonic-gate 
15397c478bd9Sstevel@tonic-gate /*
15407c478bd9Sstevel@tonic-gate  * Put a high-priority unbound thread on the kp queue
15417c478bd9Sstevel@tonic-gate  */
15427c478bd9Sstevel@tonic-gate static void
15437c478bd9Sstevel@tonic-gate setkpdq(kthread_t *tp, int borf)
15447c478bd9Sstevel@tonic-gate {
15457c478bd9Sstevel@tonic-gate 	dispq_t	*dq;
15467c478bd9Sstevel@tonic-gate 	disp_t	*dp;
15477c478bd9Sstevel@tonic-gate 	cpu_t	*cp;
15487c478bd9Sstevel@tonic-gate 	pri_t	tpri;
15497c478bd9Sstevel@tonic-gate 
15507c478bd9Sstevel@tonic-gate 	tpri = DISP_PRIO(tp);
15517c478bd9Sstevel@tonic-gate 
15527c478bd9Sstevel@tonic-gate 	dp = &tp->t_cpupart->cp_kp_queue;
15537c478bd9Sstevel@tonic-gate 	disp_lock_enter_high(&dp->disp_lock);
15547c478bd9Sstevel@tonic-gate 
15557c478bd9Sstevel@tonic-gate 	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
15567c478bd9Sstevel@tonic-gate 
15577c478bd9Sstevel@tonic-gate 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
15587c478bd9Sstevel@tonic-gate 	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
15597c478bd9Sstevel@tonic-gate 	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
15607c478bd9Sstevel@tonic-gate 	tp->t_disp_queue = dp;
15617c478bd9Sstevel@tonic-gate 	dp->disp_nrunnable++;
15627c478bd9Sstevel@tonic-gate 	dq = &dp->disp_q[tpri];
15637c478bd9Sstevel@tonic-gate 
15647c478bd9Sstevel@tonic-gate 	if (dq->dq_sruncnt++ != 0) {
15657c478bd9Sstevel@tonic-gate 		if (borf == SETKP_BACK) {
15667c478bd9Sstevel@tonic-gate 			ASSERT(dq->dq_first != NULL);
15677c478bd9Sstevel@tonic-gate 			tp->t_link = NULL;
15687c478bd9Sstevel@tonic-gate 			dq->dq_last->t_link = tp;
15697c478bd9Sstevel@tonic-gate 			dq->dq_last = tp;
15707c478bd9Sstevel@tonic-gate 		} else {
15717c478bd9Sstevel@tonic-gate 			ASSERT(dq->dq_last != NULL);
15727c478bd9Sstevel@tonic-gate 			tp->t_link = dq->dq_first;
15737c478bd9Sstevel@tonic-gate 			dq->dq_first = tp;
15747c478bd9Sstevel@tonic-gate 		}
15757c478bd9Sstevel@tonic-gate 	} else {
15767c478bd9Sstevel@tonic-gate 		if (borf == SETKP_BACK) {
15777c478bd9Sstevel@tonic-gate 			ASSERT(dq->dq_first == NULL);
15787c478bd9Sstevel@tonic-gate 			ASSERT(dq->dq_last == NULL);
15797c478bd9Sstevel@tonic-gate 			dq->dq_first = dq->dq_last = tp;
15807c478bd9Sstevel@tonic-gate 		} else {
15817c478bd9Sstevel@tonic-gate 			ASSERT(dq->dq_last == NULL);
15827c478bd9Sstevel@tonic-gate 			ASSERT(dq->dq_first == NULL);
15837c478bd9Sstevel@tonic-gate 			tp->t_link = NULL;
15847c478bd9Sstevel@tonic-gate 			dq->dq_first = dq->dq_last = tp;
15857c478bd9Sstevel@tonic-gate 		}
15867c478bd9Sstevel@tonic-gate 		BT_SET(dp->disp_qactmap, tpri);
15877c478bd9Sstevel@tonic-gate 		if (tpri > dp->disp_max_unbound_pri)
15887c478bd9Sstevel@tonic-gate 			dp->disp_max_unbound_pri = tpri;
15897c478bd9Sstevel@tonic-gate 		if (tpri > dp->disp_maxrunpri) {
15907c478bd9Sstevel@tonic-gate 			dp->disp_maxrunpri = tpri;
15917c478bd9Sstevel@tonic-gate 			membar_enter();
15927c478bd9Sstevel@tonic-gate 		}
15937c478bd9Sstevel@tonic-gate 	}
15947c478bd9Sstevel@tonic-gate 
15957c478bd9Sstevel@tonic-gate 	cp = tp->t_cpu;
15967c478bd9Sstevel@tonic-gate 	if (tp->t_cpupart != cp->cpu_part) {
15977c478bd9Sstevel@tonic-gate 		/* migrate to a cpu in the new partition */
15987c478bd9Sstevel@tonic-gate 		cp = tp->t_cpupart->cp_cpulist;
15997c478bd9Sstevel@tonic-gate 	}
1600455e370cSJohn Levon 	cp = disp_lowpri_cpu(cp, tp, tp->t_pri);
16017c478bd9Sstevel@tonic-gate 	disp_lock_enter_high(&cp->cpu_disp->disp_lock);
16027c478bd9Sstevel@tonic-gate 	ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
16037c478bd9Sstevel@tonic-gate 
16047c478bd9Sstevel@tonic-gate #ifndef NPROBE
16057c478bd9Sstevel@tonic-gate 	/* Kernel probe */
16067c478bd9Sstevel@tonic-gate 	if (tnf_tracing_active)
16077c478bd9Sstevel@tonic-gate 		tnf_thread_queue(tp, cp, tpri);
16087c478bd9Sstevel@tonic-gate #endif /* NPROBE */
16097c478bd9Sstevel@tonic-gate 
16107c478bd9Sstevel@tonic-gate 	if (cp->cpu_chosen_level < tpri)
16117c478bd9Sstevel@tonic-gate 		cp->cpu_chosen_level = tpri;
16127c478bd9Sstevel@tonic-gate 	cpu_resched(cp, tpri);
16137c478bd9Sstevel@tonic-gate 	disp_lock_exit_high(&cp->cpu_disp->disp_lock);
16147c478bd9Sstevel@tonic-gate 	(*disp_enq_thread)(cp, 0);
16157c478bd9Sstevel@tonic-gate }
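
/*
 * Illustrative sketch (not part of the original source): how setbackdq()
 * and setfrontdq() route an enqueue.  Unbound threads at or above kpqpri
 * go to the partition-wide kernel-priority (kp) queue, which any CPU in
 * the partition may service; everything else lands on a per-CPU queue.
 */
static int
ex_use_kp_queue(pri_t tpri, pri_t kpq_pri, int bound, int online_cpus)
{
	return (online_cpus > 1 && bound == 0 && tpri >= kpq_pri);
}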
16167c478bd9Sstevel@tonic-gate 
16177c478bd9Sstevel@tonic-gate /*
16187c478bd9Sstevel@tonic-gate  * Remove a thread from the dispatcher queue if it is on it.
16197c478bd9Sstevel@tonic-gate  * It is not an error if it is not found, but we return whether
16207c478bd9Sstevel@tonic-gate  * or not it was found in case the caller wants to check.
16217c478bd9Sstevel@tonic-gate  */
16227c478bd9Sstevel@tonic-gate int
16237c478bd9Sstevel@tonic-gate dispdeq(kthread_t *tp)
16247c478bd9Sstevel@tonic-gate {
16257c478bd9Sstevel@tonic-gate 	disp_t		*dp;
16267c478bd9Sstevel@tonic-gate 	dispq_t		*dq;
16277c478bd9Sstevel@tonic-gate 	kthread_t	*rp;
16287c478bd9Sstevel@tonic-gate 	kthread_t	*trp;
16297c478bd9Sstevel@tonic-gate 	kthread_t	**ptp;
16307c478bd9Sstevel@tonic-gate 	int		tpri;
16317c478bd9Sstevel@tonic-gate 
16327c478bd9Sstevel@tonic-gate 	ASSERT(THREAD_LOCK_HELD(tp));
16337c478bd9Sstevel@tonic-gate 
16347c478bd9Sstevel@tonic-gate 	if (tp->t_state != TS_RUN)
16357c478bd9Sstevel@tonic-gate 		return (0);
16367c478bd9Sstevel@tonic-gate 
16377c478bd9Sstevel@tonic-gate 	/*
16387c478bd9Sstevel@tonic-gate 	 * The thread is "swapped" or is on the swap queue and
16397c478bd9Sstevel@tonic-gate 	 * hence no longer on the run queue, so return true.
16407c478bd9Sstevel@tonic-gate 	 */
16417c478bd9Sstevel@tonic-gate 	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
16427c478bd9Sstevel@tonic-gate 		return (1);
16437c478bd9Sstevel@tonic-gate 
16447c478bd9Sstevel@tonic-gate 	tpri = DISP_PRIO(tp);
16457c478bd9Sstevel@tonic-gate 	dp = tp->t_disp_queue;
16467c478bd9Sstevel@tonic-gate 	ASSERT(tpri < dp->disp_npri);
16477c478bd9Sstevel@tonic-gate 	dq = &dp->disp_q[tpri];
16487c478bd9Sstevel@tonic-gate 	ptp = &dq->dq_first;
16497c478bd9Sstevel@tonic-gate 	rp = *ptp;
16507c478bd9Sstevel@tonic-gate 	trp = NULL;
16517c478bd9Sstevel@tonic-gate 
16527c478bd9Sstevel@tonic-gate 	ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
16537c478bd9Sstevel@tonic-gate 
16547c478bd9Sstevel@tonic-gate 	/*
16557c478bd9Sstevel@tonic-gate 	 * Search for thread in queue.
16567c478bd9Sstevel@tonic-gate 	 * Double links would simplify this at the expense of disp/setrun.
16577c478bd9Sstevel@tonic-gate 	 */
16587c478bd9Sstevel@tonic-gate 	while (rp != tp && rp != NULL) {
16597c478bd9Sstevel@tonic-gate 		trp = rp;
16607c478bd9Sstevel@tonic-gate 		ptp = &trp->t_link;
16617c478bd9Sstevel@tonic-gate 		rp = trp->t_link;
16627c478bd9Sstevel@tonic-gate 	}
16637c478bd9Sstevel@tonic-gate 
16647c478bd9Sstevel@tonic-gate 	if (rp == NULL) {
16657c478bd9Sstevel@tonic-gate 		panic("dispdeq: thread not on queue");
16667c478bd9Sstevel@tonic-gate 	}
16677c478bd9Sstevel@tonic-gate 
16687c478bd9Sstevel@tonic-gate 	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
16697c478bd9Sstevel@tonic-gate 
16707c478bd9Sstevel@tonic-gate 	/*
16717c478bd9Sstevel@tonic-gate 	 * Found it so remove it from queue.
16727c478bd9Sstevel@tonic-gate 	 */
16737c478bd9Sstevel@tonic-gate 	if ((*ptp = rp->t_link) == NULL)
16747c478bd9Sstevel@tonic-gate 		dq->dq_last = trp;
16757c478bd9Sstevel@tonic-gate 
16767c478bd9Sstevel@tonic-gate 	dp->disp_nrunnable--;
16777c478bd9Sstevel@tonic-gate 	if (--dq->dq_sruncnt == 0) {
16787c478bd9Sstevel@tonic-gate 		dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
16797c478bd9Sstevel@tonic-gate 		if (dp->disp_nrunnable == 0) {
16807c478bd9Sstevel@tonic-gate 			dp->disp_max_unbound_pri = -1;
16817c478bd9Sstevel@tonic-gate 			dp->disp_maxrunpri = -1;
16827c478bd9Sstevel@tonic-gate 		} else if (tpri == dp->disp_maxrunpri) {
16837c478bd9Sstevel@tonic-gate 			int ipri;
16847c478bd9Sstevel@tonic-gate 
16857c478bd9Sstevel@tonic-gate 			ipri = bt_gethighbit(dp->disp_qactmap,
16867c478bd9Sstevel@tonic-gate 			    dp->disp_maxrunpri >> BT_ULSHIFT);
16877c478bd9Sstevel@tonic-gate 			if (ipri < dp->disp_max_unbound_pri)
16887c478bd9Sstevel@tonic-gate 				dp->disp_max_unbound_pri = ipri;
16897c478bd9Sstevel@tonic-gate 			dp->disp_maxrunpri = ipri;
16907c478bd9Sstevel@tonic-gate 		}
16917c478bd9Sstevel@tonic-gate 	}
16927c478bd9Sstevel@tonic-gate 	tp->t_link = NULL;
16937c478bd9Sstevel@tonic-gate 	THREAD_TRANSITION(tp);		/* put in intermediate state */
16947c478bd9Sstevel@tonic-gate 	return (1);
16957c478bd9Sstevel@tonic-gate }
16967c478bd9Sstevel@tonic-gate 
16977c478bd9Sstevel@tonic-gate 
16987c478bd9Sstevel@tonic-gate /*
16997c478bd9Sstevel@tonic-gate  * dq_sruninc and dq_srundec are public functions for
17007c478bd9Sstevel@tonic-gate  * incrementing/decrementing the sruncnts when a thread on
17017c478bd9Sstevel@tonic-gate  * a dispatcher queue is made schedulable/unschedulable by
17027c478bd9Sstevel@tonic-gate  * resetting the TS_LOAD flag.
17037c478bd9Sstevel@tonic-gate  *
17047c478bd9Sstevel@tonic-gate  * The caller MUST have the thread lock and therefore the dispatcher
17057c478bd9Sstevel@tonic-gate  * queue lock so that the operation which changes
17067c478bd9Sstevel@tonic-gate  * the flag, the operation that checks the status of the thread to
17077c478bd9Sstevel@tonic-gate  * determine if it's on a disp queue AND the call to this function
17087c478bd9Sstevel@tonic-gate  * are one atomic operation with respect to interrupts.
17097c478bd9Sstevel@tonic-gate  */
17107c478bd9Sstevel@tonic-gate 
17117c478bd9Sstevel@tonic-gate /*
17127c478bd9Sstevel@tonic-gate  * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
17137c478bd9Sstevel@tonic-gate  */
17147c478bd9Sstevel@tonic-gate void
17157c478bd9Sstevel@tonic-gate dq_sruninc(kthread_t *t)
17167c478bd9Sstevel@tonic-gate {
17177c478bd9Sstevel@tonic-gate 	ASSERT(t->t_state == TS_RUN);
17187c478bd9Sstevel@tonic-gate 	ASSERT(t->t_schedflag & TS_LOAD);
17197c478bd9Sstevel@tonic-gate 
17207c478bd9Sstevel@tonic-gate 	THREAD_TRANSITION(t);
17217c478bd9Sstevel@tonic-gate 	setfrontdq(t);
17227c478bd9Sstevel@tonic-gate }
17237c478bd9Sstevel@tonic-gate 
17247c478bd9Sstevel@tonic-gate /*
17257c478bd9Sstevel@tonic-gate  * See comment on calling conventions above.
17267c478bd9Sstevel@tonic-gate  * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
17277c478bd9Sstevel@tonic-gate  */
17287c478bd9Sstevel@tonic-gate void
17297c478bd9Sstevel@tonic-gate dq_srundec(kthread_t *t)
17307c478bd9Sstevel@tonic-gate {
17317c478bd9Sstevel@tonic-gate 	ASSERT(t->t_schedflag & TS_LOAD);
17327c478bd9Sstevel@tonic-gate 
17337c478bd9Sstevel@tonic-gate 	(void) dispdeq(t);
17347c478bd9Sstevel@tonic-gate 	disp_swapped_enq(t);
17357c478bd9Sstevel@tonic-gate }
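
/*
 * Illustrative call pattern for a hypothetical caller (not from the
 * original source): the TS_LOAD change, the state check and the queue
 * adjustment must form one atomic step under the thread lock, e.g.:
 *
 *	thread_lock(t);
 *	if (t->t_state == TS_RUN && (t->t_schedflag & TS_LOAD) == 0) {
 *		t->t_schedflag |= TS_LOAD;	(swap-in is complete)
 *		dq_sruninc(t);			(make it schedulable)
 *	}
 *	thread_unlock(t);
 */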
17367c478bd9Sstevel@tonic-gate 
17377c478bd9Sstevel@tonic-gate /*
17387c478bd9Sstevel@tonic-gate  * Change the dispatcher lock of thread to the "swapped_lock"
17397c478bd9Sstevel@tonic-gate  * and return with thread lock still held.
17407c478bd9Sstevel@tonic-gate  *
17417c478bd9Sstevel@tonic-gate  * Called with thread_lock held, in transition state, and at high spl.
17427c478bd9Sstevel@tonic-gate  */
17437c478bd9Sstevel@tonic-gate void
17447c478bd9Sstevel@tonic-gate disp_swapped_enq(kthread_t *tp)
17457c478bd9Sstevel@tonic-gate {
17467c478bd9Sstevel@tonic-gate 	ASSERT(THREAD_LOCK_HELD(tp));
17477c478bd9Sstevel@tonic-gate 	ASSERT(tp->t_schedflag & TS_LOAD);
17487c478bd9Sstevel@tonic-gate 
17497c478bd9Sstevel@tonic-gate 	switch (tp->t_state) {
17507c478bd9Sstevel@tonic-gate 	case TS_RUN:
17517c478bd9Sstevel@tonic-gate 		disp_lock_enter_high(&swapped_lock);
17527c478bd9Sstevel@tonic-gate 		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
17537c478bd9Sstevel@tonic-gate 		break;
17547c478bd9Sstevel@tonic-gate 	case TS_ONPROC:
17557c478bd9Sstevel@tonic-gate 		disp_lock_enter_high(&swapped_lock);
17567c478bd9Sstevel@tonic-gate 		THREAD_TRANSITION(tp);
17577c478bd9Sstevel@tonic-gate 		wake_sched_sec = 1;		/* tell clock to wake sched */
17587c478bd9Sstevel@tonic-gate 		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
17597c478bd9Sstevel@tonic-gate 		break;
17607c478bd9Sstevel@tonic-gate 	default:
17617c478bd9Sstevel@tonic-gate 		panic("disp_swapped_enq: tp: %p bad t_state", (void *)tp);
17627c478bd9Sstevel@tonic-gate 	}
17637c478bd9Sstevel@tonic-gate }
17647c478bd9Sstevel@tonic-gate 
17657c478bd9Sstevel@tonic-gate /*
17667c478bd9Sstevel@tonic-gate  * This routine is called by setbackdq/setfrontdq if the thread is
17677c478bd9Sstevel@tonic-gate  * not loaded, or is loaded but on the swap queue.
17687c478bd9Sstevel@tonic-gate  *
17697c478bd9Sstevel@tonic-gate  * Thread state TS_SLEEP implies that a swapped thread
17707c478bd9Sstevel@tonic-gate  * has been woken up and needs to be swapped in by the swapper.
17717c478bd9Sstevel@tonic-gate  *
17727c478bd9Sstevel@tonic-gate  * Thread state TS_RUN implies that the priority of a swapped
17737c478bd9Sstevel@tonic-gate  * thread is being increased by scheduling class (e.g. ts_update).
17747c478bd9Sstevel@tonic-gate  */
17757c478bd9Sstevel@tonic-gate static void
17767c478bd9Sstevel@tonic-gate disp_swapped_setrun(kthread_t *tp)
17777c478bd9Sstevel@tonic-gate {
17787c478bd9Sstevel@tonic-gate 	ASSERT(THREAD_LOCK_HELD(tp));
17797c478bd9Sstevel@tonic-gate 	ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);
17807c478bd9Sstevel@tonic-gate 
17817c478bd9Sstevel@tonic-gate 	switch (tp->t_state) {
17827c478bd9Sstevel@tonic-gate 	case TS_SLEEP:
17837c478bd9Sstevel@tonic-gate 		disp_lock_enter_high(&swapped_lock);
17847c478bd9Sstevel@tonic-gate 		/*
17857c478bd9Sstevel@tonic-gate 		 * Wakeup sched immediately (i.e., next tick) if the
17867c478bd9Sstevel@tonic-gate 		 * thread priority is above maxclsyspri.
17877c478bd9Sstevel@tonic-gate 		 * Wake sched immediately (i.e., next tick) if the
17887c478bd9Sstevel@tonic-gate 		if (DISP_PRIO(tp) > maxclsyspri)
17897c478bd9Sstevel@tonic-gate 			wake_sched = 1;
17907c478bd9Sstevel@tonic-gate 		else
17917c478bd9Sstevel@tonic-gate 			wake_sched_sec = 1;
17927c478bd9Sstevel@tonic-gate 		THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */
17937c478bd9Sstevel@tonic-gate 		break;
17947c478bd9Sstevel@tonic-gate 	case TS_RUN:				/* called from ts_update */
17957c478bd9Sstevel@tonic-gate 		break;
17967c478bd9Sstevel@tonic-gate 	default:
17978793b36bSNick Todd 		panic("disp_swapped_setrun: tp: %p bad t_state", (void *)tp);
17987c478bd9Sstevel@tonic-gate 	}
17997c478bd9Sstevel@tonic-gate }
18007c478bd9Sstevel@tonic-gate 
18017c478bd9Sstevel@tonic-gate /*
18027c478bd9Sstevel@tonic-gate  *	Make a thread give up its processor.  Find the processor on
18037c478bd9Sstevel@tonic-gate  *	which this thread is executing, and have that processor
18047c478bd9Sstevel@tonic-gate  *	preempt.
180535a5a358SJonathan Adams  *
180635a5a358SJonathan Adams  *	We allow System Duty Cycle (SDC) threads to be preempted even if
180735a5a358SJonathan Adams  *	they are running at kernel priorities.  To implement this, we always
180835a5a358SJonathan Adams  *	set cpu_kprunrun; this ensures preempt() will be called.  Since SDC
180935a5a358SJonathan Adams  *	calls cpu_surrender() very often, we only preempt if there is anyone
181035a5a358SJonathan Adams  *	competing with us.
18117c478bd9Sstevel@tonic-gate  */
18127c478bd9Sstevel@tonic-gate void
18137c478bd9Sstevel@tonic-gate cpu_surrender(kthread_t *tp)
18147c478bd9Sstevel@tonic-gate {
18157c478bd9Sstevel@tonic-gate 	cpu_t	*cpup;
18167c478bd9Sstevel@tonic-gate 	int	max_pri;
18177c478bd9Sstevel@tonic-gate 	int	max_run_pri;
18187c478bd9Sstevel@tonic-gate 	klwp_t	*lwp;
18197c478bd9Sstevel@tonic-gate 
18207c478bd9Sstevel@tonic-gate 	ASSERT(THREAD_LOCK_HELD(tp));
18217c478bd9Sstevel@tonic-gate 
18227c478bd9Sstevel@tonic-gate 	if (tp->t_state != TS_ONPROC)
18237c478bd9Sstevel@tonic-gate 		return;
18247c478bd9Sstevel@tonic-gate 	cpup = tp->t_disp_queue->disp_cpu;	/* CPU thread dispatched to */
18257c478bd9Sstevel@tonic-gate 	max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
18267c478bd9Sstevel@tonic-gate 	max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
18277c478bd9Sstevel@tonic-gate 	if (max_pri < max_run_pri)
18287c478bd9Sstevel@tonic-gate 		max_pri = max_run_pri;
18297c478bd9Sstevel@tonic-gate 
183035a5a358SJonathan Adams 	if (tp->t_cid == sysdccid) {
183135a5a358SJonathan Adams 		uint_t t_pri = DISP_PRIO(tp);
183235a5a358SJonathan Adams 		if (t_pri > max_pri)
183335a5a358SJonathan Adams 			return;		/* we are not competing w/ anyone */
183435a5a358SJonathan Adams 		cpup->cpu_runrun = cpup->cpu_kprunrun = 1;
183535a5a358SJonathan Adams 	} else {
18367c478bd9Sstevel@tonic-gate 		cpup->cpu_runrun = 1;
18377c478bd9Sstevel@tonic-gate 		if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
18387c478bd9Sstevel@tonic-gate 			cpup->cpu_kprunrun = 1;
18397c478bd9Sstevel@tonic-gate 		}
184035a5a358SJonathan Adams 	}
18417c478bd9Sstevel@tonic-gate 
18427c478bd9Sstevel@tonic-gate 	/*
18437c478bd9Sstevel@tonic-gate 	 * Propagate cpu_runrun and cpu_kprunrun to global visibility.
18447c478bd9Sstevel@tonic-gate 	 */
18457c478bd9Sstevel@tonic-gate 	membar_enter();
18467c478bd9Sstevel@tonic-gate 
18477c478bd9Sstevel@tonic-gate 	DTRACE_SCHED1(surrender, kthread_t *, tp);
18487c478bd9Sstevel@tonic-gate 
18497c478bd9Sstevel@tonic-gate 	/*
18507c478bd9Sstevel@tonic-gate 	 * Make the target thread take an excursion through trap()
18517c478bd9Sstevel@tonic-gate 	 * to do preempt() (unless we're already in trap or post_syscall,
18527c478bd9Sstevel@tonic-gate 	 * calling cpu_surrender via CL_TRAPRET).
18537c478bd9Sstevel@tonic-gate 	 */
18547c478bd9Sstevel@tonic-gate 	if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
18557c478bd9Sstevel@tonic-gate 	    lwp->lwp_state != LWP_USER) {
18567c478bd9Sstevel@tonic-gate 		aston(tp);
18577c478bd9Sstevel@tonic-gate 		if (cpup != CPU)
18587c478bd9Sstevel@tonic-gate 			poke_cpu(cpup->cpu_id);
18597c478bd9Sstevel@tonic-gate 	}
18607c478bd9Sstevel@tonic-gate 	TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
18617c478bd9Sstevel@tonic-gate 	    "cpu_surrender:tid %p cpu %p", tp, cpup);
18627c478bd9Sstevel@tonic-gate }
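
/*
 * Illustrative sketch (not part of the original source): the flag choice
 * cpu_surrender() makes above, with hypothetical stand-in parameters.
 */
static void
ex_surrender_flags(int is_sdc, pri_t tpri, pri_t max_pri, pri_t kpreempt_pri,
    char *runrun, char *kprunrun)
{
	if (is_sdc) {
		/* SDC: preempt only when someone is actually competing. */
		if (tpri > max_pri)
			return;
		*runrun = *kprunrun = 1;
	} else {
		*runrun = 1;
		if (max_pri >= kpreempt_pri)
			*kprunrun = 1;
	}
}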
18637c478bd9Sstevel@tonic-gate 
18647c478bd9Sstevel@tonic-gate /*
18657c478bd9Sstevel@tonic-gate  * Commit to and ratify a scheduling decision
18667c478bd9Sstevel@tonic-gate  */
18677c478bd9Sstevel@tonic-gate /*ARGSUSED*/
18687c478bd9Sstevel@tonic-gate static kthread_t *
18697c478bd9Sstevel@tonic-gate disp_ratify(kthread_t *tp, disp_t *kpq)
18707c478bd9Sstevel@tonic-gate {
18717c478bd9Sstevel@tonic-gate 	pri_t	tpri, maxpri;
18727c478bd9Sstevel@tonic-gate 	pri_t	maxkpri;
18737c478bd9Sstevel@tonic-gate 	cpu_t	*cpup;
18747c478bd9Sstevel@tonic-gate 
18757c478bd9Sstevel@tonic-gate 	ASSERT(tp != NULL);
18767c478bd9Sstevel@tonic-gate 	/*
18777c478bd9Sstevel@tonic-gate 	 * Commit to, then ratify scheduling decision
18787c478bd9Sstevel@tonic-gate 	 */
18797c478bd9Sstevel@tonic-gate 	cpup = CPU;
18807c478bd9Sstevel@tonic-gate 	if (cpup->cpu_runrun != 0)
18817c478bd9Sstevel@tonic-gate 		cpup->cpu_runrun = 0;
18827c478bd9Sstevel@tonic-gate 	if (cpup->cpu_kprunrun != 0)
18837c478bd9Sstevel@tonic-gate 		cpup->cpu_kprunrun = 0;
18847c478bd9Sstevel@tonic-gate 	if (cpup->cpu_chosen_level != -1)
18857c478bd9Sstevel@tonic-gate 		cpup->cpu_chosen_level = -1;
18867c478bd9Sstevel@tonic-gate 	membar_enter();
18877c478bd9Sstevel@tonic-gate 	tpri = DISP_PRIO(tp);
18887c478bd9Sstevel@tonic-gate 	maxpri = cpup->cpu_disp->disp_maxrunpri;
18897c478bd9Sstevel@tonic-gate 	maxkpri = kpq->disp_maxrunpri;
18907c478bd9Sstevel@tonic-gate 	if (maxpri < maxkpri)
18917c478bd9Sstevel@tonic-gate 		maxpri = maxkpri;
18927c478bd9Sstevel@tonic-gate 	if (tpri < maxpri) {
18937c478bd9Sstevel@tonic-gate 		/*
18947c478bd9Sstevel@tonic-gate 		 * We should have done better;
18957c478bd9Sstevel@tonic-gate 		 * put this one back and indicate that the caller should retry.
18967c478bd9Sstevel@tonic-gate 		 */
18977c478bd9Sstevel@tonic-gate 		cpup->cpu_dispthread = curthread;	/* fixup dispthread */
18987c478bd9Sstevel@tonic-gate 		cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
18997c478bd9Sstevel@tonic-gate 		thread_lock_high(tp);
19007c478bd9Sstevel@tonic-gate 		THREAD_TRANSITION(tp);
19017c478bd9Sstevel@tonic-gate 		setfrontdq(tp);
19027c478bd9Sstevel@tonic-gate 		thread_unlock_nopreempt(tp);
19037c478bd9Sstevel@tonic-gate 
19047c478bd9Sstevel@tonic-gate 		tp = NULL;
19057c478bd9Sstevel@tonic-gate 	}
19067c478bd9Sstevel@tonic-gate 	return (tp);
19077c478bd9Sstevel@tonic-gate }
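
/*
 * Illustrative sketch (not part of the original source): the ratify step
 * reduced to its comparison.  A zero result means a better thread became
 * runnable while we were committing, so the chosen thread is requeued
 * and the caller must pick again.
 */
static int
ex_ratified(pri_t chosen_pri, pri_t local_maxrunpri, pri_t kp_maxrunpri)
{
	pri_t maxpri = local_maxrunpri;

	if (maxpri < kp_maxrunpri)
		maxpri = kp_maxrunpri;

	return (chosen_pri >= maxpri);
}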
19087c478bd9Sstevel@tonic-gate 
19097c478bd9Sstevel@tonic-gate /*
19107c478bd9Sstevel@tonic-gate  * See if there is any work on the dispatcher queue for other CPUs.
19117c478bd9Sstevel@tonic-gate  * If there is, dequeue the best thread and return.
19127c478bd9Sstevel@tonic-gate  */
19137c478bd9Sstevel@tonic-gate static kthread_t *
19147c478bd9Sstevel@tonic-gate disp_getwork(cpu_t *cp)
19157c478bd9Sstevel@tonic-gate {
19167c478bd9Sstevel@tonic-gate 	cpu_t		*ocp;		/* other CPU */
19177c478bd9Sstevel@tonic-gate 	cpu_t		*ocp_start;
19187c478bd9Sstevel@tonic-gate 	cpu_t		*tcp;		/* target local CPU */
19197c478bd9Sstevel@tonic-gate 	kthread_t	*tp;
1920685679f7Sakolb 	kthread_t	*retval = NULL;
19217c478bd9Sstevel@tonic-gate 	pri_t		maxpri;
19227c478bd9Sstevel@tonic-gate 	disp_t		*kpq;		/* kp queue for this partition */
19237c478bd9Sstevel@tonic-gate 	lpl_t		*lpl, *lpl_leaf;
19246890d023SEric Saxe 	int		leafidx, startidx;
1925685679f7Sakolb 	hrtime_t	stealtime;
19266890d023SEric Saxe 	lgrp_id_t	local_id;
19277c478bd9Sstevel@tonic-gate 
19287c478bd9Sstevel@tonic-gate 	maxpri = -1;
19297c478bd9Sstevel@tonic-gate 	tcp = NULL;
19307c478bd9Sstevel@tonic-gate 
19317c478bd9Sstevel@tonic-gate 	kpq = &cp->cpu_part->cp_kp_queue;
19327c478bd9Sstevel@tonic-gate 	while (kpq->disp_maxrunpri >= 0) {
19337c478bd9Sstevel@tonic-gate 		/*
19347c478bd9Sstevel@tonic-gate 		 * Try to take a thread from the kp_queue.
19357c478bd9Sstevel@tonic-gate 		 */
19367c478bd9Sstevel@tonic-gate 		tp = (disp_getbest(kpq));
19377c478bd9Sstevel@tonic-gate 		if (tp)
19387c478bd9Sstevel@tonic-gate 			return (disp_ratify(tp, kpq));
19397c478bd9Sstevel@tonic-gate 	}
19407c478bd9Sstevel@tonic-gate 
1941ab761399Sesaxe 	kpreempt_disable();		/* protect the cpu_active list */
19427c478bd9Sstevel@tonic-gate 
19437c478bd9Sstevel@tonic-gate 	/*
19447c478bd9Sstevel@tonic-gate 	 * Try to find something to do on another CPU's run queue.
19457c478bd9Sstevel@tonic-gate 	 * Loop through all other CPUs looking for the one with the highest
19467c478bd9Sstevel@tonic-gate 	 * priority unbound thread.
19477c478bd9Sstevel@tonic-gate 	 *
19487c478bd9Sstevel@tonic-gate 	 * On NUMA machines, the partition's CPUs are consulted in order of
19497c478bd9Sstevel@tonic-gate 	 * distance from the current CPU. This way, the first available
19507c478bd9Sstevel@tonic-gate 	 * work found is also the closest, and will suffer the least
19517c478bd9Sstevel@tonic-gate 	 * from being migrated.
19527c478bd9Sstevel@tonic-gate 	 */
19537c478bd9Sstevel@tonic-gate 	lpl = lpl_leaf = cp->cpu_lpl;
19546890d023SEric Saxe 	local_id = lpl_leaf->lpl_lgrpid;
19556890d023SEric Saxe 	leafidx = startidx = 0;
19567c478bd9Sstevel@tonic-gate 
19577c478bd9Sstevel@tonic-gate 	/*
19587c478bd9Sstevel@tonic-gate 	 * This loop traverses the lpl hierarchy. Higher level lpls represent
19597c478bd9Sstevel@tonic-gate 	 * broader levels of locality
19607c478bd9Sstevel@tonic-gate 	 */
19617c478bd9Sstevel@tonic-gate 	do {
19627c478bd9Sstevel@tonic-gate 		/* This loop iterates over the lpl's leaves */
19637c478bd9Sstevel@tonic-gate 		do {
19647c478bd9Sstevel@tonic-gate 			if (lpl_leaf != cp->cpu_lpl)
19657c478bd9Sstevel@tonic-gate 				ocp = lpl_leaf->lpl_cpus;
19667c478bd9Sstevel@tonic-gate 			else
19677c478bd9Sstevel@tonic-gate 				ocp = cp->cpu_next_lpl;
19687c478bd9Sstevel@tonic-gate 
19697c478bd9Sstevel@tonic-gate 			/* This loop iterates over the CPUs in the leaf */
19707c478bd9Sstevel@tonic-gate 			ocp_start = ocp;
19717c478bd9Sstevel@tonic-gate 			do {
19727c478bd9Sstevel@tonic-gate 				pri_t pri;
19737c478bd9Sstevel@tonic-gate 
19747c478bd9Sstevel@tonic-gate 				ASSERT(CPU_ACTIVE(ocp));
19757c478bd9Sstevel@tonic-gate 
19767c478bd9Sstevel@tonic-gate 				/*
197739bac370Sesaxe 				 * End our stroll around this lpl if:
19787c478bd9Sstevel@tonic-gate 				 *
19797c478bd9Sstevel@tonic-gate 				 * - Something became runnable on the local
198039bac370Sesaxe 				 *   queue...which also ends our stroll around
198139bac370Sesaxe 				 *   the partition.
19827c478bd9Sstevel@tonic-gate 				 *
198339bac370Sesaxe 				 * - We happen across another idle CPU.
198439bac370Sesaxe 				 *   Since it is patrolling the next portion
198539bac370Sesaxe 				 *   of the lpl's list (assuming it's not
19866890d023SEric Saxe 				 *   halted, or busy servicing an interrupt),
19876890d023SEric Saxe 				 *   move to the next higher level of locality.
19887c478bd9Sstevel@tonic-gate 				 */
198939bac370Sesaxe 				if (cp->cpu_disp->disp_nrunnable != 0) {
199039bac370Sesaxe 					kpreempt_enable();
199139bac370Sesaxe 					return (NULL);
199239bac370Sesaxe 				}
19937c478bd9Sstevel@tonic-gate 				if (ocp->cpu_dispatch_pri == -1) {
19947c478bd9Sstevel@tonic-gate 					if (ocp->cpu_disp_flags &
19956890d023SEric Saxe 					    CPU_DISP_HALTED ||
19966890d023SEric Saxe 					    ocp->cpu_intr_actv != 0)
19977c478bd9Sstevel@tonic-gate 						continue;
199839bac370Sesaxe 					else
19996890d023SEric Saxe 						goto next_level;
20007c478bd9Sstevel@tonic-gate 				}
20017c478bd9Sstevel@tonic-gate 
20027c478bd9Sstevel@tonic-gate 				/*
20037c478bd9Sstevel@tonic-gate 				 * If there's only one thread and the CPU
20047c478bd9Sstevel@tonic-gate 				 * is in the middle of a context switch,
20057c478bd9Sstevel@tonic-gate 				 * or it's currently running the idle thread,
20067c478bd9Sstevel@tonic-gate 				 * don't steal it.
20077c478bd9Sstevel@tonic-gate 				 */
20087c478bd9Sstevel@tonic-gate 				if ((ocp->cpu_disp_flags &
20097c478bd9Sstevel@tonic-gate 				    CPU_DISP_DONTSTEAL) &&
20107c478bd9Sstevel@tonic-gate 				    ocp->cpu_disp->disp_nrunnable == 1)
20117c478bd9Sstevel@tonic-gate 					continue;
20127c478bd9Sstevel@tonic-gate 
20137c478bd9Sstevel@tonic-gate 				pri = ocp->cpu_disp->disp_max_unbound_pri;
20147c478bd9Sstevel@tonic-gate 				if (pri > maxpri) {
2015685679f7Sakolb 					/*
2016685679f7Sakolb 					 * Don't steal threads that we attempted
2017fb2f18f8Sesaxe 					 * to steal recently until they're ready
2018fb2f18f8Sesaxe 					 * to be stolen again.
2019685679f7Sakolb 					 */
2020685679f7Sakolb 					stealtime = ocp->cpu_disp->disp_steal;
2021685679f7Sakolb 					if (stealtime == 0 ||
2022685679f7Sakolb 					    stealtime - gethrtime() <= 0) {
20237c478bd9Sstevel@tonic-gate 						maxpri = pri;
20247c478bd9Sstevel@tonic-gate 						tcp = ocp;
2025685679f7Sakolb 					} else {
2026685679f7Sakolb 						/*
2027685679f7Sakolb 						 * Don't update tcp, just set
2028685679f7Sakolb 						 * the retval to T_DONTSTEAL, so
2029685679f7Sakolb 						 * that if no acceptable CPUs
2030685679f7Sakolb 						 * are found the return value
2031685679f7Sakolb 						 * will be T_DONTSTEAL rather
2032685679f7Sakolb 						 * than NULL.
2033685679f7Sakolb 						 */
2034685679f7Sakolb 						retval = T_DONTSTEAL;
2035685679f7Sakolb 					}
20367c478bd9Sstevel@tonic-gate 				}
20377c478bd9Sstevel@tonic-gate 			} while ((ocp = ocp->cpu_next_lpl) != ocp_start);
20387c478bd9Sstevel@tonic-gate 
20396890d023SEric Saxe 			/*
20406890d023SEric Saxe 			 * Iterate to the next leaf lpl in the resource set
20416890d023SEric Saxe 			 * at this level of locality. If we hit the end of
20426890d023SEric Saxe 			 * the set, wrap back around to the beginning.
20436890d023SEric Saxe 			 *
20446890d023SEric Saxe 			 * Note: This iteration is NULL-terminated for a reason;
20456890d023SEric Saxe 			 * see lpl_topo_bootstrap() in lgrp.c for details.
20466890d023SEric Saxe 			 */
20477c478bd9Sstevel@tonic-gate 			if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
20487c478bd9Sstevel@tonic-gate 				leafidx = 0;
20497c478bd9Sstevel@tonic-gate 				lpl_leaf = lpl->lpl_rset[leafidx];
20507c478bd9Sstevel@tonic-gate 			}
20516890d023SEric Saxe 		} while (leafidx != startidx);
20527c478bd9Sstevel@tonic-gate 
20536890d023SEric Saxe next_level:
20546890d023SEric Saxe 		/*
20556890d023SEric Saxe 		 * Expand the search to include farther away CPUs (next
20566890d023SEric Saxe 		 * locality level). The closer CPUs that have already been
20576890d023SEric Saxe 		 * checked will be checked again. In doing so, idle CPUs
20586890d023SEric Saxe 			 * will tend to be more aggressive about stealing from CPUs
20596890d023SEric Saxe 		 * that are closer (since the closer CPUs will be considered
20606890d023SEric Saxe 		 * more often).
20616890d023SEric Saxe 		 * Begin at this level with the CPU's local leaf lpl.
20626890d023SEric Saxe 		 */
20636890d023SEric Saxe 		if ((lpl = lpl->lpl_parent) != NULL) {
20646890d023SEric Saxe 			leafidx = startidx = lpl->lpl_id2rset[local_id];
20656890d023SEric Saxe 			lpl_leaf = lpl->lpl_rset[leafidx];
20666890d023SEric Saxe 		}
20677c478bd9Sstevel@tonic-gate 	} while (!tcp && lpl);
20687c478bd9Sstevel@tonic-gate 
2069ab761399Sesaxe 	kpreempt_enable();
20707c478bd9Sstevel@tonic-gate 
20717c478bd9Sstevel@tonic-gate 	/*
20727c478bd9Sstevel@tonic-gate 	 * If another queue looks good, and there is still nothing on
20737c478bd9Sstevel@tonic-gate 	 * the local queue, try to transfer one or more threads
20747c478bd9Sstevel@tonic-gate 	 * from it to our queue.
20757c478bd9Sstevel@tonic-gate 	 */
20767c478bd9Sstevel@tonic-gate 	if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
2077685679f7Sakolb 		tp = disp_getbest(tcp->cpu_disp);
2078685679f7Sakolb 		if (tp == NULL || tp == T_DONTSTEAL)
2079685679f7Sakolb 			return (tp);
20807c478bd9Sstevel@tonic-gate 		return (disp_ratify(tp, kpq));
20817c478bd9Sstevel@tonic-gate 	}
2082685679f7Sakolb 	return (retval);
20837c478bd9Sstevel@tonic-gate }
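
/*
 * A minimal userland sketch (under simplified assumptions, not part of
 * the kernel build) of the expanding work-stealing search above: scan
 * every leaf in the current lpl's resource set starting from a
 * preferred index, wrap around at the NULL terminator, and escalate to
 * the parent lpl when nothing is found.  The ex_lpl type, its
 * "preferred" field, and the leaf_has_work callback are illustrative
 * stand-ins, not the real lpl_t layout.
 */
#if 0
struct ex_lpl {
	struct ex_lpl	*parent;	/* next (wider) locality level */
	struct ex_lpl	**rset;		/* NULL-terminated array of leaves */
	int		preferred;	/* rset index of the local leaf */
};

static struct ex_lpl *
ex_find_leaf(struct ex_lpl *lpl, int (*leaf_has_work)(struct ex_lpl *))
{
	while (lpl != NULL) {
		int start = lpl->preferred;
		int i = start;

		do {
			if (leaf_has_work(lpl->rset[i]))
				return (lpl->rset[i]);
			if (lpl->rset[++i] == NULL)
				i = 0;		/* wrap to the beginning */
		} while (i != start);
		lpl = lpl->parent;		/* widen the search */
	}
	return (NULL);
}
#endif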
20847c478bd9Sstevel@tonic-gate 
20857c478bd9Sstevel@tonic-gate 
20867c478bd9Sstevel@tonic-gate /*
20877c478bd9Sstevel@tonic-gate  * disp_fix_unbound_pri()
20887c478bd9Sstevel@tonic-gate  *	Determines the maximum priority of unbound threads on the queue.
20897c478bd9Sstevel@tonic-gate  *	The priority is kept for the queue, but is only increased, never
20907c478bd9Sstevel@tonic-gate  *	reduced unless some CPU is looking for something on that queue.
20917c478bd9Sstevel@tonic-gate  *
20927c478bd9Sstevel@tonic-gate  *	The priority argument is the known upper limit.
20937c478bd9Sstevel@tonic-gate  *
20947c478bd9Sstevel@tonic-gate  *	Perhaps this should be kept accurately, but that probably means
20957c478bd9Sstevel@tonic-gate  *	separate bitmaps for bound and unbound threads.  Since only idled
20967c478bd9Sstevel@tonic-gate  *	CPUs will have to do this recalculation, it seems better this way.
20977c478bd9Sstevel@tonic-gate  */
20987c478bd9Sstevel@tonic-gate static void
20997c478bd9Sstevel@tonic-gate disp_fix_unbound_pri(disp_t *dp, pri_t pri)
21007c478bd9Sstevel@tonic-gate {
21017c478bd9Sstevel@tonic-gate 	kthread_t	*tp;
21027c478bd9Sstevel@tonic-gate 	dispq_t		*dq;
21037c478bd9Sstevel@tonic-gate 	ulong_t		*dqactmap = dp->disp_qactmap;
21047c478bd9Sstevel@tonic-gate 	ulong_t		mapword;
21057c478bd9Sstevel@tonic-gate 	int		wx;
21067c478bd9Sstevel@tonic-gate 
21077c478bd9Sstevel@tonic-gate 	ASSERT(DISP_LOCK_HELD(&dp->disp_lock));
21087c478bd9Sstevel@tonic-gate 
21097c478bd9Sstevel@tonic-gate 	ASSERT(pri >= 0);			/* checked by caller */
21107c478bd9Sstevel@tonic-gate 
21117c478bd9Sstevel@tonic-gate 	/*
21127c478bd9Sstevel@tonic-gate 	 * Start the search at the next lowest priority below the supplied
21137c478bd9Sstevel@tonic-gate 	 * priority.  This depends on the bitmap implementation.
21147c478bd9Sstevel@tonic-gate 	 */
21157c478bd9Sstevel@tonic-gate 	do {
21167c478bd9Sstevel@tonic-gate 		wx = pri >> BT_ULSHIFT;		/* index of word in map */
21177c478bd9Sstevel@tonic-gate 
21187c478bd9Sstevel@tonic-gate 		/*
21197c478bd9Sstevel@tonic-gate 		 * Form mask for all lower priorities in the word.
21207c478bd9Sstevel@tonic-gate 		 */
21217c478bd9Sstevel@tonic-gate 		mapword = dqactmap[wx] & (BT_BIW(pri) - 1);
21227c478bd9Sstevel@tonic-gate 
21237c478bd9Sstevel@tonic-gate 		/*
21247c478bd9Sstevel@tonic-gate 		 * Get next lower active priority.
21257c478bd9Sstevel@tonic-gate 		 */
21267c478bd9Sstevel@tonic-gate 		if (mapword != 0) {
21277c478bd9Sstevel@tonic-gate 			pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
21287c478bd9Sstevel@tonic-gate 		} else if (wx > 0) {
21297c478bd9Sstevel@tonic-gate 			pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
21307c478bd9Sstevel@tonic-gate 			if (pri < 0)
21317c478bd9Sstevel@tonic-gate 				break;
21327c478bd9Sstevel@tonic-gate 		} else {
21337c478bd9Sstevel@tonic-gate 			pri = -1;
21347c478bd9Sstevel@tonic-gate 			break;
21357c478bd9Sstevel@tonic-gate 		}
21367c478bd9Sstevel@tonic-gate 
21377c478bd9Sstevel@tonic-gate 		/*
21387c478bd9Sstevel@tonic-gate 		 * Search the queue for unbound, runnable threads.
21397c478bd9Sstevel@tonic-gate 		 */
21407c478bd9Sstevel@tonic-gate 		dq = &dp->disp_q[pri];
21417c478bd9Sstevel@tonic-gate 		tp = dq->dq_first;
21427c478bd9Sstevel@tonic-gate 
21437c478bd9Sstevel@tonic-gate 		while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
21447c478bd9Sstevel@tonic-gate 			tp = tp->t_link;
21457c478bd9Sstevel@tonic-gate 		}
21467c478bd9Sstevel@tonic-gate 
21477c478bd9Sstevel@tonic-gate 		/*
21487c478bd9Sstevel@tonic-gate 		 * If a thread was found, set the priority and return.
21497c478bd9Sstevel@tonic-gate 		 */
21507c478bd9Sstevel@tonic-gate 	} while (tp == NULL);
21517c478bd9Sstevel@tonic-gate 
21527c478bd9Sstevel@tonic-gate 	/*
21537c478bd9Sstevel@tonic-gate 	 * pri holds the maximum unbound thread priority or -1.
21547c478bd9Sstevel@tonic-gate 	 */
21557c478bd9Sstevel@tonic-gate 	if (dp->disp_max_unbound_pri != pri)
21567c478bd9Sstevel@tonic-gate 		dp->disp_max_unbound_pri = pri;
21577c478bd9Sstevel@tonic-gate }
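
/*
 * A minimal userland model (not part of the kernel build) of the bitmap
 * walk above.  On a 64-bit system the BT_ULSHIFT/BT_BIW()/highbit()
 * macros reduce to the word/bit arithmetic below; next_lower_pri()
 * returns the highest set bit strictly below "pri", or -1 if none.
 * Using GCC's __builtin_clzl() in place of highbit() is an assumption
 * for illustration.
 */
#if 0
static int
next_lower_pri(const unsigned long *map, int pri)
{
	int wx = pri / 64;			/* index of word in map */
	unsigned long word = map[wx] & ((1UL << (pri % 64)) - 1);

	while (word == 0) {			/* scan lower words */
		if (wx-- == 0)
			return (-1);
		word = map[wx];
	}
	return (wx * 64 + 63 - (int)__builtin_clzl(word));
}
#endif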
21587c478bd9Sstevel@tonic-gate 
21597c478bd9Sstevel@tonic-gate /*
21607c478bd9Sstevel@tonic-gate  * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
21617c478bd9Sstevel@tonic-gate  * 	check if the CPU to which it was previously bound should have
21627c478bd9Sstevel@tonic-gate  * 	its disp_max_unbound_pri increased.
21637c478bd9Sstevel@tonic-gate  */
21647c478bd9Sstevel@tonic-gate void
21657c478bd9Sstevel@tonic-gate disp_adjust_unbound_pri(kthread_t *tp)
21667c478bd9Sstevel@tonic-gate {
21677c478bd9Sstevel@tonic-gate 	disp_t *dp;
21687c478bd9Sstevel@tonic-gate 	pri_t tpri;
21697c478bd9Sstevel@tonic-gate 
21707c478bd9Sstevel@tonic-gate 	ASSERT(THREAD_LOCK_HELD(tp));
21717c478bd9Sstevel@tonic-gate 
21727c478bd9Sstevel@tonic-gate 	/*
21737c478bd9Sstevel@tonic-gate 	 * Don't do anything if the thread is not bound, or
21747c478bd9Sstevel@tonic-gate 	 * currently not runnable or swapped out.
21757c478bd9Sstevel@tonic-gate 	 */
21767c478bd9Sstevel@tonic-gate 	if (tp->t_bound_cpu == NULL ||
21777c478bd9Sstevel@tonic-gate 	    tp->t_state != TS_RUN ||
21787c478bd9Sstevel@tonic-gate 	    tp->t_schedflag & TS_ON_SWAPQ)
21797c478bd9Sstevel@tonic-gate 		return;
21807c478bd9Sstevel@tonic-gate 
21817c478bd9Sstevel@tonic-gate 	tpri = DISP_PRIO(tp);
21827c478bd9Sstevel@tonic-gate 	dp = tp->t_bound_cpu->cpu_disp;
21837c478bd9Sstevel@tonic-gate 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
21847c478bd9Sstevel@tonic-gate 	if (tpri > dp->disp_max_unbound_pri)
21857c478bd9Sstevel@tonic-gate 		dp->disp_max_unbound_pri = tpri;
21867c478bd9Sstevel@tonic-gate }
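
/*
 * Illustrative scenario: a thread at priority 60 becomes unbound while
 * sitting on the run queue of a CPU whose disp_max_unbound_pri is 30.
 * The queue's value is raised to 60 here, so idle CPUs looking for
 * work will consider stealing the thread.
 */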
21877c478bd9Sstevel@tonic-gate 
21887c478bd9Sstevel@tonic-gate /*
2189685679f7Sakolb  * disp_getbest()
2190685679f7Sakolb  *   De-queue the highest priority unbound runnable thread.
2191685679f7Sakolb  *   Returns with the thread unlocked and onproc but at splhigh (like disp()).
2192685679f7Sakolb  *   Returns NULL if nothing found.
2193685679f7Sakolb  *   Returns T_DONTSTEAL if the thread was not stealable,
2194685679f7Sakolb  *   so that the caller will try again later.
21957c478bd9Sstevel@tonic-gate  *
2196685679f7Sakolb  *   Passed a pointer to a dispatch queue not associated with this CPU.
21987c478bd9Sstevel@tonic-gate  */
21997c478bd9Sstevel@tonic-gate static kthread_t *
22007c478bd9Sstevel@tonic-gate disp_getbest(disp_t *dp)
22017c478bd9Sstevel@tonic-gate {
22027c478bd9Sstevel@tonic-gate 	kthread_t	*tp;
22037c478bd9Sstevel@tonic-gate 	dispq_t		*dq;
22047c478bd9Sstevel@tonic-gate 	pri_t		pri;
2205685679f7Sakolb 	cpu_t		*cp, *tcp;
2206685679f7Sakolb 	boolean_t	allbound;
22077c478bd9Sstevel@tonic-gate 
22087c478bd9Sstevel@tonic-gate 	disp_lock_enter(&dp->disp_lock);
22097c478bd9Sstevel@tonic-gate 
22107c478bd9Sstevel@tonic-gate 	/*
22117c478bd9Sstevel@tonic-gate 	 * If there is nothing to run, or the CPU is in the middle of a
22127c478bd9Sstevel@tonic-gate 	 * context switch of the only thread, return NULL.
22137c478bd9Sstevel@tonic-gate 	 */
2214685679f7Sakolb 	tcp = dp->disp_cpu;
2215685679f7Sakolb 	cp = CPU;
22167c478bd9Sstevel@tonic-gate 	pri = dp->disp_max_unbound_pri;
22177c478bd9Sstevel@tonic-gate 	if (pri == -1 ||
2218685679f7Sakolb 	    (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
2219685679f7Sakolb 	    tcp->cpu_disp->disp_nrunnable == 1)) {
22207c478bd9Sstevel@tonic-gate 		disp_lock_exit_nopreempt(&dp->disp_lock);
22217c478bd9Sstevel@tonic-gate 		return (NULL);
22227c478bd9Sstevel@tonic-gate 	}
22237c478bd9Sstevel@tonic-gate 
22247c478bd9Sstevel@tonic-gate 	dq = &dp->disp_q[pri];
2225685679f7Sakolb 
22277c478bd9Sstevel@tonic-gate 	/*
2228685679f7Sakolb 	 * Assume that all threads on this queue are bound; clear the flag
2229685679f7Sakolb 	 * as soon as we find one that is not.
22307c478bd9Sstevel@tonic-gate 	 */
2231685679f7Sakolb 	allbound = B_TRUE;
2232685679f7Sakolb 	for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
2233685679f7Sakolb 		hrtime_t now, nosteal, rqtime;
2234685679f7Sakolb 
2235685679f7Sakolb 		/*
2236685679f7Sakolb 		 * Skip over bound threads which could be here even
2237685679f7Sakolb 		 * though disp_max_unbound_pri indicated this level.
2238685679f7Sakolb 		 */
2239685679f7Sakolb 		if (tp->t_bound_cpu || tp->t_weakbound_cpu)
2240685679f7Sakolb 			continue;
2241685679f7Sakolb 
2242685679f7Sakolb 		/*
2243685679f7Sakolb 		 * We've got some unbound threads on this queue, so turn
2244685679f7Sakolb 		 * the allbound flag off now.
2245685679f7Sakolb 		 */
2246685679f7Sakolb 		allbound = B_FALSE;
2247685679f7Sakolb 
2248685679f7Sakolb 		/*
2249685679f7Sakolb 		 * The thread is a candidate for stealing from its run queue. We
2250685679f7Sakolb 		 * don't want to steal threads that became runnable just a
2251685679f7Sakolb 		 * moment ago. This improves CPU affinity for threads that get
2252685679f7Sakolb 		 * preempted for short periods of time and go back on the run
2253685679f7Sakolb 		 * queue.
2254685679f7Sakolb 		 *
2255685679f7Sakolb 		 * We want to let it stay on its run queue if it was only placed
2256685679f7Sakolb 		 * there recently and it was running on the same CPU before that
2257685679f7Sakolb 		 * to preserve its cache investment. For the thread to remain on
2258685679f7Sakolb 		 * its run queue, ALL of the following conditions must be
2259685679f7Sakolb 		 * satisfied:
2260685679f7Sakolb 		 *
2261685679f7Sakolb 		 * - the disp queue should not be the kernel preemption queue
2262685679f7Sakolb 		 * - delayed idle stealing should not be disabled
2263685679f7Sakolb 		 * - nosteal_nsec should be non-zero
2264685679f7Sakolb 		 * - it should run with user priority
2265685679f7Sakolb 		 * - it should be on the run queue of the CPU where it was
2266685679f7Sakolb 		 *   running before being placed on the run queue
2267685679f7Sakolb 		 * - it should be the only thread on the run queue (to prevent
2268685679f7Sakolb 		 *   extra scheduling latency for other threads)
2269685679f7Sakolb 		 * - it should sit on the run queue for less than the per-chip
2270685679f7Sakolb 		 *   or global nosteal interval
2271685679f7Sakolb 		 * - in the case of CPUs with a shared cache, it should sit on
2272685679f7Sakolb 		 *   the run queue of a CPU from a different chip
2273685679f7Sakolb 		 *
2274685679f7Sakolb 		 * The checks are arranged so that the ones that are faster are
2275685679f7Sakolb 		 * placed earlier.
2276685679f7Sakolb 		 */
2277685679f7Sakolb 		if (tcp == NULL ||
2278685679f7Sakolb 		    pri >= minclsyspri ||
2279685679f7Sakolb 		    tp->t_cpu != tcp)
2280685679f7Sakolb 			break;
2281685679f7Sakolb 
2282685679f7Sakolb 		/*
2283fb2f18f8Sesaxe 		 * Steal immediately if, due to the CMT processor architecture,
2284fb2f18f8Sesaxe 		 * migration between cp and tcp would incur no performance
2285fb2f18f8Sesaxe 		 * penalty.
2286685679f7Sakolb 		 */
2287fb2f18f8Sesaxe 		if (pg_cmt_can_migrate(cp, tcp))
2288685679f7Sakolb 			break;
2289685679f7Sakolb 
2290fb2f18f8Sesaxe 		nosteal = nosteal_nsec;
2291fb2f18f8Sesaxe 		if (nosteal == 0)
2292685679f7Sakolb 			break;
2293685679f7Sakolb 
2294685679f7Sakolb 		/*
2295685679f7Sakolb 		 * Calculate time spent sitting on run queue
2296685679f7Sakolb 		 */
2297685679f7Sakolb 		now = gethrtime_unscaled();
2298685679f7Sakolb 		rqtime = now - tp->t_waitrq;
2299685679f7Sakolb 		scalehrtime(&rqtime);
2300685679f7Sakolb 
2301685679f7Sakolb 		/*
2302685679f7Sakolb 		 * Steal immediately if the time spent on this run queue is more
2303685679f7Sakolb 		 * than the allowed nosteal delay.
2304685679f7Sakolb 		 *
2305685679f7Sakolb 		 * Negative rqtime check is needed here to avoid infinite
2306685679f7Sakolb 		 * stealing delays caused by unlikely but not impossible
2307685679f7Sakolb 		 * drifts between CPU times on different CPUs.
2308685679f7Sakolb 		 */
2309685679f7Sakolb 		if (rqtime > nosteal || rqtime < 0)
2310685679f7Sakolb 			break;
2311685679f7Sakolb 
2312685679f7Sakolb 		DTRACE_PROBE4(nosteal, kthread_t *, tp,
2313685679f7Sakolb 		    cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
2314685679f7Sakolb 		scalehrtime(&now);
2315685679f7Sakolb 		/*
2316685679f7Sakolb 		 * Calculate when this thread becomes stealable
2317685679f7Sakolb 		 */
2318685679f7Sakolb 		now += (nosteal - rqtime);
2319685679f7Sakolb 
2320685679f7Sakolb 		/*
2321685679f7Sakolb 		 * Calculate time when some thread becomes stealable
2322685679f7Sakolb 		 */
2323685679f7Sakolb 		if (now < dp->disp_steal)
2324685679f7Sakolb 			dp->disp_steal = now;
23257c478bd9Sstevel@tonic-gate 	}
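
	/*
	 * Illustrative arithmetic for the embargo above: with a nosteal
	 * interval of 100us and a candidate that has waited rqtime = 30us,
	 * the thread becomes stealable at now + (100us - 30us), i.e. 70us
	 * from now, and dp->disp_steal is pulled back to that time whenever
	 * it is earlier than the queue's current value.
	 */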
23267c478bd9Sstevel@tonic-gate 
23277c478bd9Sstevel@tonic-gate 	/*
23287c478bd9Sstevel@tonic-gate 	 * If there were no unbound threads on this queue, recompute
2329685679f7Sakolb 	 * disp_max_unbound_pri to reflect where unbound threads actually
2330685679f7Sakolb 	 * are, and return.  The value is not always accurate because it
2331685679f7Sakolb 	 * isn't reduced until another idle CPU looks for work.
2332685679f7Sakolb 	 */
2333685679f7Sakolb 	if (allbound)
2334685679f7Sakolb 		disp_fix_unbound_pri(dp, pri);
2335685679f7Sakolb 
2336685679f7Sakolb 	/*
2337685679f7Sakolb 	 * If we reached the end of the queue and found no unbound threads
2338685679f7Sakolb 	 * then return NULL so that other CPUs will be considered.  If there
2339685679f7Sakolb 	 * are unbound threads but they cannot yet be stolen, then
2340685679f7Sakolb 	 * return T_DONTSTEAL and try again later.
23417c478bd9Sstevel@tonic-gate 	 */
23427c478bd9Sstevel@tonic-gate 	if (tp == NULL) {
23437c478bd9Sstevel@tonic-gate 		disp_lock_exit_nopreempt(&dp->disp_lock);
2344685679f7Sakolb 		return (allbound ? NULL : T_DONTSTEAL);
23457c478bd9Sstevel@tonic-gate 	}
23467c478bd9Sstevel@tonic-gate 
23477c478bd9Sstevel@tonic-gate 	/*
23487c478bd9Sstevel@tonic-gate 	 * Found a runnable, unbound thread, so remove it from queue.
23497c478bd9Sstevel@tonic-gate 	 * dispdeq() requires that we have the thread locked, and we do,
23507c478bd9Sstevel@tonic-gate 	 * by virtue of holding the dispatch queue lock.  dispdeq() will
23517c478bd9Sstevel@tonic-gate 	 * put the thread in transition state, thereby dropping the dispq
23527c478bd9Sstevel@tonic-gate 	 * lock.
23537c478bd9Sstevel@tonic-gate 	 */
2354685679f7Sakolb 
23557c478bd9Sstevel@tonic-gate #ifdef DEBUG
23567c478bd9Sstevel@tonic-gate 	{
23577c478bd9Sstevel@tonic-gate 		int	thread_was_on_queue;
23587c478bd9Sstevel@tonic-gate 
23597c478bd9Sstevel@tonic-gate 		thread_was_on_queue = dispdeq(tp);	/* drops disp_lock */
23607c478bd9Sstevel@tonic-gate 		ASSERT(thread_was_on_queue);
23617c478bd9Sstevel@tonic-gate 	}
2362685679f7Sakolb 
23637c478bd9Sstevel@tonic-gate #else /* DEBUG */
23647c478bd9Sstevel@tonic-gate 	(void) dispdeq(tp);			/* drops disp_lock */
23657c478bd9Sstevel@tonic-gate #endif /* DEBUG */
23667c478bd9Sstevel@tonic-gate 
2367685679f7Sakolb 	/*
2368685679f7Sakolb 	 * Reset the disp_queue steal time - we do not know what the smallest
2369685679f7Sakolb 	 * value across the queue is.
2370685679f7Sakolb 	 */
2371685679f7Sakolb 	dp->disp_steal = 0;
2372685679f7Sakolb 
23737c478bd9Sstevel@tonic-gate 	tp->t_schedflag |= TS_DONT_SWAP;
23747c478bd9Sstevel@tonic-gate 
23757c478bd9Sstevel@tonic-gate 	/*
23767c478bd9Sstevel@tonic-gate 	 * Setup thread to run on the current CPU.
23777c478bd9Sstevel@tonic-gate 	 */
23787c478bd9Sstevel@tonic-gate 	tp->t_disp_queue = cp->cpu_disp;
23797c478bd9Sstevel@tonic-gate 
23807c478bd9Sstevel@tonic-gate 	cp->cpu_dispthread = tp;		/* protected by spl only */
23817c478bd9Sstevel@tonic-gate 	cp->cpu_dispatch_pri = pri;
23820f500aa6Sbpramod 
23830f500aa6Sbpramod 	/*
23840f500aa6Sbpramod 	 * There can be a memory synchronization race between disp_getbest()
23850f500aa6Sbpramod 	 * and disp_ratify() vs cpu_resched() where cpu_resched() is trying
23860f500aa6Sbpramod 	 * to preempt the current thread to run the enqueued thread while
23870f500aa6Sbpramod 	 * disp_getbest() and disp_ratify() are changing the current thread
23880f500aa6Sbpramod 	 * to the stolen thread. This may lead to a situation where
23890f500aa6Sbpramod 	 * cpu_resched() tries to preempt the wrong thread and the
23900f500aa6Sbpramod 	 * stolen thread continues to run on the CPU which has been tagged
23910f500aa6Sbpramod 	 * for preemption.
23920f500aa6Sbpramod 	 * Later the clock thread gets enqueued but doesn't get to run on the
23930f500aa6Sbpramod 	 * CPU causing the system to hang.
23940f500aa6Sbpramod 	 *
23950f500aa6Sbpramod 	 * To avoid this, grabbing and dropping the disp_lock (which does
23960f500aa6Sbpramod 	 * a memory barrier) is needed to synchronize the execution of
23970f500aa6Sbpramod 	 * cpu_resched() with disp_getbest() and disp_ratify() and
23980f500aa6Sbpramod 	 * synchronize the memory read and written by cpu_resched(),
23990f500aa6Sbpramod 	 * disp_getbest(), and disp_ratify() with each other.
24000f500aa6Sbpramod 	 *  (see CR#6482861 for more details).
24010f500aa6Sbpramod 	 */
24020f500aa6Sbpramod 	disp_lock_enter_high(&cp->cpu_disp->disp_lock);
24030f500aa6Sbpramod 	disp_lock_exit_high(&cp->cpu_disp->disp_lock);
24040f500aa6Sbpramod 
24057c478bd9Sstevel@tonic-gate 	ASSERT(pri == DISP_PRIO(tp));
24067c478bd9Sstevel@tonic-gate 
2407685679f7Sakolb 	DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);
2408685679f7Sakolb 
24097c478bd9Sstevel@tonic-gate 	thread_onproc(tp, cp);			/* set t_state to TS_ONPROC */
24107c478bd9Sstevel@tonic-gate 
24117c478bd9Sstevel@tonic-gate 	/*
24127c478bd9Sstevel@tonic-gate 	 * Return with spl high so that swtch() won't need to raise it.
24137c478bd9Sstevel@tonic-gate 	 * The disp_lock was dropped by dispdeq().
24147c478bd9Sstevel@tonic-gate 	 */
24157c478bd9Sstevel@tonic-gate 
24167c478bd9Sstevel@tonic-gate 	return (tp);
24177c478bd9Sstevel@tonic-gate }
24187c478bd9Sstevel@tonic-gate 
24197c478bd9Sstevel@tonic-gate /*
24207c478bd9Sstevel@tonic-gate  * disp_bound_common() - common routine for higher level functions
24217c478bd9Sstevel@tonic-gate  *	that check for bound threads under certain conditions.
24227c478bd9Sstevel@tonic-gate  *	If 'threadlistsafe' is set then there is no need to acquire
24237c478bd9Sstevel@tonic-gate  *	pidlock to stop the thread list from changing (eg, if
24247c478bd9Sstevel@tonic-gate  *	disp_bound_* is called with cpus paused).
24257c478bd9Sstevel@tonic-gate  */
24267c478bd9Sstevel@tonic-gate static int
24277c478bd9Sstevel@tonic-gate disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
24287c478bd9Sstevel@tonic-gate {
24297c478bd9Sstevel@tonic-gate 	int		found = 0;
24307c478bd9Sstevel@tonic-gate 	kthread_t	*tp;
24317c478bd9Sstevel@tonic-gate 
24327c478bd9Sstevel@tonic-gate 	ASSERT(flag);
24337c478bd9Sstevel@tonic-gate 
24347c478bd9Sstevel@tonic-gate 	if (!threadlistsafe)
24357c478bd9Sstevel@tonic-gate 		mutex_enter(&pidlock);
24367c478bd9Sstevel@tonic-gate 	tp = curthread;		/* faster than allthreads */
24377c478bd9Sstevel@tonic-gate 	do {
24387c478bd9Sstevel@tonic-gate 		if (tp->t_state != TS_FREE) {
24397c478bd9Sstevel@tonic-gate 			/*
24407c478bd9Sstevel@tonic-gate 			 * If an interrupt thread is busy, but the
24417c478bd9Sstevel@tonic-gate 			 * caller doesn't care (i.e. BOUND_INTR is off),
24427c478bd9Sstevel@tonic-gate 			 * then just ignore it and continue through.
24437c478bd9Sstevel@tonic-gate 			 */
24447c478bd9Sstevel@tonic-gate 			if ((tp->t_flag & T_INTR_THREAD) &&
24457c478bd9Sstevel@tonic-gate 			    !(flag & BOUND_INTR))
24467c478bd9Sstevel@tonic-gate 				continue;
24477c478bd9Sstevel@tonic-gate 
24487c478bd9Sstevel@tonic-gate 			/*
24497c478bd9Sstevel@tonic-gate 			 * Skip the idle thread for the CPU
24507c478bd9Sstevel@tonic-gate 			 * we're about to set offline.
24517c478bd9Sstevel@tonic-gate 			 */
24527c478bd9Sstevel@tonic-gate 			if (tp == cp->cpu_idle_thread)
24537c478bd9Sstevel@tonic-gate 				continue;
24547c478bd9Sstevel@tonic-gate 
24557c478bd9Sstevel@tonic-gate 			/*
24567c478bd9Sstevel@tonic-gate 			 * Skip the pause thread for the CPU
24577c478bd9Sstevel@tonic-gate 			 * we're about to set offline.
24587c478bd9Sstevel@tonic-gate 			 */
24597c478bd9Sstevel@tonic-gate 			if (tp == cp->cpu_pause_thread)
24607c478bd9Sstevel@tonic-gate 				continue;
24617c478bd9Sstevel@tonic-gate 
24627c478bd9Sstevel@tonic-gate 			if ((flag & BOUND_CPU) &&
24637c478bd9Sstevel@tonic-gate 			    (tp->t_bound_cpu == cp ||
24647c478bd9Sstevel@tonic-gate 			    tp->t_bind_cpu == cp->cpu_id ||
24657c478bd9Sstevel@tonic-gate 			    tp->t_weakbound_cpu == cp)) {
24667c478bd9Sstevel@tonic-gate 				found = 1;
24677c478bd9Sstevel@tonic-gate 				break;
24687c478bd9Sstevel@tonic-gate 			}
24697c478bd9Sstevel@tonic-gate 
24707c478bd9Sstevel@tonic-gate 			if ((flag & BOUND_PARTITION) &&
24717c478bd9Sstevel@tonic-gate 			    (tp->t_cpupart == cp->cpu_part)) {
24727c478bd9Sstevel@tonic-gate 				found = 1;
24737c478bd9Sstevel@tonic-gate 				break;
24747c478bd9Sstevel@tonic-gate 			}
24757c478bd9Sstevel@tonic-gate 		}
24767c478bd9Sstevel@tonic-gate 	} while ((tp = tp->t_next) != curthread && found == 0);
24777c478bd9Sstevel@tonic-gate 	if (!threadlistsafe)
24787c478bd9Sstevel@tonic-gate 		mutex_exit(&pidlock);
24797c478bd9Sstevel@tonic-gate 	return (found);
24807c478bd9Sstevel@tonic-gate }
24817c478bd9Sstevel@tonic-gate 
24827c478bd9Sstevel@tonic-gate /*
24837c478bd9Sstevel@tonic-gate  * disp_bound_threads - return nonzero if threads are bound to the processor.
24847c478bd9Sstevel@tonic-gate  *	Called infrequently.  Keep this simple.
24857c478bd9Sstevel@tonic-gate  *	Includes threads that are asleep or stopped but not onproc.
24867c478bd9Sstevel@tonic-gate  */
24877c478bd9Sstevel@tonic-gate int
24887c478bd9Sstevel@tonic-gate disp_bound_threads(cpu_t *cp, int threadlistsafe)
24897c478bd9Sstevel@tonic-gate {
24907c478bd9Sstevel@tonic-gate 	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
24917c478bd9Sstevel@tonic-gate }
24927c478bd9Sstevel@tonic-gate 
24937c478bd9Sstevel@tonic-gate /*
24947c478bd9Sstevel@tonic-gate  * disp_bound_anythreads - return nonzero if _any_ threads are bound
24957c478bd9Sstevel@tonic-gate  * to the given processor, including interrupt threads.
24967c478bd9Sstevel@tonic-gate  */
24977c478bd9Sstevel@tonic-gate int
24987c478bd9Sstevel@tonic-gate disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
24997c478bd9Sstevel@tonic-gate {
25007c478bd9Sstevel@tonic-gate 	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
25017c478bd9Sstevel@tonic-gate }
25027c478bd9Sstevel@tonic-gate 
25037c478bd9Sstevel@tonic-gate /*
25047c478bd9Sstevel@tonic-gate  * disp_bound_partition - return nonzero if threads are bound to the same
25057c478bd9Sstevel@tonic-gate  * partition as the processor.
25067c478bd9Sstevel@tonic-gate  *	Called infrequently.  Keep this simple.
25077c478bd9Sstevel@tonic-gate  *	Includes threads that are asleep or stopped but not onproc.
25087c478bd9Sstevel@tonic-gate  */
25097c478bd9Sstevel@tonic-gate int
25107c478bd9Sstevel@tonic-gate disp_bound_partition(cpu_t *cp, int threadlistsafe)
25117c478bd9Sstevel@tonic-gate {
25127c478bd9Sstevel@tonic-gate 	return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
25137c478bd9Sstevel@tonic-gate }
25147c478bd9Sstevel@tonic-gate 
25157c478bd9Sstevel@tonic-gate /*
25167c478bd9Sstevel@tonic-gate  * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
25177c478bd9Sstevel@tonic-gate  * threads to other CPUs.
25187c478bd9Sstevel@tonic-gate  */
25197c478bd9Sstevel@tonic-gate void
25207c478bd9Sstevel@tonic-gate disp_cpu_inactive(cpu_t *cp)
25217c478bd9Sstevel@tonic-gate {
25227c478bd9Sstevel@tonic-gate 	kthread_t	*tp;
25237c478bd9Sstevel@tonic-gate 	disp_t		*dp = cp->cpu_disp;
25247c478bd9Sstevel@tonic-gate 	dispq_t		*dq;
25257c478bd9Sstevel@tonic-gate 	pri_t		pri;
25267c478bd9Sstevel@tonic-gate 	int		wasonq;
25277c478bd9Sstevel@tonic-gate 
25287c478bd9Sstevel@tonic-gate 	disp_lock_enter(&dp->disp_lock);
25297c478bd9Sstevel@tonic-gate 	while ((pri = dp->disp_max_unbound_pri) != -1) {
25307c478bd9Sstevel@tonic-gate 		dq = &dp->disp_q[pri];
25317c478bd9Sstevel@tonic-gate 		tp = dq->dq_first;
25327c478bd9Sstevel@tonic-gate 
25337c478bd9Sstevel@tonic-gate 		/*
25347c478bd9Sstevel@tonic-gate 		 * Skip over bound threads.
25357c478bd9Sstevel@tonic-gate 		 */
25367c478bd9Sstevel@tonic-gate 		while (tp != NULL && tp->t_bound_cpu != NULL) {
25377c478bd9Sstevel@tonic-gate 			tp = tp->t_link;
25387c478bd9Sstevel@tonic-gate 		}
25397c478bd9Sstevel@tonic-gate 
25407c478bd9Sstevel@tonic-gate 		if (tp == NULL) {
25417c478bd9Sstevel@tonic-gate 			/* disp_max_unbound_pri must be inaccurate, so fix it */
25427c478bd9Sstevel@tonic-gate 			disp_fix_unbound_pri(dp, pri);
25437c478bd9Sstevel@tonic-gate 			continue;
25447c478bd9Sstevel@tonic-gate 		}
25457c478bd9Sstevel@tonic-gate 
25467c478bd9Sstevel@tonic-gate 		wasonq = dispdeq(tp);		/* drops disp_lock */
25477c478bd9Sstevel@tonic-gate 		ASSERT(wasonq);
25487c478bd9Sstevel@tonic-gate 		ASSERT(tp->t_weakbound_cpu == NULL);
25497c478bd9Sstevel@tonic-gate 
25507c478bd9Sstevel@tonic-gate 		setbackdq(tp);
25517c478bd9Sstevel@tonic-gate 		/*
25527c478bd9Sstevel@tonic-gate 		 * Called from cpu_offline:
25537c478bd9Sstevel@tonic-gate 		 *
25547c478bd9Sstevel@tonic-gate 		 * cp has already been removed from the list of active cpus
25557c478bd9Sstevel@tonic-gate 		 * and tp->t_cpu has been changed so there is no risk of
25567c478bd9Sstevel@tonic-gate 		 * tp ending up back on cp.
25577c478bd9Sstevel@tonic-gate 		 *
25587c478bd9Sstevel@tonic-gate 		 * Called from cpupart_move_cpu:
25597c478bd9Sstevel@tonic-gate 		 *
25607c478bd9Sstevel@tonic-gate 		 * The cpu has moved to a new cpupart.  Any threads that
25617c478bd9Sstevel@tonic-gate 		 * were on its dispatch queues before the move remain
25627c478bd9Sstevel@tonic-gate 		 * in the old partition and can't run in the new partition.
25637c478bd9Sstevel@tonic-gate 		 */
25647c478bd9Sstevel@tonic-gate 		ASSERT(tp->t_cpu != cp);
25657c478bd9Sstevel@tonic-gate 		thread_unlock(tp);
25667c478bd9Sstevel@tonic-gate 
25677c478bd9Sstevel@tonic-gate 		disp_lock_enter(&dp->disp_lock);
25687c478bd9Sstevel@tonic-gate 	}
25697c478bd9Sstevel@tonic-gate 	disp_lock_exit(&dp->disp_lock);
25707c478bd9Sstevel@tonic-gate }
25717c478bd9Sstevel@tonic-gate 
25727c478bd9Sstevel@tonic-gate /*
2573455e370cSJohn Levon  * Return a score rating this CPU for running this thread: lower is better.
25747c478bd9Sstevel@tonic-gate  *
2575455e370cSJohn Levon  * If curthread is looking for a new CPU, then we ignore cpu_dispatch_pri for
2576455e370cSJohn Levon  * curcpu (as that's our own priority).
25777c478bd9Sstevel@tonic-gate  *
2578455e370cSJohn Levon  * If a cpu is the target of an offline request, then try to avoid it.
25797c478bd9Sstevel@tonic-gate  *
2580455e370cSJohn Levon  * Otherwise we'll use double the effective dispatcher priority for the CPU.
25817c478bd9Sstevel@tonic-gate  *
2582c3377ee9SJohn Levon  * We do this so smt_adjust_cpu_score() can increment the score if needed,
2583455e370cSJohn Levon  * without ending up overriding a dispatcher priority.
2584455e370cSJohn Levon  */
2585455e370cSJohn Levon static pri_t
2586455e370cSJohn Levon cpu_score(cpu_t *cp, kthread_t *tp)
2587455e370cSJohn Levon {
2588455e370cSJohn Levon 	pri_t score;
2589455e370cSJohn Levon 
2590455e370cSJohn Levon 	if (tp == curthread && cp == curthread->t_cpu)
2591455e370cSJohn Levon 		score = 2 * CPU_IDLE_PRI;
2592455e370cSJohn Levon 	else if (cp == cpu_inmotion)
2593455e370cSJohn Levon 		score = SHRT_MAX;
2594455e370cSJohn Levon 	else
2595455e370cSJohn Levon 		score = 2 * cp->cpu_dispatch_pri;
2596455e370cSJohn Levon 
2597455e370cSJohn Levon 	if (2 * cp->cpu_disp->disp_maxrunpri > score)
2598455e370cSJohn Levon 		score = 2 * cp->cpu_disp->disp_maxrunpri;
2599455e370cSJohn Levon 	if (2 * cp->cpu_chosen_level > score)
2600455e370cSJohn Levon 		score = 2 * cp->cpu_chosen_level;
2601455e370cSJohn Levon 
2602c3377ee9SJohn Levon 	return (smt_adjust_cpu_score(tp, cp, score));
2603455e370cSJohn Levon }
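
/*
 * Worked example of the doubling: CPUs with dispatch priorities 5 and 6
 * score 10 and 12.  A one-unit SMT penalty (illustrative) on the first
 * yields 11, which still ranks below 12, so the penalty can break ties
 * between CPUs of equal priority without ever overriding a genuine
 * dispatch-priority difference.
 */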
2604455e370cSJohn Levon 
2605455e370cSJohn Levon /*
2606455e370cSJohn Levon  * disp_lowpri_cpu - find a suitable CPU to run the given thread.
26077c478bd9Sstevel@tonic-gate  *
2608455e370cSJohn Levon  * We are looking for a CPU with an effective dispatch priority lower than the
2609455e370cSJohn Levon  * thread's, so that the thread will run immediately rather than be enqueued.
2610455e370cSJohn Levon  * For NUMA locality, we prefer "home" CPUs within the thread's ->t_lpl group.
2611455e370cSJohn Levon  * If we don't find an available CPU there, we will expand our search to include
2612455e370cSJohn Levon  * wider locality levels. (Note these groups are already divided by CPU
2613455e370cSJohn Levon  * partition.)
2614455e370cSJohn Levon  *
2615455e370cSJohn Levon  * If the thread cannot immediately run on *any* CPU, we'll enqueue ourselves on
2616455e370cSJohn Levon  * the best home CPU we found.
2617455e370cSJohn Levon  *
2618455e370cSJohn Levon  * The hint passed in is used as a starting point so we don't favor CPU 0 or any
2619455e370cSJohn Levon  * other CPU.  The caller should pass in the most recently used CPU for the
2620455e370cSJohn Levon  * thread; it's of course possible that this CPU isn't in the home lgroup.
2621455e370cSJohn Levon  *
2622455e370cSJohn Levon  * This function must be called at either high SPL, or with preemption disabled,
2623455e370cSJohn Levon  * so that the "hint" CPU cannot be removed from the online CPU list while we
2624455e370cSJohn Levon  * are traversing it.
26257c478bd9Sstevel@tonic-gate  */
26267c478bd9Sstevel@tonic-gate cpu_t *
2627455e370cSJohn Levon disp_lowpri_cpu(cpu_t *hint, kthread_t *tp, pri_t tpri)
26287c478bd9Sstevel@tonic-gate {
26297c478bd9Sstevel@tonic-gate 	cpu_t	*bestcpu;
26307c478bd9Sstevel@tonic-gate 	cpu_t	*besthomecpu;
26317c478bd9Sstevel@tonic-gate 	cpu_t   *cp, *cpstart;
26327c478bd9Sstevel@tonic-gate 
26337c478bd9Sstevel@tonic-gate 	klgrpset_t	done;
26347c478bd9Sstevel@tonic-gate 
26357c478bd9Sstevel@tonic-gate 	lpl_t		*lpl_iter, *lpl_leaf;
26367c478bd9Sstevel@tonic-gate 
26377c478bd9Sstevel@tonic-gate 	ASSERT(hint != NULL);
2638455e370cSJohn Levon 	ASSERT(tp->t_lpl->lpl_ncpu > 0);
26397c478bd9Sstevel@tonic-gate 
26407c478bd9Sstevel@tonic-gate 	bestcpu = besthomecpu = NULL;
26417c478bd9Sstevel@tonic-gate 	klgrpset_clear(done);
26427c478bd9Sstevel@tonic-gate 
2643455e370cSJohn Levon 	lpl_iter = tp->t_lpl;
26447c478bd9Sstevel@tonic-gate 
26457c478bd9Sstevel@tonic-gate 	do {
2646455e370cSJohn Levon 		pri_t best = SHRT_MAX;
2647455e370cSJohn Levon 		klgrpset_t cur_set;
26487c478bd9Sstevel@tonic-gate 
26497c478bd9Sstevel@tonic-gate 		klgrpset_clear(cur_set);
26507c478bd9Sstevel@tonic-gate 
2651455e370cSJohn Levon 		for (int i = 0; i < lpl_iter->lpl_nrset; i++) {
26527c478bd9Sstevel@tonic-gate 			lpl_leaf = lpl_iter->lpl_rset[i];
26537c478bd9Sstevel@tonic-gate 			if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
26547c478bd9Sstevel@tonic-gate 				continue;
26557c478bd9Sstevel@tonic-gate 
26567c478bd9Sstevel@tonic-gate 			klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);
26577c478bd9Sstevel@tonic-gate 
26587c478bd9Sstevel@tonic-gate 			if (hint->cpu_lpl == lpl_leaf)
26597c478bd9Sstevel@tonic-gate 				cp = cpstart = hint;
26607c478bd9Sstevel@tonic-gate 			else
26617c478bd9Sstevel@tonic-gate 				cp = cpstart = lpl_leaf->lpl_cpus;
26627c478bd9Sstevel@tonic-gate 
26637c478bd9Sstevel@tonic-gate 			do {
2664455e370cSJohn Levon 				pri_t score = cpu_score(cp, tp);
2665455e370cSJohn Levon 
2666455e370cSJohn Levon 				if (score < best) {
2667455e370cSJohn Levon 					best = score;
26687c478bd9Sstevel@tonic-gate 					bestcpu = cp;
2669455e370cSJohn Levon 
2670455e370cSJohn Levon 					/* An idle CPU: we're done. */
2671455e370cSJohn Levon 					if (score / 2 == CPU_IDLE_PRI)
2672455e370cSJohn Levon 						goto out;
26737c478bd9Sstevel@tonic-gate 				}
26747c478bd9Sstevel@tonic-gate 			} while ((cp = cp->cpu_next_lpl) != cpstart);
26757c478bd9Sstevel@tonic-gate 		}
26767c478bd9Sstevel@tonic-gate 
2677455e370cSJohn Levon 		if (bestcpu != NULL && tpri > (best / 2))
2678455e370cSJohn Levon 			goto out;
2679455e370cSJohn Levon 
26807c478bd9Sstevel@tonic-gate 		if (besthomecpu == NULL)
26817c478bd9Sstevel@tonic-gate 			besthomecpu = bestcpu;
2682455e370cSJohn Levon 
26837c478bd9Sstevel@tonic-gate 		/*
26847c478bd9Sstevel@tonic-gate 		 * Add the lgrps we just considered to the "done" set
26857c478bd9Sstevel@tonic-gate 		 */
26867c478bd9Sstevel@tonic-gate 		klgrpset_or(done, cur_set);
26877c478bd9Sstevel@tonic-gate 
26887c478bd9Sstevel@tonic-gate 	} while ((lpl_iter = lpl_iter->lpl_parent) != NULL);
26897c478bd9Sstevel@tonic-gate 
26907c478bd9Sstevel@tonic-gate 	/*
26917c478bd9Sstevel@tonic-gate 	 * The specified priority isn't high enough to run immediately
26927c478bd9Sstevel@tonic-gate 	 * anywhere, so just return the best CPU from the home lgroup.
26937c478bd9Sstevel@tonic-gate 	 */
2694455e370cSJohn Levon 	bestcpu = besthomecpu;
2695455e370cSJohn Levon 
2696455e370cSJohn Levon out:
2697455e370cSJohn Levon 	ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
2698455e370cSJohn Levon 	return (bestcpu);
26997c478bd9Sstevel@tonic-gate }
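
/*
 * A typical (hypothetical) call, made with preemption disabled, to
 * place thread tp using its most recently used CPU as the hint:
 *
 *	cpu_t *cp = disp_lowpri_cpu(tp->t_cpu, tp, DISP_PRIO(tp));
 */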
27007c478bd9Sstevel@tonic-gate 
27017c478bd9Sstevel@tonic-gate /*
27027c478bd9Sstevel@tonic-gate  * This routine provides the generic idle cpu function for all processors.
27037c478bd9Sstevel@tonic-gate  * If a processor has some specific code to execute when idle (say, to stop
27047c478bd9Sstevel@tonic-gate  * the pipeline and save power), then that routine should be defined in the
27057c478bd9Sstevel@tonic-gate  * processor's specific code (module_xx.c) and the global variable idle_cpu
27067c478bd9Sstevel@tonic-gate  * set to that function.
27077c478bd9Sstevel@tonic-gate  */
27087c478bd9Sstevel@tonic-gate static void
27097c478bd9Sstevel@tonic-gate generic_idle_cpu(void)
27107c478bd9Sstevel@tonic-gate {
27117c478bd9Sstevel@tonic-gate }
27127c478bd9Sstevel@tonic-gate 
27137c478bd9Sstevel@tonic-gate /*ARGSUSED*/
27147c478bd9Sstevel@tonic-gate static void
27157c478bd9Sstevel@tonic-gate generic_enq_thread(cpu_t *cpu, int bound)
27167c478bd9Sstevel@tonic-gate {
27177c478bd9Sstevel@tonic-gate }
2718455e370cSJohn Levon 
2719455e370cSJohn Levon cpu_t *
2720455e370cSJohn Levon disp_choose_best_cpu(void)
2721455e370cSJohn Levon {
2722455e370cSJohn Levon 	kthread_t *t = curthread;
2723455e370cSJohn Levon 	cpu_t *curcpu = CPU;
2724455e370cSJohn Levon 
2725455e370cSJohn Levon 	ASSERT(t->t_preempt > 0);
2726455e370cSJohn Levon 	ASSERT(t->t_state == TS_ONPROC);
2727455e370cSJohn Levon 	ASSERT(t->t_schedflag & TS_VCPU);
2728455e370cSJohn Levon 
2729c3377ee9SJohn Levon 	if (smt_should_run(t, curcpu))
2730455e370cSJohn Levon 		return (curcpu);
2731455e370cSJohn Levon 
2732455e370cSJohn Levon 	return (disp_lowpri_cpu(curcpu, t, t->t_pri));
2733455e370cSJohn Levon }
2734