xref: /illumos-gate/usr/src/uts/i86pc/io/dr/dr_quiesce.c (revision ef150c2b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /*
27  * Copyright 2023 Oxide Computer Company
28  */
29 
/*
 * A CPR derivative specifically for starfire/starcat.
 * x86 doesn't make use of the quiesce interfaces; they are kept for
 * simplicity.
 */
34 
35 #include <sys/types.h>
36 #include <sys/systm.h>
37 #include <sys/machparam.h>
38 #include <sys/machsystm.h>
39 #include <sys/ddi.h>
40 #define	SUNDDI_IMPL
41 #include <sys/sunddi.h>
42 #include <sys/sunndi.h>
43 #include <sys/devctl.h>
44 #include <sys/time.h>
45 #include <sys/kmem.h>
46 #include <nfs/lm.h>
47 #include <sys/ddi_impldefs.h>
48 #include <sys/ndi_impldefs.h>
49 #include <sys/obpdefs.h>
50 #include <sys/cmn_err.h>
51 #include <sys/debug.h>
52 #include <sys/errno.h>
53 #include <sys/callb.h>
54 #include <sys/clock.h>
55 #include <sys/x_call.h>
56 #include <sys/cpuvar.h>
57 #include <sys/epm.h>
58 #include <sys/vfs.h>
59 #include <sys/promif.h>
60 #include <sys/conf.h>
61 #include <sys/cyclic.h>
62 
63 #include <sys/dr.h>
64 #include <sys/dr_util.h>
65 
66 extern void	e_ddi_enter_driver_list(struct devnames *dnp, int *listcnt);
67 extern void	e_ddi_exit_driver_list(struct devnames *dnp, int listcnt);
68 extern int	is_pseudo_device(dev_info_t *dip);
69 
70 extern kmutex_t	cpu_lock;
71 extern dr_unsafe_devs_t dr_unsafe_devs;
72 
73 static int		dr_is_real_device(dev_info_t *dip);
74 static int		dr_is_unsafe_major(major_t major);
75 static int		dr_bypass_device(char *dname);
76 static int		dr_check_dip(dev_info_t *dip, void *arg, uint_t ref);
77 static int		dr_resolve_devname(dev_info_t *dip, char *buffer,
78 				char *alias);
79 static sbd_error_t	*drerr_int(int e_code, uint64_t *arr, int idx,
80 				int majors);
81 static int		dr_add_int(uint64_t *arr, int idx, int len,
82 				uint64_t val);
83 
84 int dr_pt_test_suspend(dr_handle_t *hp);
85 
86 /*
87  * dr_quiesce.c interface
88  * NOTE: states used internally by dr_suspend and dr_resume
89  */
typedef enum dr_suspend_state {
	/* initial state; dr_resume() only sends SIGTHAW from here */
	DR_SRSTATE_BEGIN = 0,
	/* set by dr_suspend() before it stops user threads */
	DR_SRSTATE_USER,
	/* set by dr_suspend() before it suspends device drivers */
	DR_SRSTATE_DRIVER,
	/* set by dr_suspend() before it pauses the other CPUs */
	DR_SRSTATE_FULL
} suspend_state_t;
96 
/*
 * Per-operation suspend/resume bookkeeping, allocated by dr_get_sr_handle()
 * and freed by dr_release_sr_handle().
 */
struct dr_sr_handle {
	/* DR handle this suspend/resume belongs to; errors are posted here */
	dr_handle_t		*sr_dr_handlep;
	/* held dip whose DDI_SUSPEND failed, or NULL */
	dev_info_t		*sr_failed_dip;
	/* how far dr_suspend() got; tells dr_resume() what to undo */
	suspend_state_t		sr_suspend_state;
	/* NOTE(review): not referenced in this file; presumably SR_FLAG_* */
	uint_t			sr_flags;
	/* major numbers or pids collected for error reporting */
	uint64_t		sr_err_ints[DR_MAX_ERR_INT];
	/* number of valid entries in sr_err_ints */
	int			sr_err_idx;
};
105 
106 #define	SR_FLAG_WATCHDOG	0x1
107 
108 /*
109  * XXX
110  * This hack will go away before RTI.  Just for testing.
111  * List of drivers to bypass when performing a suspend.
112  */
/*
 * The empty string is the sentinel that dr_bypass_device() stops on;
 * driver names must be added before it.
 */
static char *dr_bypass_list[] = {
	""
};
116 
117 
118 #define		SKIP_SYNC	/* bypass sync ops in dr_suspend */
119 
/*
 * dr_skip_user_threads is used to control if user threads should
 * be suspended.  If dr_skip_user_threads is true, the rest of the
 * flags are not used; if it is false, dr_check_user_stop_result
 * will be used to control whether or not we need to check suspend
 * result, and dr_allow_blocked_threads will be used to control
 * whether or not we allow suspend to continue if there are blocked
 * threads.  We allow all combinations of dr_check_user_stop_result
 * and dr_allow_blocked_threads, even though it might not make much
 * sense to not allow blocked threads when we don't even check stop
 * result.
 */
/* FALSE: dr_suspend() does stop user threads */
static int	dr_skip_user_threads = 0;	/* default to FALSE */
/* TRUE: a failure to stop user threads aborts dr_suspend() */
static int	dr_check_user_stop_result = 1;	/* default to TRUE */
/* TRUE: threads matching DR_VSTOPPED() count as stopped */
static int	dr_allow_blocked_threads = 1;	/* default to TRUE */
135 
136 #define	DR_CPU_LOOP_MSEC	1000
137 
138 static void
139 dr_stop_intr(void)
140 {
141 	ASSERT(MUTEX_HELD(&cpu_lock));
142 
143 	kpreempt_disable();
144 	cyclic_suspend();
145 }
146 
147 static void
148 dr_enable_intr(void)
149 {
150 	ASSERT(MUTEX_HELD(&cpu_lock));
151 
152 	cyclic_resume();
153 	kpreempt_enable();
154 }
155 
156 dr_sr_handle_t *
157 dr_get_sr_handle(dr_handle_t *hp)
158 {
159 	dr_sr_handle_t *srh;
160 
161 	srh = GETSTRUCT(dr_sr_handle_t, 1);
162 	srh->sr_dr_handlep = hp;
163 
164 	return (srh);
165 }
166 
167 void
168 dr_release_sr_handle(dr_sr_handle_t *srh)
169 {
170 	ASSERT(srh->sr_failed_dip == NULL);
171 	FREESTRUCT(srh, dr_sr_handle_t, 1);
172 }
173 
174 static int
175 dr_is_real_device(dev_info_t *dip)
176 {
177 	struct regspec *regbuf = NULL;
178 	int length = 0;
179 	int rc;
180 
181 	if (ddi_get_driver(dip) == NULL)
182 		return (0);
183 
184 	if (DEVI(dip)->devi_pm_flags & (PMC_NEEDS_SR|PMC_PARENTAL_SR))
185 		return (1);
186 	if (DEVI(dip)->devi_pm_flags & PMC_NO_SR)
187 		return (0);
188 
189 	/*
190 	 * now the general case
191 	 */
192 	rc = ddi_getlongprop(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, "reg",
193 	    (caddr_t)&regbuf, &length);
194 	ASSERT(rc != DDI_PROP_NO_MEMORY);
195 	if (rc != DDI_PROP_SUCCESS) {
196 		return (0);
197 	} else {
198 		if ((length > 0) && (regbuf != NULL))
199 			kmem_free(regbuf, length);
200 		return (1);
201 	}
202 }
203 
204 static int
205 dr_is_unsafe_major(major_t major)
206 {
207 	char    *dname, **cpp;
208 	int	i, ndevs;
209 
210 	if ((dname = ddi_major_to_name(major)) == NULL) {
211 		PR_QR("dr_is_unsafe_major: invalid major # %d\n", major);
212 		return (0);
213 	}
214 
215 	ndevs = dr_unsafe_devs.ndevs;
216 	for (i = 0, cpp = dr_unsafe_devs.devnames; i < ndevs; i++) {
217 		if (strcmp(dname, *cpp++) == 0)
218 			return (1);
219 	}
220 	return (0);
221 }
222 
223 static int
224 dr_bypass_device(char *dname)
225 {
226 	int i;
227 	char **lname;
228 
229 	if (dname == NULL)
230 		return (0);
231 
232 	/* check the bypass list */
233 	for (i = 0, lname = &dr_bypass_list[i]; **lname != '\0'; lname++) {
234 		if (strcmp(dname, dr_bypass_list[i++]) == 0)
235 			return (1);
236 	}
237 	return (0);
238 }
239 
240 static int
241 dr_resolve_devname(dev_info_t *dip, char *buffer, char *alias)
242 {
243 	major_t	devmajor;
244 	char	*aka, *name;
245 
246 	*buffer = *alias = 0;
247 
248 	if (dip == NULL)
249 		return (-1);
250 
251 	if ((name = ddi_get_name(dip)) == NULL)
252 		name = "<null name>";
253 
254 	aka = name;
255 
256 	if ((devmajor = ddi_name_to_major(aka)) != DDI_MAJOR_T_NONE)
257 		aka = ddi_major_to_name(devmajor);
258 
259 	(void) strcpy(buffer, name);
260 
261 	if (strcmp(name, aka))
262 		(void) strcpy(alias, aka);
263 	else
264 		*alias = 0;
265 
266 	return (0);
267 }
268 
/*
 * Argument block handed to dr_check_dip() through the device tree walkers.
 * Any pointer member may be NULL, in which case dr_check_dip() skips the
 * corresponding accounting.
 */
struct dr_ref {
	int		*refcount;	/* running total of references */
	int		*refcount_non_gldv3; /* refs on non-GLDv3 net drivers */
	uint64_t	*arr;		/* unsafe major numbers found */
	int		*idx;		/* number of entries used in arr */
	int		len;		/* capacity of arr */
};
276 
277 /* ARGSUSED */
278 static int
279 dr_check_dip(dev_info_t *dip, void *arg, uint_t ref)
280 {
281 	major_t		major;
282 	char		*dname;
283 	struct dr_ref	*rp = (struct dr_ref *)arg;
284 
285 	if (dip == NULL)
286 		return (DDI_WALK_CONTINUE);
287 
288 	if (!dr_is_real_device(dip))
289 		return (DDI_WALK_CONTINUE);
290 
291 	dname = ddi_binding_name(dip);
292 
293 	if (dr_bypass_device(dname))
294 		return (DDI_WALK_CONTINUE);
295 
296 	if (dname && ((major = ddi_name_to_major(dname)) != (major_t)-1)) {
297 		if (ref && rp->refcount) {
298 			*rp->refcount += ref;
299 			PR_QR("\n  %s (major# %d) is referenced(%u)\n", dname,
300 			    major, ref);
301 		}
302 		if (ref && rp->refcount_non_gldv3) {
303 			if (NETWORK_PHYSDRV(major) && !GLDV3_DRV(major))
304 				*rp->refcount_non_gldv3 += ref;
305 		}
306 		if (dr_is_unsafe_major(major) && i_ddi_devi_attached(dip)) {
307 			PR_QR("\n  %s (major# %d) not hotpluggable\n", dname,
308 			    major);
309 			if (rp->arr != NULL && rp->idx != NULL)
310 				*rp->idx = dr_add_int(rp->arr, *rp->idx,
311 				    rp->len, (uint64_t)major);
312 		}
313 	}
314 	return (DDI_WALK_CONTINUE);
315 }
316 
317 static int
318 dr_check_unsafe_major(dev_info_t *dip, void *arg)
319 {
320 	return (dr_check_dip(dip, arg, 0));
321 }
322 
323 
324 /*ARGSUSED*/
325 void
326 dr_check_devices(dev_info_t *dip, int *refcount, dr_handle_t *handle,
327     uint64_t *arr, int *idx, int len, int *refcount_non_gldv3)
328 {
329 	struct dr_ref bref = {0};
330 
331 	if (dip == NULL)
332 		return;
333 
334 	bref.refcount = refcount;
335 	bref.refcount_non_gldv3 = refcount_non_gldv3;
336 	bref.arr = arr;
337 	bref.idx = idx;
338 	bref.len = len;
339 
340 	ASSERT(e_ddi_branch_held(dip));
341 	(void) e_ddi_branch_referenced(dip, dr_check_dip, &bref);
342 }
343 
/*
 * Suspend (DDI_SUSPEND) "dip", its following siblings, and all of their
 * descendants.  A node's children are suspended before the node itself.
 *
 * The "dip" argument's parent (if it exists) must be held busy.
 *
 * Nodes are skipped when dr_is_real_device() is false, when the binding
 * name is on the bypass list, or when drmach_verify_sr() returns non-zero.
 *
 * On the first devi_detach() failure the node's major number is recorded
 * in srh->sr_err_ints, the node is held and remembered in
 * srh->sr_failed_dip, an ESBD_SUSPEND error is posted through dr_op_err(),
 * and the walk stops.
 *
 * Returns DDI_SUCCESS when every node was suspended or skipped.  A failure
 * at this level returns DDI_FAILURE, while a failure in a descendant is
 * propagated as ENXIO; callers should only compare against DDI_SUCCESS.
 */
static int
dr_suspend_devices(dev_info_t *dip, dr_sr_handle_t *srh)
{
	dr_handle_t	*handle;
	major_t		major;
	char		*dname;

	/*
	 * If dip is the root node, it has no siblings and it is
	 * always held. If dip is not the root node, dr_suspend_devices()
	 * will be invoked with the parent held busy.
	 */
	for (; dip != NULL; dip = ddi_get_next_sibling(dip)) {
		char	d_name[40], d_alias[40], *d_info;

		/* hold dip busy while its children are suspended */
		ndi_devi_enter(dip);
		if (dr_suspend_devices(ddi_get_child(dip), srh)) {
			ndi_devi_exit(dip);
			return (ENXIO);
		}
		ndi_devi_exit(dip);

		if (!dr_is_real_device(dip))
			continue;

		/* reset per node so a stale major is never reported */
		major = (major_t)-1;
		if ((dname = ddi_binding_name(dip)) != NULL)
			major = ddi_name_to_major(dname);

		if (dr_bypass_device(dname)) {
			PR_QR(" bypassed suspend of %s (major# %d)\n", dname,
			    major);
			continue;
		}

		if (drmach_verify_sr(dip, 1)) {
			PR_QR(" bypassed suspend of %s (major# %d)\n", dname,
			    major);
			continue;
		}

		if ((d_info = ddi_get_name_addr(dip)) == NULL)
			d_info = "<null>";

		/* d_name[0] == 0 later means "fall back to dname" */
		d_name[0] = 0;
		if (dr_resolve_devname(dip, d_name, d_alias) == 0) {
			if (d_alias[0] != 0) {
				prom_printf("\tsuspending %s@%s (aka %s)\n",
				    d_name, d_info, d_alias);
			} else {
				prom_printf("\tsuspending %s@%s\n", d_name,
				    d_info);
			}
		} else {
			prom_printf("\tsuspending %s@%s\n", dname, d_info);
		}

		if (devi_detach(dip, DDI_SUSPEND) != DDI_SUCCESS) {
			prom_printf("\tFAILED to suspend %s@%s\n",
			    d_name[0] ? d_name : dname, d_info);

			srh->sr_err_idx = dr_add_int(srh->sr_err_ints,
			    srh->sr_err_idx, DR_MAX_ERR_INT, (uint64_t)major);

			/*
			 * Keep the failed node held; dr_resume_devices()
			 * releases it and uses it as its starting marker.
			 */
			ndi_hold_devi(dip);
			srh->sr_failed_dip = dip;

			handle = srh->sr_dr_handlep;
			dr_op_err(CE_IGNORE, handle, ESBD_SUSPEND, "%s@%s",
			    d_name[0] ? d_name : dname, d_info);

			return (DDI_FAILURE);
		}
	}

	return (DDI_SUCCESS);
}
424 
425 static void
426 dr_resume_devices(dev_info_t *start, dr_sr_handle_t *srh)
427 {
428 	dr_handle_t	*handle;
429 	dev_info_t	*dip, *next, *last = NULL;
430 	major_t		major;
431 	char		*bn;
432 
433 	major = (major_t)-1;
434 
435 	/* attach in reverse device tree order */
436 	while (last != start) {
437 		dip = start;
438 		next = ddi_get_next_sibling(dip);
439 		while (next != last && dip != srh->sr_failed_dip) {
440 			dip = next;
441 			next = ddi_get_next_sibling(dip);
442 		}
443 		if (dip == srh->sr_failed_dip) {
444 			/* release hold acquired in dr_suspend_devices() */
445 			srh->sr_failed_dip = NULL;
446 			ndi_rele_devi(dip);
447 		} else if (dr_is_real_device(dip) &&
448 		    srh->sr_failed_dip == NULL) {
449 
450 			if ((bn = ddi_binding_name(dip)) != NULL) {
451 				major = ddi_name_to_major(bn);
452 			} else {
453 				bn = "<null>";
454 			}
455 			if (!dr_bypass_device(bn) &&
456 			    !drmach_verify_sr(dip, 0)) {
457 				char	d_name[40], d_alias[40], *d_info;
458 
459 				d_name[0] = 0;
460 				d_info = ddi_get_name_addr(dip);
461 				if (d_info == NULL)
462 					d_info = "<null>";
463 
464 				if (!dr_resolve_devname(dip, d_name, d_alias)) {
465 					if (d_alias[0] != 0) {
466 						prom_printf("\tresuming "
467 						    "%s@%s (aka %s)\n", d_name,
468 						    d_info, d_alias);
469 					} else {
470 						prom_printf("\tresuming "
471 						    "%s@%s\n", d_name, d_info);
472 					}
473 				} else {
474 					prom_printf("\tresuming %s@%s\n", bn,
475 					    d_info);
476 				}
477 
478 				if (devi_attach(dip, DDI_RESUME) !=
479 				    DDI_SUCCESS) {
480 					/*
481 					 * Print a console warning,
482 					 * set an e_code of ESBD_RESUME,
483 					 * and save the driver major
484 					 * number in the e_rsc.
485 					 */
486 					prom_printf("\tFAILED to resume %s@%s",
487 					    d_name[0] ? d_name : bn, d_info);
488 
489 					srh->sr_err_idx =
490 					    dr_add_int(srh->sr_err_ints,
491 					    srh->sr_err_idx, DR_MAX_ERR_INT,
492 					    (uint64_t)major);
493 
494 					handle = srh->sr_dr_handlep;
495 
496 					dr_op_err(CE_IGNORE, handle,
497 					    ESBD_RESUME, "%s@%s",
498 					    d_name[0] ? d_name : bn, d_info);
499 				}
500 			}
501 		}
502 
503 		/* Hold parent busy while walking its children */
504 		ndi_devi_enter(dip);
505 		dr_resume_devices(ddi_get_child(dip), srh);
506 		ndi_devi_exit(dip);
507 		last = dip;
508 	}
509 }
510 
/*
 * True if thread is virtually stopped.  Similar to CPR_VSTOPPED
 * but from DR point of view.  These user threads are waiting in
 * the kernel.  Once they complete in the kernel, they will process
 * the stop signal and stop.
 *
 * Concretely: the thread is asleep (TS_SLEEP) on a wait channel, has an
 * AST pending, and carries the TP_CHKPT flag.  The argument is evaluated
 * more than once, so it must be free of side effects.
 */
#define	DR_VSTOPPED(t)			\
	((t)->t_state == TS_SLEEP &&	\
	(t)->t_wchan != NULL &&		\
	(t)->t_astflag &&		\
	((t)->t_proc_flag & TP_CHKPT))
522 
/*
 * Ask every user thread in the system to stop, retrying up to
 * DR_UTSTOP_RETRY times.
 *
 * Each attempt makes two passes over the thread list under pidlock.  The
 * first pass flags every user thread with TP_CHKPT and posts an AST (or,
 * for a thread that is already TS_STOPPED, clears TS_RESUME so it stays
 * stopped).  After waiting via utstop_timedwait(), the second pass checks
 * whether every user thread is stopped, or, when dr_allow_blocked_threads
 * is set, at least virtually stopped per DR_VSTOPPED().
 *
 * Kernel threads (p_as == &kas) and zombie processes are ignored.
 *
 * Returns DDI_SUCCESS when all threads stopped or when
 * dr_skip_user_threads is set.  Otherwise returns ESRCH, with the pids of
 * the offending processes from the final attempt reported as an
 * ESBD_UTHREAD error in the DR handle's h_err.
 */
/* ARGSUSED */
static int
dr_stop_user_threads(dr_sr_handle_t *srh)
{
	int		count;
	int		bailout;
	dr_handle_t	*handle = srh->sr_dr_handlep;
	static fn_t	f = "dr_stop_user_threads";
	kthread_id_t	tp;

	extern void add_one_utstop();
	extern void utstop_timedwait(clock_t);
	extern void utstop_init(void);

#define	DR_UTSTOP_RETRY	4
#define	DR_UTSTOP_WAIT	hz

	if (dr_skip_user_threads)
		return (DDI_SUCCESS);

	utstop_init();

	/* we need to try a few times to get past fork, etc. */
	srh->sr_err_idx = 0;
	for (count = 0; count < DR_UTSTOP_RETRY; count++) {
		/* walk the entire threadlist */
		mutex_enter(&pidlock);
		for (tp = curthread->t_next; tp != curthread; tp = tp->t_next) {
			proc_t *p = ttoproc(tp);

			/* handle kernel threads separately */
			if (p->p_as == &kas || p->p_stat == SZOMB)
				continue;

			mutex_enter(&p->p_lock);
			thread_lock(tp);

			if (tp->t_state == TS_STOPPED) {
				/* add another reason to stop this thread */
				tp->t_schedflag &= ~TS_RESUME;
			} else {
				tp->t_proc_flag |= TP_CHKPT;

				/*
				 * Both locks are dropped around the call to
				 * add_one_utstop() and then re-taken.
				 */
				thread_unlock(tp);
				mutex_exit(&p->p_lock);
				add_one_utstop();
				mutex_enter(&p->p_lock);
				thread_lock(tp);

				aston(tp);

				if (ISWAKEABLE(tp) || ISWAITING(tp)) {
					setrun_locked(tp);
				}

			}

			/* grab thread if needed */
			if (tp->t_state == TS_ONPROC && tp->t_cpu != CPU)
				poke_cpu(tp->t_cpu->cpu_id);


			thread_unlock(tp);
			mutex_exit(&p->p_lock);
		}
		mutex_exit(&pidlock);


		/* let everything catch up */
		/* the wait grows quadratically: 0, 1, 4, 9 times hz ticks */
		utstop_timedwait(count * count * DR_UTSTOP_WAIT);


		/* now, walk the threadlist again to see if we are done */
		mutex_enter(&pidlock);
		for (tp = curthread->t_next, bailout = 0;
		    tp != curthread; tp = tp->t_next) {
			proc_t *p = ttoproc(tp);

			/* handle kernel threads separately */
			if (p->p_as == &kas || p->p_stat == SZOMB)
				continue;

			/*
			 * If this thread didn't stop, and we don't allow
			 * unstopped blocked threads, bail.
			 */
			thread_lock(tp);
			if (!CPR_ISTOPPED(tp) &&
			    !(dr_allow_blocked_threads &&
			    DR_VSTOPPED(tp))) {
				bailout = 1;
				/* only report on the last attempt */
				if (count == DR_UTSTOP_RETRY - 1) {
					/*
					 * save the pid for later reporting
					 */
					srh->sr_err_idx =
					    dr_add_int(srh->sr_err_ints,
					    srh->sr_err_idx, DR_MAX_ERR_INT,
					    (uint64_t)p->p_pid);

					cmn_err(CE_WARN, "%s: "
					    "failed to stop thread: "
					    "process=%s, pid=%d",
					    f, p->p_user.u_psargs, p->p_pid);

					PR_QR("%s: failed to stop thread: "
					    "process=%s, pid=%d, t_id=0x%p, "
					    "t_state=0x%x, t_proc_flag=0x%x, "
					    "t_schedflag=0x%x\n",
					    f, p->p_user.u_psargs, p->p_pid,
					    (void *)tp, tp->t_state,
					    tp->t_proc_flag, tp->t_schedflag);
				}

			}
			thread_unlock(tp);
		}
		mutex_exit(&pidlock);

		/* were all the threads stopped? */
		if (!bailout)
			break;
	}

	/* were we unable to stop all threads after a few tries? */
	if (bailout) {
		/* last argument 0: the array holds pids, not major numbers */
		handle->h_err = drerr_int(ESBD_UTHREAD, srh->sr_err_ints,
		    srh->sr_err_idx, 0);
		return (ESRCH);
	}

	return (DDI_SUCCESS);
}
656 
657 static void
658 dr_start_user_threads(void)
659 {
660 	kthread_id_t tp;
661 
662 	mutex_enter(&pidlock);
663 
664 	/* walk all threads and release them */
665 	for (tp = curthread->t_next; tp != curthread; tp = tp->t_next) {
666 		proc_t *p = ttoproc(tp);
667 
668 		/* skip kernel threads */
669 		if (ttoproc(tp)->p_as == &kas)
670 			continue;
671 
672 		mutex_enter(&p->p_lock);
673 		tp->t_proc_flag &= ~TP_CHKPT;
674 		mutex_exit(&p->p_lock);
675 
676 		thread_lock(tp);
677 		if (CPR_ISTOPPED(tp)) {
678 			/* back on the runq */
679 			tp->t_schedflag |= TS_RESUME;
680 			setrun_locked(tp);
681 		}
682 		thread_unlock(tp);
683 	}
684 
685 	mutex_exit(&pidlock);
686 }
687 
688 static void
689 dr_signal_user(int sig)
690 {
691 	struct proc *p;
692 
693 	mutex_enter(&pidlock);
694 
695 	for (p = practive; p != NULL; p = p->p_next) {
696 		/* only user threads */
697 		if (p->p_exec == NULL || p->p_stat == SZOMB ||
698 		    p == proc_init || p == ttoproc(curthread))
699 			continue;
700 
701 		mutex_enter(&p->p_lock);
702 		sigtoproc(p, NULL, sig);
703 		mutex_exit(&p->p_lock);
704 	}
705 
706 	mutex_exit(&pidlock);
707 
708 	/* add a bit of delay */
709 	delay(hz);
710 }
711 
/*
 * Undo whatever dr_suspend() managed to do, based on
 * srh->sr_suspend_state.  The switch cases deliberately fall through so
 * that a later state also runs the recovery steps of every earlier state:
 *
 *   DR_SRSTATE_FULL	mark the TOD as resumed, re-enable cyclics and
 *			preemption, restart the CPUs, drop cpu_lock, and
 *			call drmach_resume_first()
 *   DR_SRSTATE_DRIVER	resume device drivers and the lock manager
 *   DR_SRSTATE_USER	restart user threads (unless they were skipped)
 *   DR_SRSTATE_BEGIN	send SIGTHAW to user processes
 *
 * In the DR_SRSTATE_FULL case the caller must hold cpu_lock (as left held
 * by a successful dr_suspend()); it is released here.
 *
 * Driver resume failures are reported as an ESBD_RESUME error in the DR
 * handle's h_err.
 */
void
dr_resume(dr_sr_handle_t *srh)
{
	switch (srh->sr_suspend_state) {
	case DR_SRSTATE_FULL:

		ASSERT(MUTEX_HELD(&cpu_lock));

		/*
		 * Prevent false alarm in tod_validate() due to tod
		 * value change between suspend and resume
		 */
		mutex_enter(&tod_lock);
		tod_status_set(TOD_DR_RESUME_DONE);
		mutex_exit(&tod_lock);

		dr_enable_intr();	/* enable intr & clock */

		start_cpus();
		mutex_exit(&cpu_lock);

		/*
		 * This should only be called if drmach_suspend_last()
		 * was called and state transitioned to DR_SRSTATE_FULL
		 * to prevent resume attempts on device instances that
		 * were not previously suspended.
		 */
		drmach_resume_first();

		/* FALLTHROUGH */

	case DR_SRSTATE_DRIVER:
		/*
		 * resume drivers
		 */
		srh->sr_err_idx = 0;

		/* no parent dip to hold busy */
		dr_resume_devices(ddi_root_node(), srh);

		/* last argument 1: the array holds driver major numbers */
		if (srh->sr_err_idx && srh->sr_dr_handlep) {
			(srh->sr_dr_handlep)->h_err = drerr_int(ESBD_RESUME,
			    srh->sr_err_ints, srh->sr_err_idx, 1);
		}

		/*
		 * resume the lock manager
		 */
		lm_cprresume();

		/* FALLTHROUGH */

	case DR_SRSTATE_USER:
		/*
		 * finally, resume user threads
		 */
		if (!dr_skip_user_threads) {
			prom_printf("DR: resuming user threads...\n");
			dr_start_user_threads();
		}
		/* FALLTHROUGH */

	case DR_SRSTATE_BEGIN:
	default:
		/*
		 * let those who care know that we've just resumed
		 */
		PR_QR("sending SIGTHAW...\n");
		dr_signal_user(SIGTHAW);
		break;
	}

	prom_printf("DR: resume COMPLETED\n");
}
786 
/*
 * Quiesce the system in stages, recording progress in
 * srh->sr_suspend_state so that dr_resume() knows how much to undo:
 *
 *   1. stop user threads			(DR_SRSTATE_USER)
 *   2. unless SBD_FLAG_FORCE is set, refuse to continue if any attached
 *      device is driven by a driver on the unsafe list
 *   3. suspend the lock manager
 *   4. suspend device drivers			(DR_SRSTATE_DRIVER)
 *   5. call drmach_suspend_last(), then take cpu_lock, pause the other
 *      CPUs and stop cyclics			(DR_SRSTATE_FULL)
 *
 * On any failure dr_resume() is called before returning, the error is left
 * in the DR handle's h_err, and a value other than DDI_SUCCESS is returned.
 *
 * On success DDI_SUCCESS is returned with cpu_lock still held; the caller
 * is expected to call dr_resume(), which releases it.
 */
int
dr_suspend(dr_sr_handle_t *srh)
{
	dr_handle_t	*handle;
	int		force;
	int		dev_errs_idx;
	uint64_t	dev_errs[DR_MAX_ERR_INT];
	int		rc = DDI_SUCCESS;

	handle = srh->sr_dr_handlep;

	force = dr_cmd_flags(handle) & SBD_FLAG_FORCE;

	prom_printf("\nDR: suspending user threads...\n");
	srh->sr_suspend_state = DR_SRSTATE_USER;
	/* a stop failure is only fatal if dr_check_user_stop_result is set */
	if (((rc = dr_stop_user_threads(srh)) != DDI_SUCCESS) &&
	    dr_check_user_stop_result) {
		dr_resume(srh);
		return (rc);
	}

	if (!force) {
		struct dr_ref drc = {0};

		prom_printf("\nDR: checking devices...\n");
		dev_errs_idx = 0;

		/* only the unsafe-major array is wanted; no refcounts */
		drc.arr = dev_errs;
		drc.idx = &dev_errs_idx;
		drc.len = DR_MAX_ERR_INT;

		/*
		 * Since the root node can never go away, it
		 * doesn't have to be held.
		 */
		ddi_walk_devs(ddi_root_node(), dr_check_unsafe_major, &drc);
		if (dev_errs_idx) {
			handle->h_err = drerr_int(ESBD_UNSAFE, dev_errs,
			    dev_errs_idx, 1);
			dr_resume(srh);
			return (DDI_FAILURE);
		}
		PR_QR("done\n");
	} else {
		prom_printf("\nDR: dr_suspend invoked with force flag\n");
	}

	/* SKIP_SYNC is defined in this file, so the sync code is compiled out */
#ifndef	SKIP_SYNC
	/*
	 * This sync swap out all user pages
	 */
	vfs_sync(SYNC_ALL);
#endif

	/*
	 * special treatment for lock manager
	 */
	lm_cprsuspend();

#ifndef	SKIP_SYNC
	/*
	 * sync the file system in case we never make it back
	 */
	sync();
#endif

	/*
	 * now suspend drivers
	 */
	prom_printf("DR: suspending drivers...\n");
	srh->sr_suspend_state = DR_SRSTATE_DRIVER;
	srh->sr_err_idx = 0;
	/* No parent to hold busy */
	if ((rc = dr_suspend_devices(ddi_root_node(), srh)) != DDI_SUCCESS) {
		if (srh->sr_err_idx && srh->sr_dr_handlep) {
			(srh->sr_dr_handlep)->h_err = drerr_int(ESBD_SUSPEND,
			    srh->sr_err_ints, srh->sr_err_idx, 1);
		}
		dr_resume(srh);
		return (rc);
	}

	drmach_suspend_last();

	/*
	 * finally, grab all cpus
	 */
	srh->sr_suspend_state = DR_SRSTATE_FULL;

	/* cpu_lock stays held on return; dr_resume() drops it */
	mutex_enter(&cpu_lock);
	pause_cpus(NULL, NULL);
	dr_stop_intr();

	return (rc);
}
882 
/*
 * Test entry point: perform a full dr_suspend() and, if it succeeds, an
 * immediate dr_resume(), logging the outcome through the PR_QR/PR_ALL
 * debug macros.  Any error is left in hp->h_err for the caller.
 *
 * Always returns 0, whether or not the suspend or resume succeeded.
 */
int
dr_pt_test_suspend(dr_handle_t *hp)
{
	dr_sr_handle_t *srh;
	int		err;
	uint_t		psmerr;
	static fn_t	f = "dr_pt_test_suspend";

	PR_QR("%s...\n", f);

	srh = dr_get_sr_handle(hp);
	if ((err = dr_suspend(srh)) == DDI_SUCCESS) {
		dr_resume(srh);
		/* suspend worked; report anything that went wrong resuming */
		if ((hp->h_err) && ((psmerr = hp->h_err->e_code) != 0)) {
			PR_QR("%s: error on dr_resume()", f);
			switch (psmerr) {
			case ESBD_RESUME:
				PR_QR("Couldn't resume devices: %s\n",
				    DR_GET_E_RSC(hp->h_err));
				break;

			case ESBD_KTHREAD:
				PR_ALL("psmerr is ESBD_KTHREAD\n");
				break;
			default:
				PR_ALL("Resume error unknown = %d\n", psmerr);
				break;
			}
		}
	} else {
		/* dr_suspend() has already called dr_resume() on failure */
		PR_ALL("%s: dr_suspend() failed, err = 0x%x\n", f, err);
		psmerr = hp->h_err ? hp->h_err->e_code : ESBD_NOERROR;
		switch (psmerr) {
		case ESBD_UNSAFE:
			PR_ALL("Unsafe devices (major #): %s\n",
			    DR_GET_E_RSC(hp->h_err));
			break;

		case ESBD_RTTHREAD:
			PR_ALL("RT threads (PIDs): %s\n",
			    DR_GET_E_RSC(hp->h_err));
			break;

		case ESBD_UTHREAD:
			PR_ALL("User threads (PIDs): %s\n",
			    DR_GET_E_RSC(hp->h_err));
			break;

		case ESBD_SUSPEND:
			PR_ALL("Non-suspendable devices (major #): %s\n",
			    DR_GET_E_RSC(hp->h_err));
			break;

		case ESBD_RESUME:
			PR_ALL("Could not resume devices (major #): %s\n",
			    DR_GET_E_RSC(hp->h_err));
			break;

		case ESBD_KTHREAD:
			PR_ALL("psmerr is ESBD_KTHREAD\n");
			break;

		case ESBD_NOERROR:
			PR_ALL("sbd_error_t error code not set\n");
			break;

		default:
			PR_ALL("Unknown error psmerr = %d\n", psmerr);
			break;
		}
	}
	dr_release_sr_handle(srh);

	return (0);
}
958 
/*
 * Append "val" to the first "idx" entries of "arr", which can hold "len"
 * entries in total.  The value is not stored if the array is already full
 * or if it is already present.
 *
 * Returns the resulting number of entries, or 0 if "arr" is NULL.
 */
static int
dr_add_int(uint64_t *arr, int idx, int len, uint64_t val)
{
	int slot = 0;

	if (arr == NULL)
		return (0);

	/* no room left: report the count unchanged */
	if (idx >= len)
		return (idx);

	/* look for an existing copy of val */
	while (slot < idx && arr[slot] != val)
		slot++;

	if (slot < idx)
		return (idx);

	arr[idx] = val;

	return (idx + 1);
}
984 
985 /*
986  * Construct an sbd_error_t featuring a string representation of an array of
987  * integers as its e_rsc.
988  */
989 static sbd_error_t *
990 drerr_int(int e_code, uint64_t *arr, int idx, int majors)
991 {
992 	int		i, n, buf_len, buf_idx, buf_avail;
993 	char		*dname;
994 	char		*buf;
995 	sbd_error_t	*new_sbd_err;
996 	static char	s_ellipsis[] = "...";
997 
998 	if (arr == NULL || idx <= 0)
999 		return (NULL);
1000 
1001 	/* MAXPATHLEN is the size of the e_rsc field in sbd_error_t. */
1002 	buf = (char *)kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1003 
1004 	/*
1005 	 * This is the total working area of the buffer.  It must be computed
1006 	 * as the size of 'buf', minus reserved space for the null terminator
1007 	 * and the ellipsis string.
1008 	 */
1009 	buf_len = MAXPATHLEN - (strlen(s_ellipsis) + 1);
1010 
1011 	/* Construct a string representation of the array values */
1012 	for (buf_idx = 0, i = 0; i < idx; i++) {
1013 		buf_avail = buf_len - buf_idx;
1014 		if (majors) {
1015 			dname = ddi_major_to_name(arr[i]);
1016 			if (dname) {
1017 				n = snprintf(&buf[buf_idx], buf_avail, "%s, ",
1018 				    dname);
1019 			} else {
1020 				n = snprintf(&buf[buf_idx], buf_avail,
1021 				    "major %" PRIu64 ", ", arr[i]);
1022 			}
1023 		} else {
1024 			n = snprintf(&buf[buf_idx], buf_avail, "%" PRIu64 ", ",
1025 			    arr[i]);
1026 		}
1027 
1028 		/* An ellipsis gets appended when no more values fit */
1029 		if (n >= buf_avail) {
1030 			(void) strcpy(&buf[buf_idx], s_ellipsis);
1031 			break;
1032 		}
1033 
1034 		buf_idx += n;
1035 	}
1036 
1037 	/* If all the contents fit, remove the trailing comma */
1038 	if (n < buf_avail) {
1039 		buf[--buf_idx] = '\0';
1040 		buf[--buf_idx] = '\0';
1041 	}
1042 
1043 	/* Return an sbd_error_t with the buffer and e_code */
1044 	new_sbd_err = drerr_new(1, e_code, buf);
1045 	kmem_free(buf, MAXPATHLEN);
1046 	return (new_sbd_err);
1047 }
1048