xref: /illumos-gate/usr/src/uts/common/os/exit.c (revision e8031f0a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 
23 /*
24  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
25  * Use is subject to license terms.
26  */
27 
28 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
29 
30 #pragma ident	"%Z%%M%	%I%	%E% SMI"	/* from SVr4.0 1.74 */
31 
32 #include <sys/types.h>
33 #include <sys/param.h>
34 #include <sys/sysmacros.h>
35 #include <sys/systm.h>
36 #include <sys/cred.h>
37 #include <sys/user.h>
38 #include <sys/errno.h>
39 #include <sys/proc.h>
40 #include <sys/ucontext.h>
41 #include <sys/procfs.h>
42 #include <sys/vnode.h>
43 #include <sys/acct.h>
44 #include <sys/var.h>
45 #include <sys/cmn_err.h>
46 #include <sys/debug.h>
47 #include <sys/wait.h>
48 #include <sys/siginfo.h>
49 #include <sys/procset.h>
50 #include <sys/class.h>
51 #include <sys/file.h>
52 #include <sys/session.h>
53 #include <sys/kmem.h>
54 #include <sys/vtrace.h>
55 #include <sys/prsystm.h>
56 #include <sys/ipc.h>
57 #include <sys/sem_impl.h>
58 #include <c2/audit.h>
59 #include <sys/aio_impl.h>
60 #include <vm/as.h>
61 #include <sys/poll.h>
62 #include <sys/door.h>
63 #include <sys/lwpchan_impl.h>
64 #include <sys/utrap.h>
65 #include <sys/task.h>
66 #include <sys/exacct.h>
67 #include <sys/cyclic.h>
68 #include <sys/schedctl.h>
69 #include <sys/rctl.h>
70 #include <sys/contract_impl.h>
71 #include <sys/contract/process_impl.h>
72 #include <sys/list.h>
73 #include <sys/dtrace.h>
74 #include <sys/pool.h>
75 #include <sys/sdt.h>
76 #include <sys/corectl.h>
77 
78 /*
79  * convert code/data pair into old style wait status
80  */
81 int
82 wstat(int code, int data)
83 {
84 	int stat = (data & 0377);
85 
86 	switch (code) {
87 	case CLD_EXITED:
88 		stat <<= 8;
89 		break;
90 	case CLD_DUMPED:
91 		stat |= WCOREFLG;
92 		break;
93 	case CLD_KILLED:
94 		break;
95 	case CLD_TRAPPED:
96 	case CLD_STOPPED:
97 		stat <<= 8;
98 		stat |= WSTOPFLG;
99 		break;
100 	case CLD_CONTINUED:
101 		stat = WCONTFLG;
102 		break;
103 	default:
104 		cmn_err(CE_PANIC, "wstat: bad code");
105 		/* NOTREACHED */
106 	}
107 	return (stat);
108 }
109 
/*
 * Format a short human-readable description of how a process ended into
 * the caller-supplied buffer 'buf' of 'bufsz' bytes, and return 'buf'.
 *
 * 'why' is a CLD_* code and 'what' is the matching exit status or signal
 * number.  Codes other than CLD_EXITED, CLD_KILLED and CLD_DUMPED produce
 * an "unknown error" message that reports both values.
 */
static char *
exit_reason(char *buf, size_t bufsz, int what, int why)
{
	if (why == CLD_EXITED) {
		(void) snprintf(buf, bufsz, "exited with status %d", what);
	} else if (why == CLD_KILLED) {
		(void) snprintf(buf, bufsz, "exited on fatal signal %d", what);
	} else if (why == CLD_DUMPED) {
		(void) snprintf(buf, bufsz, "core dumped on signal %d", what);
	} else {
		(void) snprintf(buf, bufsz,
		    "encountered unknown error (%d, %d)", why, what);
	}

	return (buf);
}
131 
132 /*
133  * exit system call: pass back caller's arg.
134  */
135 void
136 rexit(int rval)
137 {
138 	exit(CLD_EXITED, rval);
139 }
140 
141 /*
142  * Called by proc_exit() when a zone's init exits, presumably because
143  * it failed.  As long as the given zone is still in the "running"
144  * state, we will re-exec() init, but first we need to reset things
145  * which are usually inherited across exec() but will break init's
146  * assumption that it is being exec()'d from a virgin process.  Most
147  * importantly this includes closing all file descriptors (exec only
148  * closes those marked close-on-exec) and resetting signals (exec only
149  * resets handled signals, and we need to clear any signals which
150  * killed init).  Anything else that exec(2) says would be inherited,
151  * but would affect the execution of init, needs to be reset.
152  */
153 static int
154 restart_init(int what, int why)
155 {
156 	kthread_t *t = curthread;
157 	klwp_t *lwp = ttolwp(t);
158 	proc_t *p = ttoproc(t);
159 	user_t *up = PTOU(p);
160 
161 	vnode_t *oldcd, *oldrd;
162 	sess_t *sp;
163 	int i, err;
164 	char reason_buf[64];
165 	const char *ipath;
166 
167 	/*
168 	 * Let zone admin (and global zone admin if this is for a non-global
169 	 * zone) know that init has failed and will be restarted.
170 	 */
171 	zcmn_err(p->p_zone->zone_id, CE_WARN,
172 	    "init(1M) %s: restarting automatically",
173 	    exit_reason(reason_buf, sizeof (reason_buf), what, why));
174 
175 	if (!INGLOBALZONE(p)) {
176 		cmn_err(CE_WARN, "init(1M) for zone %s (pid %d) %s: "
177 		    "restarting automatically",
178 		    p->p_zone->zone_name, p->p_pid, reason_buf);
179 	}
180 
181 	/*
182 	 * Remove any fpollinfo_t's for this (last) thread from our file
183 	 * descriptors so closeall() can ASSERT() that they're all gone.
184 	 * Then close all open file descriptors in the process.
185 	 */
186 	pollcleanup();
187 	closeall(P_FINFO(p));
188 
189 	/*
190 	 * Grab p_lock and begin clearing miscellaneous global process
191 	 * state that needs to be reset before we exec the new init(1M).
192 	 */
193 
194 	mutex_enter(&p->p_lock);
195 	prbarrier(p);
196 
197 	p->p_flag &= ~(SKILLED | SEXTKILLED | SEXITING | SDOCORE);
198 	up->u_cmask = CMASK;
199 
200 	sigemptyset(&t->t_hold);
201 	sigemptyset(&t->t_sig);
202 	sigemptyset(&t->t_extsig);
203 
204 	sigemptyset(&p->p_sig);
205 	sigemptyset(&p->p_extsig);
206 
207 	sigdelq(p, t, 0);
208 	sigdelq(p, NULL, 0);
209 
210 	if (p->p_killsqp) {
211 		siginfofree(p->p_killsqp);
212 		p->p_killsqp = NULL;
213 	}
214 
215 	/*
216 	 * Reset any signals that are ignored back to the default disposition.
217 	 * Other u_signal members will be cleared when exec calls sigdefault().
218 	 */
219 	for (i = 1; i < NSIG; i++) {
220 		if (up->u_signal[i - 1] == SIG_IGN) {
221 			up->u_signal[i - 1] = SIG_DFL;
222 			sigemptyset(&up->u_sigmask[i - 1]);
223 		}
224 	}
225 
226 	/*
227 	 * Clear the current signal, any signal info associated with it, and
228 	 * any signal information from contracts and/or contract templates.
229 	 */
230 	lwp->lwp_cursig = 0;
231 	lwp->lwp_extsig = 0;
232 	if (lwp->lwp_curinfo != NULL) {
233 		siginfofree(lwp->lwp_curinfo);
234 		lwp->lwp_curinfo = NULL;
235 	}
236 	lwp_ctmpl_clear(lwp);
237 
238 	/*
239 	 * Reset both the process root directory and the current working
240 	 * directory to the root of the zone just as we do during boot.
241 	 */
242 	VN_HOLD(p->p_zone->zone_rootvp);
243 	oldrd = up->u_rdir;
244 	up->u_rdir = p->p_zone->zone_rootvp;
245 
246 	VN_HOLD(p->p_zone->zone_rootvp);
247 	oldcd = up->u_cdir;
248 	up->u_cdir = p->p_zone->zone_rootvp;
249 
250 	if (up->u_cwd != NULL) {
251 		refstr_rele(up->u_cwd);
252 		up->u_cwd = NULL;
253 	}
254 
255 	mutex_exit(&p->p_lock);
256 
257 	if (oldrd != NULL)
258 		VN_RELE(oldrd);
259 	if (oldcd != NULL)
260 		VN_RELE(oldcd);
261 
262 	/*
263 	 * Free the controlling tty.
264 	 */
265 	mutex_enter(&pidlock);
266 	sp = p->p_sessp;
267 	if (sp->s_sidp == p->p_pidp && sp->s_vp != NULL) {
268 		mutex_exit(&pidlock);
269 		freectty(sp);
270 	} else {
271 		mutex_exit(&pidlock);
272 	}
273 
274 	/*
275 	 * Now exec() the new init(1M) on top of the current process.  If we
276 	 * succeed, the caller will treat this like a successful system call.
277 	 * If we fail, we issue messages and the caller will proceed with exit.
278 	 */
279 	ipath = INGLOBALZONE(p) ? initname : zone_initname;
280 	err = exec_init(ipath, 0, NULL);
281 
282 	if (err == 0)
283 		return (0);
284 
285 	zcmn_err(p->p_zone->zone_id, CE_WARN,
286 	    "failed to restart init(1M) (err=%d): system reboot required", err);
287 
288 	if (!INGLOBALZONE(p)) {
289 		cmn_err(CE_WARN, "failed to restart init(1M) for zone %s "
290 		    "(pid %d, err=%d): zoneadm(1M) boot required",
291 		    p->p_zone->zone_name, p->p_pid, err);
292 	}
293 
294 	return (-1);
295 }
296 
297 /*
298  * Release resources.
299  * Enter zombie state.
300  * Wake up parent and init processes,
301  * and dispose of children.
302  */
303 void
304 exit(int why, int what)
305 {
306 	/*
307 	 * If proc_exit() fails, then some other lwp in the process
308 	 * got there first.  We just have to call lwp_exit() to allow
309 	 * the other lwp to finish exiting the process.  Otherwise we're
310 	 * restarting init, and should return.
311 	 */
312 	if (proc_exit(why, what) != 0) {
313 		mutex_enter(&curproc->p_lock);
314 		ASSERT(curproc->p_flag & SEXITLWPS);
315 		lwp_exit();
316 		/* NOTREACHED */
317 	}
318 }
319 
320 /*
321  * Set the SEXITING flag on the process, after making sure /proc does
322  * not have it locked.  This is done in more places than proc_exit(),
323  * so it is a separate function.
324  */
325 void
326 proc_is_exiting(proc_t *p)
327 {
328 	mutex_enter(&p->p_lock);
329 	prbarrier(p);
330 	p->p_flag |= SEXITING;
331 	mutex_exit(&p->p_lock);
332 }
333 
334 /*
335  * Return value:
336  *   1 - exitlwps() failed, call (or continue) lwp_exit()
337  *   0 - restarting init.  Return through system call path
338  */
339 int
340 proc_exit(int why, int what)
341 {
342 	kthread_t *t = curthread;
343 	klwp_t *lwp = ttolwp(t);
344 	proc_t *p = ttoproc(t);
345 	zone_t *z = p->p_zone;
346 	timeout_id_t tmp_id;
347 	int rv;
348 	proc_t *q;
349 	sess_t *sp;
350 	task_t *tk;
351 	vnode_t *exec_vp, *execdir_vp, *cdir, *rdir;
352 	sigqueue_t *sqp;
353 	lwpdir_t *lwpdir;
354 	uint_t lwpdir_sz;
355 	lwpdir_t **tidhash;
356 	uint_t tidhash_sz;
357 	refstr_t *cwd;
358 	hrtime_t hrutime, hrstime;
359 
360 	/*
361 	 * Stop and discard the process's lwps except for the current one,
362 	 * unless some other lwp beat us to it.  If exitlwps() fails then
363 	 * return and the calling lwp will call (or continue in) lwp_exit().
364 	 */
365 	proc_is_exiting(p);
366 	if (exitlwps(0) != 0)
367 		return (1);
368 
369 	DTRACE_PROC(lwp__exit);
370 	DTRACE_PROC1(exit, int, why);
371 
372 	/*
373 	 * Don't let init exit unless zone_icode() failed its exec, or
374 	 * we are shutting down the zone or the machine.
375 	 *
376 	 * Since we are single threaded, we don't need to lock the
377 	 * following accesses to zone_proc_initpid.
378 	 */
379 	if (p->p_pid == z->zone_proc_initpid) {
380 		if (z->zone_boot_err == 0 &&
381 		    zone_status_get(z) < ZONE_IS_SHUTTING_DOWN &&
382 		    zone_status_get(global_zone) < ZONE_IS_SHUTTING_DOWN &&
383 		    restart_init(what, why) == 0)
384 			return (0);
385 		/*
386 		 * Since we didn't or couldn't restart init, we clear
387 		 * the zone's init state and proceed with exit
388 		 * processing.
389 		 */
390 		z->zone_proc_initpid = -1;
391 	}
392 
393 	/*
394 	 * Allocate a sigqueue now, before we grab locks.
395 	 * It will be given to sigcld(), below.
396 	 */
397 	sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
398 
399 	/*
400 	 * revoke any doors created by the process.
401 	 */
402 	if (p->p_door_list)
403 		door_exit();
404 
405 	/*
406 	 * Release schedctl data structures.
407 	 */
408 	if (p->p_pagep)
409 		schedctl_proc_cleanup();
410 
411 	/*
412 	 * make sure all pending kaio has completed.
413 	 */
414 	if (p->p_aio)
415 		aio_cleanup_exit();
416 
417 	/*
418 	 * discard the lwpchan cache.
419 	 */
420 	if (p->p_lcp != NULL)
421 		lwpchan_destroy_cache(0);
422 
423 	/*
424 	 * Clean up any DTrace helper actions or probes for the process.
425 	 */
426 	if (p->p_dtrace_helpers != NULL) {
427 		ASSERT(dtrace_helpers_cleanup != NULL);
428 		(*dtrace_helpers_cleanup)();
429 	}
430 
431 	/* untimeout the realtime timers */
432 	if (p->p_itimer != NULL)
433 		timer_exit();
434 
435 	if ((tmp_id = p->p_alarmid) != 0) {
436 		p->p_alarmid = 0;
437 		(void) untimeout(tmp_id);
438 	}
439 
440 	/*
441 	 * Remove any fpollinfo_t's for this (last) thread from our file
442 	 * descriptors so closeall() can ASSERT() that they're all gone.
443 	 */
444 	pollcleanup();
445 
446 	if (p->p_rprof_cyclic != CYCLIC_NONE) {
447 		mutex_enter(&cpu_lock);
448 		cyclic_remove(p->p_rprof_cyclic);
449 		mutex_exit(&cpu_lock);
450 	}
451 
452 	mutex_enter(&p->p_lock);
453 
454 	/*
455 	 * Clean up any DTrace probes associated with this process.
456 	 */
457 	if (p->p_dtrace_probes) {
458 		ASSERT(dtrace_fasttrap_exit_ptr != NULL);
459 		dtrace_fasttrap_exit_ptr(p);
460 	}
461 
462 	while ((tmp_id = p->p_itimerid) != 0) {
463 		p->p_itimerid = 0;
464 		mutex_exit(&p->p_lock);
465 		(void) untimeout(tmp_id);
466 		mutex_enter(&p->p_lock);
467 	}
468 
469 	lwp_cleanup();
470 
471 	/*
472 	 * We are about to exit; prevent our resource associations from
473 	 * being changed.
474 	 */
475 	pool_barrier_enter();
476 
477 	/*
478 	 * Block the process against /proc now that we have really
479 	 * acquired p->p_lock (to manipulate p_tlist at least).
480 	 */
481 	prbarrier(p);
482 
483 #ifdef	SUN_SRC_COMPAT
484 	if (code == CLD_KILLED)
485 		u.u_acflag |= AXSIG;
486 #endif
487 	sigfillset(&p->p_ignore);
488 	sigemptyset(&p->p_siginfo);
489 	sigemptyset(&p->p_sig);
490 	sigemptyset(&p->p_extsig);
491 	sigemptyset(&t->t_sig);
492 	sigemptyset(&t->t_extsig);
493 	sigemptyset(&p->p_sigmask);
494 	sigdelq(p, t, 0);
495 	lwp->lwp_cursig = 0;
496 	lwp->lwp_extsig = 0;
497 	p->p_flag &= ~(SKILLED | SEXTKILLED);
498 	if (lwp->lwp_curinfo) {
499 		siginfofree(lwp->lwp_curinfo);
500 		lwp->lwp_curinfo = NULL;
501 	}
502 
503 	t->t_proc_flag |= TP_LWPEXIT;
504 	ASSERT(p->p_lwpcnt == 1 && p->p_zombcnt == 0);
505 	prlwpexit(t);		/* notify /proc */
506 	lwp_hash_out(p, t->t_tid);
507 	prexit(p);
508 
509 	p->p_lwpcnt = 0;
510 	p->p_tlist = NULL;
511 	sigqfree(p);
512 	term_mstate(t);
513 	p->p_mterm = gethrtime();
514 
515 	exec_vp = p->p_exec;
516 	execdir_vp = p->p_execdir;
517 	p->p_exec = NULLVP;
518 	p->p_execdir = NULLVP;
519 	mutex_exit(&p->p_lock);
520 	if (exec_vp)
521 		VN_RELE(exec_vp);
522 	if (execdir_vp)
523 		VN_RELE(execdir_vp);
524 
525 	pr_free_watched_pages(p);
526 
527 	closeall(P_FINFO(p));
528 
529 	mutex_enter(&pidlock);
530 	sp = p->p_sessp;
531 	if (sp->s_sidp == p->p_pidp && sp->s_vp != NULL) {
532 		mutex_exit(&pidlock);
533 		freectty(sp);
534 	} else
535 		mutex_exit(&pidlock);
536 
537 #if defined(__sparc)
538 	if (p->p_utraps != NULL)
539 		utrap_free(p);
540 #endif
541 	if (p->p_semacct)			/* IPC semaphore exit */
542 		semexit(p);
543 	rv = wstat(why, what);
544 
545 	acct(rv & 0xff);
546 	exacct_commit_proc(p, rv);
547 
548 	/*
549 	 * Release any resources associated with C2 auditing
550 	 */
551 #ifdef C2_AUDIT
552 	if (audit_active) {
553 		/*
554 		 * audit exit system call
555 		 */
556 		audit_exit(why, what);
557 	}
558 #endif
559 
560 	/*
561 	 * Free address space.
562 	 */
563 	relvm();
564 
565 	/*
566 	 * Release held contracts.
567 	 */
568 	contract_exit(p);
569 
570 	/*
571 	 * Depart our encapsulating process contract.
572 	 */
573 	if ((p->p_flag & SSYS) == 0) {
574 		ASSERT(p->p_ct_process);
575 		contract_process_exit(p->p_ct_process, p, rv);
576 	}
577 
578 	/*
579 	 * Remove pool association, and block if requested by pool_do_bind.
580 	 */
581 	mutex_enter(&p->p_lock);
582 	ASSERT(p->p_pool->pool_ref > 0);
583 	atomic_add_32(&p->p_pool->pool_ref, -1);
584 	p->p_pool = pool_default;
585 	/*
586 	 * Now that our address space has been freed and all other threads
587 	 * in this process have exited, set the PEXITED pool flag.  This
588 	 * tells the pools subsystems to ignore this process if it was
589 	 * requested to rebind this process to a new pool.
590 	 */
591 	p->p_poolflag |= PEXITED;
592 	pool_barrier_exit();
593 	mutex_exit(&p->p_lock);
594 
595 	mutex_enter(&pidlock);
596 
597 	/*
598 	 * Delete this process from the newstate list of its parent. We
599 	 * will put it in the right place in the sigcld in the end.
600 	 */
601 	delete_ns(p->p_parent, p);
602 
603 	/*
604 	 * Reassign the orphans to the next of kin.
605 	 * Don't rearrange init's orphanage.
606 	 */
607 	if ((q = p->p_orphan) != NULL && p != proc_init) {
608 
609 		proc_t *nokp = p->p_nextofkin;
610 
611 		for (;;) {
612 			q->p_nextofkin = nokp;
613 			if (q->p_nextorph == NULL)
614 				break;
615 			q = q->p_nextorph;
616 		}
617 		q->p_nextorph = nokp->p_orphan;
618 		nokp->p_orphan = p->p_orphan;
619 		p->p_orphan = NULL;
620 	}
621 
622 	/*
623 	 * Reassign the children to init.
624 	 * Don't try to assign init's children to init.
625 	 */
626 	if ((q = p->p_child) != NULL && p != proc_init) {
627 		struct proc	*np;
628 		struct proc	*initp = proc_init;
629 		boolean_t	setzonetop = B_FALSE;
630 
631 		if (!INGLOBALZONE(curproc))
632 			setzonetop = B_TRUE;
633 
634 		pgdetach(p);
635 
636 		do {
637 			np = q->p_sibling;
638 			/*
639 			 * Delete it from its current parent new state
640 			 * list and add it to init new state list
641 			 */
642 			delete_ns(q->p_parent, q);
643 
644 			q->p_ppid = 1;
645 			if (setzonetop) {
646 				mutex_enter(&q->p_lock);
647 				q->p_flag |= SZONETOP;
648 				mutex_exit(&q->p_lock);
649 			}
650 			q->p_parent = initp;
651 
652 			/*
653 			 * Since q will be the first child,
654 			 * it will not have a previous sibling.
655 			 */
656 			q->p_psibling = NULL;
657 			if (initp->p_child) {
658 				initp->p_child->p_psibling = q;
659 			}
660 			q->p_sibling = initp->p_child;
661 			initp->p_child = q;
662 			if (q->p_proc_flag & P_PR_PTRACE) {
663 				mutex_enter(&q->p_lock);
664 				sigtoproc(q, NULL, SIGKILL);
665 				mutex_exit(&q->p_lock);
666 			}
667 			/*
668 			 * sigcld() will add the child to parents
669 			 * newstate list.
670 			 */
671 			if (q->p_stat == SZOMB)
672 				sigcld(q, NULL);
673 		} while ((q = np) != NULL);
674 
675 		p->p_child = NULL;
676 		ASSERT(p->p_child_ns == NULL);
677 	}
678 
679 	TRACE_1(TR_FAC_PROC, TR_PROC_EXIT, "proc_exit: %p", p);
680 
681 	mutex_enter(&p->p_lock);
682 	CL_EXIT(curthread); /* tell the scheduler that curthread is exiting */
683 
684 	hrutime = mstate_aggr_state(p, LMS_USER);
685 	hrstime = mstate_aggr_state(p, LMS_SYSTEM);
686 	p->p_utime = (clock_t)NSEC_TO_TICK(hrutime) + p->p_cutime;
687 	p->p_stime = (clock_t)NSEC_TO_TICK(hrstime) + p->p_cstime;
688 
689 	p->p_acct[LMS_USER]	+= p->p_cacct[LMS_USER];
690 	p->p_acct[LMS_SYSTEM]	+= p->p_cacct[LMS_SYSTEM];
691 	p->p_acct[LMS_TRAP]	+= p->p_cacct[LMS_TRAP];
692 	p->p_acct[LMS_TFAULT]	+= p->p_cacct[LMS_TFAULT];
693 	p->p_acct[LMS_DFAULT]	+= p->p_cacct[LMS_DFAULT];
694 	p->p_acct[LMS_KFAULT]	+= p->p_cacct[LMS_KFAULT];
695 	p->p_acct[LMS_USER_LOCK] += p->p_cacct[LMS_USER_LOCK];
696 	p->p_acct[LMS_SLEEP]	+= p->p_cacct[LMS_SLEEP];
697 	p->p_acct[LMS_WAIT_CPU]	+= p->p_cacct[LMS_WAIT_CPU];
698 	p->p_acct[LMS_STOPPED]	+= p->p_cacct[LMS_STOPPED];
699 
700 	p->p_ru.minflt	+= p->p_cru.minflt;
701 	p->p_ru.majflt	+= p->p_cru.majflt;
702 	p->p_ru.nswap	+= p->p_cru.nswap;
703 	p->p_ru.inblock	+= p->p_cru.inblock;
704 	p->p_ru.oublock	+= p->p_cru.oublock;
705 	p->p_ru.msgsnd	+= p->p_cru.msgsnd;
706 	p->p_ru.msgrcv	+= p->p_cru.msgrcv;
707 	p->p_ru.nsignals += p->p_cru.nsignals;
708 	p->p_ru.nvcsw	+= p->p_cru.nvcsw;
709 	p->p_ru.nivcsw	+= p->p_cru.nivcsw;
710 	p->p_ru.sysc	+= p->p_cru.sysc;
711 	p->p_ru.ioch	+= p->p_cru.ioch;
712 
713 	p->p_stat = SZOMB;
714 	p->p_proc_flag &= ~P_PR_PTRACE;
715 	p->p_wdata = what;
716 	p->p_wcode = (char)why;
717 
718 	cdir = PTOU(p)->u_cdir;
719 	rdir = PTOU(p)->u_rdir;
720 	cwd = PTOU(p)->u_cwd;
721 
722 	/*
723 	 * Release resource controls, as they are no longer enforceable.
724 	 */
725 	rctl_set_free(p->p_rctls);
726 
727 	/*
728 	 * Give up task and project memberships.  Decrement tk_nlwps counter
729 	 * for our task.max-lwps resource control.  An extended accounting
730 	 * record, if that facility is active, is scheduled to be written.
731 	 * Zombie processes are false members of task0 for the remainder of
732 	 * their lifetime; no accounting information is recorded for them.
733 	 */
734 	tk = p->p_task;
735 
736 	mutex_enter(&p->p_zone->zone_nlwps_lock);
737 	tk->tk_nlwps--;
738 	tk->tk_proj->kpj_nlwps--;
739 	p->p_zone->zone_nlwps--;
740 	mutex_exit(&p->p_zone->zone_nlwps_lock);
741 	task_detach(p);
742 	p->p_task = task0p;
743 
744 	/*
745 	 * Clear the lwp directory and the lwpid hash table
746 	 * now that /proc can't bother us any more.
747 	 * We free the memory below, after dropping p->p_lock.
748 	 */
749 	lwpdir = p->p_lwpdir;
750 	lwpdir_sz = p->p_lwpdir_sz;
751 	tidhash = p->p_tidhash;
752 	tidhash_sz = p->p_tidhash_sz;
753 	p->p_lwpdir = NULL;
754 	p->p_lwpfree = NULL;
755 	p->p_lwpdir_sz = 0;
756 	p->p_tidhash = NULL;
757 	p->p_tidhash_sz = 0;
758 
759 	/*
760 	 * If the process has context ops installed, call the exit routine
761 	 * on behalf of this last remaining thread. Normally exitpctx() is
762 	 * called during thread_exit() or lwp_exit(), but because this is the
763 	 * last thread in the process, we must call it here. By the time
764 	 * thread_exit() is called (below), the association with the relevant
765 	 * process has been lost.
766 	 *
767 	 * We also free the context here.
768 	 */
769 	if (p->p_pctx) {
770 		kpreempt_disable();
771 		exitpctx(p);
772 		kpreempt_enable();
773 
774 		freepctx(p, 0);
775 	}
776 
777 	/*
778 	 * curthread's proc pointer is changed to point at p0 because
779 	 * curthread's original proc pointer can be freed as soon as
780 	 * the child sends a SIGCLD to its parent.
781 	 */
782 	t->t_procp = &p0;
783 
784 	mutex_exit(&p->p_lock);
785 	sigcld(p, sqp);
786 	mutex_exit(&pidlock);
787 
788 	task_rele(tk);
789 
790 	kmem_free(lwpdir, lwpdir_sz * sizeof (lwpdir_t));
791 	kmem_free(tidhash, tidhash_sz * sizeof (lwpdir_t *));
792 
793 	/*
794 	 * We don't release u_cdir and u_rdir until SZOMB is set.
795 	 * This protects us against dofusers().
796 	 */
797 	VN_RELE(cdir);
798 	if (rdir)
799 		VN_RELE(rdir);
800 	if (cwd)
801 		refstr_rele(cwd);
802 
803 	lwp_pcb_exit();
804 
805 	thread_exit();
806 	/* NOTREACHED */
807 }
808 
809 /*
810  * Format siginfo structure for wait system calls.
811  */
812 void
813 winfo(proc_t *pp, k_siginfo_t *ip, int waitflag)
814 {
815 	ASSERT(MUTEX_HELD(&pidlock));
816 
817 	bzero(ip, sizeof (k_siginfo_t));
818 	ip->si_signo = SIGCLD;
819 	ip->si_code = pp->p_wcode;
820 	ip->si_pid = pp->p_pid;
821 	ip->si_ctid = PRCTID(pp);
822 	ip->si_zoneid = pp->p_zone->zone_id;
823 	ip->si_status = pp->p_wdata;
824 	ip->si_stime = pp->p_stime;
825 	ip->si_utime = pp->p_utime;
826 
827 	if (waitflag) {
828 		pp->p_wcode = 0;
829 		pp->p_wdata = 0;
830 		pp->p_pidflag &= ~CLDPEND;
831 	}
832 }
833 
834 /*
835  * Wait system call.
836  * Search for a terminated (zombie) child,
837  * finally lay it to rest, and collect its status.
838  * Look also for stopped children,
839  * and pass back status from them.
840  */
841 int
842 waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)
843 {
844 	int found;
845 	proc_t *cp, *pp;
846 	proc_t **nsp;
847 	int proc_gone;
848 	int waitflag = !(options & WNOWAIT);
849 
850 	/*
851 	 * Obsolete flag, defined here only for binary compatibility
852 	 * with old statically linked executables.  Delete this when
853 	 * we no longer care about these old and broken applications.
854 	 */
855 #define	_WNOCHLD	0400
856 	options &= ~_WNOCHLD;
857 
858 	if (options == 0 || (options & ~WOPTMASK))
859 		return (EINVAL);
860 
861 	switch (idtype) {
862 	case P_PID:
863 	case P_PGID:
864 		if (id < 0 || id >= maxpid)
865 			return (EINVAL);
866 		/* FALLTHROUGH */
867 	case P_ALL:
868 		break;
869 	default:
870 		return (EINVAL);
871 	}
872 
873 	pp = ttoproc(curthread);
874 
875 	/*
876 	 * lock parent mutex so that sibling chain can be searched.
877 	 */
878 	mutex_enter(&pidlock);
879 
880 	/*
881 	 * if we are only looking for exited processes and child_ns list
882 	 * is empty no reason to look at all children.
883 	 */
884 	if (idtype == P_ALL &&
885 	    (options & (WOPTMASK & ~WNOWAIT)) == (WNOHANG | WEXITED) &&
886 		pp->p_child_ns == NULL) {
887 
888 		if (pp->p_child) {
889 			mutex_exit(&pidlock);
890 			bzero(ip, sizeof (k_siginfo_t));
891 			return (0);
892 		}
893 		mutex_exit(&pidlock);
894 		return (ECHILD);
895 	}
896 
897 	while ((cp = pp->p_child) != NULL) {
898 
899 		proc_gone = 0;
900 
901 		for (nsp = &pp->p_child_ns; *nsp; nsp = &(*nsp)->p_sibling_ns) {
902 			if (idtype == P_PID && id != (*nsp)->p_pid) {
903 				continue;
904 			}
905 			if (idtype == P_PGID && id != (*nsp)->p_pgrp) {
906 				continue;
907 			}
908 
909 			switch ((*nsp)->p_wcode) {
910 
911 			case CLD_TRAPPED:
912 			case CLD_STOPPED:
913 			case CLD_CONTINUED:
914 				cmn_err(CE_PANIC,
915 				    "waitid: wrong state %d on the p_newstate"
916 				    " list", (*nsp)->p_wcode);
917 				break;
918 
919 			case CLD_EXITED:
920 			case CLD_DUMPED:
921 			case CLD_KILLED:
922 				if (!(options & WEXITED)) {
923 					/*
924 					 * Count how many are already gone
925 					 * for good.
926 					 */
927 					proc_gone++;
928 					break;
929 				}
930 				if (!waitflag) {
931 					winfo((*nsp), ip, 0);
932 				} else {
933 					proc_t *xp = *nsp;
934 					winfo(xp, ip, 1);
935 					freeproc(xp);
936 				}
937 				mutex_exit(&pidlock);
938 				if (waitflag) {		/* accept SIGCLD */
939 					sigcld_delete(ip);
940 					sigcld_repost();
941 				}
942 				return (0);
943 			}
944 
945 			if (idtype == P_PID)
946 				break;
947 		}
948 
949 		/*
950 		 * Wow! None of the threads on the p_sibling_ns list were
951 		 * interesting threads. Check all the kids!
952 		 */
953 		found = 0;
954 		cp = pp->p_child;
955 		do {
956 			if (idtype == P_PID && id != cp->p_pid) {
957 				continue;
958 			}
959 			if (idtype == P_PGID && id != cp->p_pgrp) {
960 				continue;
961 			}
962 
963 			found++;
964 
965 			switch (cp->p_wcode) {
966 			case CLD_TRAPPED:
967 				if (!(options & WTRAPPED))
968 					break;
969 				winfo(cp, ip, waitflag);
970 				mutex_exit(&pidlock);
971 				if (waitflag) {		/* accept SIGCLD */
972 					sigcld_delete(ip);
973 					sigcld_repost();
974 				}
975 				return (0);
976 
977 			case CLD_STOPPED:
978 				if (!(options & WSTOPPED))
979 					break;
980 				/* Is it still stopped? */
981 				mutex_enter(&cp->p_lock);
982 				if (!jobstopped(cp)) {
983 					mutex_exit(&cp->p_lock);
984 					break;
985 				}
986 				mutex_exit(&cp->p_lock);
987 				winfo(cp, ip, waitflag);
988 				mutex_exit(&pidlock);
989 				if (waitflag) {		/* accept SIGCLD */
990 					sigcld_delete(ip);
991 					sigcld_repost();
992 				}
993 				return (0);
994 
995 			case CLD_CONTINUED:
996 				if (!(options & WCONTINUED))
997 					break;
998 				winfo(cp, ip, waitflag);
999 				mutex_exit(&pidlock);
1000 				if (waitflag) {		/* accept SIGCLD */
1001 					sigcld_delete(ip);
1002 					sigcld_repost();
1003 				}
1004 				return (0);
1005 
1006 			case CLD_EXITED:
1007 			case CLD_DUMPED:
1008 			case CLD_KILLED:
1009 				/*
1010 				 * Don't complain if a process was found in
1011 				 * the first loop but we broke out of the loop
1012 				 * because of the arguments passed to us.
1013 				 */
1014 				if (proc_gone == 0) {
1015 					cmn_err(CE_PANIC,
1016 					    "waitid: wrong state on the"
1017 					    " p_child list");
1018 				} else {
1019 					break;
1020 				}
1021 			}
1022 
1023 			if (idtype == P_PID)
1024 				break;
1025 		} while ((cp = cp->p_sibling) != NULL);
1026 
1027 		/*
1028 		 * If we found no interesting processes at all,
1029 		 * break out and return ECHILD.
1030 		 */
1031 		if (found + proc_gone == 0)
1032 			break;
1033 
1034 		if (options & WNOHANG) {
1035 			bzero(ip, sizeof (k_siginfo_t));
1036 			/*
1037 			 * We should set ip->si_signo = SIGCLD,
1038 			 * but there is an SVVS test that expects
1039 			 * ip->si_signo to be zero in this case.
1040 			 */
1041 			mutex_exit(&pidlock);
1042 			return (0);
1043 		}
1044 
1045 		/*
1046 		 * If we found no processes of interest that could
1047 		 * change state while we wait, we don't wait at all.
1048 		 * Get out with ECHILD according to SVID.
1049 		 */
1050 		if (found == proc_gone)
1051 			break;
1052 
1053 		if (!cv_wait_sig_swap(&pp->p_cv, &pidlock)) {
1054 			mutex_exit(&pidlock);
1055 			return (EINTR);
1056 		}
1057 	}
1058 	mutex_exit(&pidlock);
1059 	return (ECHILD);
1060 }
1061 
1062 /*
1063  * For implementations that don't require binary compatibility,
1064  * the wait system call may be made into a library call to the
1065  * waitid system call.
1066  */
1067 int64_t
1068 wait(void)
1069 {
1070 	int error;
1071 	k_siginfo_t info;
1072 	rval_t	r;
1073 
1074 	if (error =  waitid(P_ALL, (id_t)0, &info, WEXITED|WTRAPPED))
1075 		return (set_errno(error));
1076 	r.r_val1 = info.si_pid;
1077 	r.r_val2 = wstat(info.si_code, info.si_status);
1078 	return (r.r_vals);
1079 }
1080 
1081 int
1082 waitsys(idtype_t idtype, id_t id, siginfo_t *infop, int options)
1083 {
1084 	int error;
1085 	k_siginfo_t info;
1086 
1087 	if (error = waitid(idtype, id, &info, options))
1088 		return (set_errno(error));
1089 	if (copyout(&info, infop, sizeof (k_siginfo_t)))
1090 		return (set_errno(EFAULT));
1091 	return (0);
1092 }
1093 
1094 #ifdef _SYSCALL32_IMPL
1095 
1096 int
1097 waitsys32(idtype_t idtype, id_t id, siginfo_t *infop, int options)
1098 {
1099 	int error;
1100 	k_siginfo_t info;
1101 	siginfo32_t info32;
1102 
1103 	if (error = waitid(idtype, id, &info, options))
1104 		return (set_errno(error));
1105 	siginfo_kto32(&info, &info32);
1106 	if (copyout(&info32, infop, sizeof (info32)))
1107 		return (set_errno(EFAULT));
1108 	return (0);
1109 }
1110 
1111 #endif	/* _SYSCALL32_IMPL */
1112 
1113 void
1114 proc_detach(proc_t *p)
1115 {
1116 	proc_t *q;
1117 
1118 	ASSERT(MUTEX_HELD(&pidlock));
1119 
1120 	q = p->p_parent;
1121 	ASSERT(q != NULL);
1122 
1123 	/*
1124 	 * Take it off the newstate list of its parent
1125 	 */
1126 	delete_ns(q, p);
1127 
1128 	if (q->p_child == p) {
1129 		q->p_child = p->p_sibling;
1130 		/*
1131 		 * If the parent has no children, it better not
1132 		 * have any with new states either!
1133 		 */
1134 		ASSERT(q->p_child ? 1 : q->p_child_ns == NULL);
1135 	}
1136 
1137 	if (p->p_sibling) {
1138 		p->p_sibling->p_psibling = p->p_psibling;
1139 	}
1140 
1141 	if (p->p_psibling) {
1142 		p->p_psibling->p_sibling = p->p_sibling;
1143 	}
1144 }
1145 
1146 /*
1147  * Remove zombie children from the process table.
1148  */
1149 void
1150 freeproc(proc_t *p)
1151 {
1152 	proc_t *q;
1153 
1154 	ASSERT(p->p_stat == SZOMB);
1155 	ASSERT(p->p_tlist == NULL);
1156 	ASSERT(MUTEX_HELD(&pidlock));
1157 
1158 	sigdelq(p, NULL, 0);
1159 	if (p->p_killsqp) {
1160 		siginfofree(p->p_killsqp);
1161 		p->p_killsqp = NULL;
1162 	}
1163 
1164 	prfree(p);	/* inform /proc */
1165 
1166 	/*
1167 	 * Don't free the init processes.
1168 	 * Other dying processes will access it.
1169 	 */
1170 	if (p == proc_init)
1171 		return;
1172 
1173 
1174 	/*
1175 	 * We wait until now to free the cred structure because a
1176 	 * zombie process's credentials may be examined by /proc.
1177 	 * No cred locking needed because there are no threads at this point.
1178 	 */
1179 	upcount_dec(crgetruid(p->p_cred), crgetzoneid(p->p_cred));
1180 	crfree(p->p_cred);
1181 	if (p->p_corefile != NULL) {
1182 		corectl_path_rele(p->p_corefile);
1183 		p->p_corefile = NULL;
1184 	}
1185 	if (p->p_content != NULL) {
1186 		corectl_content_rele(p->p_content);
1187 		p->p_content = NULL;
1188 	}
1189 
1190 	if (p->p_nextofkin && !((p->p_nextofkin->p_flag & SNOWAIT) ||
1191 	    (PTOU(p->p_nextofkin)->u_signal[SIGCLD - 1] == SIG_IGN))) {
1192 		/*
1193 		 * This should still do the right thing since p_utime/stime
1194 		 * get set to the correct value on process exit, so it
1195 		 * should get properly updated
1196 		 */
1197 		p->p_nextofkin->p_cutime += p->p_utime;
1198 		p->p_nextofkin->p_cstime += p->p_stime;
1199 
1200 		p->p_nextofkin->p_cacct[LMS_USER] += p->p_acct[LMS_USER];
1201 		p->p_nextofkin->p_cacct[LMS_SYSTEM] += p->p_acct[LMS_SYSTEM];
1202 		p->p_nextofkin->p_cacct[LMS_TRAP] += p->p_acct[LMS_TRAP];
1203 		p->p_nextofkin->p_cacct[LMS_TFAULT] += p->p_acct[LMS_TFAULT];
1204 		p->p_nextofkin->p_cacct[LMS_DFAULT] += p->p_acct[LMS_DFAULT];
1205 		p->p_nextofkin->p_cacct[LMS_KFAULT] += p->p_acct[LMS_KFAULT];
1206 		p->p_nextofkin->p_cacct[LMS_USER_LOCK]
1207 		    += p->p_acct[LMS_USER_LOCK];
1208 		p->p_nextofkin->p_cacct[LMS_SLEEP] += p->p_acct[LMS_SLEEP];
1209 		p->p_nextofkin->p_cacct[LMS_WAIT_CPU]
1210 		    += p->p_acct[LMS_WAIT_CPU];
1211 		p->p_nextofkin->p_cacct[LMS_STOPPED] += p->p_acct[LMS_STOPPED];
1212 
1213 		p->p_nextofkin->p_cru.minflt	+= p->p_ru.minflt;
1214 		p->p_nextofkin->p_cru.majflt	+= p->p_ru.majflt;
1215 		p->p_nextofkin->p_cru.nswap	+= p->p_ru.nswap;
1216 		p->p_nextofkin->p_cru.inblock	+= p->p_ru.inblock;
1217 		p->p_nextofkin->p_cru.oublock	+= p->p_ru.oublock;
1218 		p->p_nextofkin->p_cru.msgsnd	+= p->p_ru.msgsnd;
1219 		p->p_nextofkin->p_cru.msgrcv	+= p->p_ru.msgrcv;
1220 		p->p_nextofkin->p_cru.nsignals	+= p->p_ru.nsignals;
1221 		p->p_nextofkin->p_cru.nvcsw	+= p->p_ru.nvcsw;
1222 		p->p_nextofkin->p_cru.nivcsw	+= p->p_ru.nivcsw;
1223 		p->p_nextofkin->p_cru.sysc	+= p->p_ru.sysc;
1224 		p->p_nextofkin->p_cru.ioch	+= p->p_ru.ioch;
1225 
1226 	}
1227 
1228 	q = p->p_nextofkin;
1229 	if (q && q->p_orphan == p)
1230 		q->p_orphan = p->p_nextorph;
1231 	else if (q) {
1232 		for (q = q->p_orphan; q; q = q->p_nextorph)
1233 			if (q->p_nextorph == p)
1234 				break;
1235 		ASSERT(q && q->p_nextorph == p);
1236 		q->p_nextorph = p->p_nextorph;
1237 	}
1238 
1239 	proc_detach(p);
1240 	pid_exit(p);	/* frees pid and proc structure */
1241 }
1242 
1243 /*
1244  * Delete process "child" from the newstate list of process "parent"
1245  */
1246 void
1247 delete_ns(proc_t *parent, proc_t *child)
1248 {
1249 	proc_t **ns;
1250 
1251 	ASSERT(MUTEX_HELD(&pidlock));
1252 	ASSERT(child->p_parent == parent);
1253 	for (ns = &parent->p_child_ns; *ns != NULL; ns = &(*ns)->p_sibling_ns) {
1254 		if (*ns == child) {
1255 
1256 			ASSERT((*ns)->p_parent == parent);
1257 
1258 			*ns = child->p_sibling_ns;
1259 			child->p_sibling_ns = NULL;
1260 			return;
1261 		}
1262 	}
1263 }
1264 
1265 /*
1266  * Add process "child" to the new state list of process "parent"
1267  */
1268 void
1269 add_ns(proc_t *parent, proc_t *child)
1270 {
1271 	ASSERT(child->p_sibling_ns == NULL);
1272 	child->p_sibling_ns = parent->p_child_ns;
1273 	parent->p_child_ns = child;
1274 }
1275