xref: /illumos-gate/usr/src/uts/common/os/exec.c (revision 80ab886d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*	Copyright (c) 1988 AT&T	*/
29 /*	  All Rights Reserved  	*/
30 
31 
32 #include <sys/types.h>
33 #include <sys/param.h>
34 #include <sys/sysmacros.h>
35 #include <sys/systm.h>
36 #include <sys/signal.h>
37 #include <sys/cred_impl.h>
38 #include <sys/policy.h>
39 #include <sys/user.h>
40 #include <sys/errno.h>
41 #include <sys/file.h>
42 #include <sys/vfs.h>
43 #include <sys/vnode.h>
44 #include <sys/mman.h>
45 #include <sys/acct.h>
46 #include <sys/cpuvar.h>
47 #include <sys/proc.h>
48 #include <sys/cmn_err.h>
49 #include <sys/debug.h>
50 #include <sys/pathname.h>
51 #include <sys/vm.h>
52 #include <sys/vtrace.h>
53 #include <sys/exec.h>
54 #include <sys/exechdr.h>
55 #include <sys/kmem.h>
56 #include <sys/prsystm.h>
57 #include <sys/modctl.h>
58 #include <sys/vmparam.h>
59 #include <sys/schedctl.h>
60 #include <sys/utrap.h>
61 #include <sys/systeminfo.h>
62 #include <sys/stack.h>
63 #include <sys/rctl.h>
64 #include <sys/dtrace.h>
65 #include <sys/lwpchan_impl.h>
66 #include <sys/pool.h>
67 #include <sys/sdt.h>
68 
69 #include <c2/audit.h>
70 
71 #include <vm/hat.h>
72 #include <vm/anon.h>
73 #include <vm/as.h>
74 #include <vm/seg.h>
75 #include <vm/seg_vn.h>
76 
/*
 * Flag bits computed by execsetid() and interpreted by gexec() to decide
 * which adjustments the new credential needs.
 */
#define	PRIV_RESET		0x01	/* needs to reset privs */
#define	PRIV_SETID		0x02	/* needs to change uids */
#define	PRIV_SETUGID		0x04	/* is setuid/setgid/forced privs */
#define	PRIV_INCREASE		0x08	/* child runs with more privs */
#define	MAC_FLAGS		0x10	/* need to adjust MAC flags */

static int execsetid(struct vnode *, struct vattr *, uid_t *, uid_t *);
static int hold_execsw(struct execsw *);

uint_t auxv_hwcap = 0;	/* auxv AT_SUN_HWCAP value; determined on the fly */
#if defined(_SYSCALL32_IMPL)
uint_t auxv_hwcap32 = 0;	/* 32-bit version of auxv_hwcap */
#endif

/*
 * NOTE(review): not referenced in the visible part of this file;
 * presumably a tunable that disables large-page use for exec'ed
 * images -- confirm against its consumers.
 */
int exec_lpg_disable = 0;

/*
 * Process flags that gexec() clears on entry at level 0 and then either
 * restores (on failure) or recomputes (on success).
 */
#define	PSUIDFLAGS		(SNOCD|SUGID)
94 
95 /*
96  * exec() - wrapper around exece providing NULL environment pointer
97  */
98 int
99 exec(const char *fname, const char **argp)
100 {
101 	return (exece(fname, argp, NULL));
102 }
103 
/*
 * exece() - system call wrapper around exec_common().
 *
 * A non-zero result from exec_common() is passed through set_errno()
 * and that value is returned; otherwise 0 is returned.
 */
int
exece(const char *fname, const char **argp, const char **envp)
{
	int err = exec_common(fname, argp, envp);

	if (err != 0)
		return (set_errno(err));
	return (0);
}
115 
/*
 * exec_common() - common implementation behind exec()/exece().
 *
 * fname, argp and envp are the caller-supplied path name, argument vector
 * and environment vector.  fname is read with UIO_USERSPACE; all three are
 * passed on to gexec() unchanged inside a struct execa.
 *
 * Outline:
 *   1. Refuse the /proc agent lwp (ENOTSUP) and apply secpolicy_basic_exec().
 *   2. Tell /proc the exec has started and add the ignoredefault signals
 *	to the thread's held set, remembering the previous mask.
 *   3. Resolve the path to a vnode (and, when possible, its directory).
 *   4. Call gexec() to do the real work.
 *   5. On success, reset per-process and per-lwp state inherited from the
 *	old image (context ops, signal state, profiling, close-on-exec
 *	files, lwp directory) and tell /proc the exec has finished.
 *
 * Returns 0 on success, otherwise an errno value.  On every path that
 * reaches the "out" label the saved signal mask is restored and
 * prexecend() is called.
 */
int
exec_common(const char *fname, const char **argp, const char **envp)
{
	vnode_t *vp = NULL, *dir = NULL, *tmpvp = NULL;
	proc_t *p = ttoproc(curthread);
	klwp_t *lwp = ttolwp(curthread);
	struct user *up = PTOU(p);
	long execsz;		/* temporary count of exec size */
	int i;
	int error;
	char exec_file[MAXCOMLEN+1];
	struct pathname pn;
	struct pathname resolvepn;
	struct uarg args;
	struct execa ua;
	k_sigset_t savedmask;
	lwpdir_t *lwpdir = NULL;
	lwpdir_t **tidhash;
	lwpdir_t *old_lwpdir = NULL;
	uint_t old_lwpdir_sz;
	lwpdir_t **old_tidhash;
	uint_t old_tidhash_sz;
	lwpent_t *lep;

	/*
	 * exec() is not supported for the /proc agent lwp.
	 */
	if (curthread == p->p_agenttp)
		return (ENOTSUP);

	if ((error = secpolicy_basic_exec(CRED())) != 0)
		return (error);

	/*
	 * Inform /proc that an exec() has started.
	 * Hold signals that are ignored by default so that we will
	 * not be interrupted by a signal that will be ignored after
	 * successful completion of gexec().
	 */
	mutex_enter(&p->p_lock);
	prexecstart();
	schedctl_finish_sigblock(curthread);
	savedmask = curthread->t_hold;
	sigorset(&curthread->t_hold, &ignoredefault);
	mutex_exit(&p->p_lock);

	/*
	 * Look up path name and remember last component for later.
	 * To help coreadm expand its %d token, we attempt to save
	 * the directory containing the executable in p_execdir. The
	 * first call to lookuppn() may fail and return EINVAL because
	 * dirvpp is non-NULL. In that case, we make a second call to
	 * lookuppn() with dirvpp set to NULL; p_execdir will be NULL,
	 * but coreadm is allowed to expand %d to the empty string and
	 * there are other cases in which that failure may occur.
	 */
	if ((error = pn_get((char *)fname, UIO_USERSPACE, &pn)) != 0)
		goto out;
	pn_alloc(&resolvepn);
	if ((error = lookuppn(&pn, &resolvepn, FOLLOW, &dir, &vp)) != 0) {
		pn_free(&resolvepn);
		pn_free(&pn);
		if (error != EINVAL)
			goto out;

		/* Retry without asking for the containing directory. */
		dir = NULL;
		if ((error = pn_get((char *)fname, UIO_USERSPACE, &pn)) != 0)
			goto out;
		pn_alloc(&resolvepn);
		if ((error = lookuppn(&pn, &resolvepn, FOLLOW, NULLVPP,
		    &vp)) != 0) {
			pn_free(&resolvepn);
			pn_free(&pn);
			goto out;
		}
	}
	if (vp == NULL) {
		if (dir != NULL)
			VN_RELE(dir);
		error = ENOENT;
		pn_free(&resolvepn);
		pn_free(&pn);
		goto out;
	}

	/*
	 * We do not allow executing files in attribute directories.
	 * We test this by determining whether the resolved path
	 * contains a "/" when we're in an attribute directory;
	 * only if the pathname does not contain a "/" the resolved path
	 * points to a file in the current working (attribute) directory.
	 */
	if ((p->p_user.u_cdir->v_flag & V_XATTRDIR) != 0 &&
	    strchr(resolvepn.pn_path, '/') == NULL) {
		if (dir != NULL)
			VN_RELE(dir);
		error = EACCES;
		pn_free(&resolvepn);
		pn_free(&pn);
		VN_RELE(vp);
		goto out;
	}

	/*
	 * exec_file is zero-filled first, so the MAXCOMLEN-byte strncpy()
	 * below always leaves it NUL-terminated.
	 */
	bzero(exec_file, MAXCOMLEN+1);
	(void) strncpy(exec_file, pn.pn_path, MAXCOMLEN);
	bzero(&args, sizeof (args));
	args.pathname = resolvepn.pn_path;
	/* don't free resolvepn until we are done with args */
	pn_free(&pn);

	/*
	 * Specific exec handlers, or policies determined via
	 * /etc/system may override the historical default.
	 */
	args.stk_prot = PROT_ZFOD;
	args.dat_prot = PROT_ZFOD;

	CPU_STATS_ADD_K(sys, sysexec, 1);
	DTRACE_PROC1(exec, char *, args.pathname);

	ua.fname = fname;
	ua.argp = argp;
	ua.envp = envp;

	/*
	 * gexec() failure: drop our holds on the vnode and its directory,
	 * release the resolved path, and leave through "fail" so that the
	 * exec__failure probe fires.
	 */
	if ((error = gexec(&vp, &ua, &args, NULL, 0, &execsz,
	    exec_file, p->p_cred)) != 0) {
		VN_RELE(vp);
		if (dir != NULL)
			VN_RELE(dir);
		pn_free(&resolvepn);
		goto fail;
	}

	/*
	 * Free floating point registers (sun4u only)
	 */
	ASSERT(lwp != NULL);
	lwp_freeregs(lwp, 1);

	/*
	 * Free thread and process context ops.
	 */
	if (curthread->t_ctx)
		freectx(curthread, 1);
	if (p->p_pctx)
		freepctx(p, 1);

	/*
	 * Remember file name for accounting; clear any cached DTrace predicate.
	 */
	up->u_acflag &= ~AFORK;
	bcopy(exec_file, up->u_comm, MAXCOMLEN+1);
	curthread->t_predcache = NULL;

	/*
	 * Clear contract template state
	 */
	lwp_ctmpl_clear(lwp);

	/*
	 * Save the directory in which we found the executable for expanding
	 * the %d token used in core file patterns.
	 */
	mutex_enter(&p->p_lock);
	tmpvp = p->p_execdir;
	p->p_execdir = dir;
	if (p->p_execdir != NULL)
		VN_HOLD(p->p_execdir);
	mutex_exit(&p->p_lock);

	/* The previous p_execdir is released after p_lock is dropped. */
	if (tmpvp != NULL)
		VN_RELE(tmpvp);

	/*
	 * Reset stack state to the user stack, clear set of signals
	 * caught on the signal stack, and reset list of signals that
	 * restart system calls; the new program's environment should
	 * not be affected by detritus from the old program.  Any
	 * pending held signals remain held, so don't clear t_hold.
	 */
	mutex_enter(&p->p_lock);
	lwp->lwp_oldcontext = 0;
	lwp->lwp_ustack = 0;
	lwp->lwp_old_stk_ctl = 0;
	sigemptyset(&up->u_signodefer);
	sigemptyset(&up->u_sigonstack);
	sigemptyset(&up->u_sigresethand);
	lwp->lwp_sigaltstack.ss_sp = 0;
	lwp->lwp_sigaltstack.ss_size = 0;
	lwp->lwp_sigaltstack.ss_flags = SS_DISABLE;

	/*
	 * Make saved resource limit == current resource limit.
	 */
	for (i = 0; i < RLIM_NLIMITS; i++) {
		/*CONSTCOND*/
		if (RLIM_SAVED(i)) {
			(void) rctl_rlimit_get(rctlproc_legacy[i], p,
			    &up->u_saved_rlimit[i]);
		}
	}

	/*
	 * If the action was to catch the signal, then the action
	 * must be reset to SIG_DFL.
	 */
	sigdefault(p);
	p->p_flag &= ~(SNOWAIT|SJCTL);
	p->p_flag |= (SEXECED|SMSACCT|SMSFORK);
	up->u_signal[SIGCLD - 1] = SIG_DFL;

	/*
	 * Delete the dot4 sigqueues/signotifies.
	 */
	sigqfree(p);

	mutex_exit(&p->p_lock);

	/* Clear all profiling state; it is guarded by p_pflock. */
	mutex_enter(&p->p_pflock);
	p->p_prof.pr_base = NULL;
	p->p_prof.pr_size = 0;
	p->p_prof.pr_off = 0;
	p->p_prof.pr_scale = 0;
	p->p_prof.pr_samples = 0;
	mutex_exit(&p->p_pflock);

	ASSERT(curthread->t_schedctl == NULL);

#if defined(__sparc)
	if (p->p_utraps != NULL)
		utrap_free(p);
#endif	/* __sparc */

	/*
	 * Close all close-on-exec files.
	 */
	close_exec(P_FINFO(p));
	TRACE_2(TR_FAC_PROC, TR_PROC_EXEC, "proc_exec:p %p up %p", p, up);
	setregs(&args);

	/* Mark this as an executable vnode */
	mutex_enter(&vp->v_lock);
	vp->v_flag |= VVMEXEC;
	mutex_exit(&vp->v_lock);

	/* Done with the lookup results; args.pathname is dead from here on. */
	VN_RELE(vp);
	if (dir != NULL)
		VN_RELE(dir);
	pn_free(&resolvepn);

	/*
	 * Allocate a new lwp directory and lwpid hash table if necessary.
	 * The KM_SLEEP allocations are made here, before p_lock is taken
	 * below; lwpdir != NULL later signals that they were made.
	 */
	if (curthread->t_tid != 1 || p->p_lwpdir_sz != 2) {
		lwpdir = kmem_zalloc(2 * sizeof (lwpdir_t), KM_SLEEP);
		lwpdir->ld_next = lwpdir + 1;
		tidhash = kmem_zalloc(2 * sizeof (lwpdir_t *), KM_SLEEP);
		if (p->p_lwpdir != NULL)
			lep = p->p_lwpdir[curthread->t_dslot].ld_entry;
		else
			lep = kmem_zalloc(sizeof (*lep), KM_SLEEP);
	}

	mutex_enter(&p->p_lock);
	prbarrier(p);

	/*
	 * Reset lwp id to the default value of 1.
	 * This is a single-threaded process now
	 * and lwp #1 is lwp_wait()able by default.
	 * The t_unpark flag should not be inherited.
	 */
	ASSERT(p->p_lwpcnt == 1 && p->p_zombcnt == 0);
	curthread->t_tid = 1;
	curthread->t_unpark = 0;
	curthread->t_proc_flag |= TP_TWAIT;
	curthread->t_proc_flag &= ~TP_DAEMON;	/* daemons shouldn't exec */
	p->p_lwpdaemon = 0;			/* but oh well ... */
	p->p_lwpid = 1;

	/*
	 * Install the newly-allocated lwp directory and lwpid hash table
	 * and insert the current thread into the new hash table.
	 */
	if (lwpdir != NULL) {
		old_lwpdir = p->p_lwpdir;
		old_lwpdir_sz = p->p_lwpdir_sz;
		old_tidhash = p->p_tidhash;
		old_tidhash_sz = p->p_tidhash_sz;
		p->p_lwpdir = p->p_lwpfree = lwpdir;
		p->p_lwpdir_sz = 2;
		p->p_tidhash = tidhash;
		p->p_tidhash_sz = 2;
		lep->le_thread = curthread;
		lep->le_lwpid = curthread->t_tid;
		lep->le_start = curthread->t_start;
		lwp_hash_in(p, lep);
	}
	/*
	 * Restore the saved signal mask and
	 * inform /proc that the exec() has finished.
	 */
	curthread->t_hold = savedmask;
	prexecend();
	mutex_exit(&p->p_lock);
	/* The replaced tables are freed only after p_lock has been dropped. */
	if (old_lwpdir) {
		kmem_free(old_lwpdir, old_lwpdir_sz * sizeof (lwpdir_t));
		kmem_free(old_tidhash, old_tidhash_sz * sizeof (lwpdir_t *));
	}
	ASSERT(error == 0);
	DTRACE_PROC(exec__success);
	return (0);

	/*
	 * "fail" is reached only when gexec() fails; failures before the
	 * exec probe fired jump straight to "out" and skip exec__failure.
	 */
fail:
	DTRACE_PROC1(exec__failure, int, error);
out:		/* error return */
	mutex_enter(&p->p_lock);
	curthread->t_hold = savedmask;
	prexecend();
	mutex_exit(&p->p_lock);
	ASSERT(error != 0);
	return (error);
}
439 
440 
/*
 * Perform generic exec duties and switchout to object-file specific
 * handler.
 *
 *	vpp		in/out pointer to the executable's vnode; handed to
 *			VOP_OPEN(), after which *vpp is the vnode used.
 *	uap		the exec arguments, passed through to the handler.
 *	args		exec state shared with the handler; execswp and
 *			stk_prot are updated here, and traceinval is read
 *			and possibly set.
 *	idatap		passed through to the handler untouched.
 *	level		0 for the call made by exec_common(); all set-id,
 *			credential and /proc processing is done only at
 *			level 0.  NOTE(review): presumably non-zero when a
 *			handler re-enters gexec() (e.g. for an interpreter)
 *			-- confirm against the handlers.
 *	execsz		passed through to the handler.
 *	exec_file	name used in the "setuid execution not allowed" note
 *			and passed through to the handler.
 *	cred		credential to run with; duplicated and modified here
 *			when execsetid() reports that changes are required.
 *
 * Returns 0 on success or an errno value.  A failure that leaves error
 * at 0 (short read of the magic bytes, or no matching/loaded handler)
 * is reported as ENOEXEC.
 */
int
gexec(
	struct vnode **vpp,
	struct execa *uap,
	struct uarg *args,
	struct intpdata *idatap,
	int level,
	long *execsz,
	caddr_t exec_file,
	struct cred *cred)
{
	struct vnode *vp;
	proc_t *pp = ttoproc(curthread);
	struct execsw *eswp;
	int error = 0;
	int suidflags = 0;
	ssize_t resid;
	uid_t uid, gid;
	struct vattr vattr;
	char magbuf[MAGIC_BYTES];
	int setid;
	cred_t *oldcred, *newcred = NULL;
	int privflags = 0;
	int setidfl;

	/*
	 * If the SNOCD or SUGID flag is set, turn it off and remember the
	 * previous setting so we can restore it if we encounter an error.
	 */
	if (level == 0 && (pp->p_flag & PSUIDFLAGS)) {
		mutex_enter(&pp->p_lock);
		suidflags = pp->p_flag & PSUIDFLAGS;
		pp->p_flag &= ~PSUIDFLAGS;
		mutex_exit(&pp->p_lock);
	}

	if ((error = execpermissions(*vpp, &vattr, args)) != 0)
		goto bad;

	/* need to open vnode for stateful file systems like rfs */
	if ((error = VOP_OPEN(vpp, FREAD, CRED())) != 0)
		goto bad;
	vp = *vpp;

	/*
	 * Note: to support binary compatibility with SunOS a.out
	 * executables, we read in the first four bytes, as the
	 * magic number is in bytes 2-3.
	 */
	if (error = vn_rdwr(UIO_READ, vp, magbuf, sizeof (magbuf),
	    (offset_t)0, UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), &resid))
		goto bad;
	/* A short read leaves error at 0; "bad" turns that into ENOEXEC. */
	if (resid != 0)
		goto bad;

	/*
	 * On success findexec_by_hdr() returns with eswp->exec_lock held
	 * as reader (taken in hold_execsw()); it is dropped below, right
	 * after the handler has been called.
	 */
	if ((eswp = findexec_by_hdr(magbuf)) == NULL)
		goto bad;

	if (level == 0 &&
	    (privflags = execsetid(vp, &vattr, &uid, &gid)) != 0) {

		/*
		 * Work on a private copy; newcred remembers that we own it
		 * so it can be freed on failure or installed on success.
		 */
		newcred = cred = crdup(cred);

		/* If we can, drop the PA bit */
		if ((privflags & PRIV_RESET) != 0)
			priv_adjust_PA(cred);

		/* uid and gid are only valid when PRIV_SETID is set. */
		if (privflags & PRIV_SETID) {
			cred->cr_uid = uid;
			cred->cr_gid = gid;
			cred->cr_suid = uid;
			cred->cr_sgid = gid;
		}

		/*
		 * NET_MAC_AWARE survives the exec only if
		 * NET_MAC_AWARE_INHERIT was set; the inherit flag itself
		 * is always cleared.
		 */
		if (privflags & MAC_FLAGS) {
			if (!(CR_FLAGS(cred) & NET_MAC_AWARE_INHERIT))
				CR_FLAGS(cred) &= ~NET_MAC_AWARE;
			CR_FLAGS(cred) &= ~NET_MAC_AWARE_INHERIT;
		}

		/*
		 * Implement the privilege updates:
		 *
		 * Restrict with L:
		 *
		 *	I' = I & L
		 *
		 *	E' = P' = (I' + F) & A
		 *
		 * But if running under ptrace, we cap I with P.
		 */
		if ((privflags & PRIV_RESET) != 0) {
			if ((privflags & PRIV_INCREASE) != 0 &&
			    (pp->p_proc_flag & P_PR_PTRACE) != 0)
				priv_intersect(&CR_OPPRIV(cred),
						    &CR_IPRIV(cred));
			priv_intersect(&CR_LPRIV(cred), &CR_IPRIV(cred));
			CR_EPRIV(cred) = CR_PPRIV(cred) = CR_IPRIV(cred);
			priv_adjust_PA(cred);
		}
	}

	/* SunOS 4.x buy-back */
	if ((vp->v_vfsp->vfs_flag & VFS_NOSETUID) &&
	    (vattr.va_mode & (VSUID|VSGID))) {
		cmn_err(CE_NOTE,
		    "!%s, uid %d: setuid execution not allowed, dev=%lx",
		    exec_file, cred->cr_uid, vp->v_vfsp->vfs_dev);
	}

	/*
	 * execsetid() told us whether or not we had to change the
	 * credentials of the process.  In privflags, it told us
	 * whether we gained any privileges or executed a set-uid executable.
	 */
	setid = (privflags & (PRIV_SETUGID|PRIV_INCREASE));

	/*
	 * Use /etc/system variable to determine if the stack
	 * should be marked as executable by default.
	 */
	if (noexec_user_stack)
		args->stk_prot &= ~PROT_EXEC;

	args->execswp = eswp; /* Save execsw pointer in uarg for exec_func */

	/*
	 * Traditionally, the setid flags told the sub processes whether
	 * the file just executed was set-uid or set-gid; this caused
	 * some confusion as the 'setid' flag did not match the SUGID
	 * process flag which is only set when the uids/gids do not match.
	 * A script set-gid/set-uid to the real uid/gid would start with
	 * /dev/fd/X but an executable would happily trust LD_LIBRARY_PATH.
	 * Now we flag those cases where the calling process cannot
	 * be trusted to influence the newly exec'ed process, either
	 * because it runs with more privileges or when the uids/gids
	 * do in fact not match.
	 * This also makes the runtime linker agree with the on exec
	 * values of SNOCD and SUGID.
	 */
	setidfl = 0;
	if (cred->cr_uid != cred->cr_ruid || (cred->cr_rgid != cred->cr_gid &&
	    !supgroupmember(cred->cr_gid, cred))) {
		setidfl |= EXECSETID_UGIDS;
	}
	if (setid & PRIV_SETUGID)
		setidfl |= EXECSETID_SETID;
	if (setid & PRIV_INCREASE)
		setidfl |= EXECSETID_PRIVS;

	error = (*eswp->exec_func)(vp, uap, args, idatap, level, execsz,
		setidfl, exec_file, cred);
	/* Drop the reader hold on the handler taken by findexec_by_hdr(). */
	rw_exit(eswp->exec_lock);
	if (error != 0) {
		if (newcred != NULL)
			crfree(newcred);
		goto bad;
	}

	if (level == 0) {
		mutex_enter(&pp->p_crlock);
		if (newcred != NULL) {
			/*
			 * Free the old credentials, and set the new ones.
			 * Do this for both the process and the (single) thread.
			 */
			crfree(pp->p_cred);
			pp->p_cred = cred;	/* cred already held for proc */
			crhold(cred);		/* hold new cred for thread */
			/*
			 * DTrace accesses t_cred in probe context.  t_cred
			 * must always be either NULL, or point to a valid,
			 * allocated cred structure.
			 */
			oldcred = curthread->t_cred;
			curthread->t_cred = cred;
			crfree(oldcred);
		}
		/*
		 * On emerging from a successful exec(), the saved
		 * uid and gid equal the effective uid and gid.
		 */
		cred->cr_suid = cred->cr_uid;
		cred->cr_sgid = cred->cr_gid;

		/*
		 * If the real and effective ids do not match, this
		 * is a setuid process that should not dump core.
		 * The group comparison is tricky; we prevent the code
		 * from flagging SNOCD when executing with an effective gid
		 * which is a supplementary group.
		 */
		if (cred->cr_ruid != cred->cr_uid ||
		    (cred->cr_rgid != cred->cr_gid &&
		    !supgroupmember(cred->cr_gid, cred)) ||
		    (privflags & PRIV_INCREASE) != 0)
			suidflags = PSUIDFLAGS;
		else
			suidflags = 0;

		mutex_exit(&pp->p_crlock);
		if (suidflags) {
			mutex_enter(&pp->p_lock);
			pp->p_flag |= suidflags;
			mutex_exit(&pp->p_lock);
		}
		if (setid && (pp->p_proc_flag & P_PR_PTRACE) == 0) {
			/*
			 * If process is traced via /proc, arrange to
			 * invalidate the associated /proc vnode.
			 */
			if (pp->p_plist || (pp->p_proc_flag & P_PR_TRACE))
				args->traceinval = 1;
		}
		if (pp->p_proc_flag & P_PR_PTRACE)
			psignal(pp, SIGTRAP);
		if (args->traceinval)
			prinvalidate(&pp->p_user);
	}

	return (0);
bad:
	/* Failures that did not set an error code are "not executable". */
	if (error == 0)
		error = ENOEXEC;

	/* Put back the SNOCD/SUGID flags that were cleared on entry. */
	if (suidflags) {
		mutex_enter(&pp->p_lock);
		pp->p_flag |= suidflags;
		mutex_exit(&pp->p_lock);
	}
	return (error);
}
677 
678 extern char *execswnames[];
679 
680 struct execsw *
681 allocate_execsw(char *name, char *magic, size_t magic_size)
682 {
683 	int i, j;
684 	char *ename;
685 	char *magicp;
686 
687 	mutex_enter(&execsw_lock);
688 	for (i = 0; i < nexectype; i++) {
689 		if (execswnames[i] == NULL) {
690 			ename = kmem_alloc(strlen(name) + 1, KM_SLEEP);
691 			(void) strcpy(ename, name);
692 			execswnames[i] = ename;
693 			/*
694 			 * Set the magic number last so that we
695 			 * don't need to hold the execsw_lock in
696 			 * findexectype().
697 			 */
698 			magicp = kmem_alloc(magic_size, KM_SLEEP);
699 			for (j = 0; j < magic_size; j++)
700 				magicp[j] = magic[j];
701 			execsw[i].exec_magic = magicp;
702 			mutex_exit(&execsw_lock);
703 			return (&execsw[i]);
704 		}
705 	}
706 	mutex_exit(&execsw_lock);
707 	return (NULL);
708 }
709 
710 /*
711  * Find the exec switch table entry with the corresponding magic string.
712  */
713 struct execsw *
714 findexecsw(char *magic)
715 {
716 	struct execsw *eswp;
717 
718 	for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
719 		ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
720 		if (magic && eswp->exec_maglen != 0 &&
721 		    bcmp(magic, eswp->exec_magic, eswp->exec_maglen) == 0)
722 			return (eswp);
723 	}
724 	return (NULL);
725 }
726 
727 /*
728  * Find the execsw[] index for the given exec header string by looking for the
729  * magic string at a specified offset and length for each kind of executable
730  * file format until one matches.  If no execsw[] entry is found, try to
731  * autoload a module for this magic string.
732  */
733 struct execsw *
734 findexec_by_hdr(char *header)
735 {
736 	struct execsw *eswp;
737 
738 	for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
739 		ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
740 		if (header && eswp->exec_maglen != 0 &&
741 		    bcmp(&header[eswp->exec_magoff], eswp->exec_magic,
742 			    eswp->exec_maglen) == 0) {
743 			if (hold_execsw(eswp) != 0)
744 				return (NULL);
745 			return (eswp);
746 		}
747 	}
748 	return (NULL);	/* couldn't find the type */
749 }
750 
751 /*
752  * Find the execsw[] index for the given magic string.  If no execsw[] entry
753  * is found, try to autoload a module for this magic string.
754  */
755 struct execsw *
756 findexec_by_magic(char *magic)
757 {
758 	struct execsw *eswp;
759 
760 	for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
761 		ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
762 		if (magic && eswp->exec_maglen != 0 &&
763 		    bcmp(magic, eswp->exec_magic, eswp->exec_maglen) == 0) {
764 			if (hold_execsw(eswp) != 0)
765 				return (NULL);
766 			return (eswp);
767 		}
768 	}
769 	return (NULL);	/* couldn't find the type */
770 }
771 
772 static int
773 hold_execsw(struct execsw *eswp)
774 {
775 	char *name;
776 
777 	rw_enter(eswp->exec_lock, RW_READER);
778 	while (!LOADED_EXEC(eswp)) {
779 		rw_exit(eswp->exec_lock);
780 		name = execswnames[eswp-execsw];
781 		ASSERT(name);
782 		if (modload("exec", name) == -1)
783 			return (-1);
784 		rw_enter(eswp->exec_lock, RW_READER);
785 	}
786 	return (0);
787 }
788 
789 static int
790 execsetid(struct vnode *vp, struct vattr *vattrp, uid_t *uidp, uid_t *gidp)
791 {
792 	proc_t *pp = ttoproc(curthread);
793 	uid_t uid, gid;
794 	cred_t *cr = pp->p_cred;
795 	int privflags = 0;
796 
797 	/*
798 	 * Remember credentials.
799 	 */
800 	uid = cr->cr_uid;
801 	gid = cr->cr_gid;
802 
803 	/* Will try to reset the PRIV_AWARE bit later. */
804 	if ((CR_FLAGS(cr) & (PRIV_AWARE|PRIV_AWARE_INHERIT)) == PRIV_AWARE)
805 		privflags |= PRIV_RESET;
806 
807 	if ((vp->v_vfsp->vfs_flag & VFS_NOSETUID) == 0) {
808 		/*
809 		 * Set-uid root execution only allowed if the limit set
810 		 * holds all unsafe privileges.
811 		 */
812 		if ((vattrp->va_mode & VSUID) && (vattrp->va_uid != 0 ||
813 		    priv_issubset(&priv_unsafe, &CR_LPRIV(cr)))) {
814 			uid = vattrp->va_uid;
815 			privflags |= PRIV_SETUGID;
816 		}
817 		if (vattrp->va_mode & VSGID) {
818 			gid = vattrp->va_gid;
819 			privflags |= PRIV_SETUGID;
820 		}
821 	}
822 
823 	/*
824 	 * Do we need to change our credential anyway?
825 	 * This is the case when E != I or P != I, as
826 	 * we need to do the assignments (with F empty and A full)
827 	 * Or when I is not a subset of L; in that case we need to
828 	 * enforce L.
829 	 *
830 	 *		I' = L & I
831 	 *
832 	 *		E' = P' = (I' + F) & A
833 	 * or
834 	 *		E' = P' = I'
835 	 */
836 	if (!priv_isequalset(&CR_EPRIV(cr), &CR_IPRIV(cr)) ||
837 	    !priv_issubset(&CR_IPRIV(cr), &CR_LPRIV(cr)) ||
838 	    !priv_isequalset(&CR_PPRIV(cr), &CR_IPRIV(cr)))
839 		privflags |= PRIV_RESET;
840 
841 	/* If MAC-aware flag(s) are on, need to update cred to remove. */
842 	if ((CR_FLAGS(cr) & NET_MAC_AWARE) ||
843 	    (CR_FLAGS(cr) & NET_MAC_AWARE_INHERIT))
844 		privflags |= MAC_FLAGS;
845 
846 	/*
847 	 * When we introduce the "forced" set then we will need
848 	 * to set PRIV_INCREASE here if I not a subset of P.
849 	 * If the "allowed" set is introduced we will need to do
850 	 * a similar thing; however, it seems more reasonable to
851 	 * have the allowed set reduce "L": script language interpreters
852 	 * would typically have an allowed set of "all".
853 	 */
854 
855 	/*
856 	 * Set setuid/setgid protections if no ptrace() compatibility.
857 	 * For privileged processes, honor setuid/setgid even in
858 	 * the presence of ptrace() compatibility.
859 	 */
860 	if (((pp->p_proc_flag & P_PR_PTRACE) == 0 ||
861 	    PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, (uid == 0))) &&
862 	    (cr->cr_uid != uid ||
863 	    cr->cr_gid != gid ||
864 	    cr->cr_suid != uid ||
865 	    cr->cr_sgid != gid)) {
866 		*uidp = uid;
867 		*gidp = gid;
868 		privflags |= PRIV_SETID;
869 	}
870 	return (privflags);
871 }
872 
873 int
874 execpermissions(struct vnode *vp, struct vattr *vattrp, struct uarg *args)
875 {
876 	int error;
877 	proc_t *p = ttoproc(curthread);
878 
879 	vattrp->va_mask = AT_MODE | AT_UID | AT_GID | AT_SIZE;
880 	if (error = VOP_GETATTR(vp, vattrp, ATTR_EXEC, p->p_cred))
881 		return (error);
882 	/*
883 	 * Check the access mode.
884 	 * If VPROC, ask /proc if the file is an object file.
885 	 */
886 	if ((error = VOP_ACCESS(vp, VEXEC, 0, p->p_cred)) != 0 ||
887 	    !(vp->v_type == VREG || (vp->v_type == VPROC && pr_isobject(vp))) ||
888 	    (vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0 ||
889 	    (vattrp->va_mode & (VEXEC|(VEXEC>>3)|(VEXEC>>6))) == 0) {
890 		if (error == 0)
891 			error = EACCES;
892 		return (error);
893 	}
894 
895 	if ((p->p_plist || (p->p_proc_flag & (P_PR_PTRACE|P_PR_TRACE))) &&
896 	    (error = VOP_ACCESS(vp, VREAD, 0, p->p_cred))) {
897 		/*
898 		 * If process is under ptrace(2) compatibility,
899 		 * fail the exec(2).
900 		 */
901 		if (p->p_proc_flag & P_PR_PTRACE)
902 			goto bad;
903 		/*
904 		 * Process is traced via /proc.
905 		 * Arrange to invalidate the /proc vnode.
906 		 */
907 		args->traceinval = 1;
908 	}
909 	return (0);
910 bad:
911 	if (error == 0)
912 		error = ENOEXEC;
913 	return (error);
914 }
915 
/*
 * Map a section of an executable file into the user's
 * address space.
 *
 *	vp	vnode of the executable.
 *	addr	user address at which the section starts; rounded down to
 *		a page boundary for the mapping itself.
 *	len	number of file-backed bytes; 0 skips the file-backed part.
 *	zfodlen	number of zero-filled bytes that follow the file-backed
 *		part; 0 skips the zero-fill part.
 *	offset	file offset of the section; rounded down to a page
 *		boundary for the mapping itself.
 *	prot	protections for the resulting mappings.
 *	page	non-zero: map the file directly with VOP_MAP();
 *		zero: create anonymous memory and read the bytes into it.
 *	szc	stored in the segvn creation arguments of the zero-fill
 *		mapping.  NOTE(review): presumably a page size code --
 *		confirm.
 *
 * Returns 0 on success or an errno value: ENOMEM if a range fails
 * valid_usr_range(), EFAULT if zeroing the tail of the last file page
 * faults, or whatever VOP_MAP(), as_map() or vn_rdwr() returned.
 * Mappings already established are not undone here on failure.
 */
int
execmap(struct vnode *vp, caddr_t addr, size_t len, size_t zfodlen,
    off_t offset, int prot, int page, uint_t szc)
{
	int error = 0;
	off_t oldoffset;
	caddr_t zfodbase, oldaddr;
	size_t end, oldlen;
	size_t zfoddiff;
	label_t ljb;
	proc_t *p = ttoproc(curthread);

	/*
	 * Keep the caller's exact address, then page-align addr.  The
	 * "old" values are what the read-in path below uses; they are
	 * only set (and only used) when len is non-zero.
	 */
	oldaddr = addr;
	addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	if (len) {
		oldlen = len;
		/* Grow len by the bytes added in front by the alignment. */
		len += ((size_t)oldaddr - (size_t)addr);
		oldoffset = offset;
		offset = (off_t)((uintptr_t)offset & PAGEMASK);
		if (page) {
			spgcnt_t  prefltmem, availm, npages;
			int preread;
			uint_t mflag = MAP_PRIVATE | MAP_FIXED;

			/* Executable and not writable is text; else data. */
			if ((prot & (PROT_WRITE | PROT_EXEC)) == PROT_EXEC) {
				mflag |= MAP_TEXT;
			} else {
				mflag |= MAP_INITDATA;
			}

			if (valid_usr_range(addr, len, prot, p->p_as,
			    p->p_as->a_userlimit) != RANGE_OKAY) {
				error = ENOMEM;
				goto bad;
			}
			if (error = VOP_MAP(vp, (offset_t)offset,
			    p->p_as, &addr, len, prot, PROT_ALL,
			    mflag, CRED()))
				goto bad;

			/*
			 * If the segment can fit, then we prefault
			 * the entire segment in.  This is based on the
			 * model that says the best working set of a
			 * small program is all of its pages.
			 */
			npages = (spgcnt_t)btopr(len);
			prefltmem = freemem - desfree;
			preread =
			    (npages < prefltmem && len < PGTHRESH) ? 1 : 0;

			/*
			 * If we aren't prefaulting the segment,
			 * increment "deficit", if necessary to ensure
			 * that pages will become available when this
			 * process starts executing.
			 */
			availm = freemem - lotsfree;
			if (preread == 0 && npages > availm &&
			    deficit < lotsfree) {
				deficit += MIN((pgcnt_t)(npages - availm),
				    lotsfree - deficit);
			}

			/* Prefault is best effort; its result is ignored. */
			if (preread) {
				TRACE_2(TR_FAC_PROC, TR_EXECMAP_PREREAD,
				    "execmap preread:freemem %d size %lu",
				    freemem, len);
				(void) as_fault(p->p_as->a_hat, p->p_as,
				    (caddr_t)addr, len, F_INVAL, S_READ);
			}
		} else {
			if (valid_usr_range(addr, len, prot, p->p_as,
			    p->p_as->a_userlimit) != RANGE_OKAY) {
				error = ENOMEM;
				goto bad;
			}

			if (error = as_map(p->p_as, addr, len,
			    segvn_create, zfod_argsp))
				goto bad;
			/*
			 * Read in the segment in one big chunk.
			 * The read targets the caller's unaligned address,
			 * length and offset, not the page-aligned ones.
			 */
			if (error = vn_rdwr(UIO_READ, vp, (caddr_t)oldaddr,
			    oldlen, (offset_t)oldoffset, UIO_USERSPACE, 0,
			    (rlim64_t)0, CRED(), (ssize_t *)0))
				goto bad;
			/*
			 * Now set protections.
			 */
			if (prot != PROT_ZFOD) {
				(void) as_setprot(p->p_as, (caddr_t)addr,
				    len, prot);
			}
		}
	}

	if (zfodlen) {
		/*
		 * end is the first byte past the file-backed part;
		 * zfoddiff is the remainder of that page, which is
		 * zeroed in place rather than mapped separately.
		 */
		end = (size_t)addr + len;
		zfodbase = (caddr_t)roundup(end, PAGESIZE);
		zfoddiff = (uintptr_t)zfodbase - end;
		if (zfoddiff) {
			/* A fault while zeroing user memory gives EFAULT. */
			if (on_fault(&ljb)) {
				no_fault();
				error = EFAULT;
				goto bad;
			}
			uzero((void *)end, zfoddiff);
			no_fault();
		}
		/* Anything beyond that page gets a fresh zero-fill mapping. */
		if (zfodlen > zfoddiff) {
			struct segvn_crargs crargs =
			    SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);

			zfodlen -= zfoddiff;
			if (valid_usr_range(zfodbase, zfodlen, prot, p->p_as,
			    p->p_as->a_userlimit) != RANGE_OKAY) {
				error = ENOMEM;
				goto bad;
			}
			crargs.szc = szc;
			if (error = as_map(p->p_as, (caddr_t)zfodbase,
			    zfodlen, segvn_create, &crargs))
				goto bad;
			if (prot != PROT_ZFOD) {
				(void) as_setprot(p->p_as, (caddr_t)zfodbase,
				    zfodlen, prot);
			}
		}
	}
	return (0);
bad:
	return (error);
}
1055 
1056 void
1057 setexecenv(struct execenv *ep)
1058 {
1059 	proc_t *p = ttoproc(curthread);
1060 	klwp_t *lwp = ttolwp(curthread);
1061 	struct vnode *vp;
1062 
1063 	p->p_bssbase = ep->ex_bssbase;
1064 	p->p_brkbase = ep->ex_brkbase;
1065 	p->p_brksize = ep->ex_brksize;
1066 	if (p->p_exec)
1067 		VN_RELE(p->p_exec);	/* out with the old */
1068 	vp = p->p_exec = ep->ex_vp;
1069 	if (vp != NULL)
1070 		VN_HOLD(vp);		/* in with the new */
1071 
1072 	lwp->lwp_sigaltstack.ss_sp = 0;
1073 	lwp->lwp_sigaltstack.ss_size = 0;
1074 	lwp->lwp_sigaltstack.ss_flags = SS_DISABLE;
1075 }
1076 
1077 int
1078 execopen(struct vnode **vpp, int *fdp)
1079 {
1080 	struct vnode *vp = *vpp;
1081 	file_t *fp;
1082 	int error = 0;
1083 	int filemode = FREAD;
1084 
1085 	VN_HOLD(vp);		/* open reference */
1086 	if (error = falloc(NULL, filemode, &fp, fdp)) {
1087 		VN_RELE(vp);
1088 		*fdp = -1;	/* just in case falloc changed value */
1089 		return (error);
1090 	}
1091 	if (error = VOP_OPEN(&vp, filemode, CRED())) {
1092 		VN_RELE(vp);
1093 		setf(*fdp, NULL);
1094 		unfalloc(fp);
1095 		*fdp = -1;
1096 		return (error);
1097 	}
1098 	*vpp = vp;		/* vnode should not have changed */
1099 	fp->f_vnode = vp;
1100 	mutex_exit(&fp->f_tlock);
1101 	setf(*fdp, fp);
1102 	return (0);
1103 }
1104 
1105 int
1106 execclose(int fd)
1107 {
1108 	return (closeandsetf(fd, NULL));
1109 }
1110 
1111 
1112 /*
1113  * noexec stub function.
1114  */
1115 /*ARGSUSED*/
1116 int
1117 noexec(
1118     struct vnode *vp,
1119     struct execa *uap,
1120     struct uarg *args,
1121     struct intpdata *idatap,
1122     int level,
1123     long *execsz,
1124     int setid,
1125     caddr_t exec_file,
1126     struct cred *cred)
1127 {
1128 	cmn_err(CE_WARN, "missing exec capability for %s", uap->fname);
1129 	return (ENOEXEC);
1130 }
1131 
1132 /*
1133  * Support routines for building a user stack.
1134  *
1135  * execve(path, argv, envp) must construct a new stack with the specified
1136  * arguments and environment variables (see exec_args() for a description
1137  * of the user stack layout).  To do this, we copy the arguments and
1138  * environment variables from the old user address space into the kernel,
1139  * free the old as, create the new as, and copy our buffered information
1140  * to the new stack.  Our kernel buffer has the following structure:
1141  *
1142  *	+-----------------------+ <--- stk_base + stk_size
1143  *	| string offsets	|
1144  *	+-----------------------+ <--- stk_offp
1145  *	|			|
1146  *	| STK_AVAIL() space	|
1147  *	|			|
1148  *	+-----------------------+ <--- stk_strp
1149  *	| strings		|
1150  *	+-----------------------+ <--- stk_base
1151  *
1152  * When we add a string, we store the string's contents (including the null
1153  * terminator) at stk_strp, and we store the offset of the string relative to
1154  * stk_base at --stk_offp.  As strings are added, stk_strp increases and
1155  * stk_offp decreases.  The amount of space remaining, STK_AVAIL(), is just
1156  * the difference between these pointers.  If we run out of space, we return
1157  * an error and exec_args() starts all over again with a buffer twice as large.
1158  * When we're all done, the kernel buffer looks like this:
1159  *
1160  *	+-----------------------+ <--- stk_base + stk_size
1161  *	| argv[0] offset	|
1162  *	+-----------------------+
1163  *	| ...			|
1164  *	+-----------------------+
1165  *	| argv[argc-1] offset	|
1166  *	+-----------------------+
1167  *	| envp[0] offset	|
1168  *	+-----------------------+
1169  *	| ...			|
1170  *	+-----------------------+
1171  *	| envp[envc-1] offset	|
1172  *	+-----------------------+
1173  *	| AT_SUN_PLATFORM offset|
1174  *	+-----------------------+
1175  *	| AT_SUN_EXECNAME offset|
1176  *	+-----------------------+ <--- stk_offp
1177  *	|			|
1178  *	| STK_AVAIL() space	|
1179  *	|			|
1180  *	+-----------------------+ <--- stk_strp
1181  *	| AT_SUN_EXECNAME string|
1182  *	+-----------------------+
1183  *	| AT_SUN_PLATFORM string|
1184  *	+-----------------------+
1185  *	| envp[envc-1] string	|
1186  *	+-----------------------+
1187  *	| ...			|
1188  *	+-----------------------+
1189  *	| envp[0] string	|
1190  *	+-----------------------+
1191  *	| argv[argc-1] string	|
1192  *	+-----------------------+
1193  *	| ...			|
1194  *	+-----------------------+
1195  *	| argv[0] string	|
1196  *	+-----------------------+ <--- stk_base
1197  */
1198 
/*
 * Number of bytes between the next free string byte (stk_strp) and the
 * lowest string-offset entry (stk_offp) of the staging buffer described
 * by args.  The result has pointer-difference type.
 */
1199 #define	STK_AVAIL(args)		((char *)(args)->stk_offp - (args)->stk_strp)
1200 
1201 /*
1202  * Add a string to the stack.
1203  */
1204 static int
1205 stk_add(uarg_t *args, const char *sp, enum uio_seg segflg)
1206 {
1207 	int error;
1208 	size_t len;
1209 
1210 	if (STK_AVAIL(args) < sizeof (int))
1211 		return (E2BIG);
1212 	*--args->stk_offp = args->stk_strp - args->stk_base;
1213 
1214 	if (segflg == UIO_USERSPACE) {
1215 		error = copyinstr(sp, args->stk_strp, STK_AVAIL(args), &len);
1216 		if (error != 0)
1217 			return (error);
1218 	} else {
1219 		len = strlen(sp) + 1;
1220 		if (len > STK_AVAIL(args))
1221 			return (E2BIG);
1222 		bcopy(sp, args->stk_strp, len);
1223 	}
1224 
1225 	args->stk_strp += len;
1226 
1227 	return (0);
1228 }
1229 
1230 static int
1231 stk_getptr(uarg_t *args, char *src, char **dst)
1232 {
1233 	int error;
1234 
1235 	if (args->from_model == DATAMODEL_NATIVE) {
1236 		ulong_t ptr;
1237 		error = fulword(src, &ptr);
1238 		*dst = (caddr_t)ptr;
1239 	} else {
1240 		uint32_t ptr;
1241 		error = fuword32(src, &ptr);
1242 		*dst = (caddr_t)(uintptr_t)ptr;
1243 	}
1244 	return (error);
1245 }
1246 
1247 static int
1248 stk_putptr(uarg_t *args, char *addr, char *value)
1249 {
1250 	if (args->to_model == DATAMODEL_NATIVE)
1251 		return (sulword(addr, (ulong_t)value));
1252 	else
1253 		return (suword32(addr, (uint32_t)(uintptr_t)value));
1254 }
1255 
1256 static int
1257 stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
1258 {
1259 	char *sp;
1260 	int argc, error;
1261 	int argv_empty = 0;
1262 	size_t ptrsize = args->from_ptrsize;
1263 	size_t size, pad;
1264 	char *argv = (char *)uap->argp;
1265 	char *envp = (char *)uap->envp;
1266 
1267 	/*
1268 	 * Copy interpreter's name and argument to argv[0] and argv[1].
1269 	 */
1270 	if (intp != NULL && intp->intp_name != NULL) {
1271 		if ((error = stk_add(args, intp->intp_name, UIO_SYSSPACE)) != 0)
1272 			return (error);
1273 		if (intp->intp_arg != NULL &&
1274 		    (error = stk_add(args, intp->intp_arg, UIO_SYSSPACE)) != 0)
1275 			return (error);
1276 		if (args->fname != NULL)
1277 			error = stk_add(args, args->fname, UIO_SYSSPACE);
1278 		else
1279 			error = stk_add(args, uap->fname, UIO_USERSPACE);
1280 		if (error)
1281 			return (error);
1282 
1283 		/*
1284 		 * Check for an empty argv[].
1285 		 */
1286 		if (stk_getptr(args, argv, &sp))
1287 			return (EFAULT);
1288 		if (sp == NULL)
1289 			argv_empty = 1;
1290 
1291 		argv += ptrsize;		/* ignore original argv[0] */
1292 	}
1293 
1294 	if (argv_empty == 0) {
1295 		/*
1296 		 * Add argv[] strings to the stack.
1297 		 */
1298 		for (;;) {
1299 			if (stk_getptr(args, argv, &sp))
1300 				return (EFAULT);
1301 			if (sp == NULL)
1302 				break;
1303 			if ((error = stk_add(args, sp, UIO_USERSPACE)) != 0)
1304 				return (error);
1305 			argv += ptrsize;
1306 		}
1307 	}
1308 	argc = (int *)(args->stk_base + args->stk_size) - args->stk_offp;
1309 	args->arglen = args->stk_strp - args->stk_base;
1310 
1311 	/*
1312 	 * Add environ[] strings to the stack.
1313 	 */
1314 	if (envp != NULL) {
1315 		for (;;) {
1316 			if (stk_getptr(args, envp, &sp))
1317 				return (EFAULT);
1318 			if (sp == NULL)
1319 				break;
1320 			if ((error = stk_add(args, sp, UIO_USERSPACE)) != 0)
1321 				return (error);
1322 			envp += ptrsize;
1323 		}
1324 	}
1325 	args->na = (int *)(args->stk_base + args->stk_size) - args->stk_offp;
1326 	args->ne = args->na - argc;
1327 
1328 	/*
1329 	 * Add AT_SUN_PLATFORM and AT_SUN_EXECNAME strings to the stack.
1330 	 */
1331 	if (auxvpp != NULL && *auxvpp != NULL) {
1332 		if ((error = stk_add(args, platform, UIO_SYSSPACE)) != 0)
1333 			return (error);
1334 		if ((error = stk_add(args, args->pathname, UIO_SYSSPACE)) != 0)
1335 			return (error);
1336 	}
1337 
1338 	/*
1339 	 * Compute the size of the stack.  This includes all the pointers,
1340 	 * the space reserved for the aux vector, and all the strings.
1341 	 * The total number of pointers is args->na (which is argc + envc)
1342 	 * plus 4 more: (1) a pointer's worth of space for argc; (2) the NULL
1343 	 * after the last argument (i.e. argv[argc]); (3) the NULL after the
1344 	 * last environment variable (i.e. envp[envc]); and (4) the NULL after
1345 	 * all the strings, at the very top of the stack.
1346 	 */
1347 	size = (args->na + 4) * args->to_ptrsize + args->auxsize +
1348 	    (args->stk_strp - args->stk_base);
1349 
1350 	/*
1351 	 * Pad the string section with zeroes to align the stack size.
1352 	 */
1353 	pad = P2NPHASE(size, args->stk_align);
1354 
1355 	if (STK_AVAIL(args) < pad)
1356 		return (E2BIG);
1357 
1358 	args->usrstack_size = size + pad;
1359 
1360 	while (pad-- != 0)
1361 		*args->stk_strp++ = 0;
1362 
1363 	args->nc = args->stk_strp - args->stk_base;
1364 
1365 	return (0);
1366 }
1367 
1368 static int
1369 stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up)
1370 {
1371 	size_t ptrsize = args->to_ptrsize;
1372 	ssize_t pslen;
1373 	char *kstrp = args->stk_base;
1374 	char *ustrp = usrstack - args->nc - ptrsize;
1375 	char *usp = usrstack - args->usrstack_size;
1376 	int *offp = (int *)(args->stk_base + args->stk_size);
1377 	int envc = args->ne;
1378 	int argc = args->na - envc;
1379 	int i;
1380 
1381 	/*
1382 	 * Record argc for /proc.
1383 	 */
1384 	up->u_argc = argc;
1385 
1386 	/*
1387 	 * Put argc on the stack.  Note that even though it's an int,
1388 	 * it always consumes ptrsize bytes (for alignment).
1389 	 */
1390 	if (stk_putptr(args, usp, (char *)(uintptr_t)argc))
1391 		return (-1);
1392 
1393 	/*
1394 	 * Add argc space (ptrsize) to usp and record argv for /proc.
1395 	 */
1396 	up->u_argv = (uintptr_t)(usp += ptrsize);
1397 
1398 	/*
1399 	 * Put the argv[] pointers on the stack.
1400 	 */
1401 	for (i = 0; i < argc; i++, usp += ptrsize)
1402 		if (stk_putptr(args, usp, &ustrp[*--offp]))
1403 			return (-1);
1404 
1405 	/*
1406 	 * Copy arguments to u_psargs.
1407 	 */
1408 	pslen = MIN(args->arglen, PSARGSZ) - 1;
1409 	for (i = 0; i < pslen; i++)
1410 		up->u_psargs[i] = (kstrp[i] == '\0' ? ' ' : kstrp[i]);
1411 	while (i < PSARGSZ)
1412 		up->u_psargs[i++] = '\0';
1413 
1414 	/*
1415 	 * Add space for argv[]'s NULL terminator (ptrsize) to usp and
1416 	 * record envp for /proc.
1417 	 */
1418 	up->u_envp = (uintptr_t)(usp += ptrsize);
1419 
1420 	/*
1421 	 * Put the envp[] pointers on the stack.
1422 	 */
1423 	for (i = 0; i < envc; i++, usp += ptrsize)
1424 		if (stk_putptr(args, usp, &ustrp[*--offp]))
1425 			return (-1);
1426 
1427 	/*
1428 	 * Add space for envp[]'s NULL terminator (ptrsize) to usp and
1429 	 * remember where the stack ends, which is also where auxv begins.
1430 	 */
1431 	args->stackend = usp += ptrsize;
1432 
1433 	/*
1434 	 * Put all the argv[], envp[], and auxv strings on the stack.
1435 	 */
1436 	if (copyout(args->stk_base, ustrp, args->nc))
1437 		return (-1);
1438 
1439 	/*
1440 	 * Fill in the aux vector now that we know the user stack addresses
1441 	 * for the AT_SUN_PLATFORM and AT_SUN_EXECNAME strings.
1442 	 */
1443 	if (auxvpp != NULL && *auxvpp != NULL) {
1444 		if (args->to_model == DATAMODEL_NATIVE) {
1445 			auxv_t **a = (auxv_t **)auxvpp;
1446 			ADDAUX(*a, AT_SUN_PLATFORM, (long)&ustrp[*--offp])
1447 			ADDAUX(*a, AT_SUN_EXECNAME, (long)&ustrp[*--offp])
1448 		} else {
1449 			auxv32_t **a = (auxv32_t **)auxvpp;
1450 			ADDAUX(*a,
1451 			    AT_SUN_PLATFORM, (int)(uintptr_t)&ustrp[*--offp])
1452 			ADDAUX(*a,
1453 			    AT_SUN_EXECNAME, (int)(uintptr_t)&ustrp[*--offp]);
1454 		}
1455 	}
1456 
1457 	return (0);
1458 }
1459 
/*
 * DEBUG-only overrides read by exec_args() when exec_lpg_disable is 0.
 * They replace the page size code chosen for the heap (brk) and for
 * the stack respectively:
 *	 0	leave the computed size code alone
 *	-1	use a value derived from gethrtime(), modulo
 *		page_num_pagesizes()
 *	other	use this value modulo page_num_pagesizes()
 */
1460 #ifdef DEBUG
1461 int mpss_brkpgszsel = 0;
1462 int mpss_stkpgszsel = 0;
1463 #endif
1464 
1465 /*
1466  * Initialize a new user stack with the specified arguments and environment.
1467  * The initial user stack layout is as follows:
1468  *
1469  *	User Stack
1470  *	+---------------+ <--- curproc->p_usrstack
1471  *	| NULL		|
1472  *	+---------------+
1473  *	|		|
1474  *	| auxv strings	|
1475  *	|		|
1476  *	+---------------+
1477  *	|		|
1478  *	| envp strings	|
1479  *	|		|
1480  *	+---------------+
1481  *	|		|
1482  *	| argv strings	|
1483  *	|		|
1484  *	+---------------+ <--- ustrp
1485  *	|		|
1486  *	| aux vector	|
1487  *	|		|
1488  *	+---------------+ <--- auxv
1489  *	| NULL		|
1490  *	+---------------+
1491  *	| envp[envc-1]	|
1492  *	+---------------+
1493  *	| ...		|
1494  *	+---------------+
1495  *	| envp[0]	|
1496  *	+---------------+ <--- envp[]
1497  *	| NULL		|
1498  *	+---------------+
1499  *	| argv[argc-1]	|
1500  *	+---------------+
1501  *	| ...		|
1502  *	+---------------+
1503  *	| argv[0]	|
1504  *	+---------------+ <--- argv[]
1505  *	| argc		|
1506  *	+---------------+ <--- stack base
 *
 * Parameters:
 *	uap	exec arguments; handed to stk_copyin(), which reads argp,
 *		envp and fname from it.
 *	args	exec state.  to_model, execswp, stk_prot, dat_prot,
 *		brkpageszc and stkpageszc are read here; from_model,
 *		from_ptrsize, to_ptrsize, ncargs, stk_align and the
 *		stk_base/stk_size/stk_strp/stk_offp staging-buffer fields
 *		are set here.
 *	intp	interpreter data handed to stk_copyin(); may be NULL.
 *	auxvpp	aux vector cursor handed to stk_copyin() and stk_copyout().
 *
 * Returns 0 on success.  Failures detected before relvm() is called
 * return an errno value: E2BIG when the arguments do not fit within
 * ncargs, otherwise whatever stk_copyin() or exitlwps() reported.
 * After relvm() the only failure is stk_copyout() returning -1, which
 * (see the comment at relvm() below) obliges the caller to SIGKILL the
 * process.  The kernel staging buffer is freed on every return path.
1507  */
1508 int
1509 exec_args(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
1510 {
1511 	size_t size;
1512 	int error;
1513 	proc_t *p = ttoproc(curthread);
1514 	user_t *up = PTOU(p);
1515 	char *usrstack;
1516 	rctl_entity_p_t e;
1517 
1518 	struct as *as;
1519 
	/*
	 * Pointers are read from the old image, so their width follows
	 * the data model of the process as it is now.
	 */
1520 	args->from_model = p->p_model;
1521 	if (p->p_model == DATAMODEL_NATIVE) {
1522 		args->from_ptrsize = sizeof (long);
1523 	} else {
1524 		args->from_ptrsize = sizeof (int32_t);
1525 	}
1526 
	/*
	 * Pointer width, argument size limit, stack alignment and stack
	 * top for the new image all follow the target data model.
	 */
1527 	if (args->to_model == DATAMODEL_NATIVE) {
1528 		args->to_ptrsize = sizeof (long);
1529 		args->ncargs = NCARGS;
1530 		args->stk_align = STACK_ALIGN;
1531 		usrstack = (char *)USRSTACK;
1532 	} else {
1533 		args->to_ptrsize = sizeof (int32_t);
1534 		args->ncargs = NCARGS32;
1535 		args->stk_align = STACK_ALIGN32;
1536 		usrstack = (char *)USRSTACK32;
1537 	}
1538 
1539 	ASSERT(P2PHASE((uintptr_t)usrstack, args->stk_align) == 0);
1540 
1541 #if defined(__sparc)
1542 	/*
1543 	 * Make sure user register windows are empty before
1544 	 * attempting to make a new stack.
1545 	 */
1546 	(void) flush_user_windows_to_stack(NULL);
1547 #endif
1548 
	/*
	 * Stage the arguments in a kernel buffer.  Start with one page
	 * and double the buffer each time stk_copyin() runs out of room
	 * (E2BIG or ENAMETOOLONG); give up with E2BIG once a buffer of at
	 * least ncargs bytes has failed.  Any other error is returned
	 * as is.
	 */
1549 	for (size = PAGESIZE; ; size *= 2) {
1550 		args->stk_size = size;
1551 		args->stk_base = kmem_alloc(size, KM_SLEEP);
1552 		args->stk_strp = args->stk_base;
1553 		args->stk_offp = (int *)(args->stk_base + size);
1554 		error = stk_copyin(uap, args, intp, auxvpp);
1555 		if (error == 0)
1556 			break;
1557 		kmem_free(args->stk_base, size);
1558 		if (error != E2BIG && error != ENAMETOOLONG)
1559 			return (error);
1560 		if (size >= args->ncargs)
1561 			return (E2BIG);
1562 	}
1563 
	/*
	 * From here on, size is the aligned size of the user stack image
	 * computed by stk_copyin(); the staging buffer's own size remains
	 * in args->stk_size.
	 */
1564 	size = args->usrstack_size;
1565 
1566 	ASSERT(error == 0);
1567 	ASSERT(P2PHASE(size, args->stk_align) == 0);
1568 	ASSERT((ssize_t)STK_AVAIL(args) >= 0);
1569 
	/*
	 * The loop above bounds only the staging buffer; the final stack
	 * image must itself fit within ncargs.
	 */
1570 	if (size > args->ncargs) {
1571 		kmem_free(args->stk_base, args->stk_size);
1572 		return (E2BIG);
1573 	}
1574 
1575 	/*
1576 	 * Leave only the current lwp and force the other lwps to exit.
1577 	 * If another lwp beat us to the punch by calling exit(), bail out.
1578 	 */
1579 	if ((error = exitlwps(0)) != 0) {
1580 		kmem_free(args->stk_base, args->stk_size);
1581 		return (error);
1582 	}
1583 
1584 	/*
1585 	 * Revoke any doors created by the process.
1586 	 */
1587 	if (p->p_door_list)
1588 		door_exit();
1589 
1590 	/*
1591 	 * Release schedctl data structures.
1592 	 */
1593 	if (p->p_pagep)
1594 		schedctl_proc_cleanup();
1595 
1596 	/*
1597 	 * Clean up any DTrace helpers for the process.
1598 	 */
1599 	if (p->p_dtrace_helpers != NULL) {
1600 		ASSERT(dtrace_helpers_cleanup != NULL);
1601 		(*dtrace_helpers_cleanup)();
1602 	}
1603 
1604 	mutex_enter(&p->p_lock);
1605 	/*
1606 	 * Cleanup the DTrace provider associated with this process.
1607 	 */
1608 	if (p->p_dtrace_probes) {
1609 		ASSERT(dtrace_fasttrap_exec_ptr != NULL);
1610 		dtrace_fasttrap_exec_ptr(p);
1611 	}
1612 	mutex_exit(&p->p_lock);
1613 
1614 	/*
1615 	 * discard the lwpchan cache.
1616 	 */
1617 	if (p->p_lcp != NULL)
1618 		lwpchan_destroy_cache(1);
1619 
1620 	/*
1621 	 * Delete the POSIX timers.
1622 	 */
1623 	if (p->p_itimer != NULL)
1624 		timer_exit();
1625 
1626 #ifdef C2_AUDIT
	/*
	 * Hand the staged argument strings (from stk_base) and the
	 * environment strings (starting arglen bytes in) to the audit
	 * code, along with the argument and environment counts.
	 */
1627 	if (audit_active)
1628 		audit_exec(args->stk_base, args->stk_base + args->arglen,
1629 		    args->na - args->ne, args->ne);
1630 #endif
1631 
1632 	/*
1633 	 * Ensure that we don't change resource associations while we
1634 	 * change address spaces.
1635 	 */
1636 	mutex_enter(&p->p_lock);
1637 	pool_barrier_enter();
1638 	mutex_exit(&p->p_lock);
1639 
1640 	/*
1641 	 * Destroy the old address space and create a new one.
1642 	 * From here on, any errors are fatal to the exec()ing process.
1643 	 * On error we return -1, which means the caller must SIGKILL
1644 	 * the process.
1645 	 */
1646 	relvm();
1647 
1648 	mutex_enter(&p->p_lock);
1649 	pool_barrier_exit();
1650 	mutex_exit(&p->p_lock);
1651 
1652 	up->u_execsw = args->execswp;
1653 
	/*
	 * Reset the per-process layout fields for the new image: no heap
	 * or stack yet, the target data model, its stack top, and the
	 * stack/data protections supplied in args.
	 */
1654 	p->p_brkbase = NULL;
1655 	p->p_brksize = 0;
1656 	p->p_stksize = 0;
1657 	p->p_model = args->to_model;
1658 	p->p_usrstack = usrstack;
1659 	p->p_stkprot = args->stk_prot;
1660 	p->p_datprot = args->dat_prot;
1661 
1662 	/*
1663 	 * Reset resource controls such that all controls are again active as
1664 	 * well as appropriate to the potentially new address model for the
1665 	 * process.
1666 	 */
1667 	e.rcep_p.proc = p;
1668 	e.rcep_t = RCENTITY_PROCESS;
1669 	rctl_set_reset(p->p_rctls, p, &e);
1670 
	/*
	 * Choose the heap and stack page size codes.  When large pages
	 * are not disabled for exec, take the values supplied in args,
	 * falling back to map_pgsz() for any that are 0, and flag the
	 * process SAUTOLPG; otherwise force both codes to 0.
	 */
1671 	if (exec_lpg_disable == 0) {
1672 #ifdef DEBUG
1673 		uint_t pgsizes = page_num_pagesizes();
1674 		uint_t szc;
1675 #endif
1676 		p->p_brkpageszc = args->brkpageszc;
1677 		p->p_stkpageszc = args->stkpageszc;
1678 
1679 		if (p->p_brkpageszc == 0) {
1680 			p->p_brkpageszc = page_szc(map_pgsz(MAPPGSZ_HEAP,
1681 			    p, 0, 0, NULL));
1682 		}
1683 		if (p->p_stkpageszc == 0) {
1684 			p->p_stkpageszc = page_szc(map_pgsz(MAPPGSZ_STK,
1685 			    p, 0, 0, NULL));
1686 		}
1687 
1688 #ifdef DEBUG
		/*
		 * DEBUG overrides: a non-zero mpss_*pgszsel replaces the
		 * code chosen above, either with a gethrtime()-derived
		 * value (-1) or with the tunable itself, both taken
		 * modulo the number of page sizes.
		 */
1689 		if (mpss_brkpgszsel != 0) {
1690 			if (mpss_brkpgszsel == -1) {
1691 				szc = ((uint_t)gethrtime() >> 8) % pgsizes;
1692 			} else {
1693 				szc = mpss_brkpgszsel % pgsizes;
1694 			}
1695 			p->p_brkpageszc = szc;
1696 		}
1697 
1698 		if (mpss_stkpgszsel != 0) {
1699 			if (mpss_stkpgszsel == -1) {
1700 				szc = ((uint_t)gethrtime() >> 7) % pgsizes;
1701 			} else {
1702 				szc = mpss_stkpgszsel % pgsizes;
1703 			}
1704 			p->p_stkpageszc = szc;
1705 		}
1706 
1707 #endif
1708 		mutex_enter(&p->p_lock);
1709 		p->p_flag |= SAUTOLPG;	/* kernel controls page sizes */
1710 		mutex_exit(&p->p_lock);
1711 
1712 	} else {
1713 		p->p_brkpageszc = 0;
1714 		p->p_stkpageszc = 0;
1715 	}
1716 
	/*
	 * NOTE(review): exec_set_sp() is defined elsewhere; presumably it
	 * sets the lwp's stack pointer for a stack image of size bytes --
	 * confirm.
	 */
1717 	exec_set_sp(size);
1718 
	/*
	 * Attach a fresh address space to the process.  An ILP32 image
	 * has its user limit set to USERLIMIT32.
	 */
1719 	as = as_alloc();
1720 	p->p_as = as;
1721 	if (p->p_model == DATAMODEL_ILP32)
1722 		as->a_userlimit = (caddr_t)USERLIMIT32;
1723 	(void) hat_setup(as->a_hat, HAT_ALLOC);
1724 
1725 	/*
1726 	 * Finally, write out the contents of the new stack.
1727 	 */
1728 	error = stk_copyout(args, usrstack, auxvpp, up);
	/* The staging buffer is released whether or not that succeeded. */
1729 	kmem_free(args->stk_base, args->stk_size);
1730 	return (error);
1731 }
1732