xref: /illumos-gate/usr/src/uts/common/os/exec.c (revision da604a3e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*	Copyright (c) 1988 AT&T	*/
29 /*	  All Rights Reserved  	*/
30 
31 
32 #include <sys/types.h>
33 #include <sys/param.h>
34 #include <sys/sysmacros.h>
35 #include <sys/systm.h>
36 #include <sys/signal.h>
37 #include <sys/cred_impl.h>
38 #include <sys/policy.h>
39 #include <sys/user.h>
40 #include <sys/errno.h>
41 #include <sys/file.h>
42 #include <sys/vfs.h>
43 #include <sys/vnode.h>
44 #include <sys/mman.h>
45 #include <sys/acct.h>
46 #include <sys/cpuvar.h>
47 #include <sys/proc.h>
48 #include <sys/cmn_err.h>
49 #include <sys/debug.h>
50 #include <sys/pathname.h>
51 #include <sys/vm.h>
52 #include <sys/vtrace.h>
53 #include <sys/exec.h>
54 #include <sys/exechdr.h>
55 #include <sys/kmem.h>
56 #include <sys/prsystm.h>
57 #include <sys/modctl.h>
58 #include <sys/vmparam.h>
59 #include <sys/schedctl.h>
60 #include <sys/utrap.h>
61 #include <sys/systeminfo.h>
62 #include <sys/stack.h>
63 #include <sys/rctl.h>
64 #include <sys/dtrace.h>
65 #include <sys/lwpchan_impl.h>
66 #include <sys/pool.h>
67 #include <sys/sdt.h>
68 #include <sys/brand.h>
69 
70 #include <c2/audit.h>
71 
72 #include <vm/hat.h>
73 #include <vm/anon.h>
74 #include <vm/as.h>
75 #include <vm/seg.h>
76 #include <vm/seg_vn.h>
77 
78 #define	PRIV_RESET		0x01	/* needs to reset privs */
79 #define	PRIV_SETID		0x02	/* needs to change uids */
80 #define	PRIV_SETUGID		0x04	/* is setuid/setgid/forced privs */
81 #define	PRIV_INCREASE		0x08	/* child runs with more privs */
82 #define	MAC_FLAGS		0x10	/* need to adjust MAC flags */
83 
84 static int execsetid(struct vnode *, struct vattr *, uid_t *, uid_t *);
85 static int hold_execsw(struct execsw *);
86 
87 uint_t auxv_hwcap = 0;	/* auxv AT_SUN_HWCAP value; determined on the fly */
88 #if defined(_SYSCALL32_IMPL)
89 uint_t auxv_hwcap32 = 0;	/* 32-bit version of auxv_hwcap */
90 #endif
91 
92 int exec_lpg_disable = 0;
93 #define	PSUIDFLAGS		(SNOCD|SUGID)
94 
95 /*
96  * exec() - wrapper around exece providing NULL environment pointer
97  */
98 int
99 exec(const char *fname, const char **argp)
100 {
101 	return (exece(fname, argp, NULL));
102 }
103 
104 /*
105  * exece() - system call wrapper around exec_common()
106  */
107 int
108 exece(const char *fname, const char **argp, const char **envp)
109 {
110 	int error;
111 
112 	error = exec_common(fname, argp, envp, EBA_NONE);
113 	return (error ? (set_errno(error)) : 0);
114 }
115 
/*
 * exec_common() - common body of the exec system calls.
 *
 * Validates the caller and the requested brand action, resolves fname to
 * a vnode, hands that vnode to gexec() to load the new image, and then
 * resets per-process and per-lwp state (signal handling, saved resource
 * limits, profiling, close-on-exec files, lwp directory and tid hash)
 * for the new program.
 *
 *	fname		user-space path of the file to execute (it is read
 *			with UIO_USERSPACE)
 *	argp, envp	passed through untouched to gexec() in a struct execa
 *	brand_action	EBA_NONE, EBA_NATIVE, or another value requesting
 *			that the process be branded
 *
 * Returns 0 on success or an errno value on failure.  On every path past
 * the initial checks, the thread's held-signal mask is restored and
 * prexecend() is called before returning.
 */
int
exec_common(const char *fname, const char **argp, const char **envp,
    int brand_action)
{
	vnode_t *vp = NULL, *dir = NULL, *tmpvp = NULL;
	proc_t *p = ttoproc(curthread);
	klwp_t *lwp = ttolwp(curthread);
	struct user *up = PTOU(p);
	long execsz;		/* temporary count of exec size */
	int i;
	int error;
	char exec_file[MAXCOMLEN+1];
	struct pathname pn;
	struct pathname resolvepn;
	struct uarg args;
	struct execa ua;
	k_sigset_t savedmask;
	/*
	 * lwpdir stays NULL unless a replacement directory is allocated
	 * below; tidhash and lep are only assigned (and only used) in
	 * that case.
	 */
	lwpdir_t *lwpdir = NULL;
	lwpdir_t **tidhash;
	/*
	 * old_lwpdir_sz, old_tidhash and old_tidhash_sz are only assigned
	 * together with old_lwpdir, and only read when it is non-NULL.
	 */
	lwpdir_t *old_lwpdir = NULL;
	uint_t old_lwpdir_sz;
	lwpdir_t **old_tidhash;
	uint_t old_tidhash_sz;
	lwpent_t *lep;
	int brandme = 0;	/* set when this exec must brand the process */

	/*
	 * exec() is not supported for the /proc agent lwp.
	 */
	if (curthread == p->p_agenttp)
		return (ENOTSUP);

	if ((error = secpolicy_basic_exec(CRED())) != 0)
		return (error);

	if (brand_action != EBA_NONE) {
		/*
		 * Brand actions are not supported for processes that are not
		 * running in a branded zone.
		 */
		if (!ZONE_IS_BRANDED(p->p_zone))
			return (ENOTSUP);

		if (brand_action == EBA_NATIVE) {
			/* Only branded processes can be unbranded */
			if (!PROC_IS_BRANDED(p))
				return (ENOTSUP);
		} else {
			/* Only unbranded processes can be branded */
			if (PROC_IS_BRANDED(p))
				return (ENOTSUP);
			brandme = 1;
		}
	} else {
		/*
		 * If this is a native zone, or if the process is already
		 * branded, then we don't need to do anything.  If this is
		 * a native process in a branded zone, we need to brand the
		 * process as it exec()s the new binary.
		 */
		if (ZONE_IS_BRANDED(p->p_zone) && !PROC_IS_BRANDED(p))
			brandme = 1;
	}

	/*
	 * Inform /proc that an exec() has started.
	 * Hold signals that are ignored by default so that we will
	 * not be interrupted by a signal that will be ignored after
	 * successful completion of gexec().
	 */
	mutex_enter(&p->p_lock);
	prexecstart();
	schedctl_finish_sigblock(curthread);
	savedmask = curthread->t_hold;
	sigorset(&curthread->t_hold, &ignoredefault);
	mutex_exit(&p->p_lock);

	/*
	 * Look up path name and remember last component for later.
	 * To help coreadm expand its %d token, we attempt to save
	 * the directory containing the executable in p_execdir. The
	 * first call to lookuppn() may fail and return EINVAL because
	 * dirvpp is non-NULL. In that case, we make a second call to
	 * lookuppn() with dirvpp set to NULL; p_execdir will be NULL,
	 * but coreadm is allowed to expand %d to the empty string and
	 * there are other cases in which that failure may occur.
	 */
	if ((error = pn_get((char *)fname, UIO_USERSPACE, &pn)) != 0)
		goto out;
	pn_alloc(&resolvepn);
	if ((error = lookuppn(&pn, &resolvepn, FOLLOW, &dir, &vp)) != 0) {
		pn_free(&resolvepn);
		pn_free(&pn);
		if (error != EINVAL)
			goto out;

		/* Retry from scratch without asking for the directory. */
		dir = NULL;
		if ((error = pn_get((char *)fname, UIO_USERSPACE, &pn)) != 0)
			goto out;
		pn_alloc(&resolvepn);
		if ((error = lookuppn(&pn, &resolvepn, FOLLOW, NULLVPP,
		    &vp)) != 0) {
			pn_free(&resolvepn);
			pn_free(&pn);
			goto out;
		}
	}
	if (vp == NULL) {
		if (dir != NULL)
			VN_RELE(dir);
		error = ENOENT;
		pn_free(&resolvepn);
		pn_free(&pn);
		goto out;
	}

	/*
	 * We do not allow executing files in attribute directories.
	 * We test this by determining whether the resolved path
	 * contains a "/" when we're in an attribute directory;
	 * only if the pathname does not contain a "/" the resolved path
	 * points to a file in the current working (attribute) directory.
	 */
	if ((p->p_user.u_cdir->v_flag & V_XATTRDIR) != 0 &&
	    strchr(resolvepn.pn_path, '/') == NULL) {
		if (dir != NULL)
			VN_RELE(dir);
		error = EACCES;
		pn_free(&resolvepn);
		pn_free(&pn);
		VN_RELE(vp);
		goto out;
	}

	/*
	 * exec_file is zero-filled first and at most MAXCOMLEN bytes are
	 * copied into the MAXCOMLEN+1 byte buffer, so it is always
	 * NUL-terminated.
	 */
	bzero(exec_file, MAXCOMLEN+1);
	(void) strncpy(exec_file, pn.pn_path, MAXCOMLEN);
	bzero(&args, sizeof (args));
	args.pathname = resolvepn.pn_path;
	/* don't free resolvepn until we are done with args */
	pn_free(&pn);

	/*
	 * Specific exec handlers, or policies determined via
	 * /etc/system may override the historical default.
	 */
	args.stk_prot = PROT_ZFOD;
	args.dat_prot = PROT_ZFOD;

	CPU_STATS_ADD_K(sys, sysexec, 1);
	DTRACE_PROC1(exec, char *, args.pathname);

	ua.fname = fname;
	ua.argp = argp;
	ua.envp = envp;

	/* If necessary, brand this process before we start the exec. */
	if (brandme != 0)
		brand_setbrand(p);

	if ((error = gexec(&vp, &ua, &args, NULL, 0, &execsz,
	    exec_file, p->p_cred, brand_action)) != 0) {
		/*
		 * NOTE(review): presumably this undoes the brand_setbrand()
		 * done just above for a failed exec -- confirm against the
		 * brand framework.
		 */
		if (brandme != 0)
			BROP(p)->b_proc_exit(p, lwp);
		VN_RELE(vp);
		if (dir != NULL)
			VN_RELE(dir);
		pn_free(&resolvepn);
		goto fail;
	}

	/*
	 * Free floating point registers (sun4u only)
	 */
	ASSERT(lwp != NULL);
	lwp_freeregs(lwp, 1);

	/*
	 * Free thread and process context ops.
	 */
	if (curthread->t_ctx)
		freectx(curthread, 1);
	if (p->p_pctx)
		freepctx(p, 1);

	/*
	 * Remember file name for accounting; clear any cached DTrace predicate.
	 */
	up->u_acflag &= ~AFORK;
	bcopy(exec_file, up->u_comm, MAXCOMLEN+1);
	curthread->t_predcache = NULL;

	/*
	 * Clear contract template state
	 */
	lwp_ctmpl_clear(lwp);

	/*
	 * Save the directory in which we found the executable for expanding
	 * the %d token used in core file patterns.
	 */
	mutex_enter(&p->p_lock);
	tmpvp = p->p_execdir;
	p->p_execdir = dir;
	if (p->p_execdir != NULL)
		VN_HOLD(p->p_execdir);
	mutex_exit(&p->p_lock);

	/* Release the previous p_execdir only after dropping p_lock. */
	if (tmpvp != NULL)
		VN_RELE(tmpvp);

	/*
	 * Reset stack state to the user stack, clear set of signals
	 * caught on the signal stack, and reset list of signals that
	 * restart system calls; the new program's environment should
	 * not be affected by detritus from the old program.  Any
	 * pending held signals remain held, so don't clear t_hold.
	 */
	mutex_enter(&p->p_lock);
	lwp->lwp_oldcontext = 0;
	lwp->lwp_ustack = 0;
	lwp->lwp_old_stk_ctl = 0;
	sigemptyset(&up->u_signodefer);
	sigemptyset(&up->u_sigonstack);
	sigemptyset(&up->u_sigresethand);
	lwp->lwp_sigaltstack.ss_sp = 0;
	lwp->lwp_sigaltstack.ss_size = 0;
	lwp->lwp_sigaltstack.ss_flags = SS_DISABLE;

	/*
	 * Make saved resource limit == current resource limit.
	 */
	for (i = 0; i < RLIM_NLIMITS; i++) {
		/*CONSTCOND*/
		if (RLIM_SAVED(i)) {
			(void) rctl_rlimit_get(rctlproc_legacy[i], p,
			    &up->u_saved_rlimit[i]);
		}
	}

	/*
	 * If the action was to catch the signal, then the action
	 * must be reset to SIG_DFL.
	 */
	sigdefault(p);
	p->p_flag &= ~(SNOWAIT|SJCTL);
	p->p_flag |= (SEXECED|SMSACCT|SMSFORK);
	up->u_signal[SIGCLD - 1] = SIG_DFL;

	/*
	 * Delete the dot4 sigqueues/signotifies.
	 */
	sigqfree(p);

	mutex_exit(&p->p_lock);

	/* Clear all profiling state, under the profiling lock. */
	mutex_enter(&p->p_pflock);
	p->p_prof.pr_base = NULL;
	p->p_prof.pr_size = 0;
	p->p_prof.pr_off = 0;
	p->p_prof.pr_scale = 0;
	p->p_prof.pr_samples = 0;
	mutex_exit(&p->p_pflock);

	ASSERT(curthread->t_schedctl == NULL);

#if defined(__sparc)
	if (p->p_utraps != NULL)
		utrap_free(p);
#endif	/* __sparc */

	/*
	 * Close all close-on-exec files.
	 */
	close_exec(P_FINFO(p));
	TRACE_2(TR_FAC_PROC, TR_PROC_EXEC, "proc_exec:p %p up %p", p, up);

	/* Unbrand ourself if requested. */
	if (brand_action == EBA_NATIVE)
		BROP(p)->b_proc_exit(p, lwp);
	ASSERT((brand_action != EBA_NATIVE) || !PROC_IS_BRANDED(p));

	setregs(&args);

	/* Mark this as an executable vnode */
	mutex_enter(&vp->v_lock);
	vp->v_flag |= VVMEXEC;
	mutex_exit(&vp->v_lock);

	/* Done with the lookup results; args.pathname is not used below. */
	VN_RELE(vp);
	if (dir != NULL)
		VN_RELE(dir);
	pn_free(&resolvepn);

	/*
	 * Allocate a new lwp directory and lwpid hash table if necessary.
	 * The allocations use KM_SLEEP and are done here, before p_lock
	 * is taken below.  If the process already has a directory, the
	 * current thread's existing entry is reused; otherwise a fresh
	 * entry is allocated.
	 */
	if (curthread->t_tid != 1 || p->p_lwpdir_sz != 2) {
		lwpdir = kmem_zalloc(2 * sizeof (lwpdir_t), KM_SLEEP);
		lwpdir->ld_next = lwpdir + 1;
		tidhash = kmem_zalloc(2 * sizeof (lwpdir_t *), KM_SLEEP);
		if (p->p_lwpdir != NULL)
			lep = p->p_lwpdir[curthread->t_dslot].ld_entry;
		else
			lep = kmem_zalloc(sizeof (*lep), KM_SLEEP);
	}

	if (PROC_IS_BRANDED(p))
		BROP(p)->b_exec();

	mutex_enter(&p->p_lock);
	prbarrier(p);

	/*
	 * Reset lwp id to the default value of 1.
	 * This is a single-threaded process now
	 * and lwp #1 is lwp_wait()able by default.
	 * The t_unpark flag should not be inherited.
	 */
	ASSERT(p->p_lwpcnt == 1 && p->p_zombcnt == 0);
	curthread->t_tid = 1;
	curthread->t_unpark = 0;
	curthread->t_proc_flag |= TP_TWAIT;
	curthread->t_proc_flag &= ~TP_DAEMON;	/* daemons shouldn't exec */
	p->p_lwpdaemon = 0;			/* but oh well ... */
	p->p_lwpid = 1;

	/*
	 * Install the newly-allocated lwp directory and lwpid hash table
	 * and insert the current thread into the new hash table.
	 * The old tables are remembered so they can be freed after
	 * p_lock is dropped.
	 */
	if (lwpdir != NULL) {
		old_lwpdir = p->p_lwpdir;
		old_lwpdir_sz = p->p_lwpdir_sz;
		old_tidhash = p->p_tidhash;
		old_tidhash_sz = p->p_tidhash_sz;
		p->p_lwpdir = p->p_lwpfree = lwpdir;
		p->p_lwpdir_sz = 2;
		p->p_tidhash = tidhash;
		p->p_tidhash_sz = 2;
		lep->le_thread = curthread;
		lep->le_lwpid = curthread->t_tid;
		lep->le_start = curthread->t_start;
		lwp_hash_in(p, lep);
	}

	/*
	 * Restore the saved signal mask and
	 * inform /proc that the exec() has finished.
	 */
	curthread->t_hold = savedmask;
	prexecend();
	mutex_exit(&p->p_lock);
	if (old_lwpdir) {
		kmem_free(old_lwpdir, old_lwpdir_sz * sizeof (lwpdir_t));
		kmem_free(old_tidhash, old_tidhash_sz * sizeof (lwpdir_t *));
	}

	ASSERT(error == 0);
	DTRACE_PROC(exec__success);
	return (0);

	/*
	 * "fail" is used once gexec() has been attempted and fires the
	 * exec-failure probe; "out" is used for earlier failures and
	 * skips the probe.  Both restore the signal mask and end the
	 * /proc exec notification.
	 */
fail:
	DTRACE_PROC1(exec__failure, int, error);
out:		/* error return */
	mutex_enter(&p->p_lock);
	curthread->t_hold = savedmask;
	prexecend();
	mutex_exit(&p->p_lock);
	ASSERT(error != 0);
	return (error);
}
487 
488 
/*
 * Perform generic exec duties and switchout to object-file specific
 * handler.
 *
 *	vpp		in/out vnode of the file; VOP_OPEN() may replace *vpp
 *	uap		exec arguments, passed through to the handler
 *	args		exec state shared with the handler; stk_prot, execswp
 *			and traceinval are updated here
 *	idatap		passed through to the handler
 *	level		0 for the outermost call; credential changes, the
 *			SNOCD/SUGID flag handling and the tracing hooks are
 *			only performed when level is 0
 *	execsz		passed through to the handler
 *	exec_file	file name, used in the "setuid execution not allowed"
 *			notice and passed to the handler
 *	cred		credentials to execute with; replaced by a private
 *			copy when execsetid() reports that changes are needed
 *	brand_action	passed through to the handler
 *
 * Returns 0 on success or an errno value; a failure that did not set a
 * specific error is reported as ENOEXEC.
 */
int
gexec(
	struct vnode **vpp,
	struct execa *uap,
	struct uarg *args,
	struct intpdata *idatap,
	int level,
	long *execsz,
	caddr_t exec_file,
	struct cred *cred,
	int brand_action)
{
	struct vnode *vp;
	proc_t *pp = ttoproc(curthread);
	struct execsw *eswp;
	int error = 0;
	int suidflags = 0;
	ssize_t resid;
	uid_t uid, gid;
	struct vattr vattr;
	char magbuf[MAGIC_BYTES];
	int setid;
	cred_t *oldcred, *newcred = NULL;
	int privflags = 0;
	int setidfl;

	/*
	 * If the SNOCD or SUGID flag is set, turn it off and remember the
	 * previous setting so we can restore it if we encounter an error.
	 */
	if (level == 0 && (pp->p_flag & PSUIDFLAGS)) {
		mutex_enter(&pp->p_lock);
		suidflags = pp->p_flag & PSUIDFLAGS;
		pp->p_flag &= ~PSUIDFLAGS;
		mutex_exit(&pp->p_lock);
	}

	if ((error = execpermissions(*vpp, &vattr, args)) != 0)
		goto bad;

	/* need to open vnode for stateful file systems like rfs */
	if ((error = VOP_OPEN(vpp, FREAD, CRED())) != 0)
		goto bad;
	vp = *vpp;

	/*
	 * Note: to support binary compatibility with SunOS a.out
	 * executables, we read in the first four bytes, as the
	 * magic number is in bytes 2-3.
	 */
	if (error = vn_rdwr(UIO_READ, vp, magbuf, sizeof (magbuf),
	    (offset_t)0, UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), &resid))
		goto bad;
	/*
	 * A short read or an unrecognized header leaves error at 0; the
	 * "bad" path turns that into ENOEXEC.
	 */
	if (resid != 0)
		goto bad;

	/*
	 * On success findexec_by_hdr() returns with eswp->exec_lock held
	 * as reader (see hold_execsw()); it is released after the handler
	 * has run.
	 */
	if ((eswp = findexec_by_hdr(magbuf)) == NULL)
		goto bad;

	if (level == 0 &&
	    (privflags = execsetid(vp, &vattr, &uid, &gid)) != 0) {

		/* From here on, cred refers to our private duplicate. */
		newcred = cred = crdup(cred);

		/* If we can, drop the PA bit */
		if ((privflags & PRIV_RESET) != 0)
			priv_adjust_PA(cred);

		if (privflags & PRIV_SETID) {
			cred->cr_uid = uid;
			cred->cr_gid = gid;
			cred->cr_suid = uid;
			cred->cr_sgid = gid;
		}

		if (privflags & MAC_FLAGS) {
			if (!(CR_FLAGS(cred) & NET_MAC_AWARE_INHERIT))
				CR_FLAGS(cred) &= ~NET_MAC_AWARE;
			CR_FLAGS(cred) &= ~NET_MAC_AWARE_INHERIT;
		}

		/*
		 * Implement the privilege updates:
		 *
		 * Restrict with L:
		 *
		 *	I' = I & L
		 *
		 *	E' = P' = (I' + F) & A
		 *
		 * But if running under ptrace, we cap I with P.
		 */
		if ((privflags & PRIV_RESET) != 0) {
			if ((privflags & PRIV_INCREASE) != 0 &&
			    (pp->p_proc_flag & P_PR_PTRACE) != 0)
				priv_intersect(&CR_OPPRIV(cred),
						    &CR_IPRIV(cred));
			priv_intersect(&CR_LPRIV(cred), &CR_IPRIV(cred));
			CR_EPRIV(cred) = CR_PPRIV(cred) = CR_IPRIV(cred);
			priv_adjust_PA(cred);
		}
	}

	/* SunOS 4.x buy-back */
	if ((vp->v_vfsp->vfs_flag & VFS_NOSETUID) &&
	    (vattr.va_mode & (VSUID|VSGID))) {
		cmn_err(CE_NOTE,
		    "!%s, uid %d: setuid execution not allowed, dev=%lx",
		    exec_file, cred->cr_uid, vp->v_vfsp->vfs_dev);
	}

	/*
	 * execsetid() told us whether or not we had to change the
	 * credentials of the process.  In privflags, it told us
	 * whether we gained any privileges or executed a set-uid executable.
	 */
	setid = (privflags & (PRIV_SETUGID|PRIV_INCREASE));

	/*
	 * Use /etc/system variable to determine if the stack
	 * should be marked as executable by default.
	 */
	if (noexec_user_stack)
		args->stk_prot &= ~PROT_EXEC;

	args->execswp = eswp; /* Save execsw pointer in uarg for exec_func */

	/*
	 * Traditionally, the setid flags told the sub processes whether
	 * the file just executed was set-uid or set-gid; this caused
	 * some confusion as the 'setid' flag did not match the SUGID
	 * process flag which is only set when the uids/gids do not match.
	 * A script set-gid/set-uid to the real uid/gid would start with
	 * /dev/fd/X but an executable would happily trust LD_LIBRARY_PATH.
	 * Now we flag those cases where the calling process cannot
	 * be trusted to influence the newly exec'ed process, either
	 * because it runs with more privileges or when the uids/gids
	 * do in fact not match.
	 * This also makes the runtime linker agree with the on exec
	 * values of SNOCD and SUGID.
	 */
	setidfl = 0;
	if (cred->cr_uid != cred->cr_ruid || (cred->cr_rgid != cred->cr_gid &&
	    !supgroupmember(cred->cr_gid, cred))) {
		setidfl |= EXECSETID_UGIDS;
	}
	if (setid & PRIV_SETUGID)
		setidfl |= EXECSETID_SETID;
	if (setid & PRIV_INCREASE)
		setidfl |= EXECSETID_PRIVS;

	error = (*eswp->exec_func)(vp, uap, args, idatap, level, execsz,
		setidfl, exec_file, cred, brand_action);
	/* Drop the reader hold taken by findexec_by_hdr(). */
	rw_exit(eswp->exec_lock);
	if (error != 0) {
		/* The duplicated cred was never installed; discard it. */
		if (newcred != NULL)
			crfree(newcred);
		goto bad;
	}

	if (level == 0) {
		mutex_enter(&pp->p_crlock);
		if (newcred != NULL) {
			/*
			 * Free the old credentials, and set the new ones.
			 * Do this for both the process and the (single) thread.
			 */
			crfree(pp->p_cred);
			pp->p_cred = cred;	/* cred already held for proc */
			crhold(cred);		/* hold new cred for thread */
			/*
			 * DTrace accesses t_cred in probe context.  t_cred
			 * must always be either NULL, or point to a valid,
			 * allocated cred structure.
			 */
			oldcred = curthread->t_cred;
			curthread->t_cred = cred;
			crfree(oldcred);
		}
		/*
		 * On emerging from a successful exec(), the saved
		 * uid and gid equal the effective uid and gid.
		 */
		cred->cr_suid = cred->cr_uid;
		cred->cr_sgid = cred->cr_gid;

		/*
		 * If the real and effective ids do not match, this
		 * is a setuid process that should not dump core.
		 * The group comparison is tricky; we prevent the code
		 * from flagging SNOCD when executing with an effective gid
		 * which is a supplementary group.
		 */
		if (cred->cr_ruid != cred->cr_uid ||
		    (cred->cr_rgid != cred->cr_gid &&
		    !supgroupmember(cred->cr_gid, cred)) ||
		    (privflags & PRIV_INCREASE) != 0)
			suidflags = PSUIDFLAGS;
		else
			suidflags = 0;

		mutex_exit(&pp->p_crlock);
		if (suidflags) {
			mutex_enter(&pp->p_lock);
			pp->p_flag |= suidflags;
			mutex_exit(&pp->p_lock);
		}
		if (setid && (pp->p_proc_flag & P_PR_PTRACE) == 0) {
			/*
			 * If process is traced via /proc, arrange to
			 * invalidate the associated /proc vnode.
			 */
			if (pp->p_plist || (pp->p_proc_flag & P_PR_TRACE))
				args->traceinval = 1;
		}
		if (pp->p_proc_flag & P_PR_PTRACE)
			psignal(pp, SIGTRAP);
		if (args->traceinval)
			prinvalidate(&pp->p_user);
	}

	return (0);
bad:
	if (error == 0)
		error = ENOEXEC;

	/* Put back the SNOCD/SUGID bits that were cleared on entry. */
	if (suidflags) {
		mutex_enter(&pp->p_lock);
		pp->p_flag |= suidflags;
		mutex_exit(&pp->p_lock);
	}
	return (error);
}
726 
727 extern char *execswnames[];
728 
729 struct execsw *
730 allocate_execsw(char *name, char *magic, size_t magic_size)
731 {
732 	int i, j;
733 	char *ename;
734 	char *magicp;
735 
736 	mutex_enter(&execsw_lock);
737 	for (i = 0; i < nexectype; i++) {
738 		if (execswnames[i] == NULL) {
739 			ename = kmem_alloc(strlen(name) + 1, KM_SLEEP);
740 			(void) strcpy(ename, name);
741 			execswnames[i] = ename;
742 			/*
743 			 * Set the magic number last so that we
744 			 * don't need to hold the execsw_lock in
745 			 * findexectype().
746 			 */
747 			magicp = kmem_alloc(magic_size, KM_SLEEP);
748 			for (j = 0; j < magic_size; j++)
749 				magicp[j] = magic[j];
750 			execsw[i].exec_magic = magicp;
751 			mutex_exit(&execsw_lock);
752 			return (&execsw[i]);
753 		}
754 	}
755 	mutex_exit(&execsw_lock);
756 	return (NULL);
757 }
758 
759 /*
760  * Find the exec switch table entry with the corresponding magic string.
761  */
762 struct execsw *
763 findexecsw(char *magic)
764 {
765 	struct execsw *eswp;
766 
767 	for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
768 		ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
769 		if (magic && eswp->exec_maglen != 0 &&
770 		    bcmp(magic, eswp->exec_magic, eswp->exec_maglen) == 0)
771 			return (eswp);
772 	}
773 	return (NULL);
774 }
775 
776 /*
777  * Find the execsw[] index for the given exec header string by looking for the
778  * magic string at a specified offset and length for each kind of executable
779  * file format until one matches.  If no execsw[] entry is found, try to
780  * autoload a module for this magic string.
781  */
782 struct execsw *
783 findexec_by_hdr(char *header)
784 {
785 	struct execsw *eswp;
786 
787 	for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
788 		ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
789 		if (header && eswp->exec_maglen != 0 &&
790 		    bcmp(&header[eswp->exec_magoff], eswp->exec_magic,
791 			    eswp->exec_maglen) == 0) {
792 			if (hold_execsw(eswp) != 0)
793 				return (NULL);
794 			return (eswp);
795 		}
796 	}
797 	return (NULL);	/* couldn't find the type */
798 }
799 
800 /*
801  * Find the execsw[] index for the given magic string.  If no execsw[] entry
802  * is found, try to autoload a module for this magic string.
803  */
804 struct execsw *
805 findexec_by_magic(char *magic)
806 {
807 	struct execsw *eswp;
808 
809 	for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
810 		ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
811 		if (magic && eswp->exec_maglen != 0 &&
812 		    bcmp(magic, eswp->exec_magic, eswp->exec_maglen) == 0) {
813 			if (hold_execsw(eswp) != 0)
814 				return (NULL);
815 			return (eswp);
816 		}
817 	}
818 	return (NULL);	/* couldn't find the type */
819 }
820 
821 static int
822 hold_execsw(struct execsw *eswp)
823 {
824 	char *name;
825 
826 	rw_enter(eswp->exec_lock, RW_READER);
827 	while (!LOADED_EXEC(eswp)) {
828 		rw_exit(eswp->exec_lock);
829 		name = execswnames[eswp-execsw];
830 		ASSERT(name);
831 		if (modload("exec", name) == -1)
832 			return (-1);
833 		rw_enter(eswp->exec_lock, RW_READER);
834 	}
835 	return (0);
836 }
837 
838 static int
839 execsetid(struct vnode *vp, struct vattr *vattrp, uid_t *uidp, uid_t *gidp)
840 {
841 	proc_t *pp = ttoproc(curthread);
842 	uid_t uid, gid;
843 	cred_t *cr = pp->p_cred;
844 	int privflags = 0;
845 
846 	/*
847 	 * Remember credentials.
848 	 */
849 	uid = cr->cr_uid;
850 	gid = cr->cr_gid;
851 
852 	/* Will try to reset the PRIV_AWARE bit later. */
853 	if ((CR_FLAGS(cr) & (PRIV_AWARE|PRIV_AWARE_INHERIT)) == PRIV_AWARE)
854 		privflags |= PRIV_RESET;
855 
856 	if ((vp->v_vfsp->vfs_flag & VFS_NOSETUID) == 0) {
857 		/*
858 		 * Set-uid root execution only allowed if the limit set
859 		 * holds all unsafe privileges.
860 		 */
861 		if ((vattrp->va_mode & VSUID) && (vattrp->va_uid != 0 ||
862 		    priv_issubset(&priv_unsafe, &CR_LPRIV(cr)))) {
863 			uid = vattrp->va_uid;
864 			privflags |= PRIV_SETUGID;
865 		}
866 		if (vattrp->va_mode & VSGID) {
867 			gid = vattrp->va_gid;
868 			privflags |= PRIV_SETUGID;
869 		}
870 	}
871 
872 	/*
873 	 * Do we need to change our credential anyway?
874 	 * This is the case when E != I or P != I, as
875 	 * we need to do the assignments (with F empty and A full)
876 	 * Or when I is not a subset of L; in that case we need to
877 	 * enforce L.
878 	 *
879 	 *		I' = L & I
880 	 *
881 	 *		E' = P' = (I' + F) & A
882 	 * or
883 	 *		E' = P' = I'
884 	 */
885 	if (!priv_isequalset(&CR_EPRIV(cr), &CR_IPRIV(cr)) ||
886 	    !priv_issubset(&CR_IPRIV(cr), &CR_LPRIV(cr)) ||
887 	    !priv_isequalset(&CR_PPRIV(cr), &CR_IPRIV(cr)))
888 		privflags |= PRIV_RESET;
889 
890 	/* If MAC-aware flag(s) are on, need to update cred to remove. */
891 	if ((CR_FLAGS(cr) & NET_MAC_AWARE) ||
892 	    (CR_FLAGS(cr) & NET_MAC_AWARE_INHERIT))
893 		privflags |= MAC_FLAGS;
894 
895 	/*
896 	 * When we introduce the "forced" set then we will need
897 	 * to set PRIV_INCREASE here if I not a subset of P.
898 	 * If the "allowed" set is introduced we will need to do
899 	 * a similar thing; however, it seems more reasonable to
900 	 * have the allowed set reduce "L": script language interpreters
901 	 * would typically have an allowed set of "all".
902 	 */
903 
904 	/*
905 	 * Set setuid/setgid protections if no ptrace() compatibility.
906 	 * For privileged processes, honor setuid/setgid even in
907 	 * the presence of ptrace() compatibility.
908 	 */
909 	if (((pp->p_proc_flag & P_PR_PTRACE) == 0 ||
910 	    PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, (uid == 0))) &&
911 	    (cr->cr_uid != uid ||
912 	    cr->cr_gid != gid ||
913 	    cr->cr_suid != uid ||
914 	    cr->cr_sgid != gid)) {
915 		*uidp = uid;
916 		*gidp = gid;
917 		privflags |= PRIV_SETID;
918 	}
919 	return (privflags);
920 }
921 
922 int
923 execpermissions(struct vnode *vp, struct vattr *vattrp, struct uarg *args)
924 {
925 	int error;
926 	proc_t *p = ttoproc(curthread);
927 
928 	vattrp->va_mask = AT_MODE | AT_UID | AT_GID | AT_SIZE;
929 	if (error = VOP_GETATTR(vp, vattrp, ATTR_EXEC, p->p_cred))
930 		return (error);
931 	/*
932 	 * Check the access mode.
933 	 * If VPROC, ask /proc if the file is an object file.
934 	 */
935 	if ((error = VOP_ACCESS(vp, VEXEC, 0, p->p_cred)) != 0 ||
936 	    !(vp->v_type == VREG || (vp->v_type == VPROC && pr_isobject(vp))) ||
937 	    (vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0 ||
938 	    (vattrp->va_mode & (VEXEC|(VEXEC>>3)|(VEXEC>>6))) == 0) {
939 		if (error == 0)
940 			error = EACCES;
941 		return (error);
942 	}
943 
944 	if ((p->p_plist || (p->p_proc_flag & (P_PR_PTRACE|P_PR_TRACE))) &&
945 	    (error = VOP_ACCESS(vp, VREAD, 0, p->p_cred))) {
946 		/*
947 		 * If process is under ptrace(2) compatibility,
948 		 * fail the exec(2).
949 		 */
950 		if (p->p_proc_flag & P_PR_PTRACE)
951 			goto bad;
952 		/*
953 		 * Process is traced via /proc.
954 		 * Arrange to invalidate the /proc vnode.
955 		 */
956 		args->traceinval = 1;
957 	}
958 	return (0);
959 bad:
960 	if (error == 0)
961 		error = ENOEXEC;
962 	return (error);
963 }
964 
/*
 * Map a section of an executable file into the user's
 * address space.
 *
 *	vp	vnode of the executable
 *	addr	user address at which the section starts; it is rounded
 *		down to a page boundary for mapping
 *	len	number of file-backed bytes in the section (may be 0)
 *	zfodlen	number of zero-filled bytes that follow the file-backed
 *		part (may be 0)
 *	offset	file offset of the section; rounded down with PAGEMASK
 *	prot	protections for the resulting mapping
 *	page	non-zero to map the file directly with VOP_MAP(); zero to
 *		create a zero-fill mapping and read the data into it
 *	szc	stored in the segvn create arguments (crargs.szc) for the
 *		zero-fill mapping
 *
 * Returns 0 on success, otherwise an errno value: ENOMEM when a range
 * fails valid_usr_range(), EFAULT when zeroing the tail of the last page
 * faults, or the error returned by VOP_MAP(), as_map() or vn_rdwr().
 */
int
execmap(struct vnode *vp, caddr_t addr, size_t len, size_t zfodlen,
    off_t offset, int prot, int page, uint_t szc)
{
	int error = 0;
	off_t oldoffset;
	caddr_t zfodbase, oldaddr;
	size_t end, oldlen;
	size_t zfoddiff;
	label_t ljb;
	proc_t *p = ttoproc(curthread);

	/*
	 * Keep the caller's address; addr becomes its page-aligned base
	 * and len is grown below by the distance between the two.
	 */
	oldaddr = addr;
	addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	if (len) {
		oldlen = len;
		len += ((size_t)oldaddr - (size_t)addr);
		oldoffset = offset;
		offset = (off_t)((uintptr_t)offset & PAGEMASK);
		if (page) {
			spgcnt_t  prefltmem, availm, npages;
			int preread;
			uint_t mflag = MAP_PRIVATE | MAP_FIXED;

			/*
			 * Executable but not writable is flagged as text;
			 * anything else as initialized data.
			 */
			if ((prot & (PROT_WRITE | PROT_EXEC)) == PROT_EXEC) {
				mflag |= MAP_TEXT;
			} else {
				mflag |= MAP_INITDATA;
			}

			if (valid_usr_range(addr, len, prot, p->p_as,
			    p->p_as->a_userlimit) != RANGE_OKAY) {
				error = ENOMEM;
				goto bad;
			}
			if (error = VOP_MAP(vp, (offset_t)offset,
			    p->p_as, &addr, len, prot, PROT_ALL,
			    mflag, CRED()))
				goto bad;

			/*
			 * If the segment can fit, then we prefault
			 * the entire segment in.  This is based on the
			 * model that says the best working set of a
			 * small program is all of its pages.
			 */
			npages = (spgcnt_t)btopr(len);
			prefltmem = freemem - desfree;
			preread =
			    (npages < prefltmem && len < PGTHRESH) ? 1 : 0;

			/*
			 * If we aren't prefaulting the segment,
			 * increment "deficit", if necessary to ensure
			 * that pages will become available when this
			 * process starts executing.
			 */
			availm = freemem - lotsfree;
			if (preread == 0 && npages > availm &&
			    deficit < lotsfree) {
				deficit += MIN((pgcnt_t)(npages - availm),
				    lotsfree - deficit);
			}

			if (preread) {
				TRACE_2(TR_FAC_PROC, TR_EXECMAP_PREREAD,
				    "execmap preread:freemem %d size %lu",
				    freemem, len);
				/* Best effort: the result is ignored. */
				(void) as_fault(p->p_as->a_hat, p->p_as,
				    (caddr_t)addr, len, F_INVAL, S_READ);
			}
		} else {
			if (valid_usr_range(addr, len, prot, p->p_as,
			    p->p_as->a_userlimit) != RANGE_OKAY) {
				error = ENOMEM;
				goto bad;
			}

			/* Create a zero-fill mapping to read the data into. */
			if (error = as_map(p->p_as, addr, len,
			    segvn_create, zfod_argsp))
				goto bad;
			/*
			 * Read in the segment in one big chunk.
			 * The read uses the caller's original (unaligned)
			 * address, length and offset.
			 */
			if (error = vn_rdwr(UIO_READ, vp, (caddr_t)oldaddr,
			    oldlen, (offset_t)oldoffset, UIO_USERSPACE, 0,
			    (rlim64_t)0, CRED(), (ssize_t *)0))
				goto bad;
			/*
			 * Now set protections.
			 */
			if (prot != PROT_ZFOD) {
				(void) as_setprot(p->p_as, (caddr_t)addr,
				    len, prot);
			}
		}
	}

	if (zfodlen) {
		struct as *as = curproc->p_as;
		struct seg *seg;
		uint_t zprot = 0;

		/*
		 * end is the first byte past the file-backed data;
		 * zfoddiff is the number of bytes from there to the next
		 * page boundary (0 if end is already page-aligned).
		 */
		end = (size_t)addr + len;
		zfodbase = (caddr_t)roundup(end, PAGESIZE);
		zfoddiff = (uintptr_t)zfodbase - end;
		if (zfoddiff) {
			/*
			 * Before we go to zero the remaining space on the last
			 * page, make sure we have write permission.
			 */

			AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
			seg = as_segat(curproc->p_as, (caddr_t)end);
			if (seg != NULL)
				SEGOP_GETPROT(seg, (caddr_t)end, zfoddiff - 1,
				    &zprot);
			AS_LOCK_EXIT(as, &as->a_lock);

			/* Temporarily add write permission if it is missing. */
			if (seg != NULL && (zprot & PROT_WRITE) == 0) {
				(void) as_setprot(as, (caddr_t)end,
				    zfoddiff - 1, zprot | PROT_WRITE);
			}

			/*
			 * NOTE(review): presumably this branch is taken if
			 * uzero() below faults -- confirm on_fault() semantics.
			 * The original protections are restored either way.
			 */
			if (on_fault(&ljb)) {
				no_fault();
				if (seg != NULL && (zprot & PROT_WRITE) == 0)
					(void) as_setprot(as, (caddr_t)end,
					zfoddiff - 1, zprot);
				error = EFAULT;
				goto bad;
			}
			uzero((void *)end, zfoddiff);
			no_fault();
			if (seg != NULL && (zprot & PROT_WRITE) == 0)
				(void) as_setprot(as, (caddr_t)end,
				    zfoddiff - 1, zprot);
		}
		/*
		 * Whatever zero-fill remains beyond the partial last page
		 * gets its own zero-fill mapping starting at zfodbase.
		 */
		if (zfodlen > zfoddiff) {
			struct segvn_crargs crargs =
			    SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);

			zfodlen -= zfoddiff;
			if (valid_usr_range(zfodbase, zfodlen, prot, p->p_as,
			    p->p_as->a_userlimit) != RANGE_OKAY) {
				error = ENOMEM;
				goto bad;
			}
			crargs.szc = szc;
			if (error = as_map(p->p_as, (caddr_t)zfodbase,
			    zfodlen, segvn_create, &crargs))
				goto bad;
			if (prot != PROT_ZFOD) {
				(void) as_setprot(p->p_as, (caddr_t)zfodbase,
				    zfodlen, prot);
			}
		}
	}
	return (0);
bad:
	return (error);
}
1131 
1132 void
1133 setexecenv(struct execenv *ep)
1134 {
1135 	proc_t *p = ttoproc(curthread);
1136 	klwp_t *lwp = ttolwp(curthread);
1137 	struct vnode *vp;
1138 
1139 	p->p_bssbase = ep->ex_bssbase;
1140 	p->p_brkbase = ep->ex_brkbase;
1141 	p->p_brksize = ep->ex_brksize;
1142 	if (p->p_exec)
1143 		VN_RELE(p->p_exec);	/* out with the old */
1144 	vp = p->p_exec = ep->ex_vp;
1145 	if (vp != NULL)
1146 		VN_HOLD(vp);		/* in with the new */
1147 
1148 	lwp->lwp_sigaltstack.ss_sp = 0;
1149 	lwp->lwp_sigaltstack.ss_size = 0;
1150 	lwp->lwp_sigaltstack.ss_flags = SS_DISABLE;
1151 }
1152 
1153 int
1154 execopen(struct vnode **vpp, int *fdp)
1155 {
1156 	struct vnode *vp = *vpp;
1157 	file_t *fp;
1158 	int error = 0;
1159 	int filemode = FREAD;
1160 
1161 	VN_HOLD(vp);		/* open reference */
1162 	if (error = falloc(NULL, filemode, &fp, fdp)) {
1163 		VN_RELE(vp);
1164 		*fdp = -1;	/* just in case falloc changed value */
1165 		return (error);
1166 	}
1167 	if (error = VOP_OPEN(&vp, filemode, CRED())) {
1168 		VN_RELE(vp);
1169 		setf(*fdp, NULL);
1170 		unfalloc(fp);
1171 		*fdp = -1;
1172 		return (error);
1173 	}
1174 	*vpp = vp;		/* vnode should not have changed */
1175 	fp->f_vnode = vp;
1176 	mutex_exit(&fp->f_tlock);
1177 	setf(*fdp, fp);
1178 	return (0);
1179 }
1180 
/*
 * Close a descriptor obtained from execopen().  Returns whatever
 * closeandsetf() returns for it.
 */
int
execclose(int fd)
{
	int rv;

	rv = closeandsetf(fd, NULL);
	return (rv);
}
1186 
1187 
1188 /*
1189  * noexec stub function.
1190  */
1191 /*ARGSUSED*/
1192 int
1193 noexec(
1194     struct vnode *vp,
1195     struct execa *uap,
1196     struct uarg *args,
1197     struct intpdata *idatap,
1198     int level,
1199     long *execsz,
1200     int setid,
1201     caddr_t exec_file,
1202     struct cred *cred)
1203 {
1204 	cmn_err(CE_WARN, "missing exec capability for %s", uap->fname);
1205 	return (ENOEXEC);
1206 }
1207 
/*
 * Support routines for building a user stack.
 *
 * execve(path, argv, envp) must construct a new stack with the specified
 * arguments and environment variables (see exec_args() for a description
 * of the user stack layout).  To do this, we copy the arguments and
 * environment variables from the old user address space into the kernel,
 * free the old as, create the new as, and copy our buffered information
 * to the new stack.  Our kernel buffer has the following structure:
 *
 *	+-----------------------+ <--- stk_base + stk_size
 *	| string offsets	|
 *	+-----------------------+ <--- stk_offp
 *	|			|
 *	| STK_AVAIL() space	|
 *	|			|
 *	+-----------------------+ <--- stk_strp
 *	| strings		|
 *	+-----------------------+ <--- stk_base
 *
 * When we add a string, we store the string's contents (including the null
 * terminator) at stk_strp, and we store the offset of the string relative to
 * stk_base at --stk_offp.  As strings are added, stk_strp increases and
 * stk_offp decreases.  The amount of space remaining, STK_AVAIL(), is just
 * the difference between these pointers.  If we run out of space, we return
 * an error and exec_args() starts all over again with a buffer twice as large.
 * When we're all done, the kernel buffer looks like this:
 *
 *	+-----------------------+ <--- stk_base + stk_size
 *	| argv[0] offset	|
 *	+-----------------------+
 *	| ...			|
 *	+-----------------------+
 *	| argv[argc-1] offset	|
 *	+-----------------------+
 *	| envp[0] offset	|
 *	+-----------------------+
 *	| ...			|
 *	+-----------------------+
 *	| envp[envc-1] offset	|
 *	+-----------------------+
 *	| AT_SUN_PLATFORM offset|
 *	+-----------------------+
 *	| AT_SUN_EXECNAME offset|
 *	+-----------------------+ <--- stk_offp
 *	|			|
 *	| STK_AVAIL() space	|
 *	|			|
 *	+-----------------------+ <--- stk_strp
 *	| AT_SUN_EXECNAME string|
 *	+-----------------------+
 *	| AT_SUN_PLATFORM string|
 *	+-----------------------+
 *	| envp[envc-1] string	|
 *	+-----------------------+
 *	| ...			|
 *	+-----------------------+
 *	| envp[0] string	|
 *	+-----------------------+
 *	| argv[argc-1] string	|
 *	+-----------------------+
 *	| ...			|
 *	+-----------------------+
 *	| argv[0] string	|
 *	+-----------------------+ <--- stk_base
 *
 * When a brand name and/or emulator is set, stk_copyin() adds the
 * AT_SUN_BRANDNAME and AT_SUN_EMULATOR strings (and their offsets) after
 * AT_SUN_EXECNAME in the same manner.  stk_copyin() finally appends zero
 * bytes at stk_strp to pad the user stack size to the required alignment.
 */

/*
 * Bytes still unused between the top of the string area (stk_strp) and the
 * bottom of the offset array (stk_offp).  The result is a signed pointer
 * difference.
 */
#define	STK_AVAIL(args)		((char *)(args)->stk_offp - (args)->stk_strp)
1276 
1277 /*
1278  * Add a string to the stack.
1279  */
1280 static int
1281 stk_add(uarg_t *args, const char *sp, enum uio_seg segflg)
1282 {
1283 	int error;
1284 	size_t len;
1285 
1286 	if (STK_AVAIL(args) < sizeof (int))
1287 		return (E2BIG);
1288 	*--args->stk_offp = args->stk_strp - args->stk_base;
1289 
1290 	if (segflg == UIO_USERSPACE) {
1291 		error = copyinstr(sp, args->stk_strp, STK_AVAIL(args), &len);
1292 		if (error != 0)
1293 			return (error);
1294 	} else {
1295 		len = strlen(sp) + 1;
1296 		if (len > STK_AVAIL(args))
1297 			return (E2BIG);
1298 		bcopy(sp, args->stk_strp, len);
1299 	}
1300 
1301 	args->stk_strp += len;
1302 
1303 	return (0);
1304 }
1305 
1306 static int
1307 stk_getptr(uarg_t *args, char *src, char **dst)
1308 {
1309 	int error;
1310 
1311 	if (args->from_model == DATAMODEL_NATIVE) {
1312 		ulong_t ptr;
1313 		error = fulword(src, &ptr);
1314 		*dst = (caddr_t)ptr;
1315 	} else {
1316 		uint32_t ptr;
1317 		error = fuword32(src, &ptr);
1318 		*dst = (caddr_t)(uintptr_t)ptr;
1319 	}
1320 	return (error);
1321 }
1322 
1323 static int
1324 stk_putptr(uarg_t *args, char *addr, char *value)
1325 {
1326 	if (args->to_model == DATAMODEL_NATIVE)
1327 		return (sulword(addr, (ulong_t)value));
1328 	else
1329 		return (suword32(addr, (uint32_t)(uintptr_t)value));
1330 }
1331 
1332 static int
1333 stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
1334 {
1335 	char *sp;
1336 	int argc, error;
1337 	int argv_empty = 0;
1338 	size_t ptrsize = args->from_ptrsize;
1339 	size_t size, pad;
1340 	char *argv = (char *)uap->argp;
1341 	char *envp = (char *)uap->envp;
1342 
1343 	/*
1344 	 * Copy interpreter's name and argument to argv[0] and argv[1].
1345 	 */
1346 	if (intp != NULL && intp->intp_name != NULL) {
1347 		if ((error = stk_add(args, intp->intp_name, UIO_SYSSPACE)) != 0)
1348 			return (error);
1349 		if (intp->intp_arg != NULL &&
1350 		    (error = stk_add(args, intp->intp_arg, UIO_SYSSPACE)) != 0)
1351 			return (error);
1352 		if (args->fname != NULL)
1353 			error = stk_add(args, args->fname, UIO_SYSSPACE);
1354 		else
1355 			error = stk_add(args, uap->fname, UIO_USERSPACE);
1356 		if (error)
1357 			return (error);
1358 
1359 		/*
1360 		 * Check for an empty argv[].
1361 		 */
1362 		if (stk_getptr(args, argv, &sp))
1363 			return (EFAULT);
1364 		if (sp == NULL)
1365 			argv_empty = 1;
1366 
1367 		argv += ptrsize;		/* ignore original argv[0] */
1368 	}
1369 
1370 	if (argv_empty == 0) {
1371 		/*
1372 		 * Add argv[] strings to the stack.
1373 		 */
1374 		for (;;) {
1375 			if (stk_getptr(args, argv, &sp))
1376 				return (EFAULT);
1377 			if (sp == NULL)
1378 				break;
1379 			if ((error = stk_add(args, sp, UIO_USERSPACE)) != 0)
1380 				return (error);
1381 			argv += ptrsize;
1382 		}
1383 	}
1384 	argc = (int *)(args->stk_base + args->stk_size) - args->stk_offp;
1385 	args->arglen = args->stk_strp - args->stk_base;
1386 
1387 	/*
1388 	 * Add environ[] strings to the stack.
1389 	 */
1390 	if (envp != NULL) {
1391 		for (;;) {
1392 			if (stk_getptr(args, envp, &sp))
1393 				return (EFAULT);
1394 			if (sp == NULL)
1395 				break;
1396 			if ((error = stk_add(args, sp, UIO_USERSPACE)) != 0)
1397 				return (error);
1398 			envp += ptrsize;
1399 		}
1400 	}
1401 	args->na = (int *)(args->stk_base + args->stk_size) - args->stk_offp;
1402 	args->ne = args->na - argc;
1403 
1404 	/*
1405 	 * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, and
1406 	 * AT_SUN_EMULATOR strings to the stack.
1407 	 */
1408 	if (auxvpp != NULL && *auxvpp != NULL) {
1409 		if ((error = stk_add(args, platform, UIO_SYSSPACE)) != 0)
1410 			return (error);
1411 		if ((error = stk_add(args, args->pathname, UIO_SYSSPACE)) != 0)
1412 			return (error);
1413 		if (args->brandname != NULL &&
1414 		    (error = stk_add(args, args->brandname,
1415 			UIO_SYSSPACE)) != 0)
1416 			return (error);
1417 		if (args->emulator != NULL &&
1418 		    (error = stk_add(args, args->emulator,
1419 			UIO_SYSSPACE)) != 0)
1420 			return (error);
1421 	}
1422 
1423 	/*
1424 	 * Compute the size of the stack.  This includes all the pointers,
1425 	 * the space reserved for the aux vector, and all the strings.
1426 	 * The total number of pointers is args->na (which is argc + envc)
1427 	 * plus 4 more: (1) a pointer's worth of space for argc; (2) the NULL
1428 	 * after the last argument (i.e. argv[argc]); (3) the NULL after the
1429 	 * last environment variable (i.e. envp[envc]); and (4) the NULL after
1430 	 * all the strings, at the very top of the stack.
1431 	 */
1432 	size = (args->na + 4) * args->to_ptrsize + args->auxsize +
1433 	    (args->stk_strp - args->stk_base);
1434 
1435 	/*
1436 	 * Pad the string section with zeroes to align the stack size.
1437 	 */
1438 	pad = P2NPHASE(size, args->stk_align);
1439 
1440 	if (STK_AVAIL(args) < pad)
1441 		return (E2BIG);
1442 
1443 	args->usrstack_size = size + pad;
1444 
1445 	while (pad-- != 0)
1446 		*args->stk_strp++ = 0;
1447 
1448 	args->nc = args->stk_strp - args->stk_base;
1449 
1450 	return (0);
1451 }
1452 
1453 static int
1454 stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up)
1455 {
1456 	size_t ptrsize = args->to_ptrsize;
1457 	ssize_t pslen;
1458 	char *kstrp = args->stk_base;
1459 	char *ustrp = usrstack - args->nc - ptrsize;
1460 	char *usp = usrstack - args->usrstack_size;
1461 	int *offp = (int *)(args->stk_base + args->stk_size);
1462 	int envc = args->ne;
1463 	int argc = args->na - envc;
1464 	int i;
1465 
1466 	/*
1467 	 * Record argc for /proc.
1468 	 */
1469 	up->u_argc = argc;
1470 
1471 	/*
1472 	 * Put argc on the stack.  Note that even though it's an int,
1473 	 * it always consumes ptrsize bytes (for alignment).
1474 	 */
1475 	if (stk_putptr(args, usp, (char *)(uintptr_t)argc))
1476 		return (-1);
1477 
1478 	/*
1479 	 * Add argc space (ptrsize) to usp and record argv for /proc.
1480 	 */
1481 	up->u_argv = (uintptr_t)(usp += ptrsize);
1482 
1483 	/*
1484 	 * Put the argv[] pointers on the stack.
1485 	 */
1486 	for (i = 0; i < argc; i++, usp += ptrsize)
1487 		if (stk_putptr(args, usp, &ustrp[*--offp]))
1488 			return (-1);
1489 
1490 	/*
1491 	 * Copy arguments to u_psargs.
1492 	 */
1493 	pslen = MIN(args->arglen, PSARGSZ) - 1;
1494 	for (i = 0; i < pslen; i++)
1495 		up->u_psargs[i] = (kstrp[i] == '\0' ? ' ' : kstrp[i]);
1496 	while (i < PSARGSZ)
1497 		up->u_psargs[i++] = '\0';
1498 
1499 	/*
1500 	 * Add space for argv[]'s NULL terminator (ptrsize) to usp and
1501 	 * record envp for /proc.
1502 	 */
1503 	up->u_envp = (uintptr_t)(usp += ptrsize);
1504 
1505 	/*
1506 	 * Put the envp[] pointers on the stack.
1507 	 */
1508 	for (i = 0; i < envc; i++, usp += ptrsize)
1509 		if (stk_putptr(args, usp, &ustrp[*--offp]))
1510 			return (-1);
1511 
1512 	/*
1513 	 * Add space for envp[]'s NULL terminator (ptrsize) to usp and
1514 	 * remember where the stack ends, which is also where auxv begins.
1515 	 */
1516 	args->stackend = usp += ptrsize;
1517 
1518 	/*
1519 	 * Put all the argv[], envp[], and auxv strings on the stack.
1520 	 */
1521 	if (copyout(args->stk_base, ustrp, args->nc))
1522 		return (-1);
1523 
1524 	/*
1525 	 * Fill in the aux vector now that we know the user stack addresses
1526 	 * for the AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME and
1527 	 * AT_SUN_EMULATOR strings.
1528 	 */
1529 	if (auxvpp != NULL && *auxvpp != NULL) {
1530 		if (args->to_model == DATAMODEL_NATIVE) {
1531 			auxv_t **a = (auxv_t **)auxvpp;
1532 			ADDAUX(*a, AT_SUN_PLATFORM, (long)&ustrp[*--offp])
1533 			ADDAUX(*a, AT_SUN_EXECNAME, (long)&ustrp[*--offp])
1534 			if (args->brandname != NULL)
1535 				ADDAUX(*a,
1536 				    AT_SUN_BRANDNAME, (long)&ustrp[*--offp])
1537 			if (args->emulator != NULL)
1538 				ADDAUX(*a,
1539 				    AT_SUN_EMULATOR, (long)&ustrp[*--offp])
1540 		} else {
1541 			auxv32_t **a = (auxv32_t **)auxvpp;
1542 			ADDAUX(*a,
1543 			    AT_SUN_PLATFORM, (int)(uintptr_t)&ustrp[*--offp])
1544 			ADDAUX(*a,
1545 			    AT_SUN_EXECNAME, (int)(uintptr_t)&ustrp[*--offp])
1546 			if (args->brandname != NULL)
1547 				ADDAUX(*a, AT_SUN_BRANDNAME,
1548 				    (int)(uintptr_t)&ustrp[*--offp])
1549 			if (args->emulator != NULL)
1550 				ADDAUX(*a, AT_SUN_EMULATOR,
1551 				    (int)(uintptr_t)&ustrp[*--offp])
1552 		}
1553 	}
1554 
1555 	return (0);
1556 }
1557 
#ifdef DEBUG
/*
 * DEBUG-only overrides for the heap (brk) and stack page size codes that
 * exec_args() assigns to a process.  Zero leaves the chosen size alone;
 * -1 picks a size code derived from gethrtime(); any other value is used
 * modulo page_num_pagesizes().
 */
int mpss_brkpgszsel = 0;
int mpss_stkpgszsel = 0;
#endif
1562 
/*
 * Initialize a new user stack with the specified arguments and environment.
 * The initial user stack layout is as follows:
 *
 *	User Stack
 *	+---------------+ <--- curproc->p_usrstack
 *	| NULL		|
 *	+---------------+
 *	|		|
 *	| auxv strings	|
 *	|		|
 *	+---------------+
 *	|		|
 *	| envp strings	|
 *	|		|
 *	+---------------+
 *	|		|
 *	| argv strings	|
 *	|		|
 *	+---------------+ <--- ustrp
 *	|		|
 *	| aux vector	|
 *	|		|
 *	+---------------+ <--- auxv
 *	| NULL		|
 *	+---------------+
 *	| envp[envc-1]	|
 *	+---------------+
 *	| ...		|
 *	+---------------+
 *	| envp[0]	|
 *	+---------------+ <--- envp[]
 *	| NULL		|
 *	+---------------+
 *	| argv[argc-1]	|
 *	+---------------+
 *	| ...		|
 *	+---------------+
 *	| argv[0]	|
 *	+---------------+ <--- argv[]
 *	| argc		|
 *	+---------------+ <--- stack base
 *
 * The work is done in three phases:
 *
 *   1. The argument, environment and aux strings are staged in a kernel
 *	buffer by stk_copyin().  The buffer starts at PAGESIZE and is
 *	doubled each time stk_copyin() reports E2BIG or ENAMETOOLONG, until
 *	the buffer size reaches args->ncargs.
 *   2. Per-process state tied to the old image is torn down and the old
 *	address space is released with relvm(); a new address space is
 *	then allocated.
 *   3. stk_copyout() writes the staged data to the new user stack.
 *
 * uap, intp and auxvpp are handed through to stk_copyin()/stk_copyout().
 * On entry this function reads args->to_model, stk_prot, dat_prot,
 * execswp, brkpageszc and stkpageszc; it fills in the from/to pointer
 * sizes, ncargs, stk_align and the staging-buffer fields.
 *
 * Returns 0 on success.  Before relvm() the return value is an errno
 * (E2BIG, the error from stk_copyin(), or the error from exitlwps()) and
 * the old image is still intact.  After relvm() the only failure value is
 * the -1 from stk_copyout(), which means the caller must SIGKILL the
 * process.  The staging buffer is freed on every return path.
 */
int
exec_args(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
{
	size_t size;
	int error;
	proc_t *p = ttoproc(curthread);
	user_t *up = PTOU(p);
	char *usrstack;
	rctl_entity_p_t e;

	struct as *as;

	/*
	 * Pointer width used to read argv[]/envp[] from the old image.
	 */
	args->from_model = p->p_model;
	if (p->p_model == DATAMODEL_NATIVE) {
		args->from_ptrsize = sizeof (long);
	} else {
		args->from_ptrsize = sizeof (int32_t);
	}

	/*
	 * Pointer width, argument size limit, stack alignment and stack
	 * top address for the new image.
	 */
	if (args->to_model == DATAMODEL_NATIVE) {
		args->to_ptrsize = sizeof (long);
		args->ncargs = NCARGS;
		args->stk_align = STACK_ALIGN;
		usrstack = (char *)USRSTACK;
	} else {
		args->to_ptrsize = sizeof (int32_t);
		args->ncargs = NCARGS32;
		args->stk_align = STACK_ALIGN32;
		usrstack = (char *)USRSTACK32;
	}

	ASSERT(P2PHASE((uintptr_t)usrstack, args->stk_align) == 0);

#if defined(__sparc)
	/*
	 * Make sure user register windows are empty before
	 * attempting to make a new stack.
	 */
	(void) flush_user_windows_to_stack(NULL);
#endif

	/*
	 * Stage the strings in a kernel buffer, doubling its size each time
	 * stk_copyin() runs out of room.  Any other error is returned as is,
	 * and we give up with E2BIG once the buffer has reached ncargs.
	 */
	for (size = PAGESIZE; ; size *= 2) {
		args->stk_size = size;
		args->stk_base = kmem_alloc(size, KM_SLEEP);
		args->stk_strp = args->stk_base;
		args->stk_offp = (int *)(args->stk_base + size);
		error = stk_copyin(uap, args, intp, auxvpp);
		if (error == 0)
			break;
		kmem_free(args->stk_base, size);
		if (error != E2BIG && error != ENAMETOOLONG)
			return (error);
		if (size >= args->ncargs)
			return (E2BIG);
	}

	/*
	 * From here on "size" is the size of the user stack contents as
	 * computed by stk_copyin(), not the staging buffer size (which
	 * remains in args->stk_size).
	 */
	size = args->usrstack_size;

	ASSERT(error == 0);
	ASSERT(P2PHASE(size, args->stk_align) == 0);
	ASSERT((ssize_t)STK_AVAIL(args) >= 0);

	if (size > args->ncargs) {
		kmem_free(args->stk_base, args->stk_size);
		return (E2BIG);
	}

	/*
	 * Leave only the current lwp and force the other lwps to exit.
	 * If another lwp beat us to the punch by calling exit(), bail out.
	 */
	if ((error = exitlwps(0)) != 0) {
		kmem_free(args->stk_base, args->stk_size);
		return (error);
	}

	/*
	 * Revoke any doors created by the process.
	 */
	if (p->p_door_list)
		door_exit();

	/*
	 * Release schedctl data structures.
	 */
	if (p->p_pagep)
		schedctl_proc_cleanup();

	/*
	 * Clean up any DTrace helpers for the process.
	 */
	if (p->p_dtrace_helpers != NULL) {
		ASSERT(dtrace_helpers_cleanup != NULL);
		(*dtrace_helpers_cleanup)();
	}

	mutex_enter(&p->p_lock);
	/*
	 * Cleanup the DTrace provider associated with this process.
	 */
	if (p->p_dtrace_probes) {
		ASSERT(dtrace_fasttrap_exec_ptr != NULL);
		dtrace_fasttrap_exec_ptr(p);
	}
	mutex_exit(&p->p_lock);

	/*
	 * discard the lwpchan cache.
	 */
	if (p->p_lcp != NULL)
		lwpchan_destroy_cache(1);

	/*
	 * Delete the POSIX timers.
	 */
	if (p->p_itimer != NULL)
		timer_exit();

#ifdef C2_AUDIT
	/*
	 * Hand the staged argument and environment strings to the audit
	 * subsystem along with their counts (argc, then envc).
	 */
	if (audit_active)
		audit_exec(args->stk_base, args->stk_base + args->arglen,
		    args->na - args->ne, args->ne);
#endif

	/*
	 * Ensure that we don't change resource associations while we
	 * change address spaces.
	 */
	mutex_enter(&p->p_lock);
	pool_barrier_enter();
	mutex_exit(&p->p_lock);

	/*
	 * Destroy the old address space and create a new one.
	 * From here on, any errors are fatal to the exec()ing process.
	 * On error we return -1, which means the caller must SIGKILL
	 * the process.
	 */
	relvm();

	mutex_enter(&p->p_lock);
	pool_barrier_exit();
	mutex_exit(&p->p_lock);

	up->u_execsw = args->execswp;

	/*
	 * Reset the per-process layout for the new image: no heap or stack
	 * yet, the new data model and stack top, and the protections the
	 * caller requested for stack and data.
	 */
	p->p_brkbase = NULL;
	p->p_brksize = 0;
	p->p_stksize = 0;
	p->p_model = args->to_model;
	p->p_usrstack = usrstack;
	p->p_stkprot = args->stk_prot;
	p->p_datprot = args->dat_prot;

	/*
	 * Reset resource controls such that all controls are again active as
	 * well as appropriate to the potentially new address model for the
	 * process.
	 */
	e.rcep_p.proc = p;
	e.rcep_t = RCENTITY_PROCESS;
	rctl_set_reset(p->p_rctls, p, &e);

	/*
	 * Choose heap and stack page size codes.  Unless large pages are
	 * disabled for exec, start from the caller's request and fall back
	 * to map_pgsz() when no size was requested (code 0).
	 */
	if (exec_lpg_disable == 0) {
#ifdef DEBUG
		uint_t pgsizes = page_num_pagesizes();
		uint_t szc;
#endif
		p->p_brkpageszc = args->brkpageszc;
		p->p_stkpageszc = args->stkpageszc;

		if (p->p_brkpageszc == 0) {
			p->p_brkpageszc = page_szc(map_pgsz(MAPPGSZ_HEAP,
			    p, 0, 0, NULL));
		}
		if (p->p_stkpageszc == 0) {
			p->p_stkpageszc = page_szc(map_pgsz(MAPPGSZ_STK,
			    p, 0, 0, NULL));
		}

#ifdef DEBUG
		/*
		 * Debug overrides: -1 derives a size code from gethrtime(),
		 * any other non-zero value is used modulo the number of
		 * page sizes.
		 */
		if (mpss_brkpgszsel != 0) {
			if (mpss_brkpgszsel == -1) {
				szc = ((uint_t)gethrtime() >> 8) % pgsizes;
			} else {
				szc = mpss_brkpgszsel % pgsizes;
			}
			p->p_brkpageszc = szc;
		}

		if (mpss_stkpgszsel != 0) {
			if (mpss_stkpgszsel == -1) {
				szc = ((uint_t)gethrtime() >> 7) % pgsizes;
			} else {
				szc = mpss_stkpgszsel % pgsizes;
			}
			p->p_stkpageszc = szc;
		}

#endif
		mutex_enter(&p->p_lock);
		p->p_flag |= SAUTOLPG;	/* kernel controls page sizes */
		mutex_exit(&p->p_lock);

	} else {
		p->p_brkpageszc = 0;
		p->p_stkpageszc = 0;
	}

	/*
	 * NOTE(review): presumably establishes the initial user stack
	 * pointer for a stack holding "size" bytes -- confirm against
	 * exec_set_sp().
	 */
	exec_set_sp(size);

	/*
	 * Allocate the new address space; a 32-bit process gets the 32-bit
	 * user limit.
	 */
	as = as_alloc();
	p->p_as = as;
	if (p->p_model == DATAMODEL_ILP32)
		as->a_userlimit = (caddr_t)USERLIMIT32;
	(void) hat_setup(as->a_hat, HAT_ALLOC);

	/*
	 * Finally, write out the contents of the new stack.
	 */
	error = stk_copyout(args, usrstack, auxvpp, up);
	kmem_free(args->stk_base, args->stk_size);
	return (error);
}
1830