xref: /illumos-gate/usr/src/uts/sparc/os/syscall.c (revision 0dfe541e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 #include <sys/param.h>
27 #include <sys/vmparam.h>
28 #include <sys/types.h>
29 #include <sys/sysmacros.h>
30 #include <sys/systm.h>
31 #include <sys/cmn_err.h>
32 #include <sys/signal.h>
33 #include <sys/stack.h>
34 #include <sys/cred.h>
35 #include <sys/user.h>
36 #include <sys/debug.h>
37 #include <sys/errno.h>
38 #include <sys/proc.h>
39 #include <sys/var.h>
40 #include <sys/inline.h>
41 #include <sys/syscall.h>
42 #include <sys/ucontext.h>
43 #include <sys/cpuvar.h>
44 #include <sys/siginfo.h>
45 #include <sys/trap.h>
46 #include <sys/machtrap.h>
47 #include <sys/sysinfo.h>
48 #include <sys/procfs.h>
49 #include <sys/prsystm.h>
50 #include <sys/fpu/fpusystm.h>
51 #include <sys/modctl.h>
52 #include <sys/aio_impl.h>
53 #include <c2/audit.h>
54 #include <sys/tnf.h>
55 #include <sys/tnf_probe.h>
56 #include <sys/machpcb.h>
57 #include <sys/privregs.h>
58 #include <sys/copyops.h>
59 #include <sys/timer.h>
60 #include <sys/priv.h>
61 #include <sys/msacct.h>
62 
63 int syscalltrace = 0;
64 #ifdef SYSCALLTRACE
65 static kmutex_t	systrace_lock;		/* syscall tracing lock */
66 #endif /* SYSCALLTRACE */
67 
68 static krwlock_t *lock_syscall(struct sysent *, uint_t);
69 
#ifdef _SYSCALL32_IMPL
/*
 * Pick the system call table that matches the lwp's data model:
 * sysent for a native lwp, sysent32 for anything else.
 */
static struct sysent *
lwp_getsysent(klwp_t *lwp)
{
	return ((lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) ?
	    sysent : sysent32);
}
#define	LWP_GETSYSENT(lwp)	(lwp_getsysent(lwp))
#else
/* Without 32-bit syscall support there is only the one table. */
#define	LWP_GETSYSENT(lwp)	(sysent)
#endif
82 
83 /*
84  * Called to restore the lwp's register window just before
85  * returning to user level (only if the registers have been
86  * fetched or modified through /proc).
87  */
88 /*ARGSUSED1*/
89 void
90 xregrestore(klwp_t *lwp, int shared)
91 {
92 	/*
93 	 * If locals+ins were modified by /proc copy them out.
94 	 * Also copy to the shared window, if necessary.
95 	 */
96 	if (lwp->lwp_pcb.pcb_xregstat == XREGMODIFIED) {
97 		struct machpcb *mpcb = lwptompcb(lwp);
98 		caddr_t sp = (caddr_t)lwptoregs(lwp)->r_sp;
99 
100 		size_t rwinsize;
101 		caddr_t rwp;
102 		int is64;
103 
104 		if (lwp_getdatamodel(lwp) == DATAMODEL_LP64) {
105 			rwinsize = sizeof (struct rwindow);
106 			rwp = sp + STACK_BIAS;
107 			is64 = 1;
108 		} else {
109 			rwinsize = sizeof (struct rwindow32);
110 			sp = (caddr_t)(uintptr_t)(caddr32_t)(uintptr_t)sp;
111 			rwp = sp;
112 			is64 = 0;
113 		}
114 
115 		if (is64)
116 			(void) copyout_nowatch(&lwp->lwp_pcb.pcb_xregs,
117 			    rwp, rwinsize);
118 		else {
119 			struct rwindow32 rwindow32;
120 			int watched;
121 
122 			watched = watch_disable_addr(rwp, rwinsize, S_WRITE);
123 			rwindow_nto32(&lwp->lwp_pcb.pcb_xregs, &rwindow32);
124 			(void) copyout(&rwindow32, rwp, rwinsize);
125 			if (watched)
126 				watch_enable_addr(rwp, rwinsize, S_WRITE);
127 		}
128 
129 		/* also copy to the user return window */
130 		mpcb->mpcb_rsp[0] = sp;
131 		mpcb->mpcb_rsp[1] = NULL;
132 		bcopy(&lwp->lwp_pcb.pcb_xregs, &mpcb->mpcb_rwin[0],
133 		    sizeof (lwp->lwp_pcb.pcb_xregs));
134 	}
135 	lwp->lwp_pcb.pcb_xregstat = XREGNONE;
136 }
137 
138 
139 /*
140  * Get the arguments to the current system call.
141  *	lwp->lwp_ap normally points to the out regs in the reg structure.
142  *	If the user is going to change the out registers and might want to
143  *	get the args (for /proc tracing), it must copy the args elsewhere
144  *	via save_syscall_args().
145  */
146 uint_t
147 get_syscall_args(klwp_t *lwp, long *argp, int *nargsp)
148 {
149 	kthread_t	*t = lwptot(lwp);
150 	uint_t	code = t->t_sysnum;
151 	long	mask;
152 	long	*ap;
153 	int	nargs;
154 
155 	if (lwptoproc(lwp)->p_model == DATAMODEL_ILP32)
156 		mask = (uint32_t)0xffffffffU;
157 	else
158 		mask = 0xffffffffffffffff;
159 
160 	if (code != 0 && code < NSYSCALL) {
161 
162 		nargs = LWP_GETSYSENT(lwp)[code].sy_narg;
163 
164 		ASSERT(nargs <= MAXSYSARGS);
165 
166 		*nargsp = nargs;
167 		ap = lwp->lwp_ap;
168 		while (nargs-- > 0)
169 			*argp++ = *ap++ & mask;
170 	} else {
171 		*nargsp = 0;
172 	}
173 	return (code);
174 }
175 
#ifdef _SYSCALL32_IMPL
/*
 * 32-bit flavour of get_syscall_args(): fetch the arguments as longs,
 * then store each one through argp narrowed to an int.  Returns the
 * syscall number; *nargsp is set by get_syscall_args().
 */
uint_t
get_syscall32_args(klwp_t *lwp, int *argp, int *nargsp)
{
	long	wide[MAXSYSARGS];
	uint_t	sysnum;
	uint_t	idx;

	sysnum = get_syscall_args(lwp, wide, nargsp);
	for (idx = 0; idx != *nargsp; idx++)
		argp[idx] = (int)wide[idx];
	return (sysnum);
}
#endif
192 
193 /*
194  *	Save the system call arguments in a safe place.
195  *	lwp->lwp_ap normally points to the out regs in the reg structure.
196  *	If the user is going to change the out registers, g1, or the stack,
197  *	and might want to get the args (for /proc tracing), it must copy
198  *	the args elsewhere via save_syscall_args().
199  *
200  *	This may be called from stop() even when we're not in a system call.
201  *	Since there's no easy way to tell, this must be safe (not panic).
202  *	If the copyins get data faults, return non-zero.
203  */
204 int
205 save_syscall_args()
206 {
207 	kthread_t	*t = curthread;
208 	klwp_t		*lwp = ttolwp(t);
209 	struct regs	*rp = lwptoregs(lwp);
210 	uint_t		code = t->t_sysnum;
211 	uint_t		nargs;
212 	int		i;
213 	caddr_t		ua;
214 	model_t		datamodel;
215 
216 	if (lwp->lwp_argsaved || code == 0)
217 		return (0);		/* args already saved or not needed */
218 
219 	if (code >= NSYSCALL) {
220 		nargs = 0;		/* illegal syscall */
221 	} else {
222 		struct sysent *se = LWP_GETSYSENT(lwp);
223 		struct sysent *callp = se + code;
224 
225 		nargs = callp->sy_narg;
226 		if (LOADABLE_SYSCALL(callp) && nargs == 0) {
227 			krwlock_t	*module_lock;
228 
229 			/*
230 			 * Find out how many arguments the system
231 			 * call uses.
232 			 *
233 			 * We have the property that loaded syscalls
234 			 * never change the number of arguments they
235 			 * use after they've been loaded once.  This
236 			 * allows us to stop for /proc tracing without
237 			 * holding the module lock.
238 			 * /proc is assured that sy_narg is valid.
239 			 */
240 			module_lock = lock_syscall(se, code);
241 			nargs = callp->sy_narg;
242 			rw_exit(module_lock);
243 		}
244 	}
245 
246 	/*
247 	 * Fetch the system call arguments.
248 	 */
249 	if (nargs == 0)
250 		goto out;
251 
252 
253 	ASSERT(nargs <= MAXSYSARGS);
254 
255 	if ((datamodel = lwp_getdatamodel(lwp)) == DATAMODEL_ILP32) {
256 
257 		if (rp->r_g1 == 0) {	/* indirect syscall */
258 
259 			lwp->lwp_arg[0] = (uint32_t)rp->r_o1;
260 			lwp->lwp_arg[1] = (uint32_t)rp->r_o2;
261 			lwp->lwp_arg[2] = (uint32_t)rp->r_o3;
262 			lwp->lwp_arg[3] = (uint32_t)rp->r_o4;
263 			lwp->lwp_arg[4] = (uint32_t)rp->r_o5;
264 			if (nargs > 5) {
265 				ua = (caddr_t)(uintptr_t)(caddr32_t)(uintptr_t)
266 				    (rp->r_sp + MINFRAME32);
267 				for (i = 5; i < nargs; i++) {
268 					uint32_t a;
269 					if (fuword32(ua, &a) != 0)
270 						return (-1);
271 					lwp->lwp_arg[i] = a;
272 					ua += sizeof (a);
273 				}
274 			}
275 		} else {
276 			lwp->lwp_arg[0] = (uint32_t)rp->r_o0;
277 			lwp->lwp_arg[1] = (uint32_t)rp->r_o1;
278 			lwp->lwp_arg[2] = (uint32_t)rp->r_o2;
279 			lwp->lwp_arg[3] = (uint32_t)rp->r_o3;
280 			lwp->lwp_arg[4] = (uint32_t)rp->r_o4;
281 			lwp->lwp_arg[5] = (uint32_t)rp->r_o5;
282 			if (nargs > 6) {
283 				ua = (caddr_t)(uintptr_t)(caddr32_t)(uintptr_t)
284 				    (rp->r_sp + MINFRAME32);
285 				for (i = 6; i < nargs; i++) {
286 					uint32_t a;
287 					if (fuword32(ua, &a) != 0)
288 						return (-1);
289 					lwp->lwp_arg[i] = a;
290 					ua += sizeof (a);
291 				}
292 			}
293 		}
294 	} else {
295 		ASSERT(datamodel == DATAMODEL_LP64);
296 		lwp->lwp_arg[0] = rp->r_o0;
297 		lwp->lwp_arg[1] = rp->r_o1;
298 		lwp->lwp_arg[2] = rp->r_o2;
299 		lwp->lwp_arg[3] = rp->r_o3;
300 		lwp->lwp_arg[4] = rp->r_o4;
301 		lwp->lwp_arg[5] = rp->r_o5;
302 		if (nargs > 6) {
303 			ua = (caddr_t)rp->r_sp + MINFRAME + STACK_BIAS;
304 			for (i = 6; i < nargs; i++) {
305 				unsigned long a;
306 				if (fulword(ua, &a) != 0)
307 					return (-1);
308 				lwp->lwp_arg[i] = a;
309 				ua += sizeof (a);
310 			}
311 		}
312 	}
313 
314 out:
315 	lwp->lwp_ap = lwp->lwp_arg;
316 	lwp->lwp_argsaved = 1;
317 	t->t_post_sys = 1;	/* so lwp_ap will be reset */
318 	return (0);
319 }
320 
321 void
322 reset_syscall_args(void)
323 {
324 	klwp_t *lwp = ttolwp(curthread);
325 
326 	lwp->lwp_ap = (long *)&lwptoregs(lwp)->r_o0;
327 	lwp->lwp_argsaved = 0;
328 }
329 
330 /*
331  * nonexistent system call-- signal lwp (may want to handle it)
332  * flag error if lwp won't see signal immediately
333  * This works for old or new calling sequence.
334  */
335 int64_t
336 nosys(void)
337 {
338 	tsignal(curthread, SIGSYS);
339 	return ((int64_t)set_errno(ENOSYS));
340 }
341 
342 int
343 nosys32(void)
344 {
345 	return (nosys());
346 }
347 
348 /*
349  * Perform pre-system-call processing, including stopping for tracing,
350  * auditing, microstate-accounting, etc.
351  *
352  * This routine is called only if the t_pre_sys flag is set.  Any condition
353  * requiring pre-syscall handling must set the t_pre_sys flag.  If the
354  * condition is persistent, this routine will repost t_pre_sys.
355  */
356 int
357 pre_syscall(int arg0)
358 {
359 	unsigned int code;
360 	kthread_t *t = curthread;
361 	proc_t *p = ttoproc(t);
362 	klwp_t *lwp = ttolwp(t);
363 	struct regs *rp = lwptoregs(lwp);
364 	int	repost;
365 
366 	t->t_pre_sys = repost = 0;	/* clear pre-syscall processing flag */
367 
368 	ASSERT(t->t_schedflag & TS_DONT_SWAP);
369 
370 	syscall_mstate(LMS_USER, LMS_SYSTEM);
371 
372 	/*
373 	 * The syscall arguments in the out registers should be pointed to
374 	 * by lwp_ap.  If the args need to be copied so that the outs can
375 	 * be changed without losing the ability to get the args for /proc,
376 	 * they can be saved by save_syscall_args(), and lwp_ap will be
377 	 * restored by post_syscall().
378 	 */
379 	ASSERT(lwp->lwp_ap == (long *)&rp->r_o0);
380 
381 	/*
382 	 * Make sure the thread is holding the latest credentials for the
383 	 * process.  The credentials in the process right now apply to this
384 	 * thread for the entire system call.
385 	 */
386 	if (t->t_cred != p->p_cred) {
387 		cred_t *oldcred = t->t_cred;
388 		/*
389 		 * DTrace accesses t_cred in probe context.  t_cred must
390 		 * always be either NULL, or point to a valid, allocated cred
391 		 * structure.
392 		 */
393 		t->t_cred = crgetcred();
394 		crfree(oldcred);
395 	}
396 
397 	/*
398 	 * Undo special arrangements to single-step the lwp
399 	 * so that a debugger will see valid register contents.
400 	 * Also so that the pc is valid for syncfpu().
401 	 * Also so that a syscall like exec() can be stepped.
402 	 */
403 	if (lwp->lwp_pcb.pcb_step != STEP_NONE) {
404 		(void) prundostep();
405 		repost = 1;
406 	}
407 
408 	/*
409 	 * Check for indirect system call in case we stop for tracing.
410 	 * Don't allow multiple indirection.
411 	 */
412 	code = t->t_sysnum;
413 	if (code == 0 && arg0 != 0) {		/* indirect syscall */
414 		code = arg0;
415 		t->t_sysnum = arg0;
416 	}
417 
418 	/*
419 	 * From the proc(4) manual page:
420 	 * When entry to a system call is being traced, the traced process
421 	 * stops after having begun the call to the system but before the
422 	 * system call arguments have been fetched from the process.
423 	 * If proc changes the args we must refetch them after starting.
424 	 */
425 	if (PTOU(p)->u_systrap) {
426 		if (prismember(&PTOU(p)->u_entrymask, code)) {
427 			/*
428 			 * Recheck stop condition, now that lock is held.
429 			 */
430 			mutex_enter(&p->p_lock);
431 			if (PTOU(p)->u_systrap &&
432 			    prismember(&PTOU(p)->u_entrymask, code)) {
433 				stop(PR_SYSENTRY, code);
434 				/*
435 				 * Must refetch args since they were
436 				 * possibly modified by /proc.  Indicate
437 				 * that the valid copy is in the
438 				 * registers.
439 				 */
440 				lwp->lwp_argsaved = 0;
441 				lwp->lwp_ap = (long *)&rp->r_o0;
442 			}
443 			mutex_exit(&p->p_lock);
444 		}
445 		repost = 1;
446 	}
447 
448 	if (lwp->lwp_sysabort) {
449 		/*
450 		 * lwp_sysabort may have been set via /proc while the process
451 		 * was stopped on PR_SYSENTRY.  If so, abort the system call.
452 		 * Override any error from the copyin() of the arguments.
453 		 */
454 		lwp->lwp_sysabort = 0;
455 		(void) set_errno(EINTR); /* sets post-sys processing */
456 		t->t_pre_sys = 1;	/* repost anyway */
457 		return (1);		/* don't do system call, return EINTR */
458 	}
459 
460 	/* begin auditing for this syscall */
461 	if (audit_active == C2AUDIT_LOADED) {
462 		uint32_t auditing = au_zone_getstate(NULL);
463 
464 		if (auditing & AU_AUDIT_MASK) {
465 			int error;
466 			if (error = audit_start(T_SYSCALL, code, auditing, \
467 			    0, lwp)) {
468 				t->t_pre_sys = 1;	/* repost anyway */
469 				lwp->lwp_error = 0;	/* for old drivers */
470 				return (error);
471 			}
472 			repost = 1;
473 		}
474 	}
475 
476 #ifndef NPROBE
477 	/* Kernel probe */
478 	if (tnf_tracing_active) {
479 		TNF_PROBE_1(syscall_start, "syscall thread", /* CSTYLED */,
480 			tnf_sysnum,	sysnum,		t->t_sysnum);
481 		t->t_post_sys = 1;	/* make sure post_syscall runs */
482 		repost = 1;
483 	}
484 #endif /* NPROBE */
485 
486 #ifdef SYSCALLTRACE
487 	if (syscalltrace) {
488 		int i;
489 		long *ap;
490 		char *cp;
491 		char *sysname;
492 		struct sysent *callp;
493 
494 		if (code >= NSYSCALL)
495 			callp = &nosys_ent;	/* nosys has no args */
496 		else
497 			callp = LWP_GETSYSENT(lwp) + code;
498 		(void) save_syscall_args();
499 		mutex_enter(&systrace_lock);
500 		printf("%d: ", p->p_pid);
501 		if (code >= NSYSCALL)
502 			printf("0x%x", code);
503 		else {
504 			sysname = mod_getsysname(code);
505 			printf("%s[0x%x]", sysname == NULL ? "NULL" :
506 			    sysname, code);
507 		}
508 		cp = "(";
509 		for (i = 0, ap = lwp->lwp_ap; i < callp->sy_narg; i++, ap++) {
510 			printf("%s%lx", cp, *ap);
511 			cp = ", ";
512 		}
513 		if (i)
514 			printf(")");
515 		printf(" %s id=0x%p\n", PTOU(p)->u_comm, curthread);
516 		mutex_exit(&systrace_lock);
517 	}
518 #endif /* SYSCALLTRACE */
519 
520 	/*
521 	 * If there was a continuing reason for pre-syscall processing,
522 	 * set the t_pre_sys flag for the next system call.
523 	 */
524 	if (repost)
525 		t->t_pre_sys = 1;
526 	lwp->lwp_error = 0;	/* for old drivers */
527 	lwp->lwp_badpriv = PRIV_NONE;	/* for privilege tracing */
528 	return (0);
529 }
530 
531 /*
532  * Post-syscall processing.  Perform abnormal system call completion
533  * actions such as /proc tracing, profiling, signals, preemption, etc.
534  *
535  * This routine is called only if t_post_sys, t_sig_check, or t_astflag is set.
536  * Any condition requiring post-syscall handling must set one of these.
537  * If the condition is persistent, this routine will repost t_post_sys.
538  */
539 void
540 post_syscall(long rval1, long rval2)
541 {
542 	kthread_t	*t = curthread;
543 	proc_t	*p = curproc;
544 	klwp_t	*lwp = ttolwp(t);
545 	struct regs *rp = lwptoregs(lwp);
546 	uint_t	error;
547 	int	code = t->t_sysnum;
548 	int	repost = 0;
549 	int	proc_stop = 0;		/* non-zero if stopping for /proc */
550 	int	sigprof = 0;		/* non-zero if sending SIGPROF */
551 
552 	t->t_post_sys = 0;
553 
554 	error = lwp->lwp_errno;
555 
556 	/*
557 	 * Code can be zero if this is a new LWP returning after a forkall(),
558 	 * other than the one which matches the one in the parent which called
559 	 * forkall().  In these LWPs, skip most of post-syscall activity.
560 	 */
561 	if (code == 0)
562 		goto sig_check;
563 
564 	/* put out audit record for this syscall */
565 	if (AU_AUDITING()) {
566 		rval_t	rval;	/* fix audit_finish() someday */
567 
568 		/* XX64 -- truncation of 64-bit return values? */
569 		rval.r_val1 = (int)rval1;
570 		rval.r_val2 = (int)rval2;
571 		audit_finish(T_SYSCALL, code, error, &rval);
572 		repost = 1;
573 	}
574 
575 	if (curthread->t_pdmsg != NULL) {
576 		char *m = curthread->t_pdmsg;
577 
578 		uprintf("%s", m);
579 		kmem_free(m, strlen(m) + 1);
580 		curthread->t_pdmsg = NULL;
581 	}
582 
583 	/*
584 	 * If we're going to stop for /proc tracing, set the flag and
585 	 * save the arguments so that the return values don't smash them.
586 	 */
587 	if (PTOU(p)->u_systrap) {
588 		if (prismember(&PTOU(p)->u_exitmask, code)) {
589 			proc_stop = 1;
590 			(void) save_syscall_args();
591 		}
592 		repost = 1;
593 	}
594 
595 	/*
596 	 * Similarly check to see if SIGPROF might be sent.
597 	 */
598 	if (curthread->t_rprof != NULL &&
599 	    curthread->t_rprof->rp_anystate != 0) {
600 		(void) save_syscall_args();
601 		sigprof = 1;
602 	}
603 
604 	if (lwp->lwp_eosys == NORMALRETURN) {
605 		if (error == 0) {
606 #ifdef SYSCALLTRACE
607 			if (syscalltrace) {
608 				mutex_enter(&systrace_lock);
609 				printf(
610 				    "%d: r_val1=0x%lx, r_val2=0x%lx, id 0x%p\n",
611 				    p->p_pid, rval1, rval2, curthread);
612 				mutex_exit(&systrace_lock);
613 			}
614 #endif /* SYSCALLTRACE */
615 			rp->r_tstate &= ~TSTATE_IC;
616 			rp->r_o0 = rval1;
617 			rp->r_o1 = rval2;
618 		} else {
619 			int sig;
620 
621 #ifdef SYSCALLTRACE
622 			if (syscalltrace) {
623 				mutex_enter(&systrace_lock);
624 				printf("%d: error=%d, id 0x%p\n",
625 				    p->p_pid, error, curthread);
626 				mutex_exit(&systrace_lock);
627 			}
628 #endif /* SYSCALLTRACE */
629 			if (error == EINTR && t->t_activefd.a_stale)
630 				error = EBADF;
631 			if (error == EINTR &&
632 			    (sig = lwp->lwp_cursig) != 0 &&
633 			    sigismember(&PTOU(p)->u_sigrestart, sig) &&
634 			    PTOU(p)->u_signal[sig - 1] != SIG_DFL &&
635 			    PTOU(p)->u_signal[sig - 1] != SIG_IGN)
636 				error = ERESTART;
637 			rp->r_o0 = error;
638 			rp->r_tstate |= TSTATE_IC;
639 		}
640 		/*
641 		 * The default action is to redo the trap instruction.
642 		 * We increment the pc and npc past it for NORMALRETURN.
643 		 * JUSTRETURN has set up a new pc and npc already.
644 		 * If we are a cloned thread of forkall(), don't
645 		 * adjust here because we have already inherited
646 		 * the adjusted values from our clone.
647 		 */
648 		if (!(t->t_flag & T_FORKALL)) {
649 			rp->r_pc = rp->r_npc;
650 			rp->r_npc += 4;
651 		}
652 	}
653 
654 	/*
655 	 * From the proc(4) manual page:
656 	 * When exit from a system call is being traced, the traced process
657 	 * stops on completion of the system call just prior to checking for
658 	 * signals and returning to user level.  At this point all return
659 	 * values have been stored into the traced process's saved registers.
660 	 */
661 	if (proc_stop) {
662 		mutex_enter(&p->p_lock);
663 		if (PTOU(p)->u_systrap &&
664 		    prismember(&PTOU(p)->u_exitmask, code))
665 			stop(PR_SYSEXIT, code);
666 		mutex_exit(&p->p_lock);
667 	}
668 
669 	/*
670 	 * If we are the parent returning from a successful
671 	 * vfork, wait for the child to exec or exit.
672 	 * This code must be here and not in the bowels of the system
673 	 * so that /proc can intercept exit from vfork in a timely way.
674 	 */
675 	if (t->t_flag & T_VFPARENT) {
676 		ASSERT(code == SYS_vfork || code == SYS_forksys);
677 		ASSERT(rp->r_o1 == 0 && error == 0);
678 		vfwait((pid_t)rval1);
679 		t->t_flag &= ~T_VFPARENT;
680 	}
681 
682 	/*
683 	 * If profiling is active, bill the current PC in user-land
684 	 * and keep reposting until profiling is disabled.
685 	 */
686 	if (p->p_prof.pr_scale) {
687 		if (lwp->lwp_oweupc)
688 			profil_tick(rp->r_pc);
689 		repost = 1;
690 	}
691 
692 sig_check:
693 	/*
694 	 * Reset flag for next time.
695 	 * We must do this after stopping on PR_SYSEXIT
696 	 * because /proc uses the information in lwp_eosys.
697 	 */
698 	lwp->lwp_eosys = NORMALRETURN;
699 	clear_stale_fd();
700 	t->t_flag &= ~T_FORKALL;
701 
702 	if (t->t_astflag | t->t_sig_check) {
703 		/*
704 		 * Turn off the AST flag before checking all the conditions that
705 		 * may have caused an AST.  This flag is on whenever a signal or
706 		 * unusual condition should be handled after the next trap or
707 		 * syscall.
708 		 */
709 		astoff(t);
710 		t->t_sig_check = 0;
711 
712 		/*
713 		 * The following check is legal for the following reasons:
714 		 *	1) The thread we are checking, is ourselves, so there is
715 		 *	   no way the proc can go away.
716 		 *	2) The only time we need to be protected by the
717 		 *	   lock is if the binding is changed.
718 		 *
719 		 *	Note we will still take the lock and check the binding
720 		 *	if the condition was true without the lock held.  This
721 		 *	prevents lock contention among threads owned by the
722 		 *	same proc.
723 		 */
724 
725 		if (curthread->t_proc_flag & TP_CHANGEBIND) {
726 			mutex_enter(&p->p_lock);
727 			if (curthread->t_proc_flag & TP_CHANGEBIND) {
728 				timer_lwpbind();
729 				curthread->t_proc_flag &= ~TP_CHANGEBIND;
730 			}
731 			mutex_exit(&p->p_lock);
732 		}
733 
734 		/*
735 		 * for kaio requests on the special kaio poll queue,
736 		 * copyout their results to user memory.
737 		 */
738 		if (p->p_aio)
739 			aio_cleanup(0);
740 
741 		/*
742 		 * If this LWP was asked to hold, call holdlwp(), which will
743 		 * stop.  holdlwps() sets this up and calls pokelwps() which
744 		 * sets the AST flag.
745 		 *
746 		 * Also check TP_EXITLWP, since this is used by fresh new LWPs
747 		 * through lwp_rtt().  That flag is set if the lwp_create(2)
748 		 * syscall failed after creating the LWP.
749 		 */
750 		if (ISHOLD(p) || (t->t_proc_flag & TP_EXITLWP))
751 			holdlwp();
752 
753 		/*
754 		 * All code that sets signals and makes ISSIG_PENDING
755 		 * evaluate true must set t_sig_check afterwards.
756 		 */
757 		if (ISSIG_PENDING(t, lwp, p)) {
758 			if (issig(FORREAL))
759 				psig();
760 			t->t_sig_check = 1;	/* recheck next time */
761 		}
762 
763 		if (sigprof) {
764 			int nargs = (code > 0 && code < NSYSCALL)?
765 			    LWP_GETSYSENT(lwp)[code].sy_narg : 0;
766 			realsigprof(code, nargs, error);
767 			t->t_sig_check = 1;	/* recheck next time */
768 		}
769 
770 		/*
771 		 * If a performance counter overflow interrupt was
772 		 * delivered *during* the syscall, then re-enable the
773 		 * AST so that we take a trip through trap() to cause
774 		 * the SIGEMT to be delivered.
775 		 */
776 		if (lwp->lwp_pcb.pcb_flags & CPC_OVERFLOW)
777 			aston(t);
778 
779 		/*
780 		 * If an asynchronous hardware error is pending, turn AST flag
781 		 * back on.  AST will be checked again before we return to user
782 		 * mode and we'll come back through trap() to handle the error.
783 		 */
784 		if (lwp->lwp_pcb.pcb_flags & ASYNC_HWERR)
785 			aston(t);
786 	}
787 
788 	/*
789 	 * Restore register window if a debugger modified it.
790 	 * Set up to perform a single-step if a debugger requested it.
791 	 */
792 	if (lwp->lwp_pcb.pcb_xregstat != XREGNONE)
793 		xregrestore(lwp, 1);
794 
795 	lwp->lwp_errno = 0;		/* clear error for next time */
796 
797 #ifndef NPROBE
798 	/* Kernel probe */
799 	if (tnf_tracing_active) {
800 		TNF_PROBE_3(syscall_end, "syscall thread", /* CSTYLED */,
801 		    tnf_long,	rval1,		rval1,
802 		    tnf_long,	rval2,		rval2,
803 		    tnf_long,	errno,		(long)error);
804 		repost = 1;
805 	}
806 #endif /* NPROBE */
807 
808 	/*
809 	 * Set state to LWP_USER here so preempt won't give us a kernel
810 	 * priority if it occurs after this point.  Call CL_TRAPRET() to
811 	 * restore the user-level priority.
812 	 *
813 	 * It is important that no locks (other than spinlocks) be entered
814 	 * after this point before returning to user mode (unless lwp_state
815 	 * is set back to LWP_SYS).
816 	 *
817 	 * Sampled times past this point are charged to the user.
818 	 */
819 	lwp->lwp_state = LWP_USER;
820 
821 	if (t->t_trapret) {
822 		t->t_trapret = 0;
823 		thread_lock(t);
824 		CL_TRAPRET(t);
825 		thread_unlock(t);
826 	}
827 	if (CPU->cpu_runrun || t->t_schedflag & TS_ANYWAITQ)
828 		preempt();
829 	prunstop();
830 
831 	/*
832 	 * t_post_sys will be set if pcb_step is active.
833 	 */
834 	if (lwp->lwp_pcb.pcb_step != STEP_NONE) {
835 		prdostep();
836 		repost = 1;
837 	}
838 
839 	t->t_sysnum = 0;	/* no longer in a system call */
840 
841 	/*
842 	 * In case the args were copied to the lwp, reset the
843 	 * pointer so the next syscall will have the right lwp_ap pointer.
844 	 */
845 	lwp->lwp_ap = (long *)&rp->r_o0;
846 	lwp->lwp_argsaved = 0;
847 
848 	/*
849 	 * If there was a continuing reason for post-syscall processing,
850 	 * set the t_post_sys flag for the next system call.
851 	 */
852 	if (repost)
853 		t->t_post_sys = 1;
854 
855 	/*
856 	 * If there is a ustack registered for this lwp, and the stack rlimit
857 	 * has been altered, read in the ustack. If the saved stack rlimit
858 	 * matches the bounds of the ustack, update the ustack to reflect
859 	 * the new rlimit. If the new stack rlimit is RLIM_INFINITY, disable
860 	 * stack checking by setting the size to 0.
861 	 */
862 	if (lwp->lwp_ustack != 0 && lwp->lwp_old_stk_ctl != 0) {
863 		rlim64_t new_size;
864 		model_t model;
865 		caddr_t top;
866 		struct rlimit64 rl;
867 
868 		mutex_enter(&p->p_lock);
869 		new_size = p->p_stk_ctl;
870 		model = p->p_model;
871 		top = p->p_usrstack;
872 		(void) rctl_rlimit_get(rctlproc_legacy[RLIMIT_STACK], p, &rl);
873 		mutex_exit(&p->p_lock);
874 
875 		if (rl.rlim_cur == RLIM64_INFINITY)
876 			new_size = 0;
877 
878 		if (model == DATAMODEL_NATIVE) {
879 			stack_t stk;
880 
881 			if (copyin((stack_t *)lwp->lwp_ustack, &stk,
882 			    sizeof (stack_t)) == 0 &&
883 			    (stk.ss_size == lwp->lwp_old_stk_ctl ||
884 			    stk.ss_size == 0) &&
885 			    stk.ss_sp == top - stk.ss_size) {
886 				stk.ss_sp = (void *)((uintptr_t)stk.ss_sp +
887 				    stk.ss_size - new_size);
888 				stk.ss_size = new_size;
889 
890 				(void) copyout(&stk,
891 				    (stack_t *)lwp->lwp_ustack,
892 				    sizeof (stack_t));
893 			}
894 		} else {
895 			stack32_t stk32;
896 
897 			if (copyin((stack32_t *)lwp->lwp_ustack, &stk32,
898 			    sizeof (stack32_t)) == 0 &&
899 			    (stk32.ss_size == lwp->lwp_old_stk_ctl ||
900 			    stk32.ss_size == 0) &&
901 			    stk32.ss_sp ==
902 			    (caddr32_t)(uintptr_t)(top - stk32.ss_size)) {
903 				stk32.ss_sp += stk32.ss_size - new_size;
904 				stk32.ss_size = new_size;
905 
906 				(void) copyout(&stk32,
907 				    (stack32_t *)lwp->lwp_ustack,
908 				    sizeof (stack32_t));
909 			}
910 		}
911 
912 		lwp->lwp_old_stk_ctl = 0;
913 	}
914 
915 	syscall_mstate(LMS_SYSTEM, LMS_USER);
916 }
917 
918 /*
919  * Call a system call which takes a pointer to the user args struct and
920  * a pointer to the return values.  This is a bit slower than the standard
921  * C arg-passing method in some cases.
922  */
923 int64_t
924 syscall_ap()
925 {
926 	uint_t	error;
927 	struct sysent *callp;
928 	rval_t	rval;
929 	klwp_t	*lwp = ttolwp(curthread);
930 	struct regs *rp = lwptoregs(lwp);
931 
932 	callp = LWP_GETSYSENT(lwp) + curthread->t_sysnum;
933 
934 	/*
935 	 * If the arguments don't fit in registers %o0 - o5, make sure they
936 	 * have been copied to the lwp_arg array.
937 	 */
938 	if (callp->sy_narg > 6 && save_syscall_args())
939 		return ((int64_t)set_errno(EFAULT));
940 
941 	rval.r_val1 = 0;
942 	rval.r_val2 = (int)rp->r_o1;
943 	lwp->lwp_error = 0;	/* for old drivers */
944 	error = (*(callp->sy_call))(lwp->lwp_ap, &rval);
945 	if (error)
946 		return ((int64_t)set_errno(error));
947 	return (rval.r_vals);
948 }
949 
950 /*
951  * Load system call module.
952  *	Returns with pointer to held read lock for module.
953  */
954 static krwlock_t *
955 lock_syscall(struct sysent *table, uint_t code)
956 {
957 	krwlock_t	*module_lock;
958 	struct modctl	*modp;
959 	int		id;
960 	struct sysent   *callp;
961 
962 	module_lock = table[code].sy_lock;
963 	callp = &table[code];
964 
965 	/*
966 	 * Optimization to only call modload if we don't have a loaded
967 	 * syscall.
968 	 */
969 	rw_enter(module_lock, RW_READER);
970 	if (LOADED_SYSCALL(callp))
971 		return (module_lock);
972 	rw_exit(module_lock);
973 
974 	for (;;) {
975 		if ((id = modload("sys", syscallnames[code])) == -1)
976 			break;
977 
978 		/*
979 		 * If we loaded successfully at least once, the modctl
980 		 * will still be valid, so we try to grab it by filename.
981 		 * If this call fails, it's because the mod_filename
982 		 * was changed after the call to modload() (mod_hold_by_name()
983 		 * is the likely culprit).  We can safely just take
984 		 * another lap if this is the case;  the modload() will
985 		 * change the mod_filename back to one by which we can
986 		 * find the modctl.
987 		 */
988 		modp = mod_find_by_filename("sys", syscallnames[code]);
989 
990 		if (modp == NULL)
991 			continue;
992 
993 		mutex_enter(&mod_lock);
994 
995 		if (!modp->mod_installed) {
996 			mutex_exit(&mod_lock);
997 			continue;
998 		}
999 		break;
1000 	}
1001 
1002 	rw_enter(module_lock, RW_READER);
1003 
1004 	if (id != -1)
1005 		mutex_exit(&mod_lock);
1006 
1007 	return (module_lock);
1008 }
1009 
1010 /*
1011  * Loadable syscall support.
1012  *	If needed, load the module, then reserve it by holding a read
1013  *	lock for the duration of the call.
1014  *	Later, if the syscall is not unloadable, it could patch the vector.
1015  */
1016 /*ARGSUSED*/
1017 int64_t
1018 loadable_syscall(
1019     long a0, long a1, long a2, long a3,
1020     long a4, long a5, long a6, long a7)
1021 {
1022 	int64_t		rval;
1023 	struct sysent	*callp;
1024 	struct sysent	*se = LWP_GETSYSENT(ttolwp(curthread));
1025 	krwlock_t	*module_lock;
1026 	int		code;
1027 
1028 	code = curthread->t_sysnum;
1029 	callp = se + code;
1030 
1031 	/*
1032 	 * Try to autoload the system call if necessary.
1033 	 */
1034 	module_lock = lock_syscall(se, code);
1035 	THREAD_KPRI_RELEASE();	/* drop priority given by rw_enter */
1036 
1037 	/*
1038 	 * we've locked either the loaded syscall or nosys
1039 	 */
1040 	if (callp->sy_flags & SE_ARGC) {
1041 		int64_t (*sy_call)();
1042 
1043 		sy_call = (int64_t (*)())callp->sy_call;
1044 		rval = (*sy_call)(a0, a1, a2, a3, a4, a5);
1045 	} else {
1046 		rval = syscall_ap();
1047 	}
1048 
1049 	THREAD_KPRI_REQUEST();	/* regain priority from read lock */
1050 	rw_exit(module_lock);
1051 	return (rval);
1052 }
1053 
1054 /*
1055  * Handle indirect system calls.
1056  *	This interface should be deprecated.  The library can handle
1057  *	this more efficiently, but keep this implementation for old binaries.
1058  *
1059  * XX64	Needs some work.
1060  */
1061 int64_t
1062 indir(int code, long a0, long a1, long a2, long a3, long a4)
1063 {
1064 	klwp_t		*lwp = ttolwp(curthread);
1065 	struct sysent	*callp;
1066 
1067 	if (code <= 0 || code >= NSYSCALL)
1068 		return (nosys());
1069 
1070 	ASSERT(lwp->lwp_ap != NULL);
1071 
1072 	curthread->t_sysnum = code;
1073 	callp = LWP_GETSYSENT(lwp) + code;
1074 
1075 	/*
1076 	 * Handle argument setup, unless already done in pre_syscall().
1077 	 */
1078 	if (callp->sy_narg > 5) {
1079 		if (save_syscall_args())	/* move args to LWP array */
1080 			return ((int64_t)set_errno(EFAULT));
1081 	} else if (!lwp->lwp_argsaved) {
1082 		long *ap;
1083 
1084 		ap = lwp->lwp_ap;		/* args haven't been saved */
1085 		lwp->lwp_ap = ap + 1;		/* advance arg pointer */
1086 		curthread->t_post_sys = 1;	/* so lwp_ap will be reset */
1087 	}
1088 	return ((*callp->sy_callc)(a0, a1, a2, a3, a4, lwp->lwp_arg[5]));
1089 }
1090 
1091 /*
1092  * set_errno - set an error return from the current system call.
1093  *	This could be a macro.
1094  *	This returns the value it is passed, so that the caller can
1095  *	use tail-recursion-elimination and do return (set_errno(ERRNO));
1096  */
1097 uint_t
1098 set_errno(uint_t error)
1099 {
1100 	ASSERT(error != 0);		/* must not be used to clear errno */
1101 
1102 	curthread->t_post_sys = 1;	/* have post_syscall do error return */
1103 	return (ttolwp(curthread)->lwp_errno = error);
1104 }
1105 
1106 /*
1107  * set_proc_pre_sys - Set pre-syscall processing for entire process.
1108  */
1109 void
1110 set_proc_pre_sys(proc_t *p)
1111 {
1112 	kthread_t	*t;
1113 	kthread_t	*first;
1114 
1115 	ASSERT(MUTEX_HELD(&p->p_lock));
1116 
1117 	t = first = p->p_tlist;
1118 	do {
1119 		t->t_pre_sys = 1;
1120 	} while ((t = t->t_forw) != first);
1121 }
1122 
1123 /*
1124  * set_proc_post_sys - Set post-syscall processing for entire process.
1125  */
1126 void
1127 set_proc_post_sys(proc_t *p)
1128 {
1129 	kthread_t	*t;
1130 	kthread_t	*first;
1131 
1132 	ASSERT(MUTEX_HELD(&p->p_lock));
1133 
1134 	t = first = p->p_tlist;
1135 	do {
1136 		t->t_post_sys = 1;
1137 	} while ((t = t->t_forw) != first);
1138 }
1139 
1140 /*
1141  * set_proc_sys - Set pre- and post-syscall processing for entire process.
1142  */
1143 void
1144 set_proc_sys(proc_t *p)
1145 {
1146 	kthread_t	*t;
1147 	kthread_t	*first;
1148 
1149 	ASSERT(MUTEX_HELD(&p->p_lock));
1150 
1151 	t = first = p->p_tlist;
1152 	do {
1153 		t->t_pre_sys = 1;
1154 		t->t_post_sys = 1;
1155 	} while ((t = t->t_forw) != first);
1156 }
1157 
1158 /*
1159  * set_all_proc_sys - set pre- and post-syscall processing flags for all
1160  * user processes.
1161  *
1162  * This is needed when auditing, tracing, or other facilities which affect
1163  * all processes are turned on.
1164  */
1165 void
1166 set_all_proc_sys()
1167 {
1168 	kthread_t	*t;
1169 	kthread_t	*first;
1170 
1171 	mutex_enter(&pidlock);
1172 	t = first = curthread;
1173 	do {
1174 		t->t_pre_sys = 1;
1175 		t->t_post_sys = 1;
1176 	} while ((t = t->t_next) != first);
1177 	mutex_exit(&pidlock);
1178 }
1179 
1180 /*
1181  * set_all_zone_usr_proc_sys - set pre- and post-syscall processing flags for
1182  * all user processes running in the zone of the current process
1183  *
1184  * This is needed when auditing is turned on.
1185  */
1186 void
1187 set_all_zone_usr_proc_sys(zoneid_t zoneid)
1188 {
1189 	proc_t	    *p;
1190 	kthread_t   *t;
1191 
1192 	mutex_enter(&pidlock);
1193 	for (p = practive; p != NULL; p = p->p_next) {
1194 		/* skip kernel processes */
1195 		if (p->p_exec == NULLVP || p->p_as == &kas ||
1196 		    p->p_stat == SIDL || p->p_stat == SZOMB ||
1197 		    (p->p_flag & (SSYS | SEXITING | SEXITLWPS)))
1198 			continue;
1199 		/*
1200 		 * Only processes in the given zone (eventually in
1201 		 * all zones) are taken into account
1202 		 */
1203 		if (zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid) {
1204 			mutex_enter(&p->p_lock);
1205 			if ((t = p->p_tlist) == NULL) {
1206 				mutex_exit(&p->p_lock);
1207 				continue;
1208 			}
1209 			/*
1210 			 * Set pre- and post-syscall processing flags
1211 			 * for all threads of the process
1212 			 */
1213 			do {
1214 				t->t_pre_sys = 1;
1215 				t->t_post_sys = 1;
1216 			} while (p->p_tlist != (t = t->t_forw));
1217 			mutex_exit(&p->p_lock);
1218 		}
1219 	}
1220 	mutex_exit(&pidlock);
1221 }
1222 
1223 /*
1224  * set_proc_ast - Set asynchronous service trap (AST) flag for all
1225  * threads in process.
1226  */
1227 void
1228 set_proc_ast(proc_t *p)
1229 {
1230 	kthread_t	*t;
1231 	kthread_t	*first;
1232 
1233 	ASSERT(MUTEX_HELD(&p->p_lock));
1234 
1235 	t = first = p->p_tlist;
1236 	do {
1237 		aston(t);
1238 	} while ((t = t->t_forw) != first);
1239 }
1240