xref: /illumos-gate/usr/src/uts/common/os/vm_subr.c (revision 23d9e5ac)
17c478bd9Sstevel@tonic-gate /*
27c478bd9Sstevel@tonic-gate  * CDDL HEADER START
37c478bd9Sstevel@tonic-gate  *
47c478bd9Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
5*23d9e5acSMichael Corcoran  * Common Development and Distribution License (the "License").
6*23d9e5acSMichael Corcoran  * You may not use this file except in compliance with the License.
77c478bd9Sstevel@tonic-gate  *
87c478bd9Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
97c478bd9Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
107c478bd9Sstevel@tonic-gate  * See the License for the specific language governing permissions
117c478bd9Sstevel@tonic-gate  * and limitations under the License.
127c478bd9Sstevel@tonic-gate  *
137c478bd9Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
147c478bd9Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
157c478bd9Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
167c478bd9Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
177c478bd9Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
187c478bd9Sstevel@tonic-gate  *
197c478bd9Sstevel@tonic-gate  * CDDL HEADER END
207c478bd9Sstevel@tonic-gate  */
217c478bd9Sstevel@tonic-gate /*
22*23d9e5acSMichael Corcoran  * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
237c478bd9Sstevel@tonic-gate  */
247c478bd9Sstevel@tonic-gate 
257c478bd9Sstevel@tonic-gate /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
267c478bd9Sstevel@tonic-gate /*	  All Rights Reserved  	*/
277c478bd9Sstevel@tonic-gate 
287c478bd9Sstevel@tonic-gate /*
297c478bd9Sstevel@tonic-gate  * University Copyright- Copyright (c) 1982, 1986, 1988
307c478bd9Sstevel@tonic-gate  * The Regents of the University of California
317c478bd9Sstevel@tonic-gate  * All Rights Reserved
327c478bd9Sstevel@tonic-gate  *
337c478bd9Sstevel@tonic-gate  * University Acknowledgment- Portions of this document are derived from
347c478bd9Sstevel@tonic-gate  * software developed by the University of California, Berkeley, and its
357c478bd9Sstevel@tonic-gate  * contributors.
367c478bd9Sstevel@tonic-gate  */
377c478bd9Sstevel@tonic-gate 
387c478bd9Sstevel@tonic-gate #include <sys/types.h>
397c478bd9Sstevel@tonic-gate #include <sys/t_lock.h>
407c478bd9Sstevel@tonic-gate #include <sys/param.h>
417c478bd9Sstevel@tonic-gate #include <sys/errno.h>
427c478bd9Sstevel@tonic-gate #include <sys/debug.h>
437c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h>
447c478bd9Sstevel@tonic-gate #include <sys/kmem.h>
457c478bd9Sstevel@tonic-gate #include <sys/sysmacros.h>
467c478bd9Sstevel@tonic-gate #include <sys/inline.h>
477c478bd9Sstevel@tonic-gate #include <sys/buf.h>
487c478bd9Sstevel@tonic-gate #include <sys/uio.h>
497c478bd9Sstevel@tonic-gate #include <sys/user.h>
507c478bd9Sstevel@tonic-gate #include <sys/proc.h>
517c478bd9Sstevel@tonic-gate #include <sys/systm.h>
527c478bd9Sstevel@tonic-gate #include <sys/vmsystm.h>
537c478bd9Sstevel@tonic-gate #include <sys/cpuvar.h>
547c478bd9Sstevel@tonic-gate #include <sys/mman.h>
557c478bd9Sstevel@tonic-gate #include <sys/cred.h>
567c478bd9Sstevel@tonic-gate #include <sys/vnode.h>
577c478bd9Sstevel@tonic-gate #include <sys/file.h>
587c478bd9Sstevel@tonic-gate #include <sys/vm.h>
597c478bd9Sstevel@tonic-gate 
607c478bd9Sstevel@tonic-gate #include <sys/swap.h>
617c478bd9Sstevel@tonic-gate #include <sys/vtrace.h>
627c478bd9Sstevel@tonic-gate #include <sys/tnf_probe.h>
637c478bd9Sstevel@tonic-gate #include <sys/fs/snode.h>
647c478bd9Sstevel@tonic-gate #include <sys/copyops.h>
657c478bd9Sstevel@tonic-gate #include <sys/conf.h>
667c478bd9Sstevel@tonic-gate #include <sys/sdt.h>
677c478bd9Sstevel@tonic-gate 
687c478bd9Sstevel@tonic-gate #include <vm/anon.h>
697c478bd9Sstevel@tonic-gate #include <vm/hat.h>
707c478bd9Sstevel@tonic-gate #include <vm/as.h>
717c478bd9Sstevel@tonic-gate #include <vm/seg.h>
727c478bd9Sstevel@tonic-gate #include <vm/page.h>
737c478bd9Sstevel@tonic-gate #include <vm/seg_vn.h>
747c478bd9Sstevel@tonic-gate #include <vm/seg_kmem.h>
757c478bd9Sstevel@tonic-gate 
767c478bd9Sstevel@tonic-gate extern int maxphys;
777c478bd9Sstevel@tonic-gate 
787c478bd9Sstevel@tonic-gate void
797c478bd9Sstevel@tonic-gate minphys(struct buf *bp)
807c478bd9Sstevel@tonic-gate {
817c478bd9Sstevel@tonic-gate 	if (bp->b_bcount > maxphys)
827c478bd9Sstevel@tonic-gate 		bp->b_bcount = maxphys;
837c478bd9Sstevel@tonic-gate }
847c478bd9Sstevel@tonic-gate 
857c478bd9Sstevel@tonic-gate /*
867c478bd9Sstevel@tonic-gate  * use kmem_cache_create for physio buffers. This has shown
877c478bd9Sstevel@tonic-gate  * a better cache distribution compared to buffers on the
887c478bd9Sstevel@tonic-gate  * stack. It also avoids semaphore construction/deconstruction
897c478bd9Sstevel@tonic-gate  * per request
907c478bd9Sstevel@tonic-gate  */
917c478bd9Sstevel@tonic-gate 
/* cache of pre-initialized buf headers, used when the caller passes bp == NULL */
static struct kmem_cache *physio_buf_cache;
937c478bd9Sstevel@tonic-gate 
/* ARGSUSED */
/*
 * kmem cache constructor: initialize the buf (bioinit) once per cached
 * object instead of once per I/O request. Always succeeds.
 */
static int
physio_buf_constructor(void *buf, void *cdrarg, int kmflags)
{
	bioinit((struct buf *)buf);
	return (0);
}
1017c478bd9Sstevel@tonic-gate 
/* ARGSUSED */
/*
 * kmem cache destructor: tear down what physio_buf_constructor set up.
 */
static void
physio_buf_destructor(void *buf, void *cdrarg)
{
	biofini((struct buf *)buf);
}
1087c478bd9Sstevel@tonic-gate 
/*
 * Create the physio buf header cache; called once at system startup.
 */
void
physio_bufs_init(void)
{
	physio_buf_cache = kmem_cache_create("physio_buf_cache",
	    sizeof (struct buf), 0, physio_buf_constructor,
	    physio_buf_destructor, NULL, NULL, NULL, 0);
}
1167c478bd9Sstevel@tonic-gate 
1177c478bd9Sstevel@tonic-gate 
1187c478bd9Sstevel@tonic-gate 
1197c478bd9Sstevel@tonic-gate /*
1207c478bd9Sstevel@tonic-gate  * initiate raw I/O request
1217c478bd9Sstevel@tonic-gate  *
1227c478bd9Sstevel@tonic-gate  * allocate buf header if necessary
1237c478bd9Sstevel@tonic-gate  * adjust max size of each I/O request
1247c478bd9Sstevel@tonic-gate  * lock down user pages and verify access protections
1257c478bd9Sstevel@tonic-gate  * call driver's strategy routine to submit request
1267c478bd9Sstevel@tonic-gate  * wait for I/O completion
1277c478bd9Sstevel@tonic-gate  * unlock user pages and free allocated buf header
1287c478bd9Sstevel@tonic-gate  */
1297c478bd9Sstevel@tonic-gate 
int
default_physio(int (*strat)(struct buf *), struct buf *bp, dev_t dev,
	int rw, void (*mincnt)(struct buf *), struct uio *uio)
{
	struct iovec *iov;
	struct proc *procp;
	struct as *asp;
	ssize_t c;		/* bytes attempted in the current chunk */
	char *a;		/* base address of the current chunk */
	int error = 0;
	page_t **pplist;	/* shadow page list from as_pagelock() */
	int allocbuf = 0;	/* nonzero if bp was taken from the cache */

	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_START, "physio_start: bp %p", bp);

	/* Kernel probe */
	TNF_PROBE_4(physio_start, "io rawio", /* CSTYLED */,
	    tnf_device,		device,		dev,
	    tnf_offset,		offset,		uio->uio_loffset,
	    tnf_size,		size,		uio->uio_resid,
	    tnf_bioflags,	rw,		rw);

	/* account the raw read or write in the per-CPU system stats */
	if (rw == B_READ) {
		CPU_STATS_ADD_K(sys, phread, 1);
	} else {
		CPU_STATS_ADD_K(sys, phwrite, 1);
	}

	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_GETBUF_START,
	    "getbuf_start: bp %p", bp);

	/*
	 * Caller didn't supply a buf header; allocate a pre-initialized
	 * one from the cache (freed again at the bottom, see allocbuf).
	 */
	if (bp == NULL) {
		bp = kmem_cache_alloc(physio_buf_cache, KM_SLEEP);
		bp->b_iodone = NULL;
		bp->b_resid = 0;
		allocbuf = 1;
	}
	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_GETBUF_END, "getbuf_end: bp %p", bp);

	/*
	 * Pick the address space whose pages get locked: the current
	 * process's for a user-space request, the kernel's (kas) otherwise.
	 */
	if (uio->uio_segflg == UIO_USERSPACE) {
		procp = ttoproc(curthread);
		asp = procp->p_as;
	} else {
		procp = NULL;
		asp = &kas;
	}
	ASSERT(SEMA_HELD(&bp->b_sem));

	/*
	 * We need to prepare this buffer for the io:::start probe, including
	 * NULL'ing out the file, clearing the offset, and filling in the
	 * b_dip field.
	 */
	bp->b_file = NULL;
	bp->b_offset = -1;

	if (dev != NODEV) {
		(void) devopsp[getmajor(dev)]->devo_getinfo(NULL,
		    DDI_INFO_DEVT2DEVINFO, (void *)dev, (void **)&bp->b_dip);
	} else {
		bp->b_dip = NULL;
	}

	/*
	 * Outer loop walks the iovec array; inner loop carves each iovec
	 * into chunks no larger than the driver's limit (set via *mincnt).
	 */
	while (uio->uio_iovcnt > 0) {
		iov = uio->uio_iov;

		bp->b_error = 0;
		bp->b_proc = procp;

		while (iov->iov_len > 0) {
			if (uio->uio_resid == 0)
				break;
			if (uio->uio_loffset < 0) {
				error = EINVAL;
				break;
			}
#ifdef	_ILP32
			/*
			 * For 32-bit kernels, check against SPEC_MAXOFFSET_T
			 * which represents the maximum size that can be
			 * supported by the IO subsystem.
			 * XXX this code assumes a D_64BIT driver.
			 */
			if (uio->uio_loffset > SPEC_MAXOFFSET_T) {
				error = EINVAL;
				break;
			}
#endif	/* _ILP32 */
			bp->b_flags = B_BUSY | B_PHYS | rw;
			bp->b_edev = dev;
			bp->b_lblkno = btodt(uio->uio_loffset);

			/*
			 * Don't count on b_addr remaining untouched by the
			 * code below (it may be reset because someone does
			 * a bp_mapin on the buffer) -- reset from the iov
			 * each time through, updating the iov's base address
			 * instead.
			 */
			a = bp->b_un.b_addr = iov->iov_base;
			bp->b_bcount = MIN(iov->iov_len, uio->uio_resid);
			(*mincnt)(bp);
			c = bp->b_bcount;

			TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_LOCK_START,
			    "as_pagelock_start: bp %p", bp);

			/*
			 * Lock the pages down. A device read writes into
			 * memory, so the required access is the inverse of
			 * the transfer direction (S_WRITE for B_READ).
			 */
			error = as_pagelock(asp, &pplist, a,
			    c, rw == B_READ? S_WRITE : S_READ);

			TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_LOCK_END,
			    "as_pagelock_end:");

			if (error != 0) {
				/* mark the buf failed and drop busy state */
				bp->b_flags |= B_ERROR;
				bp->b_error = error;
				bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS);
				break;
			}
			bp->b_shadow = pplist;
			if (pplist != NULL) {
				bp->b_flags |= B_SHADOW;
			}

			/* DTrace io provider start probe */
			DTRACE_IO1(start, struct buf *, bp);
			bp->b_flags |= B_STARTED;

			/* hand the request to the driver and wait for it */
			(void) (*strat)(bp);
			error = biowait(bp);

			/*
			 * unlock the pages
			 */
			TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_UNLOCK_START,
			    "as_pageunlock_start: bp %p", bp);

			as_pageunlock(asp, pplist, a, c,
			    rw == B_READ? S_WRITE : S_READ);

			TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_UNLOCK_END,
			    "as_pageunlock_end:");

			/*
			 * Advance the uio by the bytes actually moved
			 * (requested count minus the driver's residual).
			 */
			c -= bp->b_resid;
			iov->iov_base += c;
			iov->iov_len -= c;
			uio->uio_resid -= c;
			uio->uio_loffset += c;
			/* bp->b_resid - temp kludge for tape drives */
			if (bp->b_resid || error)
				break;
		}
		bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
		/* bp->b_resid - temp kludge for tape drives */
		if (bp->b_resid || error)
			break;
		uio->uio_iov++;
		uio->uio_iovcnt--;
	}

	/* free the header iff we allocated it above */
	if (allocbuf) {
		kmem_cache_free(physio_buf_cache, bp);
	}

	/* Kernel probe */
	TNF_PROBE_1(physio_end, "io rawio", /* CSTYLED */,
		tnf_device,	device,		dev);

	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_END, "physio_end: bp %p", bp);

	return (error);
}
3017c478bd9Sstevel@tonic-gate 
3027c478bd9Sstevel@tonic-gate /*
3037c478bd9Sstevel@tonic-gate  * Returns 0 on success, or an error on failure.
3047c478bd9Sstevel@tonic-gate  *
3057c478bd9Sstevel@tonic-gate  * This function is no longer a part of the DDI/DKI.
3067c478bd9Sstevel@tonic-gate  * However, for compatibility, its interface should not
3077c478bd9Sstevel@tonic-gate  * be changed and it should not be removed from the kernel.
3087c478bd9Sstevel@tonic-gate  */
3097c478bd9Sstevel@tonic-gate int
3107c478bd9Sstevel@tonic-gate useracc(void *addr, size_t count, int access)
3117c478bd9Sstevel@tonic-gate {
3127c478bd9Sstevel@tonic-gate 	uint_t prot;
3137c478bd9Sstevel@tonic-gate 
3147c478bd9Sstevel@tonic-gate 	prot = PROT_USER | ((access == B_READ) ? PROT_READ : PROT_WRITE);
3157c478bd9Sstevel@tonic-gate 	return (as_checkprot(ttoproc(curthread)->p_as, addr, count, prot));
3167c478bd9Sstevel@tonic-gate }
3177c478bd9Sstevel@tonic-gate 
3187c478bd9Sstevel@tonic-gate #define	MAX_MAPIN_PAGES	8
3197c478bd9Sstevel@tonic-gate 
3207c478bd9Sstevel@tonic-gate /*
3217c478bd9Sstevel@tonic-gate  * This function temporarily "borrows" user pages for kernel use. If
3227c478bd9Sstevel@tonic-gate  * "cow" is on, it also sets up copy-on-write protection (only feasible
3237c478bd9Sstevel@tonic-gate  * on MAP_PRIVATE segment) on the user mappings, to protect the borrowed
3247c478bd9Sstevel@tonic-gate  * pages from any changes by the user. The caller is responsible for
3257c478bd9Sstevel@tonic-gate  * unlocking and tearing down cow settings when it's done with the pages.
3267c478bd9Sstevel@tonic-gate  * For an example, see kcfree().
3277c478bd9Sstevel@tonic-gate  *
3287c478bd9Sstevel@tonic-gate  * Pages behind [uaddr..uaddr+*lenp] under address space "as" are locked
3297c478bd9Sstevel@tonic-gate  * (shared), and mapped into kernel address range [kaddr..kaddr+*lenp] if
3307c478bd9Sstevel@tonic-gate  * kaddr != -1. On entering this function, cached_ppp contains a list
3317c478bd9Sstevel@tonic-gate  * of pages that are mapped into [kaddr..kaddr+*lenp] already (from a
3327c478bd9Sstevel@tonic-gate  * previous call). Thus if same pages remain behind [uaddr..uaddr+*lenp],
3337c478bd9Sstevel@tonic-gate  * the kernel map won't need to be reloaded again.
3347c478bd9Sstevel@tonic-gate  *
3357c478bd9Sstevel@tonic-gate  * For cow == 1, if the pages are anonymous pages, it also bumps the anon
3367c478bd9Sstevel@tonic-gate  * reference count, and change the user-mapping to read-only. This
3377c478bd9Sstevel@tonic-gate  * scheme should work on all types of segment drivers. But to be safe,
3387c478bd9Sstevel@tonic-gate  * we check against segvn here.
3397c478bd9Sstevel@tonic-gate  *
3407c478bd9Sstevel@tonic-gate  * Since this function is used to emulate copyin() semantic, it checks
3417c478bd9Sstevel@tonic-gate  * to make sure the user-mappings allow "user-read".
3427c478bd9Sstevel@tonic-gate  *
3437c478bd9Sstevel@tonic-gate  * On exit "lenp" contains the number of bytes successfully locked and
3447c478bd9Sstevel@tonic-gate  * mapped in. For the unsuccessful ones, the caller can fall back to
3457c478bd9Sstevel@tonic-gate  * copyin().
3467c478bd9Sstevel@tonic-gate  *
3477c478bd9Sstevel@tonic-gate  * Error return:
3487c478bd9Sstevel@tonic-gate  * ENOTSUP - operation like this is not supported either on this segment
3497c478bd9Sstevel@tonic-gate  * type, or on this platform type.
3507c478bd9Sstevel@tonic-gate  */
int
cow_mapin(struct as *as, caddr_t uaddr, caddr_t kaddr, struct page **cached_ppp,
    struct anon **app, size_t *lenp, int cow)
{
	struct		hat *hat;
	struct seg	*seg;
	caddr_t		base;
	page_t		*pp, *ppp[MAX_MAPIN_PAGES];
	long		i;
	int		flags;
	size_t		size, total = *lenp;
	char		first = 1;	/* permit one as_fault retry on FC_NOMAP */
	faultcode_t	res;

	*lenp = 0;
	if (cow) {
		/*
		 * Take the as writer lock for the whole cow setup so that
		 * no cow-fault can race with us before an_refcnt is bumped
		 * (see the comment at "tryagain" below).
		 */
		AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
		seg = as_findseg(as, uaddr, 0);
		/* the entire requested range must lie within one segment */
		if ((seg == NULL) || ((base = seg->s_base) > uaddr) ||
		    (uaddr + total) > base + seg->s_size) {
			AS_LOCK_EXIT(as, &as->a_lock);
			return (EINVAL);
		}
		/*
		 * The COW scheme should work for all segment types.
		 * But to be safe, we check against segvn.
		 */
		if (seg->s_ops != &segvn_ops) {
			AS_LOCK_EXIT(as, &as->a_lock);
			return (ENOTSUP);
		} else if ((SEGOP_GETTYPE(seg, uaddr) & MAP_PRIVATE) == 0) {
			AS_LOCK_EXIT(as, &as->a_lock);
			return (ENOTSUP);
		}
	}
	hat = as->a_hat;
	size = total;
tryagain:
	/*
	 * If (cow), hat_softlock will also change the usr protection to RO.
	 * This is the first step toward setting up cow. Before we
	 * bump up an_refcnt, we can't allow any cow-fault on this
	 * address. Otherwise segvn_fault will change the protection back
	 * to RW upon seeing an_refcnt == 1.
	 * The solution is to hold the writer lock on "as".
	 */
	res = hat_softlock(hat, uaddr, &size, &ppp[0], cow ? HAT_COW : 0);
	/* per the arithmetic below, "size" now holds the bytes NOT locked */
	size = total - size;
	*lenp += size;
	size = size >> PAGESHIFT;	/* convert locked bytes to a page count */
	i = 0;
	while (i < size) {
		pp = ppp[i];
		if (cow) {
			kmutex_t *ahm;
			/*
			 * Another solution is to hold SE_EXCL on pp, and
			 * disable PROT_WRITE. This also works for MAP_SHARED
			 * segment. The disadvantage is that it locks the
			 * page from being used by anybody else.
			 */
			ahm = AH_MUTEX(pp->p_vnode, pp->p_offset);
			mutex_enter(ahm);
			*app = swap_anon(pp->p_vnode, pp->p_offset);
			/*
			 * Since we are holding the as lock, this avoids a
			 * potential race with anon_decref. (segvn_unmap and
			 * segvn_free needs the as writer lock to do anon_free.)
			 */
			if (*app != NULL) {
#if 0
				if ((*app)->an_refcnt == 0)
				/*
				 * Consider the following scenario (unlikely
				 * though):
				 * 1. an_refcnt == 2
				 * 2. we softlock the page.
				 * 3. cow occurs on this addr. So a new ap,
				 * page and mapping is established on addr.
				 * 4. an_refcnt drops to 1 (segvn_faultpage
				 * -> anon_decref(oldap))
				 * 5. the last ref to ap also drops (from
				 * another as). It ends up blocked inside
				 * anon_decref trying to get page's excl lock.
				 * 6. Later kcfree unlocks the page, call
				 * anon_decref -> oops, ap is gone already.
				 *
				 * Holding as writer lock solves all problems.
				 */
					*app = NULL;
				else
#endif
					(*app)->an_refcnt++;
			}
			mutex_exit(ahm);
		} else {
			*app = NULL;
		}
		if (kaddr != (caddr_t)-1) {
			/* reload the kernel mapping only if the page changed */
			if (pp != *cached_ppp) {
				if (*cached_ppp == NULL)
					flags = HAT_LOAD_LOCK | HAT_NOSYNC |
					    HAT_LOAD_NOCONSIST;
				else
					flags = HAT_LOAD_REMAP |
					    HAT_LOAD_NOCONSIST;
				/*
				 * In order to cache the kernel mapping after
				 * the user page is unlocked, we call
				 * hat_devload instead of hat_memload so
				 * that the kernel mapping we set up here is
				 * "invisible" to the rest of the world. This
				 * is not very pretty. But as long as the
				 * caller bears the responsibility of keeping
				 * cache consistency, we should be ok -
				 * HAT_NOCONSIST will get us a uncached
				 * mapping on VAC. hat_softlock will flush
				 * a VAC_WRITEBACK cache. Therefore the kaddr
				 * doesn't have to be of the same vcolor as
				 * uaddr.
				 * The alternative is - change hat_devload
				 * to get a cached mapping. Allocate a kaddr
				 * with the same vcolor as uaddr. Then
				 * hat_softlock won't need to flush the VAC.
				 */
				hat_devload(kas.a_hat, kaddr, PAGESIZE,
				    page_pptonum(pp), PROT_READ, flags);
				*cached_ppp = pp;
			}
			kaddr += PAGESIZE;
		}
		cached_ppp++;
		app++;
		++i;
	}
	if (cow) {
		AS_LOCK_EXIT(as, &as->a_lock);
	}
	if (first && res == FC_NOMAP) {
		/*
		 * If the address is not mapped yet, we call as_fault to
		 * fault the pages in. We could've fallen back to copy and
		 * let it fault in the pages. But for a mapped file, we
		 * normally reference each page only once. For zero-copy to
		 * be of any use, we'd better fall in the page now and try
		 * again.
		 */
		first = 0;
		size = size << PAGESHIFT;	/* page count back to bytes */
		uaddr += size;			/* skip what's already locked */
		total -= size;
		size = total;
		res = as_fault(as->a_hat, as, uaddr, size, F_INVAL, S_READ);
		if (cow)
			AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
		goto tryagain;
	}
	switch (res) {
	case FC_NOSUPPORT:
		return (ENOTSUP);
	case FC_PROT:	/* Pretend we don't know about it. This will be */
			/* caught by the caller when uiomove fails. */
	case FC_NOMAP:
	case FC_OBJERR:
	default:
		return (0);
	}
}
519