/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved  	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/inline.h>
#include <sys/buf.h>
#include <sys/uio.h>
#include <sys/user.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/vmsystm.h>
#include <sys/cpuvar.h>
#include <sys/mman.h>
#include <sys/cred.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/vm.h>

#include <sys/swap.h>
#include <sys/vtrace.h>
#include <sys/tnf_probe.h>
#include <sys/fs/snode.h>
#include <sys/copyops.h>
#include <sys/conf.h>
#include <sys/sdt.h>

#include <vm/anon.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/page.h>
#include <vm/seg_vn.h>
#include <vm/seg_kmem.h>

extern int maxphys;

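/*
 * Trim a transfer request so that it does not exceed the system-wide
 * maximum physical transfer size.  Drivers pass this routine (or a more
 * restrictive one of their own) as the mincnt argument to physio(9F).
 */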
void
minphys(struct buf *bp)
{
	if (bp->b_bcount > maxphys)
		bp->b_bcount = maxphys;
}

/*
 * Use kmem_cache_create() for physio buffers.  This has shown a better
 * cache distribution than buffers on the stack.  It also avoids
 * semaphore construction/destruction per request.
 */

static struct kmem_cache *physio_buf_cache;

/* ARGSUSED */
static int
physio_buf_constructor(void *buf, void *cdrarg, int kmflags)
{
	bioinit((struct buf *)buf);
	return (0);
}

/* ARGSUSED */
static void
physio_buf_destructor(void *buf, void *cdrarg)
{
	biofini((struct buf *)buf);
}

void
physio_bufs_init(void)
{
	physio_buf_cache = kmem_cache_create("physio_buf_cache",
	    sizeof (struct buf), 0,
	    physio_buf_constructor, physio_buf_destructor,
	    NULL, NULL, NULL, 0);
}

/*
 * Initiate a raw I/O request:
 *
 * allocate buf header if necessary
 * adjust max size of each I/O request
 * lock down user pages and verify access protections
 * call driver's strategy routine to submit request
 * wait for I/O completion
 * unlock user pages and free allocated buf header
 */
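/*
 * Illustrative only (not part of this file): a character driver's
 * read(9E) entry point typically reaches this code through physio(9F),
 * passing its own strategy(9E) routine together with minphys() as the
 * transfer-size limiter.  The driver prefix "xx" and its strategy
 * routine below are hypothetical:
 *
 *	static int
 *	xx_read(dev_t dev, struct uio *uiop, cred_t *credp)
 *	{
 *		return (physio(xx_strategy, NULL, dev, B_READ,
 *		    minphys, uiop));
 *	}
 *
 * Passing bp == NULL asks this code to allocate a buf header from
 * physio_buf_cache and free it again before returning.
 */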
int
default_physio(int (*strat)(struct buf *), struct buf *bp, dev_t dev,
    int rw, void (*mincnt)(struct buf *), struct uio *uio)
{
	struct iovec *iov;
	struct proc *procp;
	struct as *asp;
	ssize_t c;
	char *a;
	int error = 0;
	page_t **pplist;
	int allocbuf = 0;

	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_START, "physio_start: bp %p", bp);

	/* Kernel probe */
	TNF_PROBE_4(physio_start, "io rawio", /* CSTYLED */,
	    tnf_device,		device,		dev,
	    tnf_offset,		offset,		uio->uio_loffset,
	    tnf_size,		size,		uio->uio_resid,
	    tnf_bioflags,	rw,		rw);

	if (rw == B_READ) {
		CPU_STATS_ADD_K(sys, phread, 1);
	} else {
		CPU_STATS_ADD_K(sys, phwrite, 1);
	}

	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_GETBUF_START,
	    "getbuf_start: bp %p", bp);

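	/*
	 * If the caller did not supply a buf header, allocate one from
	 * the physio buf cache and remember to free it on the way out.
	 */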
	if (bp == NULL) {
		bp = kmem_cache_alloc(physio_buf_cache, KM_SLEEP);
		bp->b_iodone = NULL;
		bp->b_resid = 0;
		allocbuf = 1;
	}
	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_GETBUF_END, "getbuf_end: bp %p", bp);

	if (uio->uio_segflg == UIO_USERSPACE) {
		procp = ttoproc(curthread);
		asp = procp->p_as;
	} else {
		procp = NULL;
		asp = &kas;
	}
	ASSERT(SEMA_HELD(&bp->b_sem));

	/*
	 * We need to prepare this buffer for the io:::start probe, including
	 * NULL'ing out the file, clearing the offset, and filling in the
	 * b_dip field.
	 */
	bp->b_file = NULL;
	bp->b_offset = -1;

	if (dev != NODEV) {
		(void) devopsp[getmajor(dev)]->devo_getinfo(NULL,
		    DDI_INFO_DEVT2DEVINFO, (void *)dev, (void **)&bp->b_dip);
	} else {
		bp->b_dip = NULL;
	}

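	/*
	 * Loop over the iovecs, breaking each one into transfers no
	 * larger than the driver's (*mincnt)() routine allows.
	 */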
	while (uio->uio_iovcnt > 0) {
		iov = uio->uio_iov;

		bp->b_error = 0;
		bp->b_proc = procp;

		while (iov->iov_len > 0) {
			if (uio->uio_resid == 0)
				break;
			if (uio->uio_loffset < 0) {
				error = EINVAL;
				break;
			}
#ifdef	_ILP32
			/*
			 * For 32-bit kernels, check against SPEC_MAXOFFSET_T
			 * which represents the maximum size that can be
			 * supported by the IO subsystem.
			 * XXX this code assumes a D_64BIT driver.
			 */
			if (uio->uio_loffset > SPEC_MAXOFFSET_T) {
				error = EINVAL;
				break;
			}
#endif	/* _ILP32 */
			bp->b_flags = B_BUSY | B_PHYS | rw;
			bp->b_edev = dev;
			bp->b_lblkno = btodt(uio->uio_loffset);

			/*
			 * Don't count on b_addr remaining untouched by the
			 * code below (it may be reset because someone does
			 * a bp_mapin on the buffer) -- reset from the iov
			 * each time through, updating the iov's base address
			 * instead.
			 */
			a = bp->b_un.b_addr = iov->iov_base;
			bp->b_bcount = MIN(iov->iov_len, uio->uio_resid);
			(*mincnt)(bp);
			c = bp->b_bcount;

			TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_LOCK_START,
			    "as_pagelock_start: bp %p", bp);

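			/*
			 * Lock the user pages in place.  Note that the
			 * access type is inverted: a device read (B_READ)
			 * writes into user memory, hence S_WRITE, and
			 * vice versa.
			 */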
			error = as_pagelock(asp, &pplist, a,
			    c, rw == B_READ ? S_WRITE : S_READ);

			TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_LOCK_END,
			    "as_pagelock_end:");

			if (error != 0) {
				bp->b_flags |= B_ERROR;
				bp->b_error = error;
				bp->b_flags &=
				    ~(B_BUSY|B_WANTED|B_PHYS);
				break;
			}
			bp->b_shadow = pplist;
			if (pplist != NULL) {
				bp->b_flags |= B_SHADOW;
			}

			DTRACE_IO1(start, struct buf *, bp);
			bp->b_flags |= B_STARTED;

			(void) (*strat)(bp);
			error = biowait(bp);

			/*
			 * unlock the pages
			 */
			TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_UNLOCK_START,
			    "as_pageunlock_start: bp %p", bp);

			as_pageunlock(asp, pplist, a, c,
			    rw == B_READ ? S_WRITE : S_READ);

			TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_UNLOCK_END,
			    "as_pageunlock_end:");

			c -= bp->b_resid;
			iov->iov_base += c;
			iov->iov_len -= c;
			uio->uio_resid -= c;
			uio->uio_loffset += c;
			/* bp->b_resid - temp kludge for tape drives */
			if (bp->b_resid || error)
				break;
		}
		bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
		/* bp->b_resid - temp kludge for tape drives */
		if (bp->b_resid || error)
			break;
		uio->uio_iov++;
		uio->uio_iovcnt--;
	}

	if (allocbuf) {
		kmem_cache_free(physio_buf_cache, bp);
	}

	/* Kernel probe */
	TNF_PROBE_1(physio_end, "io rawio", /* CSTYLED */,
	    tnf_device,		device,		dev);

	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_END, "physio_end: bp %p", bp);

	return (error);
}

/*
 * Returns 0 on success, or an error on failure.
 *
 * This function is no longer a part of the DDI/DKI.
 * However, for compatibility, its interface should not
 * be changed and it should not be removed from the kernel.
 */
int
useracc(void *addr, size_t count, int access)
{
	uint_t prot;

	prot = PROT_USER | ((access == B_READ) ? PROT_READ : PROT_WRITE);
	return (as_checkprot(ttoproc(curthread)->p_as, addr, count, prot));
}

#define	MAX_MAPIN_PAGES	8

/*
 * This function temporarily "borrows" user pages for kernel use. If
 * "cow" is on, it also sets up copy-on-write protection (only feasible
 * on a MAP_PRIVATE segment) on the user mappings, to protect the
 * borrowed pages from any changes by the user. The caller is
 * responsible for unlocking and tearing down the cow settings when
 * it's done with the pages. For an example, see kcfree().
 *
 * Pages behind [uaddr..uaddr+*lenp] under address space "as" are locked
 * (shared), and mapped into kernel address range [kaddr..kaddr+*lenp] if
 * kaddr != -1. On entering this function, cached_ppp contains a list
 * of pages that are already mapped into [kaddr..kaddr+*lenp] (from a
 * previous call). Thus if the same pages remain behind
 * [uaddr..uaddr+*lenp], the kernel map won't need to be reloaded again.
 *
 * For cow == 1, if the pages are anonymous pages, it also bumps the anon
 * reference count, and changes the user mapping to read-only. This
 * scheme should work on all types of segment drivers. But to be safe,
 * we check against segvn here.
 *
 * Since this function is used to emulate copyin() semantics, it checks
 * to make sure the user mappings allow "user-read".
 *
 * On exit "lenp" contains the number of bytes successfully locked and
 * mapped in. For the unsuccessful ones, the caller can fall back to
 * copyin().
 *
 * Error return:
 * ENOTSUP - an operation like this is not supported either on this
 * segment type, or on this platform.
 */
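/*
 * Illustrative only (hypothetical caller, not part of this file): a
 * zero-copy consumer might borrow one chunk of a user buffer at a time
 * and fall back to copyin() for whatever could not be locked:
 *
 *	size_t len = MIN(resid, MAX_MAPIN_PAGES * PAGESIZE);
 *	error = cow_mapin(as, uaddr, kaddr, ppp, app, &len, 1);
 *	if (error == 0 && len < resid)
 *		... copyin() the remaining (resid - len) bytes ...
 */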
int
cow_mapin(struct as *as, caddr_t uaddr, caddr_t kaddr, struct page **cached_ppp,
    struct anon **app, size_t *lenp, int cow)
{
	struct		hat *hat;
	struct seg	*seg;
	caddr_t		base;
	page_t		*pp, *ppp[MAX_MAPIN_PAGES];
	long		i;
	int		flags;
	size_t		size, total = *lenp;
	char		first = 1;
	faultcode_t	res;

	*lenp = 0;
	if (cow) {
		AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
		seg = as_findseg(as, uaddr, 0);
		if ((seg == NULL) || ((base = seg->s_base) > uaddr) ||
		    (uaddr + total) > base + seg->s_size) {
			AS_LOCK_EXIT(as, &as->a_lock);
			return (EINVAL);
		}
		/*
		 * The COW scheme should work for all segment types.
		 * But to be safe, we check against segvn.
		 */
		if (seg->s_ops != &segvn_ops) {
			AS_LOCK_EXIT(as, &as->a_lock);
			return (ENOTSUP);
		} else if ((SEGOP_GETTYPE(seg, uaddr) & MAP_PRIVATE) == 0) {
			AS_LOCK_EXIT(as, &as->a_lock);
			return (ENOTSUP);
		}
	}
	hat = as->a_hat;
	size = total;
tryagain:
	/*
	 * If (cow), hat_softlock will also change the user protection to RO.
	 * This is the first step toward setting up cow. Before we
	 * bump up an_refcnt, we can't allow any cow-fault on this
	 * address. Otherwise segvn_fault will change the protection back
	 * to RW upon seeing an_refcnt == 1.
	 * The solution is to hold the writer lock on "as".
	 */
	res = hat_softlock(hat, uaddr, &size, &ppp[0], cow ? HAT_COW : 0);
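	/*
	 * hat_softlock() returns the residual (unlocked) byte count in
	 * "size"; record how many bytes were locked and convert that
	 * into a page count.
	 */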
	size = total - size;
	*lenp += size;
	size = size >> PAGESHIFT;
	i = 0;
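	/*
	 * For each page that was locked: take an anon reference if we
	 * are setting up copy-on-write, and (re)load the kernel mapping
	 * if one was requested.
	 */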
	while (i < size) {
		pp = ppp[i];
		if (cow) {
			kmutex_t *ahm;
			/*
			 * Another solution is to hold SE_EXCL on pp, and
			 * disable PROT_WRITE. This also works for MAP_SHARED
			 * segment. The disadvantage is that it locks the
			 * page from being used by anybody else.
			 */
			ahm = &anonhash_lock[
			    AH_LOCK(pp->p_vnode, pp->p_offset)];
			mutex_enter(ahm);
			*app = swap_anon(pp->p_vnode, pp->p_offset);
			/*
			 * Since we are holding the as lock, this avoids a
			 * potential race with anon_decref. (segvn_unmap and
			 * segvn_free need the as writer lock to do anon_free.)
			 */
			if (*app != NULL) {
#if 0
				if ((*app)->an_refcnt == 0)
				/*
				 * Consider the following scenario (unlikely
				 * though):
				 * 1. an_refcnt == 2
				 * 2. we softlock the page.
				 * 3. cow occurs on this addr. So a new ap,
				 * page and mapping is established on addr.
				 * 4. an_refcnt drops to 1 (segvn_faultpage
				 * -> anon_decref(oldap))
				 * 5. the last ref to ap also drops (from
				 * another as). It ends up blocked inside
				 * anon_decref trying to get page's excl lock.
				 * 6. Later kcfree unlocks the page, calls
				 * anon_decref -> oops, ap is gone already.
				 *
				 * Holding the as writer lock solves all
				 * problems.
				 */
					*app = NULL;
				else
#endif
					(*app)->an_refcnt++;
			}
			mutex_exit(ahm);
		} else {
			*app = NULL;
		}
		if (kaddr != (caddr_t)-1) {
			if (pp != *cached_ppp) {
				if (*cached_ppp == NULL)
					flags = HAT_LOAD_LOCK | HAT_NOSYNC |
					    HAT_LOAD_NOCONSIST;
				else
					flags = HAT_LOAD_REMAP |
					    HAT_LOAD_NOCONSIST;
				/*
				 * In order to cache the kernel mapping after
				 * the user page is unlocked, we call
				 * hat_devload instead of hat_memload so
				 * that the kernel mapping we set up here is
				 * "invisible" to the rest of the world. This
				 * is not very pretty. But as long as the
				 * caller bears the responsibility of keeping
				 * cache consistency, we should be ok -
				 * HAT_LOAD_NOCONSIST will get us an uncached
				 * mapping on VAC. hat_softlock will flush
				 * a VAC_WRITEBACK cache. Therefore the kaddr
				 * doesn't have to be of the same vcolor as
				 * uaddr.
				 * The alternative is - change hat_devload
				 * to get a cached mapping. Allocate a kaddr
				 * with the same vcolor as uaddr. Then
				 * hat_softlock won't need to flush the VAC.
				 */
				hat_devload(kas.a_hat, kaddr, PAGESIZE,
				    page_pptonum(pp), PROT_READ, flags);
				*cached_ppp = pp;
			}
			kaddr += PAGESIZE;
		}
		cached_ppp++;
		app++;
		++i;
	}
	if (cow) {
		AS_LOCK_EXIT(as, &as->a_lock);
	}
	if (first && res == FC_NOMAP) {
		/*
		 * If the address is not mapped yet, we call as_fault to
		 * fault the pages in. We could've fallen back to copy and
		 * let it fault in the pages. But for a mapped file, we
		 * normally reference each page only once. For zero-copy to
		 * be of any use, we'd better fault in the pages now and
		 * try again.
		 */
		first = 0;
		size = size << PAGESHIFT;
		uaddr += size;
		total -= size;
		size = total;
		res = as_fault(as->a_hat, as, uaddr, size, F_INVAL, S_READ);
		if (cow)
			AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
		goto tryagain;
	}
	switch (res) {
	case FC_NOSUPPORT:
		return (ENOTSUP);
	case FC_PROT:	/* Pretend we don't know about it. This will be */
			/* caught by the caller when uiomove fails. */
	case FC_NOMAP:
	case FC_OBJERR:
	default:
		return (0);
	}
}