/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
237c478bd9Sstevel@tonic-gate */ 247c478bd9Sstevel@tonic-gate 257c478bd9Sstevel@tonic-gate /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 267c478bd9Sstevel@tonic-gate /* All Rights Reserved */ 277c478bd9Sstevel@tonic-gate 287c478bd9Sstevel@tonic-gate /* 297c478bd9Sstevel@tonic-gate * University Copyright- Copyright (c) 1982, 1986, 1988 307c478bd9Sstevel@tonic-gate * The Regents of the University of California 317c478bd9Sstevel@tonic-gate * All Rights Reserved 327c478bd9Sstevel@tonic-gate * 337c478bd9Sstevel@tonic-gate * University Acknowledgment- Portions of this document are derived from 347c478bd9Sstevel@tonic-gate * software developed by the University of California, Berkeley, and its 357c478bd9Sstevel@tonic-gate * contributors. 367c478bd9Sstevel@tonic-gate */ 377c478bd9Sstevel@tonic-gate 387c478bd9Sstevel@tonic-gate #include <sys/types.h> 397c478bd9Sstevel@tonic-gate #include <sys/t_lock.h> 407c478bd9Sstevel@tonic-gate #include <sys/param.h> 417c478bd9Sstevel@tonic-gate #include <sys/errno.h> 427c478bd9Sstevel@tonic-gate #include <sys/debug.h> 437c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h> 447c478bd9Sstevel@tonic-gate #include <sys/kmem.h> 457c478bd9Sstevel@tonic-gate #include <sys/sysmacros.h> 467c478bd9Sstevel@tonic-gate #include <sys/inline.h> 477c478bd9Sstevel@tonic-gate #include <sys/buf.h> 487c478bd9Sstevel@tonic-gate #include <sys/uio.h> 497c478bd9Sstevel@tonic-gate #include <sys/user.h> 507c478bd9Sstevel@tonic-gate #include <sys/proc.h> 517c478bd9Sstevel@tonic-gate #include <sys/systm.h> 527c478bd9Sstevel@tonic-gate #include <sys/vmsystm.h> 537c478bd9Sstevel@tonic-gate #include <sys/cpuvar.h> 547c478bd9Sstevel@tonic-gate #include <sys/mman.h> 557c478bd9Sstevel@tonic-gate #include <sys/cred.h> 567c478bd9Sstevel@tonic-gate #include <sys/vnode.h> 577c478bd9Sstevel@tonic-gate #include <sys/file.h> 587c478bd9Sstevel@tonic-gate #include <sys/vm.h> 597c478bd9Sstevel@tonic-gate 607c478bd9Sstevel@tonic-gate #include <sys/swap.h> 
617c478bd9Sstevel@tonic-gate #include <sys/vtrace.h> 627c478bd9Sstevel@tonic-gate #include <sys/tnf_probe.h> 637c478bd9Sstevel@tonic-gate #include <sys/fs/snode.h> 647c478bd9Sstevel@tonic-gate #include <sys/copyops.h> 657c478bd9Sstevel@tonic-gate #include <sys/conf.h> 667c478bd9Sstevel@tonic-gate #include <sys/sdt.h> 677c478bd9Sstevel@tonic-gate 687c478bd9Sstevel@tonic-gate #include <vm/anon.h> 697c478bd9Sstevel@tonic-gate #include <vm/hat.h> 707c478bd9Sstevel@tonic-gate #include <vm/as.h> 717c478bd9Sstevel@tonic-gate #include <vm/seg.h> 727c478bd9Sstevel@tonic-gate #include <vm/page.h> 737c478bd9Sstevel@tonic-gate #include <vm/seg_vn.h> 747c478bd9Sstevel@tonic-gate #include <vm/seg_kmem.h> 757c478bd9Sstevel@tonic-gate 767c478bd9Sstevel@tonic-gate extern int maxphys; 777c478bd9Sstevel@tonic-gate 787c478bd9Sstevel@tonic-gate void 797c478bd9Sstevel@tonic-gate minphys(struct buf *bp) 807c478bd9Sstevel@tonic-gate { 817c478bd9Sstevel@tonic-gate if (bp->b_bcount > maxphys) 827c478bd9Sstevel@tonic-gate bp->b_bcount = maxphys; 837c478bd9Sstevel@tonic-gate } 847c478bd9Sstevel@tonic-gate 857c478bd9Sstevel@tonic-gate /* 867c478bd9Sstevel@tonic-gate * use kmem_cache_create for physio buffers. This has shown 877c478bd9Sstevel@tonic-gate * a better cache distribution compared to buffers on the 887c478bd9Sstevel@tonic-gate * stack. 
It also avoids semaphore construction/deconstruction 897c478bd9Sstevel@tonic-gate * per request 907c478bd9Sstevel@tonic-gate */ 917c478bd9Sstevel@tonic-gate 927c478bd9Sstevel@tonic-gate static struct kmem_cache *physio_buf_cache; 937c478bd9Sstevel@tonic-gate 947c478bd9Sstevel@tonic-gate /* ARGSUSED */ 957c478bd9Sstevel@tonic-gate static int 967c478bd9Sstevel@tonic-gate physio_buf_constructor(void *buf, void *cdrarg, int kmflags) 977c478bd9Sstevel@tonic-gate { 987c478bd9Sstevel@tonic-gate bioinit((struct buf *)buf); 997c478bd9Sstevel@tonic-gate return (0); 1007c478bd9Sstevel@tonic-gate } 1017c478bd9Sstevel@tonic-gate 1027c478bd9Sstevel@tonic-gate /* ARGSUSED */ 1037c478bd9Sstevel@tonic-gate static void 1047c478bd9Sstevel@tonic-gate physio_buf_destructor(void *buf, void *cdrarg) 1057c478bd9Sstevel@tonic-gate { 1067c478bd9Sstevel@tonic-gate biofini((struct buf *)buf); 1077c478bd9Sstevel@tonic-gate } 1087c478bd9Sstevel@tonic-gate 1097c478bd9Sstevel@tonic-gate void 1107c478bd9Sstevel@tonic-gate physio_bufs_init(void) 1117c478bd9Sstevel@tonic-gate { 1127c478bd9Sstevel@tonic-gate physio_buf_cache = kmem_cache_create("physio_buf_cache", 11323d9e5acSMichael Corcoran sizeof (struct buf), 0, physio_buf_constructor, 11423d9e5acSMichael Corcoran physio_buf_destructor, NULL, NULL, NULL, 0); 1157c478bd9Sstevel@tonic-gate } 1167c478bd9Sstevel@tonic-gate 1177c478bd9Sstevel@tonic-gate 1187c478bd9Sstevel@tonic-gate 1197c478bd9Sstevel@tonic-gate /* 1207c478bd9Sstevel@tonic-gate * initiate raw I/O request 1217c478bd9Sstevel@tonic-gate * 1227c478bd9Sstevel@tonic-gate * allocate buf header if necessary 1237c478bd9Sstevel@tonic-gate * adjust max size of each I/O request 1247c478bd9Sstevel@tonic-gate * lock down user pages and verify access protections 1257c478bd9Sstevel@tonic-gate * call driver's strategy routine to submit request 1267c478bd9Sstevel@tonic-gate * wait for I/O completion 1277c478bd9Sstevel@tonic-gate * unlock user pages and free allocated buf header 
 */

int
default_physio(int (*strat)(struct buf *), struct buf *bp, dev_t dev,
    int rw, void (*mincnt)(struct buf *), struct uio *uio)
{
	struct iovec *iov;
	struct proc *procp;
	struct as *asp;
	ssize_t c;
	char *a;
	int error = 0;
	page_t **pplist;
	int allocbuf = 0;	/* nonzero if we allocated bp ourselves */

	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_START, "physio_start: bp %p", bp);

	/* Kernel probe */
	TNF_PROBE_4(physio_start, "io rawio", /* CSTYLED */,
	    tnf_device, device, dev,
	    tnf_offset, offset, uio->uio_loffset,
	    tnf_size, size, uio->uio_resid,
	    tnf_bioflags, rw, rw);

	if (rw == B_READ) {
		CPU_STATS_ADD_K(sys, phread, 1);
	} else {
		CPU_STATS_ADD_K(sys, phwrite, 1);
	}

	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_GETBUF_START,
	    "getbuf_start: bp %p", bp);

	/*
	 * Caller may pass bp == NULL; then a buf header is taken from
	 * (and later returned to) the physio buf cache.
	 */
	if (bp == NULL) {
		bp = kmem_cache_alloc(physio_buf_cache, KM_SLEEP);
		bp->b_iodone = NULL;
		bp->b_resid = 0;
		allocbuf = 1;
	}
	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_GETBUF_END, "getbuf_end: bp %p", bp);

	/* Pages are locked in the caller's address space, or in kas. */
	if (uio->uio_segflg == UIO_USERSPACE) {
		procp = ttoproc(curthread);
		asp = procp->p_as;
	} else {
		procp = NULL;
		asp = &kas;
	}
	ASSERT(SEMA_HELD(&bp->b_sem));

	/*
	 * We need to prepare this buffer for the io:::start probe, including
	 * NULL'ing out the file, clearing the offset, and filling in the
	 * b_dip field.
	 */
	bp->b_file = NULL;
	bp->b_offset = -1;

	if (dev != NODEV) {
		(void) devopsp[getmajor(dev)]->devo_getinfo(NULL,
		    DDI_INFO_DEVT2DEVINFO, (void *)dev, (void **)&bp->b_dip);
	} else {
		bp->b_dip = NULL;
	}

	/*
	 * Outer loop walks the iovecs; inner loop breaks each iovec
	 * into chunks no larger than the driver's mincnt limit, locking
	 * the pages, issuing the strategy call, and waiting for each
	 * chunk in turn.
	 */
	while (uio->uio_iovcnt > 0) {
		iov = uio->uio_iov;

		bp->b_error = 0;
		bp->b_proc = procp;

		while (iov->iov_len > 0) {
			if (uio->uio_resid == 0)
				break;
			if (uio->uio_loffset < 0) {
				error = EINVAL;
				break;
			}
#ifdef	_ILP32
			/*
			 * For 32-bit kernels, check against SPEC_MAXOFFSET_T
			 * which represents the maximum size that can be
			 * supported by the IO subsystem.
			 * XXX this code assumes a D_64BIT driver.
			 */
			if (uio->uio_loffset > SPEC_MAXOFFSET_T) {
				error = EINVAL;
				break;
			}
#endif	/* _ILP32 */
			bp->b_flags = B_BUSY | B_PHYS | rw;
			bp->b_edev = dev;
			bp->b_lblkno = btodt(uio->uio_loffset);

			/*
			 * Don't count on b_addr remaining untouched by the
			 * code below (it may be reset because someone does
			 * a bp_mapin on the buffer) -- reset from the iov
			 * each time through, updating the iov's base address
			 * instead.
			 */
			a = bp->b_un.b_addr = iov->iov_base;
			bp->b_bcount = MIN(iov->iov_len, uio->uio_resid);
			(*mincnt)(bp);
			c = bp->b_bcount;

			TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_LOCK_START,
			    "as_pagelock_start: bp %p", bp);

			/*
			 * A device read writes user memory, hence S_WRITE
			 * access (and vice versa for a write).
			 */
			error = as_pagelock(asp, &pplist, a,
			    c, rw == B_READ? S_WRITE : S_READ);

			TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_LOCK_END,
			    "as_pagelock_end:");

			if (error != 0) {
				bp->b_flags |= B_ERROR;
				bp->b_error = error;
				bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS);
				break;
			}
			bp->b_shadow = pplist;
			if (pplist != NULL) {
				bp->b_flags |= B_SHADOW;
			}

			DTRACE_IO1(start, struct buf *, bp);
			bp->b_flags |= B_STARTED;

			(void) (*strat)(bp);
			error = biowait(bp);

			/*
			 * unlock the pages
			 */
			TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_UNLOCK_START,
			    "as_pageunlock_start: bp %p", bp);

			as_pageunlock(asp, pplist, a, c,
			    rw == B_READ? S_WRITE : S_READ);

			TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_UNLOCK_END,
			    "as_pageunlock_end:");

			/* Advance by the number of bytes actually moved. */
			c -= bp->b_resid;
			iov->iov_base += c;
			iov->iov_len -= c;
			uio->uio_resid -= c;
			uio->uio_loffset += c;
			/* bp->b_resid - temp kludge for tape drives */
			if (bp->b_resid || error)
				break;
		}
		bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
		/* bp->b_resid - temp kludge for tape drives */
		if (bp->b_resid || error)
			break;
		uio->uio_iov++;
		uio->uio_iovcnt--;
	}

	if (allocbuf) {
		kmem_cache_free(physio_buf_cache, bp);
	}

	/* Kernel probe */
	TNF_PROBE_1(physio_end, "io rawio", /* CSTYLED */,
	    tnf_device, device, dev);

	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_END, "physio_end: bp %p", bp);

	return (error);
}

/*
 * Returns 0 on success, or an error on failure.
 *
 * This function is no longer a part of the DDI/DKI.
3067c478bd9Sstevel@tonic-gate * However, for compatibility, its interface should not 3077c478bd9Sstevel@tonic-gate * be changed and it should not be removed from the kernel. 3087c478bd9Sstevel@tonic-gate */ 3097c478bd9Sstevel@tonic-gate int 3107c478bd9Sstevel@tonic-gate useracc(void *addr, size_t count, int access) 3117c478bd9Sstevel@tonic-gate { 3127c478bd9Sstevel@tonic-gate uint_t prot; 3137c478bd9Sstevel@tonic-gate 3147c478bd9Sstevel@tonic-gate prot = PROT_USER | ((access == B_READ) ? PROT_READ : PROT_WRITE); 3157c478bd9Sstevel@tonic-gate return (as_checkprot(ttoproc(curthread)->p_as, addr, count, prot)); 3167c478bd9Sstevel@tonic-gate } 3177c478bd9Sstevel@tonic-gate 3187c478bd9Sstevel@tonic-gate #define MAX_MAPIN_PAGES 8 3197c478bd9Sstevel@tonic-gate 3207c478bd9Sstevel@tonic-gate /* 3217c478bd9Sstevel@tonic-gate * This function temporarily "borrows" user pages for kernel use. If 3227c478bd9Sstevel@tonic-gate * "cow" is on, it also sets up copy-on-write protection (only feasible 3237c478bd9Sstevel@tonic-gate * on MAP_PRIVATE segment) on the user mappings, to protect the borrowed 3247c478bd9Sstevel@tonic-gate * pages from any changes by the user. The caller is responsible for 3257c478bd9Sstevel@tonic-gate * unlocking and tearing down cow settings when it's done with the pages. 3267c478bd9Sstevel@tonic-gate * For an example, see kcfree(). 3277c478bd9Sstevel@tonic-gate * 3287c478bd9Sstevel@tonic-gate * Pages behind [uaddr..uaddr+*lenp] under address space "as" are locked 3297c478bd9Sstevel@tonic-gate * (shared), and mapped into kernel address range [kaddr..kaddr+*lenp] if 3307c478bd9Sstevel@tonic-gate * kaddr != -1. On entering this function, cached_ppp contains a list 3317c478bd9Sstevel@tonic-gate * of pages that are mapped into [kaddr..kaddr+*lenp] already (from a 3327c478bd9Sstevel@tonic-gate * previous call). Thus if same pages remain behind [uaddr..uaddr+*lenp], 3337c478bd9Sstevel@tonic-gate * the kernel map won't need to be reloaded again. 
3347c478bd9Sstevel@tonic-gate * 3357c478bd9Sstevel@tonic-gate * For cow == 1, if the pages are anonymous pages, it also bumps the anon 3367c478bd9Sstevel@tonic-gate * reference count, and change the user-mapping to read-only. This 3377c478bd9Sstevel@tonic-gate * scheme should work on all types of segment drivers. But to be safe, 3387c478bd9Sstevel@tonic-gate * we check against segvn here. 3397c478bd9Sstevel@tonic-gate * 3407c478bd9Sstevel@tonic-gate * Since this function is used to emulate copyin() semantic, it checks 3417c478bd9Sstevel@tonic-gate * to make sure the user-mappings allow "user-read". 3427c478bd9Sstevel@tonic-gate * 3437c478bd9Sstevel@tonic-gate * On exit "lenp" contains the number of bytes successfully locked and 3447c478bd9Sstevel@tonic-gate * mapped in. For the unsuccessful ones, the caller can fall back to 3457c478bd9Sstevel@tonic-gate * copyin(). 3467c478bd9Sstevel@tonic-gate * 3477c478bd9Sstevel@tonic-gate * Error return: 3487c478bd9Sstevel@tonic-gate * ENOTSUP - operation like this is not supported either on this segment 3497c478bd9Sstevel@tonic-gate * type, or on this platform type. 
 */
int
cow_mapin(struct as *as, caddr_t uaddr, caddr_t kaddr, struct page **cached_ppp,
    struct anon **app, size_t *lenp, int cow)
{
	struct hat *hat;
	struct seg *seg;
	caddr_t base;
	page_t *pp, *ppp[MAX_MAPIN_PAGES];
	long i;
	int flags;
	size_t size, total = *lenp;
	char first = 1;		/* allow exactly one as_fault retry */
	faultcode_t res;

	*lenp = 0;
	if (cow) {
		AS_LOCK_ENTER(as, RW_WRITER);
		seg = as_findseg(as, uaddr, 0);
		/* The whole range must lie inside a single segment. */
		if ((seg == NULL) || ((base = seg->s_base) > uaddr) ||
		    (uaddr + total) > base + seg->s_size) {
			AS_LOCK_EXIT(as);
			return (EINVAL);
		}
		/*
		 * The COW scheme should work for all segment types.
		 * But to be safe, we check against segvn.
		 */
		if (seg->s_ops != &segvn_ops) {
			AS_LOCK_EXIT(as);
			return (ENOTSUP);
		} else if ((SEGOP_GETTYPE(seg, uaddr) & MAP_PRIVATE) == 0) {
			AS_LOCK_EXIT(as);
			return (ENOTSUP);
		}
	}
	hat = as->a_hat;
	size = total;
tryagain:
	/*
	 * If (cow), hat_softlock will also change the usr protection to RO.
	 * This is the first step toward setting up cow. Before we
	 * bump up an_refcnt, we can't allow any cow-fault on this
	 * address. Otherwise segvn_fault will change the protection back
	 * to RW upon seeing an_refcnt == 1.
	 * The solution is to hold the writer lock on "as".
	 */
	res = hat_softlock(hat, uaddr, &size, &ppp[0], cow ? HAT_COW : 0);
	/* hat_softlock returns the bytes NOT locked; convert to locked. */
	size = total - size;
	*lenp += size;
	size = size >> PAGESHIFT;
	i = 0;
	while (i < size) {
		pp = ppp[i];
		if (cow) {
			kmutex_t *ahm;
			/*
			 * Another solution is to hold SE_EXCL on pp, and
			 * disable PROT_WRITE. This also works for MAP_SHARED
			 * segment. The disadvantage is that it locks the
			 * page from being used by anybody else.
			 */
			ahm = AH_MUTEX(pp->p_vnode, pp->p_offset);
			mutex_enter(ahm);
			*app = swap_anon(pp->p_vnode, pp->p_offset);
			/*
			 * Since we are holding the as lock, this avoids a
			 * potential race with anon_decref. (segvn_unmap and
			 * segvn_free needs the as writer lock to do anon_free.)
			 */
			if (*app != NULL) {
#if 0
				if ((*app)->an_refcnt == 0)
					/*
					 * Consider the following scenario
					 * (unlikely though):
					 * 1. an_refcnt == 2
					 * 2. we softlock the page.
					 * 3. cow occurs on this addr. So a new
					 * ap, page and mapping is established
					 * on addr.
					 * 4. an_refcnt drops to 1
					 * (segvn_faultpage ->
					 * anon_decref(oldap))
					 * 5. the last ref to ap also drops
					 * (from another as). It ends up
					 * blocked inside anon_decref trying to
					 * get page's excl lock.
					 * 6. Later kcfree unlocks the page,
					 * call anon_decref -> oops, ap is gone
					 * already.
					 *
					 * Holding as writer lock solves all
					 * problems.
					 */
					*app = NULL;
				else
#endif
					(*app)->an_refcnt++;
			}
			mutex_exit(ahm);
		} else {
			*app = NULL;
		}
		if (kaddr != (caddr_t)-1) {
			if (pp != *cached_ppp) {
				if (*cached_ppp == NULL)
					flags = HAT_LOAD_LOCK | HAT_NOSYNC |
					    HAT_LOAD_NOCONSIST;
				else
					flags = HAT_LOAD_REMAP |
					    HAT_LOAD_NOCONSIST;
				/*
				 * In order to cache the kernel mapping after
				 * the user page is unlocked, we call
				 * hat_devload instead of hat_memload so
				 * that the kernel mapping we set up here is
				 * "invisible" to the rest of the world. This
				 * is not very pretty. But as long as the
				 * caller bears the responsibility of keeping
				 * cache consistency, we should be ok -
				 * HAT_NOCONSIST will get us a uncached
				 * mapping on VAC. hat_softlock will flush
				 * a VAC_WRITEBACK cache. Therefore the kaddr
				 * doesn't have to be of the same vcolor as
				 * uaddr.
				 * The alternative is - change hat_devload
				 * to get a cached mapping. Allocate a kaddr
				 * with the same vcolor as uaddr. Then
				 * hat_softlock won't need to flush the VAC.
				 */
				hat_devload(kas.a_hat, kaddr, PAGESIZE,
				    page_pptonum(pp), PROT_READ, flags);
				*cached_ppp = pp;
			}
			kaddr += PAGESIZE;
		}
		cached_ppp++;
		app++;
		++i;
	}
	if (cow) {
		AS_LOCK_EXIT(as);
	}
	if (first && res == FC_NOMAP) {
		/*
		 * If the address is not mapped yet, we call as_fault to
		 * fault the pages in. We could've fallen back to copy and
		 * let it fault in the pages. But for a mapped file, we
		 * normally reference each page only once. For zero-copy to
		 * be of any use, we'd better fault in the page now and try
		 * again.
		 */
		first = 0;
		size = size << PAGESHIFT;
		uaddr += size;
		total -= size;
		size = total;
		res = as_fault(as->a_hat, as, uaddr, size, F_INVAL, S_READ);
		if (cow)
			AS_LOCK_ENTER(as, RW_WRITER);
		goto tryagain;
	}
	switch (res) {
	case FC_NOSUPPORT:
		return (ENOTSUP);
	case FC_PROT:	/* Pretend we don't know about it. This will be */
			/* caught by the caller when uiomove fails. */
	case FC_NOMAP:
	case FC_OBJERR:
	default:
		return (0);
	}
}