/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2017 by Delphix. All rights reserved.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/thread.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bitmap.h>
#include <sys/buf.h>
#include <sys/cmn_err.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/debug.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/mman.h>
#include <sys/vmsystm.h>
#include <sys/open.h>
#include <sys/swap.h>
#include <sys/sysmacros.h>
#include <sys/uio.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/poll.h>
#include <sys/zmod.h>
#include <sys/fs/decomp.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg_vn.h>
#include <vm/seg_kmem.h>
#include <vm/seg_map.h>

#include <fs/fs_subr.h>

/*
 * dcfs - A filesystem for automatic decompression of fiocompressed files
 *
 * This filesystem is a layered filesystem that sits on top of a normal
 * persistent filesystem and provides automatic decompression of files
 * that have been previously compressed and stored on the host file system.
 * This is a pseudo filesystem in that it does not persist data; rather it
 * intercepts file lookup requests on the host filesystem and provides
 * transparent decompression of those files. Currently the only supported
 * host filesystem is ufs.
 *
 * A file is compressed via a userland utility (currently cmd/boot/fiocompress)
 * and marked by fiocompress as a compressed file via a flag in the on-disk
 * inode (set via the _FIO_COMPRESSED ufs ioctl() - see ufs_ioctl() in
 * ufs_vnops.c). ufs_lookup checks for this flag and, if it is set, passes
 * control to decompvp, a function defined in this (dcfs) filesystem.
 * decompvp uncompresses the file and returns a dcfs vnode to the VFS layer.
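 *
 * On disk, a fiocompressed file begins with a struct comphdr (see
 * sys/fs/decomp.h) carrying a magic number, a version, the compression
 * algorithm (currently only zlib), the uncompressed file size, the
 * compression block size and a block map. The block map records the byte
 * offset of each compressed block within the file, so block i of the
 * uncompressed image is stored at [ch_blkmap[i], ch_blkmap[i+1]) of the
 * compressed image; decompvp() below validates this header and
 * dc_getblock_miss() uses the map to read and inflate one block at a time.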
 *
 * dcfs is layered on top of ufs and passes requests involving persistence
 * to the underlying ufs filesystem. The compressed files currently cannot
 * be written to.
 */


/*
 * Define data structures within this file.
 */
#define	DCSHFT		5
#define	DCTABLESIZE	16

#if ((DCTABLESIZE & (DCTABLESIZE - 1)) == 0)
#define	DCHASH(vp)	(((uintptr_t)(vp) >> DCSHFT) & (DCTABLESIZE - 1))
#else
#define	DCHASH(vp)	(((uintptr_t)(vp) >> DCSHFT) % DCTABLESIZE)
#endif
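
/*
 * Hash example: with the power-of-two DCTABLESIZE of 16, a subordinate
 * vnode at, say, address 0xd2d3e340 hashes to bucket
 * ((0xd2d3e340 >> 5) & 15) == 0xa. The DCSHFT shift discards the
 * low-order address bits, which carry little information because of
 * kmem allocator alignment.
 */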

#define	DCLRUSIZE	16

#define	DCCACHESIZE	4

#define	rounddown(x, y)	((x) & ~((y) - 1))

struct dcnode	*dctable[DCTABLESIZE];

struct dcnode	*dclru;
static int	dclru_len;

kmutex_t	dctable_lock;

dev_t		dcdev;
struct vfs	dc_vfs;

struct kmem_cache	*dcnode_cache;
struct kmem_cache	*dcbuf_cache[DCCACHESIZE];

kmutex_t	dccache_lock;

static int dcinit(int, char *);

static struct dcnode	*dcnode_alloc(void);
static void		dcnode_free(struct dcnode *);
static void		dcnode_recycle(struct dcnode *);

static void		dcinsert(struct dcnode *);
static void		dcdelete(struct dcnode *);
static struct dcnode	*dcfind(struct vnode *);
static void		dclru_add(struct dcnode *);
static void		dclru_sub(struct dcnode *);


/*
 * This is the loadable module wrapper.
 */
#include <sys/modctl.h>

struct vfsops *dc_vfsops;

static vfsdef_t vfw = {
	VFSDEF_VERSION,
	"dcfs",
	dcinit,
	VSW_ZMOUNT,
	NULL
};

/*
 * Module linkage information for the kernel.
 */
extern struct mod_ops mod_fsops;

static struct modlfs modlfs = {
	&mod_fsops, "compressed filesystem", &vfw
};

static struct modlinkage modlinkage = {
	MODREV_1, (void *)&modlfs, NULL
};

int
_init()
{
	return (mod_install(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}


static int dc_open(struct vnode **, int, struct cred *, caller_context_t *);
static int dc_close(struct vnode *, int, int, offset_t,
    struct cred *, caller_context_t *);
static int dc_read(struct vnode *, struct uio *, int, struct cred *,
    struct caller_context *);
static int dc_getattr(struct vnode *, struct vattr *, int,
    struct cred *, caller_context_t *);
static int dc_setattr(struct vnode *, struct vattr *, int, struct cred *,
    struct caller_context *);
static int dc_access(struct vnode *, int, int,
    struct cred *, caller_context_t *);
static int dc_fsync(struct vnode *, int, struct cred *, caller_context_t *);
static void dc_inactive(struct vnode *, struct cred *, caller_context_t *);
static int dc_fid(struct vnode *, struct fid *, caller_context_t *);
static int dc_seek(struct vnode *, offset_t, offset_t *, caller_context_t *);
static int dc_frlock(struct vnode *, int, struct flock64 *, int, offset_t,
    struct flk_callback *, struct cred *, caller_context_t *);
static int dc_realvp(struct vnode *, struct vnode **, caller_context_t *);
static int dc_getpage(struct vnode *, offset_t, size_t, uint_t *,
    struct page **, size_t, struct seg *, caddr_t, enum seg_rw,
    struct cred *, caller_context_t *);
static int dc_putpage(struct vnode *, offset_t, size_t, int,
    struct cred *, caller_context_t *);
static int dc_map(struct vnode *, offset_t, struct as *, caddr_t *, size_t,
    uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
static int dc_addmap(struct vnode *, offset_t, struct as *, caddr_t, size_t,
    uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
static int dc_delmap(struct vnode *, offset_t, struct as *, caddr_t, size_t,
    uint_t, uint_t, uint_t, struct cred *, caller_context_t *);

struct vnodeops *dc_vnodeops;

const fs_operation_def_t dc_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = dc_open },
	VOPNAME_CLOSE,		{ .vop_close = dc_close },
	VOPNAME_READ,		{ .vop_read = dc_read },
	VOPNAME_GETATTR,	{ .vop_getattr = dc_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = dc_setattr },
	VOPNAME_ACCESS,		{ .vop_access = dc_access },
	VOPNAME_FSYNC,		{ .vop_fsync = dc_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = dc_inactive },
	VOPNAME_FID,		{ .vop_fid = dc_fid },
	VOPNAME_SEEK,		{ .vop_seek = dc_seek },
	VOPNAME_FRLOCK,		{ .vop_frlock = dc_frlock },
	VOPNAME_REALVP,		{ .vop_realvp = dc_realvp },
	VOPNAME_GETPAGE,	{ .vop_getpage = dc_getpage },
	VOPNAME_PUTPAGE,	{ .vop_putpage = dc_putpage },
	VOPNAME_MAP,		{ .vop_map = dc_map },
	VOPNAME_ADDMAP,		{ .vop_addmap = dc_addmap },
	VOPNAME_DELMAP,		{ .vop_delmap = dc_delmap },
	NULL,			NULL
};

/*ARGSUSED*/
static int
dc_open(struct vnode **vpp, int flag, struct cred *cr, caller_context_t *ctp)
{
	return (0);
}

/*ARGSUSED*/
static int
dc_close(struct vnode *vp, int flag, int count, offset_t off,
    struct cred *cr, caller_context_t *ctp)
{
	(void) cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
	cleanshares(vp, ttoproc(curthread)->p_pid);
	return (0);
}

/*ARGSUSED*/
static int
dc_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr,
    struct caller_context *ct)
{
	struct dcnode *dp = VTODC(vp);
	size_t rdsize = MAX(MAXBSIZE, dp->dc_hdr->ch_blksize);
	size_t fsize = dp->dc_hdr->ch_fsize;
	int error;

	/*
	 * Loop through file with segmap; decompression will occur
	 * in dc_getpage()
	 */
	do {
		caddr_t base;
		size_t n;
		offset_t mapon;

		/*
		 * read to end of block or file
		 */
		mapon = uiop->uio_loffset & (rdsize - 1);
		n = MIN(rdsize - mapon, uiop->uio_resid);
		n = MIN(n, fsize - uiop->uio_loffset);
		if (n == 0)
			return (0);	/* at EOF */

		base = segmap_getmapflt(segkmap, vp, uiop->uio_loffset, n, 1,
		    S_READ);
		error = uiomove(base + mapon, n, UIO_READ, uiop);
		if (!error) {
			uint_t flags;

			if (n + mapon == rdsize || uiop->uio_loffset == fsize)
				flags = SM_DONTNEED;
			else
				flags = 0;
			error = segmap_release(segkmap, base, flags);
		} else
			(void) segmap_release(segkmap, base, 0);
	} while (!error && uiop->uio_resid);

	return (error);
}
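
/*
 * Note that dc_read() itself never touches compressed data: the
 * segmap_getmapflt() call above faults the pages in through dc_getpage(),
 * which is where decompression actually happens. The fsize limit used in
 * the loop is the *uncompressed* size from the comphdr, matching the size
 * that dc_getattr() reports.
 */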

static int
dc_getattr(struct vnode *vp, struct vattr *vap, int flags,
    cred_t *cred, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct vnode *subvp = dp->dc_subvp;
	int error;

	error = VOP_GETATTR(subvp, vap, flags, cred, ctp);

	/* substitute uncompressed size */
	vap->va_size = dp->dc_hdr->ch_fsize;
	return (error);
}

static int
dc_setattr(struct vnode *vp, struct vattr *vap, int flags, cred_t *cred,
    caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct vnode *subvp = dp->dc_subvp;

	return (VOP_SETATTR(subvp, vap, flags, cred, ctp));
}

static int
dc_access(struct vnode *vp, int mode, int flags,
    cred_t *cred, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct vnode *subvp = dp->dc_subvp;

	return (VOP_ACCESS(subvp, mode, flags, cred, ctp));
}

/*ARGSUSED*/
static int
dc_fsync(vnode_t *vp, int syncflag, cred_t *cred, caller_context_t *ctp)
{
	return (0);
}

/*ARGSUSED*/
static void
dc_inactive(struct vnode *vp, cred_t *cr, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);

	mutex_enter(&dctable_lock);
	mutex_enter(&vp->v_lock);
	ASSERT(vp->v_count >= 1);
	VN_RELE_LOCKED(vp);
	if (vp->v_count != 0) {
		/*
		 * Somebody accessed the dcnode before we got a chance to
		 * remove it. They will remove it when they do a vn_rele.
		 */
		mutex_exit(&vp->v_lock);
		mutex_exit(&dctable_lock);
		return;
	}
	mutex_exit(&vp->v_lock);

	dcnode_free(dp);

	mutex_exit(&dctable_lock);
}

static int
dc_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct vnode *subvp = dp->dc_subvp;

	return (VOP_FID(subvp, fidp, ctp));
}

static int
dc_seek(struct vnode *vp, offset_t oof, offset_t *noffp, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct vnode *subvp = dp->dc_subvp;

	return (VOP_SEEK(subvp, oof, noffp, ctp));
}

static int
dc_frlock(struct vnode *vp, int cmd, struct flock64 *bfp, int flag,
    offset_t offset, struct flk_callback *flk_cbp,
    cred_t *cr, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	int error;
	struct vattr vattr;

	/*
	 * If file is being mapped, disallow frlock.
	 */
	vattr.va_mask = AT_MODE;
	if (error = VOP_GETATTR(dp->dc_subvp, &vattr, 0, cr, ctp))
		return (error);
	if (dp->dc_mapcnt > 0 && MANDLOCK(vp, vattr.va_mode))
		return (EAGAIN);

	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ctp));
}

/*ARGSUSED*/
static int
dc_getblock_miss(struct vnode *vp, offset_t off, size_t len, struct page **ppp,
    struct seg *seg, caddr_t addr, enum seg_rw rw, struct cred *cr)
{
	struct dcnode *dp = VTODC(vp);
	struct comphdr *hdr = dp->dc_hdr;
	struct page *pp;
	struct buf *bp;
	caddr_t saddr;
	off_t cblkno;
	size_t rdoff, rdsize, dsize;
	long xlen;
	int error, zerr;

	ASSERT(len == hdr->ch_blksize);
	/*
	 * Get destination pages and make them addressable
	 */
	pp = page_create_va(vp, off, len, PG_WAIT, seg, addr);
	bp = pageio_setup(pp, len, vp, B_READ);
	bp_mapin(bp);

	/*
	 * read compressed data from subordinate vnode
	 */
	saddr = kmem_cache_alloc(dp->dc_bufcache, KM_SLEEP);
	cblkno = off / len;
	rdoff = hdr->ch_blkmap[cblkno];
	rdsize = hdr->ch_blkmap[cblkno + 1] - rdoff;
	error = vn_rdwr(UIO_READ, dp->dc_subvp, saddr, rdsize, rdoff,
	    UIO_SYSSPACE, 0, 0, cr, NULL);
	if (error)
		goto cleanup;

	/*
	 * Uncompress
	 */
	dsize = len;
	zerr = z_uncompress(bp->b_un.b_addr, &dsize, saddr, dp->dc_zmax);
	if (zerr != Z_OK) {
		error = EIO;
		goto cleanup;
	}

	/*
	 * Handle EOF
	 */
	xlen = hdr->ch_fsize - off;
	if (xlen < len) {
		bzero(bp->b_un.b_addr + xlen, len - xlen);
		if (dsize != xlen)
			error = EIO;
	} else if (dsize != len)
		error = EIO;

	/*
	 * Clean up
	 */
cleanup:
	kmem_cache_free(dp->dc_bufcache, saddr);
	pageio_done(bp);
	*ppp = pp;
	return (error);
}
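
/*
 * A worked example of the block map arithmetic above, as a sketch: with a
 * 16K block size, a miss at offset 48K is compressed block 3 (48K / 16K),
 * whose bytes live at [ch_blkmap[3], ch_blkmap[4]) in the subordinate
 * file. The sentinel entry that decompvp() appends to the map (set to the
 * compressed file size) makes the "cblkno + 1" lookup valid for the last
 * block as well.
 */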

static int
dc_getblock(struct vnode *vp, offset_t off, size_t len, struct page **ppp,
    struct seg *seg, caddr_t addr, enum seg_rw rw, struct cred *cr)
{
	struct page *pp, *plist = NULL;
	offset_t pgoff;
	int rdblk;

	/*
	 * pvn_read_kluster() doesn't quite do what we want, since it
	 * thinks sub block reads are ok. Here we always decompress
	 * a full block.
	 */

	/*
	 * Check page cache
	 */
	rdblk = 0;
	for (pgoff = off; pgoff < off + len; pgoff += PAGESIZE) {
		pp = page_lookup(vp, pgoff, SE_EXCL);
		if (pp == NULL) {
			rdblk = 1;
			break;
		}
		page_io_lock(pp);
		page_add(&plist, pp);
		plist = plist->p_next;
	}
	if (!rdblk) {
		*ppp = plist;
		return (0);	/* all pages in cache */
	}

	/*
	 * Undo any locks so getblock_miss has an open field
	 */
	if (plist != NULL)
		pvn_io_done(plist);

	return (dc_getblock_miss(vp, off, len, ppp, seg, addr, rw, cr));
}

static int
dc_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
{
	struct vnode *rvp;

	vp = VTODC(vp)->dc_subvp;
	if (VOP_REALVP(vp, &rvp, ct) == 0)
		vp = rvp;
	*vpp = vp;
	return (0);
}

/*ARGSUSED10*/
static int
dc_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp,
    struct page *pl[], size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, struct cred *cr, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct comphdr *hdr = dp->dc_hdr;
	struct page *pp, *plist = NULL;
	caddr_t vp_baddr;
	offset_t vp_boff, vp_bend;
	size_t bsize = hdr->ch_blksize;
	int nblks, error;

	/* does not support write */
	if (rw == S_WRITE) {
		panic("write attempt on compressed file");
		/*NOTREACHED*/
	}

	if (protp)
		*protp = PROT_ALL;
	/*
	 * We don't support asynchronous operation at the moment, so
	 * just pretend we did it. If the pages are ever actually
	 * needed, they'll get brought in then.
	 */
	if (pl == NULL)
		return (0);

	/*
	 * Calc block start and end offsets
	 */
	vp_boff = rounddown(off, bsize);
	vp_bend = roundup(off + len, bsize);
	vp_baddr = (caddr_t)rounddown((uintptr_t)addr, bsize);

	nblks = (vp_bend - vp_boff) / bsize;
	while (nblks--) {
		error = dc_getblock(vp, vp_boff, bsize, &pp, seg, vp_baddr,
		    rw, cr);
		page_list_concat(&plist, &pp);
		if (error)
			break;	/* don't let a later block mask the failure */
		vp_boff += bsize;
		vp_baddr += bsize;
	}
	if (!error)
		pvn_plist_init(plist, pl, plsz, off, len, rw);
	else
		pvn_read_done(plist, B_ERROR);
	return (error);
}
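
/*
 * Example of the rounding above: with a 16K block size, a one-page (say
 * 4K) fault at offset 20K expands to vp_boff 16K and vp_bend 32K, i.e.
 * one full compressed block, even though the fault wanted a single page.
 */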

/*
 * This function should never be called. We need to have it to pass
 * it as an argument to other functions.
 */
/*ARGSUSED*/
static int
dc_putapage(struct vnode *vp, struct page *pp, u_offset_t *offp, size_t *lenp,
    int flags, struct cred *cr)
{
	/* should never happen */
	cmn_err(CE_PANIC, "dcfs: dc_putapage: dirty page");
	/*NOTREACHED*/
	return (0);
}


/*
 * The only flags we support are B_INVAL, B_FREE and B_DONTNEED.
 * B_INVAL is set by:
 *
 * 1) the MC_SYNC command of memcntl(2) to support the MS_INVALIDATE flag.
 * 2) the MC_ADVISE command of memcntl(2) with the MADV_DONTNEED advice
 *    which translates to an MC_SYNC with the MS_INVALIDATE flag.
 *
 * The B_FREE (as well as the B_DONTNEED) flag is set when the
 * MADV_SEQUENTIAL advice has been used. VOP_PUTPAGE is invoked
 * from SEGVN to release pages behind a pagefault.
 */
/*ARGSUSED5*/
static int
dc_putpage(struct vnode *vp, offset_t off, size_t len, int flags,
    struct cred *cr, caller_context_t *ctp)
{
	int error = 0;

	if (vp->v_count == 0) {
		panic("dcfs_putpage: bad v_count");
		/*NOTREACHED*/
	}

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	if (!vn_has_cached_data(vp))	/* no pages mapped */
		return (0);

	if (len == 0)	/* from 'off' to EOF */
		error = pvn_vplist_dirty(vp, off, dc_putapage, flags, cr);
	else {
		offset_t io_off;
		se_t se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;

		for (io_off = off; io_off < off + len; io_off += PAGESIZE) {
			page_t *pp;

			/*
			 * We insist on getting the page only if we are
			 * about to invalidate, free or write it and
			 * the B_ASYNC flag is not set.
			 */
			if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0))
				pp = page_lookup(vp, io_off, se);
			else
				pp = page_lookup_nowait(vp, io_off, se);

			if (pp == NULL)
				continue;
			/*
			 * Normally pvn_getdirty() should return 0, which
			 * implies that it has done the job for us.
			 * The shouldn't-happen scenario is when it returns 1.
			 * This means that the page has been modified and
			 * needs to be put back.
			 * Since we can't write to a dcfs compressed file,
			 * we fake a failed I/O and force pvn_write_done()
			 * to destroy the page.
			 */
			if (pvn_getdirty(pp, flags) == 1) {
				cmn_err(CE_NOTE, "dc_putpage: dirty page");
				pvn_write_done(pp, flags |
				    B_ERROR | B_WRITE | B_INVAL | B_FORCE);
			}
		}
	}
	return (error);
}
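
/*
 * Mapping support. dc_map() sets up an ordinary segvn mapping over the
 * dcfs vnode; dc_addmap()/dc_delmap() only maintain dc_mapcnt, which
 * dc_frlock() consults so that mandatory locking and mmap() remain
 * mutually exclusive.
 */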

static int
dc_map(struct vnode *vp, offset_t off, struct as *as, caddr_t *addrp,
    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
    struct cred *cred, caller_context_t *ctp)
{
	struct vattr vattr;
	struct segvn_crargs vn_a;
	int error;

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	if (off < (offset_t)0 || (offset_t)(off + len) < (offset_t)0)
		return (ENXIO);

	/*
	 * If file is being locked, disallow mapping.
	 */
	if (error = VOP_GETATTR(VTODC(vp)->dc_subvp, &vattr, 0, cred, ctp))
		return (error);
	if (vn_has_mandatory_locks(vp, vattr.va_mode))
		return (EAGAIN);

	as_rangelock(as);

	if ((flags & MAP_FIXED) == 0) {
		map_addr(addrp, len, off, 1, flags);
		if (*addrp == NULL) {
			as_rangeunlock(as);
			return (ENOMEM);
		}
	} else {
		/*
		 * User specified address - blow away any previous mappings
		 */
		(void) as_unmap(as, *addrp, len);
	}

	vn_a.vp = vp;
	vn_a.offset = off;
	vn_a.type = flags & MAP_TYPE;
	vn_a.prot = prot;
	vn_a.maxprot = maxprot;
	vn_a.flags = flags & ~MAP_TYPE;
	vn_a.cred = cred;
	vn_a.amp = NULL;
	vn_a.szc = 0;
	vn_a.lgrp_mem_policy_flags = 0;

	error = as_map(as, *addrp, len, segvn_create, &vn_a);
	as_rangeunlock(as);
	return (error);
}

/*ARGSUSED*/
static int
dc_addmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
    struct cred *cr, caller_context_t *ctp)
{
	struct dcnode *dp;

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	dp = VTODC(vp);
	mutex_enter(&dp->dc_lock);
	dp->dc_mapcnt += btopr(len);
	mutex_exit(&dp->dc_lock);
	return (0);
}

/*ARGSUSED*/
static int
dc_delmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
    size_t len, uint_t prot, uint_t maxprot, uint_t flags,
    struct cred *cr, caller_context_t *ctp)
{
	struct dcnode *dp;

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	dp = VTODC(vp);
	mutex_enter(&dp->dc_lock);
	dp->dc_mapcnt -= btopr(len);
	ASSERT(dp->dc_mapcnt >= 0);
	mutex_exit(&dp->dc_lock);
	return (0);
}

/*
 * Constructor/destructor routines for dcnodes
 */
/*ARGSUSED1*/
static int
dcnode_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct dcnode *dp = buf;
	struct vnode *vp;

	vp = dp->dc_vp = vn_alloc(kmflags);
	if (vp == NULL) {
		return (-1);
	}
	vp->v_data = dp;
	vp->v_type = VREG;
	vp->v_flag = VNOSWAP;
	vp->v_vfsp = &dc_vfs;
	vn_setops(vp, dc_vnodeops);
	vn_exists(vp);

	mutex_init(&dp->dc_lock, NULL, MUTEX_DEFAULT, NULL);
	dp->dc_mapcnt = 0;
	dp->dc_lrunext = dp->dc_lruprev = NULL;
	dp->dc_hdr = NULL;
	dp->dc_subvp = NULL;
	return (0);
}

/*ARGSUSED*/
static void
dcnode_destructor(void *buf, void *cdrarg)
{
	struct dcnode *dp = buf;
	struct vnode *vp = DCTOV(dp);

	mutex_destroy(&dp->dc_lock);

	VERIFY(dp->dc_hdr == NULL);
	VERIFY(dp->dc_subvp == NULL);
	vn_invalid(vp);
	vn_free(vp);
}

static struct dcnode *
dcnode_alloc(void)
{
	struct dcnode *dp;

	/*
	 * If the LRU list is at or above DCLRUSIZE,
	 * recycle one from it rather than allocating a new dcnode
	 */
	mutex_enter(&dctable_lock);
	if (dclru_len < DCLRUSIZE) {
		mutex_exit(&dctable_lock);
		dp = kmem_cache_alloc(dcnode_cache, KM_SLEEP);
	} else {
		ASSERT(dclru != NULL);
		dp = dclru;
		dclru_sub(dp);
		dcdelete(dp);
		mutex_exit(&dctable_lock);
		dcnode_recycle(dp);
	}
	return (dp);
}
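
/*
 * dcnode_free() parks up to DCLRUSIZE inactive dcnodes that still have
 * cached pages on the dclru list rather than destroying them, so a
 * quickly re-opened file (found again via dcfind()) can reclaim its
 * already-decompressed pages instead of inflating them again.
 */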

static void
dcnode_free(struct dcnode *dp)
{
	struct vnode *vp = DCTOV(dp);

	ASSERT(MUTEX_HELD(&dctable_lock));

	/*
	 * If no cached pages, no need to put it on lru
	 */
	if (!vn_has_cached_data(vp)) {
		dcdelete(dp);
		dcnode_recycle(dp);
		kmem_cache_free(dcnode_cache, dp);
		return;
	}

	/*
	 * Add to lru; if it's over the limit, free from head
	 */
	dclru_add(dp);
	if (dclru_len > DCLRUSIZE) {
		dp = dclru;
		dclru_sub(dp);
		dcdelete(dp);
		dcnode_recycle(dp);
		kmem_cache_free(dcnode_cache, dp);
	}
}

static void
dcnode_recycle(struct dcnode *dp)
{
	struct vnode *vp;

	vp = DCTOV(dp);

	VN_RELE(dp->dc_subvp);
	dp->dc_subvp = NULL;
	(void) pvn_vplist_dirty(vp, 0, dc_putapage, B_INVAL, NULL);
	kmem_free(dp->dc_hdr, dp->dc_hdrsize);
	dp->dc_hdr = NULL;
	dp->dc_hdrsize = dp->dc_zmax = 0;
	dp->dc_bufcache = NULL;
	dp->dc_mapcnt = 0;
	vn_reinit(vp);
	vp->v_type = VREG;
	vp->v_flag = VNOSWAP;
	vp->v_vfsp = &dc_vfs;
}

static int
dcinit(int fstype, char *name)
{
	static const fs_operation_def_t dc_vfsops_template[] = {
		NULL, NULL
	};
	int error;
	major_t dev;

	error = vfs_setfsops(fstype, dc_vfsops_template, &dc_vfsops);
	if (error) {
		cmn_err(CE_WARN, "dcinit: bad vfs ops template");
		return (error);
	}
	VFS_INIT(&dc_vfs, dc_vfsops, NULL);
	dc_vfs.vfs_flag = VFS_RDONLY;
	dc_vfs.vfs_fstype = fstype;
	if ((dev = getudev()) == (major_t)-1)
		dev = 0;
	dcdev = makedevice(dev, 0);
	dc_vfs.vfs_dev = dcdev;

	error = vn_make_ops(name, dc_vnodeops_template, &dc_vnodeops);
	if (error != 0) {
		(void) vfs_freevfsops_by_type(fstype);
		cmn_err(CE_WARN, "dcinit: bad vnode ops template");
		return (error);
	}

	mutex_init(&dctable_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&dccache_lock, NULL, MUTEX_DEFAULT, NULL);
	dcnode_cache = kmem_cache_create("dcnode_cache", sizeof (struct dcnode),
	    0, dcnode_constructor, dcnode_destructor, NULL, NULL, NULL, 0);

	return (0);
}
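
/*
 * A sketch of the expected caller of decompvp() (the actual ufs_lookup()
 * code is not reproduced here): on finding an inode with the compressed
 * flag set, the host filesystem does roughly
 *
 *	nvp = decompvp(vp, cr, ct);
 *	if (nvp != NULL) {
 *		VN_RELE(vp);
 *		vp = nvp;	(hand the shadow vnode to the VFS layer)
 *	}
 *
 * A NULL return means the file does not carry a valid comphdr, and the
 * caller falls back to the ordinary uncompressed path.
 */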

/*
 * Return shadow vnode with the given vp as its subordinate
 */
struct vnode *
decompvp(struct vnode *vp, cred_t *cred, caller_context_t *ctp)
{
	struct dcnode *dp, *ndp;
	struct comphdr thdr, *hdr;
	struct kmem_cache **cpp;
	struct vattr vattr;
	size_t hdrsize, bsize;
	int error;

	/*
	 * See if we have an existing shadow
	 * If none, we have to manufacture one
	 */
	mutex_enter(&dctable_lock);
	dp = dcfind(vp);
	mutex_exit(&dctable_lock);
	if (dp != NULL)
		return (DCTOV(dp));

	/*
	 * Make sure it's a valid compressed file
	 */
	hdr = &thdr;
	error = vn_rdwr(UIO_READ, vp, (caddr_t)hdr, sizeof (struct comphdr), 0,
	    UIO_SYSSPACE, 0, 0, cred, NULL);
	if (error || hdr->ch_magic != CH_MAGIC_ZLIB ||
	    hdr->ch_version != CH_VERSION || hdr->ch_algorithm != CH_ALG_ZLIB ||
	    hdr->ch_fsize == 0 || hdr->ch_blksize < PAGESIZE ||
	    hdr->ch_blksize > ptob(DCCACHESIZE) || !ISP2(hdr->ch_blksize))
		return (NULL);

	/* get underlying file size */
	if (VOP_GETATTR(vp, &vattr, 0, cred, ctp) != 0)
		return (NULL);

	/*
	 * Re-read entire header
	 */
	hdrsize = hdr->ch_blkmap[0] + sizeof (uint64_t);
	hdr = kmem_alloc(hdrsize, KM_SLEEP);
	error = vn_rdwr(UIO_READ, vp, (caddr_t)hdr, hdrsize, 0, UIO_SYSSPACE,
	    0, 0, cred, NULL);
	if (error) {
		kmem_free(hdr, hdrsize);
		return (NULL);
	}

	/*
	 * add extra blkmap entry to make dc_getblock()'s
	 * life easier
	 */
	bsize = hdr->ch_blksize;
	hdr->ch_blkmap[((hdr->ch_fsize - 1) / bsize) + 1] = vattr.va_size;

	ndp = dcnode_alloc();
	ndp->dc_subvp = vp;
	VN_HOLD(vp);
	ndp->dc_hdr = hdr;
	ndp->dc_hdrsize = hdrsize;

	/*
	 * Allocate kmem cache if none there already
	 */
	ndp->dc_zmax = ZMAXBUF(bsize);
	cpp = &dcbuf_cache[btop(bsize)];
	mutex_enter(&dccache_lock);
	if (*cpp == NULL)
		*cpp = kmem_cache_create("dcbuf_cache", ndp->dc_zmax, 0, NULL,
		    NULL, NULL, NULL, NULL, 0);
	mutex_exit(&dccache_lock);
	ndp->dc_bufcache = *cpp;

	/*
	 * Recheck table in case someone else created shadow
	 * while we were blocked above.
	 */
	mutex_enter(&dctable_lock);
	dp = dcfind(vp);
	if (dp != NULL) {
		mutex_exit(&dctable_lock);
		dcnode_recycle(ndp);
		kmem_cache_free(dcnode_cache, ndp);
		return (DCTOV(dp));
	}
	dcinsert(ndp);
	mutex_exit(&dctable_lock);

	return (DCTOV(ndp));
}


/*
 * dcnode lookup table
 * These routines maintain a table of dcnodes hashed by their
 * subordinate vnode so that they can be found if they already
 * exist in the vnode cache
 */

/*
 * Put a dcnode in the table.
 */
static void
dcinsert(struct dcnode *newdp)
{
	int idx = DCHASH(newdp->dc_subvp);

	ASSERT(MUTEX_HELD(&dctable_lock));
	newdp->dc_hash = dctable[idx];
	dctable[idx] = newdp;
}

/*
 * Remove a dcnode from the hash table.
 */
static void
dcdelete(struct dcnode *deldp)
{
	int idx = DCHASH(deldp->dc_subvp);
	struct dcnode *dp, *prevdp;

	ASSERT(MUTEX_HELD(&dctable_lock));
	dp = dctable[idx];
	if (dp == deldp)
		dctable[idx] = dp->dc_hash;
	else {
		for (prevdp = dp, dp = dp->dc_hash; dp != NULL;
		    prevdp = dp, dp = dp->dc_hash) {
			if (dp == deldp) {
				prevdp->dc_hash = dp->dc_hash;
				break;
			}
		}
	}
	ASSERT(dp != NULL);
}

/*
 * Find a shadow vnode in the dctable hash list.
 */
static struct dcnode *
dcfind(struct vnode *vp)
{
	struct dcnode *dp;

	ASSERT(MUTEX_HELD(&dctable_lock));
	for (dp = dctable[DCHASH(vp)]; dp != NULL; dp = dp->dc_hash)
		if (dp->dc_subvp == vp) {
			VN_HOLD(DCTOV(dp));
			if (dp->dc_lrunext)
				dclru_sub(dp);
			return (dp);
		}
	return (NULL);
}

#ifdef	DEBUG
static int
dclru_count(void)
{
	struct dcnode *dp;
	int i = 0;

	if (dclru == NULL)
		return (0);
	for (dp = dclru; dp->dc_lrunext != dclru; dp = dp->dc_lrunext)
		i++;
	return (i + 1);
}
#endif

static void
dclru_add(struct dcnode *dp)
{
	/*
	 * Add to dclru as double-link chain
	 */
	ASSERT(MUTEX_HELD(&dctable_lock));
	if (dclru == NULL) {
		dclru = dp;
		dp->dc_lruprev = dp->dc_lrunext = dp;
	} else {
		struct dcnode *last = dclru->dc_lruprev;

		dclru->dc_lruprev = dp;
		last->dc_lrunext = dp;
		dp->dc_lruprev = last;
		dp->dc_lrunext = dclru;
	}
	dclru_len++;
	ASSERT(dclru_len == dclru_count());
}

static void
dclru_sub(struct dcnode *dp)
{
	ASSERT(MUTEX_HELD(&dctable_lock));
	dp->dc_lrunext->dc_lruprev = dp->dc_lruprev;
	dp->dc_lruprev->dc_lrunext = dp->dc_lrunext;
	if (dp == dclru)
		dclru = dp->dc_lrunext == dp ? NULL : dp->dc_lrunext;
	dp->dc_lrunext = dp->dc_lruprev = NULL;
	dclru_len--;
	ASSERT(dclru_len == dclru_count());
}