/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2017 by Delphix. All rights reserved.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/thread.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bitmap.h>
#include <sys/buf.h>
#include <sys/cmn_err.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/debug.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/mman.h>
#include <sys/vmsystm.h>
#include <sys/open.h>
#include <sys/swap.h>
#include <sys/sysmacros.h>
#include <sys/uio.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/poll.h>
#include <sys/zmod.h>
#include <sys/fs/decomp.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg_vn.h>
#include <vm/seg_kmem.h>
#include <vm/seg_map.h>

#include <fs/fs_subr.h>

/*
 * dcfs - A filesystem for automatic decompression of fiocompressed files
 *
 * This filesystem is a layered filesystem that sits on top of a normal
 * persistent filesystem and provides automatic decompression of files
 * that have been previously compressed and stored on the host filesystem.
 * It is a pseudo filesystem in that it does not persist data; rather it
 * intercepts file lookup requests on the host filesystem and provides
 * transparent decompression of those files. Currently the only supported
 * host filesystem is ufs.
 *
 * A file is compressed by a userland utility (currently cmd/boot/fiocompress)
 * and marked as compressed via a flag in the on-disk inode, set through the
 * _FIO_COMPRESSED ufs ioctl() (see ufs_ioctl() in ufs_vnops.c).
 * ufs_lookup() checks for this flag and, if it is set, passes control to
 * decompvp(), a function defined in this (dcfs) filesystem. decompvp()
 * uncompresses the file and returns a dcfs vnode to the VFS layer.
 *
 * dcfs is layered on top of ufs and passes requests involving persistence
 * to the underlying ufs filesystem. Compressed files are read-only and
 * cannot be written to.
 */
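
/*
 * On-disk layout of a fiocompressed file, as this code consumes it. The
 * authoritative struct comphdr definition lives in sys/fs/decomp.h; the
 * field list below is inferred from its use in this file and is a sketch,
 * not the definitive layout:
 *
 *	ch_magic	CH_MAGIC_ZLIB
 *	ch_version	CH_VERSION
 *	ch_algorithm	CH_ALG_ZLIB
 *	ch_fsize	uncompressed file size
 *	ch_blksize	uncompressed block size; a power of two between
 *			PAGESIZE and ptob(DCCACHESIZE)
 *	ch_blkmap[]	file offsets of the zlib-compressed blocks
 *
 * ch_blkmap[0] is the offset of the first compressed block, so it also
 * bounds the header; decompvp() reads ch_blkmap[0] + sizeof (uint64_t)
 * bytes of header and appends a sentinel entry (the subordinate file's
 * size) so that block i's compressed length is always
 * ch_blkmap[i + 1] - ch_blkmap[i].
 */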

/*
 * Define data structures within this file.
 */
#define	DCSHFT		5
#define	DCTABLESIZE	16

#if ((DCTABLESIZE & (DCTABLESIZE - 1)) == 0)
#define	DCHASH(vp)	(((uintptr_t)(vp) >> DCSHFT) & (DCTABLESIZE - 1))
#else
#define	DCHASH(vp)	(((uintptr_t)(vp) >> DCSHFT) % DCTABLESIZE)
#endif
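
/*
 * Example: with DCSHFT 5 and DCTABLESIZE 16, a subordinate vnode pointer
 * such as 0xffffff01a83e4c40 hashes to (0xffffff01a83e4c40 >> 5) & 15 = 2.
 * Shifting first discards the low-order bits, which carry little entropy
 * since vnodes come from a kmem cache and are therefore aligned.
 */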

#define	DCLRUSIZE	16

#define	DCCACHESIZE	4

#define	rounddown(x, y)	((x) & ~((y) - 1))

struct dcnode	*dctable[DCTABLESIZE];

struct dcnode	*dclru;
static int	dclru_len;

kmutex_t	dctable_lock;

dev_t		dcdev;
struct vfs	dc_vfs;

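/*
 * dcbuf_cache[] holds one compressed-buffer kmem cache per supported
 * block size, indexed by btop() of the block size; entries are created
 * lazily by decompvp() under dccache_lock.
 */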
struct kmem_cache *dcnode_cache;
struct kmem_cache *dcbuf_cache[DCCACHESIZE];

kmutex_t dccache_lock;

static int dcinit(int, char *);

static struct dcnode *dcnode_alloc(void);
static void dcnode_free(struct dcnode *);
static void dcnode_recycle(struct dcnode *);

static void dcinsert(struct dcnode *);
static void dcdelete(struct dcnode *);
static struct dcnode *dcfind(struct vnode *);
static void dclru_add(struct dcnode *);
static void dclru_sub(struct dcnode *);


/*
 * This is the loadable module wrapper.
 */
#include <sys/modctl.h>

struct vfsops *dc_vfsops;

static vfsdef_t vfw = {
	VFSDEF_VERSION,
	"dcfs",
	dcinit,
	VSW_ZMOUNT,
	NULL
};

/*
 * Module linkage information for the kernel.
 */
extern struct mod_ops mod_fsops;

static struct modlfs modlfs = {
	&mod_fsops, "compressed filesystem", &vfw
};

static struct modlinkage modlinkage = {
	MODREV_1, (void *)&modlfs, NULL
};

int
_init()
{
	return (mod_install(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}


static int dc_open(struct vnode **, int, struct cred *, caller_context_t *);
static int dc_close(struct vnode *, int, int, offset_t,
    struct cred *, caller_context_t *);
static int dc_read(struct vnode *, struct uio *, int, struct cred *,
    struct caller_context *);
static int dc_getattr(struct vnode *, struct vattr *, int,
    struct cred *, caller_context_t *);
static int dc_setattr(struct vnode *, struct vattr *, int, struct cred *,
    struct caller_context *);
static int dc_access(struct vnode *, int, int,
    struct cred *, caller_context_t *);
static int dc_fsync(struct vnode *, int, struct cred *, caller_context_t *);
static void dc_inactive(struct vnode *, struct cred *, caller_context_t *);
static int dc_fid(struct vnode *, struct fid *, caller_context_t *);
static int dc_seek(struct vnode *, offset_t, offset_t *, caller_context_t *);
static int dc_frlock(struct vnode *, int, struct flock64 *, int, offset_t,
    struct flk_callback *, struct cred *, caller_context_t *);
static int dc_realvp(struct vnode *, struct vnode **, caller_context_t *);
static int dc_getpage(struct vnode *, offset_t, size_t, uint_t *,
    struct page **, size_t, struct seg *, caddr_t, enum seg_rw,
    struct cred *, caller_context_t *);
static int dc_putpage(struct vnode *, offset_t, size_t, int,
    struct cred *, caller_context_t *);
static int dc_map(struct vnode *, offset_t, struct as *, caddr_t *, size_t,
    uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
static int dc_addmap(struct vnode *, offset_t, struct as *, caddr_t, size_t,
    uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
static int dc_delmap(struct vnode *, offset_t, struct as *, caddr_t, size_t,
    uint_t, uint_t, uint_t, struct cred *, caller_context_t *);

struct vnodeops *dc_vnodeops;

const fs_operation_def_t dc_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = dc_open },
	VOPNAME_CLOSE,		{ .vop_close = dc_close },
	VOPNAME_READ,		{ .vop_read = dc_read },
	VOPNAME_GETATTR,	{ .vop_getattr = dc_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = dc_setattr },
	VOPNAME_ACCESS,		{ .vop_access = dc_access },
	VOPNAME_FSYNC,		{ .vop_fsync = dc_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = dc_inactive },
	VOPNAME_FID,		{ .vop_fid = dc_fid },
	VOPNAME_SEEK,		{ .vop_seek = dc_seek },
	VOPNAME_FRLOCK,		{ .vop_frlock = dc_frlock },
	VOPNAME_REALVP,		{ .vop_realvp = dc_realvp },
	VOPNAME_GETPAGE,	{ .vop_getpage = dc_getpage },
	VOPNAME_PUTPAGE,	{ .vop_putpage = dc_putpage },
	VOPNAME_MAP,		{ .vop_map = dc_map },
	VOPNAME_ADDMAP,		{ .vop_addmap = dc_addmap },
	VOPNAME_DELMAP,		{ .vop_delmap = dc_delmap },
	NULL,			NULL
};

/*ARGSUSED*/
static int
dc_open(struct vnode **vpp, int flag, struct cred *cr, caller_context_t *ctp)
{
	return (0);
}

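/*
 * Release any file locks and share reservations held on this vnode by
 * the closing process.
 */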
/*ARGSUSED*/
static int
dc_close(struct vnode *vp, int flag, int count, offset_t off,
    struct cred *cr, caller_context_t *ctp)
{
	(void) cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
	cleanshares(vp, ttoproc(curthread)->p_pid);
	return (0);
}

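/*
 * VOP_READ: map the file through segkmap and copy the decompressed bytes
 * out with uiomove(); the decompression itself happens in the getpage
 * path when segmap faults the pages in.
 */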
/*ARGSUSED*/
static int
dc_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr,
    struct caller_context *ct)
{
	struct dcnode *dp = VTODC(vp);
	size_t rdsize = MAX(MAXBSIZE, dp->dc_hdr->ch_blksize);
	size_t fsize = dp->dc_hdr->ch_fsize;
	int error;

	/*
	 * Loop through the file with segmap; decompression occurs
	 * in dc_getpage().
	 */
	do {
		caddr_t base;
		size_t n;
		offset_t mapon;

		/*
		 * read to end of block or file
		 */
		mapon = uiop->uio_loffset & (rdsize - 1);
		n = MIN(rdsize - mapon, uiop->uio_resid);
		n = MIN(n, fsize - uiop->uio_loffset);
		if (n == 0)
			return (0);	/* at EOF */

		base = segmap_getmapflt(segkmap, vp, uiop->uio_loffset, n, 1,
		    S_READ);
		error = uiomove(base + mapon, n, UIO_READ, uiop);
		if (!error) {
			uint_t flags;

			if (n + mapon == rdsize || uiop->uio_loffset == fsize)
				flags = SM_DONTNEED;
			else
				flags = 0;
			error = segmap_release(segkmap, base, flags);
		} else
			(void) segmap_release(segkmap, base, 0);
	} while (!error && uiop->uio_resid);

	return (error);
}

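/*
 * VOP_GETATTR: ask the subordinate vnode for its attributes, then
 * substitute the uncompressed size from the compression header.
 */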
static int
dc_getattr(struct vnode *vp, struct vattr *vap, int flags,
    cred_t *cred, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct vnode *subvp = dp->dc_subvp;
	int error;

	error = VOP_GETATTR(subvp, vap, flags, cred, ctp);

	/* substitute uncompressed size */
	vap->va_size = dp->dc_hdr->ch_fsize;
	return (error);
}

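/*
 * dc_setattr() and dc_access() are simple pass-throughs to the
 * subordinate (compressed) vnode.
 */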
static int
dc_setattr(struct vnode *vp, struct vattr *vap, int flags, cred_t *cred,
    caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct vnode *subvp = dp->dc_subvp;

	return (VOP_SETATTR(subvp, vap, flags, cred, ctp));
}

static int
dc_access(struct vnode *vp, int mode, int flags,
    cred_t *cred, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct vnode *subvp = dp->dc_subvp;

	return (VOP_ACCESS(subvp, mode, flags, cred, ctp));
}

/*ARGSUSED*/
static int
dc_fsync(vnode_t *vp, int syncflag, cred_t *cred, caller_context_t *ctp)
{
	return (0);
}

/*ARGSUSED*/
static void
dc_inactive(struct vnode *vp, cred_t *cr, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);

	mutex_enter(&dctable_lock);
	mutex_enter(&vp->v_lock);
	ASSERT(vp->v_count >= 1);
	VN_RELE_LOCKED(vp);
	if (vp->v_count != 0) {
		/*
		 * Somebody accessed the dcnode before we got a chance to
		 * remove it. They will remove it when they do a vn_rele.
		 */
		mutex_exit(&vp->v_lock);
		mutex_exit(&dctable_lock);
		return;
	}
	mutex_exit(&vp->v_lock);

	dcnode_free(dp);

	mutex_exit(&dctable_lock);
}

static int
dc_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct vnode *subvp = dp->dc_subvp;

	return (VOP_FID(subvp, fidp, ctp));
}

static int
dc_seek(struct vnode *vp, offset_t oof, offset_t *noffp, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct vnode *subvp = dp->dc_subvp;

	return (VOP_SEEK(subvp, oof, noffp, ctp));
}

static int
dc_frlock(struct vnode *vp, int cmd, struct flock64 *bfp, int flag,
    offset_t offset, struct flk_callback *flk_cbp,
    cred_t *cr, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	int error;
	struct vattr vattr;

	/*
	 * If file is being mapped, disallow frlock.
	 */
	vattr.va_mask = AT_MODE;
	if (error = VOP_GETATTR(dp->dc_subvp, &vattr, 0, cr, ctp))
		return (error);
	if (dp->dc_mapcnt > 0 && MANDLOCK(vp, vattr.va_mode))
		return (EAGAIN);

	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ctp));
}

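/*
 * Decompress one full block that is absent from the page cache: create
 * the destination pages, read the compressed bytes from the subordinate
 * vnode at the offset given by the block map, inflate them, and zero any
 * tail beyond end-of-file.
 */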
/*ARGSUSED*/
static int
dc_getblock_miss(struct vnode *vp, offset_t off, size_t len, struct page **ppp,
    struct seg *seg, caddr_t addr, enum seg_rw rw, struct cred *cr)
{
	struct dcnode *dp = VTODC(vp);
	struct comphdr *hdr = dp->dc_hdr;
	struct page *pp;
	struct buf *bp;
	caddr_t saddr;
	off_t cblkno;
	size_t rdoff, rdsize, dsize;
	long xlen;
	int error, zerr;

	ASSERT(len == hdr->ch_blksize);
	/*
	 * Get destination pages and make them addressable
	 */
	pp = page_create_va(vp, off, len, PG_WAIT, seg, addr);
	bp = pageio_setup(pp, len, vp, B_READ);
	bp_mapin(bp);

	/*
	 * read compressed data from subordinate vnode
	 */
	saddr = kmem_cache_alloc(dp->dc_bufcache, KM_SLEEP);
	cblkno = off / len;
	rdoff = hdr->ch_blkmap[cblkno];
	rdsize = hdr->ch_blkmap[cblkno + 1] - rdoff;
	error = vn_rdwr(UIO_READ, dp->dc_subvp, saddr, rdsize, rdoff,
	    UIO_SYSSPACE, 0, 0, cr, NULL);
	if (error)
		goto cleanup;

	/*
	 * Uncompress
	 */
	dsize = len;
	zerr = z_uncompress(bp->b_un.b_addr, &dsize, saddr, dp->dc_zmax);
	if (zerr != Z_OK) {
		error = EIO;
		goto cleanup;
	}

	/*
	 * Handle EOF
	 */
	xlen = hdr->ch_fsize - off;
	if (xlen < len) {
		bzero(bp->b_un.b_addr + xlen, len - xlen);
		if (dsize != xlen)
			error = EIO;
	} else if (dsize != len)
		error = EIO;

	/*
	 * Clean up
	 */
cleanup:
	kmem_cache_free(dp->dc_bufcache, saddr);
	pageio_done(bp);
	*ppp = pp;
	return (error);
}

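/*
 * Return the pages of one compressed block: straight from the page cache
 * when every page is present, otherwise via dc_getblock_miss(), which
 * always decompresses the full block.
 */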
static int
dc_getblock(struct vnode *vp, offset_t off, size_t len, struct page **ppp,
    struct seg *seg, caddr_t addr, enum seg_rw rw, struct cred *cr)
{
	struct page *pp, *plist = NULL;
	offset_t pgoff;
	int rdblk;

	/*
	 * pvn_read_kluster() doesn't quite do what we want, since it
	 * thinks sub block reads are ok. Here we always decompress
	 * a full block.
	 */

	/*
	 * Check page cache
	 */
	rdblk = 0;
	for (pgoff = off; pgoff < off + len; pgoff += PAGESIZE) {
		pp = page_lookup(vp, pgoff, SE_EXCL);
		if (pp == NULL) {
			rdblk = 1;
			break;
		}
		page_io_lock(pp);
		page_add(&plist, pp);
		plist = plist->p_next;
	}
	if (!rdblk) {
		*ppp = plist;
		return (0);	/* all pages in cache */
	}

	/*
	 * Release the pages we did lock, so dc_getblock_miss()
	 * starts with a clean slate for the whole block.
	 */
	if (plist != NULL)
		pvn_io_done(plist);

	return (dc_getblock_miss(vp, off, len, ppp, seg, addr, rw, cr));
}

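/*
 * VOP_REALVP: expose the subordinate vnode (or its own real vnode, if it
 * has one) as the real vnode underlying this shadow.
 */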
static int
dc_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
{
	struct vnode *rvp;

	vp = VTODC(vp)->dc_subvp;
	if (VOP_REALVP(vp, &rvp, ct) == 0)
		vp = rvp;
	*vpp = vp;
	return (0);
}

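/*
 * VOP_GETPAGE: fault pages in by rounding the request out to whole
 * compressed blocks, since decompression can only be done a full
 * block at a time.
 */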
/*ARGSUSED10*/
static int
dc_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp,
    struct page *pl[], size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, struct cred *cr, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct comphdr *hdr = dp->dc_hdr;
	struct page *pp, *plist = NULL;
	caddr_t vp_baddr;
	offset_t vp_boff, vp_bend;
	size_t bsize = hdr->ch_blksize;
	int nblks, error;

	/* does not support write */
	if (rw == S_WRITE) {
		panic("write attempt on compressed file");
		/*NOTREACHED*/
	}

	if (protp)
		*protp = PROT_ALL;
	/*
	 * We don't support asynchronous operation at the moment, so
	 * just pretend we did it. If the pages are ever actually
	 * needed, they'll get brought in then.
	 */
	if (pl == NULL)
		return (0);

	/*
	 * Calc block start and end offsets
	 */
	vp_boff = rounddown(off, bsize);
	vp_bend = roundup(off + len, bsize);
	vp_baddr = (caddr_t)rounddown((uintptr_t)addr, bsize);

	nblks = (vp_bend - vp_boff) / bsize;
	while (nblks--) {
		error = dc_getblock(vp, vp_boff, bsize, &pp, seg, vp_baddr,
		    rw, cr);
		page_list_concat(&plist, &pp);
		vp_boff += bsize;
		vp_baddr += bsize;
	}
	if (!error)
		pvn_plist_init(plist, pl, plsz, off, len, rw);
	else
		pvn_read_done(plist, B_ERROR);
	return (error);
}

/*
 * This function should never be called. We need to have it to pass
 * it as an argument to other functions.
 */
/*ARGSUSED*/
static int
dc_putapage(struct vnode *vp, struct page *pp, u_offset_t *offp, size_t *lenp,
    int flags, struct cred *cr)
{
	/* should never happen */
	cmn_err(CE_PANIC, "dcfs: dc_putapage: dirty page");
	/*NOTREACHED*/
	return (0);
}


/*
 * The only flags we support are B_INVAL, B_FREE and B_DONTNEED.
 * B_INVAL is set by:
 *
 * 1) the MC_SYNC command of memcntl(2) to support the MS_INVALIDATE flag.
 * 2) the MC_ADVISE command of memcntl(2) with the MADV_DONTNEED advice
 *    which translates to an MC_SYNC with the MS_INVALIDATE flag.
 *
 * The B_FREE (as well as the B_DONTNEED) flag is set when the
 * MADV_SEQUENTIAL advice has been used. VOP_PUTPAGE is invoked
 * from SEGVN to release pages behind a pagefault.
 */
/*ARGSUSED5*/
static int
dc_putpage(struct vnode *vp, offset_t off, size_t len, int flags,
    struct cred *cr, caller_context_t *ctp)
{
	int error = 0;

	if (vp->v_count == 0) {
		panic("dcfs_putpage: bad v_count");
		/*NOTREACHED*/
	}

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	if (!vn_has_cached_data(vp))	/* no pages mapped */
		return (0);

	if (len == 0)		/* from 'off' to EOF */
		error = pvn_vplist_dirty(vp, off, dc_putapage, flags, cr);
	else {
		offset_t io_off;
		se_t se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;

		for (io_off = off; io_off < off + len; io_off += PAGESIZE) {
			page_t *pp;

			/*
			 * We insist on getting the page only if we are
			 * about to invalidate, free or write it and
			 * the B_ASYNC flag is not set.
			 */
			if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0))
				pp = page_lookup(vp, io_off, se);
			else
				pp = page_lookup_nowait(vp, io_off, se);

			if (pp == NULL)
				continue;
			/*
			 * Normally pvn_getdirty() should return 0, which
			 * implies that it has done the job for us.
			 * The shouldn't-happen scenario is when it returns 1.
			 * This means that the page has been modified and
			 * needs to be put back.
			 * Since we can't write to a dcfs compressed file,
			 * we fake a failed I/O and force pvn_write_done()
			 * to destroy the page.
			 */
			if (pvn_getdirty(pp, flags) == 1) {
				cmn_err(CE_NOTE, "dc_putpage: dirty page");
				pvn_write_done(pp, flags |
				    B_ERROR | B_WRITE | B_INVAL | B_FORCE);
			}
		}
	}
	return (error);
}

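/*
 * VOP_MAP: validate the request and establish a segvn mapping over this
 * vnode; mappings are refused while mandatory locks are in effect.
 */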
static int
dc_map(struct vnode *vp, offset_t off, struct as *as, caddr_t *addrp,
    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
    struct cred *cred, caller_context_t *ctp)
{
	struct vattr vattr;
	struct segvn_crargs vn_a;
	int error;

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	if (off < (offset_t)0 || (offset_t)(off + len) < (offset_t)0)
		return (ENXIO);

	/*
	 * If file is being locked, disallow mapping.
	 */
	if (error = VOP_GETATTR(VTODC(vp)->dc_subvp, &vattr, 0, cred, ctp))
		return (error);
	if (vn_has_mandatory_locks(vp, vattr.va_mode))
		return (EAGAIN);

	as_rangelock(as);

	if ((flags & MAP_FIXED) == 0) {
		map_addr(addrp, len, off, 1, flags);
		if (*addrp == NULL) {
			as_rangeunlock(as);
			return (ENOMEM);
		}
	} else {
		/*
		 * User specified address - blow away any previous mappings
		 */
		(void) as_unmap(as, *addrp, len);
	}

	vn_a.vp = vp;
	vn_a.offset = off;
	vn_a.type = flags & MAP_TYPE;
	vn_a.prot = prot;
	vn_a.maxprot = maxprot;
	vn_a.flags = flags & ~MAP_TYPE;
	vn_a.cred = cred;
	vn_a.amp = NULL;
	vn_a.szc = 0;
	vn_a.lgrp_mem_policy_flags = 0;

	error = as_map(as, *addrp, len, segvn_create, &vn_a);
	as_rangeunlock(as);
	return (error);
}

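/*
 * dc_addmap() and dc_delmap() track the number of mapped pages in
 * dc_mapcnt, which dc_frlock() consults to refuse mandatory locks on a
 * mapped file.
 */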
/*ARGSUSED*/
static int
dc_addmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
    struct cred *cr, caller_context_t *ctp)
{
	struct dcnode *dp;

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	dp = VTODC(vp);
	mutex_enter(&dp->dc_lock);
	dp->dc_mapcnt += btopr(len);
	mutex_exit(&dp->dc_lock);
	return (0);
}

/*ARGSUSED*/
static int
dc_delmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
    size_t len, uint_t prot, uint_t maxprot, uint_t flags,
    struct cred *cr, caller_context_t *ctp)
{
	struct dcnode *dp;

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	dp = VTODC(vp);
	mutex_enter(&dp->dc_lock);
	dp->dc_mapcnt -= btopr(len);
	ASSERT(dp->dc_mapcnt >= 0);
	mutex_exit(&dp->dc_lock);
	return (0);
}

/*
 * Constructor/destructor routines for dcnodes
 */
/*ARGSUSED1*/
static int
dcnode_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct dcnode *dp = buf;
	struct vnode *vp;

	vp = dp->dc_vp = vn_alloc(kmflags);
	if (vp == NULL) {
		return (-1);
	}
	vp->v_data = dp;
	vp->v_type = VREG;
	vp->v_flag = VNOSWAP;
	vp->v_vfsp = &dc_vfs;
	vn_setops(vp, dc_vnodeops);
	vn_exists(vp);

	mutex_init(&dp->dc_lock, NULL, MUTEX_DEFAULT, NULL);
	dp->dc_mapcnt = 0;
	dp->dc_lrunext = dp->dc_lruprev = NULL;
	dp->dc_hdr = NULL;
	dp->dc_subvp = NULL;
	return (0);
}

/*ARGSUSED*/
static void
dcnode_destructor(void *buf, void *cdrarg)
{
	struct dcnode *dp = buf;
	struct vnode *vp = DCTOV(dp);

	mutex_destroy(&dp->dc_lock);

	VERIFY(dp->dc_hdr == NULL);
	VERIFY(dp->dc_subvp == NULL);
	vn_invalid(vp);
	vn_free(vp);
}

static struct dcnode *
dcnode_alloc(void)
{
	struct dcnode *dp;

	/*
	 * If the free list is above DCLRUSIZE, re-use one from it;
	 * otherwise allocate a fresh dcnode.
	 */
	mutex_enter(&dctable_lock);
	if (dclru_len < DCLRUSIZE) {
		mutex_exit(&dctable_lock);
		dp = kmem_cache_alloc(dcnode_cache, KM_SLEEP);
	} else {
		ASSERT(dclru != NULL);
		dp = dclru;
		dclru_sub(dp);
		dcdelete(dp);
		mutex_exit(&dctable_lock);
		dcnode_recycle(dp);
	}
	return (dp);
}

static void
dcnode_free(struct dcnode *dp)
{
	struct vnode *vp = DCTOV(dp);

	ASSERT(MUTEX_HELD(&dctable_lock));

	/*
	 * If no cached pages, no need to put it on lru
	 */
	if (!vn_has_cached_data(vp)) {
		dcdelete(dp);
		dcnode_recycle(dp);
		kmem_cache_free(dcnode_cache, dp);
		return;
	}

	/*
	 * Add to lru; if it's over the limit, free from the head
	 */
	dclru_add(dp);
	if (dclru_len > DCLRUSIZE) {
		dp = dclru;
		dclru_sub(dp);
		dcdelete(dp);
		dcnode_recycle(dp);
		kmem_cache_free(dcnode_cache, dp);
	}
}

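/*
 * Strip a dcnode of its subordinate vnode, cached pages and compression
 * header, and reinitialize the shadow vnode so the dcnode can be reused.
 */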
static void
dcnode_recycle(struct dcnode *dp)
{
	struct vnode *vp;

	vp = DCTOV(dp);

	VN_RELE(dp->dc_subvp);
	dp->dc_subvp = NULL;
	(void) pvn_vplist_dirty(vp, 0, dc_putapage, B_INVAL, NULL);
	kmem_free(dp->dc_hdr, dp->dc_hdrsize);
	dp->dc_hdr = NULL;
	dp->dc_hdrsize = dp->dc_zmax = 0;
	dp->dc_bufcache = NULL;
	dp->dc_mapcnt = 0;
	vn_reinit(vp);
	vp->v_type = VREG;
	vp->v_flag = VNOSWAP;
	vp->v_vfsp = &dc_vfs;
}

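/*
 * Filesystem initialization: register the (empty) vfs ops and the dcfs
 * vnode ops, fabricate a device number for the read-only pseudo-vfs, and
 * create the dcnode cache.
 */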
static int
dcinit(int fstype, char *name)
{
	static const fs_operation_def_t dc_vfsops_template[] = {
		NULL, NULL
	};
	int error;
	major_t dev;

	error = vfs_setfsops(fstype, dc_vfsops_template, &dc_vfsops);
	if (error) {
		cmn_err(CE_WARN, "dcinit: bad vfs ops template");
		return (error);
	}
	VFS_INIT(&dc_vfs, dc_vfsops, NULL);
	dc_vfs.vfs_flag = VFS_RDONLY;
	dc_vfs.vfs_fstype = fstype;
	if ((dev = getudev()) == (major_t)-1)
		dev = 0;
	dcdev = makedevice(dev, 0);
	dc_vfs.vfs_dev = dcdev;

	error = vn_make_ops(name, dc_vnodeops_template, &dc_vnodeops);
	if (error != 0) {
		(void) vfs_freevfsops_by_type(fstype);
		cmn_err(CE_WARN, "dcinit: bad vnode ops template");
		return (error);
	}

	mutex_init(&dctable_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&dccache_lock, NULL, MUTEX_DEFAULT, NULL);
	dcnode_cache = kmem_cache_create("dcnode_cache", sizeof (struct dcnode),
	    0, dcnode_constructor, dcnode_destructor, NULL, NULL, NULL, 0);

	return (0);
}

/*
 * Return a shadow vnode with the given vp as its subordinate
 */
struct vnode *
decompvp(struct vnode *vp, cred_t *cred, caller_context_t *ctp)
{
	struct dcnode *dp, *ndp;
	struct comphdr thdr, *hdr;
	struct kmem_cache **cpp;
	struct vattr vattr;
	size_t hdrsize, bsize;
	int error;

	/*
	 * See if we have an existing shadow;
	 * if none, we have to manufacture one.
	 */
	mutex_enter(&dctable_lock);
	dp = dcfind(vp);
	mutex_exit(&dctable_lock);
	if (dp != NULL)
		return (DCTOV(dp));

	/*
	 * Make sure it's a valid compressed file
	 */
	hdr = &thdr;
	error = vn_rdwr(UIO_READ, vp, (caddr_t)hdr, sizeof (struct comphdr), 0,
	    UIO_SYSSPACE, 0, 0, cred, NULL);
	if (error || hdr->ch_magic != CH_MAGIC_ZLIB ||
	    hdr->ch_version != CH_VERSION || hdr->ch_algorithm != CH_ALG_ZLIB ||
	    hdr->ch_fsize == 0 || hdr->ch_blksize < PAGESIZE ||
	    hdr->ch_blksize > ptob(DCCACHESIZE) || !ISP2(hdr->ch_blksize))
		return (NULL);

	/* get underlying file size */
	if (VOP_GETATTR(vp, &vattr, 0, cred, ctp) != 0)
		return (NULL);

	/*
	 * Re-read entire header
	 */
	hdrsize = hdr->ch_blkmap[0] + sizeof (uint64_t);
	hdr = kmem_alloc(hdrsize, KM_SLEEP);
	error = vn_rdwr(UIO_READ, vp, (caddr_t)hdr, hdrsize, 0, UIO_SYSSPACE,
	    0, 0, cred, NULL);
	if (error) {
		kmem_free(hdr, hdrsize);
		return (NULL);
	}

	/*
	 * Add an extra blkmap entry to make dc_getblock_miss()'s
	 * life easier.
	 */
	bsize = hdr->ch_blksize;
	hdr->ch_blkmap[((hdr->ch_fsize - 1) / bsize) + 1] = vattr.va_size;

	ndp = dcnode_alloc();
	ndp->dc_subvp = vp;
	VN_HOLD(vp);
	ndp->dc_hdr = hdr;
	ndp->dc_hdrsize = hdrsize;

	/*
	 * Allocate kmem cache if none there already
	 */
	ndp->dc_zmax = ZMAXBUF(bsize);
	cpp = &dcbuf_cache[btop(bsize)];
	mutex_enter(&dccache_lock);
	if (*cpp == NULL)
		*cpp = kmem_cache_create("dcbuf_cache", ndp->dc_zmax, 0, NULL,
		    NULL, NULL, NULL, NULL, 0);
	mutex_exit(&dccache_lock);
	ndp->dc_bufcache = *cpp;

	/*
	 * Recheck table in case someone else created a shadow
	 * while we were blocked above.
	 */
	mutex_enter(&dctable_lock);
	dp = dcfind(vp);
	if (dp != NULL) {
		mutex_exit(&dctable_lock);
		dcnode_recycle(ndp);
		kmem_cache_free(dcnode_cache, ndp);
		return (DCTOV(dp));
	}
	dcinsert(ndp);
	mutex_exit(&dctable_lock);

	return (DCTOV(ndp));
}


/*
 * dcnode lookup table
 * These routines maintain a table of dcnodes hashed by their
 * subordinate vnode so that they can be found if they already
 * exist in the vnode cache.
 */

/*
 * Put a dcnode in the table.
 */
static void
dcinsert(struct dcnode *newdp)
{
	int idx = DCHASH(newdp->dc_subvp);

	ASSERT(MUTEX_HELD(&dctable_lock));
	newdp->dc_hash = dctable[idx];
	dctable[idx] = newdp;
}

/*
 * Remove a dcnode from the hash table.
 */
static void
dcdelete(struct dcnode *deldp)
{
	int idx = DCHASH(deldp->dc_subvp);
	struct dcnode *dp, *prevdp;

	ASSERT(MUTEX_HELD(&dctable_lock));
	dp = dctable[idx];
	if (dp == deldp)
		dctable[idx] = dp->dc_hash;
	else {
		for (prevdp = dp, dp = dp->dc_hash; dp != NULL;
		    prevdp = dp, dp = dp->dc_hash) {
			if (dp == deldp) {
				prevdp->dc_hash = dp->dc_hash;
				break;
			}
		}
	}
	ASSERT(dp != NULL);
}

/*
 * Find a shadow vnode in the dctable hash list.
 */
static struct dcnode *
dcfind(struct vnode *vp)
{
	struct dcnode *dp;

	ASSERT(MUTEX_HELD(&dctable_lock));
	for (dp = dctable[DCHASH(vp)]; dp != NULL; dp = dp->dc_hash)
		if (dp->dc_subvp == vp) {
			VN_HOLD(DCTOV(dp));
			if (dp->dc_lrunext)
				dclru_sub(dp);
			return (dp);
		}
	return (NULL);
}

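/*
 * DEBUG-only: walk the circular LRU list and count its entries; used in
 * ASSERTs to cross-check dclru_len.
 */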
#ifdef DEBUG
static int
dclru_count(void)
{
	struct dcnode *dp;
	int i = 0;

	if (dclru == NULL)
		return (0);
	for (dp = dclru; dp->dc_lrunext != dclru; dp = dp->dc_lrunext)
		i++;
	return (i + 1);
}
#endif

static void
dclru_add(struct dcnode *dp)
{
	/*
	 * Add to dclru as double-link chain
	 */
	ASSERT(MUTEX_HELD(&dctable_lock));
	if (dclru == NULL) {
		dclru = dp;
		dp->dc_lruprev = dp->dc_lrunext = dp;
	} else {
		struct dcnode *last = dclru->dc_lruprev;

		dclru->dc_lruprev = dp;
		last->dc_lrunext = dp;
		dp->dc_lruprev = last;
		dp->dc_lrunext = dclru;
	}
	dclru_len++;
	ASSERT(dclru_len == dclru_count());
}

static void
dclru_sub(struct dcnode *dp)
{
	ASSERT(MUTEX_HELD(&dctable_lock));
	dp->dc_lrunext->dc_lruprev = dp->dc_lruprev;
	dp->dc_lruprev->dc_lrunext = dp->dc_lrunext;
	if (dp == dclru)
		dclru = dp->dc_lrunext == dp ? NULL : dp->dc_lrunext;
	dp->dc_lrunext = dp->dc_lruprev = NULL;
	dclru_len--;
	ASSERT(dclru_len == dclru_count());
}