1 /* 2 * Copyright (c) 2010 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 /* 36 * Implements new VFS/VM coherency functions. For conforming VFSs 37 * we treat the backing VM object slightly differently. 
Instead of 38 * maintaining a number of pages to exactly fit the size of the file 39 * we instead maintain pages to fit the entire contents of the last 40 * buffer cache buffer used by the file. 41 * 42 * For VFSs like NFS and HAMMER which use (generally speaking) fixed 43 * sized buffers this greatly reduces the complexity of VFS/VM interactions. 44 * 45 * Truncations no longer invalidate pages covered by the buffer cache 46 * beyond the file EOF which still fit within the file's last buffer. 47 * We simply unmap them and do not allow userland to fault them in. 48 * 49 * The VFS is no longer responsible for zero-filling buffers during a 50 * truncation, the last buffer will be automatically zero-filled by 51 * nvtruncbuf(). 52 * 53 * This code is intended to (eventually) replace vtruncbuf() and 54 * vnode_pager_setsize(). 55 */ 56 57 #include <sys/param.h> 58 #include <sys/systm.h> 59 #include <sys/buf.h> 60 #include <sys/conf.h> 61 #include <sys/fcntl.h> 62 #include <sys/file.h> 63 #include <sys/kernel.h> 64 #include <sys/malloc.h> 65 #include <sys/mount.h> 66 #include <sys/proc.h> 67 #include <sys/socket.h> 68 #include <sys/stat.h> 69 #include <sys/sysctl.h> 70 #include <sys/unistd.h> 71 #include <sys/vmmeter.h> 72 #include <sys/vnode.h> 73 74 #include <machine/limits.h> 75 76 #include <vm/vm.h> 77 #include <vm/vm_object.h> 78 #include <vm/vm_extern.h> 79 #include <vm/vm_kern.h> 80 #include <vm/pmap.h> 81 #include <vm/vm_map.h> 82 #include <vm/vm_page.h> 83 #include <vm/vm_pager.h> 84 #include <vm/vnode_pager.h> 85 #include <vm/vm_zone.h> 86 87 #include <sys/buf2.h> 88 #include <vm/vm_page2.h> 89 90 static int nvtruncbuf_bp_trunc_cmp(struct buf *bp, void *data); 91 static int nvtruncbuf_bp_trunc(struct buf *bp, void *data); 92 static int nvtruncbuf_bp_metasync_cmp(struct buf *bp, void *data); 93 static int nvtruncbuf_bp_metasync(struct buf *bp, void *data); 94 95 /* 96 * Truncate a file's buffer and pages to a specified length. 
 * The byte-granular length of the file is specified along with the block
 * size of the buffer containing that offset.
 *
 * If the last buffer straddles the length its contents will be zero-filled
 * as appropriate.  All buffers and pages after the last buffer will be
 * destroyed.  The last buffer itself will be destroyed only if the length
 * is exactly aligned with it.
 *
 * UFS typically passes the old block size prior to the actual truncation,
 * then later resizes the block based on the new file size.  NFS uses a
 * fixed block size and doesn't care.  HAMMER uses a block size based on
 * the offset which is fixed for any particular offset.
 *
 * When zero-filling we must bdwrite() to avoid a window of opportunity
 * where the kernel might throw away a clean buffer and the filesystem
 * then attempts to bread() it again before completing (or as part of)
 * the extension.  The filesystem is still responsible for zero-filling
 * any remainder when writing to the media in the strategy function when
 * it is able to do so without the page being mapped.  The page may still
 * be mapped by userland here.
 *
 * When modifying a buffer we must clear any cached raw disk offset.
 * bdwrite() will call BMAP on it again.  Some filesystems, like HAMMER,
 * never overwrite existing data blocks.
 */

/*
 * Scan argument shared by the RB_SCAN compare/callback functions below.
 */
struct truncbuf_info {
        struct vnode *vp;       /* vnode being truncated */
        off_t truncloffset;     /* truncation point */
        int clean;              /* clean tree, else dirty tree */
};

int
nvtruncbuf(struct vnode *vp, off_t length, int blksize, int boff, int flags)
{
        struct truncbuf_info info;
        off_t truncboffset;
        const char *filename;
        struct buf *bp;
        int count;
        int error;

        /*
         * Round up to the *next* block, then destroy the buffers in question.
         * Since we are only removing some of the buffers we must rely on the
         * scan count to determine whether a loop is necessary.
         *
         * Destroy any pages beyond the last buffer.
         */
        if (boff < 0)
                boff = (int)(length % blksize);
        if (boff)
                info.truncloffset = length + (blksize - boff);
        else
                info.truncloffset = length;
        info.vp = vp;
        lwkt_gettoken(&vp->v_token);
        do {
                /*
                 * Scan both the clean and dirty trees; buffers can migrate
                 * between trees while we block, so loop until a full pass
                 * finds nothing to destroy.
                 */
                info.clean = 1;
                count = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree,
                                nvtruncbuf_bp_trunc_cmp,
                                nvtruncbuf_bp_trunc, &info);
                info.clean = 0;
                count += RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
                                nvtruncbuf_bp_trunc_cmp,
                                nvtruncbuf_bp_trunc, &info);
        } while(count);

        nvnode_pager_setsize(vp, length, blksize, boff);

        /*
         * Zero-fill the area beyond the file EOF that still fits within
         * the last buffer.  We must mark the buffer as dirty even though
         * the modified area is beyond EOF to avoid races where the kernel
         * might flush the buffer before the filesystem is able to reallocate
         * the block.
         *
         * The VFS is responsible for dealing with the actual truncation.
         *
         * Only do this if NVEXTF_TRIVIAL is not set, otherwise it is up to
         * the VFS to handle the block straddling the EOF.
         */
        if (boff && (flags & NVEXTF_TRIVIAL) == 0) {
                truncboffset = length - boff;
                error = bread_kvabio(vp, truncboffset, blksize, &bp);
                if (error == 0) {
                        bkvasync(bp);
                        bzero(bp->b_data + boff, blksize - boff);
                        /*
                         * Clamp any recorded delayed-write dirty range to
                         * the new EOF offset within the buffer.
                         */
                        if (bp->b_flags & B_DELWRI) {
                                if (bp->b_dirtyoff > boff)
                                        bp->b_dirtyoff = boff;
                                if (bp->b_dirtyend > boff)
                                        bp->b_dirtyend = boff;
                        }
                        /* clear cached disk offset, bdwrite re-BMAPs */
                        bp->b_bio2.bio_offset = NOOFFSET;
                        if (flags & NVEXTF_BUWRITE)
                                buwrite(bp);
                        else
                                bdwrite(bp);
                } else {
                        kprintf("nvtruncbuf: bread error %d @0x%016jx\n",
                                error, truncboffset);
                        bp->b_flags |= B_INVAL | B_RELBUF;
                        brelse(bp);
                }
        } else {
                error = 0;
        }

        /*
         * For safety, fsync any remaining metadata if the file is not being
         * truncated to 0.  Since the metadata does not represent the entire
         * dirty list we have to rely on the hit count to ensure that we get
         * all of it.
         *
         * This is typically applicable only to UFS.  NFS and HAMMER do
         * not store indirect blocks in the per-vnode buffer cache.
         */
        if (length > 0) {
                do {
                        count = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
                                        nvtruncbuf_bp_metasync_cmp,
                                        nvtruncbuf_bp_metasync, &info);
                } while (count);
        }

        /*
         * It is possible to have in-progress I/O from buffers that were
         * not part of the truncation.  This should not happen if we
         * are truncating to 0-length.
         */
        bio_track_wait(&vp->v_track_write, 0, 0);

        /*
         * Debugging only: snapshot a name for the warning below.  The
         * namecache list is protected by the vnode spinlock.
         */
        spin_lock(&vp->v_spin);
        filename = TAILQ_FIRST(&vp->v_namecache) ?
                   TAILQ_FIRST(&vp->v_namecache)->nc_name : "?";
        spin_unlock(&vp->v_spin);

        /*
         * Make sure no buffers were instantiated while we were trying
         * to clean out the remaining VM pages.  This could occur due
         * to busy dirty VM pages being flushed out to disk.
         */
        do {
                info.clean = 1;
                count = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree,
                                nvtruncbuf_bp_trunc_cmp,
                                nvtruncbuf_bp_trunc, &info);
                info.clean = 0;
                count += RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
                                nvtruncbuf_bp_trunc_cmp,
                                nvtruncbuf_bp_trunc, &info);
                if (count) {
                        kprintf("Warning: vtruncbuf(): Had to re-clean %d "
                               "left over buffers in %s\n", count, filename);
                }
        } while(count);

        lwkt_reltoken(&vp->v_token);

        return (error);
}

/*
 * The callback buffer is beyond the new file EOF and must be destroyed.
 * Note that the compare function must conform to the RB_SCAN's requirements.
 */
static
int
nvtruncbuf_bp_trunc_cmp(struct buf *bp, void *data)
{
        struct truncbuf_info *info = data;

        /*
         * 0 means the buffer is at or beyond the truncation point and
         * is in-range for the scan; -1 means it is below the point.
         */
        if (bp->b_loffset >= info->truncloffset)
                return(0);
        return(-1);
}

static
int
nvtruncbuf_bp_trunc(struct buf *bp, void *data)
{
        struct truncbuf_info *info = data;

        /*
         * Do not try to use a buffer we cannot immediately lock,
         * but sleep anyway to prevent a livelock.  The code will
         * loop until all buffers can be acted upon.
         */
        if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
                /*
                 * Lock is contested.  Take an extra ref so the buffer
                 * cannot be reused out from under us, block until it is
                 * lockable, then release it; the outer do/while loop in
                 * nvtruncbuf() rescans and picks it up again.
                 */
                atomic_add_int(&bp->b_refs, 1);
                if (BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL) == 0)
                        BUF_UNLOCK(bp);
                atomic_subtract_int(&bp->b_refs, 1);
        } else if ((info->clean && (bp->b_flags & B_DELWRI)) ||
                   (info->clean == 0 && (bp->b_flags & B_DELWRI) == 0) ||
                   bp->b_vp != info->vp ||
                   nvtruncbuf_bp_trunc_cmp(bp, data)) {
                /*
                 * Re-validate after acquiring the lock: the buffer may
                 * have switched trees (B_DELWRI changed), changed vnodes,
                 * or fallen out of the truncation range.  Leave it alone.
                 */
                BUF_UNLOCK(bp);
        } else {
                /*
                 * Destroy the buffer and its backing store.
                 */
                bremfree(bp);
                bp->b_flags |= (B_INVAL | B_RELBUF | B_NOCACHE);
                brelse(bp);
        }
        lwkt_yield();
        return(1);
}

/*
 * Fsync all meta-data after truncating a file to be non-zero.  Only metadata
 * blocks (with a negative loffset) are scanned.
 * Note that the compare function must conform to the RB_SCAN's requirements.
 */
static int
nvtruncbuf_bp_metasync_cmp(struct buf *bp, void *data __unused)
{
        /* metadata buffers carry negative logical offsets */
        if (bp->b_loffset < 0)
                return(0);
        lwkt_yield();
        return(1);
}

static int
nvtruncbuf_bp_metasync(struct buf *bp, void *data)
{
        struct truncbuf_info *info = data;

        /*
         * Do not try to use a buffer we cannot immediately lock,
         * but sleep anyway to prevent a livelock.  The code will
         * loop until all buffers can be acted upon.
         */
        if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
                atomic_add_int(&bp->b_refs, 1);
                if (BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL) == 0)
                        BUF_UNLOCK(bp);
                atomic_subtract_int(&bp->b_refs, 1);
        } else if ((bp->b_flags & B_DELWRI) == 0 ||
                   bp->b_vp != info->vp ||
                   nvtruncbuf_bp_metasync_cmp(bp, data)) {
                /* not a dirty metadata buffer for our vnode anymore */
                BUF_UNLOCK(bp);
        } else {
                /*
                 * Issue an asynchronous write for the delayed-write
                 * metadata buffer.
                 */
                bremfree(bp);
                bawrite(bp);
        }
        lwkt_yield();
        return(1);
}

/*
 * Extend a file's buffer and pages to a new, larger size.  The block size
 * at both the old and new length must be passed, but buffer cache operations
 * will only be performed on the old block.  The new nlength/nblksize will
 * be used to properly set the VM object size.
 *
 * To make this explicit we require the old length to be passed even though
 * we can acquire it from vp->v_filesize, which also avoids potential
 * corruption if the filesystem and vp get desynchronized somehow.
 *
 * If the caller intends to immediately write into the newly extended
 * space pass NVEXTF_TRIVIAL.  If not set, the original buffer will be
 * zero-filled as necessary to clean out any junk in the extended space.
 * If non-zero the original buffer (straddling EOF) is not touched.
 *
 * When zero-filling we must bdwrite() to avoid a window of opportunity
 * where the kernel might throw away a clean buffer and the filesystem
 * then attempts to bread() it again before completing (or as part of)
 * the extension.  The filesystem is still responsible for zero-filling
 * any remainder when writing to the media in the strategy function when
 * it is able to do so without the page being mapped.  The page may still
 * be mapped by userland here.
 *
 * When modifying a buffer we must clear any cached raw disk offset.
 * bdwrite() will call BMAP on it again.  Some filesystems, like HAMMER,
 * never overwrite existing data blocks.
375 */ 376 int 377 nvextendbuf(struct vnode *vp, off_t olength, off_t nlength, 378 int oblksize, int nblksize, int oboff, int nboff, int flags) 379 { 380 off_t truncboffset; 381 struct buf *bp; 382 int error; 383 384 error = 0; 385 nvnode_pager_setsize(vp, nlength, nblksize, nboff); 386 if ((flags & NVEXTF_TRIVIAL) == 0) { 387 if (oboff < 0) 388 oboff = (int)(olength % oblksize); 389 truncboffset = olength - oboff; 390 391 if (oboff) { 392 error = bread_kvabio(vp, truncboffset, oblksize, &bp); 393 if (error == 0) { 394 bkvasync(bp); 395 bzero(bp->b_data + oboff, oblksize - oboff); 396 bp->b_bio2.bio_offset = NOOFFSET; 397 if (flags & NVEXTF_BUWRITE) 398 buwrite(bp); 399 else 400 bdwrite(bp); 401 } else { 402 kprintf("nvextendbuf: bread EOF @ %016jx " 403 "error %d\n", 404 truncboffset, error); 405 bp->b_flags |= B_INVAL | B_RELBUF; 406 brelse(bp); 407 } 408 } 409 } 410 return (error); 411 } 412 413 /* 414 * Set vp->v_filesize and vp->v_object->size, destroy pages beyond 415 * the last buffer when truncating. 416 * 417 * This function does not do any zeroing or invalidating of partially 418 * overlapping pages. Zeroing is the responsibility of nvtruncbuf(). 419 * However, it does unmap VM pages from the user address space on a 420 * page-granular (verses buffer cache granular) basis. 421 * 422 * If boff is passed as -1 the base offset of the buffer cache buffer is 423 * calculated from length and blksize. Filesystems such as UFS which deal 424 * with fragments have to specify a boff >= 0 since the base offset cannot 425 * be calculated from length and blksize. 426 * 427 * For UFS blksize is the 'new' blocksize, used only to determine how large 428 * the VM object must become. 
 */
void
nvnode_pager_setsize(struct vnode *vp, off_t length, int blksize, int boff)
{
        vm_pindex_t nobjsize;
        vm_pindex_t oobjsize;
        vm_pindex_t pi;
        vm_object_t object;
        vm_page_t m;
        off_t truncboffset;

        /*
         * Degenerate conditions: no backing object, or no size change.
         */
        if ((object = vp->v_object) == NULL)
                return;
        vm_object_hold(object);
        if (length == vp->v_filesize) {
                vm_object_drop(object);
                return;
        }

        /*
         * Calculate the size of the VM object, coverage includes
         * the buffer straddling EOF.  If EOF is buffer-aligned
         * we don't bother.
         *
         * Buffers do not have to be page-aligned.  Make sure
         * nobjsize is beyond the last page of the buffer.
         */
        if (boff < 0)
                boff = (int)(length % blksize);
        truncboffset = length - boff;
        oobjsize = object->size;
        if (boff)
                nobjsize = OFF_TO_IDX(truncboffset + blksize + PAGE_MASK);
        else
                nobjsize = OFF_TO_IDX(truncboffset + PAGE_MASK);
        object->size = nobjsize;

        if (length < vp->v_filesize) {
                /*
                 * File has shrunk, toss any cached pages beyond
                 * the end of the buffer (blksize aligned) for the
                 * new EOF.
                 */
                vp->v_filesize = length;
                if (nobjsize < oobjsize) {
                        vm_object_page_remove(object, nobjsize, oobjsize,
                                              FALSE);
                }

                /*
                 * Unmap any pages (page aligned) beyond the new EOF.
                 * The pages remain part of the (last) buffer and are not
                 * invalidated.
                 */
                pi = OFF_TO_IDX(length + PAGE_MASK);
                while (pi < nobjsize) {
                        m = vm_page_lookup_busy_wait(object, pi, FALSE, "vmpg");
                        if (m) {
                                /*
                                 * Remove all pmap mappings; the page itself
                                 * stays valid in the object so userland can
                                 * no longer fault it in.
                                 */
                                vm_page_protect(m, VM_PROT_NONE);
                                vm_page_wakeup(m);
                        }
                        ++pi;
                        lwkt_yield();
                }
        } else {
                /*
                 * File has expanded.
                 */
                vp->v_filesize = length;
        }
        vm_object_drop(object);
}