/*
 * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Implements new VFS/VM coherency functions.  For conforming VFSs
 * we treat the backing VM object slightly differently.  Instead of
 * maintaining just enough pages to exactly fit the size of the file,
 * we maintain enough pages to fit the entire contents of the last
 * buffer cache buffer used by the file.
 *
 * For VFSs like NFS and HAMMER, which (generally speaking) use
 * fixed-size buffers, this greatly reduces the complexity of VFS/VM
 * interactions.
 *
 * Truncations no longer invalidate pages covered by the buffer cache
 * beyond the file EOF which still fit within the file's last buffer.
 * We simply unmap them and do not allow userland to fault them in.
 *
 * The VFS is no longer responsible for zero-filling buffers during a
 * truncation; the last buffer will be automatically zero-filled by
 * nvtruncbuf().
 *
 * This code is intended to (eventually) replace vtruncbuf() and
 * vnode_pager_setsize().
 */

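/*
 * Illustrative sketch (assumed usage, not taken from any particular
 * filesystem): a conforming VFS with a fixed 8KB block size would
 * replace the old two-step truncation sequence
 *
 *	vtruncbuf(vp, nsize, 8192);
 *	vnode_pager_setsize(vp, nsize);
 *
 * with a single call that also handles zero-filling the buffer
 * straddling the new EOF:
 *
 *	error = nvtruncbuf(vp, nsize, 8192, -1, 0);
 *
 * Passing boff = -1 derives the buffer offset from nsize % blksize;
 * trivial = 0 requests the zero-fill.
 */
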
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/unistd.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>
#include <vm/vm_zone.h>

#include <sys/buf2.h>
#include <sys/thread2.h>
#include <vm/vm_page2.h>

static int nvtruncbuf_bp_trunc_cmp(struct buf *bp, void *data);
static int nvtruncbuf_bp_trunc(struct buf *bp, void *data);
static int nvtruncbuf_bp_metasync_cmp(struct buf *bp, void *data);
static int nvtruncbuf_bp_metasync(struct buf *bp, void *data);

/*
 * Truncate a file's buffer and pages to a specified length.  The
 * byte-granular length of the file is specified along with the block
 * size of the buffer containing that offset.
 *
 * If the last buffer straddles the length, its contents will be
 * zero-filled as appropriate.  All buffers and pages after the last
 * buffer will be destroyed.  The last buffer itself will be destroyed
 * only if the length is exactly aligned with it.
 *
 * UFS typically passes the old block size prior to the actual truncation,
 * then later resizes the block based on the new file size.  NFS uses a
 * fixed block size and doesn't care.  HAMMER uses a block size based on
 * the offset, which is fixed for any particular offset.
 *
 * When zero-filling we must bdwrite() to avoid a window of opportunity
 * where the kernel might throw away a clean buffer and the filesystem
 * then attempts to bread() it again before completing (or as part of)
 * the extension.  The filesystem is still responsible for zero-filling
 * any remainder when writing to the media in the strategy function when
 * it is able to do so without the page being mapped.  The page may still
 * be mapped by userland here.
 *
 * When modifying a buffer we must clear any cached raw disk offset.
 * bdwrite() will call BMAP on it again.  Some filesystems, like HAMMER,
 * never overwrite existing data blocks.
 */
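/*
 * Parameter notes (derived from the code below): boff is the byte
 * offset of length within its buffer, or -1 to have it derived as
 * length % blksize.  A non-zero trivial skips the zero-fill of the
 * buffer straddling the new EOF, leaving that to the VFS.
 */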

struct truncbuf_info {
	struct vnode *vp;
	off_t truncloffset;	/* truncation point */
	int clean;		/* clean tree, else dirty tree */
};

int
nvtruncbuf(struct vnode *vp, off_t length, int blksize, int boff, int trivial)
{
	struct truncbuf_info info;
	off_t truncboffset;
	const char *filename;
	struct buf *bp;
	int count;
	int error;

	/*
	 * Round up to the *next* block, then destroy the buffers in question.
	 * Since we are only removing some of the buffers we must rely on the
	 * scan count to determine whether a loop is necessary.
	 *
	 * Destroy any pages beyond the last buffer.
	 */
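	/*
	 * Worked example (illustrative numbers, not from the original):
	 * with length = 10000 and blksize = 8192, boff = 10000 % 8192
	 * = 1808 and truncloffset = 10000 + (8192 - 1808) = 16384,
	 * i.e. the end of the buffer straddling the new EOF.
	 */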
	if (boff < 0)
		boff = (int)(length % blksize);
	if (boff)
		info.truncloffset = length + (blksize - boff);
	else
		info.truncloffset = length;
	info.vp = vp;
	lwkt_gettoken(&vp->v_token);
	do {
		info.clean = 1;
		count = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree,
				nvtruncbuf_bp_trunc_cmp,
				nvtruncbuf_bp_trunc, &info);
		info.clean = 0;
		count += RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
				nvtruncbuf_bp_trunc_cmp,
				nvtruncbuf_bp_trunc, &info);
	} while (count);

	nvnode_pager_setsize(vp, length, blksize, boff);

	/*
	 * Zero-fill the area beyond the file EOF that still fits within
	 * the last buffer.  We must mark the buffer as dirty even though
	 * the modified area is beyond EOF to avoid races where the kernel
	 * might flush the buffer before the filesystem is able to reallocate
	 * the block.
	 *
	 * The VFS is responsible for dealing with the actual truncation.
	 *
	 * Only do this if trivial is zero, otherwise it is up to the
	 * VFS to handle the block straddling the EOF.
	 */
	if (boff && trivial == 0) {
		truncboffset = length - boff;
		error = bread_kvabio(vp, truncboffset, blksize, &bp);
		if (error == 0) {
			bkvasync(bp);
			bzero(bp->b_data + boff, blksize - boff);
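			/*
			 * If a byte-range dirty region is recorded (e.g.
			 * by NFS), clip it so it does not extend past the
			 * new EOF within this buffer.
			 */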
			if (bp->b_flags & B_DELWRI) {
				if (bp->b_dirtyoff > boff)
					bp->b_dirtyoff = boff;
				if (bp->b_dirtyend > boff)
					bp->b_dirtyend = boff;
			}
			bp->b_bio2.bio_offset = NOOFFSET;
			bdwrite(bp);
		} else {
			kprintf("nvtruncbuf: bread error %d @0x%016jx\n",
				error, (intmax_t)truncboffset);
			bp->b_flags |= B_INVAL | B_RELBUF;
			brelse(bp);
		}
	} else {
		error = 0;
	}

	/*
	 * For safety, fsync any remaining metadata if the file is not being
	 * truncated to 0.  Since the metadata does not represent the entire
	 * dirty list we have to rely on the hit count to ensure that we get
	 * all of it.
	 *
	 * This is typically applicable only to UFS.  NFS and HAMMER do
	 * not store indirect blocks in the per-vnode buffer cache.
	 */
	if (length > 0) {
		do {
			count = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
					nvtruncbuf_bp_metasync_cmp,
					nvtruncbuf_bp_metasync, &info);
		} while (count);
	}

	/*
	 * It is possible to have in-progress I/O from buffers that were
	 * not part of the truncation.  This should not happen if we
	 * are truncating to 0-length.
	 */
	bio_track_wait(&vp->v_track_write, 0, 0);

	/*
	 * Debugging only
	 */
	spin_lock(&vp->v_spin);
	filename = TAILQ_FIRST(&vp->v_namecache) ?
		   TAILQ_FIRST(&vp->v_namecache)->nc_name : "?";
	spin_unlock(&vp->v_spin);

	/*
	 * Make sure no buffers were instantiated while we were trying
	 * to clean out the remaining VM pages.  This could occur due
	 * to busy dirty VM pages being flushed out to disk.
	 */
	do {
		info.clean = 1;
		count = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree,
				nvtruncbuf_bp_trunc_cmp,
				nvtruncbuf_bp_trunc, &info);
		info.clean = 0;
		count += RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
				nvtruncbuf_bp_trunc_cmp,
				nvtruncbuf_bp_trunc, &info);
		if (count) {
			kprintf("Warning: nvtruncbuf():  Had to re-clean %d "
			       "left over buffers in %s\n", count, filename);
		}
	} while (count);

	lwkt_reltoken(&vp->v_token);

	return (error);
}

/*
 * The callback buffer is beyond the new file EOF and must be destroyed.
 * Note that the compare function must conform to RB_SCAN's requirements.
 */
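/*
 * Returning 0 selects the buffer for the scan callback; returning -1
 * indicates the buffer sorts below the range being scanned and is
 * skipped.
 */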
static
int
nvtruncbuf_bp_trunc_cmp(struct buf *bp, void *data)
{
	struct truncbuf_info *info = data;

	if (bp->b_loffset >= info->truncloffset)
		return(0);
	return(-1);
}

static
int
nvtruncbuf_bp_trunc(struct buf *bp, void *data)
{
	struct truncbuf_info *info = data;

	/*
	 * Do not try to use a buffer we cannot immediately lock,
	 * but sleep anyway to prevent a livelock.  The code will
	 * loop until all buffers can be acted upon.
	 */
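	/*
	 * The b_refs hold below keeps the buffer from being recycled
	 * while we block on its lock; the lock is released immediately
	 * if obtained, so the buffer is revisited by a later scan pass
	 * rather than acted upon here.
	 */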
	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
		atomic_add_int(&bp->b_refs, 1);
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL) == 0)
			BUF_UNLOCK(bp);
		atomic_subtract_int(&bp->b_refs, 1);
	} else if ((info->clean && (bp->b_flags & B_DELWRI)) ||
		   (info->clean == 0 && (bp->b_flags & B_DELWRI) == 0) ||
		   bp->b_vp != info->vp ||
		   nvtruncbuf_bp_trunc_cmp(bp, data)) {
		BUF_UNLOCK(bp);
	} else {
		bremfree(bp);
		bp->b_flags |= (B_INVAL | B_RELBUF | B_NOCACHE);
		brelse(bp);
	}
	lwkt_yield();
	return(1);
}

/*
 * Fsync all meta-data after truncating a file to a non-zero length.
 * Only metadata blocks (those with a negative loffset) are scanned.
 * Note that the compare function must conform to RB_SCAN's requirements.
 */
static int
nvtruncbuf_bp_metasync_cmp(struct buf *bp, void *data __unused)
{
	if (bp->b_loffset < 0)
		return(0);
	lwkt_yield();
	return(1);
}

static int
nvtruncbuf_bp_metasync(struct buf *bp, void *data)
{
	struct truncbuf_info *info = data;

	/*
	 * Do not try to use a buffer we cannot immediately lock,
	 * but sleep anyway to prevent a livelock.  The code will
	 * loop until all buffers can be acted upon.
	 */
	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
		atomic_add_int(&bp->b_refs, 1);
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL) == 0)
			BUF_UNLOCK(bp);
		atomic_subtract_int(&bp->b_refs, 1);
	} else if ((bp->b_flags & B_DELWRI) == 0 ||
		   bp->b_vp != info->vp ||
		   nvtruncbuf_bp_metasync_cmp(bp, data)) {
		BUF_UNLOCK(bp);
	} else {
		bremfree(bp);
		bawrite(bp);
	}
	lwkt_yield();
	return(1);
}

/*
 * Extend a file's buffer and pages to a new, larger size.  The block size
 * at both the old and new length must be passed, but buffer cache operations
 * will only be performed on the old block.  The new nlength/nblksize will
 * be used to properly set the VM object size.
 *
 * To make this explicit we require the old length to be passed even
 * though it could be acquired from vp->v_filesize; this also avoids
 * potential corruption if the filesystem and the vnode somehow get
 * desynchronized.
 *
 * If the caller intends to immediately write into the newly extended
 * space, pass trivial == 1 and the original buffer (straddling the old
 * EOF) is not touched.  If trivial is 0 the original buffer is
 * zero-filled as necessary to clean out any junk in the extended space.
 *
 * When zero-filling we must bdwrite() to avoid a window of opportunity
 * where the kernel might throw away a clean buffer and the filesystem
 * then attempts to bread() it again before completing (or as part of)
 * the extension.  The filesystem is still responsible for zero-filling
 * any remainder when writing to the media in the strategy function when
 * it is able to do so without the page being mapped.  The page may still
 * be mapped by userland here.
 *
 * When modifying a buffer we must clear any cached raw disk offset.
 * bdwrite() will call BMAP on it again.  Some filesystems, like HAMMER,
 * never overwrite existing data blocks.
 */
int
nvextendbuf(struct vnode *vp, off_t olength, off_t nlength,
	    int oblksize, int nblksize, int oboff, int nboff, int trivial)
{
	off_t truncboffset;
	struct buf *bp;
	int error;

	error = 0;
	nvnode_pager_setsize(vp, nlength, nblksize, nboff);
	if (trivial == 0) {
		if (oboff < 0)
			oboff = (int)(olength % oblksize);
		truncboffset = olength - oboff;

		if (oboff) {
			error = bread_kvabio(vp, truncboffset, oblksize, &bp);
			if (error == 0) {
				bkvasync(bp);
				bzero(bp->b_data + oboff, oblksize - oboff);
				bp->b_bio2.bio_offset = NOOFFSET;
				bdwrite(bp);
			} else {
				kprintf("nvextendbuf: bread EOF @ %016jx "
					"error %d\n",
					(intmax_t)truncboffset, error);
				bp->b_flags |= B_INVAL | B_RELBUF;
				brelse(bp);
			}
		}
	}
	return (error);
}

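/*
 * Illustrative sketch (assumed usage, not from any particular
 * filesystem): a fixed-block-size VFS extending a file from osize
 * to nsize ahead of a write might call
 *
 *	error = nvextendbuf(vp, osize, nsize, 8192, 8192, -1, -1, 0);
 *
 * Negative oboff/nboff ask the routine to derive the buffer offsets
 * from the lengths; trivial = 0 zero-fills the buffer straddling the
 * old EOF.
 */
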
/*
 * Set vp->v_filesize and vp->v_object->size, and destroy pages beyond
 * the last buffer when truncating.
 *
 * This function does not do any zeroing or invalidating of partially
 * overlapping pages.  Zeroing is the responsibility of nvtruncbuf().
 * However, it does unmap VM pages from the user address space on a
 * page-granular (versus buffer-cache-granular) basis.
 *
 * If boff is passed as -1 the base offset of the buffer cache buffer is
 * calculated from length and blksize.  Filesystems such as UFS which deal
 * with fragments have to specify a boff >= 0 since the base offset cannot
 * be calculated from length and blksize.
 *
 * For UFS blksize is the 'new' blocksize, used only to determine how large
 * the VM object must become.
 */
void
nvnode_pager_setsize(struct vnode *vp, off_t length, int blksize, int boff)
{
	vm_pindex_t nobjsize;
	vm_pindex_t oobjsize;
	vm_pindex_t pi;
	vm_object_t object;
	vm_page_t m;
	off_t truncboffset;

	/*
	 * Degenerate conditions
	 */
	if ((object = vp->v_object) == NULL)
		return;
	vm_object_hold(object);
	if (length == vp->v_filesize) {
		vm_object_drop(object);
		return;
	}

	/*
	 * Calculate the size of the VM object, coverage includes
	 * the buffer straddling EOF.  If EOF is buffer-aligned
	 * we don't bother.
	 *
	 * Buffers do not have to be page-aligned.  Make sure
	 * nobjsize is beyond the last page of the buffer.
	 */
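	/*
	 * Worked example (illustrative numbers, assuming 4KB pages):
	 * with length = 10000, blksize = 8192 and boff = 1808,
	 * truncboffset = 8192 and nobjsize = OFF_TO_IDX(8192 + 8192 +
	 * PAGE_MASK) = 4 pages, covering through the end of the
	 * straddling buffer at offset 16383.
	 */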
	if (boff < 0)
		boff = (int)(length % blksize);
	truncboffset = length - boff;
	oobjsize = object->size;
	if (boff)
		nobjsize = OFF_TO_IDX(truncboffset + blksize + PAGE_MASK);
	else
		nobjsize = OFF_TO_IDX(truncboffset + PAGE_MASK);
	object->size = nobjsize;

	if (length < vp->v_filesize) {
		/*
		 * File has shrunk, toss any cached pages beyond
		 * the end of the buffer (blksize aligned) for the
		 * new EOF.
		 */
		vp->v_filesize = length;
		if (nobjsize < oobjsize) {
			vm_object_page_remove(object, nobjsize, oobjsize,
					      FALSE);
		}

		/*
		 * Unmap any pages (page aligned) beyond the new EOF.
		 * The pages remain part of the (last) buffer and are not
		 * invalidated.
		 */
		pi = OFF_TO_IDX(length + PAGE_MASK);
		while (pi < nobjsize) {
			m = vm_page_lookup_busy_wait(object, pi, FALSE, "vmpg");
			if (m) {
				vm_page_protect(m, VM_PROT_NONE);
				vm_page_wakeup(m);
			}
			++pi;
			lwkt_yield();
		}
	} else {
		/*
		 * File has expanded.
		 */
		vp->v_filesize = length;
	}
	vm_object_drop(object);
}