1 /* $NetBSD: lfs_bio.c,v 1.47 2002/11/27 11:36:40 yamt Exp $ */ 2 3 /*- 4 * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Konrad E. Schroder <perseant@hhhh.org>. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the NetBSD 21 * Foundation, Inc. and its contributors. 22 * 4. Neither the name of The NetBSD Foundation nor the names of its 23 * contributors may be used to endorse or promote products derived 24 * from this software without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 27 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 28 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 29 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 30 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 36 * POSSIBILITY OF SUCH DAMAGE. 37 */ 38 /* 39 * Copyright (c) 1991, 1993 40 * The Regents of the University of California. All rights reserved. 41 * 42 * Redistribution and use in source and binary forms, with or without 43 * modification, are permitted provided that the following conditions 44 * are met: 45 * 1. Redistributions of source code must retain the above copyright 46 * notice, this list of conditions and the following disclaimer. 47 * 2. Redistributions in binary form must reproduce the above copyright 48 * notice, this list of conditions and the following disclaimer in the 49 * documentation and/or other materials provided with the distribution. 50 * 3. All advertising materials mentioning features or use of this software 51 * must display the following acknowledgement: 52 * This product includes software developed by the University of 53 * California, Berkeley and its contributors. 54 * 4. Neither the name of the University nor the names of its contributors 55 * may be used to endorse or promote products derived from this software 56 * without specific prior written permission. 57 * 58 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 59 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 60 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 61 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 62 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 63 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 64 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 65 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 66 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 67 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 68 * SUCH DAMAGE. 69 * 70 * @(#)lfs_bio.c 8.10 (Berkeley) 6/10/95 71 */ 72 73 #include <sys/cdefs.h> 74 __KERNEL_RCSID(0, "$NetBSD: lfs_bio.c,v 1.47 2002/11/27 11:36:40 yamt Exp $"); 75 76 #include <sys/param.h> 77 #include <sys/systm.h> 78 #include <sys/proc.h> 79 #include <sys/buf.h> 80 #include <sys/vnode.h> 81 #include <sys/resourcevar.h> 82 #include <sys/mount.h> 83 #include <sys/kernel.h> 84 85 #include <ufs/ufs/inode.h> 86 #include <ufs/ufs/ufsmount.h> 87 #include <ufs/ufs/ufs_extern.h> 88 89 #include <sys/malloc.h> 90 #include <ufs/lfs/lfs.h> 91 #include <ufs/lfs/lfs_extern.h> 92 93 /* Macros to clear/set/test flags. */ 94 # define SET(t, f) (t) |= (f) 95 # define CLR(t, f) (t) &= ~(f) 96 # define ISSET(t, f) ((t) & (f)) 97 98 /* 99 * LFS block write function. 100 * 101 * XXX 102 * No write cost accounting is done. 103 * This is almost certainly wrong for synchronous operations and NFS. 104 */ 105 int locked_queue_count = 0; /* XXX Count of locked-down buffers. */ 106 long locked_queue_bytes = 0L; /* XXX Total size of locked buffers. */ 107 int lfs_writing = 0; /* Set if already kicked off a writer 108 because of buffer space */ 109 extern int lfs_dostats; 110 111 /* 112 * Try to reserve some blocks, prior to performing a sensitive operation that 113 * requires the vnode lock to be honored. If there is not enough space, give 114 * up the vnode lock temporarily and wait for the space to become available. 115 * 116 * Called with vp locked. (Note nowever that if fsb < 0, vp is ignored.) 117 * 118 * XXX YAMT - it isn't safe to unlock vp here 119 * because the node might be modified while we sleep. 120 * (eg. cached states like i_offset might be stale, 121 * the vnode might be truncated, etc..) 122 * maybe we should have a way to restart the vnode op. (EVOPRESTART?) 123 * 124 * XXX YAMT - we unlock the vnode so that cleaner can lock it. 125 * but it isn't enough. eg. for VOP_REMOVE, we should unlock the vnode that 126 * is going to be removed as well. 127 */ 128 int 129 lfs_reserve(struct lfs *fs, struct vnode *vp, int fsb) 130 { 131 CLEANERINFO *cip; 132 struct buf *bp; 133 int error, slept; 134 135 slept = 0; 136 while (fsb > 0 && !lfs_fits(fs, fsb + fs->lfs_ravail) && 137 vp != fs->lfs_unlockvp) { 138 VOP_UNLOCK(vp, 0); 139 140 if (!slept) { 141 #ifdef DEBUG 142 printf("lfs_reserve: waiting for %ld (bfree = %d," 143 " est_bfree = %d)\n", 144 fsb + fs->lfs_ravail, fs->lfs_bfree, 145 LFS_EST_BFREE(fs)); 146 #endif 147 } 148 ++slept; 149 150 /* Wake up the cleaner */ 151 LFS_CLEANERINFO(cip, fs, bp); 152 LFS_SYNC_CLEANERINFO(cip, fs, bp, 0); 153 wakeup(&lfs_allclean_wakeup); 154 wakeup(&fs->lfs_nextseg); 155 156 error = tsleep(&fs->lfs_avail, PCATCH | PUSER, "lfs_reserve", 157 0); 158 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); /* XXX use lockstatus */ 159 if (error) 160 return error; 161 } 162 #ifdef DEBUG 163 if (slept) 164 printf("lfs_reserve: woke up\n"); 165 #endif 166 fs->lfs_ravail += fsb; 167 return 0; 168 } 169 170 /* 171 * 172 * XXX we don't let meta-data writes run out of space because they can 173 * come from the segment writer. We need to make sure that there is 174 * enough space reserved so that there's room to write meta-data 175 * blocks. 176 * 177 * Also, we don't let blocks that have come to us from the cleaner 178 * run out of space. 179 */ 180 #define CANT_WAIT(BP,F) (IS_IFILE((BP)) || (BP)->b_lblkno < 0 || ((F) & BW_CLEAN)) 181 182 int 183 lfs_bwrite(void *v) 184 { 185 struct vop_bwrite_args /* { 186 struct buf *a_bp; 187 } */ *ap = v; 188 struct buf *bp = ap->a_bp; 189 190 #ifdef DIAGNOSTIC 191 if (VTOI(bp->b_vp)->i_lfs->lfs_ronly == 0 && (bp->b_flags & B_ASYNC)) { 192 panic("bawrite LFS buffer"); 193 } 194 #endif /* DIAGNOSTIC */ 195 return lfs_bwrite_ext(bp,0); 196 } 197 198 /* 199 * Determine if there is enough room currently available to write fsb 200 * blocks. We need enough blocks for the new blocks, the current 201 * inode blocks (including potentially the ifile inode), a summary block, 202 * and the segment usage table, plus an ifile block. 203 */ 204 int 205 lfs_fits(struct lfs *fs, int fsb) 206 { 207 int needed; 208 209 needed = fsb + btofsb(fs, fs->lfs_sumsize) + 210 ((howmany(fs->lfs_uinodes + 1, INOPB(fs)) + fs->lfs_segtabsz + 211 1) << (fs->lfs_blktodb - fs->lfs_fsbtodb)); 212 213 if (needed >= fs->lfs_avail) { 214 #ifdef DEBUG 215 printf("lfs_fits: no fit: fsb = %d, uinodes = %d, " 216 "needed = %d, avail = %d\n", 217 fsb, fs->lfs_uinodes, needed, fs->lfs_avail); 218 #endif 219 return 0; 220 } 221 return 1; 222 } 223 224 int 225 lfs_availwait(struct lfs *fs, int fsb) 226 { 227 int error; 228 CLEANERINFO *cip; 229 struct buf *cbp; 230 231 while (!lfs_fits(fs, fsb)) { 232 /* 233 * Out of space, need cleaner to run. 234 * Update the cleaner info, then wake it up. 235 * Note the cleanerinfo block is on the ifile 236 * so it CANT_WAIT. 237 */ 238 LFS_CLEANERINFO(cip, fs, cbp); 239 LFS_SYNC_CLEANERINFO(cip, fs, cbp, 0); 240 241 printf("lfs_availwait: out of available space, " 242 "waiting on cleaner\n"); 243 244 wakeup(&lfs_allclean_wakeup); 245 wakeup(&fs->lfs_nextseg); 246 #ifdef DIAGNOSTIC 247 if (fs->lfs_seglock && fs->lfs_lockpid == curproc->p_pid) 248 panic("lfs_availwait: deadlock"); 249 #endif 250 error = tsleep(&fs->lfs_avail, PCATCH | PUSER, "cleaner", 0); 251 if (error) 252 return (error); 253 } 254 return 0; 255 } 256 257 int 258 lfs_bwrite_ext(struct buf *bp, int flags) 259 { 260 struct lfs *fs; 261 struct inode *ip; 262 int fsb, error, s; 263 264 /* 265 * Don't write *any* blocks if we're mounted read-only. 266 * In particular the cleaner can't write blocks either. 267 */ 268 if (VTOI(bp->b_vp)->i_lfs->lfs_ronly) { 269 bp->b_flags &= ~(B_DELWRI | B_READ | B_ERROR); 270 LFS_UNLOCK_BUF(bp); 271 if (bp->b_flags & B_CALL) 272 bp->b_flags &= ~B_BUSY; 273 else 274 brelse(bp); 275 return EROFS; 276 } 277 278 /* 279 * Set the delayed write flag and use reassignbuf to move the buffer 280 * from the clean list to the dirty one. 281 * 282 * Set the B_LOCKED flag and unlock the buffer, causing brelse to move 283 * the buffer onto the LOCKED free list. This is necessary, otherwise 284 * getnewbuf() would try to reclaim the buffers using bawrite, which 285 * isn't going to work. 286 * 287 * XXX we don't let meta-data writes run out of space because they can 288 * come from the segment writer. We need to make sure that there is 289 * enough space reserved so that there's room to write meta-data 290 * blocks. 291 */ 292 if (!(bp->b_flags & B_LOCKED)) { 293 fs = VFSTOUFS(bp->b_vp->v_mount)->um_lfs; 294 fsb = fragstofsb(fs, numfrags(fs, bp->b_bcount)); 295 if (!CANT_WAIT(bp, flags)) { 296 if ((error = lfs_availwait(fs, fsb)) != 0) { 297 brelse(bp); 298 return error; 299 } 300 } 301 302 ip = VTOI(bp->b_vp); 303 if (bp->b_flags & B_CALL) { 304 LFS_SET_UINO(ip, IN_CLEANING); 305 } else { 306 LFS_SET_UINO(ip, IN_MODIFIED); 307 if (bp->b_lblkno >= 0) 308 LFS_SET_UINO(ip, IN_UPDATE); 309 } 310 fs->lfs_avail -= fsb; 311 bp->b_flags |= B_DELWRI; 312 313 LFS_LOCK_BUF(bp); 314 bp->b_flags &= ~(B_READ | B_DONE | B_ERROR); 315 s = splbio(); 316 reassignbuf(bp, bp->b_vp); 317 splx(s); 318 } 319 320 if (bp->b_flags & B_CALL) 321 bp->b_flags &= ~B_BUSY; 322 else 323 brelse(bp); 324 325 return (0); 326 } 327 328 void 329 lfs_flush_fs(struct lfs *fs, int flags) 330 { 331 if (fs->lfs_ronly == 0 && fs->lfs_dirops == 0) 332 { 333 /* disallow dirops during flush */ 334 fs->lfs_writer++; 335 336 /* 337 * We set the queue to 0 here because we 338 * are about to write all the dirty 339 * buffers we have. If more come in 340 * while we're writing the segment, they 341 * may not get written, so we want the 342 * count to reflect these new writes 343 * after the segwrite completes. 344 */ 345 if (lfs_dostats) 346 ++lfs_stats.flush_invoked; 347 lfs_segwrite(fs->lfs_ivnode->v_mount, flags); 348 349 /* XXX KS - allow dirops again */ 350 if (--fs->lfs_writer == 0) 351 wakeup(&fs->lfs_dirops); 352 } 353 } 354 355 /* 356 * XXX 357 * This routine flushes buffers out of the B_LOCKED queue when LFS has too 358 * many locked down. Eventually the pageout daemon will simply call LFS 359 * when pages need to be reclaimed. Note, we have one static count of locked 360 * buffers, so we can't have more than a single file system. To make this 361 * work for multiple file systems, put the count into the mount structure. 362 */ 363 void 364 lfs_flush(struct lfs *fs, int flags) 365 { 366 struct mount *mp, *nmp; 367 368 if (lfs_dostats) 369 ++lfs_stats.write_exceeded; 370 if (lfs_writing && flags == 0) {/* XXX flags */ 371 #ifdef DEBUG_LFS 372 printf("lfs_flush: not flushing because another flush is active\n"); 373 #endif 374 return; 375 } 376 lfs_writing = 1; 377 378 simple_lock(&mountlist_slock); 379 for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { 380 if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock)) { 381 nmp = mp->mnt_list.cqe_next; 382 continue; 383 } 384 if (strncmp(&mp->mnt_stat.f_fstypename[0], MOUNT_LFS, MFSNAMELEN) == 0) 385 lfs_flush_fs(((struct ufsmount *)mp->mnt_data)->ufsmount_u.lfs, flags); 386 simple_lock(&mountlist_slock); 387 nmp = mp->mnt_list.cqe_next; 388 vfs_unbusy(mp); 389 } 390 simple_unlock(&mountlist_slock); 391 392 LFS_DEBUG_COUNTLOCKED("flush"); 393 394 lfs_writing = 0; 395 } 396 397 #define INOCOUNT(fs) howmany((fs)->lfs_uinodes, INOPB(fs)) 398 #define INOBYTES(fs) ((fs)->lfs_uinodes * DINODE_SIZE) 399 400 int 401 lfs_check(struct vnode *vp, ufs_daddr_t blkno, int flags) 402 { 403 int error; 404 struct lfs *fs; 405 struct inode *ip; 406 extern int lfs_dirvcount; 407 408 error = 0; 409 ip = VTOI(vp); 410 411 /* If out of buffers, wait on writer */ 412 /* XXX KS - if it's the Ifile, we're probably the cleaner! */ 413 if (ip->i_number == LFS_IFILE_INUM) 414 return 0; 415 /* If we're being called from inside a dirop, don't sleep */ 416 if (ip->i_flag & IN_ADIROP) 417 return 0; 418 419 fs = ip->i_lfs; 420 421 /* 422 * If we would flush below, but dirops are active, sleep. 423 * Note that a dirop cannot ever reach this code! 424 */ 425 while (fs->lfs_dirops > 0 && 426 (locked_queue_count + INOCOUNT(fs) > LFS_MAX_BUFS || 427 locked_queue_bytes + INOBYTES(fs) > LFS_MAX_BYTES || 428 lfs_dirvcount > LFS_MAXDIROP || fs->lfs_diropwait > 0)) 429 { 430 ++fs->lfs_diropwait; 431 tsleep(&fs->lfs_writer, PRIBIO+1, "bufdirop", 0); 432 --fs->lfs_diropwait; 433 } 434 435 if (locked_queue_count + INOCOUNT(fs) > LFS_MAX_BUFS || 436 locked_queue_bytes + INOBYTES(fs) > LFS_MAX_BYTES || 437 lfs_dirvcount > LFS_MAXDIROP || fs->lfs_diropwait > 0) 438 { 439 ++fs->lfs_writer; 440 lfs_flush(fs, flags); 441 if (--fs->lfs_writer == 0) 442 wakeup(&fs->lfs_dirops); 443 } 444 445 while (locked_queue_count + INOCOUNT(fs) > LFS_WAIT_BUFS 446 || locked_queue_bytes + INOBYTES(fs) > LFS_WAIT_BYTES) 447 { 448 if (lfs_dostats) 449 ++lfs_stats.wait_exceeded; 450 #ifdef DEBUG_LFS 451 printf("lfs_check: waiting: count=%d, bytes=%ld\n", 452 locked_queue_count, locked_queue_bytes); 453 #endif 454 error = tsleep(&locked_queue_count, PCATCH | PUSER, 455 "buffers", hz * LFS_BUFWAIT); 456 if (error != EWOULDBLOCK) 457 break; 458 /* 459 * lfs_flush might not flush all the buffers, if some of the 460 * inodes were locked or if most of them were Ifile blocks 461 * and we weren't asked to checkpoint. Try flushing again 462 * to keep us from blocking indefinitely. 463 */ 464 if (locked_queue_count + INOCOUNT(fs) > LFS_MAX_BUFS || 465 locked_queue_bytes + INOBYTES(fs) > LFS_MAX_BYTES) 466 { 467 ++fs->lfs_writer; 468 lfs_flush(fs, flags | SEGM_CKP); 469 if (--fs->lfs_writer == 0) 470 wakeup(&fs->lfs_dirops); 471 } 472 } 473 return (error); 474 } 475 476 /* 477 * Allocate a new buffer header. 478 */ 479 #ifdef MALLOCLOG 480 # define DOMALLOC(S, T, F) _malloc((S), (T), (F), file, line) 481 struct buf * 482 lfs_newbuf_malloclog(struct lfs *fs, struct vnode *vp, ufs_daddr_t daddr, size_t size, char *file, int line) 483 #else 484 # define DOMALLOC(S, T, F) malloc((S), (T), (F)) 485 struct buf * 486 lfs_newbuf(struct lfs *fs, struct vnode *vp, ufs_daddr_t daddr, size_t size) 487 #endif 488 { 489 struct buf *bp; 490 size_t nbytes; 491 int s; 492 493 nbytes = roundup(size, fsbtob(fs, 1)); 494 495 bp = DOMALLOC(sizeof(struct buf), M_SEGMENT, M_WAITOK); 496 bzero(bp, sizeof(struct buf)); 497 if (nbytes) { 498 bp->b_data = DOMALLOC(nbytes, M_SEGMENT, M_WAITOK); 499 bzero(bp->b_data, nbytes); 500 } 501 #ifdef DIAGNOSTIC 502 if (vp == NULL) 503 panic("vp is NULL in lfs_newbuf"); 504 if (bp == NULL) 505 panic("bp is NULL after malloc in lfs_newbuf"); 506 #endif 507 s = splbio(); 508 bgetvp(vp, bp); 509 splx(s); 510 511 bp->b_saveaddr = (caddr_t)fs; 512 bp->b_bufsize = size; 513 bp->b_bcount = size; 514 bp->b_lblkno = daddr; 515 bp->b_blkno = daddr; 516 bp->b_error = 0; 517 bp->b_resid = 0; 518 bp->b_iodone = lfs_callback; 519 bp->b_flags |= B_BUSY | B_CALL | B_NOCACHE; 520 521 return (bp); 522 } 523 524 #ifdef MALLOCLOG 525 # define DOFREE(A, T) _free((A), (T), file, line) 526 void 527 lfs_freebuf_malloclog(struct buf *bp, char *file, int line) 528 #else 529 # define DOFREE(A, T) free((A), (T)) 530 void 531 lfs_freebuf(struct buf *bp) 532 #endif 533 { 534 int s; 535 536 s = splbio(); 537 if (bp->b_vp) 538 brelvp(bp); 539 splx(s); 540 if (!(bp->b_flags & B_INVAL)) { /* B_INVAL indicates a "fake" buffer */ 541 DOFREE(bp->b_data, M_SEGMENT); 542 bp->b_data = NULL; 543 } 544 DOFREE(bp, M_SEGMENT); 545 } 546 547 /* 548 * Definitions for the buffer free lists. 549 */ 550 #define BQUEUES 4 /* number of free buffer queues */ 551 552 #define BQ_LOCKED 0 /* super-blocks &c */ 553 #define BQ_LRU 1 /* lru, useful buffers */ 554 #define BQ_AGE 2 /* rubbish */ 555 #define BQ_EMPTY 3 /* buffer headers with no memory */ 556 557 extern TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES]; 558 559 /* 560 * Return a count of buffers on the "locked" queue. 561 * Don't count malloced buffers, since they don't detract from the total. 562 */ 563 void 564 lfs_countlocked(int *count, long *bytes, char *msg) 565 { 566 struct buf *bp; 567 int n = 0; 568 long int size = 0L; 569 570 for (bp = bufqueues[BQ_LOCKED].tqh_first; bp; 571 bp = bp->b_freelist.tqe_next) { 572 if (bp->b_flags & B_CALL) /* Malloced buffer */ 573 continue; 574 n++; 575 size += bp->b_bufsize; 576 #ifdef DEBUG_LOCKED_LIST 577 if (n > nbuf) 578 panic("lfs_countlocked: this can't happen: more" 579 " buffers locked than exist"); 580 #endif 581 } 582 #ifdef DEBUG_LOCKED_LIST 583 /* Theoretically this function never really does anything */ 584 if (n != *count) 585 printf("lfs_countlocked: %s: adjusted buf count from %d to %d\n", 586 msg, *count, n); 587 if (size != *bytes) 588 printf("lfs_countlocked: %s: adjusted byte count from %ld to %ld\n", 589 msg, *bytes, size); 590 #endif 591 *count = n; 592 *bytes = size; 593 return; 594 } 595