1 /* 2 * Copyright (c) 1991, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * %sccs.include.redist.c% 6 * 7 * @(#)lfs_segment.c 8.8 (Berkeley) 05/08/95 8 */ 9 10 #include <sys/param.h> 11 #include <sys/systm.h> 12 #include <sys/namei.h> 13 #include <sys/kernel.h> 14 #include <sys/resourcevar.h> 15 #include <sys/file.h> 16 #include <sys/stat.h> 17 #include <sys/buf.h> 18 #include <sys/proc.h> 19 #include <sys/conf.h> 20 #include <sys/vnode.h> 21 #include <sys/malloc.h> 22 #include <sys/mount.h> 23 24 #include <miscfs/specfs/specdev.h> 25 #include <miscfs/fifofs/fifo.h> 26 27 #include <ufs/ufs/quota.h> 28 #include <ufs/ufs/inode.h> 29 #include <ufs/ufs/dir.h> 30 #include <ufs/ufs/ufsmount.h> 31 #include <ufs/ufs/ufs_extern.h> 32 33 #include <ufs/lfs/lfs.h> 34 #include <ufs/lfs/lfs_extern.h> 35 36 extern int count_lock_queue __P((void)); 37 38 #define MAX_ACTIVE 10 39 /* 40 * Determine if it's OK to start a partial in this segment, or if we need 41 * to go on to a new segment. 
42 */ 43 #define LFS_PARTIAL_FITS(fs) \ 44 ((fs)->lfs_dbpseg - ((fs)->lfs_offset - (fs)->lfs_curseg) > \ 45 1 << (fs)->lfs_fsbtodb) 46 47 void lfs_callback __P((struct buf *)); 48 void lfs_gather __P((struct lfs *, struct segment *, 49 struct vnode *, int (*) __P((struct lfs *, struct buf *)))); 50 int lfs_gatherblock __P((struct segment *, struct buf *, int *)); 51 void lfs_iset __P((struct inode *, ufs_daddr_t, time_t)); 52 int lfs_match_data __P((struct lfs *, struct buf *)); 53 int lfs_match_dindir __P((struct lfs *, struct buf *)); 54 int lfs_match_indir __P((struct lfs *, struct buf *)); 55 int lfs_match_tindir __P((struct lfs *, struct buf *)); 56 void lfs_newseg __P((struct lfs *)); 57 void lfs_shellsort __P((struct buf **, ufs_daddr_t *, register int)); 58 void lfs_supercallback __P((struct buf *)); 59 void lfs_updatemeta __P((struct segment *)); 60 int lfs_vref __P((struct vnode *)); 61 void lfs_vunref __P((struct vnode *)); 62 void lfs_writefile __P((struct lfs *, struct segment *, struct vnode *)); 63 int lfs_writeinode __P((struct lfs *, struct segment *, struct inode *)); 64 int lfs_writeseg __P((struct lfs *, struct segment *)); 65 void lfs_writesuper __P((struct lfs *)); 66 void lfs_writevnodes __P((struct lfs *fs, struct mount *mp, 67 struct segment *sp, int dirops)); 68 69 int lfs_allclean_wakeup; /* Cleaner wakeup address. */ 70 71 /* Statistics Counters */ 72 #define DOSTATS 73 struct lfs_stats lfs_stats; 74 75 /* op values to lfs_writevnodes */ 76 #define VN_REG 0 77 #define VN_DIROP 1 78 #define VN_EMPTY 2 79 80 /* 81 * Ifile and meta data blocks are not marked busy, so segment writes MUST be 82 * single threaded. Currently, there are two paths into lfs_segwrite, sync() 83 * and getnewbuf(). They both mark the file system busy. Lfs_vflush() 84 * explicitly marks the file system busy. So lfs_segwrite is safe. I think. 
 */

/*
 * Flush the dirty blocks and the inode of a single vnode through the
 * segment writer.  If too many segments are already active, fall back
 * to a full synchronous checkpoint of the whole file system instead.
 * Always returns 0 (errors from the partial-segment writes are not
 * propagated here).
 */
int
lfs_vflush(vp)
	struct vnode *vp;
{
	struct inode *ip;
	struct lfs *fs;
	struct segment *sp;

	fs = VFSTOUFS(vp->v_mount)->um_lfs;
	if (fs->lfs_nactive > MAX_ACTIVE)
		return(lfs_segwrite(vp->v_mount, SEGM_SYNC|SEGM_CKP));
	lfs_seglock(fs, SEGM_SYNC);
	sp = fs->lfs_sp;


	ip = VTOI(vp);
	/*
	 * If this vnode has no dirty data blocks, sweep the mount's
	 * vnode list for inodes that are dirty but have empty block
	 * lists (the VN_EMPTY pass).
	 */
	if (vp->v_dirtyblkhd.lh_first == NULL)
		lfs_writevnodes(fs, vp->v_mount, sp, VN_EMPTY);

	/*
	 * Keep writing until the inode is stable; for the ifile itself,
	 * also repeat while lfs_writeseg reports the ifile must be redone.
	 */
	do {
		do {
			if (vp->v_dirtyblkhd.lh_first != NULL)
				lfs_writefile(fs, sp, vp);
		} while (lfs_writeinode(fs, sp, ip));

	} while (lfs_writeseg(fs, sp) && ip->i_number == LFS_IFILE_INUM);

#ifdef DOSTATS
	++lfs_stats.nwrites;
	if (sp->seg_flags & SEGM_SYNC)
		++lfs_stats.nsync_writes;
	if (sp->seg_flags & SEGM_CKP)
		++lfs_stats.ncheckpoints;
#endif
	lfs_segunlock(fs);
	return (0);
}

/*
 * Walk every vnode on the given mount point and write the dirty data
 * and/or inode of each into the current segment.  The list is walked
 * from the tail backwards (see the BACK_VP hack below).  op selects
 * which vnodes are eligible: VN_REG (regular pass), VN_DIROP (directory
 * operations -- currently disabled), or VN_EMPTY (only vnodes with no
 * dirty blocks).
 */
void
lfs_writevnodes(fs, mp, sp, op)
	struct lfs *fs;
	struct mount *mp;
	struct segment *sp;
	int op;
{
	struct inode *ip;
	struct vnode *vp;

/* BEGIN HACK */
/*
 * NOTE(review): these macros do arithmetic on (void *), a GNU C
 * extension, to walk the mnt_vnodelist LIST backwards through the
 * le_prev back-pointers -- verify on any compiler change.
 */
#define VN_OFFSET (((void *)&vp->v_mntvnodes.le_next) - (void *)vp)
#define BACK_VP(VP) ((struct vnode *)(((void *)VP->v_mntvnodes.le_prev) - VN_OFFSET))
#define BEG_OF_VLIST ((struct vnode *)(((void *)&mp->mnt_vnodelist.lh_first) - VN_OFFSET))

/* Find last vnode. */
loop:	for (vp = mp->mnt_vnodelist.lh_first;
	     vp && vp->v_mntvnodes.le_next != NULL;
	     vp = vp->v_mntvnodes.le_next);
	for (; vp && vp != BEG_OF_VLIST; vp = BACK_VP(vp)) {
/* END HACK */
/*
loop:
	for (vp = mp->mnt_vnodelist.lh_first;
	     vp != NULL;
	     vp = vp->v_mntvnodes.le_next) {
*/
		/*
		 * If the vnode that we are about to sync is no longer
		 * associated with this mount point, start over.
		 */
		if (vp->v_mount != mp)
			goto loop;

		/* XXX ignore dirops for now
		if (op == VN_DIROP && !(vp->v_flag & VDIROP) ||
		    op != VN_DIROP && (vp->v_flag & VDIROP))
			continue;
		*/

		if (op == VN_EMPTY && vp->v_dirtyblkhd.lh_first)
			continue;

		if (vp->v_type == VNON)
			continue;

		/* Skip vnodes that are locked for recycling. */
		if (lfs_vref(vp))
			continue;

		/*
		 * Write the inode/file if dirty and it's not the
		 * the IFILE.
		 */
		ip = VTOI(vp);
		if ((ip->i_flag &
		    (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE) ||
		    vp->v_dirtyblkhd.lh_first != NULL) &&
		    ip->i_number != LFS_IFILE_INUM) {
			if (vp->v_dirtyblkhd.lh_first != NULL)
				lfs_writefile(fs, sp, vp);
			(void) lfs_writeinode(fs, sp, ip);
		}
		vp->v_flag &= ~VDIROP;
		lfs_vunref(vp);
	}
}

/*
 * Write a (partial) segment for the given mount point.  flags may
 * include SEGM_CKP to force a checkpoint; a checkpoint is also forced
 * when too many segments are active.  Returns 0 on success or the
 * error from tsleep while waiting for the cleaner.
 */
int
lfs_segwrite(mp, flags)
	struct mount *mp;
	int flags;			/* Do a checkpoint. */
{
	struct buf *bp;
	struct inode *ip;
	struct lfs *fs;
	struct segment *sp;
	struct vnode *vp;
	SEGUSE *segusep;
	ufs_daddr_t ibno;
	CLEANERINFO *cip;
	int clean, do_ckp, error, i;

	fs = VFSTOUFS(mp)->um_lfs;

	/*
	 * If we have fewer than 2 clean segments, wait until cleaner
	 * writes.
	 */
	do {
		LFS_CLEANERINFO(cip, fs, bp);
		clean = cip->clean;
		brelse(bp);
		if (clean <= 2) {
			/* printf ("segs clean: %d\n", clean); */
			wakeup(&lfs_allclean_wakeup);
			if (error = tsleep(&fs->lfs_avail, PRIBIO + 1,
			    "lfs writer", 0))
				return (error);
		}
	} while (clean <= 2 );

	/*
	 * Allocate a segment structure and enough space to hold pointers to
	 * the maximum possible number of buffers which can be described in a
	 * single summary block.
	 */
	do_ckp = flags & SEGM_CKP || fs->lfs_nactive > MAX_ACTIVE;
	lfs_seglock(fs, flags | (do_ckp ? SEGM_CKP : 0));
	sp = fs->lfs_sp;

	lfs_writevnodes(fs, mp, sp, VN_REG);

	/* XXX ignore ordering of dirops for now */
	/* XXX
	fs->lfs_writer = 1;
	if (fs->lfs_dirops && (error =
	    tsleep(&fs->lfs_writer, PRIBIO + 1, "lfs writer", 0))) {
		free(sp->bpp, M_SEGMENT);
		free(sp, M_SEGMENT);
		fs->lfs_writer = 0;
		return (error);
	}

	lfs_writevnodes(fs, mp, sp, VN_DIROP);
	*/

	/*
	 * If we are doing a checkpoint, mark everything since the
	 * last checkpoint as no longer ACTIVE.  This walks every
	 * segment-table block of the ifile.
	 */
	if (do_ckp)
		for (ibno = fs->lfs_cleansz + fs->lfs_segtabsz;
		    --ibno >= fs->lfs_cleansz; ) {
			if (bread(fs->lfs_ivnode, ibno, fs->lfs_bsize,
			    NOCRED, &bp))

				panic("lfs: ifile read");
			segusep = (SEGUSE *)bp->b_data;
			for (i = fs->lfs_sepb; i--; segusep++)
				segusep->su_flags &= ~SEGUSE_ACTIVE;

			error = VOP_BWRITE(bp);
		}

	if (do_ckp || fs->lfs_doifile) {
redo:
		vp = fs->lfs_ivnode;
		/* Spin until the ifile vnode lock is ours. */
		while (vget(vp, 1));
		ip = VTOI(vp);
		if (vp->v_dirtyblkhd.lh_first != NULL)
			lfs_writefile(fs, sp, vp);
		(void)lfs_writeinode(fs, sp, ip);
		vput(vp);
		/* On a checkpoint the ifile must end up fully on disk. */
		if (lfs_writeseg(fs, sp) && do_ckp)
			goto redo;
	} else
		(void) lfs_writeseg(fs, sp);

	/*
	 * If the I/O count is non-zero, sleep until it reaches zero.  At the
	 * moment, the user's process hangs around so we can sleep.
	 */
	/* XXX ignore dirops for now
	fs->lfs_writer = 0;
	fs->lfs_doifile = 0;
	wakeup(&fs->lfs_dirops);
	*/

#ifdef DOSTATS
	++lfs_stats.nwrites;
	if (sp->seg_flags & SEGM_SYNC)
		++lfs_stats.nsync_writes;
	if (sp->seg_flags & SEGM_CKP)
		++lfs_stats.ncheckpoints;
#endif
	lfs_segunlock(fs);
	return (0);
}

/*
 * Write the dirty blocks associated with a vnode.
 */
void
lfs_writefile(fs, sp, vp)
	struct lfs *fs;
	struct segment *sp;
	struct vnode *vp;
{
	struct buf *bp;
	struct finfo *fip;
	IFILE *ifp;

	/* Not enough room for even one block or a FINFO: start a new one. */
	if (sp->seg_bytes_left < fs->lfs_bsize ||
	    sp->sum_bytes_left < sizeof(struct finfo))
		(void) lfs_writeseg(fs, sp);

	/* Charge the summary for a FINFO header (first block slot included). */
	sp->sum_bytes_left -= sizeof(struct finfo) - sizeof(ufs_daddr_t);
	++((SEGSUM *)(sp->segsum))->ss_nfinfo;

	fip = sp->fip;
	fip->fi_nblocks = 0;
	fip->fi_ino = VTOI(vp)->i_number;
	/* Record the file's version number from its ifile entry. */
	LFS_IENTRY(ifp, fs, fip->fi_ino, bp);
	fip->fi_version = ifp->if_version;
	brelse(bp);

	/*
	 * It may not be necessary to write the meta-data blocks at this point,
	 * as the roll-forward recovery code should be able to reconstruct the
	 * list.
	 */
	lfs_gather(fs, sp, vp, lfs_match_data);
	lfs_gather(fs, sp, vp, lfs_match_indir);
	lfs_gather(fs, sp, vp, lfs_match_dindir);
#ifdef TRIPLE
	lfs_gather(fs, sp, vp, lfs_match_tindir);
#endif

	fip = sp->fip;
	if (fip->fi_nblocks != 0) {
		/* Advance past this FINFO and its block list. */
		sp->fip =
		    (struct finfo *)((caddr_t)fip + sizeof(struct finfo) +
		    sizeof(ufs_daddr_t) * (fip->fi_nblocks - 1));
		sp->start_lbp = &sp->fip->fi_blocks[0];
	} else {
		/* Nothing was gathered; undo the FINFO accounting above. */
		sp->sum_bytes_left += sizeof(struct finfo) - sizeof(ufs_daddr_t);
		--((SEGSUM *)(sp->segsum))->ss_nfinfo;
	}
}

/*
 * Copy a dirty inode into the segment's current inode block and update
 * its ifile entry (or the superblock's idaddr for the ifile itself).
 * Returns nonzero when the caller must rewrite the ifile because its
 * segment-usage block was updated after being gathered.
 */
int
lfs_writeinode(fs, sp, ip)
	struct lfs *fs;
	struct segment *sp;
	struct inode *ip;
{
	struct buf *bp, *ibp;
	IFILE *ifp;
	SEGUSE *sup;
	ufs_daddr_t daddr;
	ino_t ino;
	int error, i, ndx;
	int redo_ifile = 0;

	/* Nothing to do for a clean inode. */
	if (!(ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)))
		return(0);

	/* Allocate a new inode block if necessary. */
	if (sp->ibp == NULL) {
		/* Allocate a new segment if necessary. */
		if (sp->seg_bytes_left < fs->lfs_bsize ||
		    sp->sum_bytes_left < sizeof(ufs_daddr_t))
			(void) lfs_writeseg(fs, sp);

		/* Get next inode block. */
		daddr = fs->lfs_offset;
		fs->lfs_offset += fsbtodb(fs, 1);
		sp->ibp = *sp->cbpp++ =
		    lfs_newbuf(VTOI(fs->lfs_ivnode)->i_devvp, daddr,
		    fs->lfs_bsize);
		/* Zero out inode numbers */
		for (i = 0; i < INOPB(fs); ++i)
			((struct dinode *)sp->ibp->b_data)[i].di_inumber = 0;
		++sp->start_bpp;
		fs->lfs_avail -= fsbtodb(fs, 1);
		/* Set remaining space counters. */
		sp->seg_bytes_left -= fs->lfs_bsize;
		sp->sum_bytes_left -= sizeof(ufs_daddr_t);
		/*
		 * Inode block addresses are stored from the END of the
		 * summary block growing downward.
		 */
		ndx = LFS_SUMMARY_SIZE / sizeof(ufs_daddr_t) -
		    sp->ninodes / INOPB(fs) - 1;
		((ufs_daddr_t *)(sp->segsum))[ndx] = daddr;
	}

	/* Update the inode times and copy the inode onto the inode page. */
	if (ip->i_flag & IN_MODIFIED)
		--fs->lfs_uinodes;
	ITIMES(ip, &time, &time);
	ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE);
	bp = sp->ibp;
	((struct dinode *)bp->b_data)[sp->ninodes % INOPB(fs)] = ip->i_din;
	/* Increment inode count in segment summary block. */
	++((SEGSUM *)(sp->segsum))->ss_ninos;

	/* If this page is full, set flag to allocate a new page. */
	if (++sp->ninodes % INOPB(fs) == 0)
		sp->ibp = NULL;

	/*
	 * If updating the ifile, update the super-block.  Update the disk
	 * address and access times for this inode in the ifile.
	 */
	ino = ip->i_number;
	if (ino == LFS_IFILE_INUM) {
		daddr = fs->lfs_idaddr;
		fs->lfs_idaddr = bp->b_blkno;
	} else {
		LFS_IENTRY(ifp, fs, ino, ibp);
		daddr = ifp->if_daddr;
		ifp->if_daddr = bp->b_blkno;
		error = VOP_BWRITE(ibp);
	}

	/*
	 * No need to update segment usage if there was no former inode address
	 * or if the last inode address is in the current partial segment.
	 */
	if (daddr != LFS_UNUSED_DADDR &&
	    !(daddr >= fs->lfs_lastpseg && daddr <= bp->b_blkno)) {
		LFS_SEGENTRY(sup, fs, datosn(fs, daddr), bp);
#ifdef DIAGNOSTIC
		if (sup->su_nbytes < sizeof(struct dinode)) {
			/* XXX -- Change to a panic. */
			printf("lfs: negative bytes (segment %d)\n",
			    datosn(fs, daddr));
			panic("negative bytes");
		}
#endif
		/* The old copy of the inode no longer occupies its segment. */
		sup->su_nbytes -= sizeof(struct dinode);
		redo_ifile =
		    (ino == LFS_IFILE_INUM && !(bp->b_flags & B_GATHERED));
		error = VOP_BWRITE(bp);
	}
	return (redo_ifile);
}

/*
 * Add one buffer to the current partial segment.  If the segment is
 * full, write it out first (dropping the saved spl via *sptr while
 * doing I/O) and return 1 so the caller restarts its scan; otherwise
 * record the buffer in the FINFO and return 0.
 */
int
lfs_gatherblock(sp, bp, sptr)
	struct segment *sp;
	struct buf *bp;
	int *sptr;
{
	struct lfs *fs;
	int version;

	/*
	 * If full, finish this segment.  We may be doing I/O, so
	 * release and reacquire the splbio().
	 */
#ifdef DIAGNOSTIC
	if (sp->vp == NULL)
		panic ("lfs_gatherblock: Null vp in segment");
#endif
	fs = sp->fs;
	if (sp->sum_bytes_left < sizeof(ufs_daddr_t) ||
	    sp->seg_bytes_left < bp->b_bcount) {
		if (sptr)
			splx(*sptr);
		lfs_updatemeta(sp);

		version = sp->fip->fi_version;
		(void) lfs_writeseg(fs, sp);

		/* Re-open a FINFO for the same file in the new segment. */
		sp->fip->fi_version = version;
		sp->fip->fi_ino = VTOI(sp->vp)->i_number;
		/* Add the current file to the segment summary. */
		++((SEGSUM *)(sp->segsum))->ss_nfinfo;
		sp->sum_bytes_left -=
		    sizeof(struct finfo) - sizeof(ufs_daddr_t);

		if (sptr)
			*sptr = splbio();
		return(1);
	}

	/* Insert into the buffer list, update the FINFO block. */
	bp->b_flags |= B_GATHERED;
	*sp->cbpp++ = bp;
	sp->fip->fi_blocks[sp->fip->fi_nblocks++] = bp->b_lblkno;

	sp->sum_bytes_left -= sizeof(ufs_daddr_t);
	sp->seg_bytes_left -= bp->b_bcount;
	return(0);
}

/*
 * Gather all of vp's dirty buffers accepted by the match function into
 * the current partial segment, at splbio.
 */
void
lfs_gather(fs, sp, vp, match)
	struct lfs *fs;
	struct segment *sp;
	struct vnode *vp;
	int (*match) __P((struct lfs *, struct buf *));
{
	struct buf *bp;
	int s;

	sp->vp = vp;
	s = splbio();
/* This is a hack to see if ordering the blocks in LFS makes a difference.
 */
/* BEGIN HACK */
/*
 * NOTE(review): like the vnode-list hack in lfs_writevnodes, these
 * macros walk the dirty-buffer LIST backwards through le_prev using
 * (void *) arithmetic (a GNU C extension) -- verify on compiler change.
 */
#define BUF_OFFSET (((void *)&bp->b_vnbufs.le_next) - (void *)bp)
#define BACK_BUF(BP) ((struct buf *)(((void *)BP->b_vnbufs.le_prev) - BUF_OFFSET))
#define BEG_OF_LIST ((struct buf *)(((void *)&vp->v_dirtyblkhd.lh_first) - BUF_OFFSET))


/*loop:	for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = bp->b_vnbufs.le_next) {*/
/* Find last buffer. */
loop:	for (bp = vp->v_dirtyblkhd.lh_first; bp && bp->b_vnbufs.le_next != NULL;
	    bp = bp->b_vnbufs.le_next);
	for (; bp && bp != BEG_OF_LIST; bp = BACK_BUF(bp)) {
/* END HACK */
		if (bp->b_flags & B_BUSY || !match(fs, bp) ||
		    bp->b_flags & B_GATHERED)
			continue;
#ifdef DIAGNOSTIC
		if (!(bp->b_flags & B_DELWRI))
			panic("lfs_gather: bp not B_DELWRI");
		if (!(bp->b_flags & B_LOCKED))
			panic("lfs_gather: bp not B_LOCKED");
#endif
		/* The list may have changed during the segment write: rescan. */
		if (lfs_gatherblock(sp, bp, &s))
			goto loop;
	}
	splx(s);
	lfs_updatemeta(sp);
	sp->vp = NULL;
}


/*
 * Update the metadata that points to the blocks listed in the FINFO
 * array.  Assigns the on-disk addresses for every gathered block of
 * the current file and charges/credits the segment usage table.
 */
void
lfs_updatemeta(sp)
	struct segment *sp;
{
	SEGUSE *sup;
	struct buf *bp;
	struct lfs *fs;
	struct vnode *vp;
	struct indir a[NIADDR + 2], *ap;
	struct inode *ip;
	ufs_daddr_t daddr, lbn, off;
	int error, i, nblocks, num;

	vp = sp->vp;
	/* Number of blocks gathered since start_lbp was last advanced. */
	nblocks = &sp->fip->fi_blocks[sp->fip->fi_nblocks] - sp->start_lbp;
	if (nblocks < 0)
		panic("This is a bad thing\n");
	if (vp == NULL || nblocks == 0)
		return;

	/* Sort the blocks. */
	if (!(sp->seg_flags & SEGM_CLEAN))
		lfs_shellsort(sp->start_bpp, sp->start_lbp, nblocks);

	/*
	 * Record the length of the last block in case it's a fragment.
	 * If there are indirect blocks present, they sort last.  An
	 * indirect block will be lfs_bsize and its presence indicates
	 * that you cannot have fragments.
	 */
	sp->fip->fi_lastlength = sp->start_bpp[nblocks - 1]->b_bcount;

	/*
	 * Assign disk addresses, and update references to the logical
	 * block and the segment usage information.
	 */
	fs = sp->fs;
	for (i = nblocks; i--; ++sp->start_bpp) {
		lbn = *sp->start_lbp++;
		(*sp->start_bpp)->b_blkno = off = fs->lfs_offset;
		fs->lfs_offset +=
		    fragstodb(fs, numfrags(fs, (*sp->start_bpp)->b_bcount));

		/* daddr receives the block's PREVIOUS disk address. */
		if (error = ufs_bmaparray(vp, lbn, &daddr, a, &num, NULL))
			panic("lfs_updatemeta: ufs_bmaparray %d", error);
		ip = VTOI(vp);
		switch (num) {
		case 0:
			/* Direct block: pointer lives in the inode. */
			ip->i_db[lbn] = off;
			break;
		case 1:
			/* Single indirect: pointer lives in i_ib. */
			ip->i_ib[a[0].in_off] = off;
			break;
		default:
			/* Deeper: pointer lives in an indirect block. */
			ap = &a[num - 1];
			if (bread(vp, ap->in_lbn, fs->lfs_bsize, NOCRED, &bp))
				panic("lfs_updatemeta: bread bno %d",
				    ap->in_lbn);
			/*
			 * Bread may create a new indirect block which needs
			 * to get counted for the inode.
			 */
			if (bp->b_blkno == -1 && !(bp->b_flags & B_CACHE)) {
				ip->i_blocks += fsbtodb(fs, 1);
				fs->lfs_bfree -= fragstodb(fs, fs->lfs_frag);
			}
			((ufs_daddr_t *)bp->b_data)[ap->in_off] = off;
			VOP_BWRITE(bp);
		}

		/* Update segment usage information. */
		if (daddr != UNASSIGNED &&
		    !(daddr >= fs->lfs_lastpseg && daddr <= off)) {
			LFS_SEGENTRY(sup, fs, datosn(fs, daddr), bp);
#ifdef DIAGNOSTIC
			if (sup->su_nbytes < (*sp->start_bpp)->b_bcount) {
				/* XXX -- Change to a panic. */
				printf("lfs: negative bytes (segment %d)\n",
				    datosn(fs, daddr));
				panic ("Negative Bytes");
			}
#endif
			sup->su_nbytes -= (*sp->start_bpp)->b_bcount;
			error = VOP_BWRITE(bp);
		}
	}
}

/*
 * Start a new segment.  Returns nonzero (repeat) when a brand-new
 * segment was opened, zero when a new partial segment was started
 * within the current segment.
 */
int
lfs_initseg(fs)
	struct lfs *fs;
{
	struct segment *sp;
	SEGUSE *sup;
	SEGSUM *ssp;
	struct buf *bp;
	int repeat;

	sp = fs->lfs_sp;

	repeat = 0;
	/* Advance to the next segment. */
	if (!LFS_PARTIAL_FITS(fs)) {
		/* Wake up any cleaning procs waiting on this file system. */
		wakeup(&lfs_allclean_wakeup);

		lfs_newseg(fs);
		repeat = 1;
		fs->lfs_offset = fs->lfs_curseg;
		sp->seg_number = datosn(fs, fs->lfs_curseg);
		sp->seg_bytes_left = fs->lfs_dbpseg * DEV_BSIZE;

		/*
		 * If the segment contains a superblock, update the offset
		 * and summary address to skip over it.
		 */
		LFS_SEGENTRY(sup, fs, sp->seg_number, bp);
		if (sup->su_flags & SEGUSE_SUPERBLOCK) {
			fs->lfs_offset += LFS_SBPAD / DEV_BSIZE;
			sp->seg_bytes_left -= LFS_SBPAD;
		}
		brelse(bp);
	} else {
		sp->seg_number = datosn(fs, fs->lfs_curseg);
		sp->seg_bytes_left = (fs->lfs_dbpseg -
		    (fs->lfs_offset - fs->lfs_curseg)) * DEV_BSIZE;
	}
	fs->lfs_lastpseg = fs->lfs_offset;

	sp->fs = fs;
	sp->ibp = NULL;
	sp->ninodes = 0;

	/* Get a new buffer for SEGSUM and enter it into the buffer list. */
	sp->cbpp = sp->bpp;
	*sp->cbpp = lfs_newbuf(VTOI(fs->lfs_ivnode)->i_devvp, fs->lfs_offset,
	    LFS_SUMMARY_SIZE);
	sp->segsum = (*sp->cbpp)->b_data;
	bzero(sp->segsum, LFS_SUMMARY_SIZE);
	sp->start_bpp = ++sp->cbpp;
	fs->lfs_offset += LFS_SUMMARY_SIZE / DEV_BSIZE;

	/* Set point to SEGSUM, initialize it. */
	ssp = sp->segsum;
	ssp->ss_next = fs->lfs_nextseg;
	ssp->ss_nfinfo = ssp->ss_ninos = 0;
	ssp->ss_magic = SS_MAGIC;

	/* Set pointer to first FINFO, initialize it. */
	sp->fip = (struct finfo *)((caddr_t)sp->segsum + sizeof(SEGSUM));
	sp->fip->fi_nblocks = 0;
	sp->start_lbp = &sp->fip->fi_blocks[0];
	sp->fip->fi_lastlength = 0;

	sp->seg_bytes_left -= LFS_SUMMARY_SIZE;
	sp->sum_bytes_left = LFS_SUMMARY_SIZE - sizeof(SEGSUM);

	return(repeat);
}

/*
 * Return the next segment to write.
 */
void
lfs_newseg(fs)
	struct lfs *fs;
{
	CLEANERINFO *cip;
	SEGUSE *sup;
	struct buf *bp;
	int curseg, isdirty, sn;

	/* Mark the segment we are moving into as dirty and active. */
	LFS_SEGENTRY(sup, fs, datosn(fs, fs->lfs_nextseg), bp);
	sup->su_flags |= SEGUSE_DIRTY | SEGUSE_ACTIVE;
	sup->su_nbytes = 0;
	sup->su_nsums = 0;
	sup->su_ninos = 0;
	(void) VOP_BWRITE(bp);

	/* One fewer clean segment in the cleaner's bookkeeping. */
	LFS_CLEANERINFO(cip, fs, bp);
	--cip->clean;
	++cip->dirty;
	(void) VOP_BWRITE(bp);

	fs->lfs_lastseg = fs->lfs_curseg;
	fs->lfs_curseg = fs->lfs_nextseg;
	/* Scan forward (with wraparound) for the next clean segment. */
	for (sn = curseg = datosn(fs, fs->lfs_curseg);;) {
		sn = (sn + 1) % fs->lfs_nseg;
		if (sn == curseg)
			panic("lfs_nextseg: no clean segments");
		LFS_SEGENTRY(sup, fs, sn, bp);
		isdirty = sup->su_flags & SEGUSE_DIRTY;
		brelse(bp);
		if (!isdirty)
			break;
	}

	++fs->lfs_nactive;
	fs->lfs_nextseg = sntoda(fs, sn);
#ifdef DOSTATS
	++lfs_stats.segsused;
#endif
}

/*
 * Write the current partial segment (summary, inode blocks and gathered
 * data blocks) to disk, coalescing buffers into MAXPHYS-sized chunks.
 * Returns nonzero when the caller must write again (a new segment was
 * started, or the segment-usage block was dirtied after gathering).
 */
int
lfs_writeseg(fs, sp)
	struct lfs *fs;
	struct segment *sp;
{
	extern int locked_queue_count;
	struct buf **bpp, *bp, *cbp;
	SEGUSE *sup;
	SEGSUM *ssp;
	dev_t i_dev;
	u_long *datap, *dp;
	int do_again, i, nblocks, s;
	int (*strategy)__P((struct vop_strategy_args *));
	struct vop_strategy_args vop_strategy_a;
	u_short ninos;
	char *p;
	long *lp;		/* NOTE(review): appears unused here */

	/*
	 * If there are no buffers other than the segment summary to write
	 * and it is not a checkpoint, don't do anything.  On a checkpoint,
	 * even if there aren't any buffers, you need to write the superblock.
	 */
	if ((nblocks = sp->cbpp - sp->bpp) == 1)
		return (0);

	/* Update the segment usage information. */
	LFS_SEGENTRY(sup, fs, sp->seg_number, bp);

	/* Loop through all blocks, except the segment summary. */
	for (bpp = sp->bpp; ++bpp < sp->cbpp; )
		sup->su_nbytes += (*bpp)->b_bcount;

	ssp = (SEGSUM *)sp->segsum;

	/* ninos = number of inode BLOCKS written, rounded up. */
	ninos = (ssp->ss_ninos + INOPB(fs) - 1) / INOPB(fs);
	sup->su_nbytes += ssp->ss_ninos * sizeof(struct dinode);
	sup->su_nbytes += LFS_SUMMARY_SIZE;
	sup->su_lastmod = time.tv_sec;
	sup->su_ninos += ninos;
	++sup->su_nsums;
	do_again = !(bp->b_flags & B_GATHERED);
	(void)VOP_BWRITE(bp);
	/*
	 * Compute checksum across data and then across summary; the first
	 * block (the summary block) is skipped.  Set the create time here
	 * so that it's guaranteed to be later than the inode mod times.
	 *
	 * XXX
	 * Fix this to do it inline, instead of malloc/copy.
	 */
	datap = dp = malloc(nblocks * sizeof(u_long), M_SEGMENT, M_WAITOK);
	for (bpp = sp->bpp, i = nblocks - 1; i--;) {
		if ((*++bpp)->b_flags & B_INVAL) {
			/* Cleaner "fake" buffer: data still in user space. */
			if (copyin((*bpp)->b_saveaddr, dp++, sizeof(u_long)))
				panic("lfs_writeseg: copyin failed");
		} else
			*dp++ = ((u_long *)(*bpp)->b_data)[0];
	}
	ssp->ss_create = time.tv_sec;
	ssp->ss_datasum = cksum(datap, (nblocks - 1) * sizeof(u_long));
	ssp->ss_sumsum =
	    cksum(&ssp->ss_datasum, LFS_SUMMARY_SIZE - sizeof(ssp->ss_sumsum));
	free(datap, M_SEGMENT);
#ifdef DIAGNOSTIC
	if (fs->lfs_bfree < fsbtodb(fs, ninos) + LFS_SUMMARY_SIZE / DEV_BSIZE)
		panic("lfs_writeseg: No diskspace for summary");
#endif
	fs->lfs_bfree -= (fsbtodb(fs, ninos) + LFS_SUMMARY_SIZE / DEV_BSIZE);

	i_dev = VTOI(fs->lfs_ivnode)->i_dev;
	strategy = VTOI(fs->lfs_ivnode)->i_devvp->v_op[VOFFSET(vop_strategy)];

	/*
	 * When we simply write the blocks we lose a rotation for every block
	 * written.  To avoid this problem, we allocate memory in chunks, copy
	 * the buffers into the chunk and write the chunk.  MAXPHYS is the
	 * largest size I/O devices can handle.
	 * When the data is copied to the chunk, turn off the the B_LOCKED bit
	 * and brelse the buffer (which will move them to the LRU list).  Add
	 * the B_CALL flag to the buffer header so we can count I/O's for the
	 * checkpoints and so we can release the allocated memory.
	 *
	 * XXX
	 * This should be removed if the new virtual memory system allows us to
	 * easily make the buffers contiguous in kernel memory and if that's
	 * fast enough.
	 */
	for (bpp = sp->bpp, i = nblocks; i;) {
		cbp = lfs_newbuf(VTOI(fs->lfs_ivnode)->i_devvp,
		    (*bpp)->b_blkno, MAXPHYS);
		cbp->b_dev = i_dev;
		cbp->b_flags |= B_ASYNC | B_BUSY;
		cbp->b_bcount = 0;

		s = splbio();
		++fs->lfs_iocount;
		for (p = cbp->b_data; i && cbp->b_bcount < MAXPHYS; i--) {
			bp = *bpp;
			/* This buffer would overflow the chunk: next chunk. */
			if (bp->b_bcount > (MAXPHYS - cbp->b_bcount))
				break;
			bpp++;

			/*
			 * Fake buffers from the cleaner are marked as B_INVAL.
			 * We need to copy the data from user space rather than
			 * from the buffer indicated.
			 * XXX == what do I do on an error?
			 */
			if (bp->b_flags & B_INVAL) {
				if (copyin(bp->b_saveaddr, p, bp->b_bcount))
					panic("lfs_writeseg: copyin failed");
			} else
				bcopy(bp->b_data, p, bp->b_bcount);

			p += bp->b_bcount;
			cbp->b_bcount += bp->b_bcount;
			if (bp->b_flags & B_LOCKED)
				--locked_queue_count;
			bp->b_flags &= ~(B_ERROR | B_READ | B_DELWRI |
			     B_LOCKED | B_GATHERED);
			if (bp->b_flags & B_CALL) {
				/* if B_CALL, it was created with newbuf */
				brelvp(bp);
				if (!(bp->b_flags & B_INVAL))
					free(bp->b_data, M_SEGMENT);
				free(bp, M_SEGMENT);
			} else {
				bremfree(bp);
				bp->b_flags |= B_DONE;
				reassignbuf(bp, bp->b_vp);
				brelse(bp);
			}
		}
		++cbp->b_vp->v_numoutput;
		splx(s);
		/*
		 * XXXX This is a gross and disgusting hack.  Since these
		 * buffers are physically addressed, they hang off the
		 * device vnode (devvp).  As a result, they have no way
		 * of getting to the LFS superblock or lfs structure to
		 * keep track of the number of I/O's pending.  So, I am
		 * going to stuff the fs into the saveaddr field of
		 * the buffer (yuk).
		 */
		cbp->b_saveaddr = (caddr_t)fs;
		vop_strategy_a.a_desc = VDESC(vop_strategy);
		vop_strategy_a.a_bp = cbp;
		(strategy)(&vop_strategy_a);
	}
	/*
	 * XXX
	 * Vinvalbuf can move locked buffers off the locked queue
	 * and we have no way of knowing about this.  So, after
	 * doing a big write, we recalculate how many bufers are
	 * really still left on the locked queue.
	 */
	locked_queue_count = count_lock_queue();
	wakeup(&locked_queue_count);
#ifdef DOSTATS
	++lfs_stats.psegwrites;
	lfs_stats.blocktot += nblocks - 1;
	if (fs->lfs_sp->seg_flags & SEGM_SYNC)
		++lfs_stats.psyncwrites;
	if (fs->lfs_sp->seg_flags & SEGM_CLEAN) {
		++lfs_stats.pcleanwrites;
		lfs_stats.cleanblocks += nblocks - 1;
	}
#endif
	return (lfs_initseg(fs) || do_again);
}

/*
 * Write the LFS superblock to disk asynchronously via the device's
 * strategy routine; completion is handled by lfs_supercallback.
 */
void
lfs_writesuper(fs)
	struct lfs *fs;
{
	struct buf *bp;
	dev_t i_dev;
	int (*strategy) __P((struct vop_strategy_args *));
	int s;
	struct vop_strategy_args vop_strategy_a;

	i_dev = VTOI(fs->lfs_ivnode)->i_dev;
	strategy = VTOI(fs->lfs_ivnode)->i_devvp->v_op[VOFFSET(vop_strategy)];

	/* Checksum the superblock and copy it into a buffer.
 */
	fs->lfs_cksum = cksum(fs, sizeof(struct lfs) - sizeof(fs->lfs_cksum));
	bp = lfs_newbuf(VTOI(fs->lfs_ivnode)->i_devvp, fs->lfs_sboffs[0],
	    LFS_SBPAD);
	*(struct lfs *)bp->b_data = *fs;

	/* XXX Toggle between first two superblocks; for now just write first */
	bp->b_dev = i_dev;
	bp->b_flags |= B_BUSY | B_CALL | B_ASYNC;
	bp->b_flags &= ~(B_DONE | B_ERROR | B_READ | B_DELWRI);
	bp->b_iodone = lfs_supercallback;
	vop_strategy_a.a_desc = VDESC(vop_strategy);
	vop_strategy_a.a_bp = bp;
	s = splbio();
	++bp->b_vp->v_numoutput;
	splx(s);
	(strategy)(&vop_strategy_a);
}

/*
 * Logical block number match routines used when traversing the dirty block
 * chain.  Data blocks have non-negative lbns; meta-data blocks are
 * negative, with the indirection level encoded modulo NINDIR(fs).
 */
int
lfs_match_data(fs, bp)
	struct lfs *fs;
	struct buf *bp;
{
	return (bp->b_lblkno >= 0);
}

int
lfs_match_indir(fs, bp)
	struct lfs *fs;
	struct buf *bp;
{
	int lbn;

	lbn = bp->b_lblkno;
	return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 0);
}

int
lfs_match_dindir(fs, bp)
	struct lfs *fs;
	struct buf *bp;
{
	int lbn;

	lbn = bp->b_lblkno;
	return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 1);
}

int
lfs_match_tindir(fs, bp)
	struct lfs *fs;
	struct buf *bp;
{
	int lbn;

	lbn = bp->b_lblkno;
	return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 2);
}

/*
 * Allocate a new buffer header.  The header and its data are malloc'd
 * from M_SEGMENT (not taken from the buffer cache) and are freed again
 * in lfs_callback/lfs_supercallback when the I/O completes.
 */
struct buf *
lfs_newbuf(vp, daddr, size)
	struct vnode *vp;
	ufs_daddr_t daddr;
	size_t size;
{
	struct buf *bp;
	size_t nbytes;

	nbytes = roundup(size, DEV_BSIZE);
	bp = malloc(sizeof(struct buf), M_SEGMENT, M_WAITOK);
	bzero(bp, sizeof(struct buf));
	if (nbytes)
		bp->b_data = malloc(nbytes, M_SEGMENT, M_WAITOK);
	bgetvp(vp, bp);
	bp->b_bufsize = size;
	bp->b_bcount = size;
	bp->b_lblkno = daddr;
	bp->b_blkno = daddr;
	bp->b_error = 0;
	bp->b_resid = 0;
	bp->b_iodone = lfs_callback;
	bp->b_flags |= B_BUSY | B_CALL | B_NOCACHE;
	return (bp);
}

/*
 * I/O completion callback for segment buffers: drop the pending-I/O
 * count stashed in b_saveaddr (see the hack in lfs_writeseg) and free
 * the malloc'd buffer.
 */
void
lfs_callback(bp)
	struct buf *bp;
{
	struct lfs *fs;

	fs = (struct lfs *)bp->b_saveaddr;
#ifdef DIAGNOSTIC
	if (fs->lfs_iocount == 0)
		panic("lfs_callback: zero iocount\n");
#endif
	if (--fs->lfs_iocount == 0)
		wakeup(&fs->lfs_iocount);

	brelvp(bp);
	free(bp->b_data, M_SEGMENT);
	free(bp, M_SEGMENT);
}

/* Completion callback for the superblock write: just free the buffer. */
void
lfs_supercallback(bp)
	struct buf *bp;
{
	brelvp(bp);
	free(bp->b_data, M_SEGMENT);
	free(bp, M_SEGMENT);
}

/*
 * Shellsort (diminishing increment sort) from Data Structures and
 * Algorithms, Aho, Hopcraft and Ullman, 1983 Edition, page 290;
 * see also Knuth Vol. 3, page 84.  The increments are selected from
 * formula (8), page 95.  Roughly O(N^3/2).
 */
/*
 * This is our own private copy of shellsort because we want to sort
 * two parallel arrays (the array of buffer pointers and the array of
 * logical block numbers) simultaneously.  Note that we cast the array
 * of logical block numbers to a unsigned in this routine so that the
 * negative block numbers (meta data blocks) sort AFTER the data blocks.
 *
 * NOTE(review): only the temporary lb_temp is u_long; the comparison
 * below is on signed ufs_daddr_t values, which would sort meta-data
 * blocks FIRST, contradicting the comment above -- verify intent.
 */
void
lfs_shellsort(bp_array, lb_array, nmemb)
	struct buf **bp_array;
	ufs_daddr_t *lb_array;
	register int nmemb;
{
	static int __rsshell_increments[] = { 4, 1, 0 };
	register int incr, *incrp, t1, t2;
	struct buf *bp_temp;
	u_long lb_temp;

	for (incrp = __rsshell_increments; incr = *incrp++;)
		for (t1 = incr; t1 < nmemb; ++t1)
			for (t2 = t1 - incr; t2 >= 0;)
				if (lb_array[t2] > lb_array[t2 + incr]) {
					lb_temp = lb_array[t2];
					lb_array[t2] = lb_array[t2 + incr];
					lb_array[t2 + incr] = lb_temp;
					bp_temp = bp_array[t2];
					bp_array[t2] = bp_array[t2 + incr];
					bp_array[t2 + incr] = bp_temp;
					t2 -= incr;
				} else
					break;
}

/*
 * Check VXLOCK.  Return 1 if the vnode is locked.  Otherwise, vget it.
 * (Implicit int return, K&R style.)
 */
lfs_vref(vp)
	register struct vnode *vp;
{

	if (vp->v_flag & VXLOCK)
		return(1);
	return (vget(vp, 0));
}

void
lfs_vunref(vp)
	register struct vnode *vp;
{
	extern int lfs_no_inactive;

	/*
	 * This is vrele except that we do not want to VOP_INACTIVE
	 * this vnode.  Rather than inline vrele here, we use a global
	 * flag to tell lfs_inactive not to run.  Yes, its gross.
	 */
	lfs_no_inactive = 1;
	vrele(vp);
	lfs_no_inactive = 0;
}