/*
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * %sccs.include.redist.c%
 *
 *	@(#)lfs_segment.c	8.9 (Berkeley) 05/14/95
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/kernel.h>
#include <sys/resourcevar.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/conf.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/mount.h>

#include <miscfs/specfs/specdev.h>
#include <miscfs/fifofs/fifo.h>

#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/dir.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>

#include <ufs/lfs/lfs.h>
#include <ufs/lfs/lfs_extern.h>

extern int count_lock_queue __P((void));

/*
 * Maximum number of active (written since the last checkpoint) segments
 * tolerated before lfs_vflush/lfs_segwrite force a checkpoint.
 */
#define MAX_ACTIVE	10
/*
 * Determine if it's OK to start a partial in this segment, or if we need
 * to go on to a new segment.
 */
#define	LFS_PARTIAL_FITS(fs) \
	((fs)->lfs_dbpseg - ((fs)->lfs_offset - (fs)->lfs_curseg) > \
	1 << (fs)->lfs_fsbtodb)

void	 lfs_callback __P((struct buf *));
void	 lfs_gather __P((struct lfs *, struct segment *,
	     struct vnode *, int (*) __P((struct lfs *, struct buf *))));
int	 lfs_gatherblock __P((struct segment *, struct buf *, int *));
void	 lfs_iset __P((struct inode *, ufs_daddr_t, time_t));
int	 lfs_match_data __P((struct lfs *, struct buf *));
int	 lfs_match_dindir __P((struct lfs *, struct buf *));
int	 lfs_match_indir __P((struct lfs *, struct buf *));
int	 lfs_match_tindir __P((struct lfs *, struct buf *));
void	 lfs_newseg __P((struct lfs *));
void	 lfs_shellsort __P((struct buf **, ufs_daddr_t *, register int));
void	 lfs_supercallback __P((struct buf *));
void	 lfs_updatemeta __P((struct segment *));
int	 lfs_vref __P((struct vnode *));
void	 lfs_vunref __P((struct vnode *));
void	 lfs_writefile __P((struct lfs *, struct segment *, struct vnode *));
int	 lfs_writeinode __P((struct lfs *, struct segment *, struct inode *));
int	 lfs_writeseg __P((struct lfs *, struct segment *));
void	 lfs_writesuper __P((struct lfs *));
void	 lfs_writevnodes __P((struct lfs *fs, struct mount *mp,
	    struct segment *sp, int dirops));

int	lfs_allclean_wakeup;		/* Cleaner wakeup address. */

/* Statistics Counters (gathered when DOSTATS is defined, as it is here). */
#define DOSTATS
struct lfs_stats lfs_stats;

/*
 * op values to lfs_writevnodes: which class of vnodes a pass should write.
 * VN_REG: regular vnodes; VN_DIROP: vnodes involved in directory operations
 * (currently disabled, see the XXX'd code below); VN_EMPTY: vnodes with no
 * dirty buffers (inode-only writes).
 */
#define	VN_REG	0
#define	VN_DIROP	1
#define	VN_EMPTY	2

/*
 * Ifile and meta data blocks are not marked busy, so segment writes MUST be
 * single threaded.  Currently, there are two paths into lfs_segwrite, sync()
 * and getnewbuf().  They both mark the file system busy.  Lfs_vflush()
 * explicitly marks the file system busy.  So lfs_segwrite is safe.  I think.
 */

/*
 * lfs_vflush --
 *	Flush a single vnode: write its dirty buffers and its inode into the
 *	current partial segment and push the segment to disk (SEGM_SYNC).
 *	If too many segments are already active, fall back to a full
 *	synchronous checkpoint of the whole file system instead.  Returns 0,
 *	or the error from the fallback lfs_segwrite.
 */
int
lfs_vflush(vp)
	struct vnode *vp;
{
	struct inode *ip;
	struct lfs *fs;
	struct segment *sp;

	fs = VFSTOUFS(vp->v_mount)->um_lfs;
	if (fs->lfs_nactive > MAX_ACTIVE)
		return(lfs_segwrite(vp->v_mount, SEGM_SYNC|SEGM_CKP));
	lfs_seglock(fs, SEGM_SYNC);
	sp = fs->lfs_sp;

	ip = VTOI(vp);
	/*
	 * No dirty buffers at all: do an inode-only pass so the inode still
	 * gets written.
	 */
	if (vp->v_dirtyblkhd.lh_first == NULL)
		lfs_writevnodes(fs, vp->v_mount, sp, VN_EMPTY);

	/*
	 * Keep writing until the inode no longer needs rewriting; for the
	 * ifile itself, also repeat whenever lfs_writeseg says the ifile
	 * must be redone.
	 */
	do {
		do {
			if (vp->v_dirtyblkhd.lh_first != NULL)
				lfs_writefile(fs, sp, vp);
		} while (lfs_writeinode(fs, sp, ip));

	} while (lfs_writeseg(fs, sp) && ip->i_number == LFS_IFILE_INUM);

#ifdef DOSTATS
	++lfs_stats.nwrites;
	if (sp->seg_flags & SEGM_SYNC)
		++lfs_stats.nsync_writes;
	if (sp->seg_flags & SEGM_CKP)
		++lfs_stats.ncheckpoints;
#endif
	lfs_segunlock(fs);
	return (0);
}

/*
 * lfs_writevnodes --
 *	Walk the mount point's vnode list and write every dirty vnode that
 *	matches `op' (see the VN_* values above) into the current segment.
 *	The HACK below iterates the list back-to-front by recovering a
 *	vnode pointer from the embedded list linkage with pointer arithmetic;
 *	the commented-out loop underneath it is the straightforward
 *	front-to-back version.
 */
void
lfs_writevnodes(fs, mp, sp, op)
	struct lfs *fs;
	struct mount *mp;
	struct segment *sp;
	int op;
{
	struct inode *ip;
	struct vnode *vp;

/* BEGIN HACK */
#define	VN_OFFSET	(((void *)&vp->v_mntvnodes.le_next) - (void *)vp)
#define	BACK_VP(VP)	((struct vnode *)(((void *)VP->v_mntvnodes.le_prev) - VN_OFFSET))
#define	BEG_OF_VLIST	((struct vnode *)(((void *)&mp->mnt_vnodelist.lh_first) - VN_OFFSET))

/* Find last vnode. */
loop:	for (vp = mp->mnt_vnodelist.lh_first;
	     vp && vp->v_mntvnodes.le_next != NULL;
	     vp = vp->v_mntvnodes.le_next);
	for (; vp && vp != BEG_OF_VLIST; vp = BACK_VP(vp)) {
/* END HACK */
/*
loop:
	for (vp = mp->mnt_vnodelist.lh_first;
	     vp != NULL;
	     vp = vp->v_mntvnodes.le_next) {
*/
		/*
		 * If the vnode that we are about to sync is no longer
		 * associated with this mount point, start over.
		 */
		if (vp->v_mount != mp)
			goto loop;

		/* XXX ignore dirops for now
		if (op == VN_DIROP && !(vp->v_flag & VDIROP) ||
		    op != VN_DIROP && (vp->v_flag & VDIROP))
			continue;
		*/

		/* VN_EMPTY pass only wants vnodes with no dirty buffers. */
		if (op == VN_EMPTY && vp->v_dirtyblkhd.lh_first)
			continue;

		if (vp->v_type == VNON)
			continue;

		/* Skip vnodes that are being reclaimed (VXLOCK). */
		if (lfs_vref(vp))
			continue;

		/*
		 * Write the inode/file if dirty and it's not the
		 * IFILE (the ifile is written separately by lfs_segwrite).
		 */
		ip = VTOI(vp);
		if ((ip->i_flag &
		    (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE) ||
		    vp->v_dirtyblkhd.lh_first != NULL) &&
		    ip->i_number != LFS_IFILE_INUM) {
			if (vp->v_dirtyblkhd.lh_first != NULL)
				lfs_writefile(fs, sp, vp);
			(void) lfs_writeinode(fs, sp, ip);
		}
		vp->v_flag &= ~VDIROP;
		lfs_vunref(vp);
	}
}

/*
 * lfs_segwrite --
 *	Write dirty data for the whole file system.  `flags' may request a
 *	synchronous write (SEGM_SYNC) and/or a checkpoint (SEGM_CKP); a
 *	checkpoint is also forced when too many segments are active.
 *	Returns 0 or an error from tsleep while waiting for the cleaner.
 */
int
lfs_segwrite(mp, flags)
	struct mount *mp;
	int flags;			/* Do a checkpoint. */
{
	struct proc *p = curproc;	/* XXX */
	struct buf *bp;
	struct inode *ip;
	struct lfs *fs;
	struct segment *sp;
	struct vnode *vp;
	SEGUSE *segusep;
	ufs_daddr_t ibno;
	CLEANERINFO *cip;
	int clean, do_ckp, error, i;

	fs = VFSTOUFS(mp)->um_lfs;

	/*
	 * If we have fewer than 2 clean segments, wait until cleaner
	 * writes.
	 */
	do {
		LFS_CLEANERINFO(cip, fs, bp);
		clean = cip->clean;
		brelse(bp);
		if (clean <= 2) {
			/* printf ("segs clean: %d\n", clean); */
			wakeup(&lfs_allclean_wakeup);
			if (error = tsleep(&fs->lfs_avail, PRIBIO + 1,
			    "lfs writer", 0))
				return (error);
		}
	} while (clean <= 2 );

	/*
	 * Allocate a segment structure and enough space to hold pointers to
	 * the maximum possible number of buffers which can be described in a
	 * single summary block.
	 */
	do_ckp = flags & SEGM_CKP || fs->lfs_nactive > MAX_ACTIVE;
	lfs_seglock(fs, flags | (do_ckp ? SEGM_CKP : 0));
	sp = fs->lfs_sp;

	lfs_writevnodes(fs, mp, sp, VN_REG);

	/* XXX ignore ordering of dirops for now */
	/* XXX
	fs->lfs_writer = 1;
	if (fs->lfs_dirops && (error =
	    tsleep(&fs->lfs_writer, PRIBIO + 1, "lfs writer", 0))) {
		free(sp->bpp, M_SEGMENT);
		free(sp, M_SEGMENT);
		fs->lfs_writer = 0;
		return (error);
	}

	lfs_writevnodes(fs, mp, sp, VN_DIROP);
	*/

	/*
	 * If we are doing a checkpoint, mark everything since the
	 * last checkpoint as no longer ACTIVE.  (Walks the segment-usage
	 * table blocks of the ifile, clearing SEGUSE_ACTIVE in each entry.)
	 */
	if (do_ckp)
		for (ibno = fs->lfs_cleansz + fs->lfs_segtabsz;
		    --ibno >= fs->lfs_cleansz; ) {
			if (bread(fs->lfs_ivnode, ibno, fs->lfs_bsize,
			    NOCRED, &bp))
				panic("lfs: ifile read");
			segusep = (SEGUSE *)bp->b_data;
			for (i = fs->lfs_sepb; i--; segusep++)
				segusep->su_flags &= ~SEGUSE_ACTIVE;

			error = VOP_BWRITE(bp);
		}

	if (do_ckp || fs->lfs_doifile) {
redo:
		vp = fs->lfs_ivnode;
		while (vget(vp, LK_EXCLUSIVE, p))
			continue;
		ip = VTOI(vp);
		if (vp->v_dirtyblkhd.lh_first != NULL)
			lfs_writefile(fs, sp, vp);
		(void)lfs_writeinode(fs, sp, ip);
		vput(vp);
		/*
		 * On a checkpoint the ifile must be fully on disk; redo the
		 * whole ifile write if lfs_writeseg says it moved.
		 */
		if (lfs_writeseg(fs, sp) && do_ckp)
			goto redo;
	} else
		(void) lfs_writeseg(fs, sp);

	/*
	 * If the I/O count is non-zero, sleep until it reaches zero.  At the
	 * moment, the user's process hangs around so we can sleep.
	 */
	/* XXX ignore dirops for now
	fs->lfs_writer = 0;
	fs->lfs_doifile = 0;
	wakeup(&fs->lfs_dirops);
	*/

#ifdef DOSTATS
	++lfs_stats.nwrites;
	if (sp->seg_flags & SEGM_SYNC)
		++lfs_stats.nsync_writes;
	if (sp->seg_flags & SEGM_CKP)
		++lfs_stats.ncheckpoints;
#endif
	lfs_segunlock(fs);
	return (0);
}

/*
 * Write the dirty blocks associated with a vnode.
 */
/*
 * lfs_writefile --
 *	Gather all of a vnode's dirty data and indirect blocks into the
 *	current partial segment, bracketed by a FINFO entry in the segment
 *	summary.  Starts a new segment first if there is not enough room
 *	for at least one block plus the FINFO.
 */
void
lfs_writefile(fs, sp, vp)
	struct lfs *fs;
	struct segment *sp;
	struct vnode *vp;
{
	struct buf *bp;
	struct finfo *fip;
	IFILE *ifp;

	if (sp->seg_bytes_left < fs->lfs_bsize ||
	    sp->sum_bytes_left < sizeof(struct finfo))
		(void) lfs_writeseg(fs, sp);

	/*
	 * Charge the summary for a FINFO header; struct finfo already
	 * contains the first fi_blocks[] slot, hence the sizeof adjustment.
	 */
	sp->sum_bytes_left -= sizeof(struct finfo) - sizeof(ufs_daddr_t);
	++((SEGSUM *)(sp->segsum))->ss_nfinfo;

	fip = sp->fip;
	fip->fi_nblocks = 0;
	fip->fi_ino = VTOI(vp)->i_number;
	LFS_IENTRY(ifp, fs, fip->fi_ino, bp);
	fip->fi_version = ifp->if_version;
	brelse(bp);

	/*
	 * It may not be necessary to write the meta-data blocks at this point,
	 * as the roll-forward recovery code should be able to reconstruct the
	 * list.
	 */
	lfs_gather(fs, sp, vp, lfs_match_data);
	lfs_gather(fs, sp, vp, lfs_match_indir);
	lfs_gather(fs, sp, vp, lfs_match_dindir);
#ifdef TRIPLE
	lfs_gather(fs, sp, vp, lfs_match_tindir);
#endif

	fip = sp->fip;
	if (fip->fi_nblocks != 0) {
		/* Advance sp->fip past this FINFO and its block list. */
		sp->fip =
		    (struct finfo *)((caddr_t)fip + sizeof(struct finfo) +
		    sizeof(ufs_daddr_t) * (fip->fi_nblocks - 1));
		sp->start_lbp = &sp->fip->fi_blocks[0];
	} else {
		/* Nothing was gathered; give the FINFO space back. */
		sp->sum_bytes_left += sizeof(struct finfo) - sizeof(ufs_daddr_t);
		--((SEGSUM *)(sp->segsum))->ss_nfinfo;
	}
}

/*
 * lfs_writeinode --
 *	Copy a dirty inode into the current inode block of the partial
 *	segment (allocating a new inode block, and possibly a new segment,
 *	when needed), update the ifile entry with the inode's new disk
 *	address, and charge the segment that held the previous copy.
 *	Returns non-zero if the ifile inode itself must be rewritten.
 */
int
lfs_writeinode(fs, sp, ip)
	struct lfs *fs;
	struct segment *sp;
	struct inode *ip;
{
	struct buf *bp, *ibp;
	IFILE *ifp;
	SEGUSE *sup;
	ufs_daddr_t daddr;
	ino_t ino;
	int error, i, ndx;
	int redo_ifile = 0;

	/* Nothing to do unless some inode-dirty flag is set. */
	if (!(ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)))
		return(0);

	/* Allocate a new inode block if necessary. */
	if (sp->ibp == NULL) {
		/* Allocate a new segment if necessary. */
		if (sp->seg_bytes_left < fs->lfs_bsize ||
		    sp->sum_bytes_left < sizeof(ufs_daddr_t))
			(void) lfs_writeseg(fs, sp);

		/* Get next inode block. */
		daddr = fs->lfs_offset;
		fs->lfs_offset += fsbtodb(fs, 1);
		sp->ibp = *sp->cbpp++ =
		    lfs_newbuf(VTOI(fs->lfs_ivnode)->i_devvp, daddr,
		    fs->lfs_bsize);
		/* Zero out inode numbers */
		for (i = 0; i < INOPB(fs); ++i)
			((struct dinode *)sp->ibp->b_data)[i].di_inumber = 0;
		++sp->start_bpp;
		fs->lfs_avail -= fsbtodb(fs, 1);
		/* Set remaining space counters. */
		sp->seg_bytes_left -= fs->lfs_bsize;
		sp->sum_bytes_left -= sizeof(ufs_daddr_t);
		/*
		 * Record the inode block's address in the table that grows
		 * down from the end of the summary block.
		 */
		ndx = LFS_SUMMARY_SIZE / sizeof(ufs_daddr_t) -
		    sp->ninodes / INOPB(fs) - 1;
		((ufs_daddr_t *)(sp->segsum))[ndx] = daddr;
	}

	/* Update the inode times and copy the inode onto the inode page. */
	if (ip->i_flag & IN_MODIFIED)
		--fs->lfs_uinodes;
	ITIMES(ip, &time, &time);
	ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE);
	bp = sp->ibp;
	((struct dinode *)bp->b_data)[sp->ninodes % INOPB(fs)] = ip->i_din;
	/* Increment inode count in segment summary block. */
	++((SEGSUM *)(sp->segsum))->ss_ninos;

	/* If this page is full, set flag to allocate a new page. */
	if (++sp->ninodes % INOPB(fs) == 0)
		sp->ibp = NULL;

	/*
	 * If updating the ifile, update the super-block.  Update the disk
	 * address and access times for this inode in the ifile.
	 */
	ino = ip->i_number;
	if (ino == LFS_IFILE_INUM) {
		daddr = fs->lfs_idaddr;
		fs->lfs_idaddr = bp->b_blkno;
	} else {
		LFS_IENTRY(ifp, fs, ino, ibp);
		daddr = ifp->if_daddr;
		ifp->if_daddr = bp->b_blkno;
		error = VOP_BWRITE(ibp);
	}

	/*
	 * No need to update segment usage if there was no former inode address
	 * or if the last inode address is in the current partial segment.
	 */
	if (daddr != LFS_UNUSED_DADDR &&
	    !(daddr >= fs->lfs_lastpseg && daddr <= bp->b_blkno)) {
		LFS_SEGENTRY(sup, fs, datosn(fs, daddr), bp);
#ifdef DIAGNOSTIC
		if (sup->su_nbytes < sizeof(struct dinode)) {
			/* XXX -- Change to a panic. */
			printf("lfs: negative bytes (segment %d)\n",
			    datosn(fs, daddr));
			panic("negative bytes");
		}
#endif
		sup->su_nbytes -= sizeof(struct dinode);
		/*
		 * If the ifile inode's old copy sat in a buffer that was not
		 * gathered into this write, the ifile must be redone.
		 */
		redo_ifile =
		    (ino == LFS_IFILE_INUM && !(bp->b_flags & B_GATHERED));
		error = VOP_BWRITE(bp);
	}
	return (redo_ifile);
}

/*
 * lfs_gatherblock --
 *	Add one buffer to the current partial segment, starting a new
 *	segment first (around a splx/splbio window, since lfs_writeseg does
 *	I/O) when the block or its summary entry would not fit.  Returns 1
 *	if a new segment was started (caller must rescan), else 0.
 */
int
lfs_gatherblock(sp, bp, sptr)
	struct segment *sp;
	struct buf *bp;
	int *sptr;
{
	struct lfs *fs;
	int version;

	/*
	 * If full, finish this segment.  We may be doing I/O, so
	 * release and reacquire the splbio().
	 */
#ifdef DIAGNOSTIC
	if (sp->vp == NULL)
		panic ("lfs_gatherblock: Null vp in segment");
#endif
	fs = sp->fs;
	if (sp->sum_bytes_left < sizeof(ufs_daddr_t) ||
	    sp->seg_bytes_left < bp->b_bcount) {
		if (sptr)
			splx(*sptr);
		lfs_updatemeta(sp);

		version = sp->fip->fi_version;
		(void) lfs_writeseg(fs, sp);

		/* Re-open a FINFO for the same file in the new segment. */
		sp->fip->fi_version = version;
		sp->fip->fi_ino = VTOI(sp->vp)->i_number;
		/* Add the current file to the segment summary. */
		++((SEGSUM *)(sp->segsum))->ss_nfinfo;
		sp->sum_bytes_left -=
		    sizeof(struct finfo) - sizeof(ufs_daddr_t);

		if (sptr)
			*sptr = splbio();
		return(1);
	}

	/* Insert into the buffer list, update the FINFO block. */
	bp->b_flags |= B_GATHERED;
	*sp->cbpp++ = bp;
	sp->fip->fi_blocks[sp->fip->fi_nblocks++] = bp->b_lblkno;

	sp->sum_bytes_left -= sizeof(ufs_daddr_t);
	sp->seg_bytes_left -= bp->b_bcount;
	return(0);
}

/*
 * lfs_gather --
 *	Walk the vnode's dirty-buffer list at splbio and gather every buffer
 *	accepted by `match' into the current partial segment, then assign
 *	disk addresses via lfs_updatemeta.
 */
void
lfs_gather(fs, sp, vp, match)
	struct lfs *fs;
	struct segment *sp;
	struct vnode *vp;
	int (*match) __P((struct lfs *, struct buf *));
{
	struct buf *bp;
	int s;

	sp->vp = vp;
	s = splbio();
/* This is a hack to see if ordering the blocks in LFS makes a difference.
 */
/* BEGIN HACK */
#define	BUF_OFFSET	(((void *)&bp->b_vnbufs.le_next) - (void *)bp)
#define	BACK_BUF(BP)	((struct buf *)(((void *)BP->b_vnbufs.le_prev) - BUF_OFFSET))
#define	BEG_OF_LIST	((struct buf *)(((void *)&vp->v_dirtyblkhd.lh_first) - BUF_OFFSET))


/*loop:	for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = bp->b_vnbufs.le_next) {*/
/* Find last buffer. */
loop:	for (bp = vp->v_dirtyblkhd.lh_first; bp && bp->b_vnbufs.le_next != NULL;
	    bp = bp->b_vnbufs.le_next);
	/* Then walk the dirty list backwards via the embedded linkage. */
	for (; bp && bp != BEG_OF_LIST; bp = BACK_BUF(bp)) {
/* END HACK */
		if (bp->b_flags & B_BUSY || !match(fs, bp) ||
		    bp->b_flags & B_GATHERED)
			continue;
#ifdef DIAGNOSTIC
		if (!(bp->b_flags & B_DELWRI))
			panic("lfs_gather: bp not B_DELWRI");
		if (!(bp->b_flags & B_LOCKED))
			panic("lfs_gather: bp not B_LOCKED");
#endif
		/* A new segment was started; list may have changed, rescan. */
		if (lfs_gatherblock(sp, bp, &s))
			goto loop;
	}
	splx(s);
	lfs_updatemeta(sp);
	sp->vp = NULL;
}


/*
 * Update the metadata that points to the blocks listed in the FINFO
 * array.  Sorts the gathered blocks by logical block number, assigns each
 * a disk address at the current write offset, stores that address into the
 * inode's direct/indirect block pointers, and debits the segment that held
 * the block's previous copy.
 */
void
lfs_updatemeta(sp)
	struct segment *sp;
{
	SEGUSE *sup;
	struct buf *bp;
	struct lfs *fs;
	struct vnode *vp;
	struct indir a[NIADDR + 2], *ap;
	struct inode *ip;
	ufs_daddr_t daddr, lbn, off;
	int error, i, nblocks, num;

	vp = sp->vp;
	nblocks = &sp->fip->fi_blocks[sp->fip->fi_nblocks] - sp->start_lbp;
	if (nblocks < 0)
		panic("This is a bad thing\n");
	if (vp == NULL || nblocks == 0)
		return;

	/* Sort the blocks.  (Cleaner-supplied blocks keep their order.) */
	if (!(sp->seg_flags & SEGM_CLEAN))
		lfs_shellsort(sp->start_bpp, sp->start_lbp, nblocks);

	/*
	 * Record the length of the last block in case it's a fragment.
	 * If there are indirect blocks present, they sort last.  An
	 * indirect block will be lfs_bsize and its presence indicates
	 * that you cannot have fragments.
	 */
	sp->fip->fi_lastlength = sp->start_bpp[nblocks - 1]->b_bcount;

	/*
	 * Assign disk addresses, and update references to the logical
	 * block and the segment usage information.
	 */
	fs = sp->fs;
	for (i = nblocks; i--; ++sp->start_bpp) {
		lbn = *sp->start_lbp++;
		(*sp->start_bpp)->b_blkno = off = fs->lfs_offset;
		fs->lfs_offset +=
		    fragstodb(fs, numfrags(fs, (*sp->start_bpp)->b_bcount));

		/* daddr receives the block's previous on-disk address. */
		if (error = ufs_bmaparray(vp, lbn, &daddr, a, &num, NULL))
			panic("lfs_updatemeta: ufs_bmaparray %d", error);
		ip = VTOI(vp);
		switch (num) {
		case 0:
			/* Direct block. */
			ip->i_db[lbn] = off;
			break;
		case 1:
			/* Indirect block itself. */
			ip->i_ib[a[0].in_off] = off;
			break;
		default:
			/* Data under indirection: patch the parent block. */
			ap = &a[num - 1];
			if (bread(vp, ap->in_lbn, fs->lfs_bsize, NOCRED, &bp))
				panic("lfs_updatemeta: bread bno %d",
				    ap->in_lbn);
			/*
			 * Bread may create a new indirect block which needs
			 * to get counted for the inode.
			 */
			if (bp->b_blkno == -1 && !(bp->b_flags & B_CACHE)) {
				ip->i_blocks += fsbtodb(fs, 1);
				fs->lfs_bfree -= fragstodb(fs, fs->lfs_frag);
			}
			((ufs_daddr_t *)bp->b_data)[ap->in_off] = off;
			VOP_BWRITE(bp);
		}

		/* Update segment usage information. */
		if (daddr != UNASSIGNED &&
		    !(daddr >= fs->lfs_lastpseg && daddr <= off)) {
			LFS_SEGENTRY(sup, fs, datosn(fs, daddr), bp);
#ifdef DIAGNOSTIC
			if (sup->su_nbytes < (*sp->start_bpp)->b_bcount) {
				/* XXX -- Change to a panic. */
				printf("lfs: negative bytes (segment %d)\n",
				    datosn(fs, daddr));
				panic ("Negative Bytes");
			}
#endif
			sup->su_nbytes -= (*sp->start_bpp)->b_bcount;
			error = VOP_BWRITE(bp);
		}
	}
}

/*
 * Start a new partial segment: position the write offset (advancing to a
 * fresh segment when the current one cannot hold another partial), allocate
 * and initialize the SEGSUM buffer, and reset the segment structure's
 * space counters.  Returns 1 if a new (full) segment was started, else 0.
 */
int
lfs_initseg(fs)
	struct lfs *fs;
{
	struct segment *sp;
	SEGUSE *sup;
	SEGSUM *ssp;
	struct buf *bp;
	int repeat;

	sp = fs->lfs_sp;

	repeat = 0;
	/* Advance to the next segment. */
	if (!LFS_PARTIAL_FITS(fs)) {
		/* Wake up any cleaning procs waiting on this file system. */
		wakeup(&lfs_allclean_wakeup);

		lfs_newseg(fs);
		repeat = 1;
		fs->lfs_offset = fs->lfs_curseg;
		sp->seg_number = datosn(fs, fs->lfs_curseg);
		sp->seg_bytes_left = fs->lfs_dbpseg * DEV_BSIZE;

		/*
		 * If the segment contains a superblock, update the offset
		 * and summary address to skip over it.
		 */
		LFS_SEGENTRY(sup, fs, sp->seg_number, bp);
		if (sup->su_flags & SEGUSE_SUPERBLOCK) {
			fs->lfs_offset += LFS_SBPAD / DEV_BSIZE;
			sp->seg_bytes_left -= LFS_SBPAD;
		}
		brelse(bp);
	} else {
		sp->seg_number = datosn(fs, fs->lfs_curseg);
		sp->seg_bytes_left = (fs->lfs_dbpseg -
		    (fs->lfs_offset - fs->lfs_curseg)) * DEV_BSIZE;
	}
	fs->lfs_lastpseg = fs->lfs_offset;

	sp->fs = fs;
	sp->ibp = NULL;
	sp->ninodes = 0;

	/* Get a new buffer for SEGSUM and enter it into the buffer list. */
	sp->cbpp = sp->bpp;
	*sp->cbpp = lfs_newbuf(VTOI(fs->lfs_ivnode)->i_devvp, fs->lfs_offset,
	    LFS_SUMMARY_SIZE);
	sp->segsum = (*sp->cbpp)->b_data;
	bzero(sp->segsum, LFS_SUMMARY_SIZE);
	sp->start_bpp = ++sp->cbpp;
	fs->lfs_offset += LFS_SUMMARY_SIZE / DEV_BSIZE;

	/* Set point to SEGSUM, initialize it. */
	ssp = sp->segsum;
	ssp->ss_next = fs->lfs_nextseg;
	ssp->ss_nfinfo = ssp->ss_ninos = 0;
	ssp->ss_magic = SS_MAGIC;

	/* Set pointer to first FINFO, initialize it. */
	sp->fip = (struct finfo *)((caddr_t)sp->segsum + sizeof(SEGSUM));
	sp->fip->fi_nblocks = 0;
	sp->start_lbp = &sp->fip->fi_blocks[0];
	sp->fip->fi_lastlength = 0;

	sp->seg_bytes_left -= LFS_SUMMARY_SIZE;
	sp->sum_bytes_left = LFS_SUMMARY_SIZE - sizeof(SEGSUM);

	return(repeat);
}

/*
 * Return the next segment to write.
 */
/*
 * lfs_newseg --
 *	Advance to the next clean segment: mark the segment about to be
 *	written DIRTY|ACTIVE in the segment-usage table, adjust the cleaner
 *	info, and scan forward (with wraparound) for the next clean segment
 *	to use after that.  Panics if no clean segment exists.
 */
void
lfs_newseg(fs)
	struct lfs *fs;
{
	CLEANERINFO *cip;
	SEGUSE *sup;
	struct buf *bp;
	int curseg, isdirty, sn;

	LFS_SEGENTRY(sup, fs, datosn(fs, fs->lfs_nextseg), bp);
	sup->su_flags |= SEGUSE_DIRTY | SEGUSE_ACTIVE;
	sup->su_nbytes = 0;
	sup->su_nsums = 0;
	sup->su_ninos = 0;
	(void) VOP_BWRITE(bp);

	LFS_CLEANERINFO(cip, fs, bp);
	--cip->clean;
	++cip->dirty;
	(void) VOP_BWRITE(bp);

	fs->lfs_lastseg = fs->lfs_curseg;
	fs->lfs_curseg = fs->lfs_nextseg;
	for (sn = curseg = datosn(fs, fs->lfs_curseg);;) {
		sn = (sn + 1) % fs->lfs_nseg;
		if (sn == curseg)
			panic("lfs_nextseg: no clean segments");
		LFS_SEGENTRY(sup, fs, sn, bp);
		isdirty = sup->su_flags & SEGUSE_DIRTY;
		brelse(bp);
		if (!isdirty)
			break;
	}

	++fs->lfs_nactive;
	fs->lfs_nextseg = sntoda(fs, sn);
#ifdef DOSTATS
	++lfs_stats.segsused;
#endif
}

/*
 * lfs_writeseg --
 *	Push the current partial segment to disk: update segment usage,
 *	checksum the data and the summary, then copy the gathered buffers
 *	into MAXPHYS-sized chunks and hand the chunks to the device strategy
 *	routine.  Returns non-zero if the write must be redone (ifile moved
 *	or a new segment was started), via lfs_initseg/do_again.
 */
int
lfs_writeseg(fs, sp)
	struct lfs *fs;
	struct segment *sp;
{
	extern int locked_queue_count;
	struct buf **bpp, *bp, *cbp;
	SEGUSE *sup;
	SEGSUM *ssp;
	dev_t i_dev;
	u_long *datap, *dp;
	int do_again, i, nblocks, s;
	int (*strategy)__P((struct vop_strategy_args *));
	struct vop_strategy_args vop_strategy_a;
	u_short ninos;
	char *p;
	long *lp;

	/*
	 * If there are no buffers other than the segment summary to write
	 * and it is not a checkpoint, don't do anything.  On a checkpoint,
	 * even if there aren't any buffers, you need to write the superblock.
	 */
	if ((nblocks = sp->cbpp - sp->bpp) == 1)
		return (0);

	/* Update the segment usage information. */
	LFS_SEGENTRY(sup, fs, sp->seg_number, bp);

	/* Loop through all blocks, except the segment summary. */
	for (bpp = sp->bpp; ++bpp < sp->cbpp; )
		sup->su_nbytes += (*bpp)->b_bcount;

	ssp = (SEGSUM *)sp->segsum;

	/* Number of full inode blocks described by this summary. */
	ninos = (ssp->ss_ninos + INOPB(fs) - 1) / INOPB(fs);
	sup->su_nbytes += ssp->ss_ninos * sizeof(struct dinode);
	sup->su_nbytes += LFS_SUMMARY_SIZE;
	sup->su_lastmod = time.tv_sec;
	sup->su_ninos += ninos;
	++sup->su_nsums;
	do_again = !(bp->b_flags & B_GATHERED);
	(void)VOP_BWRITE(bp);
	/*
	 * Compute checksum across data and then across summary; the first
	 * block (the summary block) is skipped.  Set the create time here
	 * so that it's guaranteed to be later than the inode mod times.
	 *
	 * XXX
	 * Fix this to do it inline, instead of malloc/copy.
	 */
	datap = dp = malloc(nblocks * sizeof(u_long), M_SEGMENT, M_WAITOK);
	for (bpp = sp->bpp, i = nblocks - 1; i--;) {
		/*
		 * The data checksum covers the first word of each block;
		 * cleaner-supplied (B_INVAL) blocks live in user space.
		 */
		if ((*++bpp)->b_flags & B_INVAL) {
			if (copyin((*bpp)->b_saveaddr, dp++, sizeof(u_long)))
				panic("lfs_writeseg: copyin failed");
		} else
			*dp++ = ((u_long *)(*bpp)->b_data)[0];
	}
	ssp->ss_create = time.tv_sec;
	ssp->ss_datasum = cksum(datap, (nblocks - 1) * sizeof(u_long));
	ssp->ss_sumsum =
	    cksum(&ssp->ss_datasum, LFS_SUMMARY_SIZE - sizeof(ssp->ss_sumsum));
	free(datap, M_SEGMENT);
#ifdef DIAGNOSTIC
	if (fs->lfs_bfree < fsbtodb(fs, ninos) + LFS_SUMMARY_SIZE / DEV_BSIZE)
		panic("lfs_writeseg: No diskspace for summary");
#endif
	fs->lfs_bfree -= (fsbtodb(fs, ninos) + LFS_SUMMARY_SIZE / DEV_BSIZE);

	i_dev = VTOI(fs->lfs_ivnode)->i_dev;
	strategy = VTOI(fs->lfs_ivnode)->i_devvp->v_op[VOFFSET(vop_strategy)];

	/*
	 * When we simply write the blocks we lose a rotation for every block
	 * written.  To avoid this problem, we allocate memory in chunks, copy
	 * the buffers into the chunk and write the chunk.  MAXPHYS is the
	 * largest size I/O devices can handle.
	 * When the data is copied to the chunk, turn off the B_LOCKED bit
	 * and brelse the buffer (which will move them to the LRU list).  Add
	 * the B_CALL flag to the buffer header so we can count I/O's for the
	 * checkpoints and so we can release the allocated memory.
	 *
	 * XXX
	 * This should be removed if the new virtual memory system allows us to
	 * easily make the buffers contiguous in kernel memory and if that's
	 * fast enough.
	 */
	for (bpp = sp->bpp, i = nblocks; i;) {
		cbp = lfs_newbuf(VTOI(fs->lfs_ivnode)->i_devvp,
		    (*bpp)->b_blkno, MAXPHYS);
		cbp->b_dev = i_dev;
		cbp->b_flags |= B_ASYNC | B_BUSY;
		cbp->b_bcount = 0;

		s = splbio();
		++fs->lfs_iocount;
		for (p = cbp->b_data; i && cbp->b_bcount < MAXPHYS; i--) {
			bp = *bpp;
			/* Next buffer would overflow the chunk; flush it. */
			if (bp->b_bcount > (MAXPHYS - cbp->b_bcount))
				break;
			bpp++;

			/*
			 * Fake buffers from the cleaner are marked as B_INVAL.
			 * We need to copy the data from user space rather than
			 * from the buffer indicated.
			 * XXX == what do I do on an error?
			 */
			if (bp->b_flags & B_INVAL) {
				if (copyin(bp->b_saveaddr, p, bp->b_bcount))
					panic("lfs_writeseg: copyin failed");
			} else
				bcopy(bp->b_data, p, bp->b_bcount);

			p += bp->b_bcount;
			cbp->b_bcount += bp->b_bcount;
			if (bp->b_flags & B_LOCKED)
				--locked_queue_count;
			bp->b_flags &= ~(B_ERROR | B_READ | B_DELWRI |
			     B_LOCKED | B_GATHERED);
			if (bp->b_flags & B_CALL) {
				/* if B_CALL, it was created with newbuf */
				brelvp(bp);
				if (!(bp->b_flags & B_INVAL))
					free(bp->b_data, M_SEGMENT);
				free(bp, M_SEGMENT);
			} else {
				bremfree(bp);
				bp->b_flags |= B_DONE;
				reassignbuf(bp, bp->b_vp);
				brelse(bp);
			}
		}
		++cbp->b_vp->v_numoutput;
		splx(s);
		/*
		 * XXXX This is a gross and disgusting hack.  Since these
		 * buffers are physically addressed, they hang off the
		 * device vnode (devvp).  As a result, they have no way
		 * of getting to the LFS superblock or lfs structure to
		 * keep track of the number of I/O's pending.  So, I am
		 * going to stuff the fs into the saveaddr field of
		 * the buffer (yuk).
		 */
		cbp->b_saveaddr = (caddr_t)fs;
		vop_strategy_a.a_desc = VDESC(vop_strategy);
		vop_strategy_a.a_bp = cbp;
		(strategy)(&vop_strategy_a);
	}
	/*
	 * XXX
	 * Vinvalbuf can move locked buffers off the locked queue
	 * and we have no way of knowing about this.  So, after
	 * doing a big write, we recalculate how many buffers are
	 * really still left on the locked queue.
	 */
	locked_queue_count = count_lock_queue();
	wakeup(&locked_queue_count);
#ifdef DOSTATS
	++lfs_stats.psegwrites;
	lfs_stats.blocktot += nblocks - 1;
	if (fs->lfs_sp->seg_flags & SEGM_SYNC)
		++lfs_stats.psyncwrites;
	if (fs->lfs_sp->seg_flags & SEGM_CLEAN) {
		++lfs_stats.pcleanwrites;
		lfs_stats.cleanblocks += nblocks - 1;
	}
#endif
	return (lfs_initseg(fs) || do_again);
}

/*
 * lfs_writesuper --
 *	Write the in-core superblock to the first superblock location,
 *	asynchronously, with lfs_supercallback releasing the buffer on
 *	completion.
 */
void
lfs_writesuper(fs)
	struct lfs *fs;
{
	struct buf *bp;
	dev_t i_dev;
	int (*strategy) __P((struct vop_strategy_args *));
	int s;
	struct vop_strategy_args vop_strategy_a;

	i_dev = VTOI(fs->lfs_ivnode)->i_dev;
	strategy = VTOI(fs->lfs_ivnode)->i_devvp->v_op[VOFFSET(vop_strategy)];

	/* Checksum the superblock and copy it into a buffer.
*/ 941 fs->lfs_cksum = cksum(fs, sizeof(struct lfs) - sizeof(fs->lfs_cksum)); 942 bp = lfs_newbuf(VTOI(fs->lfs_ivnode)->i_devvp, fs->lfs_sboffs[0], 943 LFS_SBPAD); 944 *(struct lfs *)bp->b_data = *fs; 945 946 /* XXX Toggle between first two superblocks; for now just write first */ 947 bp->b_dev = i_dev; 948 bp->b_flags |= B_BUSY | B_CALL | B_ASYNC; 949 bp->b_flags &= ~(B_DONE | B_ERROR | B_READ | B_DELWRI); 950 bp->b_iodone = lfs_supercallback; 951 vop_strategy_a.a_desc = VDESC(vop_strategy); 952 vop_strategy_a.a_bp = bp; 953 s = splbio(); 954 ++bp->b_vp->v_numoutput; 955 splx(s); 956 (strategy)(&vop_strategy_a); 957 } 958 959 /* 960 * Logical block number match routines used when traversing the dirty block 961 * chain. 962 */ 963 int 964 lfs_match_data(fs, bp) 965 struct lfs *fs; 966 struct buf *bp; 967 { 968 return (bp->b_lblkno >= 0); 969 } 970 971 int 972 lfs_match_indir(fs, bp) 973 struct lfs *fs; 974 struct buf *bp; 975 { 976 int lbn; 977 978 lbn = bp->b_lblkno; 979 return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 0); 980 } 981 982 int 983 lfs_match_dindir(fs, bp) 984 struct lfs *fs; 985 struct buf *bp; 986 { 987 int lbn; 988 989 lbn = bp->b_lblkno; 990 return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 1); 991 } 992 993 int 994 lfs_match_tindir(fs, bp) 995 struct lfs *fs; 996 struct buf *bp; 997 { 998 int lbn; 999 1000 lbn = bp->b_lblkno; 1001 return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 2); 1002 } 1003 1004 /* 1005 * Allocate a new buffer header. 
1006 */ 1007 struct buf * 1008 lfs_newbuf(vp, daddr, size) 1009 struct vnode *vp; 1010 ufs_daddr_t daddr; 1011 size_t size; 1012 { 1013 struct buf *bp; 1014 size_t nbytes; 1015 1016 nbytes = roundup(size, DEV_BSIZE); 1017 bp = malloc(sizeof(struct buf), M_SEGMENT, M_WAITOK); 1018 bzero(bp, sizeof(struct buf)); 1019 if (nbytes) 1020 bp->b_data = malloc(nbytes, M_SEGMENT, M_WAITOK); 1021 bgetvp(vp, bp); 1022 bp->b_bufsize = size; 1023 bp->b_bcount = size; 1024 bp->b_lblkno = daddr; 1025 bp->b_blkno = daddr; 1026 bp->b_error = 0; 1027 bp->b_resid = 0; 1028 bp->b_iodone = lfs_callback; 1029 bp->b_flags |= B_BUSY | B_CALL | B_NOCACHE; 1030 return (bp); 1031 } 1032 1033 void 1034 lfs_callback(bp) 1035 struct buf *bp; 1036 { 1037 struct lfs *fs; 1038 1039 fs = (struct lfs *)bp->b_saveaddr; 1040 #ifdef DIAGNOSTIC 1041 if (fs->lfs_iocount == 0) 1042 panic("lfs_callback: zero iocount\n"); 1043 #endif 1044 if (--fs->lfs_iocount == 0) 1045 wakeup(&fs->lfs_iocount); 1046 1047 brelvp(bp); 1048 free(bp->b_data, M_SEGMENT); 1049 free(bp, M_SEGMENT); 1050 } 1051 1052 void 1053 lfs_supercallback(bp) 1054 struct buf *bp; 1055 { 1056 brelvp(bp); 1057 free(bp->b_data, M_SEGMENT); 1058 free(bp, M_SEGMENT); 1059 } 1060 1061 /* 1062 * Shellsort (diminishing increment sort) from Data Structures and 1063 * Algorithms, Aho, Hopcraft and Ullman, 1983 Edition, page 290; 1064 * see also Knuth Vol. 3, page 84. The increments are selected from 1065 * formula (8), page 95. Roughly O(N^3/2). 1066 */ 1067 /* 1068 * This is our own private copy of shellsort because we want to sort 1069 * two parallel arrays (the array of buffer pointers and the array of 1070 * logical block numbers) simultaneously. Note that we cast the array 1071 * of logical block numbers to a unsigned in this routine so that the 1072 * negative block numbers (meta data blocks) sort AFTER the data blocks. 
1073 */ 1074 void 1075 lfs_shellsort(bp_array, lb_array, nmemb) 1076 struct buf **bp_array; 1077 ufs_daddr_t *lb_array; 1078 register int nmemb; 1079 { 1080 static int __rsshell_increments[] = { 4, 1, 0 }; 1081 register int incr, *incrp, t1, t2; 1082 struct buf *bp_temp; 1083 u_long lb_temp; 1084 1085 for (incrp = __rsshell_increments; incr = *incrp++;) 1086 for (t1 = incr; t1 < nmemb; ++t1) 1087 for (t2 = t1 - incr; t2 >= 0;) 1088 if (lb_array[t2] > lb_array[t2 + incr]) { 1089 lb_temp = lb_array[t2]; 1090 lb_array[t2] = lb_array[t2 + incr]; 1091 lb_array[t2 + incr] = lb_temp; 1092 bp_temp = bp_array[t2]; 1093 bp_array[t2] = bp_array[t2 + incr]; 1094 bp_array[t2 + incr] = bp_temp; 1095 t2 -= incr; 1096 } else 1097 break; 1098 } 1099 1100 /* 1101 * Check VXLOCK. Return 1 if the vnode is locked. Otherwise, vget it. 1102 */ 1103 lfs_vref(vp) 1104 register struct vnode *vp; 1105 { 1106 struct proc *p = curproc; /* XXX */ 1107 1108 if (vp->v_flag & VXLOCK) /* XXX */ 1109 return(1); 1110 return (vget(vp, 0, p)); 1111 } 1112 1113 void 1114 lfs_vunref(vp) 1115 register struct vnode *vp; 1116 { 1117 extern int lfs_no_inactive; 1118 1119 /* 1120 * This is vrele except that we do not want to VOP_INACTIVE 1121 * this vnode. Rather than inline vrele here, we use a global 1122 * flag to tell lfs_inactive not to run. Yes, its gross. 1123 */ 1124 lfs_no_inactive = 1; 1125 vrele(vp); 1126 lfs_no_inactive = 0; 1127 } 1128