1 /*- 2 * Copyright 1998, 2000 Marshall Kirk McKusick. 3 * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org> 4 * All rights reserved. 5 * 6 * The soft updates code is derived from the appendix of a University 7 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt, 8 * "Soft Updates: A Solution to the Metadata Update Problem in File 9 * Systems", CSE-TR-254-95, August 1995). 10 * 11 * Further information about soft updates can be obtained from: 12 * 13 * Marshall Kirk McKusick http://www.mckusick.com/softdep/ 14 * 1614 Oxford Street mckusick@mckusick.com 15 * Berkeley, CA 94709-1608 +1-510-843-9542 16 * USA 17 * 18 * Redistribution and use in source and binary forms, with or without 19 * modification, are permitted provided that the following conditions 20 * are met: 21 * 22 * 1. Redistributions of source code must retain the above copyright 23 * notice, this list of conditions and the following disclaimer. 24 * 2. Redistributions in binary form must reproduce the above copyright 25 * notice, this list of conditions and the following disclaimer in the 26 * documentation and/or other materials provided with the distribution. 27 * 28 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR 29 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 30 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 31 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, 32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS 34 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 35 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR 36 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE 37 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
38 * 39 * from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00 40 */ 41 42 #include <sys/cdefs.h> 43 __FBSDID("$FreeBSD$"); 44 45 #include "opt_ffs.h" 46 #include "opt_ddb.h" 47 48 /* 49 * For now we want the safety net that the DEBUG flag provides. 50 */ 51 #ifndef DEBUG 52 #define DEBUG 53 #endif 54 55 #include <sys/param.h> 56 #include <sys/kernel.h> 57 #include <sys/systm.h> 58 #include <sys/bio.h> 59 #include <sys/buf.h> 60 #include <sys/kdb.h> 61 #include <sys/kthread.h> 62 #include <sys/lock.h> 63 #include <sys/malloc.h> 64 #include <sys/mount.h> 65 #include <sys/mutex.h> 66 #include <sys/namei.h> 67 #include <sys/proc.h> 68 #include <sys/stat.h> 69 #include <sys/sysctl.h> 70 #include <sys/syslog.h> 71 #include <sys/vnode.h> 72 #include <sys/conf.h> 73 #include <ufs/ufs/dir.h> 74 #include <ufs/ufs/extattr.h> 75 #include <ufs/ufs/quota.h> 76 #include <ufs/ufs/inode.h> 77 #include <ufs/ufs/ufsmount.h> 78 #include <ufs/ffs/fs.h> 79 #include <ufs/ffs/softdep.h> 80 #include <ufs/ffs/ffs_extern.h> 81 #include <ufs/ufs/ufs_extern.h> 82 83 #include <vm/vm.h> 84 85 #include <ddb/ddb.h> 86 87 #ifndef SOFTUPDATES 88 89 int 90 softdep_flushfiles(oldmnt, flags, td) 91 struct mount *oldmnt; 92 int flags; 93 struct thread *td; 94 { 95 96 panic("softdep_flushfiles called"); 97 } 98 99 int 100 softdep_mount(devvp, mp, fs, cred) 101 struct vnode *devvp; 102 struct mount *mp; 103 struct fs *fs; 104 struct ucred *cred; 105 { 106 107 return (0); 108 } 109 110 void 111 softdep_initialize() 112 { 113 114 return; 115 } 116 117 void 118 softdep_uninitialize() 119 { 120 121 return; 122 } 123 124 void 125 softdep_unmount(mp) 126 struct mount *mp; 127 { 128 129 } 130 131 void 132 softdep_setup_sbupdate(ump, fs, bp) 133 struct ufsmount *ump; 134 struct fs *fs; 135 struct buf *bp; 136 { 137 } 138 139 void 140 softdep_setup_inomapdep(bp, ip, newinum) 141 struct buf *bp; 142 struct inode *ip; 143 ino_t newinum; 144 { 145 146 panic("softdep_setup_inomapdep called"); 147 } 148 149 void 150 
softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags) 151 struct buf *bp; 152 struct mount *mp; 153 ufs2_daddr_t newblkno; 154 int frags; 155 int oldfrags; 156 { 157 158 panic("softdep_setup_blkmapdep called"); 159 } 160 161 void 162 softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) 163 struct inode *ip; 164 ufs_lbn_t lbn; 165 ufs2_daddr_t newblkno; 166 ufs2_daddr_t oldblkno; 167 long newsize; 168 long oldsize; 169 struct buf *bp; 170 { 171 172 panic("softdep_setup_allocdirect called"); 173 } 174 175 void 176 softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) 177 struct inode *ip; 178 ufs_lbn_t lbn; 179 ufs2_daddr_t newblkno; 180 ufs2_daddr_t oldblkno; 181 long newsize; 182 long oldsize; 183 struct buf *bp; 184 { 185 186 panic("softdep_setup_allocext called"); 187 } 188 189 void 190 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) 191 struct inode *ip; 192 ufs_lbn_t lbn; 193 struct buf *bp; 194 int ptrno; 195 ufs2_daddr_t newblkno; 196 ufs2_daddr_t oldblkno; 197 struct buf *nbp; 198 { 199 200 panic("softdep_setup_allocindir_page called"); 201 } 202 203 void 204 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno) 205 struct buf *nbp; 206 struct inode *ip; 207 struct buf *bp; 208 int ptrno; 209 ufs2_daddr_t newblkno; 210 { 211 212 panic("softdep_setup_allocindir_meta called"); 213 } 214 215 void 216 softdep_setup_freeblocks(ip, length, flags) 217 struct inode *ip; 218 off_t length; 219 int flags; 220 { 221 222 panic("softdep_setup_freeblocks called"); 223 } 224 225 void 226 softdep_freefile(pvp, ino, mode) 227 struct vnode *pvp; 228 ino_t ino; 229 int mode; 230 { 231 232 panic("softdep_freefile called"); 233 } 234 235 int 236 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk) 237 struct buf *bp; 238 struct inode *dp; 239 off_t diroffset; 240 ino_t newinum; 241 struct buf *newdirbp; 242 int isnewblk; 243 { 244 245 panic("softdep_setup_directory_add 
called"); 246 } 247 248 void 249 softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize) 250 struct buf *bp; 251 struct inode *dp; 252 caddr_t base; 253 caddr_t oldloc; 254 caddr_t newloc; 255 int entrysize; 256 { 257 258 panic("softdep_change_directoryentry_offset called"); 259 } 260 261 void 262 softdep_setup_remove(bp, dp, ip, isrmdir) 263 struct buf *bp; 264 struct inode *dp; 265 struct inode *ip; 266 int isrmdir; 267 { 268 269 panic("softdep_setup_remove called"); 270 } 271 272 void 273 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) 274 struct buf *bp; 275 struct inode *dp; 276 struct inode *ip; 277 ino_t newinum; 278 int isrmdir; 279 { 280 281 panic("softdep_setup_directory_change called"); 282 } 283 284 void * 285 softdep_setup_trunc(vp, length, flags) 286 struct vnode *vp; 287 off_t length; 288 int flags; 289 { 290 291 panic("%s called", __FUNCTION__); 292 293 return (NULL); 294 } 295 296 int 297 softdep_complete_trunc(vp, cookie) 298 struct vnode *vp; 299 void *cookie; 300 { 301 302 panic("%s called", __FUNCTION__); 303 304 return (0); 305 } 306 307 void 308 softdep_setup_blkfree(mp, bp, blkno, frags, wkhd) 309 struct mount *mp; 310 struct buf *bp; 311 ufs2_daddr_t blkno; 312 int frags; 313 struct workhead *wkhd; 314 { 315 316 panic("%s called", __FUNCTION__); 317 } 318 319 void 320 softdep_setup_inofree(mp, bp, ino, wkhd) 321 struct mount *mp; 322 struct buf *bp; 323 ino_t ino; 324 struct workhead *wkhd; 325 { 326 327 panic("%s called", __FUNCTION__); 328 } 329 330 void 331 softdep_setup_unlink(dp, ip) 332 struct inode *dp; 333 struct inode *ip; 334 { 335 336 panic("%s called", __FUNCTION__); 337 } 338 339 void 340 softdep_setup_link(dp, ip) 341 struct inode *dp; 342 struct inode *ip; 343 { 344 345 panic("%s called", __FUNCTION__); 346 } 347 348 void 349 softdep_revert_link(dp, ip) 350 struct inode *dp; 351 struct inode *ip; 352 { 353 354 panic("%s called", __FUNCTION__); 355 } 356 357 void 358 softdep_setup_rmdir(dp, 
ip) 359 struct inode *dp; 360 struct inode *ip; 361 { 362 363 panic("%s called", __FUNCTION__); 364 } 365 366 void 367 softdep_revert_rmdir(dp, ip) 368 struct inode *dp; 369 struct inode *ip; 370 { 371 372 panic("%s called", __FUNCTION__); 373 } 374 375 void 376 softdep_setup_create(dp, ip) 377 struct inode *dp; 378 struct inode *ip; 379 { 380 381 panic("%s called", __FUNCTION__); 382 } 383 384 void 385 softdep_revert_create(dp, ip) 386 struct inode *dp; 387 struct inode *ip; 388 { 389 390 panic("%s called", __FUNCTION__); 391 } 392 393 void 394 softdep_setup_mkdir(dp, ip) 395 struct inode *dp; 396 struct inode *ip; 397 { 398 399 panic("%s called", __FUNCTION__); 400 } 401 402 void 403 softdep_revert_mkdir(dp, ip) 404 struct inode *dp; 405 struct inode *ip; 406 { 407 408 panic("%s called", __FUNCTION__); 409 } 410 411 void 412 softdep_setup_dotdot_link(dp, ip) 413 struct inode *dp; 414 struct inode *ip; 415 { 416 417 panic("%s called", __FUNCTION__); 418 } 419 420 int 421 softdep_prealloc(vp, waitok) 422 struct vnode *vp; 423 int waitok; 424 { 425 426 panic("%s called", __FUNCTION__); 427 428 return (0); 429 } 430 431 int 432 softdep_journal_lookup(mp, vpp) 433 struct mount *mp; 434 struct vnode **vpp; 435 { 436 437 return (ENOENT); 438 } 439 440 void 441 softdep_change_linkcnt(ip) 442 struct inode *ip; 443 { 444 445 panic("softdep_change_linkcnt called"); 446 } 447 448 void 449 softdep_load_inodeblock(ip) 450 struct inode *ip; 451 { 452 453 panic("softdep_load_inodeblock called"); 454 } 455 456 void 457 softdep_update_inodeblock(ip, bp, waitfor) 458 struct inode *ip; 459 struct buf *bp; 460 int waitfor; 461 { 462 463 panic("softdep_update_inodeblock called"); 464 } 465 466 int 467 softdep_fsync(vp) 468 struct vnode *vp; /* the "in_core" copy of the inode */ 469 { 470 471 return (0); 472 } 473 474 void 475 softdep_fsync_mountdev(vp) 476 struct vnode *vp; 477 { 478 479 return; 480 } 481 482 int 483 softdep_flushworklist(oldmnt, countp, td) 484 struct mount *oldmnt; 
485 int *countp; 486 struct thread *td; 487 { 488 489 *countp = 0; 490 return (0); 491 } 492 493 int 494 softdep_sync_metadata(struct vnode *vp) 495 { 496 497 return (0); 498 } 499 500 int 501 softdep_slowdown(vp) 502 struct vnode *vp; 503 { 504 505 panic("softdep_slowdown called"); 506 } 507 508 void 509 softdep_releasefile(ip) 510 struct inode *ip; /* inode with the zero effective link count */ 511 { 512 513 panic("softdep_releasefile called"); 514 } 515 516 int 517 softdep_request_cleanup(fs, vp) 518 struct fs *fs; 519 struct vnode *vp; 520 { 521 522 return (0); 523 } 524 525 int 526 softdep_check_suspend(struct mount *mp, 527 struct vnode *devvp, 528 int softdep_deps, 529 int softdep_accdeps, 530 int secondary_writes, 531 int secondary_accwrites) 532 { 533 struct bufobj *bo; 534 int error; 535 536 (void) softdep_deps, 537 (void) softdep_accdeps; 538 539 bo = &devvp->v_bufobj; 540 ASSERT_BO_LOCKED(bo); 541 542 MNT_ILOCK(mp); 543 while (mp->mnt_secondary_writes != 0) { 544 BO_UNLOCK(bo); 545 msleep(&mp->mnt_secondary_writes, MNT_MTX(mp), 546 (PUSER - 1) | PDROP, "secwr", 0); 547 BO_LOCK(bo); 548 MNT_ILOCK(mp); 549 } 550 551 /* 552 * Reasons for needing more work before suspend: 553 * - Dirty buffers on devvp. 554 * - Secondary writes occurred after start of vnode sync loop 555 */ 556 error = 0; 557 if (bo->bo_numoutput > 0 || 558 bo->bo_dirty.bv_cnt > 0 || 559 secondary_writes != 0 || 560 mp->mnt_secondary_writes != 0 || 561 secondary_accwrites != mp->mnt_secondary_accwrites) 562 error = EAGAIN; 563 BO_UNLOCK(bo); 564 return (error); 565 } 566 567 void 568 softdep_get_depcounts(struct mount *mp, 569 int *softdepactivep, 570 int *softdepactiveaccp) 571 { 572 (void) mp; 573 *softdepactivep = 0; 574 *softdepactiveaccp = 0; 575 } 576 577 #else 578 579 FEATURE(softupdates, "FFS soft-updates support"); 580 581 /* 582 * These definitions need to be adapted to the system to which 583 * this file is being ported. 
584 */ 585 586 #define M_SOFTDEP_FLAGS (M_WAITOK) 587 588 #define D_PAGEDEP 0 589 #define D_INODEDEP 1 590 #define D_BMSAFEMAP 2 591 #define D_NEWBLK 3 592 #define D_ALLOCDIRECT 4 593 #define D_INDIRDEP 5 594 #define D_ALLOCINDIR 6 595 #define D_FREEFRAG 7 596 #define D_FREEBLKS 8 597 #define D_FREEFILE 9 598 #define D_DIRADD 10 599 #define D_MKDIR 11 600 #define D_DIRREM 12 601 #define D_NEWDIRBLK 13 602 #define D_FREEWORK 14 603 #define D_FREEDEP 15 604 #define D_JADDREF 16 605 #define D_JREMREF 17 606 #define D_JMVREF 18 607 #define D_JNEWBLK 19 608 #define D_JFREEBLK 20 609 #define D_JFREEFRAG 21 610 #define D_JSEG 22 611 #define D_JSEGDEP 23 612 #define D_SBDEP 24 613 #define D_JTRUNC 25 614 #define D_LAST D_JTRUNC 615 616 unsigned long dep_current[D_LAST + 1]; 617 unsigned long dep_total[D_LAST + 1]; 618 619 620 SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0, "soft updates stats"); 621 SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0, 622 "total dependencies allocated"); 623 SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0, 624 "current dependencies allocated"); 625 626 #define SOFTDEP_TYPE(type, str, long) \ 627 static MALLOC_DEFINE(M_ ## type, #str, long); \ 628 SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD, \ 629 &dep_total[D_ ## type], 0, ""); \ 630 SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD, \ 631 &dep_current[D_ ## type], 0, ""); 632 633 SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies"); 634 SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies"); 635 SOFTDEP_TYPE(BMSAFEMAP, bmsafemap, 636 "Block or frag allocated from cyl group map"); 637 SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency"); 638 SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode"); 639 SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies"); 640 SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block"); 641 SOFTDEP_TYPE(FREEFRAG, freefrag, 
"Previously used frag for an inode"); 642 SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode"); 643 SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated"); 644 SOFTDEP_TYPE(DIRADD, diradd, "New directory entry"); 645 SOFTDEP_TYPE(MKDIR, mkdir, "New directory"); 646 SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted"); 647 SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block"); 648 SOFTDEP_TYPE(FREEWORK, freework, "free an inode block"); 649 SOFTDEP_TYPE(FREEDEP, freedep, "track a block free"); 650 SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add"); 651 SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove"); 652 SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move"); 653 SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block"); 654 SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block"); 655 SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag"); 656 SOFTDEP_TYPE(JSEG, jseg, "Journal segment"); 657 SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete"); 658 SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency"); 659 SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation"); 660 661 static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes"); 662 static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations"); 663 664 /* 665 * translate from workitem type to memory type 666 * MUST match the defines above, such that memtype[D_XXX] == M_XXX 667 */ 668 static struct malloc_type *memtype[] = { 669 M_PAGEDEP, 670 M_INODEDEP, 671 M_BMSAFEMAP, 672 M_NEWBLK, 673 M_ALLOCDIRECT, 674 M_INDIRDEP, 675 M_ALLOCINDIR, 676 M_FREEFRAG, 677 M_FREEBLKS, 678 M_FREEFILE, 679 M_DIRADD, 680 M_MKDIR, 681 M_DIRREM, 682 M_NEWDIRBLK, 683 M_FREEWORK, 684 M_FREEDEP, 685 M_JADDREF, 686 M_JREMREF, 687 M_JMVREF, 688 M_JNEWBLK, 689 M_JFREEBLK, 690 M_JFREEFRAG, 691 M_JSEG, 692 M_JSEGDEP, 693 M_SBDEP, 694 M_JTRUNC 695 }; 696 697 static LIST_HEAD(mkdirlist, mkdir) mkdirlisthd; 698 699 #define DtoM(type) (memtype[type]) 700 701 /* 702 * Names of malloc types. 
703 */ 704 #define TYPENAME(type) \ 705 ((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???") 706 /* 707 * End system adaptation definitions. 708 */ 709 710 #define DOTDOT_OFFSET offsetof(struct dirtemplate, dotdot_ino) 711 #define DOT_OFFSET offsetof(struct dirtemplate, dot_ino) 712 713 /* 714 * Forward declarations. 715 */ 716 struct inodedep_hashhead; 717 struct newblk_hashhead; 718 struct pagedep_hashhead; 719 struct bmsafemap_hashhead; 720 721 /* 722 * Internal function prototypes. 723 */ 724 static void softdep_error(char *, int); 725 static void drain_output(struct vnode *); 726 static struct buf *getdirtybuf(struct buf *, struct mtx *, int); 727 static void clear_remove(struct thread *); 728 static void clear_inodedeps(struct thread *); 729 static void unlinked_inodedep(struct mount *, struct inodedep *); 730 static void clear_unlinked_inodedep(struct inodedep *); 731 static struct inodedep *first_unlinked_inodedep(struct ufsmount *); 732 static int flush_pagedep_deps(struct vnode *, struct mount *, 733 struct diraddhd *); 734 static void free_pagedep(struct pagedep *); 735 static int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t); 736 static int flush_inodedep_deps(struct mount *, ino_t); 737 static int flush_deplist(struct allocdirectlst *, int, int *); 738 static int handle_written_filepage(struct pagedep *, struct buf *); 739 static int handle_written_sbdep(struct sbdep *, struct buf *); 740 static void initiate_write_sbdep(struct sbdep *); 741 static void diradd_inode_written(struct diradd *, struct inodedep *); 742 static int handle_written_indirdep(struct indirdep *, struct buf *, 743 struct buf**); 744 static int handle_written_inodeblock(struct inodedep *, struct buf *); 745 static int handle_written_bmsafemap(struct bmsafemap *, struct buf *); 746 static void handle_written_jaddref(struct jaddref *); 747 static void handle_written_jremref(struct jremref *); 748 static void handle_written_jseg(struct jseg *, struct buf *); 
749 static void handle_written_jnewblk(struct jnewblk *); 750 static void handle_written_jfreeblk(struct jfreeblk *); 751 static void handle_written_jfreefrag(struct jfreefrag *); 752 static void complete_jseg(struct jseg *); 753 static void jseg_write(struct ufsmount *ump, struct jblocks *, struct jseg *, 754 uint8_t *); 755 static void jaddref_write(struct jaddref *, struct jseg *, uint8_t *); 756 static void jremref_write(struct jremref *, struct jseg *, uint8_t *); 757 static void jmvref_write(struct jmvref *, struct jseg *, uint8_t *); 758 static void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *); 759 static void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *); 760 static void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *); 761 static void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *); 762 static inline void inoref_write(struct inoref *, struct jseg *, 763 struct jrefrec *); 764 static void handle_allocdirect_partdone(struct allocdirect *, 765 struct workhead *); 766 static void cancel_newblk(struct newblk *, struct workhead *); 767 static void indirdep_complete(struct indirdep *); 768 static void handle_allocindir_partdone(struct allocindir *); 769 static void initiate_write_filepage(struct pagedep *, struct buf *); 770 static void initiate_write_indirdep(struct indirdep*, struct buf *); 771 static void handle_written_mkdir(struct mkdir *, int); 772 static void initiate_write_bmsafemap(struct bmsafemap *, struct buf *); 773 static void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *); 774 static void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *); 775 static void handle_workitem_freefile(struct freefile *); 776 static void handle_workitem_remove(struct dirrem *, struct vnode *); 777 static struct dirrem *newdirrem(struct buf *, struct inode *, 778 struct inode *, int, struct dirrem **); 779 static void cancel_indirdep(struct indirdep *, struct buf *, struct inodedep *, 780 struct 
freeblks *); 781 static void free_indirdep(struct indirdep *); 782 static void free_diradd(struct diradd *, struct workhead *); 783 static void merge_diradd(struct inodedep *, struct diradd *); 784 static void complete_diradd(struct diradd *); 785 static struct diradd *diradd_lookup(struct pagedep *, int); 786 static struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *, 787 struct jremref *); 788 static struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *, 789 struct jremref *); 790 static void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *, 791 struct jremref *, struct jremref *); 792 static void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *, 793 struct jremref *); 794 static void cancel_allocindir(struct allocindir *, struct inodedep *, 795 struct freeblks *); 796 static void complete_mkdir(struct mkdir *); 797 static void free_newdirblk(struct newdirblk *); 798 static void free_jremref(struct jremref *); 799 static void free_jaddref(struct jaddref *); 800 static void free_jsegdep(struct jsegdep *); 801 static void free_jseg(struct jseg *); 802 static void free_jnewblk(struct jnewblk *); 803 static void free_jfreeblk(struct jfreeblk *); 804 static void free_jfreefrag(struct jfreefrag *); 805 static void free_freedep(struct freedep *); 806 static void journal_jremref(struct dirrem *, struct jremref *, 807 struct inodedep *); 808 static void cancel_jnewblk(struct jnewblk *, struct workhead *); 809 static int cancel_jaddref(struct jaddref *, struct inodedep *, 810 struct workhead *); 811 static void cancel_jfreefrag(struct jfreefrag *); 812 static void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t); 813 static int deallocate_dependencies(struct buf *, struct inodedep *, 814 struct freeblks *); 815 static void free_newblk(struct newblk *); 816 static void cancel_allocdirect(struct allocdirectlst *, 817 struct allocdirect *, struct freeblks *, int); 818 static int 
check_inode_unwritten(struct inodedep *); 819 static int free_inodedep(struct inodedep *); 820 static void freework_freeblock(struct freework *); 821 static void handle_workitem_freeblocks(struct freeblks *, int); 822 static void handle_complete_freeblocks(struct freeblks *); 823 static void handle_workitem_indirblk(struct freework *); 824 static void handle_written_freework(struct freework *); 825 static void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *); 826 static void setup_allocindir_phase2(struct buf *, struct inode *, 827 struct inodedep *, struct allocindir *, ufs_lbn_t); 828 static struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t, 829 ufs2_daddr_t, ufs_lbn_t); 830 static void handle_workitem_freefrag(struct freefrag *); 831 static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long, 832 ufs_lbn_t); 833 static void allocdirect_merge(struct allocdirectlst *, 834 struct allocdirect *, struct allocdirect *); 835 static struct freefrag *allocindir_merge(struct allocindir *, 836 struct allocindir *); 837 static int bmsafemap_find(struct bmsafemap_hashhead *, struct mount *, int, 838 struct bmsafemap **); 839 static struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *, 840 int cg); 841 static int newblk_find(struct newblk_hashhead *, struct mount *, ufs2_daddr_t, 842 int, struct newblk **); 843 static int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **); 844 static int inodedep_find(struct inodedep_hashhead *, struct fs *, ino_t, 845 struct inodedep **); 846 static int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **); 847 static int pagedep_lookup(struct mount *, ino_t, ufs_lbn_t, int, 848 struct pagedep **); 849 static int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t, 850 struct mount *mp, int, struct pagedep **); 851 static void pause_timer(void *); 852 static int request_cleanup(struct mount *, int); 853 static int process_worklist_item(struct mount *, 
int); 854 static void process_removes(struct vnode *); 855 static void jwork_move(struct workhead *, struct workhead *); 856 static void add_to_worklist(struct worklist *, int); 857 static void remove_from_worklist(struct worklist *); 858 static void softdep_flush(void); 859 static int softdep_speedup(void); 860 static void worklist_speedup(void); 861 static int journal_mount(struct mount *, struct fs *, struct ucred *); 862 static void journal_unmount(struct mount *); 863 static int journal_space(struct ufsmount *, int); 864 static void journal_suspend(struct ufsmount *); 865 static int journal_unsuspend(struct ufsmount *ump); 866 static void softdep_prelink(struct vnode *, struct vnode *); 867 static void add_to_journal(struct worklist *); 868 static void remove_from_journal(struct worklist *); 869 static void softdep_process_journal(struct mount *, int); 870 static struct jremref *newjremref(struct dirrem *, struct inode *, 871 struct inode *ip, off_t, nlink_t); 872 static struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t, 873 uint16_t); 874 static inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t, 875 uint16_t); 876 static inline struct jsegdep *inoref_jseg(struct inoref *); 877 static struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t); 878 static struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t, 879 ufs2_daddr_t, int); 880 static struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *, 881 ufs2_daddr_t, long, ufs_lbn_t); 882 static struct freework *newfreework(struct ufsmount *, struct freeblks *, 883 struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int); 884 static void jwait(struct worklist *wk); 885 static struct inodedep *inodedep_lookup_ip(struct inode *); 886 static int bmsafemap_rollbacks(struct bmsafemap *); 887 static struct freefile *handle_bufwait(struct inodedep *, struct workhead *); 888 static void handle_jwork(struct workhead *); 889 static struct mkdir *setup_newdir(struct diradd *, 
ino_t, ino_t, struct buf *, 890 struct mkdir **); 891 static struct jblocks *jblocks_create(void); 892 static ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *); 893 static void jblocks_free(struct jblocks *, struct mount *, int); 894 static void jblocks_destroy(struct jblocks *); 895 static void jblocks_add(struct jblocks *, ufs2_daddr_t, int); 896 897 /* 898 * Exported softdep operations. 899 */ 900 static void softdep_disk_io_initiation(struct buf *); 901 static void softdep_disk_write_complete(struct buf *); 902 static void softdep_deallocate_dependencies(struct buf *); 903 static int softdep_count_dependencies(struct buf *bp, int); 904 905 static struct mtx lk; 906 MTX_SYSINIT(softdep_lock, &lk, "Softdep Lock", MTX_DEF); 907 908 #define TRY_ACQUIRE_LOCK(lk) mtx_trylock(lk) 909 #define ACQUIRE_LOCK(lk) mtx_lock(lk) 910 #define FREE_LOCK(lk) mtx_unlock(lk) 911 912 #define BUF_AREC(bp) lockallowrecurse(&(bp)->b_lock) 913 #define BUF_NOREC(bp) lockdisablerecurse(&(bp)->b_lock) 914 915 /* 916 * Worklist queue management. 917 * These routines require that the lock be held. 
918 */ 919 #ifndef /* NOT */ DEBUG 920 #define WORKLIST_INSERT(head, item) do { \ 921 (item)->wk_state |= ONWORKLIST; \ 922 LIST_INSERT_HEAD(head, item, wk_list); \ 923 } while (0) 924 #define WORKLIST_REMOVE(item) do { \ 925 (item)->wk_state &= ~ONWORKLIST; \ 926 LIST_REMOVE(item, wk_list); \ 927 } while (0) 928 #define WORKLIST_INSERT_UNLOCKED WORKLIST_INSERT 929 #define WORKLIST_REMOVE_UNLOCKED WORKLIST_REMOVE 930 931 #else /* DEBUG */ 932 static void worklist_insert(struct workhead *, struct worklist *, int); 933 static void worklist_remove(struct worklist *, int); 934 935 #define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1) 936 #define WORKLIST_INSERT_UNLOCKED(head, item) worklist_insert(head, item, 0) 937 #define WORKLIST_REMOVE(item) worklist_remove(item, 1) 938 #define WORKLIST_REMOVE_UNLOCKED(item) worklist_remove(item, 0) 939 940 static void 941 worklist_insert(head, item, locked) 942 struct workhead *head; 943 struct worklist *item; 944 int locked; 945 { 946 947 if (locked) 948 mtx_assert(&lk, MA_OWNED); 949 if (item->wk_state & ONWORKLIST) 950 panic("worklist_insert: %p %s(0x%X) already on list", 951 item, TYPENAME(item->wk_type), item->wk_state); 952 item->wk_state |= ONWORKLIST; 953 LIST_INSERT_HEAD(head, item, wk_list); 954 } 955 956 static void 957 worklist_remove(item, locked) 958 struct worklist *item; 959 int locked; 960 { 961 962 if (locked) 963 mtx_assert(&lk, MA_OWNED); 964 if ((item->wk_state & ONWORKLIST) == 0) 965 panic("worklist_remove: %p %s(0x%X) not on list", 966 item, TYPENAME(item->wk_type), item->wk_state); 967 item->wk_state &= ~ONWORKLIST; 968 LIST_REMOVE(item, wk_list); 969 } 970 #endif /* DEBUG */ 971 972 /* 973 * Merge two jsegdeps keeping only the oldest one as newer references 974 * can't be discarded until after older references. 
975 */ 976 static inline struct jsegdep * 977 jsegdep_merge(struct jsegdep *one, struct jsegdep *two) 978 { 979 struct jsegdep *swp; 980 981 if (two == NULL) 982 return (one); 983 984 if (one->jd_seg->js_seq > two->jd_seg->js_seq) { 985 swp = one; 986 one = two; 987 two = swp; 988 } 989 WORKLIST_REMOVE(&two->jd_list); 990 free_jsegdep(two); 991 992 return (one); 993 } 994 995 /* 996 * If two freedeps are compatible free one to reduce list size. 997 */ 998 static inline struct freedep * 999 freedep_merge(struct freedep *one, struct freedep *two) 1000 { 1001 if (two == NULL) 1002 return (one); 1003 1004 if (one->fd_freework == two->fd_freework) { 1005 WORKLIST_REMOVE(&two->fd_list); 1006 free_freedep(two); 1007 } 1008 return (one); 1009 } 1010 1011 /* 1012 * Move journal work from one list to another. Duplicate freedeps and 1013 * jsegdeps are coalesced to keep the lists as small as possible. 1014 */ 1015 static void 1016 jwork_move(dst, src) 1017 struct workhead *dst; 1018 struct workhead *src; 1019 { 1020 struct freedep *freedep; 1021 struct jsegdep *jsegdep; 1022 struct worklist *wkn; 1023 struct worklist *wk; 1024 1025 KASSERT(dst != src, 1026 ("jwork_move: dst == src")); 1027 freedep = NULL; 1028 jsegdep = NULL; 1029 LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) { 1030 if (wk->wk_type == D_JSEGDEP) 1031 jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep); 1032 if (wk->wk_type == D_FREEDEP) 1033 freedep = freedep_merge(WK_FREEDEP(wk), freedep); 1034 } 1035 1036 mtx_assert(&lk, MA_OWNED); 1037 while ((wk = LIST_FIRST(src)) != NULL) { 1038 WORKLIST_REMOVE(wk); 1039 WORKLIST_INSERT(dst, wk); 1040 if (wk->wk_type == D_JSEGDEP) { 1041 jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep); 1042 continue; 1043 } 1044 if (wk->wk_type == D_FREEDEP) 1045 freedep = freedep_merge(WK_FREEDEP(wk), freedep); 1046 } 1047 } 1048 1049 /* 1050 * Routines for tracking and managing workitems. 
1051 */ 1052 static void workitem_free(struct worklist *, int); 1053 static void workitem_alloc(struct worklist *, int, struct mount *); 1054 1055 #define WORKITEM_FREE(item, type) workitem_free((struct worklist *)(item), (type)) 1056 1057 static void 1058 workitem_free(item, type) 1059 struct worklist *item; 1060 int type; 1061 { 1062 struct ufsmount *ump; 1063 mtx_assert(&lk, MA_OWNED); 1064 1065 #ifdef DEBUG 1066 if (item->wk_state & ONWORKLIST) 1067 panic("workitem_free: %s(0x%X) still on list", 1068 TYPENAME(item->wk_type), item->wk_state); 1069 if (item->wk_type != type) 1070 panic("workitem_free: type mismatch %s != %s", 1071 TYPENAME(item->wk_type), TYPENAME(type)); 1072 #endif 1073 ump = VFSTOUFS(item->wk_mp); 1074 if (--ump->softdep_deps == 0 && ump->softdep_req) 1075 wakeup(&ump->softdep_deps); 1076 dep_current[type]--; 1077 free(item, DtoM(type)); 1078 } 1079 1080 static void 1081 workitem_alloc(item, type, mp) 1082 struct worklist *item; 1083 int type; 1084 struct mount *mp; 1085 { 1086 item->wk_type = type; 1087 item->wk_mp = mp; 1088 item->wk_state = 0; 1089 ACQUIRE_LOCK(&lk); 1090 dep_current[type]++; 1091 dep_total[type]++; 1092 VFSTOUFS(mp)->softdep_deps++; 1093 VFSTOUFS(mp)->softdep_accdeps++; 1094 FREE_LOCK(&lk); 1095 } 1096 1097 /* 1098 * Workitem queue management 1099 */ 1100 static int max_softdeps; /* maximum number of structs before slowdown */ 1101 static int maxindirdeps = 50; /* max number of indirdeps before slowdown */ 1102 static int tickdelay = 2; /* number of ticks to pause during slowdown */ 1103 static int proc_waiting; /* tracks whether we have a timeout posted */ 1104 static int *stat_countp; /* statistic to count in proc_waiting timeout */ 1105 static struct callout softdep_callout; 1106 static int req_pending; 1107 static int req_clear_inodedeps; /* syncer process flush some inodedeps */ 1108 #define FLUSH_INODES 1 1109 static int req_clear_remove; /* syncer process flush some freeblks */ 1110 #define FLUSH_REMOVE 2 1111 
#define	FLUSH_REMOVE_WAIT	3
static long num_freeblkdep;	/* number of freeblks workitems allocated */

/*
 * runtime statistics
 */
static int stat_worklist_push;	/* number of worklist cleanups */
static int stat_blk_limit_push;	/* number of times block limit neared */
static int stat_ino_limit_push;	/* number of times inode limit neared */
static int stat_blk_limit_hit;	/* number of times block slowdown imposed */
static int stat_ino_limit_hit;	/* number of times inode slowdown imposed */
static int stat_sync_limit_hit;	/* number of synchronous slowdowns imposed */
static int stat_indir_blk_ptrs;	/* bufs redirtied as indir ptrs not written */
static int stat_inode_bitmap;	/* bufs redirtied as inode bitmap not written */
static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
static int stat_dir_entry;	/* bufs redirtied as dir entry cannot write */
static int stat_jaddref;	/* bufs redirtied as ino bitmap can not write */
static int stat_jnewblk;	/* bufs redirtied as blk bitmap can not write */
static int stat_journal_min;	/* Times hit journal min threshold */
static int stat_journal_low;	/* Times hit journal low threshold */
static int stat_journal_wait;	/* Times blocked in jwait(). */
static int stat_jwait_filepage;	/* Times blocked in jwait() for filepage. */
static int stat_jwait_freeblks;	/* Times blocked in jwait() for freeblks. */
static int stat_jwait_inode;	/* Times blocked in jwait() for inodes. */
static int stat_jwait_newblk;	/* Times blocked in jwait() for newblks. */

SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW,
    &max_softdeps, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW,
    &tickdelay, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, maxindirdeps, CTLFLAG_RW,
    &maxindirdeps, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW,
    &stat_worklist_push, 0,"");
SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW,
    &stat_blk_limit_push, 0,"");
SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW,
    &stat_ino_limit_push, 0,"");
SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW,
    &stat_blk_limit_hit, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW,
    &stat_ino_limit_hit, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW,
    &stat_sync_limit_hit, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW,
    &stat_indir_blk_ptrs, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW,
    &stat_inode_bitmap, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW,
    &stat_direct_blk_ptrs, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW,
    &stat_dir_entry, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback, CTLFLAG_RW,
    &stat_jaddref, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback, CTLFLAG_RW,
    &stat_jnewblk, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low, CTLFLAG_RW,
    &stat_journal_low, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min, CTLFLAG_RW,
    &stat_journal_min, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait, CTLFLAG_RW,
    &stat_journal_wait, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage, CTLFLAG_RW,
    &stat_jwait_filepage, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks, CTLFLAG_RW,
    &stat_jwait_freeblks, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode, CTLFLAG_RW,
    &stat_jwait_inode, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk, CTLFLAG_RW,
    &stat_jwait_newblk, 0, "");

SYSCTL_DECL(_vfs_ffs);

LIST_HEAD(bmsafemap_hashhead, bmsafemap) *bmsafemap_hashtbl;
static u_long	bmsafemap_hash;	/* size of hash table - 1 */

static int compute_summary_at_mount = 0;	/* Whether to recompute the summary at mount time */
SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
    &compute_summary_at_mount, 0, "Recompute summary at mount");

/* The dedicated "softdepflush" kernel thread, started at boot. */
static struct proc *softdepproc;
static struct kproc_desc softdep_kp = {
	"softdepflush",
	softdep_flush,
	&softdepproc
};
SYSINIT(sdproc, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start,
    &softdep_kp);

/*
 * Main loop of the softdep flush thread.  Services the global
 * req_clear_inodedeps/req_clear_remove requests, then walks the
 * mountlist processing the worklist of every MNT_SOFTDEP filesystem.
 * Sleeps on req_pending (for at most one second) when a pass makes
 * progress on everything outstanding; worklist_speedup() wakes it.
 * Never returns.
 */
static void
softdep_flush(void)
{
	struct mount *nmp;
	struct mount *mp;
	struct ufsmount *ump;
	struct thread *td;
	int remaining;
	int progress;
	int vfslocked;

	td = curthread;
	td->td_pflags |= TDP_NORUNNINGBUF;

	for (;;) {
		kproc_suspend_check(softdepproc);
		vfslocked = VFS_LOCK_GIANT((struct mount *)NULL);
		ACQUIRE_LOCK(&lk);
		/*
		 * If requested, try removing inode or removal dependencies.
		 */
		if (req_clear_inodedeps) {
			clear_inodedeps(td);
			req_clear_inodedeps -= 1;
			wakeup_one(&proc_waiting);
		}
		if (req_clear_remove) {
			clear_remove(td);
			req_clear_remove -= 1;
			wakeup_one(&proc_waiting);
		}
		FREE_LOCK(&lk);
		VFS_UNLOCK_GIANT(vfslocked);
		remaining = progress = 0;
		mtx_lock(&mountlist_mtx);
		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
			nmp = TAILQ_NEXT(mp, mnt_list);
			if ((mp->mnt_flag & MNT_SOFTDEP) == 0)
				continue;
			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
				continue;
			/*
			 * vfs_busy() with MBF_MNTLSTLOCK dropped the
			 * mountlist mutex; it is reacquired below and
			 * 'nmp' refetched since it may be stale.
			 */
			vfslocked = VFS_LOCK_GIANT(mp);
			progress += softdep_process_worklist(mp, 0);
			ump = VFSTOUFS(mp);
			remaining += ump->softdep_on_worklist -
				ump->softdep_on_worklist_inprogress;
			VFS_UNLOCK_GIANT(vfslocked);
			mtx_lock(&mountlist_mtx);
			nmp = TAILQ_NEXT(mp, mnt_list);
			vfs_unbusy(mp);
		}
		mtx_unlock(&mountlist_mtx);
		/* Keep running while work remains and we are gaining. */
		if (remaining && progress)
			continue;
		ACQUIRE_LOCK(&lk);
		if (!req_pending)
			msleep(&req_pending, &lk, PVM, "sdflush", hz);
		req_pending = 0;
		FREE_LOCK(&lk);
	}
}

/*
 * Wake the softdep flush thread if it is not already scheduled to run.
 * The softdep lock must be held.
 */
static void
worklist_speedup(void)
{
	mtx_assert(&lk, MA_OWNED);
	if (req_pending == 0) {
		req_pending = 1;
		wakeup(&req_pending);
	}
}

/*
 * Kick all the background cleanup machinery: the softdep flush thread,
 * the buf daemon, and the syncer.  Returns the syncer's speedup result.
 */
static int
softdep_speedup(void)
{

	worklist_speedup();
	bd_speedup();
	return speedup_syncer();
}

/*
 * Add an item to the end of the work queue.
 * This routine requires that the lock be held.
 * This is the only routine that adds items to the list.
 * The following routine is the only one that removes items
 * and does so in order from first to last.
 */
static void
add_to_worklist(wk, nodelay)
	struct worklist *wk;
	int nodelay;
{
	struct ufsmount *ump;

	mtx_assert(&lk, MA_OWNED);
	ump = VFSTOUFS(wk->wk_mp);
	if (wk->wk_state & ONWORKLIST)
		panic("add_to_worklist: %s(0x%X) already on list",
		    TYPENAME(wk->wk_type), wk->wk_state);
	wk->wk_state |= ONWORKLIST;
	/* Append to the tail to preserve FIFO processing order. */
	if (LIST_EMPTY(&ump->softdep_workitem_pending))
		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
	else
		LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
	ump->softdep_worklist_tail = wk;
	ump->softdep_on_worklist += 1;
	/* 'nodelay' callers cannot wait for the periodic flush. */
	if (nodelay)
		worklist_speedup();
}

/*
 * Remove the item to be processed. If we are removing the last
 * item on the list, we need to recalculate the tail pointer.
 */
static void
remove_from_worklist(wk)
	struct worklist *wk;
{
	struct ufsmount *ump;
	struct worklist *wkend;

	ump = VFSTOUFS(wk->wk_mp);
	WORKLIST_REMOVE(wk);
	if (wk == ump->softdep_worklist_tail) {
		/* Walk forward to find the new last element. */
		LIST_FOREACH(wkend, &ump->softdep_workitem_pending, wk_list)
			if (LIST_NEXT(wkend, wk_list) == NULL)
				break;
		ump->softdep_worklist_tail = wkend;
	}
	ump->softdep_on_worklist -= 1;
}

/*
 * Process that runs once per second to handle items in the background queue.
 *
 * Note that we ensure that everything is done in the order in which they
 * appear in the queue. The code below depends on this property to ensure
 * that blocks of a file are freed before the inode itself is freed. This
 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
 * until all the old ones have been purged from the dependency lists.
 *
 * Returns the number of matched work items processed.  When 'full' is
 * zero, processing is bounded to roughly one second per call so other
 * mount points do not starve.
 */
int
softdep_process_worklist(mp, full)
	struct mount *mp;
	int full;
{
	struct thread *td = curthread;
	int cnt, matchcnt;
	struct ufsmount *ump;
	long starttime;

	KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp"));
	/*
	 * Record the process identifier of our caller so that we can give
	 * this process preferential treatment in request_cleanup below.
	 */
	matchcnt = 0;
	ump = VFSTOUFS(mp);
	ACQUIRE_LOCK(&lk);
	starttime = time_second;
	softdep_process_journal(mp, full?MNT_WAIT:0);
	while (ump->softdep_on_worklist > 0) {
		if ((cnt = process_worklist_item(mp, LK_NOWAIT)) == -1)
			break;
		else
			matchcnt += cnt;
		/*
		 * If requested, try removing inode or removal dependencies.
		 */
		if (req_clear_inodedeps) {
			clear_inodedeps(td);
			req_clear_inodedeps -= 1;
			wakeup_one(&proc_waiting);
		}
		if (req_clear_remove) {
			clear_remove(td);
			req_clear_remove -= 1;
			wakeup_one(&proc_waiting);
		}
		/*
		 * We do not generally want to stop for buffer space, but if
		 * we are really being a buffer hog, we will stop and wait.
		 */
		if (should_yield()) {
			FREE_LOCK(&lk);
			kern_yield(-1);
			bwillwrite();
			ACQUIRE_LOCK(&lk);
		}
		/*
		 * Never allow processing to run for more than one
		 * second. Otherwise the other mountpoints may get
		 * excessively backlogged.
		 */
		if (!full && starttime != time_second)
			break;
	}
	if (full == 0)
		journal_unsuspend(ump);
	FREE_LOCK(&lk);
	return (matchcnt);
}

/*
 * Process all removes associated with a vnode if we are running out of
 * journal space.  Any other process which attempts to flush these will
 * be unable as we have the vnodes locked.
 */
static void
process_removes(vp)
	struct vnode *vp;
{
	struct inodedep *inodedep;
	struct dirrem *dirrem;
	struct mount *mp;
	ino_t inum;

	mtx_assert(&lk, MA_OWNED);

	mp = vp->v_mount;
	inum = VTOI(vp)->i_number;
	for (;;) {
		if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
			return;
		/* Find a dirrem that is ready to run (complete and queued). */
		LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext)
			if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) ==
			    (COMPLETE | ONWORKLIST))
				break;
		if (dirrem == NULL)
			return;
		/*
		 * If another thread is trying to lock this vnode it will
		 * fail but we must wait for it to do so before we can
		 * proceed.
		 */
		if (dirrem->dm_state & INPROGRESS) {
			dirrem->dm_state |= IOWAITING;
			msleep(&dirrem->dm_list, &lk, PVM, "pwrwait", 0);
			continue;
		}
		remove_from_worklist(&dirrem->dm_list);
		FREE_LOCK(&lk);
		if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
			panic("process_removes: suspended filesystem");
		handle_workitem_remove(dirrem, vp);
		vn_finished_secondary_write(mp);
		ACQUIRE_LOCK(&lk);
	}
}

/*
 * Process one item on the worklist.
 *
 * Returns -1 when nothing can be processed (copy-on-write recursion or
 * no eligible item), otherwise the number of matched items (1).
 * Called and returns with the softdep lock held; drops it while the
 * work item handler runs.
 */
static int
process_worklist_item(mp, flags)
	struct mount *mp;
	int flags;
{
	struct worklist *wk;
	struct ufsmount *ump;
	struct vnode *vp;
	int matchcnt = 0;

	mtx_assert(&lk, MA_OWNED);
	KASSERT(mp != NULL, ("process_worklist_item: NULL mp"));
	/*
	 * If we are being called because of a process doing a
	 * copy-on-write, then it is not safe to write as we may
	 * recurse into the copy-on-write routine.
	 */
	if (curthread->td_pflags & TDP_COWINPROGRESS)
		return (-1);
	/*
	 * Normally we just process each item on the worklist in order.
	 * However, if we are in a situation where we cannot lock any
	 * inodes, we have to skip over any dirrem requests whose
	 * vnodes are resident and locked.
	 */
	vp = NULL;
	ump = VFSTOUFS(mp);
	LIST_FOREACH(wk, &ump->softdep_workitem_pending, wk_list) {
		if (wk->wk_state & INPROGRESS)
			continue;
		if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM)
			break;
		/* Mark busy while we drop the lock to fetch the vnode. */
		wk->wk_state |= INPROGRESS;
		ump->softdep_on_worklist_inprogress++;
		FREE_LOCK(&lk);
		ffs_vgetf(mp, WK_DIRREM(wk)->dm_oldinum,
		    LK_NOWAIT | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ);
		ACQUIRE_LOCK(&lk);
		if (wk->wk_state & IOWAITING) {
			wk->wk_state &= ~IOWAITING;
			wakeup(wk);
		}
		wk->wk_state &= ~INPROGRESS;
		ump->softdep_on_worklist_inprogress--;
		if (vp != NULL)
			break;
	}
	if (wk == 0)
		return (-1);
	remove_from_worklist(wk);
	FREE_LOCK(&lk);
	if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
		panic("process_worklist_item: suspended filesystem");
	matchcnt++;
	switch (wk->wk_type) {

	case D_DIRREM:
		/* removal of a directory entry */
		handle_workitem_remove(WK_DIRREM(wk), vp);
		if (vp)
			vput(vp);
		break;

	case D_FREEBLKS:
		/* releasing blocks and/or fragments from a file */
		handle_workitem_freeblocks(WK_FREEBLKS(wk), flags & LK_NOWAIT);
		break;

	case D_FREEFRAG:
		/* releasing a fragment when replaced as a file grows */
		handle_workitem_freefrag(WK_FREEFRAG(wk));
		break;

	case D_FREEFILE:
		/* releasing an inode when its link count drops to 0 */
		handle_workitem_freefile(WK_FREEFILE(wk));
		break;

	case D_FREEWORK:
		/* Final block in an indirect was freed. */
		handle_workitem_indirblk(WK_FREEWORK(wk));
		break;

	default:
		panic("%s_process_worklist: Unknown type %s",
		    "softdep", TYPENAME(wk->wk_type));
		/* NOTREACHED */
	}
	vn_finished_secondary_write(mp);
	ACQUIRE_LOCK(&lk);
	return (matchcnt);
}

/*
 * Move dependencies from one buffer to another.
 *
 * Returns non-zero when a moved bmsafemap has rollbacks pending, in
 * which case the new buffer must be redirtied by the caller.
 */
int
softdep_move_dependencies(oldbp, newbp)
	struct buf *oldbp;
	struct buf *newbp;
{
	struct worklist *wk, *wktail;
	int dirty;

	dirty = 0;
	wktail = NULL;
	ACQUIRE_LOCK(&lk);
	while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
		LIST_REMOVE(wk, wk_list);
		if (wk->wk_type == D_BMSAFEMAP &&
		    bmsafemap_rollbacks(WK_BMSAFEMAP(wk)))
			dirty = 1;
		/* Append, preserving the original dependency order. */
		if (wktail == 0)
			LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
		else
			LIST_INSERT_AFTER(wktail, wk, wk_list);
		wktail = wk;
	}
	FREE_LOCK(&lk);

	return (dirty);
}

/*
 * Purge the work list of all items associated with a particular mount point.
 *
 * The count of processed items is returned through 'countp'; the return
 * value is any error from fsync'ing the underlying device.
 */
int
softdep_flushworklist(oldmnt, countp, td)
	struct mount *oldmnt;
	int *countp;
	struct thread *td;
{
	struct vnode *devvp;
	int count, error = 0;
	struct ufsmount *ump;

	/*
	 * Alternately flush the block device associated with the mount
	 * point and process any dependencies that the flushing
	 * creates. We continue until no more worklist dependencies
	 * are found.
	 */
	*countp = 0;
	ump = VFSTOUFS(oldmnt);
	devvp = ump->um_devvp;
	while ((count = softdep_process_worklist(oldmnt, 1)) > 0) {
		*countp += count;
		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
		error = VOP_FSYNC(devvp, MNT_WAIT, td);
		VOP_UNLOCK(devvp, 0);
		if (error)
			break;
	}
	return (error);
}

/*
 * Wait (briefly) for all outstanding dependencies on a mount point to
 * drain.  Returns EBUSY if they have not drained after 10 attempts.
 */
int
softdep_waitidle(struct mount *mp)
{
	struct ufsmount *ump;
	int error;
	int i;

	ump = VFSTOUFS(mp);
	ACQUIRE_LOCK(&lk);
	for (i = 0; i < 10 && ump->softdep_deps; i++) {
		/* softdep_req makes workitem_free wake us as deps drain. */
		ump->softdep_req = 1;
		if (ump->softdep_on_worklist)
			panic("softdep_waitidle: work added after flush.");
		msleep(&ump->softdep_deps, &lk, PVM, "softdeps", 1);
	}
	ump->softdep_req = 0;
	FREE_LOCK(&lk);
	error = 0;
	/*
	 * NOTE(review): i == 10 is treated as failure even if the final
	 * sleep cleared softdep_deps -- confirm this is intended.
	 */
	if (i == 10) {
		error = EBUSY;
		printf("softdep_waitidle: Failed to flush worklist for %p\n",
		    mp);
	}

	return (error);
}

/*
 * Flush all vnodes and worklist items associated with a specified mount point.
 */
int
softdep_flushfiles(oldmnt, flags, td)
	struct mount *oldmnt;
	int flags;
	struct thread *td;
{
	int error, depcount, loopcnt, retry_flush_count, retry;

	loopcnt = 10;
	retry_flush_count = 3;
retry_flush:
	error = 0;

	/*
	 * Alternately flush the vnodes associated with the mount
	 * point and process any dependencies that the flushing
	 * creates. In theory, this loop can happen at most twice,
	 * but we give it a few extra just to be sure.
	 */
	for (; loopcnt > 0; loopcnt--) {
		/*
		 * Do another flush in case any vnodes were brought in
		 * as part of the cleanup operations.
		 */
		if ((error = ffs_flushfiles(oldmnt, flags, td)) != 0)
			break;
		if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 ||
		    depcount == 0)
			break;
	}
	/*
	 * If we are unmounting then it is an error to fail. If we
	 * are simply trying to downgrade to read-only, then filesystem
	 * activity can keep us busy forever, so we just fail with EBUSY.
	 */
	if (loopcnt == 0) {
		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
			panic("softdep_flushfiles: looping");
		error = EBUSY;
	}
	if (!error)
		error = softdep_waitidle(oldmnt);
	if (!error) {
		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) {
			retry = 0;
			MNT_ILOCK(oldmnt);
			KASSERT((oldmnt->mnt_kern_flag & MNTK_NOINSMNTQ) != 0,
			    ("softdep_flushfiles: !MNTK_NOINSMNTQ"));
			/*
			 * Vnodes appeared during the flush; retry a
			 * bounded number of times before giving up.
			 */
			if (oldmnt->mnt_nvnodelistsize > 0) {
				if (--retry_flush_count > 0) {
					retry = 1;
					loopcnt = 3;
				} else
					error = EBUSY;
			}
			MNT_IUNLOCK(oldmnt);
			if (retry)
				goto retry_flush;
		}
	}
	return (error);
}

/*
 * Structure hashing.
 *
 * There are three types of structures that can be looked up:
 *	1) pagedep structures identified by mount point, inode number,
 *	   and logical block.
 *	2) inodedep structures identified by mount point and inode number.
 *	3) newblk structures identified by mount point and
 *	   physical block number.
 *
 * The "pagedep" and "inodedep" dependency structures are hashed
 * separately from the file blocks and inodes to which they correspond.
 * This separation helps when the in-memory copy of an inode or
 * file block must be replaced. It also obviates the need to access
 * an inode or file page when simply updating (or de-allocating)
 * dependency structures.
 * Lookup of newblk structures is needed to
 * find newly allocated blocks when trying to associate them with
 * their allocdirect or allocindir structure.
 *
 * The lookup routines optionally create and hash a new instance when
 * an existing entry is not found.
 */
#define DEPALLOC	0x0001	/* allocate structure if lookup fails */
#define NODELAY		0x0002	/* cannot do background work */

/*
 * Structures and routines associated with pagedep caching.
 */
LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
u_long	pagedep_hash;		/* size of hash table - 1 */
#define	PAGEDEP_HASH(mp, inum, lbn) \
	(&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
	    pagedep_hash])

/*
 * Search one hash chain for a pagedep matching <mp, ino, lbn>.  On a
 * hit, *pagedeppp is set and 1 is returned -- except that a DEPALLOC
 * caller gets 0 when the entry is not yet on a buffer worklist, which
 * tells pagedep_lookup's caller the pagedep needs to be attached.
 */
static int
pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp)
	struct pagedep_hashhead *pagedephd;
	ino_t ino;
	ufs_lbn_t lbn;
	struct mount *mp;
	int flags;
	struct pagedep **pagedeppp;
{
	struct pagedep *pagedep;

	LIST_FOREACH(pagedep, pagedephd, pd_hash)
		if (ino == pagedep->pd_ino &&
		    lbn == pagedep->pd_lbn &&
		    mp == pagedep->pd_list.wk_mp)
			break;
	if (pagedep) {
		*pagedeppp = pagedep;
		if ((flags & DEPALLOC) != 0 &&
		    (pagedep->pd_state & ONWORKLIST) == 0)
			return (0);
		return (1);
	}
	*pagedeppp = NULL;
	return (0);
}
/*
 * Look up a pagedep. Return 1 if found, 0 if not found or found
 * when asked to allocate but not associated with any buffer.
 * If not found, allocate if DEPALLOC flag is passed.
 * Found or allocated entry is returned in pagedeppp.
 * This routine must be called with splbio interrupts blocked.
 */
static int
pagedep_lookup(mp, ino, lbn, flags, pagedeppp)
	struct mount *mp;
	ino_t ino;
	ufs_lbn_t lbn;
	int flags;
	struct pagedep **pagedeppp;
{
	struct pagedep *pagedep;
	struct pagedep_hashhead *pagedephd;
	int ret;
	int i;

	mtx_assert(&lk, MA_OWNED);
	pagedephd = PAGEDEP_HASH(mp, ino, lbn);

	ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp);
	if (*pagedeppp || (flags & DEPALLOC) == 0)
		return (ret);
	/*
	 * Drop the lock to allocate (malloc may sleep), then re-search
	 * in case another thread inserted the entry while we slept.
	 */
	FREE_LOCK(&lk);
	pagedep = malloc(sizeof(struct pagedep),
	    M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
	workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
	ACQUIRE_LOCK(&lk);
	ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp);
	if (*pagedeppp) {
		/* Lost the race; discard our copy and use the winner's. */
		WORKITEM_FREE(pagedep, D_PAGEDEP);
		return (ret);
	}
	pagedep->pd_ino = ino;
	pagedep->pd_lbn = lbn;
	LIST_INIT(&pagedep->pd_dirremhd);
	LIST_INIT(&pagedep->pd_pendinghd);
	for (i = 0; i < DAHASHSZ; i++)
		LIST_INIT(&pagedep->pd_diraddhd[i]);
	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
	*pagedeppp = pagedep;
	return (0);
}

/*
 * Structures and routines associated with inodedep caching.
 */
LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
static u_long	inodedep_hash;	/* size of hash table - 1 */
static long	num_inodedep;	/* number of inodedep allocated */
#define	INODEDEP_HASH(fs, inum) \
      (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])

/*
 * Search one hash chain for an inodedep matching <fs, inum>.
 * Returns 1 with *inodedeppp set on a hit, 0 with it NULL otherwise.
 */
static int
inodedep_find(inodedephd, fs, inum, inodedeppp)
	struct inodedep_hashhead *inodedephd;
	struct fs *fs;
	ino_t inum;
	struct inodedep **inodedeppp;
{
	struct inodedep *inodedep;

	LIST_FOREACH(inodedep, inodedephd, id_hash)
		if (inum == inodedep->id_ino && fs == inodedep->id_fs)
			break;
	if (inodedep) {
		*inodedeppp = inodedep;
		return (1);
	}
	*inodedeppp = NULL;

	return (0);
}
/*
 * Look up an inodedep. Return 1 if found, 0 if not found.
 * If not found, allocate if DEPALLOC flag is passed.
 * Found or allocated entry is returned in inodedeppp.
 * This routine must be called with splbio interrupts blocked.
 */
static int
inodedep_lookup(mp, inum, flags, inodedeppp)
	struct mount *mp;
	ino_t inum;
	int flags;
	struct inodedep **inodedeppp;
{
	struct inodedep *inodedep;
	struct inodedep_hashhead *inodedephd;
	struct fs *fs;

	mtx_assert(&lk, MA_OWNED);
	fs = VFSTOUFS(mp)->um_fs;
	inodedephd = INODEDEP_HASH(fs, inum);

	if (inodedep_find(inodedephd, fs, inum, inodedeppp))
		return (1);
	if ((flags & DEPALLOC) == 0)
		return (0);
	/*
	 * If we are over our limit, try to improve the situation.
	 * NODELAY callers cannot afford the cleanup pause.
	 */
	if (num_inodedep > max_softdeps && (flags & NODELAY) == 0)
		request_cleanup(mp, FLUSH_INODES);
	/*
	 * Drop the lock to allocate (malloc may sleep), then re-search
	 * in case another thread inserted the entry while we slept.
	 */
	FREE_LOCK(&lk);
	inodedep = malloc(sizeof(struct inodedep),
		M_INODEDEP, M_SOFTDEP_FLAGS);
	workitem_alloc(&inodedep->id_list, D_INODEDEP, mp);
	ACQUIRE_LOCK(&lk);
	if (inodedep_find(inodedephd, fs, inum, inodedeppp)) {
		/* Lost the race; discard our copy and use the winner's. */
		WORKITEM_FREE(inodedep, D_INODEDEP);
		return (1);
	}
	num_inodedep += 1;
	inodedep->id_fs = fs;
	inodedep->id_ino = inum;
	inodedep->id_state = ALLCOMPLETE;
	inodedep->id_nlinkdelta = 0;
	inodedep->id_savedino1 = NULL;
	inodedep->id_savedsize = -1;
	inodedep->id_savedextsize = -1;
	inodedep->id_savednlink = -1;
	inodedep->id_bmsafemap = NULL;
	inodedep->id_mkdiradd = NULL;
	LIST_INIT(&inodedep->id_dirremhd);
	LIST_INIT(&inodedep->id_pendinghd);
	LIST_INIT(&inodedep->id_inowait);
	LIST_INIT(&inodedep->id_bufwait);
	TAILQ_INIT(&inodedep->id_inoreflst);
	TAILQ_INIT(&inodedep->id_inoupdt);
	TAILQ_INIT(&inodedep->id_newinoupdt);
	TAILQ_INIT(&inodedep->id_extupdt);
	TAILQ_INIT(&inodedep->id_newextupdt);
	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
	*inodedeppp = inodedep;
	return (0);
}

/*
 * Structures and routines associated with newblk caching.
1905 */ 1906 LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl; 1907 u_long newblk_hash; /* size of hash table - 1 */ 1908 #define NEWBLK_HASH(fs, inum) \ 1909 (&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash]) 1910 1911 static int 1912 newblk_find(newblkhd, mp, newblkno, flags, newblkpp) 1913 struct newblk_hashhead *newblkhd; 1914 struct mount *mp; 1915 ufs2_daddr_t newblkno; 1916 int flags; 1917 struct newblk **newblkpp; 1918 { 1919 struct newblk *newblk; 1920 1921 LIST_FOREACH(newblk, newblkhd, nb_hash) { 1922 if (newblkno != newblk->nb_newblkno) 1923 continue; 1924 if (mp != newblk->nb_list.wk_mp) 1925 continue; 1926 /* 1927 * If we're creating a new dependency don't match those that 1928 * have already been converted to allocdirects. This is for 1929 * a frag extend. 1930 */ 1931 if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK) 1932 continue; 1933 break; 1934 } 1935 if (newblk) { 1936 *newblkpp = newblk; 1937 return (1); 1938 } 1939 *newblkpp = NULL; 1940 return (0); 1941 } 1942 1943 /* 1944 * Look up a newblk. Return 1 if found, 0 if not found. 1945 * If not found, allocate if DEPALLOC flag is passed. 1946 * Found or allocated entry is returned in newblkpp. 
1947 */ 1948 static int 1949 newblk_lookup(mp, newblkno, flags, newblkpp) 1950 struct mount *mp; 1951 ufs2_daddr_t newblkno; 1952 int flags; 1953 struct newblk **newblkpp; 1954 { 1955 struct newblk *newblk; 1956 struct newblk_hashhead *newblkhd; 1957 1958 newblkhd = NEWBLK_HASH(VFSTOUFS(mp)->um_fs, newblkno); 1959 if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp)) 1960 return (1); 1961 if ((flags & DEPALLOC) == 0) 1962 return (0); 1963 FREE_LOCK(&lk); 1964 newblk = malloc(sizeof(union allblk), M_NEWBLK, 1965 M_SOFTDEP_FLAGS | M_ZERO); 1966 workitem_alloc(&newblk->nb_list, D_NEWBLK, mp); 1967 ACQUIRE_LOCK(&lk); 1968 if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp)) { 1969 WORKITEM_FREE(newblk, D_NEWBLK); 1970 return (1); 1971 } 1972 newblk->nb_freefrag = NULL; 1973 LIST_INIT(&newblk->nb_indirdeps); 1974 LIST_INIT(&newblk->nb_newdirblk); 1975 LIST_INIT(&newblk->nb_jwork); 1976 newblk->nb_state = ATTACHED; 1977 newblk->nb_newblkno = newblkno; 1978 LIST_INSERT_HEAD(newblkhd, newblk, nb_hash); 1979 *newblkpp = newblk; 1980 return (0); 1981 } 1982 1983 /* 1984 * Executed during filesystem system initialization before 1985 * mounting any filesystems. 1986 */ 1987 void 1988 softdep_initialize() 1989 { 1990 1991 LIST_INIT(&mkdirlisthd); 1992 max_softdeps = desiredvnodes * 4; 1993 pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, &pagedep_hash); 1994 inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash); 1995 newblk_hashtbl = hashinit(desiredvnodes / 5, M_NEWBLK, &newblk_hash); 1996 bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP, &bmsafemap_hash); 1997 1998 /* initialise bioops hack */ 1999 bioops.io_start = softdep_disk_io_initiation; 2000 bioops.io_complete = softdep_disk_write_complete; 2001 bioops.io_deallocate = softdep_deallocate_dependencies; 2002 bioops.io_countdeps = softdep_count_dependencies; 2003 2004 /* Initialize the callout with an mtx. 
*/ 2005 callout_init_mtx(&softdep_callout, &lk, 0); 2006 } 2007 2008 /* 2009 * Executed after all filesystems have been unmounted during 2010 * filesystem module unload. 2011 */ 2012 void 2013 softdep_uninitialize() 2014 { 2015 2016 callout_drain(&softdep_callout); 2017 hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash); 2018 hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash); 2019 hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash); 2020 hashdestroy(bmsafemap_hashtbl, M_BMSAFEMAP, bmsafemap_hash); 2021 } 2022 2023 /* 2024 * Called at mount time to notify the dependency code that a 2025 * filesystem wishes to use it. 2026 */ 2027 int 2028 softdep_mount(devvp, mp, fs, cred) 2029 struct vnode *devvp; 2030 struct mount *mp; 2031 struct fs *fs; 2032 struct ucred *cred; 2033 { 2034 struct csum_total cstotal; 2035 struct ufsmount *ump; 2036 struct cg *cgp; 2037 struct buf *bp; 2038 int error, cyl; 2039 2040 MNT_ILOCK(mp); 2041 mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP; 2042 if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) { 2043 mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) | 2044 MNTK_SOFTDEP; 2045 mp->mnt_noasync++; 2046 } 2047 MNT_IUNLOCK(mp); 2048 ump = VFSTOUFS(mp); 2049 LIST_INIT(&ump->softdep_workitem_pending); 2050 LIST_INIT(&ump->softdep_journal_pending); 2051 TAILQ_INIT(&ump->softdep_unlinked); 2052 ump->softdep_worklist_tail = NULL; 2053 ump->softdep_on_worklist = 0; 2054 ump->softdep_deps = 0; 2055 if ((fs->fs_flags & FS_SUJ) && 2056 (error = journal_mount(mp, fs, cred)) != 0) { 2057 printf("Failed to start journal: %d\n", error); 2058 return (error); 2059 } 2060 /* 2061 * When doing soft updates, the counters in the 2062 * superblock may have gotten out of sync. Recomputation 2063 * can take a long time and can be deferred for background 2064 * fsck. However, the old behavior of scanning the cylinder 2065 * groups and recalculating them at mount time is available 2066 * by setting vfs.ffs.compute_summary_at_mount to one. 
2067 */ 2068 if (compute_summary_at_mount == 0 || fs->fs_clean != 0) 2069 return (0); 2070 bzero(&cstotal, sizeof cstotal); 2071 for (cyl = 0; cyl < fs->fs_ncg; cyl++) { 2072 if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)), 2073 fs->fs_cgsize, cred, &bp)) != 0) { 2074 brelse(bp); 2075 return (error); 2076 } 2077 cgp = (struct cg *)bp->b_data; 2078 cstotal.cs_nffree += cgp->cg_cs.cs_nffree; 2079 cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree; 2080 cstotal.cs_nifree += cgp->cg_cs.cs_nifree; 2081 cstotal.cs_ndir += cgp->cg_cs.cs_ndir; 2082 fs->fs_cs(fs, cyl) = cgp->cg_cs; 2083 brelse(bp); 2084 } 2085 #ifdef DEBUG 2086 if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal)) 2087 printf("%s: superblock summary recomputed\n", fs->fs_fsmnt); 2088 #endif 2089 bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal); 2090 return (0); 2091 } 2092 2093 void 2094 softdep_unmount(mp) 2095 struct mount *mp; 2096 { 2097 2098 if (mp->mnt_kern_flag & MNTK_SUJ) 2099 journal_unmount(mp); 2100 } 2101 2102 struct jblocks { 2103 struct jseglst jb_segs; /* TAILQ of current segments. */ 2104 struct jseg *jb_writeseg; /* Next write to complete. */ 2105 struct jextent *jb_extent; /* Extent array. */ 2106 uint64_t jb_nextseq; /* Next sequence number. */ 2107 uint64_t jb_oldestseq; /* Oldest active sequence number. */ 2108 int jb_avail; /* Available extents. */ 2109 int jb_used; /* Last used extent. */ 2110 int jb_head; /* Allocator head. */ 2111 int jb_off; /* Allocator extent offset. */ 2112 int jb_blocks; /* Total disk blocks covered. */ 2113 int jb_free; /* Total disk blocks free. */ 2114 int jb_min; /* Minimum free space. */ 2115 int jb_low; /* Low on space. */ 2116 int jb_age; /* Insertion time of oldest rec. */ 2117 int jb_suspended; /* Did journal suspend writes? */ 2118 }; 2119 2120 struct jextent { 2121 ufs2_daddr_t je_daddr; /* Disk block address. */ 2122 int je_blocks; /* Disk block count. 
 */
};

/*
 * Allocate and initialize an empty journal block allocator.  The
 * extent array starts with room for ten entries; jblocks_add() grows
 * it on demand.
 */
static struct jblocks *
jblocks_create(void)
{
	struct jblocks *jblocks;

	jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO);
	TAILQ_INIT(&jblocks->jb_segs);
	jblocks->jb_avail = 10;
	jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail,
	    M_JBLOCKS, M_WAITOK | M_ZERO);

	return (jblocks);
}

/*
 * Allocate up to 'bytes' of journal space starting at the allocator's
 * current position.  The allocation may be truncated to the remainder
 * of the current extent; the byte count actually granted is returned
 * in '*actual' and the starting disk address is the return value.
 * The allocator cycles through the extents round-robin.
 */
static ufs2_daddr_t
jblocks_alloc(jblocks, bytes, actual)
	struct jblocks *jblocks;
	int bytes;
	int *actual;
{
	ufs2_daddr_t daddr;
	struct jextent *jext;
	int freecnt;
	int blocks;

	blocks = bytes / DEV_BSIZE;
	jext = &jblocks->jb_extent[jblocks->jb_head];
	freecnt = jext->je_blocks - jblocks->jb_off;
	if (freecnt == 0) {
		/* Current extent exhausted; advance (wrapping) to the next. */
		jblocks->jb_off = 0;
		if (++jblocks->jb_head > jblocks->jb_used)
			jblocks->jb_head = 0;
		jext = &jblocks->jb_extent[jblocks->jb_head];
		freecnt = jext->je_blocks;
	}
	if (freecnt > blocks)
		freecnt = blocks;
	*actual = freecnt * DEV_BSIZE;
	daddr = jext->je_daddr + jblocks->jb_off;
	jblocks->jb_off += freecnt;
	jblocks->jb_free -= freecnt;

	return (daddr);
}

/*
 * Return 'bytes' of journal space to the free pool and wake anyone
 * waiting on space, speeding up the worklist if writes were suspended
 * for lack of journal space.
 */
static void
jblocks_free(jblocks, mp, bytes)
	struct jblocks *jblocks;
	struct mount *mp;
	int bytes;
{

	jblocks->jb_free += bytes / DEV_BSIZE;
	if (jblocks->jb_suspended)
		worklist_speedup();
	wakeup(jblocks);
}

/* Release the extent array and the allocator itself. */
static void
jblocks_destroy(jblocks)
	struct jblocks *jblocks;
{

	if (jblocks->jb_extent)
		free(jblocks->jb_extent, M_JBLOCKS);
	free(jblocks, M_JBLOCKS);
}

/*
 * Add 'blocks' disk blocks starting at 'daddr' to the space covered
 * by the journal.  Contiguous runs are coalesced into the last
 * extent; otherwise a new extent is appended, doubling the extent
 * array when it fills.
 */
static void
jblocks_add(jblocks, daddr, blocks)
	struct jblocks *jblocks;
	ufs2_daddr_t daddr;
	int blocks;
{
	struct jextent *jext;

	jblocks->jb_blocks += blocks;
	jblocks->jb_free += blocks;
	jext = &jblocks->jb_extent[jblocks->jb_used];
	/* Adding the first block. */
	if (jext->je_daddr == 0) {
		jext->je_daddr = daddr;
		jext->je_blocks = blocks;
		return;
	}
	/* Extending the last extent. */
	if (jext->je_daddr + jext->je_blocks == daddr) {
		jext->je_blocks += blocks;
		return;
	}
	/* Adding a new extent. */
	if (++jblocks->jb_used == jblocks->jb_avail) {
		jblocks->jb_avail *= 2;
		jext = malloc(sizeof(struct jextent) * jblocks->jb_avail,
		    M_JBLOCKS, M_WAITOK | M_ZERO);
		memcpy(jext, jblocks->jb_extent,
		    sizeof(struct jextent) * jblocks->jb_used);
		free(jblocks->jb_extent, M_JBLOCKS);
		jblocks->jb_extent = jext;
	}
	jext = &jblocks->jb_extent[jblocks->jb_used];
	jext->je_daddr = daddr;
	jext->je_blocks = blocks;
	return;
}

/*
 * Look up the SUJ_FILE journal file in the filesystem root directory
 * and return an exclusively locked vnode for it in '*vpp'.  Returns 0
 * or an errno value from the lookup/vget.
 */
int
softdep_journal_lookup(mp, vpp)
	struct mount *mp;
	struct vnode **vpp;
{
	struct componentname cnp;
	struct vnode *dvp;
	ino_t sujournal;
	int error;

	error = VFS_VGET(mp, ROOTINO, LK_EXCLUSIVE, &dvp);
	if (error)
		return (error);
	bzero(&cnp, sizeof(cnp));
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN;
	cnp.cn_thread = curthread;
	cnp.cn_cred = curthread->td_ucred;
	cnp.cn_pnbuf = SUJ_FILE;
	cnp.cn_nameptr = SUJ_FILE;
	cnp.cn_namelen = strlen(SUJ_FILE);
	error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal);
	vput(dvp);
	if (error != 0)
		return (error);
	error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp);
	return (error);
}

/*
 * Open and verify the journal file.
 */
static int
journal_mount(mp, fs, cred)
	struct mount *mp;
	struct fs *fs;
	struct ucred *cred;
{
	struct jblocks *jblocks;
	struct vnode *vp;
	struct inode *ip;
	ufs2_daddr_t blkno;
	int bcount;
	int error;
	int i;

	error = softdep_journal_lookup(mp, &vp);
	if (error != 0) {
		printf("Failed to find journal. Use tunefs to create one\n");
		return (error);
	}
	ip = VTOI(vp);
	if (ip->i_size < SUJ_MIN) {
		error = ENOSPC;
		goto out;
	}
	bcount = lblkno(fs, ip->i_size);	/* Only use whole blocks. */
	jblocks = jblocks_create();
	/* Map every journal file block to its disk address. */
	for (i = 0; i < bcount; i++) {
		error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL);
		if (error)
			break;
		jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag));
	}
	if (error) {
		jblocks_destroy(jblocks);
		goto out;
	}
	jblocks->jb_low = jblocks->jb_free / 3;	/* Reserve 33%. */
	jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */
	VFSTOUFS(mp)->softdep_jblocks = jblocks;
out:
	if (error == 0) {
		MNT_ILOCK(mp);
		mp->mnt_kern_flag |= MNTK_SUJ;
		MNT_IUNLOCK(mp);
		/*
		 * Only validate the journal contents if the
		 * filesystem is clean, otherwise we write the logs
		 * but they'll never be used. If the filesystem was
		 * still dirty when we mounted it the journal is
		 * invalid and a new journal can only be valid if it
		 * starts from a clean mount.
		 */
		if (fs->fs_clean) {
			DIP_SET(ip, i_modrev, fs->fs_mtime);
			ip->i_flags |= IN_MODIFIED;
			ffs_update(vp, 1);
		}
	}
	vput(vp);
	return (error);
}

/* Release the journal allocator state at unmount time. */
static void
journal_unmount(mp)
	struct mount *mp;
{
	struct ufsmount *ump;

	ump = VFSTOUFS(mp);
	if (ump->softdep_jblocks)
		jblocks_destroy(ump->softdep_jblocks);
	ump->softdep_jblocks = NULL;
}

/*
 * Called when a journal record is ready to be written. Space is allocated
 * and the journal entry is created when the journal is flushed to stable
 * store.
 */
static void
add_to_journal(wk)
	struct worklist *wk;
{
	struct ufsmount *ump;

	/* 'lk' must be held; the pending list is protected by it. */
	mtx_assert(&lk, MA_OWNED);
	ump = VFSTOUFS(wk->wk_mp);
	if (wk->wk_state & ONWORKLIST)
		panic("add_to_journal: %s(0x%X) already on list",
		    TYPENAME(wk->wk_type), wk->wk_state);
	wk->wk_state |= ONWORKLIST | DEPCOMPLETE;
	if (LIST_EMPTY(&ump->softdep_journal_pending)) {
		/* First entry; record the insertion time for aging. */
		ump->softdep_jblocks->jb_age = ticks;
		LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list);
	} else
		LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list);
	ump->softdep_journal_tail = wk;
	ump->softdep_on_journal += 1;
}

/*
 * Remove an arbitrary item for the journal worklist maintain the tail
 * pointer.  This happens when a new operation obviates the need to
 * journal an old operation.
 */
static void
remove_from_journal(wk)
	struct worklist *wk;
{
	struct ufsmount *ump;

	mtx_assert(&lk, MA_OWNED);
	ump = VFSTOUFS(wk->wk_mp);
#ifdef SUJ_DEBUG
	{
		struct worklist *wkn;

		/* Verify 'wk' really is on this mount's journal list. */
		LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list)
			if (wkn == wk)
				break;
		if (wkn == NULL)
			panic("remove_from_journal: %p is not in journal", wk);
	}
#endif
	/*
	 * We emulate a TAILQ to save space in most structures which do not
	 * require TAILQ semantics.  Here we must update the tail position
	 * when removing the tail which is not the final entry. This works
	 * only if the worklist linkage are at the beginning of the structure.
	 */
	if (ump->softdep_journal_tail == wk)
		ump->softdep_journal_tail =
		    (struct worklist *)wk->wk_list.le_prev;

	WORKLIST_REMOVE(wk);
	ump->softdep_on_journal -= 1;
}

/*
 * Check for journal space as well as dependency limits so the prelink
 * code can throttle both journaled and non-journaled filesystems.
 * Threshold is 0 for low and 1 for min.  Returns non-zero when enough
 * space is available.
 */
static int
journal_space(ump, thresh)
	struct ufsmount *ump;
	int thresh;
{
	struct jblocks *jblocks;
	int avail;

	jblocks = ump->softdep_jblocks;
	/* Non-journaled filesystems always report space available. */
	if (jblocks == NULL)
		return (1);
	/*
	 * We use a tighter restriction here to prevent request_cleanup()
	 * running in threads from running into locks we currently hold.
	 */
	if (num_inodedep > (max_softdeps / 10) * 9)
		return (0);
	if (thresh)
		thresh = jblocks->jb_min;
	else
		thresh = jblocks->jb_low;
	/* Account for pending records not yet allocated on disk. */
	avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE;
	avail = jblocks->jb_free - avail;

	return (avail > thresh);
}

/*
 * Suspend writes to the filesystem because the journal is too low on
 * space; the softdep flush thread becomes the suspension owner.
 */
static void
journal_suspend(ump)
	struct ufsmount *ump;
{
	struct jblocks *jblocks;
	struct mount *mp;

	mp = UFSTOVFS(ump);
	jblocks = ump->softdep_jblocks;
	MNT_ILOCK(mp);
	if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
		stat_journal_min++;
		mp->mnt_kern_flag |= MNTK_SUSPEND;
		mp->mnt_susp_owner = FIRST_THREAD_IN_PROC(softdepproc);
	}
	jblocks->jb_suspended = 1;
	MNT_IUNLOCK(mp);
}

/*
 * Resume writes if the journal has regained enough space.  Called with
 * 'lk' held; drops and reacquires it around vfs_write_resume().
 * Returns 1 if the filesystem was resumed.
 */
static int
journal_unsuspend(struct ufsmount *ump)
{
	struct jblocks *jblocks;
	struct mount *mp;

	mp = UFSTOVFS(ump);
	jblocks = ump->softdep_jblocks;

	/*
	 * NOTE(review): journal_space() documents its second argument as a
	 * 0/1 selector, but jb_min (a block count) is passed here.  Any
	 * non-zero value selects the jb_min threshold, so this behaves as
	 * intended only while jb_min != 0 — confirm against journal_space().
	 */
	if (jblocks != NULL && jblocks->jb_suspended &&
	    journal_space(ump, jblocks->jb_min)) {
		jblocks->jb_suspended = 0;
		FREE_LOCK(&lk);
		mp->mnt_susp_owner = curthread;
		vfs_write_resume(mp);
		ACQUIRE_LOCK(&lk);
		return (1);
	}
	return (0);
}

/*
 * Called before any allocation function to be certain that there is
 * sufficient space in the journal prior to creating any new records.
 * Since in the case of block allocation we may have multiple locked
 * buffers at the time of the actual allocation we can not block
 * when the journal records are created.  Doing so would create a deadlock
 * if any of these buffers needed to be flushed to reclaim space.  Instead
 * we require a sufficiently large amount of available space such that
 * each thread in the system could have passed this allocation check and
 * still have sufficient free space.  With 20% of a minimum journal size
 * of 1MB we have 6553 records available.
 */
int
softdep_prealloc(vp, waitok)
	struct vnode *vp;
	int waitok;
{
	struct ufsmount *ump;

	/* Nothing to reserve on non-journaled mounts. */
	if (DOINGSUJ(vp) == 0)
		return (0);
	ump = VFSTOUFS(vp->v_mount);
	ACQUIRE_LOCK(&lk);
	if (journal_space(ump, 0)) {
		FREE_LOCK(&lk);
		return (0);
	}
	stat_journal_low++;
	FREE_LOCK(&lk);
	if (waitok == MNT_NOWAIT)
		return (ENOSPC);
	/*
	 * Attempt to sync this vnode once to flush any journal
	 * work attached to it.
	 */
	if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0)
		ffs_syncvnode(vp, waitok);
	ACQUIRE_LOCK(&lk);
	process_removes(vp);
	if (journal_space(ump, 0) == 0) {
		softdep_speedup();
		if (journal_space(ump, 1) == 0)
			journal_suspend(ump);
	}
	FREE_LOCK(&lk);

	return (0);
}

/*
 * Before adjusting a link count on a vnode verify that we have sufficient
 * journal space.  If not, process operations that depend on the currently
 * locked pair of vnodes to try to flush space as the syncer, buf daemon,
 * and softdep flush threads can not acquire these locks to reclaim space.
 */
static void
softdep_prelink(dvp, vp)
	struct vnode *dvp;
	struct vnode *vp;
{
	struct ufsmount *ump;

	ump = VFSTOUFS(dvp->v_mount);
	mtx_assert(&lk, MA_OWNED);
	if (journal_space(ump, 0))
		return;
	stat_journal_low++;
	FREE_LOCK(&lk);
	if (vp)
		ffs_syncvnode(vp, MNT_NOWAIT);
	ffs_syncvnode(dvp, MNT_WAIT);
	ACQUIRE_LOCK(&lk);
	/* Process vp before dvp as it may create .. removes. */
	if (vp)
		process_removes(vp);
	process_removes(dvp);
	softdep_speedup();
	process_worklist_item(UFSTOVFS(ump), LK_NOWAIT);
	process_worklist_item(UFSTOVFS(ump), LK_NOWAIT);
	if (journal_space(ump, 0) == 0) {
		softdep_speedup();
		if (journal_space(ump, 1) == 0)
			journal_suspend(ump);
	}
}

/*
 * Fill in a jsegrec segment header at 'data' describing 'jseg'.  The
 * CRC field is left zero.
 */
static void
jseg_write(ump, jblocks, jseg, data)
	struct ufsmount *ump;
	struct jblocks *jblocks;
	struct jseg *jseg;
	uint8_t *data;
{
	struct jsegrec *rec;

	rec = (struct jsegrec *)data;
	rec->jsr_seq = jseg->js_seq;
	rec->jsr_oldest = jblocks->jb_oldestseq;
	rec->jsr_cnt = jseg->js_cnt;
	rec->jsr_blocks = jseg->js_size / ump->um_devvp->v_bufobj.bo_bsize;
	rec->jsr_crc = 0;
	rec->jsr_time = ump->um_fs->fs_mtime;
}

/*
 * Fill the common inode-reference fields of a jrefrec from 'inoref'
 * and bind its jsegdep to the segment being written.
 */
static inline void
inoref_write(inoref, jseg, rec)
	struct inoref *inoref;
	struct jseg *jseg;
	struct jrefrec *rec;
{

	inoref->if_jsegdep->jd_seg = jseg;
	rec->jr_ino = inoref->if_ino;
	rec->jr_parent = inoref->if_parent;
	rec->jr_nlink = inoref->if_nlink;
	rec->jr_mode = inoref->if_mode;
	rec->jr_diroff = inoref->if_diroff;
}

/* Serialize a link-add record (JOP_ADDREF) into the journal buffer. */
static void
jaddref_write(jaddref, jseg, data)
	struct jaddref *jaddref;
	struct jseg *jseg;
	uint8_t *data;
{
	struct jrefrec *rec;

	rec = (struct jrefrec *)data;
	rec->jr_op = JOP_ADDREF;
	inoref_write(&jaddref->ja_ref, jseg, rec);
}

/* Serialize a link-remove record (JOP_REMREF) into the journal buffer. */
static void
jremref_write(jremref, jseg, data)
	struct jremref *jremref;
	struct jseg *jseg;
	uint8_t *data;
{
	struct jrefrec *rec;

	rec = (struct jrefrec *)data;
	rec->jr_op = JOP_REMREF;
	inoref_write(&jremref->jr_ref, jseg, rec);
}

/* Serialize a directory-entry-move record (JOP_MVREF). */
static void
jmvref_write(jmvref, jseg, data)
	struct jmvref *jmvref;
	struct jseg *jseg;
	uint8_t *data;
{
	struct jmvrec *rec;

	rec = (struct jmvrec *)data;
	rec->jm_op = JOP_MVREF;
	rec->jm_ino = jmvref->jm_ino;
	rec->jm_parent = jmvref->jm_parent;
	rec->jm_oldoff = jmvref->jm_oldoff;
	rec->jm_newoff = jmvref->jm_newoff;
}

/* Serialize a block-allocation record (JOP_NEWBLK). */
static void
jnewblk_write(jnewblk, jseg, data)
	struct jnewblk *jnewblk;
	struct jseg *jseg;
	uint8_t *data;
{
	struct jblkrec *rec;

	jnewblk->jn_jsegdep->jd_seg = jseg;
	rec = (struct jblkrec *)data;
	rec->jb_op = JOP_NEWBLK;
	rec->jb_ino = jnewblk->jn_ino;
	rec->jb_blkno = jnewblk->jn_blkno;
	rec->jb_lbn = jnewblk->jn_lbn;
	rec->jb_frags = jnewblk->jn_frags;
	rec->jb_oldfrags = jnewblk->jn_oldfrags;
}

/* Serialize a block-free record (JOP_FREEBLK). */
static void
jfreeblk_write(jfreeblk, jseg, data)
	struct jfreeblk *jfreeblk;
	struct jseg *jseg;
	uint8_t *data;
{
	struct jblkrec *rec;

	jfreeblk->jf_jsegdep->jd_seg = jseg;
	rec = (struct jblkrec *)data;
	rec->jb_op = JOP_FREEBLK;
	rec->jb_ino = jfreeblk->jf_ino;
	rec->jb_blkno = jfreeblk->jf_blkno;
	rec->jb_lbn = jfreeblk->jf_lbn;
	rec->jb_frags = jfreeblk->jf_frags;
	rec->jb_oldfrags = 0;
}

/* Serialize a fragment-free record; reuses the JOP_FREEBLK format. */
static void
jfreefrag_write(jfreefrag, jseg, data)
	struct jfreefrag *jfreefrag;
	struct jseg *jseg;
	uint8_t *data;
{
	struct jblkrec *rec;

	jfreefrag->fr_jsegdep->jd_seg = jseg;
	rec = (struct jblkrec *)data;
	rec->jb_op = JOP_FREEBLK;
	rec->jb_ino = jfreefrag->fr_ino;
	rec->jb_blkno = jfreefrag->fr_blkno;
	rec->jb_lbn = jfreefrag->fr_lbn;
	rec->jb_frags = jfreefrag->fr_frags;
	rec->jb_oldfrags = 0;
}

/* Serialize a truncation record (JOP_TRUNC). */
static void
jtrunc_write(jtrunc, jseg, data)
	struct jtrunc *jtrunc;
	struct jseg *jseg;
	uint8_t *data;
{
	struct jtrncrec *rec;

	rec = (struct jtrncrec *)data;
	rec->jt_op = JOP_TRUNC;
	rec->jt_ino = jtrunc->jt_ino;
	rec->jt_size = jtrunc->jt_size;
	rec->jt_extsize = jtrunc->jt_extsize;
}

/*
 * Flush some journal records to disk.  Called with 'lk' held; the lock
 * is dropped and reacquired around allocation and I/O.  'flags' is
 * MNT_WAIT/MNT_NOWAIT-style and controls whether we sleep for space
 * and whether the buffers are written synchronously.
 */
static void
softdep_process_journal(mp, flags)
	struct mount *mp;
	int flags;
{
	struct jblocks *jblocks;
	struct ufsmount *ump;
	struct worklist *wk;
	struct jseg *jseg;
	struct buf *bp;
	uint8_t *data;
	struct fs *fs;
	int segwritten;
	int jrecmin;	/* Minimum records per block. */
	int jrecmax;	/* Maximum records per block. */
	int size;
	int cnt;
	int off;
	int devbsize;

	if ((mp->mnt_kern_flag & MNTK_SUJ) == 0)
		return;
	ump = VFSTOUFS(mp);
	fs = ump->um_fs;
	jblocks = ump->softdep_jblocks;
	devbsize = ump->um_devvp->v_bufobj.bo_bsize;
	/*
	 * We write anywhere between a disk block and fs block.  The upper
	 * bound is picked to prevent buffer cache fragmentation and limit
	 * processing time per I/O.
	 */
	jrecmin = (devbsize / JREC_SIZE) - 1; /* -1 for seg header */
	jrecmax = (fs->fs_bsize / devbsize) * jrecmin;
	segwritten = 0;
	while ((cnt = ump->softdep_on_journal) != 0) {
		/*
		 * Create a new segment to hold as many as 'cnt' journal
		 * entries and add them to the segment.  Notice cnt is
		 * off by one to account for the space required by the
		 * jsegrec.  If we don't have a full block to log skip it
		 * unless we haven't written anything.
		 */
		cnt++;
		if (cnt < jrecmax && segwritten)
			break;
		/*
		 * Verify some free journal space.  softdep_prealloc() should
		 * guarantee that we don't run out so this is indicative of
		 * a problem with the flow control.  Try to recover
		 * gracefully in any event.
		 */
		while (jblocks->jb_free == 0) {
			if (flags != MNT_WAIT)
				break;
			printf("softdep: Out of journal space!\n");
			softdep_speedup();
			msleep(jblocks, &lk, PRIBIO, "jblocks", hz);
		}
		FREE_LOCK(&lk);
		jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS);
		workitem_alloc(&jseg->js_list, D_JSEG, mp);
		LIST_INIT(&jseg->js_entries);
		jseg->js_state = ATTACHED;
		jseg->js_jblocks = jblocks;
		bp = geteblk(fs->fs_bsize, 0);
		ACQUIRE_LOCK(&lk);
		/*
		 * If there was a race while we were allocating the block
		 * and jseg the entry we care about was likely written.
		 * We bail out in both the WAIT and NOWAIT case and assume
		 * the caller will loop if the entry it cares about is
		 * not written.
		 */
		if (ump->softdep_on_journal == 0 || jblocks->jb_free == 0) {
			bp->b_flags |= B_INVAL | B_NOCACHE;
			WORKITEM_FREE(jseg, D_JSEG);
			FREE_LOCK(&lk);
			brelse(bp);
			ACQUIRE_LOCK(&lk);
			break;
		}
		/*
		 * Calculate the disk block size required for the available
		 * records rounded to the min size.
		 */
		cnt = ump->softdep_on_journal;
		if (cnt < jrecmax)
			size = howmany(cnt, jrecmin) * devbsize;
		else
			size = fs->fs_bsize;
		/*
		 * Allocate a disk block for this journal data and account
		 * for truncation of the requested size if enough contiguous
		 * space was not available.
		 */
		bp->b_blkno = jblocks_alloc(jblocks, size, &size);
		bp->b_lblkno = bp->b_blkno;
		bp->b_offset = bp->b_blkno * DEV_BSIZE;
		bp->b_bcount = size;
		bp->b_bufobj = &ump->um_devvp->v_bufobj;
		bp->b_flags &= ~B_INVAL;
		bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY;
		/*
		 * Initialize our jseg with cnt records.  Assign the next
		 * sequence number to it and link it in-order.
		 */
		cnt = MIN(ump->softdep_on_journal,
		    (size / devbsize) * jrecmin);
		jseg->js_buf = bp;
		jseg->js_cnt = cnt;
		jseg->js_refs = cnt + 1;	/* Self ref. */
		jseg->js_size = size;
		jseg->js_seq = jblocks->jb_nextseq++;
		if (TAILQ_EMPTY(&jblocks->jb_segs))
			jblocks->jb_oldestseq = jseg->js_seq;
		TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next);
		if (jblocks->jb_writeseg == NULL)
			jblocks->jb_writeseg = jseg;
		/*
		 * Start filling in records from the pending list.
		 */
		data = bp->b_data;
		off = 0;
		while ((wk = LIST_FIRST(&ump->softdep_journal_pending))
		    != NULL) {
			/* Place a segment header on every device block. */
			if ((off % devbsize) == 0) {
				jseg_write(ump, jblocks, jseg, data);
				off += JREC_SIZE;
				data = bp->b_data + off;
			}
			remove_from_journal(wk);
			wk->wk_state |= IOSTARTED;
			WORKLIST_INSERT(&jseg->js_entries, wk);
			/* Dispatch on record type to the matching writer. */
			switch (wk->wk_type) {
			case D_JADDREF:
				jaddref_write(WK_JADDREF(wk), jseg, data);
				break;
			case D_JREMREF:
				jremref_write(WK_JREMREF(wk), jseg, data);
				break;
			case D_JMVREF:
				jmvref_write(WK_JMVREF(wk), jseg, data);
				break;
			case D_JNEWBLK:
				jnewblk_write(WK_JNEWBLK(wk), jseg, data);
				break;
			case D_JFREEBLK:
				jfreeblk_write(WK_JFREEBLK(wk), jseg, data);
				break;
			case D_JFREEFRAG:
				jfreefrag_write(WK_JFREEFRAG(wk), jseg, data);
				break;
			case D_JTRUNC:
				jtrunc_write(WK_JTRUNC(wk), jseg, data);
				break;
			default:
				panic("process_journal: Unknown type %s",
				    TYPENAME(wk->wk_type));
				/* NOTREACHED */
			}
			if (--cnt == 0)
				break;
			off += JREC_SIZE;
			data = bp->b_data + off;
		}
		/*
		 * Write this one buffer and continue.
		 */
		WORKLIST_INSERT(&bp->b_dep, &jseg->js_list);
		FREE_LOCK(&lk);
		BO_LOCK(bp->b_bufobj);
		bgetvp(ump->um_devvp, bp);
		BO_UNLOCK(bp->b_bufobj);
		if (flags == MNT_NOWAIT)
			bawrite(bp);
		else
			bwrite(bp);
		ACQUIRE_LOCK(&lk);
	}
	/*
	 * If we've suspended the filesystem because we ran out of journal
	 * space either try to sync it here to make some progress or
	 * unsuspend it if we already have.
	 */
	if (flags == 0 && jblocks->jb_suspended) {
		if (journal_unsuspend(ump))
			return;
		FREE_LOCK(&lk);
		VFS_SYNC(mp, MNT_NOWAIT);
		ffs_sbupdate(ump, MNT_WAIT, 0);
		ACQUIRE_LOCK(&lk);
	}
}

/*
 * Complete a jseg, allowing all dependencies awaiting journal writes
 * to proceed.
 Each journal dependency also attaches a jsegdep to dependent
 * structures so that the journal segment can be freed to reclaim space.
 */
static void
complete_jseg(jseg)
	struct jseg *jseg;
{
	struct worklist *wk;
	struct jmvref *jmvref;
	int waiting;
#ifdef INVARIANTS
	int i = 0;
#endif

	while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) {
		WORKLIST_REMOVE(wk);
		waiting = wk->wk_state & IOWAITING;
		wk->wk_state &= ~(IOSTARTED | IOWAITING);
		wk->wk_state |= COMPLETE;
		KASSERT(i++ < jseg->js_cnt,
		    ("handle_written_jseg: overflow %d >= %d",
		    i - 1, jseg->js_cnt));
		/* Dispatch on record type to the matching completion. */
		switch (wk->wk_type) {
		case D_JADDREF:
			handle_written_jaddref(WK_JADDREF(wk));
			break;
		case D_JREMREF:
			handle_written_jremref(WK_JREMREF(wk));
			break;
		case D_JMVREF:
			/* No jsegdep here.  Release the segment ref now. */
			free_jseg(jseg);
			jmvref = WK_JMVREF(wk);
			LIST_REMOVE(jmvref, jm_deps);
			free_pagedep(jmvref->jm_pagedep);
			WORKITEM_FREE(jmvref, D_JMVREF);
			break;
		case D_JNEWBLK:
			handle_written_jnewblk(WK_JNEWBLK(wk));
			break;
		case D_JFREEBLK:
			handle_written_jfreeblk(WK_JFREEBLK(wk));
			break;
		case D_JFREEFRAG:
			handle_written_jfreefrag(WK_JFREEFRAG(wk));
			break;
		case D_JTRUNC:
			WK_JTRUNC(wk)->jt_jsegdep->jd_seg = jseg;
			WORKITEM_FREE(wk, D_JTRUNC);
			break;
		default:
			panic("handle_written_jseg: Unknown type %s",
			    TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
		if (waiting)
			wakeup(wk);
	}
	/* Release the self reference so the structure may be freed. */
	free_jseg(jseg);
}

/*
 * Mark a jseg as DEPCOMPLETE and throw away the buffer.  Handle jseg
 * completions in order only.
 */
static void
handle_written_jseg(jseg, bp)
	struct jseg *jseg;
	struct buf *bp;
{
	struct jblocks *jblocks;
	struct jseg *jsegn;

	if (jseg->js_refs == 0)
		panic("handle_written_jseg: No self-reference on %p", jseg);
	jseg->js_state |= DEPCOMPLETE;
	/*
	 * We'll never need this buffer again, set flags so it will be
	 * discarded.
	 */
	bp->b_flags |= B_INVAL | B_NOCACHE;
	jblocks = jseg->js_jblocks;
	/*
	 * Don't allow out of order completions.  If this isn't the first
	 * block wait for it to write before we're done.
	 */
	if (jseg != jblocks->jb_writeseg)
		return;
	/* Iterate through available jsegs processing their entries. */
	do {
		jsegn = TAILQ_NEXT(jseg, js_next);
		complete_jseg(jseg);
		jseg = jsegn;
	} while (jseg && jseg->js_state & DEPCOMPLETE);
	jblocks->jb_writeseg = jseg;
}

/* Detach and return the jsegdep from an inoref, clearing the link. */
static inline struct jsegdep *
inoref_jseg(inoref)
	struct inoref *inoref;
{
	struct jsegdep *jsegdep;

	jsegdep = inoref->if_jsegdep;
	inoref->if_jsegdep = NULL;

	return (jsegdep);
}

/*
 * Called once a jremref has made it to stable store.  The jremref is marked
 * complete and we attempt to free it.  Any pagedeps writes sleeping waiting
 * for the jremref to complete will be awoken by free_jremref.
 */
static void
handle_written_jremref(jremref)
	struct jremref *jremref;
{
	struct inodedep *inodedep;
	struct jsegdep *jsegdep;
	struct dirrem *dirrem;

	/* Grab the jsegdep. */
	jsegdep = inoref_jseg(&jremref->jr_ref);
	/*
	 * Remove us from the inoref list.
	 */
	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino,
	    0, &inodedep) == 0)
		panic("handle_written_jremref: Lost inodedep");
	TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
	/*
	 * Complete the dirrem.
	 */
	dirrem = jremref->jr_dirrem;
	jremref->jr_dirrem = NULL;
	LIST_REMOVE(jremref, jr_deps);
	jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT;
	WORKLIST_INSERT(&dirrem->dm_jwork, &jsegdep->jd_list);
	if (LIST_EMPTY(&dirrem->dm_jremrefhd) &&
	    (dirrem->dm_state & COMPLETE) != 0)
		add_to_worklist(&dirrem->dm_list, 0);
	free_jremref(jremref);
}

/*
 * Called once a jaddref has made it to stable store.  The dependency is
 * marked complete and any dependent structures are added to the inode
 * bufwait list to be completed as soon as it is written.  If a bitmap write
 * depends on this entry we move the inode into the inodedephd of the
 * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap.
 */
static void
handle_written_jaddref(jaddref)
	struct jaddref *jaddref;
{
	struct jsegdep *jsegdep;
	struct inodedep *inodedep;
	struct diradd *diradd;
	struct mkdir *mkdir;

	/* Grab the jsegdep. */
	jsegdep = inoref_jseg(&jaddref->ja_ref);
	mkdir = NULL;
	diradd = NULL;
	if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
	    0, &inodedep) == 0)
		panic("handle_written_jaddref: Lost inodedep.");
	if (jaddref->ja_diradd == NULL)
		panic("handle_written_jaddref: No dependency");
	if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) {
		diradd = jaddref->ja_diradd;
		WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list);
	} else if (jaddref->ja_state & MKDIR_PARENT) {
		mkdir = jaddref->ja_mkdir;
		WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list);
	} else if (jaddref->ja_state & MKDIR_BODY)
		mkdir = jaddref->ja_mkdir;
	else
		panic("handle_written_jaddref: Unknown dependency %p",
		    jaddref->ja_diradd);
	jaddref->ja_diradd = NULL;	/* also clears ja_mkdir */
	/*
	 * Remove us from the inode list.
	 */
	TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps);
	/*
	 * The mkdir may be waiting on the jaddref to clear before freeing.
	 */
	if (mkdir) {
		KASSERT(mkdir->md_list.wk_type == D_MKDIR,
		    ("handle_written_jaddref: Incorrect type for mkdir %s",
		    TYPENAME(mkdir->md_list.wk_type)));
		mkdir->md_jaddref = NULL;
		diradd = mkdir->md_diradd;
		mkdir->md_state |= DEPCOMPLETE;
		complete_mkdir(mkdir);
	}
	WORKLIST_INSERT(&diradd->da_jwork, &jsegdep->jd_list);
	if (jaddref->ja_state & NEWBLOCK) {
		inodedep->id_state |= ONDEPLIST;
		LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd,
		    inodedep, id_deps);
	}
	free_jaddref(jaddref);
}

/*
 * Called once a jnewblk journal is written.  The allocdirect or allocindir
 * is placed in the bmsafemap to await notification of a written bitmap.
 */
static void
handle_written_jnewblk(jnewblk)
	struct jnewblk *jnewblk;
{
	struct bmsafemap *bmsafemap;
	struct jsegdep *jsegdep;
	struct newblk *newblk;

	/* Grab the jsegdep. */
	jsegdep = jnewblk->jn_jsegdep;
	jnewblk->jn_jsegdep = NULL;
	/*
	 * Add the written block to the bmsafemap so it can be notified when
	 * the bitmap is on disk.
	 */
	newblk = jnewblk->jn_newblk;
	jnewblk->jn_newblk = NULL;
	if (newblk == NULL)
		panic("handle_written_jnewblk: No dependency for the segdep.");

	newblk->nb_jnewblk = NULL;
	bmsafemap = newblk->nb_bmsafemap;
	WORKLIST_INSERT(&newblk->nb_jwork, &jsegdep->jd_list);
	newblk->nb_state |= ONDEPLIST;
	LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
	free_jnewblk(jnewblk);
}

/*
 * Cancel a jfreefrag that won't be needed, probably due to colliding with
 * an in-flight allocation that has not yet been committed.
Divorce us 3146 * from the freefrag and mark it DEPCOMPLETE so that it may be added 3147 * to the worklist. 3148 */ 3149 static void 3150 cancel_jfreefrag(jfreefrag) 3151 struct jfreefrag *jfreefrag; 3152 { 3153 struct freefrag *freefrag; 3154 3155 if (jfreefrag->fr_jsegdep) { 3156 free_jsegdep(jfreefrag->fr_jsegdep); 3157 jfreefrag->fr_jsegdep = NULL; 3158 } 3159 freefrag = jfreefrag->fr_freefrag; 3160 jfreefrag->fr_freefrag = NULL; 3161 freefrag->ff_jfreefrag = NULL; 3162 free_jfreefrag(jfreefrag); 3163 freefrag->ff_state |= DEPCOMPLETE; 3164 } 3165 3166 /* 3167 * Free a jfreefrag when the parent freefrag is rendered obsolete. 3168 */ 3169 static void 3170 free_jfreefrag(jfreefrag) 3171 struct jfreefrag *jfreefrag; 3172 { 3173 3174 if (jfreefrag->fr_state & IOSTARTED) 3175 WORKLIST_REMOVE(&jfreefrag->fr_list); 3176 else if (jfreefrag->fr_state & ONWORKLIST) 3177 remove_from_journal(&jfreefrag->fr_list); 3178 if (jfreefrag->fr_freefrag != NULL) 3179 panic("free_jfreefrag: Still attached to a freefrag."); 3180 WORKITEM_FREE(jfreefrag, D_JFREEFRAG); 3181 } 3182 3183 /* 3184 * Called when the journal write for a jfreefrag completes. The parent 3185 * freefrag is added to the worklist if this completes its dependencies. 3186 */ 3187 static void 3188 handle_written_jfreefrag(jfreefrag) 3189 struct jfreefrag *jfreefrag; 3190 { 3191 struct jsegdep *jsegdep; 3192 struct freefrag *freefrag; 3193 3194 /* Grab the jsegdep. 
*/ 3195 jsegdep = jfreefrag->fr_jsegdep; 3196 jfreefrag->fr_jsegdep = NULL; 3197 freefrag = jfreefrag->fr_freefrag; 3198 if (freefrag == NULL) 3199 panic("handle_written_jfreefrag: No freefrag."); 3200 freefrag->ff_state |= DEPCOMPLETE; 3201 freefrag->ff_jfreefrag = NULL; 3202 WORKLIST_INSERT(&freefrag->ff_jwork, &jsegdep->jd_list); 3203 if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE) 3204 add_to_worklist(&freefrag->ff_list, 0); 3205 jfreefrag->fr_freefrag = NULL; 3206 free_jfreefrag(jfreefrag); 3207 } 3208 3209 /* 3210 * Called when the journal write for a jfreeblk completes. The jfreeblk 3211 * is removed from the freeblks list of pending journal writes and the 3212 * jsegdep is moved to the freeblks jwork to be completed when all blocks 3213 * have been reclaimed. 3214 */ 3215 static void 3216 handle_written_jfreeblk(jfreeblk) 3217 struct jfreeblk *jfreeblk; 3218 { 3219 struct freeblks *freeblks; 3220 struct jsegdep *jsegdep; 3221 3222 /* Grab the jsegdep. */ 3223 jsegdep = jfreeblk->jf_jsegdep; 3224 jfreeblk->jf_jsegdep = NULL; 3225 freeblks = jfreeblk->jf_freeblks; 3226 LIST_REMOVE(jfreeblk, jf_deps); 3227 WORKLIST_INSERT(&freeblks->fb_jwork, &jsegdep->jd_list); 3228 /* 3229 * If the freeblks is all journaled, we can add it to the worklist. 3230 */ 3231 if (LIST_EMPTY(&freeblks->fb_jfreeblkhd) && 3232 (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) { 3233 /* Remove from the b_dep that is waiting on this write. 
		 */
		if (freeblks->fb_state & ONWORKLIST)
			WORKLIST_REMOVE(&freeblks->fb_list);
		add_to_worklist(&freeblks->fb_list, 1);
	}

	free_jfreeblk(jfreeblk);
}

/*
 * Allocate a jsegdep on the same mount as worklist item wk.  The jd_seg
 * pointer starts NULL; it is assigned elsewhere once the journal segment
 * is known.
 */
static struct jsegdep *
newjsegdep(struct worklist *wk)
{
	struct jsegdep *jsegdep;

	jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS);
	workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp);
	jsegdep->jd_seg = NULL;

	return (jsegdep);
}

/*
 * Allocate a jmvref recording the move of ino within directory dp from
 * offset oldoff to newoff.  The entry is created ATTACHED and DEPCOMPLETE.
 */
static struct jmvref *
newjmvref(dp, ino, oldoff, newoff)
	struct inode *dp;
	ino_t ino;
	off_t oldoff;
	off_t newoff;
{
	struct jmvref *jmvref;

	jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS);
	workitem_alloc(&jmvref->jm_list, D_JMVREF, UFSTOVFS(dp->i_ump));
	jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE;
	jmvref->jm_parent = dp->i_number;
	jmvref->jm_ino = ino;
	jmvref->jm_oldoff = oldoff;
	jmvref->jm_newoff = newoff;

	return (jmvref);
}

/*
 * Allocate a new jremref that tracks the removal of ip from dp with the
 * directory entry offset of diroff.  Mark the entry as ATTACHED and
 * DEPCOMPLETE as we have all the information required for the journal write
 * and the directory has already been removed from the buffer.  The caller
 * is responsible for linking the jremref into the pagedep and adding it
 * to the journal to write.  The MKDIR_PARENT flag is set if we're doing
 * a DOTDOT addition so handle_workitem_remove() can properly assign
 * the jsegdep when we're done.
 */
static struct jremref *
newjremref(struct dirrem *dirrem, struct inode *dp, struct inode *ip,
    off_t diroff, nlink_t nlink)
{
	struct jremref *jremref;

	jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS);
	workitem_alloc(&jremref->jr_list, D_JREMREF, UFSTOVFS(dp->i_ump));
	jremref->jr_state = ATTACHED;
	newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff,
	   nlink, ip->i_mode);
	jremref->jr_dirrem = dirrem;

	return (jremref);
}

/*
 * Common initialization shared by jaddref and jremref inode references.
 * Allocates the jsegdep and records the identifying fields.
 */
static inline void
newinoref(struct inoref *inoref, ino_t ino, ino_t parent, off_t diroff,
    nlink_t nlink, uint16_t mode)
{

	inoref->if_jsegdep = newjsegdep(&inoref->if_list);
	inoref->if_diroff = diroff;
	inoref->if_ino = ino;
	inoref->if_parent = parent;
	inoref->if_nlink = nlink;
	inoref->if_mode = mode;
}

/*
 * Allocate a new jaddref to track the addition of ino to dp at diroff.  The
 * directory offset may not be known until later.  The caller is responsible
 * adding the entry to the journal when this information is available.  nlink
 * should be the link count prior to the addition and mode is only required
 * to have the correct FMT.
 */
static struct jaddref *
newjaddref(struct inode *dp, ino_t ino, off_t diroff, int16_t nlink,
    uint16_t mode)
{
	struct jaddref *jaddref;

	jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS);
	workitem_alloc(&jaddref->ja_list, D_JADDREF, UFSTOVFS(dp->i_ump));
	jaddref->ja_state = ATTACHED;
	jaddref->ja_mkdir = NULL;
	newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode);

	return (jaddref);
}

/*
 * Create a new free dependency for a freework.  The caller is responsible
 * for adjusting the reference count when it has the lock held.
The freedep
 * will track an outstanding bitmap write that will ultimately clear the
 * freework to continue.
 */
static struct freedep *
newfreedep(struct freework *freework)
{
	struct freedep *freedep;

	freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS);
	workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp);
	freedep->fd_freework = freework;

	return (freedep);
}

/*
 * Free a freedep structure once the buffer it is linked to is written.  If
 * this is the last reference to the freework schedule it for completion.
 */
static void
free_freedep(freedep)
	struct freedep *freedep;
{

	if (--freedep->fd_freework->fw_ref == 0)
		add_to_worklist(&freedep->fd_freework->fw_list, 1);
	WORKITEM_FREE(freedep, D_FREEDEP);
}

/*
 * Allocate a new freework structure that may be a level in an indirect
 * when parent is not NULL or a top level block when it is.  The top level
 * freework structures are allocated without lk held and before the freeblks
 * is visible outside of softdep_setup_freeblocks().
 */
static struct freework *
newfreework(ump, freeblks, parent, lbn, nb, frags, journal)
	struct ufsmount *ump;
	struct freeblks *freeblks;
	struct freework *parent;
	ufs_lbn_t lbn;
	ufs2_daddr_t nb;
	int frags;
	int journal;
{
	struct freework *freework;

	freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS);
	workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp);
	freework->fw_freeblks = freeblks;
	freework->fw_parent = parent;
	freework->fw_lbn = lbn;
	freework->fw_blkno = nb;
	freework->fw_frags = frags;
	/*
	 * Non-journaled mounts and lbns >= -NXADDR start with no
	 * references; otherwise take one reference per indirect entry
	 * plus one.
	 */
	freework->fw_ref = ((UFSTOVFS(ump)->mnt_kern_flag & MNTK_SUJ) == 0 ||
	    lbn >= -NXADDR) ? 0 : NINDIR(ump->um_fs) + 1;
	freework->fw_off = 0;
	LIST_INIT(&freework->fw_jwork);

	/* Top level blocks are linked to the freeblks before it is visible. */
	if (parent == NULL) {
		WORKLIST_INSERT_UNLOCKED(&freeblks->fb_freeworkhd,
		    &freework->fw_list);
		freeblks->fb_ref++;
	}
	if (journal)
		newjfreeblk(freeblks, lbn, nb, frags);

	return (freework);
}

/*
 * Allocate a new jfreeblk to journal top level block pointer when truncating
 * a file.  The caller must add this to the worklist when lk is held.
 */
static struct jfreeblk *
newjfreeblk(freeblks, lbn, blkno, frags)
	struct freeblks *freeblks;
	ufs_lbn_t lbn;
	ufs2_daddr_t blkno;
	int frags;
{
	struct jfreeblk *jfreeblk;

	jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS);
	workitem_alloc(&jfreeblk->jf_list, D_JFREEBLK, freeblks->fb_list.wk_mp);
	jfreeblk->jf_jsegdep = newjsegdep(&jfreeblk->jf_list);
	jfreeblk->jf_state = ATTACHED | DEPCOMPLETE;
	jfreeblk->jf_ino = freeblks->fb_previousinum;
	jfreeblk->jf_lbn = lbn;
	jfreeblk->jf_blkno = blkno;
	jfreeblk->jf_frags = frags;
	jfreeblk->jf_freeblks = freeblks;
	LIST_INSERT_HEAD(&freeblks->fb_jfreeblkhd, jfreeblk, jf_deps);

	return (jfreeblk);
}

static void move_newblock_dep(struct jaddref *, struct inodedep *);
/*
 * If we're canceling a new bitmap we have to search for another ref
 * to move into the bmsafemap dep.  This might be better expressed
 * with another structure.
 */
static void
move_newblock_dep(jaddref, inodedep)
	struct jaddref *jaddref;
	struct inodedep *inodedep;
{
	struct inoref *inoref;
	struct jaddref *jaddrefn;

	jaddrefn = NULL;
	/*
	 * Find the next jaddref on the inode's reference list to inherit
	 * the NEWBLOCK dependency.  NOTE(review): the NEWBLOCK test below
	 * is on the original jaddref and is loop-invariant.
	 */
	for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
	    inoref = TAILQ_NEXT(inoref, if_deps)) {
		if ((jaddref->ja_state & NEWBLOCK) &&
		    inoref->if_list.wk_type == D_JADDREF) {
			jaddrefn = (struct jaddref *)inoref;
			break;
		}
	}
	if (jaddrefn == NULL)
		return;
	/* Transfer ATTACHED/UNDONE/NEWBLOCK state to the successor. */
	jaddrefn->ja_state &= ~(ATTACHED | UNDONE);
	jaddrefn->ja_state |= jaddref->ja_state &
	    (ATTACHED | UNDONE | NEWBLOCK);
	jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK);
	jaddref->ja_state |= ATTACHED;
	/* Replace the old jaddref with the new one on the bmsafemap list. */
	LIST_REMOVE(jaddref, ja_bmdeps);
	LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn,
	    ja_bmdeps);
}

/*
 * Cancel a jaddref either before it has been written or while it is being
 * written.  This happens when a link is removed before the add reaches
 * the disk.  The jaddref dependency is kept linked into the bmsafemap
 * and inode to prevent the link count or bitmap from reaching the disk
 * until handle_workitem_remove() re-adjusts the counts and bitmaps as
 * required.
 *
 * Returns 1 if the canceled addref requires journaling of the remove and
 * 0 otherwise.
 */
static int
cancel_jaddref(jaddref, inodedep, wkhd)
	struct jaddref *jaddref;
	struct inodedep *inodedep;
	struct workhead *wkhd;
{
	struct inoref *inoref;
	struct jsegdep *jsegdep;
	int needsj;

	KASSERT((jaddref->ja_state & COMPLETE) == 0,
	    ("cancel_jaddref: Canceling complete jaddref"));
	/* If the journal write has begun, the remove must be journaled. */
	if (jaddref->ja_state & (IOSTARTED | COMPLETE))
		needsj = 1;
	else
		needsj = 0;
	if (inodedep == NULL)
		if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
		    0, &inodedep) == 0)
			panic("cancel_jaddref: Lost inodedep");
	/*
	 * We must adjust the nlink of any reference operation that follows
	 * us so that it is consistent with the in-memory reference.  This
	 * ensures that inode nlink rollbacks always have the correct link.
	 */
	if (needsj == 0)
		for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
		    inoref = TAILQ_NEXT(inoref, if_deps))
			inoref->if_nlink--;
	jsegdep = inoref_jseg(&jaddref->ja_ref);
	if (jaddref->ja_state & NEWBLOCK)
		move_newblock_dep(jaddref, inodedep);
	/* Wake anyone sleeping on this journal write. */
	if (jaddref->ja_state & IOWAITING) {
		jaddref->ja_state &= ~IOWAITING;
		wakeup(&jaddref->ja_list);
	}
	jaddref->ja_mkdir = NULL;
	if (jaddref->ja_state & IOSTARTED) {
		jaddref->ja_state &= ~IOSTARTED;
		WORKLIST_REMOVE(&jaddref->ja_list);
		WORKLIST_INSERT(wkhd, &jsegdep->jd_list);
	} else {
		free_jsegdep(jsegdep);
		if (jaddref->ja_state & DEPCOMPLETE)
			remove_from_journal(&jaddref->ja_list);
	}
	/*
	 * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove
	 * can arrange for them to be freed with the bitmap.  Otherwise we
	 * no longer need this addref attached to the inoreflst and it
	 * will incorrectly adjust nlink if we leave it.
	 */
	if ((jaddref->ja_state & NEWBLOCK) == 0) {
		TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
		    if_deps);
		jaddref->ja_state |= COMPLETE;
		free_jaddref(jaddref);
		return (needsj);
	}
	jaddref->ja_state |= GOINGAWAY;
	/*
	 * Leave the head of the list for jsegdeps for fast merging.
	 */
	if (LIST_FIRST(wkhd) != NULL) {
		jaddref->ja_state |= ONWORKLIST;
		LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list);
	} else
		WORKLIST_INSERT(wkhd, &jaddref->ja_list);

	return (needsj);
}

/*
 * Attempt to free a jaddref structure when some work completes.  This
 * should only succeed once the entry is written and all dependencies have
 * been notified.
 */
static void
free_jaddref(jaddref)
	struct jaddref *jaddref;
{

	if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE)
		return;
	if (jaddref->ja_ref.if_jsegdep)
		panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n",
		    jaddref, jaddref->ja_state);
	if (jaddref->ja_state & NEWBLOCK)
		LIST_REMOVE(jaddref, ja_bmdeps);
	if (jaddref->ja_state & (IOSTARTED | ONWORKLIST))
		panic("free_jaddref: Bad state %p(0x%X)",
		    jaddref, jaddref->ja_state);
	if (jaddref->ja_mkdir != NULL)
		panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state);
	WORKITEM_FREE(jaddref, D_JADDREF);
}

/*
 * Free a jremref structure once it has been written or discarded.
 */
static void
free_jremref(jremref)
	struct jremref *jremref;
{

	if (jremref->jr_ref.if_jsegdep)
		free_jsegdep(jremref->jr_ref.if_jsegdep);
	if (jremref->jr_state & IOSTARTED)
		panic("free_jremref: IO still pending");
	WORKITEM_FREE(jremref, D_JREMREF);
}

/*
 * Free a jnewblk structure.
 */
static void
free_jnewblk(jnewblk)
	struct jnewblk *jnewblk;
{

	if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE)
		return;
	LIST_REMOVE(jnewblk, jn_deps);
	if (jnewblk->jn_newblk != NULL)
		panic("free_jnewblk: Dependency still attached.");
	WORKITEM_FREE(jnewblk, D_JNEWBLK);
}

/*
 * Cancel a jnewblk which has been superseded by a freeblk.  The jnewblk
 * is kept linked into the bmsafemap until the free completes, thus
 * preventing the modified state from ever reaching disk.  The free
 * routine must pass this structure via ffs_blkfree() to
 * softdep_setup_freeblks() so there is no race in releasing the space.
 */
static void
cancel_jnewblk(jnewblk, wkhd)
	struct jnewblk *jnewblk;
	struct workhead *wkhd;
{
	struct jsegdep *jsegdep;

	jsegdep = jnewblk->jn_jsegdep;
	jnewblk->jn_jsegdep = NULL;
	free_jsegdep(jsegdep);
	jnewblk->jn_newblk = NULL;
	jnewblk->jn_state |= GOINGAWAY;
	if (jnewblk->jn_state & IOSTARTED) {
		jnewblk->jn_state &= ~IOSTARTED;
		WORKLIST_REMOVE(&jnewblk->jn_list);
	} else
		remove_from_journal(&jnewblk->jn_list);
	/*
	 * Leave the head of the list for jsegdeps for fast merging.
	 */
	if (LIST_FIRST(wkhd) != NULL) {
		jnewblk->jn_state |= ONWORKLIST;
		LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jnewblk->jn_list, wk_list);
	} else
		WORKLIST_INSERT(wkhd, &jnewblk->jn_list);
	/* Wake anyone sleeping on this journal write. */
	if (jnewblk->jn_state & IOWAITING) {
		jnewblk->jn_state &= ~IOWAITING;
		wakeup(&jnewblk->jn_list);
	}
}

/*
 * Free a jfreeblk structure.
 */
static void
free_jfreeblk(jfreeblk)
	struct jfreeblk *jfreeblk;
{

	WORKITEM_FREE(jfreeblk, D_JFREEBLK);
}

/*
 * Release one reference to a jseg and free it if the count reaches 0.  This
 * should eventually reclaim journal space as well.
 */
static void
free_jseg(jseg)
	struct jseg *jseg;
{
	struct jblocks *jblocks;

	KASSERT(jseg->js_refs > 0,
	    ("free_jseg: Invalid refcnt %d", jseg->js_refs));
	if (--jseg->js_refs != 0)
		return;
	/*
	 * Free only those jsegs which have none allocated before them to
	 * preserve the journal space ordering.
	 */
	jblocks = jseg->js_jblocks;
	while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) {
		jblocks->jb_oldestseq = jseg->js_seq;
		if (jseg->js_refs != 0)
			break;
		TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next);
		jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size);
		KASSERT(LIST_EMPTY(&jseg->js_entries),
		    ("free_jseg: Freed jseg has valid entries."));
		WORKITEM_FREE(jseg, D_JSEG);
	}
}

/*
 * Release a jsegdep and decrement the jseg count.
 */
static void
free_jsegdep(jsegdep)
	struct jsegdep *jsegdep;
{

	if (jsegdep->jd_seg)
		free_jseg(jsegdep->jd_seg);
	WORKITEM_FREE(jsegdep, D_JSEGDEP);
}

/*
 * Wait for a journal item to make it to disk.  Initiate journal processing
 * if required.
 */
static void
jwait(wk)
	struct worklist *wk;
{

	stat_journal_wait++;
	/*
	 * If IO has not started we process the journal.  We can't mark the
	 * worklist item as IOWAITING because we drop the lock while
	 * processing the journal and the worklist entry may be freed after
	 * this point.  The caller may call back in and re-issue the request.
	 */
	if ((wk->wk_state & IOSTARTED) == 0) {
		softdep_process_journal(wk->wk_mp, MNT_WAIT);
		return;
	}
	wk->wk_state |= IOWAITING;
	msleep(wk, &lk, PRIBIO, "jwait", 0);
}

/*
 * Lookup an inodedep based on an inode pointer and set the nlinkdelta as
 * appropriate.  This is a convenience function to reduce duplicate code
 * for the setup and revert functions below.
 */
static struct inodedep *
inodedep_lookup_ip(ip)
	struct inode *ip;
{
	struct inodedep *inodedep;

	KASSERT(ip->i_nlink >= ip->i_effnlink,
	    ("inodedep_lookup_ip: bad delta"));
	(void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number,
	    DEPALLOC, &inodedep);
	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;

	return (inodedep);
}

/*
 * Create a journal entry that describes a truncate that we're about to
 * perform.  The inode allocations and frees between here and the completion
 * of the operation are done asynchronously and without journaling.  At
 * the end of the operation the vnode is sync'd and the journal space
 * is released.  Recovery will discover the partially completed truncate
 * and complete it.
 */
void *
softdep_setup_trunc(vp, length, flags)
	struct vnode *vp;
	off_t length;
	int flags;
{
	struct jsegdep *jsegdep;
	struct jtrunc *jtrunc;
	struct ufsmount *ump;
	struct inode *ip;

	softdep_prealloc(vp, MNT_WAIT);
	ip = VTOI(vp);
	ump = VFSTOUFS(vp->v_mount);
	jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS);
	workitem_alloc(&jtrunc->jt_list, D_JTRUNC, vp->v_mount);
	jsegdep = jtrunc->jt_jsegdep = newjsegdep(&jtrunc->jt_list);
	jtrunc->jt_ino = ip->i_number;
	jtrunc->jt_extsize = 0;
	jtrunc->jt_size = length;
	/* Record the sizes that are NOT being truncated unchanged. */
	if ((flags & IO_EXT) == 0 && ump->um_fstype == UFS2)
		jtrunc->jt_extsize = ip->i_din2->di_extsize;
	if ((flags & IO_NORMAL) == 0)
		jtrunc->jt_size = DIP(ip, i_size);
	ACQUIRE_LOCK(&lk);
	add_to_journal(&jtrunc->jt_list);
	/* Block until the jtrunc has been assigned a journal segment. */
	while (jsegdep->jd_seg == NULL) {
		stat_jwait_freeblks++;
		jwait(&jtrunc->jt_list);
	}
	FREE_LOCK(&lk);

	return (jsegdep);
}

/*
 * After synchronous
truncation is complete we fsync the vnode and
 * release the jsegdep so the journal space can be freed.
 */
int
softdep_complete_trunc(vp, cookie)
	struct vnode *vp;
	void *cookie;	/* jsegdep returned by softdep_setup_trunc() */
{
	int error;

	error = ffs_syncvnode(vp, MNT_WAIT);
	ACQUIRE_LOCK(&lk);
	free_jsegdep((struct jsegdep *)cookie);
	FREE_LOCK(&lk);

	return (error);
}

/*
 * Called prior to creating a new inode and linking it to a directory.  The
 * jaddref structure must already be allocated by softdep_setup_inomapdep
 * and it is discovered here so we can initialize the mode and update
 * nlinkdelta.
 */
void
softdep_setup_create(dp, ip)
	struct inode *dp;
	struct inode *ip;
{
	struct inodedep *inodedep;
	struct jaddref *jaddref;
	struct vnode *dvp;

	KASSERT(ip->i_nlink == 1,
	    ("softdep_setup_create: Invalid link count."));
	dvp = ITOV(dp);
	ACQUIRE_LOCK(&lk);
	inodedep = inodedep_lookup_ip(ip);
	if (DOINGSUJ(dvp)) {
		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
		    inoreflst);
		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
		    ("softdep_setup_create: No addref structure present."));
		jaddref->ja_mode = ip->i_mode;
	}
	softdep_prelink(dvp, NULL);
	FREE_LOCK(&lk);
}

/*
 * Create a jaddref structure to track the addition of a DOTDOT link when
 * we are reparenting an inode as part of a rename.  This jaddref will be
 * found by softdep_setup_directory_change.  Adjusts nlinkdelta for
 * non-journaling softdep.
 */
void
softdep_setup_dotdot_link(dp, ip)
	struct inode *dp;
	struct inode *ip;
{
	struct inodedep *inodedep;
	struct jaddref *jaddref;
	struct vnode *dvp;
	struct vnode *vp;

	dvp = ITOV(dp);
	vp = ITOV(ip);
	jaddref = NULL;
	/*
	 * We don't set MKDIR_PARENT as this is not tied to a mkdir and
	 * is used as a normal link would be.
	 */
	if (DOINGSUJ(dvp))
		jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
		    dp->i_effnlink - 1, dp->i_mode);
	ACQUIRE_LOCK(&lk);
	inodedep = inodedep_lookup_ip(dp);
	if (jaddref)
		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
		    if_deps);
	softdep_prelink(dvp, ITOV(ip));
	FREE_LOCK(&lk);
}

/*
 * Create a jaddref structure to track a new link to an inode.  The directory
 * offset is not known until softdep_setup_directory_add or
 * softdep_setup_directory_change.  Adjusts nlinkdelta for non-journaling
 * softdep.
 */
void
softdep_setup_link(dp, ip)
	struct inode *dp;
	struct inode *ip;
{
	struct inodedep *inodedep;
	struct jaddref *jaddref;
	struct vnode *dvp;

	dvp = ITOV(dp);
	jaddref = NULL;
	if (DOINGSUJ(dvp))
		jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1,
		    ip->i_mode);
	ACQUIRE_LOCK(&lk);
	inodedep = inodedep_lookup_ip(ip);
	if (jaddref)
		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
		    if_deps);
	softdep_prelink(dvp, ITOV(ip));
	FREE_LOCK(&lk);
}

/*
 * Called to create the jaddref structures to track . and .. references as
 * well as lookup and further initialize the incomplete jaddref created
 * by softdep_setup_inomapdep when the inode was allocated.  Adjusts
 * nlinkdelta for non-journaling softdep.
 */
void
softdep_setup_mkdir(dp, ip)
	struct inode *dp;
	struct inode *ip;
{
	struct inodedep *inodedep;
	struct jaddref *dotdotaddref;
	struct jaddref *dotaddref;
	struct jaddref *jaddref;
	struct vnode *dvp;

	dvp = ITOV(dp);
	dotaddref = dotdotaddref = NULL;
	if (DOINGSUJ(dvp)) {
		/* "." reference within the new directory itself. */
		dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1,
		    ip->i_mode);
		dotaddref->ja_state |= MKDIR_BODY;
		/* ".." reference back to the parent directory. */
		dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
		    dp->i_effnlink - 1, dp->i_mode);
		dotdotaddref->ja_state |= MKDIR_PARENT;
	}
	ACQUIRE_LOCK(&lk);
	inodedep = inodedep_lookup_ip(ip);
	if (DOINGSUJ(dvp)) {
		/*
		 * Complete the jaddref created by softdep_setup_inomapdep
		 * and queue the dot ref just ahead of it.
		 */
		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
		    inoreflst);
		KASSERT(jaddref != NULL,
		    ("softdep_setup_mkdir: No addref structure present."));
		KASSERT(jaddref->ja_parent == dp->i_number,
		    ("softdep_setup_mkdir: bad parent %d",
		    jaddref->ja_parent));
		jaddref->ja_mode = ip->i_mode;
		TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref,
		    if_deps);
	}
	inodedep = inodedep_lookup_ip(dp);
	if (DOINGSUJ(dvp))
		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst,
		    &dotdotaddref->ja_ref, if_deps);
	softdep_prelink(ITOV(dp), NULL);
	FREE_LOCK(&lk);
}

/*
 * Called to track nlinkdelta of the inode and parent directories prior to
 * unlinking a directory.
 */
void
softdep_setup_rmdir(dp, ip)
	struct inode *dp;
	struct inode *ip;
{
	struct vnode *dvp;

	dvp = ITOV(dp);
	ACQUIRE_LOCK(&lk);
	(void) inodedep_lookup_ip(ip);
	(void) inodedep_lookup_ip(dp);
	softdep_prelink(dvp, ITOV(ip));
	FREE_LOCK(&lk);
}

/*
 * Called to track nlinkdelta of the inode and parent directories prior to
 * unlink.
 */
void
softdep_setup_unlink(dp, ip)
	struct inode *dp;
	struct inode *ip;
{
	struct vnode *dvp;

	dvp = ITOV(dp);
	ACQUIRE_LOCK(&lk);
	(void) inodedep_lookup_ip(ip);
	(void) inodedep_lookup_ip(dp);
	softdep_prelink(dvp, ITOV(ip));
	FREE_LOCK(&lk);
}

/*
 * Called to release the journal structures created by a failed non-directory
 * creation.  Adjusts nlinkdelta for non-journaling softdep.
 */
void
softdep_revert_create(dp, ip)
	struct inode *dp;
	struct inode *ip;
{
	struct inodedep *inodedep;
	struct jaddref *jaddref;
	struct vnode *dvp;

	dvp = ITOV(dp);
	ACQUIRE_LOCK(&lk);
	inodedep = inodedep_lookup_ip(ip);
	if (DOINGSUJ(dvp)) {
		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
		    inoreflst);
		KASSERT(jaddref->ja_parent == dp->i_number,
		    ("softdep_revert_create: addref parent mismatch"));
		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
	}
	FREE_LOCK(&lk);
}

/*
 * Called to release the journal structures created by a failed dotdot link
 * creation.  Adjusts nlinkdelta for non-journaling softdep.
 */
void
softdep_revert_dotdot_link(dp, ip)
	struct inode *dp;
	struct inode *ip;
{
	struct inodedep *inodedep;
	struct jaddref *jaddref;
	struct vnode *dvp;

	dvp = ITOV(dp);
	ACQUIRE_LOCK(&lk);
	inodedep = inodedep_lookup_ip(dp);
	if (DOINGSUJ(dvp)) {
		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
		    inoreflst);
		KASSERT(jaddref->ja_parent == ip->i_number,
		    ("softdep_revert_dotdot_link: addref parent mismatch"));
		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
	}
	FREE_LOCK(&lk);
}

/*
 * Called to release the journal structures created by a failed link
 * addition.  Adjusts nlinkdelta for non-journaling softdep.
 */
void
softdep_revert_link(dp, ip)
	struct inode *dp;
	struct inode *ip;
{
	struct inodedep *inodedep;
	struct jaddref *jaddref;
	struct vnode *dvp;

	dvp = ITOV(dp);
	ACQUIRE_LOCK(&lk);
	inodedep = inodedep_lookup_ip(ip);
	if (DOINGSUJ(dvp)) {
		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
		    inoreflst);
		KASSERT(jaddref->ja_parent == dp->i_number,
		    ("softdep_revert_link: addref parent mismatch"));
		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
	}
	FREE_LOCK(&lk);
}

/*
 * Called to release the journal structures created by a failed mkdir
 * attempt.  Adjusts nlinkdelta for non-journaling softdep.
 */
void
softdep_revert_mkdir(dp, ip)
	struct inode *dp;
	struct inode *ip;
{
	struct inodedep *inodedep;
	struct jaddref *jaddref;
	struct vnode *dvp;

	dvp = ITOV(dp);

	ACQUIRE_LOCK(&lk);
	inodedep = inodedep_lookup_ip(dp);
	if (DOINGSUJ(dvp)) {
		/* Cancel the dotdot addref recorded against the parent. */
		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
		    inoreflst);
		KASSERT(jaddref->ja_parent == ip->i_number,
		    ("softdep_revert_mkdir: dotdot addref parent mismatch"));
		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
	}
	inodedep = inodedep_lookup_ip(ip);
	if (DOINGSUJ(dvp)) {
		/* Cancel the new directory's addref, then its dot addref. */
		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
		    inoreflst);
		KASSERT(jaddref->ja_parent == dp->i_number,
		    ("softdep_revert_mkdir: addref parent mismatch"));
		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
		    inoreflst);
		KASSERT(jaddref->ja_parent == ip->i_number,
		    ("softdep_revert_mkdir: dot addref parent mismatch"));
		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
	}
	FREE_LOCK(&lk);
}

/*
 * Called to correct nlinkdelta after a failed rmdir.
 */
void
softdep_revert_rmdir(dp, ip)
	struct inode *dp;
	struct inode *ip;
{

	ACQUIRE_LOCK(&lk);
	(void) inodedep_lookup_ip(ip);
	(void) inodedep_lookup_ip(dp);
	FREE_LOCK(&lk);
}

/*
 * Protecting the freemaps (or bitmaps).
 *
 * To eliminate the need to execute fsck before mounting a filesystem
 * after a power failure, one must (conservatively) guarantee that the
 * on-disk copy of the bitmaps never indicate that a live inode or block is
 * free.  So, when a block or inode is allocated, the bitmap should be
 * updated (on disk) before any new pointers.  When a block or inode is
 * freed, the bitmap should not be updated until all pointers have been
 * reset.  The latter dependency is handled by the delayed de-allocation
 * approach described below for block and inode de-allocation.  The former
 * dependency is handled by calling the following procedure when a block or
 * inode is allocated.  When an inode is allocated an "inodedep" is created
 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
 * Each "inodedep" is also inserted into the hash indexing structure so
 * that any additional link additions can be made dependent on the inode
 * allocation.
 *
 * The ufs filesystem maintains a number of free block counts (e.g., per
 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
 * in addition to the bitmaps.  These counts are used to improve efficiency
 * during allocation and therefore must be consistent with the bitmaps.
 * There is no convenient way to guarantee post-crash consistency of these
 * counts with simple update ordering, for two main reasons: (1) The counts
 * and bitmaps for a single cylinder group block are not in the same disk
 * sector.
If a disk write is interrupted (e.g., by power failure), one may 4146 * be written and the other not. (2) Some of the counts are located in the 4147 * superblock rather than the cylinder group block. So, we focus our soft 4148 * updates implementation on protecting the bitmaps. When mounting a 4149 * filesystem, we recompute the auxiliary counts from the bitmaps. 4150 */ 4151 4152 /* 4153 * Called just after updating the cylinder group block to allocate an inode. 4154 */ 4155 void 4156 softdep_setup_inomapdep(bp, ip, newinum) 4157 struct buf *bp; /* buffer for cylgroup block with inode map */ 4158 struct inode *ip; /* inode related to allocation */ 4159 ino_t newinum; /* new inode number being allocated */ 4160 { 4161 struct inodedep *inodedep; 4162 struct bmsafemap *bmsafemap; 4163 struct jaddref *jaddref; 4164 struct mount *mp; 4165 struct fs *fs; 4166 4167 mp = UFSTOVFS(ip->i_ump); 4168 fs = ip->i_ump->um_fs; 4169 jaddref = NULL; 4170 4171 /* 4172 * Allocate the journal reference add structure so that the bitmap 4173 * can be dependent on it. 4174 */ 4175 if (mp->mnt_kern_flag & MNTK_SUJ) { 4176 jaddref = newjaddref(ip, newinum, 0, 0, 0); 4177 jaddref->ja_state |= NEWBLOCK; 4178 } 4179 4180 /* 4181 * Create a dependency for the newly allocated inode. 4182 * Panic if it already exists as something is seriously wrong. 4183 * Otherwise add it to the dependency list for the buffer holding 4184 * the cylinder group map from which it was allocated. 
4185 */ 4186 ACQUIRE_LOCK(&lk); 4187 if ((inodedep_lookup(mp, newinum, DEPALLOC|NODELAY, &inodedep))) 4188 panic("softdep_setup_inomapdep: dependency %p for new" 4189 "inode already exists", inodedep); 4190 bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum)); 4191 if (jaddref) { 4192 LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps); 4193 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, 4194 if_deps); 4195 } else { 4196 inodedep->id_state |= ONDEPLIST; 4197 LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps); 4198 } 4199 inodedep->id_bmsafemap = bmsafemap; 4200 inodedep->id_state &= ~DEPCOMPLETE; 4201 FREE_LOCK(&lk); 4202 } 4203 4204 /* 4205 * Called just after updating the cylinder group block to 4206 * allocate block or fragment. 4207 */ 4208 void 4209 softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags) 4210 struct buf *bp; /* buffer for cylgroup block with block map */ 4211 struct mount *mp; /* filesystem doing allocation */ 4212 ufs2_daddr_t newblkno; /* number of newly allocated block */ 4213 int frags; /* Number of fragments. */ 4214 int oldfrags; /* Previous number of fragments for extend. */ 4215 { 4216 struct newblk *newblk; 4217 struct bmsafemap *bmsafemap; 4218 struct jnewblk *jnewblk; 4219 struct fs *fs; 4220 4221 fs = VFSTOUFS(mp)->um_fs; 4222 jnewblk = NULL; 4223 /* 4224 * Create a dependency for the newly allocated block. 4225 * Add it to the dependency list for the buffer holding 4226 * the cylinder group map from which it was allocated. 
4227 */ 4228 if (mp->mnt_kern_flag & MNTK_SUJ) { 4229 jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS); 4230 workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp); 4231 jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list); 4232 jnewblk->jn_state = ATTACHED; 4233 jnewblk->jn_blkno = newblkno; 4234 jnewblk->jn_frags = frags; 4235 jnewblk->jn_oldfrags = oldfrags; 4236 #ifdef SUJ_DEBUG 4237 { 4238 struct cg *cgp; 4239 uint8_t *blksfree; 4240 long bno; 4241 int i; 4242 4243 cgp = (struct cg *)bp->b_data; 4244 blksfree = cg_blksfree(cgp); 4245 bno = dtogd(fs, jnewblk->jn_blkno); 4246 for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; 4247 i++) { 4248 if (isset(blksfree, bno + i)) 4249 panic("softdep_setup_blkmapdep: " 4250 "free fragment %d from %d-%d " 4251 "state 0x%X dep %p", i, 4252 jnewblk->jn_oldfrags, 4253 jnewblk->jn_frags, 4254 jnewblk->jn_state, 4255 jnewblk->jn_newblk); 4256 } 4257 } 4258 #endif 4259 } 4260 ACQUIRE_LOCK(&lk); 4261 if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0) 4262 panic("softdep_setup_blkmapdep: found block"); 4263 newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp, 4264 dtog(fs, newblkno)); 4265 if (jnewblk) { 4266 jnewblk->jn_newblk = newblk; 4267 LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps); 4268 } else { 4269 newblk->nb_state |= ONDEPLIST; 4270 LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps); 4271 } 4272 newblk->nb_bmsafemap = bmsafemap; 4273 newblk->nb_jnewblk = jnewblk; 4274 FREE_LOCK(&lk); 4275 } 4276 4277 #define BMSAFEMAP_HASH(fs, cg) \ 4278 (&bmsafemap_hashtbl[((((register_t)(fs)) >> 13) + (cg)) & bmsafemap_hash]) 4279 4280 static int 4281 bmsafemap_find(bmsafemaphd, mp, cg, bmsafemapp) 4282 struct bmsafemap_hashhead *bmsafemaphd; 4283 struct mount *mp; 4284 int cg; 4285 struct bmsafemap **bmsafemapp; 4286 { 4287 struct bmsafemap *bmsafemap; 4288 4289 LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash) 4290 if (bmsafemap->sm_list.wk_mp == mp && bmsafemap->sm_cg == cg) 4291 break; 
4292 if (bmsafemap) { 4293 *bmsafemapp = bmsafemap; 4294 return (1); 4295 } 4296 *bmsafemapp = NULL; 4297 4298 return (0); 4299 } 4300 4301 /* 4302 * Find the bmsafemap associated with a cylinder group buffer. 4303 * If none exists, create one. The buffer must be locked when 4304 * this routine is called and this routine must be called with 4305 * splbio interrupts blocked. 4306 */ 4307 static struct bmsafemap * 4308 bmsafemap_lookup(mp, bp, cg) 4309 struct mount *mp; 4310 struct buf *bp; 4311 int cg; 4312 { 4313 struct bmsafemap_hashhead *bmsafemaphd; 4314 struct bmsafemap *bmsafemap, *collision; 4315 struct worklist *wk; 4316 struct fs *fs; 4317 4318 mtx_assert(&lk, MA_OWNED); 4319 if (bp) 4320 LIST_FOREACH(wk, &bp->b_dep, wk_list) 4321 if (wk->wk_type == D_BMSAFEMAP) 4322 return (WK_BMSAFEMAP(wk)); 4323 fs = VFSTOUFS(mp)->um_fs; 4324 bmsafemaphd = BMSAFEMAP_HASH(fs, cg); 4325 if (bmsafemap_find(bmsafemaphd, mp, cg, &bmsafemap) == 1) 4326 return (bmsafemap); 4327 FREE_LOCK(&lk); 4328 bmsafemap = malloc(sizeof(struct bmsafemap), 4329 M_BMSAFEMAP, M_SOFTDEP_FLAGS); 4330 workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp); 4331 bmsafemap->sm_buf = bp; 4332 LIST_INIT(&bmsafemap->sm_inodedephd); 4333 LIST_INIT(&bmsafemap->sm_inodedepwr); 4334 LIST_INIT(&bmsafemap->sm_newblkhd); 4335 LIST_INIT(&bmsafemap->sm_newblkwr); 4336 LIST_INIT(&bmsafemap->sm_jaddrefhd); 4337 LIST_INIT(&bmsafemap->sm_jnewblkhd); 4338 ACQUIRE_LOCK(&lk); 4339 if (bmsafemap_find(bmsafemaphd, mp, cg, &collision) == 1) { 4340 WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); 4341 return (collision); 4342 } 4343 bmsafemap->sm_cg = cg; 4344 LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash); 4345 WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list); 4346 return (bmsafemap); 4347 } 4348 4349 /* 4350 * Direct block allocation dependencies. 4351 * 4352 * When a new block is allocated, the corresponding disk locations must be 4353 * initialized (with zeros or new data) before the on-disk inode points to 4354 * them. 
Also, the freemap from which the block was allocated must be 4355 * updated (on disk) before the inode's pointer. These two dependencies are 4356 * independent of each other and are needed for all file blocks and indirect 4357 * blocks that are pointed to directly by the inode. Just before the 4358 * "in-core" version of the inode is updated with a newly allocated block 4359 * number, a procedure (below) is called to setup allocation dependency 4360 * structures. These structures are removed when the corresponding 4361 * dependencies are satisfied or when the block allocation becomes obsolete 4362 * (i.e., the file is deleted, the block is de-allocated, or the block is a 4363 * fragment that gets upgraded). All of these cases are handled in 4364 * procedures described later. 4365 * 4366 * When a file extension causes a fragment to be upgraded, either to a larger 4367 * fragment or to a full block, the on-disk location may change (if the 4368 * previous fragment could not simply be extended). In this case, the old 4369 * fragment must be de-allocated, but not until after the inode's pointer has 4370 * been updated. In most cases, this is handled by later procedures, which 4371 * will construct a "freefrag" structure to be added to the workitem queue 4372 * when the inode update is complete (or obsolete). The main exception to 4373 * this is when an allocation occurs while a pending allocation dependency 4374 * (for the same block pointer) remains. This case is handled in the main 4375 * allocation dependency setup procedure by immediately freeing the 4376 * unreferenced fragments. 
4377 */ 4378 void 4379 softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp) 4380 struct inode *ip; /* inode to which block is being added */ 4381 ufs_lbn_t off; /* block pointer within inode */ 4382 ufs2_daddr_t newblkno; /* disk block number being added */ 4383 ufs2_daddr_t oldblkno; /* previous block number, 0 unless frag */ 4384 long newsize; /* size of new block */ 4385 long oldsize; /* size of new block */ 4386 struct buf *bp; /* bp for allocated block */ 4387 { 4388 struct allocdirect *adp, *oldadp; 4389 struct allocdirectlst *adphead; 4390 struct freefrag *freefrag; 4391 struct inodedep *inodedep; 4392 struct pagedep *pagedep; 4393 struct jnewblk *jnewblk; 4394 struct newblk *newblk; 4395 struct mount *mp; 4396 ufs_lbn_t lbn; 4397 4398 lbn = bp->b_lblkno; 4399 mp = UFSTOVFS(ip->i_ump); 4400 if (oldblkno && oldblkno != newblkno) 4401 freefrag = newfreefrag(ip, oldblkno, oldsize, lbn); 4402 else 4403 freefrag = NULL; 4404 4405 ACQUIRE_LOCK(&lk); 4406 if (off >= NDADDR) { 4407 if (lbn > 0) 4408 panic("softdep_setup_allocdirect: bad lbn %jd, off %jd", 4409 lbn, off); 4410 /* allocating an indirect block */ 4411 if (oldblkno != 0) 4412 panic("softdep_setup_allocdirect: non-zero indir"); 4413 } else { 4414 if (off != lbn) 4415 panic("softdep_setup_allocdirect: lbn %jd != off %jd", 4416 lbn, off); 4417 /* 4418 * Allocating a direct block. 4419 * 4420 * If we are allocating a directory block, then we must 4421 * allocate an associated pagedep to track additions and 4422 * deletions. 4423 */ 4424 if ((ip->i_mode & IFMT) == IFDIR && 4425 pagedep_lookup(mp, ip->i_number, off, DEPALLOC, 4426 &pagedep) == 0) 4427 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); 4428 } 4429 if (newblk_lookup(mp, newblkno, 0, &newblk) == 0) 4430 panic("softdep_setup_allocdirect: lost block"); 4431 KASSERT(newblk->nb_list.wk_type == D_NEWBLK, 4432 ("softdep_setup_allocdirect: newblk already initialized")); 4433 /* 4434 * Convert the newblk to an allocdirect. 
4435 */ 4436 newblk->nb_list.wk_type = D_ALLOCDIRECT; 4437 adp = (struct allocdirect *)newblk; 4438 newblk->nb_freefrag = freefrag; 4439 adp->ad_offset = off; 4440 adp->ad_oldblkno = oldblkno; 4441 adp->ad_newsize = newsize; 4442 adp->ad_oldsize = oldsize; 4443 4444 /* 4445 * Finish initializing the journal. 4446 */ 4447 if ((jnewblk = newblk->nb_jnewblk) != NULL) { 4448 jnewblk->jn_ino = ip->i_number; 4449 jnewblk->jn_lbn = lbn; 4450 add_to_journal(&jnewblk->jn_list); 4451 } 4452 if (freefrag && freefrag->ff_jfreefrag != NULL) 4453 add_to_journal(&freefrag->ff_jfreefrag->fr_list); 4454 inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep); 4455 adp->ad_inodedep = inodedep; 4456 4457 WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list); 4458 /* 4459 * The list of allocdirects must be kept in sorted and ascending 4460 * order so that the rollback routines can quickly determine the 4461 * first uncommitted block (the size of the file stored on disk 4462 * ends at the end of the lowest committed fragment, or if there 4463 * are no fragments, at the end of the highest committed block). 4464 * Since files generally grow, the typical case is that the new 4465 * block is to be added at the end of the list. We speed this 4466 * special case by checking against the last allocdirect in the 4467 * list before laboriously traversing the list looking for the 4468 * insertion point. 
4469 */ 4470 adphead = &inodedep->id_newinoupdt; 4471 oldadp = TAILQ_LAST(adphead, allocdirectlst); 4472 if (oldadp == NULL || oldadp->ad_offset <= off) { 4473 /* insert at end of list */ 4474 TAILQ_INSERT_TAIL(adphead, adp, ad_next); 4475 if (oldadp != NULL && oldadp->ad_offset == off) 4476 allocdirect_merge(adphead, adp, oldadp); 4477 FREE_LOCK(&lk); 4478 return; 4479 } 4480 TAILQ_FOREACH(oldadp, adphead, ad_next) { 4481 if (oldadp->ad_offset >= off) 4482 break; 4483 } 4484 if (oldadp == NULL) 4485 panic("softdep_setup_allocdirect: lost entry"); 4486 /* insert in middle of list */ 4487 TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); 4488 if (oldadp->ad_offset == off) 4489 allocdirect_merge(adphead, adp, oldadp); 4490 4491 FREE_LOCK(&lk); 4492 } 4493 4494 /* 4495 * Replace an old allocdirect dependency with a newer one. 4496 * This routine must be called with splbio interrupts blocked. 4497 */ 4498 static void 4499 allocdirect_merge(adphead, newadp, oldadp) 4500 struct allocdirectlst *adphead; /* head of list holding allocdirects */ 4501 struct allocdirect *newadp; /* allocdirect being added */ 4502 struct allocdirect *oldadp; /* existing allocdirect being checked */ 4503 { 4504 struct worklist *wk; 4505 struct freefrag *freefrag; 4506 struct newdirblk *newdirblk; 4507 4508 freefrag = NULL; 4509 mtx_assert(&lk, MA_OWNED); 4510 if (newadp->ad_oldblkno != oldadp->ad_newblkno || 4511 newadp->ad_oldsize != oldadp->ad_newsize || 4512 newadp->ad_offset >= NDADDR) 4513 panic("%s %jd != new %jd || old size %ld != new %ld", 4514 "allocdirect_merge: old blkno", 4515 (intmax_t)newadp->ad_oldblkno, 4516 (intmax_t)oldadp->ad_newblkno, 4517 newadp->ad_oldsize, oldadp->ad_newsize); 4518 newadp->ad_oldblkno = oldadp->ad_oldblkno; 4519 newadp->ad_oldsize = oldadp->ad_oldsize; 4520 /* 4521 * If the old dependency had a fragment to free or had never 4522 * previously had a block allocated, then the new dependency 4523 * can immediately post its freefrag and adopt the old freefrag. 
4524 * This action is done by swapping the freefrag dependencies. 4525 * The new dependency gains the old one's freefrag, and the 4526 * old one gets the new one and then immediately puts it on 4527 * the worklist when it is freed by free_newblk. It is 4528 * not possible to do this swap when the old dependency had a 4529 * non-zero size but no previous fragment to free. This condition 4530 * arises when the new block is an extension of the old block. 4531 * Here, the first part of the fragment allocated to the new 4532 * dependency is part of the block currently claimed on disk by 4533 * the old dependency, so cannot legitimately be freed until the 4534 * conditions for the new dependency are fulfilled. 4535 */ 4536 freefrag = newadp->ad_freefrag; 4537 if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) { 4538 newadp->ad_freefrag = oldadp->ad_freefrag; 4539 oldadp->ad_freefrag = freefrag; 4540 } 4541 /* 4542 * If we are tracking a new directory-block allocation, 4543 * move it from the old allocdirect to the new allocdirect. 4544 */ 4545 if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) { 4546 newdirblk = WK_NEWDIRBLK(wk); 4547 WORKLIST_REMOVE(&newdirblk->db_list); 4548 if (!LIST_EMPTY(&oldadp->ad_newdirblk)) 4549 panic("allocdirect_merge: extra newdirblk"); 4550 WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list); 4551 } 4552 TAILQ_REMOVE(adphead, oldadp, ad_next); 4553 /* 4554 * We need to move any journal dependencies over to the freefrag 4555 * that releases this block if it exists. Otherwise we are 4556 * extending an existing block and we'll wait until that is 4557 * complete to release the journal space and extend the 4558 * new journal to cover this old space as well. 
4559 */ 4560 if (freefrag == NULL) { 4561 struct jnewblk *jnewblk; 4562 struct jnewblk *njnewblk; 4563 4564 if (oldadp->ad_newblkno != newadp->ad_newblkno) 4565 panic("allocdirect_merge: %jd != %jd", 4566 oldadp->ad_newblkno, newadp->ad_newblkno); 4567 jnewblk = oldadp->ad_block.nb_jnewblk; 4568 cancel_newblk(&oldadp->ad_block, &newadp->ad_block.nb_jwork); 4569 /* 4570 * We have an unwritten jnewblk, we need to merge the 4571 * frag bits with our own. The newer adp's journal can not 4572 * be written prior to the old one so no need to check for 4573 * it here. 4574 */ 4575 if (jnewblk) { 4576 njnewblk = newadp->ad_block.nb_jnewblk; 4577 if (njnewblk == NULL) 4578 panic("allocdirect_merge: No jnewblk"); 4579 if (jnewblk->jn_state & UNDONE) { 4580 njnewblk->jn_state |= UNDONE | NEWBLOCK; 4581 njnewblk->jn_state &= ~ATTACHED; 4582 jnewblk->jn_state &= ~UNDONE; 4583 } 4584 njnewblk->jn_oldfrags = jnewblk->jn_oldfrags; 4585 WORKLIST_REMOVE(&jnewblk->jn_list); 4586 jnewblk->jn_state |= ATTACHED | COMPLETE; 4587 free_jnewblk(jnewblk); 4588 } 4589 } else { 4590 /* 4591 * We can skip journaling for this freefrag and just complete 4592 * any pending journal work for the allocdirect that is being 4593 * removed after the freefrag completes. 4594 */ 4595 if (freefrag->ff_jfreefrag) 4596 cancel_jfreefrag(freefrag->ff_jfreefrag); 4597 cancel_newblk(&oldadp->ad_block, &freefrag->ff_jwork); 4598 } 4599 free_newblk(&oldadp->ad_block); 4600 } 4601 4602 /* 4603 * Allocate a jfreefrag structure to journal a single block free. 
4604 */ 4605 static struct jfreefrag * 4606 newjfreefrag(freefrag, ip, blkno, size, lbn) 4607 struct freefrag *freefrag; 4608 struct inode *ip; 4609 ufs2_daddr_t blkno; 4610 long size; 4611 ufs_lbn_t lbn; 4612 { 4613 struct jfreefrag *jfreefrag; 4614 struct fs *fs; 4615 4616 fs = ip->i_fs; 4617 jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG, 4618 M_SOFTDEP_FLAGS); 4619 workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, UFSTOVFS(ip->i_ump)); 4620 jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list); 4621 jfreefrag->fr_state = ATTACHED | DEPCOMPLETE; 4622 jfreefrag->fr_ino = ip->i_number; 4623 jfreefrag->fr_lbn = lbn; 4624 jfreefrag->fr_blkno = blkno; 4625 jfreefrag->fr_frags = numfrags(fs, size); 4626 jfreefrag->fr_freefrag = freefrag; 4627 4628 return (jfreefrag); 4629 } 4630 4631 /* 4632 * Allocate a new freefrag structure. 4633 */ 4634 static struct freefrag * 4635 newfreefrag(ip, blkno, size, lbn) 4636 struct inode *ip; 4637 ufs2_daddr_t blkno; 4638 long size; 4639 ufs_lbn_t lbn; 4640 { 4641 struct freefrag *freefrag; 4642 struct fs *fs; 4643 4644 fs = ip->i_fs; 4645 if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag) 4646 panic("newfreefrag: frag size"); 4647 freefrag = malloc(sizeof(struct freefrag), 4648 M_FREEFRAG, M_SOFTDEP_FLAGS); 4649 workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump)); 4650 freefrag->ff_state = ATTACHED; 4651 LIST_INIT(&freefrag->ff_jwork); 4652 freefrag->ff_inum = ip->i_number; 4653 freefrag->ff_blkno = blkno; 4654 freefrag->ff_fragsize = size; 4655 4656 if (fs->fs_flags & FS_SUJ) { 4657 freefrag->ff_jfreefrag = 4658 newjfreefrag(freefrag, ip, blkno, size, lbn); 4659 } else { 4660 freefrag->ff_state |= DEPCOMPLETE; 4661 freefrag->ff_jfreefrag = NULL; 4662 } 4663 4664 return (freefrag); 4665 } 4666 4667 /* 4668 * This workitem de-allocates fragments that were replaced during 4669 * file block allocation. 
4670 */ 4671 static void 4672 handle_workitem_freefrag(freefrag) 4673 struct freefrag *freefrag; 4674 { 4675 struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp); 4676 struct workhead wkhd; 4677 4678 /* 4679 * It would be illegal to add new completion items to the 4680 * freefrag after it was schedule to be done so it must be 4681 * safe to modify the list head here. 4682 */ 4683 LIST_INIT(&wkhd); 4684 LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list); 4685 ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno, 4686 freefrag->ff_fragsize, freefrag->ff_inum, &wkhd); 4687 ACQUIRE_LOCK(&lk); 4688 WORKITEM_FREE(freefrag, D_FREEFRAG); 4689 FREE_LOCK(&lk); 4690 } 4691 4692 /* 4693 * Set up a dependency structure for an external attributes data block. 4694 * This routine follows much of the structure of softdep_setup_allocdirect. 4695 * See the description of softdep_setup_allocdirect above for details. 4696 */ 4697 void 4698 softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp) 4699 struct inode *ip; 4700 ufs_lbn_t off; 4701 ufs2_daddr_t newblkno; 4702 ufs2_daddr_t oldblkno; 4703 long newsize; 4704 long oldsize; 4705 struct buf *bp; 4706 { 4707 struct allocdirect *adp, *oldadp; 4708 struct allocdirectlst *adphead; 4709 struct freefrag *freefrag; 4710 struct inodedep *inodedep; 4711 struct jnewblk *jnewblk; 4712 struct newblk *newblk; 4713 struct mount *mp; 4714 ufs_lbn_t lbn; 4715 4716 if (off >= NXADDR) 4717 panic("softdep_setup_allocext: lbn %lld > NXADDR", 4718 (long long)off); 4719 4720 lbn = bp->b_lblkno; 4721 mp = UFSTOVFS(ip->i_ump); 4722 if (oldblkno && oldblkno != newblkno) 4723 freefrag = newfreefrag(ip, oldblkno, oldsize, lbn); 4724 else 4725 freefrag = NULL; 4726 4727 ACQUIRE_LOCK(&lk); 4728 if (newblk_lookup(mp, newblkno, 0, &newblk) == 0) 4729 panic("softdep_setup_allocext: lost block"); 4730 KASSERT(newblk->nb_list.wk_type == D_NEWBLK, 4731 ("softdep_setup_allocext: newblk already initialized")); 4732 /* 4733 * 
Convert the newblk to an allocdirect. 4734 */ 4735 newblk->nb_list.wk_type = D_ALLOCDIRECT; 4736 adp = (struct allocdirect *)newblk; 4737 newblk->nb_freefrag = freefrag; 4738 adp->ad_offset = off; 4739 adp->ad_oldblkno = oldblkno; 4740 adp->ad_newsize = newsize; 4741 adp->ad_oldsize = oldsize; 4742 adp->ad_state |= EXTDATA; 4743 4744 /* 4745 * Finish initializing the journal. 4746 */ 4747 if ((jnewblk = newblk->nb_jnewblk) != NULL) { 4748 jnewblk->jn_ino = ip->i_number; 4749 jnewblk->jn_lbn = lbn; 4750 add_to_journal(&jnewblk->jn_list); 4751 } 4752 if (freefrag && freefrag->ff_jfreefrag != NULL) 4753 add_to_journal(&freefrag->ff_jfreefrag->fr_list); 4754 inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep); 4755 adp->ad_inodedep = inodedep; 4756 4757 WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list); 4758 /* 4759 * The list of allocdirects must be kept in sorted and ascending 4760 * order so that the rollback routines can quickly determine the 4761 * first uncommitted block (the size of the file stored on disk 4762 * ends at the end of the lowest committed fragment, or if there 4763 * are no fragments, at the end of the highest committed block). 4764 * Since files generally grow, the typical case is that the new 4765 * block is to be added at the end of the list. We speed this 4766 * special case by checking against the last allocdirect in the 4767 * list before laboriously traversing the list looking for the 4768 * insertion point. 
4769 */ 4770 adphead = &inodedep->id_newextupdt; 4771 oldadp = TAILQ_LAST(adphead, allocdirectlst); 4772 if (oldadp == NULL || oldadp->ad_offset <= off) { 4773 /* insert at end of list */ 4774 TAILQ_INSERT_TAIL(adphead, adp, ad_next); 4775 if (oldadp != NULL && oldadp->ad_offset == off) 4776 allocdirect_merge(adphead, adp, oldadp); 4777 FREE_LOCK(&lk); 4778 return; 4779 } 4780 TAILQ_FOREACH(oldadp, adphead, ad_next) { 4781 if (oldadp->ad_offset >= off) 4782 break; 4783 } 4784 if (oldadp == NULL) 4785 panic("softdep_setup_allocext: lost entry"); 4786 /* insert in middle of list */ 4787 TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); 4788 if (oldadp->ad_offset == off) 4789 allocdirect_merge(adphead, adp, oldadp); 4790 FREE_LOCK(&lk); 4791 } 4792 4793 /* 4794 * Indirect block allocation dependencies. 4795 * 4796 * The same dependencies that exist for a direct block also exist when 4797 * a new block is allocated and pointed to by an entry in a block of 4798 * indirect pointers. The undo/redo states described above are also 4799 * used here. Because an indirect block contains many pointers that 4800 * may have dependencies, a second copy of the entire in-memory indirect 4801 * block is kept. The buffer cache copy is always completely up-to-date. 4802 * The second copy, which is used only as a source for disk writes, 4803 * contains only the safe pointers (i.e., those that have no remaining 4804 * update dependencies). The second copy is freed when all pointers 4805 * are safe. The cache is not allowed to replace indirect blocks with 4806 * pending update dependencies. If a buffer containing an indirect 4807 * block with dependencies is written, these routines will mark it 4808 * dirty again. It can only be successfully written once all the 4809 * dependencies are removed. The ffs_fsync routine in conjunction with 4810 * softdep_sync_metadata work together to get all the dependencies 4811 * removed so that a file can be successfully written to disk. 
Three 4812 * procedures are used when setting up indirect block pointer 4813 * dependencies. The division is necessary because of the organization 4814 * of the "balloc" routine and because of the distinction between file 4815 * pages and file metadata blocks. 4816 */ 4817 4818 /* 4819 * Allocate a new allocindir structure. 4820 */ 4821 static struct allocindir * 4822 newallocindir(ip, ptrno, newblkno, oldblkno, lbn) 4823 struct inode *ip; /* inode for file being extended */ 4824 int ptrno; /* offset of pointer in indirect block */ 4825 ufs2_daddr_t newblkno; /* disk block number being added */ 4826 ufs2_daddr_t oldblkno; /* previous block number, 0 if none */ 4827 ufs_lbn_t lbn; 4828 { 4829 struct newblk *newblk; 4830 struct allocindir *aip; 4831 struct freefrag *freefrag; 4832 struct jnewblk *jnewblk; 4833 4834 if (oldblkno) 4835 freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize, lbn); 4836 else 4837 freefrag = NULL; 4838 ACQUIRE_LOCK(&lk); 4839 if (newblk_lookup(UFSTOVFS(ip->i_ump), newblkno, 0, &newblk) == 0) 4840 panic("new_allocindir: lost block"); 4841 KASSERT(newblk->nb_list.wk_type == D_NEWBLK, 4842 ("newallocindir: newblk already initialized")); 4843 newblk->nb_list.wk_type = D_ALLOCINDIR; 4844 newblk->nb_freefrag = freefrag; 4845 aip = (struct allocindir *)newblk; 4846 aip->ai_offset = ptrno; 4847 aip->ai_oldblkno = oldblkno; 4848 if ((jnewblk = newblk->nb_jnewblk) != NULL) { 4849 jnewblk->jn_ino = ip->i_number; 4850 jnewblk->jn_lbn = lbn; 4851 add_to_journal(&jnewblk->jn_list); 4852 } 4853 if (freefrag && freefrag->ff_jfreefrag != NULL) 4854 add_to_journal(&freefrag->ff_jfreefrag->fr_list); 4855 return (aip); 4856 } 4857 4858 /* 4859 * Called just before setting an indirect block pointer 4860 * to a newly allocated file page. 
4861 */ 4862 void 4863 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) 4864 struct inode *ip; /* inode for file being extended */ 4865 ufs_lbn_t lbn; /* allocated block number within file */ 4866 struct buf *bp; /* buffer with indirect blk referencing page */ 4867 int ptrno; /* offset of pointer in indirect block */ 4868 ufs2_daddr_t newblkno; /* disk block number being added */ 4869 ufs2_daddr_t oldblkno; /* previous block number, 0 if none */ 4870 struct buf *nbp; /* buffer holding allocated page */ 4871 { 4872 struct inodedep *inodedep; 4873 struct allocindir *aip; 4874 struct pagedep *pagedep; 4875 struct mount *mp; 4876 4877 if (lbn != nbp->b_lblkno) 4878 panic("softdep_setup_allocindir_page: lbn %jd != lblkno %jd", 4879 lbn, bp->b_lblkno); 4880 ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page"); 4881 mp = UFSTOVFS(ip->i_ump); 4882 aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn); 4883 (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); 4884 /* 4885 * If we are allocating a directory page, then we must 4886 * allocate an associated pagedep to track additions and 4887 * deletions. 4888 */ 4889 if ((ip->i_mode & IFMT) == IFDIR && 4890 pagedep_lookup(mp, ip->i_number, lbn, DEPALLOC, &pagedep) == 0) 4891 WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list); 4892 WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list); 4893 setup_allocindir_phase2(bp, ip, inodedep, aip, lbn); 4894 FREE_LOCK(&lk); 4895 } 4896 4897 /* 4898 * Called just before setting an indirect block pointer to a 4899 * newly allocated indirect block. 
4900 */ 4901 void 4902 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno) 4903 struct buf *nbp; /* newly allocated indirect block */ 4904 struct inode *ip; /* inode for file being extended */ 4905 struct buf *bp; /* indirect block referencing allocated block */ 4906 int ptrno; /* offset of pointer in indirect block */ 4907 ufs2_daddr_t newblkno; /* disk block number being added */ 4908 { 4909 struct inodedep *inodedep; 4910 struct allocindir *aip; 4911 ufs_lbn_t lbn; 4912 4913 lbn = nbp->b_lblkno; 4914 ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta"); 4915 aip = newallocindir(ip, ptrno, newblkno, 0, lbn); 4916 inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, &inodedep); 4917 WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list); 4918 setup_allocindir_phase2(bp, ip, inodedep, aip, lbn); 4919 FREE_LOCK(&lk); 4920 } 4921 4922 static void 4923 indirdep_complete(indirdep) 4924 struct indirdep *indirdep; 4925 { 4926 struct allocindir *aip; 4927 4928 LIST_REMOVE(indirdep, ir_next); 4929 indirdep->ir_state &= ~ONDEPLIST; 4930 4931 while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) { 4932 LIST_REMOVE(aip, ai_next); 4933 free_newblk(&aip->ai_block); 4934 } 4935 /* 4936 * If this indirdep is not attached to a buf it was simply waiting 4937 * on completion to clear completehd. free_indirdep() asserts 4938 * that nothing is dangling. 4939 */ 4940 if ((indirdep->ir_state & ONWORKLIST) == 0) 4941 free_indirdep(indirdep); 4942 } 4943 4944 /* 4945 * Called to finish the allocation of the "aip" allocated 4946 * by one of the two routines above. 4947 */ 4948 static void 4949 setup_allocindir_phase2(bp, ip, inodedep, aip, lbn) 4950 struct buf *bp; /* in-memory copy of the indirect block */ 4951 struct inode *ip; /* inode for file being extended */ 4952 struct inodedep *inodedep; /* Inodedep for ip */ 4953 struct allocindir *aip; /* allocindir allocated by the above routines */ 4954 ufs_lbn_t lbn; /* Logical block number for this block. 
*/ 4955 { 4956 struct worklist *wk; 4957 struct fs *fs; 4958 struct newblk *newblk; 4959 struct indirdep *indirdep, *newindirdep; 4960 struct allocindir *oldaip; 4961 struct freefrag *freefrag; 4962 struct mount *mp; 4963 ufs2_daddr_t blkno; 4964 4965 mp = UFSTOVFS(ip->i_ump); 4966 fs = ip->i_fs; 4967 mtx_assert(&lk, MA_OWNED); 4968 if (bp->b_lblkno >= 0) 4969 panic("setup_allocindir_phase2: not indir blk"); 4970 for (freefrag = NULL, indirdep = NULL, newindirdep = NULL; ; ) { 4971 LIST_FOREACH(wk, &bp->b_dep, wk_list) { 4972 if (wk->wk_type != D_INDIRDEP) 4973 continue; 4974 indirdep = WK_INDIRDEP(wk); 4975 break; 4976 } 4977 if (indirdep == NULL && newindirdep) { 4978 indirdep = newindirdep; 4979 newindirdep = NULL; 4980 WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list); 4981 if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, 4982 &newblk)) { 4983 indirdep->ir_state |= ONDEPLIST; 4984 LIST_INSERT_HEAD(&newblk->nb_indirdeps, 4985 indirdep, ir_next); 4986 } else 4987 indirdep->ir_state |= DEPCOMPLETE; 4988 } 4989 if (indirdep) { 4990 aip->ai_indirdep = indirdep; 4991 /* 4992 * Check to see if there is an existing dependency 4993 * for this block. If there is, merge the old 4994 * dependency into the new one. This happens 4995 * as a result of reallocblk only. 
4996 */ 4997 if (aip->ai_oldblkno == 0) 4998 oldaip = NULL; 4999 else 5000 5001 LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, 5002 ai_next) 5003 if (oldaip->ai_offset == aip->ai_offset) 5004 break; 5005 if (oldaip != NULL) 5006 freefrag = allocindir_merge(aip, oldaip); 5007 LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next); 5008 KASSERT(aip->ai_offset >= 0 && 5009 aip->ai_offset < NINDIR(ip->i_ump->um_fs), 5010 ("setup_allocindir_phase2: Bad offset %d", 5011 aip->ai_offset)); 5012 KASSERT(indirdep->ir_savebp != NULL, 5013 ("setup_allocindir_phase2 NULL ir_savebp")); 5014 if (ip->i_ump->um_fstype == UFS1) 5015 ((ufs1_daddr_t *)indirdep->ir_savebp->b_data) 5016 [aip->ai_offset] = aip->ai_oldblkno; 5017 else 5018 ((ufs2_daddr_t *)indirdep->ir_savebp->b_data) 5019 [aip->ai_offset] = aip->ai_oldblkno; 5020 FREE_LOCK(&lk); 5021 if (freefrag != NULL) 5022 handle_workitem_freefrag(freefrag); 5023 } else 5024 FREE_LOCK(&lk); 5025 if (newindirdep) { 5026 newindirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE; 5027 brelse(newindirdep->ir_savebp); 5028 ACQUIRE_LOCK(&lk); 5029 WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP); 5030 if (indirdep) 5031 break; 5032 FREE_LOCK(&lk); 5033 } 5034 if (indirdep) { 5035 ACQUIRE_LOCK(&lk); 5036 break; 5037 } 5038 newindirdep = malloc(sizeof(struct indirdep), 5039 M_INDIRDEP, M_SOFTDEP_FLAGS); 5040 workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp); 5041 newindirdep->ir_state = ATTACHED; 5042 if (ip->i_ump->um_fstype == UFS1) 5043 newindirdep->ir_state |= UFS1FMT; 5044 newindirdep->ir_saveddata = NULL; 5045 LIST_INIT(&newindirdep->ir_deplisthd); 5046 LIST_INIT(&newindirdep->ir_donehd); 5047 LIST_INIT(&newindirdep->ir_writehd); 5048 LIST_INIT(&newindirdep->ir_completehd); 5049 LIST_INIT(&newindirdep->ir_jwork); 5050 if (bp->b_blkno == bp->b_lblkno) { 5051 ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp, 5052 NULL, NULL); 5053 bp->b_blkno = blkno; 5054 } 5055 newindirdep->ir_savebp = 5056 getblk(ip->i_devvp, bp->b_blkno, 
bp->b_bcount, 0, 0, 0); 5057 BUF_KERNPROC(newindirdep->ir_savebp); 5058 bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount); 5059 ACQUIRE_LOCK(&lk); 5060 } 5061 } 5062 5063 /* 5064 * Merge two allocindirs which refer to the same block. Move newblock 5065 * dependencies and setup the freefrags appropriately. 5066 */ 5067 static struct freefrag * 5068 allocindir_merge(aip, oldaip) 5069 struct allocindir *aip; 5070 struct allocindir *oldaip; 5071 { 5072 struct newdirblk *newdirblk; 5073 struct freefrag *freefrag; 5074 struct worklist *wk; 5075 5076 if (oldaip->ai_newblkno != aip->ai_oldblkno) 5077 panic("allocindir_merge: blkno"); 5078 aip->ai_oldblkno = oldaip->ai_oldblkno; 5079 freefrag = aip->ai_freefrag; 5080 aip->ai_freefrag = oldaip->ai_freefrag; 5081 oldaip->ai_freefrag = NULL; 5082 KASSERT(freefrag != NULL, ("setup_allocindir_phase2: No freefrag")); 5083 /* 5084 * If we are tracking a new directory-block allocation, 5085 * move it from the old allocindir to the new allocindir. 5086 */ 5087 if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) { 5088 newdirblk = WK_NEWDIRBLK(wk); 5089 WORKLIST_REMOVE(&newdirblk->db_list); 5090 if (!LIST_EMPTY(&oldaip->ai_newdirblk)) 5091 panic("allocindir_merge: extra newdirblk"); 5092 WORKLIST_INSERT(&aip->ai_newdirblk, &newdirblk->db_list); 5093 } 5094 /* 5095 * We can skip journaling for this freefrag and just complete 5096 * any pending journal work for the allocindir that is being 5097 * removed after the freefrag completes. 5098 */ 5099 if (freefrag->ff_jfreefrag) 5100 cancel_jfreefrag(freefrag->ff_jfreefrag); 5101 LIST_REMOVE(oldaip, ai_next); 5102 cancel_newblk(&oldaip->ai_block, &freefrag->ff_jwork); 5103 free_newblk(&oldaip->ai_block); 5104 5105 return (freefrag); 5106 } 5107 5108 /* 5109 * Block de-allocation dependencies. 5110 * 5111 * When blocks are de-allocated, the on-disk pointers must be nullified before 5112 * the blocks are made available for use by other files. 
(The true 5113 * requirement is that old pointers must be nullified before new on-disk 5114 * pointers are set. We chose this slightly more stringent requirement to 5115 * reduce complexity.) Our implementation handles this dependency by updating 5116 * the inode (or indirect block) appropriately but delaying the actual block 5117 * de-allocation (i.e., freemap and free space count manipulation) until 5118 * after the updated versions reach stable storage. After the disk is 5119 * updated, the blocks can be safely de-allocated whenever it is convenient. 5120 * This implementation handles only the common case of reducing a file's 5121 * length to zero. Other cases are handled by the conventional synchronous 5122 * write approach. 5123 * 5124 * The ffs implementation with which we worked double-checks 5125 * the state of the block pointers and file size as it reduces 5126 * a file's length. Some of this code is replicated here in our 5127 * soft updates implementation. The freeblks->fb_chkcnt field is 5128 * used to transfer a part of this information to the procedure 5129 * that eventually de-allocates the blocks. 5130 * 5131 * This routine should be called from the routine that shortens 5132 * a file's length, before the inode's size or block pointers 5133 * are modified. It will save the block pointer information for 5134 * later release and zero the inode so that the calling routine 5135 * can release it. 
 */
void
softdep_setup_freeblocks(ip, length, flags)
	struct inode *ip;	/* The inode whose length is to be reduced */
	off_t length;		/* The new length for the file */
	int flags;		/* IO_EXT and/or IO_NORMAL */
{
	struct ufs1_dinode *dp1;
	struct ufs2_dinode *dp2;
	struct freeblks *freeblks;
	struct inodedep *inodedep;
	struct allocdirect *adp;
	struct jfreeblk *jfreeblk;
	struct bufobj *bo;
	struct vnode *vp;
	struct buf *bp;
	struct fs *fs;
	ufs2_daddr_t extblocks, datablocks;
	struct mount *mp;
	int i, delay, error;
	ufs2_daddr_t blkno;
	ufs_lbn_t tmpval;
	ufs_lbn_t lbn;
	long oldextsize;
	long oldsize;
	int frags;
	int needj;

	fs = ip->i_fs;
	mp = UFSTOVFS(ip->i_ump);
	/* Only full truncation to zero is supported here; see block comment. */
	if (length != 0)
		panic("softdep_setup_freeblocks: non-zero length");
	freeblks = malloc(sizeof(struct freeblks),
		M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
	workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp);
	LIST_INIT(&freeblks->fb_jfreeblkhd);
	LIST_INIT(&freeblks->fb_jwork);
	freeblks->fb_state = ATTACHED;
	freeblks->fb_uid = ip->i_uid;
	freeblks->fb_previousinum = ip->i_number;
	freeblks->fb_devvp = ip->i_devvp;
	freeblks->fb_chkcnt = 0;
	ACQUIRE_LOCK(&lk);
	/*
	 * If we're truncating a removed file that will never be written
	 * we don't need to journal the block frees.  The canceled journals
	 * for the allocations will suffice.
	 */
	inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
	if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED ||
	    (fs->fs_flags & FS_SUJ) == 0)
		needj = 0;
	else
		needj = 1;
	num_freeblkdep++;
	FREE_LOCK(&lk);
	extblocks = 0;
	if (fs->fs_magic == FS_UFS2_MAGIC)
		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
	datablocks = DIP(ip, i_blocks) - extblocks;
	/*
	 * Zero the file size and block pointers in the in-core inode,
	 * recording each freed block in a freework attached to freeblks.
	 */
	if ((flags & IO_NORMAL) != 0) {
		oldsize = ip->i_size;
		ip->i_size = 0;
		DIP_SET(ip, i_size, 0);
		freeblks->fb_chkcnt = datablocks;
		for (i = 0; i < NDADDR; i++) {
			blkno = DIP(ip, i_db[i]);
			DIP_SET(ip, i_db[i], 0);
			if (blkno == 0)
				continue;
			/* Last direct block may be a fragment. */
			frags = sblksize(fs, oldsize, i);
			frags = numfrags(fs, frags);
			newfreework(ip->i_ump, freeblks, NULL, i, blkno, frags,
			    needj);
		}
		/* Indirect blocks use negative lbns; tmpval spans a level. */
		for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR;
		    i++, tmpval *= NINDIR(fs)) {
			blkno = DIP(ip, i_ib[i]);
			DIP_SET(ip, i_ib[i], 0);
			if (blkno)
				newfreework(ip->i_ump, freeblks, NULL, -lbn - i,
				    blkno, fs->fs_frag, needj);
			lbn += tmpval;
		}
		UFS_LOCK(ip->i_ump);
		fs->fs_pendingblocks += datablocks;
		UFS_UNLOCK(ip->i_ump);
	}
	/* Extended attribute area (UFS2 only) uses lbns -1..-NXADDR. */
	if ((flags & IO_EXT) != 0) {
		oldextsize = ip->i_din2->di_extsize;
		ip->i_din2->di_extsize = 0;
		freeblks->fb_chkcnt += extblocks;
		for (i = 0; i < NXADDR; i++) {
			blkno = ip->i_din2->di_extb[i];
			ip->i_din2->di_extb[i] = 0;
			if (blkno == 0)
				continue;
			frags = sblksize(fs, oldextsize, i);
			frags = numfrags(fs, frags);
			newfreework(ip->i_ump, freeblks, NULL, -1 - i, blkno,
			    frags, needj);
		}
	}
	if (LIST_EMPTY(&freeblks->fb_jfreeblkhd))
		needj = 0;
	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - freeblks->fb_chkcnt);
	/*
	 * Push the zero'ed inode to its disk buffer so that we are free
	 * to delete its dependencies below.  Once the dependencies are gone
	 * the buffer can be safely released.
	 */
	if ((error = bread(ip->i_devvp,
	    fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
	    (int)fs->fs_bsize, NOCRED, &bp)) != 0) {
		/*
		 * NOTE(review): on error we brelse(bp) yet continue to use
		 * bp below; presumably softdep_error() never returns here —
		 * confirm before relying on this path.
		 */
		brelse(bp);
		softdep_error("softdep_setup_freeblocks", error);
	}
	if (ip->i_ump->um_fstype == UFS1) {
		dp1 = ((struct ufs1_dinode *)bp->b_data +
		    ino_to_fsbo(fs, ip->i_number));
		ip->i_din1->di_freelink = dp1->di_freelink;
		*dp1 = *ip->i_din1;
	} else {
		dp2 = ((struct ufs2_dinode *)bp->b_data +
		    ino_to_fsbo(fs, ip->i_number));
		ip->i_din2->di_freelink = dp2->di_freelink;
		*dp2 = *ip->i_din2;
	}
	/*
	 * Find and eliminate any inode dependencies.
	 */
	ACQUIRE_LOCK(&lk);
	(void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
	if ((inodedep->id_state & IOSTARTED) != 0)
		panic("softdep_setup_freeblocks: inode busy");
	/*
	 * Add the freeblks structure to the list of operations that
	 * must await the zero'ed inode being written to disk. If we
	 * still have a bitmap dependency (delay == 0), then the inode
	 * has never been written to disk, so we can process the
	 * freeblks below once we have deleted the dependencies.
	 */
	delay = (inodedep->id_state & DEPCOMPLETE);
	if (delay)
		WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
	else if (needj)
		freeblks->fb_state |= COMPLETE;
	/*
	 * Because the file length has been truncated to zero, any
	 * pending block allocation dependency structures associated
	 * with this inode are obsolete and can simply be de-allocated.
	 * We must first merge the two dependency lists to get rid of
	 * any duplicate freefrag structures, then purge the merged list.
	 * If we still have a bitmap dependency, then the inode has never
	 * been written to disk, so we can free any fragments without delay.
	 */
	if (flags & IO_NORMAL) {
		merge_inode_lists(&inodedep->id_newinoupdt,
		    &inodedep->id_inoupdt);
		while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
			cancel_allocdirect(&inodedep->id_inoupdt, adp,
			    freeblks, delay);
	}
	if (flags & IO_EXT) {
		merge_inode_lists(&inodedep->id_newextupdt,
		    &inodedep->id_extupdt);
		while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
			cancel_allocdirect(&inodedep->id_extupdt, adp,
			    freeblks, delay);
	}
	LIST_FOREACH(jfreeblk, &freeblks->fb_jfreeblkhd, jf_deps)
		add_to_journal(&jfreeblk->jf_list);

	FREE_LOCK(&lk);
	bdwrite(bp);
	/*
	 * We must wait for any I/O in progress to finish so that
	 * all potential buffers on the dirty list will be visible.
	 * Once they are all there, walk the list and get rid of
	 * any dependencies.
	 */
	vp = ITOV(ip);
	bo = &vp->v_bufobj;
	BO_LOCK(bo);
	drain_output(vp);
restart:
	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
		/* Only visit buffers matching the requested data area(s). */
		if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) ||
		    ((flags & IO_NORMAL) == 0 &&
		    (bp->b_xflags & BX_ALTDATA) == 0))
			continue;
		if ((bp = getdirtybuf(bp, BO_MTX(bo), MNT_WAIT)) == NULL)
			goto restart;
		BO_UNLOCK(bo);
		ACQUIRE_LOCK(&lk);
		(void) inodedep_lookup(mp, ip->i_number, 0, &inodedep);
		if (deallocate_dependencies(bp, inodedep, freeblks))
			bp->b_flags |= B_INVAL | B_NOCACHE;
		FREE_LOCK(&lk);
		brelse(bp);
		BO_LOCK(bo);
		/* The list may have changed while unlocked; rescan. */
		goto restart;
	}
	BO_UNLOCK(bo);
	ACQUIRE_LOCK(&lk);
	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
		(void) free_inodedep(inodedep);

	if (delay || needj)
		freeblks->fb_state |= DEPCOMPLETE;
	if (delay) {
		/*
		 * If the inode with zeroed block pointers is now on disk
		 * we can start freeing blocks.  Add freeblks to the worklist
		 * instead of calling handle_workitem_freeblocks directly as
		 * it is more likely that additional IO is needed to complete
		 * the request here than in the !delay case.
		 */
		if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
			add_to_worklist(&freeblks->fb_list, 1);
	}
	if (needj && LIST_EMPTY(&freeblks->fb_jfreeblkhd))
		needj = 0;

	FREE_LOCK(&lk);
	/*
	 * If the inode has never been written to disk (delay == 0) and
	 * we're not waiting on any journal writes, then we can process the
	 * freeblks now that we have deleted the dependencies.
	 */
	if (!delay && !needj)
		handle_workitem_freeblocks(freeblks, 0);
}

/*
 * Reclaim any dependency structures from a buffer that is about to
 * be reallocated to a new vnode. The buffer must be locked, thus,
 * no I/O completion operations can occur while we are manipulating
 * its associated dependencies. The mutex is held so that other I/O's
 * associated with related dependencies do not occur.  Returns 1 if
 * all dependencies were cleared, 0 otherwise.
 */
static int
deallocate_dependencies(bp, inodedep, freeblks)
	struct buf *bp;			/* locked buffer being invalidated */
	struct inodedep *inodedep;	/* may be NULL if already reclaimed */
	struct freeblks *freeblks;	/* truncation this work belongs to */
{
	struct worklist *wk;
	struct indirdep *indirdep;
	struct newdirblk *newdirblk;
	struct allocindir *aip;
	struct pagedep *pagedep;
	struct jremref *jremref;
	struct jmvref *jmvref;
	struct dirrem *dirrem;
	int i;

	mtx_assert(&lk, MA_OWNED);
	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
		switch (wk->wk_type) {

		case D_INDIRDEP:
			indirdep = WK_INDIRDEP(wk);
			if (bp->b_lblkno >= 0 ||
			    bp->b_blkno != indirdep->ir_savebp->b_lblkno)
				panic("deallocate_dependencies: not indir");
			cancel_indirdep(indirdep, bp, inodedep, freeblks);
			continue;

		case D_PAGEDEP:
			pagedep = WK_PAGEDEP(wk);
			/*
			 * There should be no directory add dependencies present
			 * as the directory could not be truncated until all
			 * children were removed.
			 */
			KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL,
			    ("deallocate_dependencies: pendinghd != NULL"));
			for (i = 0; i < DAHASHSZ; i++)
				KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL,
				    ("deallocate_dependencies: diraddhd != NULL"));
			/*
			 * Copy any directory remove dependencies to the list
			 * to be processed after the zero'ed inode is written.
			 * If the inode has already been written, then they
			 * can be dumped directly onto the work list.
			 *
			 * NOTE(review): this loop unlinks the current element
			 * (LIST_REMOVE) while iterating with LIST_FOREACH
			 * rather than LIST_FOREACH_SAFE; it relies on
			 * LIST_REMOVE leaving dm_next intact — verify under
			 * QUEUE_MACRO_DEBUG/trashing kernels.
			 */
			LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
				/*
				 * If there are any dirrems we wait for
				 * the journal write to complete and
				 * then restart the buf scan as the lock
				 * has been dropped.  Returning 0 tells the
				 * caller the buffer was not fully cleared.
				 */
				while ((jremref =
				    LIST_FIRST(&dirrem->dm_jremrefhd))
				    != NULL) {
					stat_jwait_filepage++;
					jwait(&jremref->jr_list);
					return (0);
				}
				LIST_REMOVE(dirrem, dm_next);
				dirrem->dm_dirinum = pagedep->pd_ino;
				if (inodedep == NULL ||
				    (inodedep->id_state & ALLCOMPLETE) ==
				    ALLCOMPLETE) {
					dirrem->dm_state |= COMPLETE;
					add_to_worklist(&dirrem->dm_list, 0);
				} else
					WORKLIST_INSERT(&inodedep->id_bufwait,
					    &dirrem->dm_list);
			}
			if ((pagedep->pd_state & NEWBLOCK) != 0) {
				newdirblk = pagedep->pd_newdirblk;
				WORKLIST_REMOVE(&newdirblk->db_list);
				free_newdirblk(newdirblk);
			}
			/* Pending rename journal entries also force a rescan. */
			while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd))
			    != NULL) {
				stat_jwait_filepage++;
				jwait(&jmvref->jm_list);
				return (0);
			}
			WORKLIST_REMOVE(&pagedep->pd_list);
			LIST_REMOVE(pagedep, pd_hash);
			WORKITEM_FREE(pagedep, D_PAGEDEP);
			continue;

		case D_ALLOCINDIR:
			aip = WK_ALLOCINDIR(wk);
			cancel_allocindir(aip, inodedep, freeblks);
			continue;

		case D_ALLOCDIRECT:
		case D_INODEDEP:
			panic("deallocate_dependencies: Unexpected type %s",
			    TYPENAME(wk->wk_type));
			/* NOTREACHED */

		default:
			panic("deallocate_dependencies: Unknown type %s",
			    TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
	}

	return (1);
}

/*
 * An allocdirect is being canceled due to a truncate.  We must make sure
 * the journal entry is released in concert with the blkfree that releases
 * the storage.  Completed journal entries must not be released until the
 * space is no longer pointed to by the inode or in the bitmap.
 */
static void
cancel_allocdirect(adphead, adp, freeblks, delay)
	struct allocdirectlst *adphead;	/* list adp is removed from */
	struct allocdirect *adp;	/* allocation being canceled */
	struct freeblks *freeblks;	/* truncation owning the free */
	int delay;			/* nonzero: hold newblk on id_bufwait */
{
	struct freework *freework;
	struct newblk *newblk;
	struct worklist *wk;
	ufs_lbn_t lbn;

	TAILQ_REMOVE(adphead, adp, ad_next);
	newblk = (struct newblk *)adp;
	/*
	 * If the journal hasn't been written the jnewblk must be passed
	 * to the call to ffs_blkfree that reclaims the space.  We accomplish
	 * this by linking the journal dependency into the freework to be
	 * freed when freework_freeblock() is called.  If the journal has
	 * been written we can simply reclaim the journal space when the
	 * freeblks work is complete.
	 */
	if (newblk->nb_jnewblk == NULL) {
		cancel_newblk(newblk, &freeblks->fb_jwork);
		goto found;
	}
	lbn = newblk->nb_jnewblk->jn_lbn;
	/*
	 * Find the correct freework structure so it releases the canceled
	 * journal when the bitmap is cleared.  This preserves rollback
	 * until the allocation is reverted.
	 */
	LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) {
		freework = WK_FREEWORK(wk);
		if (freework->fw_lbn != lbn)
			continue;
		cancel_newblk(newblk, &freework->fw_jwork);
		goto found;
	}
	panic("cancel_allocdirect: Freework not found for lbn %jd\n", lbn);
found:
	if (delay)
		WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
		    &newblk->nb_list);
	else
		free_newblk(newblk);
	return;
}


/*
 * Cancel a newblk whose allocation is being reverted.  Detach it from
 * any dependency and work lists it is on, recursively discard any
 * allocindirs recorded against its indirect-block dependencies, and
 * move its unwritten journal work onto "wkhd" so the caller can release
 * it together with the block itself.  Called with lk held.
 */
static void
cancel_newblk(newblk, wkhd)
	struct newblk *newblk;
	struct workhead *wkhd;	/* receives canceled journal work */
{
	struct indirdep *indirdep;
	struct allocindir *aip;

	while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL) {
		indirdep->ir_state &= ~ONDEPLIST;
		LIST_REMOVE(indirdep, ir_next);
		/*
		 * If an indirdep is not on the buf worklist we need to
		 * free it here as deallocate_dependencies() will never
		 * find it.  These pointers were never visible on disk and
		 * can be discarded immediately.
		 */
		while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) {
			LIST_REMOVE(aip, ai_next);
			cancel_newblk(&aip->ai_block, wkhd);
			free_newblk(&aip->ai_block);
		}
		/*
		 * If this indirdep is not attached to a buf it was simply
		 * waiting on completion to clear completehd.  free_indirdep()
		 * asserts that nothing is dangling.
		 */
		if ((indirdep->ir_state & ONWORKLIST) == 0)
			free_indirdep(indirdep);
	}
	if (newblk->nb_state & ONDEPLIST) {
		newblk->nb_state &= ~ONDEPLIST;
		LIST_REMOVE(newblk, nb_deps);
	}
	if (newblk->nb_state & ONWORKLIST)
		WORKLIST_REMOVE(&newblk->nb_list);
	/*
	 * If the journal entry hasn't been written we hold onto the dep
	 * until it is safe to free along with the other journal work.
	 */
	if (newblk->nb_jnewblk != NULL) {
		cancel_jnewblk(newblk->nb_jnewblk, wkhd);
		newblk->nb_jnewblk = NULL;
	}
	if (!LIST_EMPTY(&newblk->nb_jwork))
		jwork_move(wkhd, &newblk->nb_jwork);
}

/*
 * Free a newblk.  Generate a new freefrag work request if appropriate.
 * This must be called after the inode pointer and any direct block pointers
 * are valid or fully removed via truncate or frag extension.
 */
static void
free_newblk(newblk)
	struct newblk *newblk;
{
	struct indirdep *indirdep;
	struct newdirblk *newdirblk;
	struct freefrag *freefrag;
	struct worklist *wk;

	mtx_assert(&lk, MA_OWNED);
	if (newblk->nb_state & ONDEPLIST)
		LIST_REMOVE(newblk, nb_deps);
	if (newblk->nb_state & ONWORKLIST)
		WORKLIST_REMOVE(&newblk->nb_list);
	LIST_REMOVE(newblk, nb_hash);
	/* A freefrag waiting on this allocation may now be scheduled. */
	if ((freefrag = newblk->nb_freefrag) != NULL) {
		freefrag->ff_state |= COMPLETE;
		if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
			add_to_worklist(&freefrag->ff_list, 0);
	}
	if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL) {
		newdirblk = WK_NEWDIRBLK(wk);
		WORKLIST_REMOVE(&newdirblk->db_list);
		if (!LIST_EMPTY(&newblk->nb_newdirblk))
			panic("free_newblk: extra newdirblk");
		free_newdirblk(newdirblk);
	}
	while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL) {
		indirdep->ir_state |= DEPCOMPLETE;
		indirdep_complete(indirdep);
	}
	KASSERT(newblk->nb_jnewblk == NULL,
	    ("free_newblk; jnewblk %p still attached", newblk->nb_jnewblk));
	handle_jwork(&newblk->nb_jwork);
	newblk->nb_list.wk_type = D_NEWBLK;
	WORKITEM_FREE(newblk, D_NEWBLK);
}

/*
 * Free a newdirblk.  Clear the NEWBLOCK flag on its associated pagedep.
 * This routine must be called with splbio interrupts blocked.
 */
static void
free_newdirblk(newdirblk)
	struct newdirblk *newdirblk;	/* consumed and freed here */
{
	struct pagedep *pagedep;
	struct diradd *dap;
	struct worklist *wk;
	int i;

	mtx_assert(&lk, MA_OWNED);
	/*
	 * If the pagedep is still linked onto the directory buffer
	 * dependency chain, then some of the entries on the
	 * pd_pendinghd list may not be committed to disk yet. In
	 * this case, we will simply clear the NEWBLOCK flag and
	 * let the pd_pendinghd list be processed when the pagedep
	 * is next written. If the pagedep is no longer on the buffer
	 * dependency chain, then all the entries on the pd_pending
	 * list are committed to disk and we can free them here.
	 */
	pagedep = newdirblk->db_pagedep;
	pagedep->pd_state &= ~NEWBLOCK;
	if ((pagedep->pd_state & ONWORKLIST) == 0)
		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
			free_diradd(dap, NULL);
	/*
	 * If no dependencies remain, the pagedep will be freed.
	 */
	for (i = 0; i < DAHASHSZ; i++)
		if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
			break;
	if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0 &&
	    LIST_EMPTY(&pagedep->pd_jmvrefhd)) {
		KASSERT(LIST_FIRST(&pagedep->pd_dirremhd) == NULL,
		    ("free_newdirblk: Freeing non-free pagedep %p", pagedep));
		LIST_REMOVE(pagedep, pd_hash);
		WORKITEM_FREE(pagedep, D_PAGEDEP);
	}
	/* Should only ever be one item in the list. */
	while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) {
		WORKLIST_REMOVE(wk);
		handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
	}
	WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
}

/*
 * Prepare an inode to be freed. The actual free operation is not
 * done until the zero'ed inode has been written to disk.
 */
void
softdep_freefile(pvp, ino, mode)
	struct vnode *pvp;	/* parent directory's vnode */
	ino_t ino;		/* inode number being freed */
	int mode;		/* mode of the freed inode */
{
	struct inode *ip = VTOI(pvp);
	struct inodedep *inodedep;
	struct freefile *freefile;

	/*
	 * This sets up the inode de-allocation dependency.
	 */
	freefile = malloc(sizeof(struct freefile),
		M_FREEFILE, M_SOFTDEP_FLAGS);
	workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount);
	freefile->fx_mode = mode;
	freefile->fx_oldinum = ino;
	freefile->fx_devvp = ip->i_devvp;
	LIST_INIT(&freefile->fx_jwork);
	UFS_LOCK(ip->i_ump);
	ip->i_fs->fs_pendinginodes += 1;
	UFS_UNLOCK(ip->i_ump);

	/*
	 * If the inodedep does not exist, then the zero'ed inode has
	 * been written to disk. If the allocated inode has never been
	 * written to disk, then the on-disk inode is zero'ed. In either
	 * case we can free the file immediately.  If the journal was
	 * canceled before being written the inode will never make it to
	 * disk and we must send the canceled journal entries to
	 * ffs_freefile() to be cleared in conjunction with the bitmap.
	 * Any blocks waiting on the inode to write can be safely freed
	 * here as it will never be written.
	 */
	ACQUIRE_LOCK(&lk);
	inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
	/*
	 * Remove this inode from the unlinked list and set
	 * GOINGAWAY as appropriate to indicate that this inode
	 * will never be written.
	 */
	if (inodedep && inodedep->id_state & UNLINKED) {
		/*
		 * Save the journal work to be freed with the bitmap
		 * before we clear UNLINKED.  Otherwise it can be lost
		 * if the inode block is written.
		 */
		handle_bufwait(inodedep, &freefile->fx_jwork);
		clear_unlinked_inodedep(inodedep);
		/* Re-acquire inodedep as we've dropped lk. */
		inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
		if (inodedep && (inodedep->id_state & DEPCOMPLETE) == 0)
			inodedep->id_state |= GOINGAWAY;
	}
	/* Note: handle_workitem_freefile() is called without lk held. */
	if (inodedep == NULL || check_inode_unwritten(inodedep)) {
		FREE_LOCK(&lk);
		handle_workitem_freefile(freefile);
		return;
	}
	WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
	FREE_LOCK(&lk);
	if (ip->i_number == ino)
		ip->i_flag |= IN_MODIFIED;
}

/*
 * Check to see if an inode has never been written to disk. If
 * so free the inodedep and return success, otherwise return failure.
 * This routine must be called with splbio interrupts blocked.
 *
 * If we still have a bitmap dependency, then the inode has never
 * been written to disk. Drop the dependency as it is no longer
 * necessary since the inode is being deallocated. We set the
 * ALLCOMPLETE flags since the bitmap now properly shows that the
 * inode is not allocated. Even if the inode is actively being
 * written, it has been rolled back to its zero'ed state, so we
 * are ensured that a zero inode is what is on the disk. For short
 * lived files, this change will usually result in removing all the
 * dependencies from the inode so that it can be freed immediately.
 */
static int
check_inode_unwritten(inodedep)
	struct inodedep *inodedep;	/* freed here on success */
{

	mtx_assert(&lk, MA_OWNED);

	/* Any remaining dependency or link adjustment means "written". */
	if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 ||
	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
	    !LIST_EMPTY(&inodedep->id_bufwait) ||
	    !LIST_EMPTY(&inodedep->id_inowait) ||
	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
	    inodedep->id_mkdiradd != NULL ||
	    inodedep->id_nlinkdelta != 0)
		return (0);
	/*
	 * Another process might be in initiate_write_inodeblock_ufs[12]
	 * trying to allocate memory without holding "Softdep Lock".
	 */
	if ((inodedep->id_state & IOSTARTED) != 0 &&
	    inodedep->id_savedino1 == NULL)
		return (0);

	if (inodedep->id_state & ONDEPLIST)
		LIST_REMOVE(inodedep, id_deps);
	inodedep->id_state &= ~ONDEPLIST;
	inodedep->id_state |= ALLCOMPLETE;
	inodedep->id_bmsafemap = NULL;
	if (inodedep->id_state & ONWORKLIST)
		WORKLIST_REMOVE(&inodedep->id_list);
	if (inodedep->id_savedino1 != NULL) {
		free(inodedep->id_savedino1, M_SAVEDINO);
		inodedep->id_savedino1 = NULL;
	}
	if (free_inodedep(inodedep) == 0)
		panic("check_inode_unwritten: busy inode");
	return (1);
}

/*
 * Try to free an inodedep structure. Return 1 if it could be freed.
 */
static int
free_inodedep(inodedep)
	struct inodedep *inodedep;	/* freed here when unreferenced */
{

	mtx_assert(&lk, MA_OWNED);
	/* Refuse while any list, journal ref, or rollback state remains. */
	if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 ||
	    (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
	    !LIST_EMPTY(&inodedep->id_dirremhd) ||
	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
	    !LIST_EMPTY(&inodedep->id_bufwait) ||
	    !LIST_EMPTY(&inodedep->id_inowait) ||
	    !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
	    inodedep->id_mkdiradd != NULL ||
	    inodedep->id_nlinkdelta != 0 ||
	    inodedep->id_savedino1 != NULL)
		return (0);
	if (inodedep->id_state & ONDEPLIST)
		LIST_REMOVE(inodedep, id_deps);
	LIST_REMOVE(inodedep, id_hash);
	WORKITEM_FREE(inodedep, D_INODEDEP);
	num_inodedep -= 1;
	return (1);
}

/*
 * Free the block referenced by a freework structure.  The parent freeblks
 * structure is released and completed when the final cg bitmap reaches
 * the disk.  This routine may be freeing a jnewblk which never made it to
 * disk in which case we do not have to wait as the operation is undone
 * in memory immediately.
 */
static void
freework_freeblock(freework)
	struct freework *freework;
{
	struct freeblks *freeblks;
	struct ufsmount *ump;
	struct workhead wkhd;
	struct fs *fs;
	int complete;
	int pending;
	int bsize;
	int needj;

	freeblks = freework->fw_freeblks;
	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
	fs = ump->um_fs;
	needj = freeblks->fb_list.wk_mp->mnt_kern_flag & MNTK_SUJ;
	complete = 0;
	LIST_INIT(&wkhd);
	/*
	 * If we are canceling an existing jnewblk pass it to the free
	 * routine, otherwise pass the freeblk which will ultimately
	 * release the freeblks.  If we're not journaling, we can just
	 * free the freeblks immediately.
	 */
	if (!LIST_EMPTY(&freework->fw_jwork)) {
		LIST_SWAP(&wkhd, &freework->fw_jwork, worklist, wk_list);
		complete = 1;
	} else if (needj)
		WORKLIST_INSERT_UNLOCKED(&wkhd, &freework->fw_list);
	bsize = lfragtosize(fs, freework->fw_frags);
	pending = btodb(bsize);
	ACQUIRE_LOCK(&lk);
	freeblks->fb_chkcnt -= pending;
	FREE_LOCK(&lk);
	/*
	 * extattr blocks don't show up in pending blocks.  XXX why?
	 */
	if (freework->fw_lbn >= 0 || freework->fw_lbn <= -NDADDR) {
		UFS_LOCK(ump);
		fs->fs_pendingblocks -= pending;
		UFS_UNLOCK(ump);
	}
	ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno,
	    bsize, freeblks->fb_previousinum, &wkhd);
	/* When journaling, completion is driven by the cg write instead. */
	if (complete == 0 && needj)
		return;
	/*
	 * The jnewblk will be discarded and the bits in the map never
	 * made it to disk.  We can immediately free the freeblk.
	 */
	ACQUIRE_LOCK(&lk);
	handle_written_freework(freework);
	FREE_LOCK(&lk);
}

/*
 * Start, continue, or finish the process of freeing an indirect block tree.
 * The free operation may be paused at any point with fw_off containing the
 * offset to restart from.  This enables us to implement some flow control
 * for large truncates which may fan out and generate a huge number of
 * dependencies.
 */
static void
handle_workitem_indirblk(freework)
	struct freework *freework;
{
	struct freeblks *freeblks;
	struct ufsmount *ump;
	struct fs *fs;


	freeblks = freework->fw_freeblks;
	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
	fs = ump->um_fs;
	/* fw_off == NINDIR(fs): every child slot has been processed. */
	if (freework->fw_off == NINDIR(fs))
		freework_freeblock(freework);
	else
		indir_trunc(freework, fsbtodb(fs, freework->fw_blkno),
		    freework->fw_lbn);
}

/*
 * Called when a freework structure attached to a cg buf is written.
The 5933 * ref on either the parent or the freeblks structure is released and 5934 * either may be added to the worklist if it is the final ref. 5935 */ 5936 static void 5937 handle_written_freework(freework) 5938 struct freework *freework; 5939 { 5940 struct freeblks *freeblks; 5941 struct freework *parent; 5942 5943 freeblks = freework->fw_freeblks; 5944 parent = freework->fw_parent; 5945 if (parent) { 5946 if (--parent->fw_ref != 0) 5947 parent = NULL; 5948 freeblks = NULL; 5949 } else if (--freeblks->fb_ref != 0) 5950 freeblks = NULL; 5951 WORKITEM_FREE(freework, D_FREEWORK); 5952 /* 5953 * Don't delay these block frees or it takes an intolerable amount 5954 * of time to process truncates and free their journal entries. 5955 */ 5956 if (freeblks) 5957 add_to_worklist(&freeblks->fb_list, 1); 5958 if (parent) 5959 add_to_worklist(&parent->fw_list, 1); 5960 } 5961 5962 /* 5963 * This workitem routine performs the block de-allocation. 5964 * The workitem is added to the pending list after the updated 5965 * inode block has been written to disk. As mentioned above, 5966 * checks regarding the number of blocks de-allocated (compared 5967 * to the number of blocks allocated for the file) are also 5968 * performed in this function. 
5969 */ 5970 static void 5971 handle_workitem_freeblocks(freeblks, flags) 5972 struct freeblks *freeblks; 5973 int flags; 5974 { 5975 struct freework *freework; 5976 struct worklist *wk; 5977 5978 KASSERT(LIST_EMPTY(&freeblks->fb_jfreeblkhd), 5979 ("handle_workitem_freeblocks: Journal entries not written.")); 5980 if (LIST_EMPTY(&freeblks->fb_freeworkhd)) { 5981 handle_complete_freeblocks(freeblks); 5982 return; 5983 } 5984 freeblks->fb_ref++; 5985 while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) { 5986 KASSERT(wk->wk_type == D_FREEWORK, 5987 ("handle_workitem_freeblocks: Unknown type %s", 5988 TYPENAME(wk->wk_type))); 5989 WORKLIST_REMOVE_UNLOCKED(wk); 5990 freework = WK_FREEWORK(wk); 5991 if (freework->fw_lbn <= -NDADDR) 5992 handle_workitem_indirblk(freework); 5993 else 5994 freework_freeblock(freework); 5995 } 5996 ACQUIRE_LOCK(&lk); 5997 if (--freeblks->fb_ref != 0) 5998 freeblks = NULL; 5999 FREE_LOCK(&lk); 6000 if (freeblks) 6001 handle_complete_freeblocks(freeblks); 6002 } 6003 6004 /* 6005 * Once all of the freework workitems are complete we can retire the 6006 * freeblocks dependency and any journal work awaiting completion. This 6007 * can not be called until all other dependencies are stable on disk. 6008 */ 6009 static void 6010 handle_complete_freeblocks(freeblks) 6011 struct freeblks *freeblks; 6012 { 6013 struct inode *ip; 6014 struct vnode *vp; 6015 struct fs *fs; 6016 struct ufsmount *ump; 6017 int flags; 6018 6019 ump = VFSTOUFS(freeblks->fb_list.wk_mp); 6020 fs = ump->um_fs; 6021 flags = LK_NOWAIT; 6022 6023 /* 6024 * If we still have not finished background cleanup, then check 6025 * to see if the block count needs to be adjusted. 
6026 */ 6027 if (freeblks->fb_chkcnt != 0 && (fs->fs_flags & FS_UNCLEAN) != 0 && 6028 ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_previousinum, 6029 (flags & LK_NOWAIT) | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ) == 0) { 6030 ip = VTOI(vp); 6031 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + freeblks->fb_chkcnt); 6032 ip->i_flag |= IN_CHANGE; 6033 vput(vp); 6034 } 6035 6036 if (!(freeblks->fb_chkcnt == 0 || 6037 ((fs->fs_flags & FS_UNCLEAN) != 0 && (flags & LK_NOWAIT) == 0))) 6038 printf( 6039 "handle_workitem_freeblocks: inode %ju block count %jd\n", 6040 (uintmax_t)freeblks->fb_previousinum, 6041 (intmax_t)freeblks->fb_chkcnt); 6042 6043 ACQUIRE_LOCK(&lk); 6044 /* 6045 * All of the freeblock deps must be complete prior to this call 6046 * so it's now safe to complete earlier outstanding journal entries. 6047 */ 6048 handle_jwork(&freeblks->fb_jwork); 6049 WORKITEM_FREE(freeblks, D_FREEBLKS); 6050 num_freeblkdep--; 6051 FREE_LOCK(&lk); 6052 } 6053 6054 /* 6055 * Release blocks associated with the inode ip and stored in the indirect 6056 * block dbn. If level is greater than SINGLE, the block is an indirect block 6057 * and recursive calls to indirtrunc must be used to cleanse other indirect 6058 * blocks. 
 */
static void
indir_trunc(freework, dbn, lbn)
	struct freework *freework;
	ufs2_daddr_t dbn;	/* disk (device) block number of the indirect */
	ufs_lbn_t lbn;		/* (negative) logical block number of the indirect */
{
	struct freework *nfreework;
	struct workhead wkhd;
	struct jnewblk *jnewblk;
	struct freeblks *freeblks;
	struct buf *bp;
	struct fs *fs;
	struct worklist *wkn;
	struct worklist *wk;
	struct indirdep *indirdep;
	struct ufsmount *ump;
	ufs1_daddr_t *bap1 = 0;
	ufs2_daddr_t nb, nnb, *bap2 = 0;
	ufs_lbn_t lbnadd;
	int i, nblocks, ufs1fmt;
	int fs_pendingblocks;
	int freedeps;
	int needj;
	int level;
	int cnt;

	LIST_INIT(&wkhd);
	level = lbn_level(lbn);
	if (level == -1)
		panic("indir_trunc: Invalid lbn %jd\n", lbn);
	freeblks = freework->fw_freeblks;
	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
	fs = ump->um_fs;
	fs_pendingblocks = 0;
	freedeps = 0;
	/* needj != 0 when the mount is running with soft-updates journaling. */
	needj = UFSTOVFS(ump)->mnt_kern_flag & MNTK_SUJ;
	/* Number of logical blocks spanned by one slot at this indirect level. */
	lbnadd = lbn_offset(fs, level);
	/*
	 * Get buffer of block pointers to be freed. This routine is not
	 * called until the zero'ed inode has been written, so it is safe
	 * to free blocks as they are encountered. Because the inode has
	 * been zero'ed, calls to bmap on these blocks will fail. So, we
	 * have to use the on-disk address and the block device for the
	 * filesystem to look them up. If the file was deleted before its
	 * indirect blocks were all written to disk, the routine that set
	 * us up (deallocate_dependencies) will have arranged to leave
	 * a complete copy of the indirect block in memory for our use.
	 * Otherwise we have to read the blocks in from the disk.
	 */
#ifdef notyet
	bp = getblk(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, 0, 0,
	    GB_NOCREAT);
#else
	bp = incore(&freeblks->fb_devvp->v_bufobj, dbn);
#endif
	ACQUIRE_LOCK(&lk);
	if (bp != NULL && (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
		/*
		 * The in-memory copy left for us must carry a GOINGAWAY
		 * indirdep; steal its journal work onto wkhd before the
		 * indirdep is torn down.
		 */
		if (wk->wk_type != D_INDIRDEP ||
		    (wk->wk_state & GOINGAWAY) == 0)
			panic("indir_trunc: lost indirdep %p", wk);
		indirdep = WK_INDIRDEP(wk);
		LIST_SWAP(&wkhd, &indirdep->ir_jwork, worklist, wk_list);
		free_indirdep(indirdep);
		if (!LIST_EMPTY(&bp->b_dep))
			panic("indir_trunc: dangling dep %p",
			    LIST_FIRST(&bp->b_dep));
		ump->um_numindirdeps -= 1;
		FREE_LOCK(&lk);
	} else {
#ifdef notyet
		if (bp)
			brelse(bp);
#endif
		FREE_LOCK(&lk);
		/* No in-core copy; read the indirect block from the device. */
		if (bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
		    NOCRED, &bp) != 0) {
			brelse(bp);
			return;
		}
	}
	/*
	 * Recursively free indirect blocks.
	 */
	if (ump->um_fstype == UFS1) {
		ufs1fmt = 1;
		bap1 = (ufs1_daddr_t *)bp->b_data;
	} else {
		ufs1fmt = 0;
		bap2 = (ufs2_daddr_t *)bp->b_data;
	}

	/*
	 * Reclaim indirect blocks which never made it to disk.  Each
	 * D_JNEWBLK on wkhd describes a block allocation whose journal
	 * record was never committed; such blocks can be freed without
	 * waiting on the journal.
	 */
	cnt = 0;
	LIST_FOREACH_SAFE(wk, &wkhd, wk_list, wkn) {
		if (wk->wk_type != D_JNEWBLK)
			continue;
		ACQUIRE_LOCK(&lk);
		WORKLIST_REMOVE(wk);
		FREE_LOCK(&lk);
		jnewblk = WK_JNEWBLK(wk);
		/*
		 * Map the journal record's lbn back to its slot index in
		 * this indirect.  Positive lbns are data blocks; negative
		 * lbns are nested indirects and need the level adjustment.
		 * NOTE(review): index arithmetic assumed correct as
		 * written; guarded by the KASSERT below.
		 */
		if (jnewblk->jn_lbn > 0)
			i = (jnewblk->jn_lbn - -lbn) / lbnadd;
		else
			i = (-(jnewblk->jn_lbn + level - 1) - -(lbn + level)) /
			    lbnadd;
		KASSERT(i >= 0 && i < NINDIR(fs),
		    ("indir_trunc: Index out of range %d parent %jd lbn %jd level %d",
		    i, lbn, jnewblk->jn_lbn, level));
		/* Clear the pointer so it isn't found below. */
		if (ufs1fmt) {
			nb = bap1[i];
			bap1[i] = 0;
		} else {
			nb = bap2[i];
			bap2[i] = 0;
		}
		KASSERT(nb == jnewblk->jn_blkno,
		    ("indir_trunc: Block mismatch %jd != %jd",
		    nb, jnewblk->jn_blkno));
		if (level != 0) {
			/* Nested indirect: recurse with the jnewblk attached. */
			ufs_lbn_t nlbn;

			nlbn = (lbn + 1) - (i * lbnadd);
			nfreework = newfreework(ump, freeblks, freework,
			    nlbn, nb, fs->fs_frag, 0);
			WORKLIST_INSERT_UNLOCKED(&nfreework->fw_jwork, wk);
			freedeps++;
			indir_trunc(nfreework, fsbtodb(fs, nb), nlbn);
		} else {
			struct workhead freewk;

			/* Hand the jnewblk to ffs_blkfree with the block. */
			LIST_INIT(&freewk);
			ACQUIRE_LOCK(&lk);
			WORKLIST_INSERT(&freewk, wk);
			FREE_LOCK(&lk);
			ffs_blkfree(ump, fs, freeblks->fb_devvp,
			    jnewblk->jn_blkno, fs->fs_bsize,
			    freeblks->fb_previousinum, &freewk);
		}
		cnt++;
	}
	ACQUIRE_LOCK(&lk);
	/* Any remaining journal work can be completed with freeblks. */
	jwork_move(&freeblks->fb_jwork, &wkhd);
	FREE_LOCK(&lk);
	nblocks = btodb(fs->fs_bsize);
	if (ufs1fmt)
		nb = bap1[0];
	else
		nb = bap2[0];
	nfreework = freework;
	/*
	 * Reclaim on disk blocks.  fw_off records how far a previous pass
	 * got, so restarts skip already-processed slots.
	 */
	for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) {
		/* Look ahead one slot (nnb) for the CG-aggregation test. */
		if (i != NINDIR(fs) - 1) {
			if (ufs1fmt)
				nnb = bap1[i+1];
			else
				nnb = bap2[i+1];
		} else
			nnb = 0;
		if (nb == 0)
			continue;
		cnt++;
		if (level != 0) {
			ufs_lbn_t nlbn;

			nlbn = (lbn + 1) - (i * lbnadd);
			if (needj != 0) {
				nfreework = newfreework(ump, freeblks, freework,
				    nlbn, nb, fs->fs_frag, 0);
				freedeps++;
			}
			indir_trunc(nfreework, fsbtodb(fs, nb), nlbn);
		} else {
			struct freedep *freedep;

			/*
			 * Attempt to aggregate freedep dependencies for
			 * all blocks being released to the same CG.
			 */
			LIST_INIT(&wkhd);
			if (needj != 0 &&
			    (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) {
				freedep = newfreedep(freework);
				WORKLIST_INSERT_UNLOCKED(&wkhd,
				    &freedep->fd_list);
				freedeps++;
			}
			ffs_blkfree(ump, fs, freeblks->fb_devvp, nb,
			    fs->fs_bsize, freeblks->fb_previousinum, &wkhd);
		}
	}
	if (level == 0)
		fs_pendingblocks = (nblocks * cnt);
	/*
	 * If we're not journaling we can free the indirect now.  Otherwise
	 * setup the ref counts and offset so this indirect can be completed
	 * when its children are free.
	 */
	if (needj == 0) {
		fs_pendingblocks += nblocks;
		dbn = dbtofsb(fs, dbn);
		ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize,
		    freeblks->fb_previousinum, NULL);
		ACQUIRE_LOCK(&lk);
		freeblks->fb_chkcnt -= fs_pendingblocks;
		if (freework->fw_blkno == dbn)
			handle_written_freework(freework);
		FREE_LOCK(&lk);
		freework = NULL;
	} else {
		ACQUIRE_LOCK(&lk);
		freework->fw_off = i;
		/*
		 * One reference per slot plus one for the indirect itself
		 * was taken up-front; subtract what we did not create
		 * freedeps for.  A non-zero residue means children are
		 * still outstanding.
		 */
		freework->fw_ref += freedeps;
		freework->fw_ref -= NINDIR(fs) + 1;
		if (freework->fw_ref != 0)
			freework = NULL;
		freeblks->fb_chkcnt -= fs_pendingblocks;
		FREE_LOCK(&lk);
	}
	if (fs_pendingblocks) {
		UFS_LOCK(ump);
		fs->fs_pendingblocks -= fs_pendingblocks;
		UFS_UNLOCK(ump);
	}
	/* The indirect's contents are dead; discard the buffer. */
	bp->b_flags |= B_INVAL | B_NOCACHE;
	brelse(bp);
	if (freework)
		handle_workitem_indirblk(freework);
	return;
}

/*
 * Cancel an allocindir when it is removed via truncation.
 */
static void
cancel_allocindir(aip, inodedep, freeblks)
	struct allocindir *aip;
	struct inodedep *inodedep;
	struct freeblks *freeblks;
{
	struct newblk *newblk;

	/*
	 * If the journal hasn't been written the jnewblk must be passed
	 * to the call to ffs_blkfree that reclaims the space.
 We accomplish
 * this by linking the journal dependency into the indirdep to be
 * freed when indir_trunc() is called.  If the journal has already
 * been written we can simply reclaim the journal space when the
 * freeblks work is complete.
 */
	LIST_REMOVE(aip, ai_next);
	newblk = (struct newblk *)aip;
	if (newblk->nb_jnewblk == NULL)
		cancel_newblk(newblk, &freeblks->fb_jwork);
	else
		cancel_newblk(newblk, &aip->ai_indirdep->ir_jwork);
	/*
	 * A DEPCOMPLETE newblk is parked on the inodedep's bufwait list;
	 * otherwise it has no remaining dependencies and is freed now.
	 */
	if (inodedep && inodedep->id_state & DEPCOMPLETE)
		WORKLIST_INSERT(&inodedep->id_bufwait, &newblk->nb_list);
	else
		free_newblk(newblk);
}

/*
 * Create the mkdir dependencies for . and .. in a new directory.  Link them
 * in to a newdirblk so any subsequent additions are tracked properly.  The
 * caller is responsible for adding the mkdir1 dependency to the journal
 * and updating id_mkdiradd.  This function returns with lk held.
 *
 * Returns mkdir1 (the MKDIR_BODY dependency for "."); mkdir2 (the
 * MKDIR_PARENT dependency for "..") is returned via *mkdirp and may have
 * been freed already if the parent's dependencies were all satisfied.
 */
static struct mkdir *
setup_newdir(dap, newinum, dinum, newdirbp, mkdirp)
	struct diradd *dap;	/* diradd for the new directory's entry */
	ino_t newinum;		/* inode number of the new directory */
	ino_t dinum;		/* inode number of the parent directory */
	struct buf *newdirbp;	/* buffer holding the new directory block */
	struct mkdir **mkdirp;	/* out: the MKDIR_PARENT dependency */
{
	struct newblk *newblk;
	struct pagedep *pagedep;
	struct inodedep *inodedep;
	struct newdirblk *newdirblk = 0;
	struct mkdir *mkdir1, *mkdir2;
	struct worklist *wk;
	struct jaddref *jaddref;
	struct mount *mp;

	mp = dap->da_list.wk_mp;
	newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK,
	    M_SOFTDEP_FLAGS);
	workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
	LIST_INIT(&newdirblk->db_mkdir);
	/* mkdir1 tracks the "." entry reaching disk (MKDIR_BODY). */
	mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
	workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
	mkdir1->md_state = ATTACHED | MKDIR_BODY;
	mkdir1->md_diradd = dap;
	mkdir1->md_jaddref = NULL;
	/* mkdir2 tracks the ".." link in the parent (MKDIR_PARENT). */
	mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
	workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
	mkdir2->md_state = ATTACHED | MKDIR_PARENT;
	mkdir2->md_diradd = dap;
	mkdir2->md_jaddref = NULL;
	/* Without journaling there is no journal write to wait for. */
	if ((mp->mnt_kern_flag & MNTK_SUJ) == 0) {
		mkdir1->md_state |= DEPCOMPLETE;
		mkdir2->md_state |= DEPCOMPLETE;
	}
	/*
	 * Dependency on "." and ".." being written to disk.
	 */
	mkdir1->md_buf = newdirbp;
	ACQUIRE_LOCK(&lk);
	LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
	/*
	 * We must link the pagedep, allocdirect, and newdirblk for
	 * the initial file page so the pointer to the new directory
	 * is not written until the directory contents are live and
	 * any subsequent additions are not marked live until the
	 * block is reachable via the inode.
	 */
	if (pagedep_lookup(mp, newinum, 0, 0, &pagedep) == 0)
		panic("setup_newdir: lost pagedep");
	LIST_FOREACH(wk, &newdirbp->b_dep, wk_list)
		if (wk->wk_type == D_ALLOCDIRECT)
			break;
	if (wk == NULL)
		panic("setup_newdir: lost allocdirect");
	newblk = WK_NEWBLK(wk);
	pagedep->pd_state |= NEWBLOCK;
	pagedep->pd_newdirblk = newdirblk;
	newdirblk->db_pagedep = pagedep;
	WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
	WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list);
	/*
	 * Look up the inodedep for the parent directory so that we
	 * can link mkdir2 into the pending dotdot jaddref or
	 * the inode write if there is none.  If the inode is
	 * ALLCOMPLETE and no jaddref is present all dependencies have
	 * been satisfied and mkdir2 can be freed.
	 */
	inodedep_lookup(mp, dinum, 0, &inodedep);
	if (mp->mnt_kern_flag & MNTK_SUJ) {
		if (inodedep == NULL)
			panic("setup_newdir: Lost parent.");
		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
		    inoreflst);
		KASSERT(jaddref != NULL && jaddref->ja_parent == newinum &&
		    (jaddref->ja_state & MKDIR_PARENT),
		    ("setup_newdir: bad dotdot jaddref %p", jaddref));
		LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
		mkdir2->md_jaddref = jaddref;
		jaddref->ja_mkdir = mkdir2;
	} else if (inodedep == NULL ||
	    (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
		/* Parent already stable on disk; ".." needs no tracking. */
		dap->da_state &= ~MKDIR_PARENT;
		WORKITEM_FREE(mkdir2, D_MKDIR);
	} else {
		LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
		WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
	}
	*mkdirp = mkdir2;

	return (mkdir1);
}

/*
 * Directory entry addition dependencies.
 *
 * When adding a new directory entry, the inode (with its incremented link
 * count) must be written to disk before the directory entry's pointer to it.
 * Also, if the inode is newly allocated, the corresponding freemap must be
 * updated (on disk) before the directory entry's pointer. These requirements
 * are met via undo/redo on the directory entry's pointer, which consists
 * simply of the inode number.
 *
 * As directory entries are added and deleted, the free space within a
 * directory block can become fragmented.  The ufs filesystem will compact
 * a fragmented directory block to make space for a new entry. When this
 * occurs, the offsets of previously added entries change. Any "diradd"
 * dependency structures corresponding to these entries must be updated with
 * the new offsets.
 */

/*
 * This routine is called after the in-memory inode's link
 * count has been incremented, but before the directory entry's
 * pointer to the inode has been set.
 *
 * Returns 1 when the entry landed in a newly allocated indirect block and
 * the caller must sync the directory; returns 0 otherwise.
 */
int
softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
	struct buf *bp;		/* buffer containing directory block */
	struct inode *dp;	/* inode for directory */
	off_t diroffset;	/* offset of new entry in directory */
	ino_t newinum;		/* inode referenced by new directory entry */
	struct buf *newdirbp;	/* non-NULL => contents of new mkdir */
	int isnewblk;		/* entry is in a newly allocated block */
{
	int offset;		/* offset of new entry within directory block */
	ufs_lbn_t lbn;		/* block in directory containing new entry */
	struct fs *fs;
	struct diradd *dap;
	struct newblk *newblk;
	struct pagedep *pagedep;
	struct inodedep *inodedep;
	struct newdirblk *newdirblk = 0;
	struct mkdir *mkdir1, *mkdir2;
	struct jaddref *jaddref;
	struct mount *mp;
	int isindir;

	/*
	 * Whiteouts have no dependencies.
	 */
	if (newinum == WINO) {
		if (newdirbp != NULL)
			bdwrite(newdirbp);
		return (0);
	}
	jaddref = NULL;
	mkdir1 = mkdir2 = NULL;
	mp = UFSTOVFS(dp->i_ump);
	fs = dp->i_fs;
	lbn = lblkno(fs, diroffset);
	offset = blkoff(fs, diroffset);
	dap = malloc(sizeof(struct diradd), M_DIRADD,
	    M_SOFTDEP_FLAGS|M_ZERO);
	workitem_alloc(&dap->da_list, D_DIRADD, mp);
	dap->da_offset = offset;
	dap->da_newinum = newinum;
	dap->da_state = ATTACHED;
	LIST_INIT(&dap->da_jwork);
	isindir = bp->b_lblkno >= NDADDR;
	/*
	 * Allocate a newdirblk only when the entry starts a fresh block
	 * (or fragment for direct blocks) so the block's liveness can be
	 * tracked.
	 */
	if (isnewblk &&
	    (isindir ? blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) {
		newdirblk = malloc(sizeof(struct newdirblk),
		    M_NEWDIRBLK, M_SOFTDEP_FLAGS);
		workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
		LIST_INIT(&newdirblk->db_mkdir);
	}
	/*
	 * If we're creating a new directory setup the dependencies and set
	 * the dap state to wait for them.  Otherwise it's COMPLETE and
	 * we can move on.
	 */
	if (newdirbp == NULL) {
		dap->da_state |= DEPCOMPLETE;
		ACQUIRE_LOCK(&lk);
	} else {
		dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
		/* setup_newdir() returns with lk held. */
		mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp,
		    &mkdir2);
	}
	/*
	 * Link into parent directory pagedep to await its being written.
	 */
	if (pagedep_lookup(mp, dp->i_number, lbn, DEPALLOC, &pagedep) == 0)
		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
#ifdef DEBUG
	if (diradd_lookup(pagedep, offset) != NULL)
		panic("softdep_setup_directory_add: %p already at off %d\n",
		    diradd_lookup(pagedep, offset), offset);
#endif
	dap->da_pagedep = pagedep;
	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
	    da_pdlist);
	inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
	/*
	 * If we're journaling, link the diradd into the jaddref so it
	 * may be completed after the journal entry is written.  Otherwise,
	 * link the diradd into its inodedep.  If the inode is not yet
	 * written place it on the bufwait list, otherwise do the post-inode
	 * write processing to put it on the id_pendinghd list.
	 */
	if (mp->mnt_kern_flag & MNTK_SUJ) {
		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
		    inoreflst);
		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
		    ("softdep_setup_directory_add: bad jaddref %p", jaddref));
		jaddref->ja_diroff = diroffset;
		jaddref->ja_diradd = dap;
		add_to_journal(&jaddref->ja_list);
	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
		diradd_inode_written(dap, inodedep);
	else
		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
	/*
	 * Add the journal entries for . and .. links now that the primary
	 * link is written.
	 */
	if (mkdir1 != NULL && mp->mnt_kern_flag & MNTK_SUJ) {
		jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
		    inoreflst, if_deps);
		KASSERT(jaddref != NULL &&
		    jaddref->ja_ino == jaddref->ja_parent &&
		    (jaddref->ja_state & MKDIR_BODY),
		    ("softdep_setup_directory_add: bad dot jaddref %p",
		    jaddref));
		mkdir1->md_jaddref = jaddref;
		jaddref->ja_mkdir = mkdir1;
		/*
		 * It is important that the dotdot journal entry
		 * is added prior to the dot entry since dot writes
		 * both the dot and dotdot links.  These both must
		 * be added after the primary link for the journal
		 * to remain consistent.
		 */
		add_to_journal(&mkdir2->md_jaddref->ja_list);
		add_to_journal(&jaddref->ja_list);
	}
	/*
	 * If we are adding a new directory remember this diradd so that if
	 * we rename it we can keep the dot and dotdot dependencies.  If
	 * we are adding a new name for an inode that has a mkdiradd we
	 * must be in rename and we have to move the dot and dotdot
	 * dependencies to this new name.  The old name is being orphaned
	 * soon.
	 */
	if (mkdir1 != NULL) {
		if (inodedep->id_mkdiradd != NULL)
			panic("softdep_setup_directory_add: Existing mkdir");
		inodedep->id_mkdiradd = dap;
	} else if (inodedep->id_mkdiradd)
		merge_diradd(inodedep, dap);
	if (newdirblk) {
		/*
		 * There is nothing to do if we are already tracking
		 * this block.
		 */
		if ((pagedep->pd_state & NEWBLOCK) != 0) {
			WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
			FREE_LOCK(&lk);
			return (0);
		}
		if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk)
		    == 0)
			panic("softdep_setup_directory_add: lost entry");
		WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
		pagedep->pd_state |= NEWBLOCK;
		pagedep->pd_newdirblk = newdirblk;
		newdirblk->db_pagedep = pagedep;
		FREE_LOCK(&lk);
		/*
		 * If we extended into an indirect signal direnter to sync.
		 */
		if (isindir)
			return (1);
		return (0);
	}
	FREE_LOCK(&lk);
	return (0);
}

/*
 * This procedure is called to change the offset of a directory
 * entry when compacting a directory block which must be owned
 * exclusively by the caller.  Note that the actual entry movement
 * must be done in this procedure to ensure that no I/O completions
 * occur while the move is in progress.
 */
void
softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
	struct buf *bp;		/* Buffer holding directory block.
 */
	struct inode *dp;	/* inode for directory */
	caddr_t base;		/* address of dp->i_offset */
	caddr_t oldloc;		/* address of old directory location */
	caddr_t newloc;		/* address of new directory location */
	int entrysize;		/* size of directory entry */
{
	int offset, oldoffset, newoffset;
	struct pagedep *pagedep;
	struct jmvref *jmvref;
	struct diradd *dap;
	struct direct *de;
	struct mount *mp;
	ufs_lbn_t lbn;
	int flags;

	mp = UFSTOVFS(dp->i_ump);
	de = (struct direct *)oldloc;
	jmvref = NULL;
	flags = 0;
	/*
	 * Moves are always journaled as it would be too complex to
	 * determine if any affected adds or removes are present in the
	 * journal.
	 */
	if (mp->mnt_kern_flag & MNTK_SUJ) {
		flags = DEPALLOC;
		jmvref = newjmvref(dp, de->d_ino,
		    dp->i_offset + (oldloc - base),
		    dp->i_offset + (newloc - base));
	}
	lbn = lblkno(dp->i_fs, dp->i_offset);
	offset = blkoff(dp->i_fs, dp->i_offset);
	oldoffset = offset + (oldloc - base);
	newoffset = offset + (newloc - base);
	ACQUIRE_LOCK(&lk);
	if (pagedep_lookup(mp, dp->i_number, lbn, flags, &pagedep) == 0) {
		if (pagedep)
			WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
		goto done;
	}
	/*
	 * Re-home any diradd tracking the old offset onto the hash chain
	 * for the new offset.
	 */
	dap = diradd_lookup(pagedep, oldoffset);
	if (dap) {
		dap->da_offset = newoffset;
		newoffset = DIRADDHASH(newoffset);
		oldoffset = DIRADDHASH(oldoffset);
		if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE &&
		    newoffset != oldoffset) {
			LIST_REMOVE(dap, da_pdlist);
			LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset],
			    dap, da_pdlist);
		}
	}
done:
	if (jmvref) {
		jmvref->jm_pagedep = pagedep;
		LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps);
		add_to_journal(&jmvref->jm_list);
	}
	/* Move the entry itself while still holding lk (see comment above). */
	bcopy(oldloc, newloc, entrysize);
	FREE_LOCK(&lk);
}

/*
 * Move the mkdir dependencies and journal work from one diradd to another
 * when renaming a directory.  The new name must depend on the mkdir deps
 * completing as the old name did.  Directories can only have one valid link
 * at a time so one must be canonical.
 */
static void
merge_diradd(inodedep, newdap)
	struct inodedep *inodedep;
	struct diradd *newdap;
{
	struct diradd *olddap;
	struct mkdir *mkdir, *nextmd;
	short state;

	olddap = inodedep->id_mkdiradd;
	inodedep->id_mkdiradd = newdap;
	/*
	 * Transfer any outstanding MKDIR_PARENT/MKDIR_BODY mkdirs from the
	 * old diradd to the new one; the loop may stop early once both
	 * bits have been moved.
	 */
	if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
		newdap->da_state &= ~DEPCOMPLETE;
		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
			nextmd = LIST_NEXT(mkdir, md_mkdirs);
			if (mkdir->md_diradd != olddap)
				continue;
			mkdir->md_diradd = newdap;
			state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY);
			newdap->da_state |= state;
			olddap->da_state &= ~state;
			if ((olddap->da_state &
			    (MKDIR_PARENT | MKDIR_BODY)) == 0)
				break;
		}
		if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
			panic("merge_diradd: unfound ref");
	}
	/*
	 * Any mkdir related journal items are not safe to be freed until
	 * the new name is stable.
	 */
	jwork_move(&newdap->da_jwork, &olddap->da_jwork);
	olddap->da_state |= DEPCOMPLETE;
	complete_diradd(olddap);
}

/*
 * Move the diradd to the pending list when all diradd dependencies are
 * complete.
 */
static void
complete_diradd(dap)
	struct diradd *dap;
{
	struct pagedep *pagedep;

	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
		/* A DIRCHG diradd hangs off its dirrem's pagedep instead. */
		if (dap->da_state & DIRCHG)
			pagedep = dap->da_previous->dm_pagedep;
		else
			pagedep = dap->da_pagedep;
		LIST_REMOVE(dap, da_pdlist);
		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
	}
}

/*
 * Cancel a diradd when a dirrem overlaps with it.
 We must cancel the journal
 * add entries and conditionally journal the remove.
 */
static void
cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref)
	struct diradd *dap;		/* the diradd being cancelled */
	struct dirrem *dirrem;		/* the overlapping remove */
	struct jremref *jremref;	/* journal remove for the primary name */
	struct jremref *dotremref;	/* journal remove for "." */
	struct jremref *dotdotremref;	/* journal remove for ".." */
{
	struct inodedep *inodedep;
	struct jaddref *jaddref;
	struct inoref *inoref;
	struct mkdir *mkdir;

	/*
	 * If no remove references were allocated we're on a non-journaled
	 * filesystem and can skip the cancel step.
	 */
	if (jremref == NULL) {
		free_diradd(dap, NULL);
		return;
	}
	/*
	 * Cancel the primary name and free it if it does not require
	 * journaling.
	 */
	if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum,
	    0, &inodedep) != 0) {
		/* Abort the addref that references this diradd. */
		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
			if (inoref->if_list.wk_type != D_JADDREF)
				continue;
			jaddref = (struct jaddref *)inoref;
			if (jaddref->ja_diradd != dap)
				continue;
			/*
			 * cancel_jaddref() == 0 means the add never hit the
			 * journal, so the matching remove need not be
			 * journaled either.
			 */
			if (cancel_jaddref(jaddref, inodedep,
			    &dirrem->dm_jwork) == 0) {
				free_jremref(jremref);
				jremref = NULL;
			}
			break;
		}
	}
	/*
	 * Cancel subordinate names and free them if they do not require
	 * journaling.
	 */
	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
		LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) {
			if (mkdir->md_diradd != dap)
				continue;
			if ((jaddref = mkdir->md_jaddref) == NULL)
				continue;
			mkdir->md_jaddref = NULL;
			if (mkdir->md_state & MKDIR_PARENT) {
				if (cancel_jaddref(jaddref, NULL,
				    &dirrem->dm_jwork) == 0) {
					free_jremref(dotdotremref);
					dotdotremref = NULL;
				}
			} else {
				if (cancel_jaddref(jaddref, inodedep,
				    &dirrem->dm_jwork) == 0) {
					free_jremref(dotremref);
					dotremref = NULL;
				}
			}
		}
	}

	/* Journal whichever removes are still required. */
	if (jremref)
		journal_jremref(dirrem, jremref, inodedep);
	if (dotremref)
		journal_jremref(dirrem, dotremref, inodedep);
	if (dotdotremref)
		journal_jremref(dirrem, dotdotremref, NULL);
	jwork_move(&dirrem->dm_jwork, &dap->da_jwork);
	free_diradd(dap, &dirrem->dm_jwork);
}

/*
 * Free a diradd dependency structure.  This routine must be called
 * with splbio interrupts blocked.
 */
static void
free_diradd(dap, wkhd)
	struct diradd *dap;
	struct workhead *wkhd;	/* NOTE(review): unused in this body; presumably for callers' journal work — confirm */
{
	struct dirrem *dirrem;
	struct pagedep *pagedep;
	struct inodedep *inodedep;
	struct mkdir *mkdir, *nextmd;

	mtx_assert(&lk, MA_OWNED);
	LIST_REMOVE(dap, da_pdlist);
	if (dap->da_state & ONWORKLIST)
		WORKLIST_REMOVE(&dap->da_list);
	if ((dap->da_state & DIRCHG) == 0) {
		pagedep = dap->da_pagedep;
	} else {
		/*
		 * A DIRCHG diradd carries a previous dirrem; completing the
		 * diradd makes that remove eligible to run.
		 */
		dirrem = dap->da_previous;
		pagedep = dirrem->dm_pagedep;
		dirrem->dm_dirinum = pagedep->pd_ino;
		dirrem->dm_state |= COMPLETE;
		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
			add_to_worklist(&dirrem->dm_list, 0);
	}
	if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum,
	    0, &inodedep) != 0)
		if (inodedep->id_mkdiradd == dap)
			inodedep->id_mkdiradd = NULL;
	/* Release any mkdir dependencies still attached to this diradd. */
	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
			nextmd = LIST_NEXT(mkdir, md_mkdirs);
			if (mkdir->md_diradd != dap)
				continue;
			dap->da_state &=
			    ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
			LIST_REMOVE(mkdir, md_mkdirs);
			if (mkdir->md_state & ONWORKLIST)
				WORKLIST_REMOVE(&mkdir->md_list);
			if (mkdir->md_jaddref != NULL)
				panic("free_diradd: Unexpected jaddref");
			WORKITEM_FREE(mkdir, D_MKDIR);
			if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
				break;
		}
		if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
			panic("free_diradd: unfound ref");
	}
	if (inodedep)
		free_inodedep(inodedep);
	/*
	 * Free any journal segments waiting for the directory write.
	 */
	handle_jwork(&dap->da_jwork);
	WORKITEM_FREE(dap, D_DIRADD);
}

/*
 * Directory entry removal dependencies.
 *
 * When removing a directory entry, the entry's inode pointer must be
 * zero'ed on disk before the corresponding inode's link count is decremented
 * (possibly freeing the inode for re-use). This dependency is handled by
 * updating the directory entry but delaying the inode count reduction until
 * after the directory block has been written to disk. After this point, the
 * inode count can be decremented whenever it is convenient.
 */

/*
 * This routine should be called immediately after removing
 * a directory entry.  The inode's link count should not be
 * decremented by the calling procedure -- the soft updates
 * code will do this task when it is safe.
 */
void
softdep_setup_remove(bp, dp, ip, isrmdir)
	struct buf *bp;		/* buffer containing directory block */
	struct inode *dp;	/* inode for the directory being modified */
	struct inode *ip;	/* inode for directory entry being removed */
	int isrmdir;		/* indicates if doing RMDIR */
{
	struct dirrem *dirrem, *prevdirrem;
	struct inodedep *inodedep;
	int direct;

	/*
	 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.  We want
	 * newdirrem() to setup the full directory remove which requires
	 * isrmdir > 1.
	 */
	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
	/*
	 * Add the dirrem to the inodedep's pending remove list for quick
	 * discovery later.
	 */
	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
	    &inodedep) == 0)
		panic("softdep_setup_remove: Lost inodedep.");
	dirrem->dm_state |= ONDEPLIST;
	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);

	/*
	 * If the COMPLETE flag is clear, then there were no active
	 * entries and we want to roll back to a zeroed entry until
	 * the new inode is committed to disk.  If the COMPLETE flag is
	 * set then we have deleted an entry that never made it to
	 * disk.  If the entry we deleted resulted from a name change,
	 * then the old name still resides on disk.  We cannot delete
	 * its inode (returned to us in prevdirrem) until the zeroed
	 * directory entry gets to disk.  The new inode has never been
	 * referenced on the disk, so can be deleted immediately.
	 */
	if ((dirrem->dm_state & COMPLETE) == 0) {
		LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
		    dm_next);
		FREE_LOCK(&lk);
	} else {
		if (prevdirrem != NULL)
			LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
			    prevdirrem, dm_next);
		dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
		/* Only process immediately when no journal removes remain. */
		direct = LIST_EMPTY(&dirrem->dm_jremrefhd);
		FREE_LOCK(&lk);
		if (direct)
			handle_workitem_remove(dirrem, NULL);
	}
}

/*
 * Check for an entry matching 'offset' on both the pd_dirraddhd list and the
 * pd_pendinghd list of a pagedep.
 */
static struct diradd *
diradd_lookup(pagedep, offset)
	struct pagedep *pagedep;
	int offset;
{
	struct diradd *dap;

	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
		if (dap->da_offset == offset)
			return (dap);
	LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
		if (dap->da_offset == offset)
			return (dap);
	return (NULL);
}

/*
 * Search for a .. diradd dependency in a directory that is being removed.
 * If the directory was renamed to a new parent we have a diradd rather
 * than a mkdir for the .. entry.  We need to cancel it now before
 * it is found in truncate().
 */
/*
 * Returns jremref unchanged when no .. diradd was found (caller keeps it);
 * returns NULL when the diradd was cancelled and jremref was consumed.
 */
static struct jremref *
cancel_diradd_dotdot(ip, dirrem, jremref)
	struct inode *ip;
	struct dirrem *dirrem;
	struct jremref *jremref;
{
	struct pagedep *pagedep;
	struct diradd *dap;
	struct worklist *wk;

	if (pagedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, 0,
	    &pagedep) == 0)
		return (jremref);
	dap = diradd_lookup(pagedep, DOTDOT_OFFSET);
	if (dap == NULL)
		return (jremref);
	cancel_diradd(dap, dirrem, jremref, NULL, NULL);
	/*
	 * Mark any journal work as belonging to the parent so it is freed
	 * with the .. reference.
	 */
	LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
		wk->wk_state |= MKDIR_PARENT;
	return (NULL);
}

/*
 * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to
 * replace it with a dirrem/diradd pair as a result of re-parenting a
 * directory.  This ensures that we don't simultaneously have a mkdir and
 * a diradd for the same .. entry.
 */
static struct jremref *
cancel_mkdir_dotdot(ip, dirrem, jremref)
	struct inode *ip;
	struct dirrem *dirrem;
	struct jremref *jremref;
{
	struct inodedep *inodedep;
	struct jaddref *jaddref;
	struct mkdir *mkdir;
	struct diradd *dap;

	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
	    &inodedep) == 0)
		panic("cancel_mkdir_dotdot: Lost inodedep");
	dap = inodedep->id_mkdiradd;
	if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0)
		return (jremref);
	/* Find the mkdir that carries the MKDIR_PARENT half of this diradd. */
	for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir;
	    mkdir = LIST_NEXT(mkdir, md_mkdirs))
		if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT)
			break;
	if (mkdir == NULL)
		panic("cancel_mkdir_dotdot: Unable to find mkdir\n");
	if ((jaddref = mkdir->md_jaddref) != NULL) {
		mkdir->md_jaddref = NULL;
		jaddref->ja_state &= ~MKDIR_PARENT;
		if (inodedep_lookup(UFSTOVFS(ip->i_ump), jaddref->ja_ino, 0,
		    &inodedep) == 0)
			panic("cancel_mkdir_dotdot: Lost parent inodedep");
		/*
		 * If the jaddref could not be fully canceled, record a
		 * jremref in its place; ownership of jremref passes to
		 * journal_jremref() and we return NULL below.
		 */
		if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) {
			journal_jremref(dirrem, jremref, inodedep);
			jremref = NULL;
		}
	}
	if (mkdir->md_state & ONWORKLIST)
		WORKLIST_REMOVE(&mkdir->md_list);
	mkdir->md_state |= ALLCOMPLETE;
	complete_mkdir(mkdir);
	return (jremref);
}

/*
 * Link a jremref into its dirrem's journal-ref list and the inode's
 * reference list, then add it to the journal.  When 'inodedep' is NULL it
 * is looked up from the jremref's own mount/inode number.
 */
static void
journal_jremref(dirrem, jremref, inodedep)
	struct dirrem *dirrem;
	struct jremref *jremref;
	struct inodedep *inodedep;
{

	if (inodedep == NULL)
		if (inodedep_lookup(jremref->jr_list.wk_mp,
		    jremref->jr_ref.if_ino, 0, &inodedep) == 0)
			panic("journal_jremref: Lost inodedep");
	LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps);
	TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
	add_to_journal(&jremref->jr_list);
}

/*
 * Journal all remove references attached to a dirrem: the mandatory link
 * jremref and the optional dot/dotdot jremrefs.  jremref and dotremref
 * share the removed entry's inodedep; dotdotremref belongs to the parent
 * and is looked up separately (NULL passed through).
 */
static void
dirrem_journal(dirrem, jremref, dotremref, dotdotremref)
	struct dirrem *dirrem;
	struct jremref *jremref;
	struct jremref *dotremref;
	struct jremref *dotdotremref;
{
	struct inodedep *inodedep;

	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0,
	    &inodedep) == 0)
		panic("dirrem_journal: Lost inodedep");
	journal_jremref(dirrem, jremref, inodedep);
	if (dotremref)
		journal_jremref(dirrem, dotremref, inodedep);
	if (dotdotremref)
		journal_jremref(dirrem, dotdotremref, NULL);
}

/*
 * Allocate a new dirrem if appropriate and return it along with
 * its associated pagedep. Called without a lock, returns with lock.
 */
static long num_dirrem;		/* number of dirrem allocated */
static struct dirrem *
newdirrem(bp, dp, ip, isrmdir, prevdirremp)
	struct buf *bp;		/* buffer containing directory block */
	struct inode *dp;	/* inode for the directory being modified */
	struct inode *ip;	/* inode for directory entry being removed */
	int isrmdir;		/* indicates if doing RMDIR */
	struct dirrem **prevdirremp; /* previously referenced inode, if any */
{
	int offset;
	ufs_lbn_t lbn;
	struct diradd *dap;
	struct dirrem *dirrem;
	struct pagedep *pagedep;
	struct jremref *jremref;
	struct jremref *dotremref;
	struct jremref *dotdotremref;
	struct vnode *dvp;

	/*
	 * Whiteouts have no deletion dependencies.
	 */
	if (ip == NULL)
		panic("newdirrem: whiteout");
	dvp = ITOV(dp);
	/*
	 * If we are over our limit, try to improve the situation.
	 * Limiting the number of dirrem structures will also limit
	 * the number of freefile and freeblks structures.
	 */
	ACQUIRE_LOCK(&lk);
	if (!(ip->i_flags & SF_SNAPSHOT) && num_dirrem > max_softdeps / 2)
		(void) request_cleanup(ITOV(dp)->v_mount, FLUSH_REMOVE);
	/* num_dirrem is protected by lk; bump it before dropping the lock. */
	num_dirrem += 1;
	FREE_LOCK(&lk);
	dirrem = malloc(sizeof(struct dirrem),
		M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO);
	workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount);
	LIST_INIT(&dirrem->dm_jremrefhd);
	LIST_INIT(&dirrem->dm_jwork);
	dirrem->dm_state = isrmdir ? RMDIR : 0;
	dirrem->dm_oldinum = ip->i_number;
	*prevdirremp = NULL;
	/*
	 * Allocate remove reference structures to track journal write
	 * dependencies.  We will always have one for the link and
	 * when doing directories we will always have one more for dot.
	 * When renaming a directory we skip the dotdot link change so
	 * this is not needed.
	 */
	jremref = dotremref = dotdotremref = NULL;
	if (DOINGSUJ(dvp)) {
		if (isrmdir) {
			jremref = newjremref(dirrem, dp, ip, dp->i_offset,
			    ip->i_effnlink + 2);
			dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET,
			    ip->i_effnlink + 1);
			dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET,
			    dp->i_effnlink + 1);
			dotdotremref->jr_state |= MKDIR_PARENT;
		} else
			jremref = newjremref(dirrem, dp, ip, dp->i_offset,
			    ip->i_effnlink + 1);
	}
	ACQUIRE_LOCK(&lk);
	lbn = lblkno(dp->i_fs, dp->i_offset);
	offset = blkoff(dp->i_fs, dp->i_offset);
	if (pagedep_lookup(UFSTOVFS(dp->i_ump), dp->i_number, lbn, DEPALLOC,
	    &pagedep) == 0)
		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
	dirrem->dm_pagedep = pagedep;
	/*
	 * If we're renaming a .. link to a new directory, cancel any
	 * existing MKDIR_PARENT mkdir.  If it has already been canceled
	 * the jremref is preserved for any potential diradd in this
	 * location.  This can not coincide with a rmdir.
	 */
	if (dp->i_offset == DOTDOT_OFFSET) {
		if (isrmdir)
			panic("newdirrem: .. directory change during remove?");
		jremref = cancel_mkdir_dotdot(dp, dirrem, jremref);
	}
	/*
	 * If we're removing a directory search for the .. dependency now and
	 * cancel it.  Any pending journal work will be added to the dirrem
	 * to be completed when the workitem remove completes.
	 */
	if (isrmdir)
		dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref);
	/*
	 * Check for a diradd dependency for the same directory entry.
	 * If present, then both dependencies become obsolete and can
	 * be de-allocated.
	 */
	dap = diradd_lookup(pagedep, offset);
	if (dap == NULL) {
		/*
		 * Link the jremref structures into the dirrem so they are
		 * written prior to the pagedep.
		 */
		if (jremref)
			dirrem_journal(dirrem, jremref, dotremref,
			    dotdotremref);
		return (dirrem);
	}
	/*
	 * Must be ATTACHED at this point.
	 */
	if ((dap->da_state & ATTACHED) == 0)
		panic("newdirrem: not ATTACHED");
	if (dap->da_newinum != ip->i_number)
		panic("newdirrem: inum %d should be %d",
		    ip->i_number, dap->da_newinum);
	/*
	 * If we are deleting a changed name that never made it to disk,
	 * then return the dirrem describing the previous inode (which
	 * represents the inode currently referenced from this entry on disk).
	 */
	if ((dap->da_state & DIRCHG) != 0) {
		*prevdirremp = dap->da_previous;
		dap->da_state &= ~DIRCHG;
		dap->da_pagedep = pagedep;
	}
	/*
	 * We are deleting an entry that never made it to disk.
	 * Mark it COMPLETE so we can delete its inode immediately.
	 */
	dirrem->dm_state |= COMPLETE;
	cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref);
#ifdef SUJ_DEBUG
	if (isrmdir == 0) {
		struct worklist *wk;

		LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
			if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT))
				panic("bad wk %p (0x%X)\n", wk, wk->wk_state);
	}
#endif

	return (dirrem);
}

/*
 * Directory entry change dependencies.
 *
 * Changing an existing directory entry requires that an add operation
 * be completed first followed by a deletion.  The semantics for the addition
 * are identical to the description of adding a new entry above except
 * that the rollback is to the old inode number rather than zero.  Once
 * the addition dependency is completed, the removal is done as described
 * in the removal routine above.
 */

/*
 * This routine should be called immediately after changing
 * a directory entry.  The inode's link count should not be
 * decremented by the calling procedure -- the soft updates
 * code will perform this task when it is safe.
 */
void
softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
	struct buf *bp;		/* buffer containing directory block */
	struct inode *dp;	/* inode for the directory being modified */
	struct inode *ip;	/* inode for directory entry being removed */
	ino_t newinum;		/* new inode number for changed entry */
	int isrmdir;		/* indicates if doing RMDIR */
{
	int offset;
	struct diradd *dap = NULL;
	struct dirrem *dirrem, *prevdirrem;
	struct pagedep *pagedep;
	struct inodedep *inodedep;
	struct jaddref *jaddref;
	struct mount *mp;

	offset = blkoff(dp->i_fs, dp->i_offset);
	mp = UFSTOVFS(dp->i_ump);

	/*
	 * Whiteouts do not need diradd dependencies.
	 */
	if (newinum != WINO) {
		dap = malloc(sizeof(struct diradd),
		    M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
		workitem_alloc(&dap->da_list, D_DIRADD, mp);
		dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
		dap->da_offset = offset;
		dap->da_newinum = newinum;
		LIST_INIT(&dap->da_jwork);
	}

	/*
	 * Allocate a new dirrem and ACQUIRE_LOCK.
	 */
	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
	pagedep = dirrem->dm_pagedep;
	/*
	 * The possible values for isrmdir:
	 *	0 - non-directory file rename
	 *	1 - directory rename within same directory
	 *	inum - directory rename to new directory of given inode number
	 * When renaming to a new directory, we are both deleting and
	 * creating a new directory entry, so the link count on the new
	 * directory should not change.  Thus we do not need the followup
	 * dirrem which is usually done in handle_workitem_remove.  We set
	 * the DIRCHG flag to tell handle_workitem_remove to skip the
	 * followup dirrem.
	 */
	if (isrmdir > 1)
		dirrem->dm_state |= DIRCHG;

	/*
	 * Whiteouts have no additional dependencies,
	 * so just put the dirrem on the correct list.
	 */
	if (newinum == WINO) {
		if ((dirrem->dm_state & COMPLETE) == 0) {
			LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
			    dm_next);
		} else {
			dirrem->dm_dirinum = pagedep->pd_ino;
			if (LIST_EMPTY(&dirrem->dm_jremrefhd))
				add_to_worklist(&dirrem->dm_list, 0);
		}
		/* lk was acquired by newdirrem(); released on every return. */
		FREE_LOCK(&lk);
		return;
	}
	/*
	 * Add the dirrem to the inodedep's pending remove list for quick
	 * discovery later.  A valid nlinkdelta ensures that this lookup
	 * will not fail.
	 */
	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
		panic("softdep_setup_directory_change: Lost inodedep.");
	dirrem->dm_state |= ONDEPLIST;
	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);

	/*
	 * If the COMPLETE flag is clear, then there were no active
	 * entries and we want to roll back to the previous inode until
	 * the new inode is committed to disk.  If the COMPLETE flag is
	 * set, then we have deleted an entry that never made it to disk.
	 * If the entry we deleted resulted from a name change, then the old
	 * inode reference still resides on disk.  Any rollback that we do
	 * needs to be to that old inode (returned to us in prevdirrem).  If
	 * the entry we deleted resulted from a create, then there is
	 * no entry on the disk, so we want to roll back to zero rather
	 * than the uncommitted inode.  In either of the COMPLETE cases we
	 * want to immediately free the unwritten and unreferenced inode.
	 */
	if ((dirrem->dm_state & COMPLETE) == 0) {
		dap->da_previous = dirrem;
	} else {
		if (prevdirrem != NULL) {
			dap->da_previous = prevdirrem;
		} else {
			dap->da_state &= ~DIRCHG;
			dap->da_pagedep = pagedep;
		}
		dirrem->dm_dirinum = pagedep->pd_ino;
		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
			add_to_worklist(&dirrem->dm_list, 0);
	}
	/*
	 * Lookup the jaddref for this journal entry.  We must finish
	 * initializing it and make the diradd write dependent on it.
	 * If we're not journaling, put it on the id_bufwait list if the
	 * inode is not yet written.  If it is written, do the post-inode
	 * write processing to put it on the id_pendinghd list.
	 */
	inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
	if (mp->mnt_kern_flag & MNTK_SUJ) {
		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
		    inoreflst);
		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
		    ("softdep_setup_directory_change: bad jaddref %p",
		    jaddref));
		jaddref->ja_diroff = dp->i_offset;
		jaddref->ja_diradd = dap;
		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
		    dap, da_pdlist);
		add_to_journal(&jaddref->ja_list);
	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
		dap->da_state |= COMPLETE;
		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
		WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
	} else {
		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
		    dap, da_pdlist);
		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
	}
	/*
	 * If we're making a new name for a directory that has not been
	 * committed we need to move the dot and dotdot references to
	 * this new name.
	 */
	if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET)
		merge_diradd(inodedep, dap);
	FREE_LOCK(&lk);
}

/*
 * Called whenever the link count on an inode is changed.
 * It creates an inode dependency so that the new reference(s)
 * to the inode cannot be committed to disk until the updated
 * inode has been written.
 */
void
softdep_change_linkcnt(ip)
	struct inode *ip;	/* the inode with the increased link count */
{
	struct inodedep *inodedep;

	ACQUIRE_LOCK(&lk);
	inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, &inodedep);
	if (ip->i_nlink < ip->i_effnlink)
		panic("softdep_change_linkcnt: bad delta");
	/* Record the gap between the on-disk and effective link counts. */
	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
	FREE_LOCK(&lk);
}

/*
 * Attach a sbdep dependency to the superblock buf so that we can keep
 * track of the head of the linked list of referenced but unlinked inodes.
 * No-op unless the filesystem is journaled (FS_SUJ); at most one sbdep
 * is attached to the buffer at a time.
 */
void
softdep_setup_sbupdate(ump, fs, bp)
	struct ufsmount *ump;
	struct fs *fs;
	struct buf *bp;
{
	struct sbdep *sbdep;
	struct worklist *wk;

	if ((fs->fs_flags & FS_SUJ) == 0)
		return;
	/* If a sbdep is already attached to this buf, do not add another. */
	LIST_FOREACH(wk, &bp->b_dep, wk_list)
		if (wk->wk_type == D_SBDEP)
			break;
	if (wk != NULL)
		return;
	sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS);
	workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump));
	sbdep->sb_fs = fs;
	sbdep->sb_ump = ump;
	ACQUIRE_LOCK(&lk);
	WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list);
	FREE_LOCK(&lk);
}

/*
 * Return the first unlinked inodedep which is ready to be the head of the
 * list.  The inodedep and all those after it must have valid next pointers.
 */
static struct inodedep *
first_unlinked_inodedep(ump)
	struct ufsmount *ump;
{
	struct inodedep *inodedep;
	struct inodedep *idp;

	/*
	 * Walk backwards from the tail of softdep_unlinked looking for the
	 * earliest inodedep whose predecessor does not yet have a valid
	 * next pointer (UNLINKNEXT).
	 */
	for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst);
	    inodedep; inodedep = idp) {
		if ((inodedep->id_state & UNLINKNEXT) == 0)
			return (NULL);
		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
		if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0)
			break;
		if ((inodedep->id_state & UNLINKPREV) == 0)
			panic("first_unlinked_inodedep: prev != next");
	}
	if (inodedep == NULL)
		return (NULL);

	return (inodedep);
}

/*
 * Set the sujfree unlinked head pointer prior to writing a superblock.
 * Updates both the in-core fs and the copy being written (sbdep->sb_fs).
 */
static void
initiate_write_sbdep(sbdep)
	struct sbdep *sbdep;
{
	struct inodedep *inodedep;
	struct fs *bpfs;
	struct fs *fs;

	bpfs = sbdep->sb_fs;
	fs = sbdep->sb_ump->um_fs;
	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
	if (inodedep) {
		fs->fs_sujfree = inodedep->id_ino;
		/* The head is about to be referenced by the superblock. */
		inodedep->id_state |= UNLINKPREV;
	} else
		fs->fs_sujfree = 0;
	bpfs->fs_sujfree = fs->fs_sujfree;
}

/*
 * After a superblock is written determine whether it must be written again
 * due to a changing unlinked list head.
 */
static int
handle_written_sbdep(sbdep, bp)
	struct sbdep *sbdep;
	struct buf *bp;
{
	struct inodedep *inodedep;
	struct mount *mp;
	struct fs *fs;

	fs = sbdep->sb_fs;
	mp = UFSTOVFS(sbdep->sb_ump);
	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
	/*
	 * If the unlinked head changed while the write was in flight,
	 * re-dirty the buffer and return 1 to request another write.
	 */
	if ((inodedep && fs->fs_sujfree != inodedep->id_ino) ||
	    (inodedep == NULL && fs->fs_sujfree != 0)) {
		bdirty(bp);
		return (1);
	}
	WORKITEM_FREE(sbdep, D_SBDEP);
	if (fs->fs_sujfree == 0)
		return (0);
	if (inodedep_lookup(mp, fs->fs_sujfree, 0, &inodedep) == 0)
		panic("handle_written_sbdep: lost inodedep");
	/*
	 * Now that we have a record of this inode in stable store allow it
	 * to be written to free up pending work.  Inodes may see a lot of
	 * write activity after they are unlinked which we must not hold up.
	 */
	for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
		if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS)
			panic("handle_written_sbdep: Bad inodedep %p (0x%X)",
			    inodedep, inodedep->id_state);
		if (inodedep->id_state & UNLINKONLIST)
			break;
		inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST;
	}

	return (0);
}

/*
 * Mark an inodedep as unlinked and insert it into the in-memory unlinked
 * list.  No-op unless the mount is journaled (MNTK_SUJ).
 */
static void
unlinked_inodedep(mp, inodedep)
	struct mount *mp;
	struct inodedep *inodedep;
{
	struct ufsmount *ump;

	if ((mp->mnt_kern_flag & MNTK_SUJ) == 0)
		return;
	ump = VFSTOUFS(mp);
	/* Flag the superblock as modified so the new head gets written. */
	ump->um_fs->fs_fmod = 1;
	inodedep->id_state |= UNLINKED;
	TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked);
}

/*
 * Remove an inodedep from the unlinked inodedep list. This may require
 * disk writes if the inode has made it that far.
 */
static void
clear_unlinked_inodedep(inodedep)
	struct inodedep *inodedep;
{
	struct ufsmount *ump;
	struct inodedep *idp;
	struct inodedep *idn;
	struct fs *fs;
	struct buf *bp;
	ino_t ino;
	ino_t nino;
	ino_t pino;
	int error;

	ump = VFSTOUFS(inodedep->id_list.wk_mp);
	fs = ump->um_fs;
	ino = inodedep->id_ino;
	error = 0;
	for (;;) {
		/*
		 * If nothing has yet been written simply remove us from
		 * the in memory list and return.  This is the most common
		 * case where handle_workitem_remove() loses the final
		 * reference.
		 */
		if ((inodedep->id_state & UNLINKLINKS) == 0)
			break;
		/*
		 * If we have a NEXT pointer and no PREV pointer we can simply
		 * clear NEXT's PREV and remove ourselves from the list.  Be
		 * careful not to clear PREV if the superblock points at
		 * next as well.
		 */
		idn = TAILQ_NEXT(inodedep, id_unlinked);
		if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) {
			if (idn && fs->fs_sujfree != idn->id_ino)
				idn->id_state &= ~UNLINKPREV;
			break;
		}
		/*
		 * Here we have an inodedep which is actually linked into
		 * the list.  We must remove it by forcing a write to the
		 * link before us, whether it be the superblock or an inode.
		 * Unfortunately the list may change while we're waiting
		 * on the buf lock for either resource so we must loop until
		 * we lock the right one.  If both the superblock and an
		 * inode point to this inode we must clear the inode first
		 * followed by the superblock.
		 */
		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
		pino = 0;
		if (idp && (idp->id_state & UNLINKNEXT))
			pino = idp->id_ino;
		FREE_LOCK(&lk);
		/* pino == 0 means the superblock is our predecessor. */
		if (pino == 0)
			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
			    (int)fs->fs_sbsize, 0, 0, 0);
		else
			error = bread(ump->um_devvp,
			    fsbtodb(fs, ino_to_fsba(fs, pino)),
			    (int)fs->fs_bsize, NOCRED, &bp);
		ACQUIRE_LOCK(&lk);
		/* A failed bread() exits the loop; cleanup happens below. */
		if (error)
			break;
		/* If the list has changed restart the loop. */
		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
		nino = 0;
		if (idp && (idp->id_state & UNLINKNEXT))
			nino = idp->id_ino;
		if (nino != pino ||
		    (inodedep->id_state & UNLINKPREV) != UNLINKPREV) {
			FREE_LOCK(&lk);
			brelse(bp);
			ACQUIRE_LOCK(&lk);
			continue;
		}
		/*
		 * Remove us from the in memory list.  After this we cannot
		 * access the inodedep.
		 */
		idn = TAILQ_NEXT(inodedep, id_unlinked);
		inodedep->id_state &= ~(UNLINKED | UNLINKLINKS);
		TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
		/*
		 * Determine the next inode number.
		 */
		nino = 0;
		if (idn) {
			/*
			 * If next isn't on the list we can just clear prev's
			 * state and schedule it to be fixed later.  No need
			 * to synchronously write if we're not in the real
			 * list.
			 */
			if ((idn->id_state & UNLINKPREV) == 0 && pino != 0) {
				idp->id_state &= ~UNLINKNEXT;
				if ((idp->id_state & ONWORKLIST) == 0)
					WORKLIST_INSERT(&bp->b_dep,
					    &idp->id_list);
				FREE_LOCK(&lk);
				bawrite(bp);
				ACQUIRE_LOCK(&lk);
				return;
			}
			nino = idn->id_ino;
		}
		FREE_LOCK(&lk);
		/*
		 * The predecessor's next pointer is manually updated here
		 * so that the NEXT flag is never cleared for an element
		 * that is in the list.
		 */
		if (pino == 0) {
			bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
			ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
			softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
			    bp);
		} else if (fs->fs_magic == FS_UFS1_MAGIC)
			((struct ufs1_dinode *)bp->b_data +
			    ino_to_fsbo(fs, pino))->di_freelink = nino;
		else
			((struct ufs2_dinode *)bp->b_data +
			    ino_to_fsbo(fs, pino))->di_freelink = nino;
		/*
		 * If the bwrite fails we have no recourse to recover.  The
		 * filesystem is corrupted already.
		 */
		bwrite(bp);
		ACQUIRE_LOCK(&lk);
		/*
		 * If the superblock pointer still needs to be cleared force
		 * a write here.
		 */
		if (fs->fs_sujfree == ino) {
			FREE_LOCK(&lk);
			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
			    (int)fs->fs_sbsize, 0, 0, 0);
			bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
			ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
			softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
			    bp);
			bwrite(bp);
			ACQUIRE_LOCK(&lk);
		}
		if (fs->fs_sujfree != ino)
			return;
		panic("clear_unlinked_inodedep: Failed to clear free head");
	}
	if (inodedep->id_ino == fs->fs_sujfree)
		panic("clear_unlinked_inodedep: Freeing head of free list");
	inodedep->id_state &= ~(UNLINKED | UNLINKLINKS);
	TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
	return;
}

/*
 * This workitem decrements the inode's link count.
 * If the link count reaches zero, the file is removed.
 */
static void
handle_workitem_remove(dirrem, xp)
	struct dirrem *dirrem;
	struct vnode *xp;	/* already-locked vnode, or NULL to vget here */
{
	struct inodedep *inodedep;
	struct workhead dotdotwk;
	struct worklist *wk;
	struct ufsmount *ump;
	struct mount *mp;
	struct vnode *vp;
	struct inode *ip;
	ino_t oldinum;
	int error;

	if (dirrem->dm_state & ONWORKLIST)
		panic("handle_workitem_remove: dirrem %p still on worklist",
		    dirrem);
	oldinum = dirrem->dm_oldinum;
	mp = dirrem->dm_list.wk_mp;
	ump = VFSTOUFS(mp);
	if ((vp = xp) == NULL &&
	    (error = ffs_vgetf(mp, oldinum, LK_EXCLUSIVE, &vp,
	    FFSV_FORCEINSMQ)) != 0) {
		softdep_error("handle_workitem_remove: vget", error);
		return;
	}
	ip = VTOI(vp);
	ACQUIRE_LOCK(&lk);
	if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0)
		panic("handle_workitem_remove: lost inodedep");
	if (dirrem->dm_state & ONDEPLIST)
		LIST_REMOVE(dirrem, dm_inonext);
	KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
	    ("handle_workitem_remove: Journal entries not written."));

	/*
	 * Move all dependencies waiting on the remove to complete
	 * from the dirrem to the inode inowait list to be completed
	 * after the inode has been updated and written to disk.  Any
	 * marked MKDIR_PARENT are saved to be completed when the .. ref
	 * is removed.
	 */
	LIST_INIT(&dotdotwk);
	while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) {
		WORKLIST_REMOVE(wk);
		if (wk->wk_state & MKDIR_PARENT) {
			wk->wk_state &= ~MKDIR_PARENT;
			WORKLIST_INSERT(&dotdotwk, wk);
			continue;
		}
		WORKLIST_INSERT(&inodedep->id_inowait, wk);
	}
	/* dm_jwork now holds only the MKDIR_PARENT work saved above. */
	LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list);
	/*
	 * Normal file deletion.
	 */
	if ((dirrem->dm_state & RMDIR) == 0) {
		ip->i_nlink--;
		DIP_SET(ip, i_nlink, ip->i_nlink);
		ip->i_flag |= IN_CHANGE;
		if (ip->i_nlink < ip->i_effnlink)
			panic("handle_workitem_remove: bad file delta");
		if (ip->i_nlink == 0)
			unlinked_inodedep(mp, inodedep);
		inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
		num_dirrem -= 1;
		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
		    ("handle_workitem_remove: worklist not empty. %s",
		    TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type)));
		WORKITEM_FREE(dirrem, D_DIRREM);
		FREE_LOCK(&lk);
		goto out;
	}
	/*
	 * Directory deletion. Decrement reference count for both the
	 * just deleted parent directory entry and the reference for ".".
	 * Arrange to have the reference count on the parent decremented
	 * to account for the loss of "..".
	 */
	ip->i_nlink -= 2;
	DIP_SET(ip, i_nlink, ip->i_nlink);
	ip->i_flag |= IN_CHANGE;
	if (ip->i_nlink < ip->i_effnlink)
		panic("handle_workitem_remove: bad dir delta");
	if (ip->i_nlink == 0)
		unlinked_inodedep(mp, inodedep);
	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
	/*
	 * Rename a directory to a new parent.  Since, we are both deleting
	 * and creating a new directory entry, the link count on the new
	 * directory should not change.  Thus we skip the followup dirrem.
	 */
	if (dirrem->dm_state & DIRCHG) {
		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
		    ("handle_workitem_remove: DIRCHG and worklist not empty."));
		num_dirrem -= 1;
		WORKITEM_FREE(dirrem, D_DIRREM);
		FREE_LOCK(&lk);
		goto out;
	}
	/* Reuse the dirrem for the followup ".." removal on the parent. */
	dirrem->dm_state = ONDEPLIST;
	dirrem->dm_oldinum = dirrem->dm_dirinum;
	/*
	 * Place the dirrem on the parent's diremhd list.
	 */
	if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0)
		panic("handle_workitem_remove: lost dir inodedep");
	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
	/*
	 * If the allocated inode has never been written to disk, then
	 * the on-disk inode is zero'ed and we can remove the file
	 * immediately.  When journaling if the inode has been marked
	 * unlinked and not DEPCOMPLETE we know it can never be written.
	 */
	inodedep_lookup(mp, oldinum, 0, &inodedep);
	if (inodedep == NULL ||
	    (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED ||
	    check_inode_unwritten(inodedep)) {
		if (xp != NULL)
			add_to_worklist(&dirrem->dm_list, 0);
		FREE_LOCK(&lk);
		if (xp == NULL) {
			vput(vp);
			/* Recurse once to process the parent's ".." dirrem. */
			handle_workitem_remove(dirrem, NULL);
		}
		return;
	}
	WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
	FREE_LOCK(&lk);
	ip->i_flag |= IN_CHANGE;
out:
	/* Push the updated inode; drop the vnode only if we vget'ed it. */
	ffs_update(vp, 0);
	if (xp == NULL)
		vput(vp);
}

/*
 * Inode de-allocation dependencies.
 *
 * When an inode's link count is reduced to zero, it can be de-allocated.  We
 * found it convenient to postpone de-allocation until after the inode is
 * written to disk with its new link count (zero).  At this point, all of the
 * on-disk inode's block pointers are nullified and, with careful dependency
 * list ordering, all dependencies related to the inode will be satisfied and
 * the corresponding dependency structures de-allocated.  So, if/when the
 * inode is reused, there will be no mixing of old dependencies with new
 * ones.  This artificial dependency is set up by the block de-allocation
 * procedure above (softdep_setup_freeblocks) and completed by the
 * following procedure.
 */
static void
handle_workitem_freefile(freefile)
	struct freefile *freefile;
{
	struct workhead wkhd;
	struct fs *fs;
	struct inodedep *idp;
	struct ufsmount *ump;
	int error;

	ump = VFSTOUFS(freefile->fx_list.wk_mp);
	fs = ump->um_fs;
#ifdef DEBUG
	/* Sanity check: no inodedep may still exist for the freed inode. */
	ACQUIRE_LOCK(&lk);
	error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp);
	FREE_LOCK(&lk);
	if (error)
		panic("handle_workitem_freefile: inodedep %p survived", idp);
#endif
	UFS_LOCK(ump);
	fs->fs_pendinginodes -= 1;
	UFS_UNLOCK(ump);
	/* Hand any remaining journal work to ffs_freefile() via wkhd. */
	LIST_INIT(&wkhd);
	LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list);
	if ((error = ffs_freefile(ump, fs, freefile->fx_devvp,
	    freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0)
		softdep_error("handle_workitem_freefile", error);
	ACQUIRE_LOCK(&lk);
	WORKITEM_FREE(freefile, D_FREEFILE);
	FREE_LOCK(&lk);
}


/*
 * Helper function which unlinks marker element from work list and returns
 * the next element on the list.
 */
static __inline struct worklist *
markernext(struct worklist *marker)
{
	struct worklist *next;

	next = LIST_NEXT(marker, wk_list);
	LIST_REMOVE(marker, wk_list);
	return next;
}

/*
 * Disk writes.
 *
 * The dependency structures constructed above are most actively used when file
 * system blocks are written to disk.  No constraints are placed on when a
 * block can be written, but unsatisfied update dependencies are made safe by
 * modifying (or replacing) the source memory for the duration of the disk
 * write.  When the disk write completes, the memory block is again brought
 * up-to-date.
 *
 * In-core inode structure reclamation.
 *
 * Because there are a finite number of "in-core" inode structures, they are
 * reused regularly.
 * By transferring all inode-related dependencies to the
 * in-memory inode block and indexing them separately (via "inodedep"s), we
 * can allow "in-core" inode structures to be reused at any time and avoid
 * any increase in contention.
 *
 * Called just before entering the device driver to initiate a new disk I/O.
 * The buffer must be locked, thus, no I/O completion operations can occur
 * while we are manipulating its associated dependencies.
 */
static void
softdep_disk_io_initiation(bp)
	struct buf *bp;		/* structure describing disk write to occur */
{
	struct worklist *wk;
	struct worklist marker;	/* on-stack cursor; survives lk drops */
	struct inodedep *inodedep;
	struct freeblks *freeblks;
	struct jfreeblk *jfreeblk;
	struct newblk *newblk;

	/*
	 * We only care about write operations. There should never
	 * be dependencies for reads.
	 */
	if (bp->b_iocmd != BIO_WRITE)
		panic("softdep_disk_io_initiation: not write");

	if (bp->b_vflags & BV_BKGRDINPROG)
		panic("softdep_disk_io_initiation: Writing buffer with "
		    "background write in progress: %p", bp);

	marker.wk_type = D_LAST + 1;	/* Not a normal workitem */
	PHOLD(curproc);			/* Don't swap out kernel stack */

	ACQUIRE_LOCK(&lk);
	/*
	 * Do any necessary pre-I/O processing.  The marker is inserted
	 * after the current item so the walk stays valid even when a
	 * case below drops lk (jwait) and the list changes underneath us.
	 */
	for (wk = LIST_FIRST(&bp->b_dep); wk != NULL;
	     wk = markernext(&marker)) {
		LIST_INSERT_AFTER(wk, &marker, wk_list);
		switch (wk->wk_type) {

		case D_PAGEDEP:
			initiate_write_filepage(WK_PAGEDEP(wk), bp);
			continue;

		case D_INODEDEP:
			inodedep = WK_INODEDEP(wk);
			if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
				initiate_write_inodeblock_ufs1(inodedep, bp);
			else
				initiate_write_inodeblock_ufs2(inodedep, bp);
			continue;

		case D_INDIRDEP:
			initiate_write_indirdep(WK_INDIRDEP(wk), bp);
			continue;

		case D_BMSAFEMAP:
			initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp);
			continue;

		case D_JSEG:
			WK_JSEG(wk)->js_buf = NULL;
			continue;

		case D_FREEBLKS:
			freeblks = WK_FREEBLKS(wk);
			jfreeblk = LIST_FIRST(&freeblks->fb_jfreeblkhd);
			/*
			 * We have to wait for the jfreeblks to be journaled
			 * before we can write an inodeblock with updated
			 * pointers.  Be careful to arrange the marker so
			 * we revisit the jfreeblk if it's not removed by
			 * the first jwait().
			 */
			if (jfreeblk != NULL) {
				LIST_REMOVE(&marker, wk_list);
				LIST_INSERT_BEFORE(wk, &marker, wk_list);
				jwait(&jfreeblk->jf_list);
			}
			continue;
		case D_ALLOCDIRECT:
		case D_ALLOCINDIR:
			/*
			 * We have to wait for the jnewblk to be journaled
			 * before we can write to a block otherwise the
			 * contents may be confused with an earlier file
			 * at recovery time.  Handle the marker as described
			 * above.
			 */
			newblk = WK_NEWBLK(wk);
			if (newblk->nb_jnewblk != NULL) {
				LIST_REMOVE(&marker, wk_list);
				LIST_INSERT_BEFORE(wk, &marker, wk_list);
				jwait(&newblk->nb_jnewblk->jn_list);
			}
			continue;

		case D_SBDEP:
			initiate_write_sbdep(WK_SBDEP(wk));
			continue;

		case D_MKDIR:
		case D_FREEWORK:
		case D_FREEDEP:
		case D_JSEGDEP:
			/* No pre-I/O work for these types. */
			continue;

		default:
			panic("handle_disk_io_initiation: Unexpected type %s",
			    TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
	}
	FREE_LOCK(&lk);
	PRELE(curproc);		/* Allow swapout of kernel stack */
}

/*
 * Called from within the procedure above to deal with unsatisfied
 * allocation dependencies in a directory. The buffer must be locked,
 * thus, no I/O completion operations can occur while we are
 * manipulating its associated dependencies.  Rolls back each pending
 * directory add (diradd) so a crash cannot expose an entry whose
 * inode has not yet reached the disk.
 */
static void
initiate_write_filepage(pagedep, bp)
	struct pagedep *pagedep;	/* tracked directory page */
	struct buf *bp;			/* directory block being written */
{
	struct jremref *jremref;
	struct jmvref *jmvref;
	struct dirrem *dirrem;
	struct diradd *dap;
	struct direct *ep;
	int i;

	if (pagedep->pd_state & IOSTARTED) {
		/*
		 * This can only happen if there is a driver that does not
		 * understand chaining. Here biodone will reissue the call
		 * to strategy for the incomplete buffers.
		 */
		printf("initiate_write_filepage: already started\n");
		return;
	}
	pagedep->pd_state |= IOSTARTED;
	/*
	 * Wait for all journal remove dependencies to hit the disk.
	 * We can not allow any potentially conflicting directory adds
	 * to be visible before removes and rollback is too difficult.
	 * lk may be dropped and re-acquired, however we hold the buf
	 * locked so the dependency can not go away.
	 */
	LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next)
		while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) {
			stat_jwait_filepage++;
			jwait(&jremref->jr_list);
		}
	while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) {
		stat_jwait_filepage++;
		jwait(&jmvref->jm_list);
	}
	/*
	 * Undo every unwritten directory add in the page: either restore
	 * the inode number the entry is replacing (DIRCHG) or clear it.
	 */
	for (i = 0; i < DAHASHSZ; i++) {
		LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
			ep = (struct direct *)
			    ((char *)bp->b_data + dap->da_offset);
			if (ep->d_ino != dap->da_newinum)
				panic("%s: dir inum %d != new %d",
				    "initiate_write_filepage",
				    ep->d_ino, dap->da_newinum);
			if (dap->da_state & DIRCHG)
				ep->d_ino = dap->da_previous->dm_oldinum;
			else
				ep->d_ino = 0;
			dap->da_state &= ~ATTACHED;
			dap->da_state |= UNDONE;
		}
	}
}

/*
 * Version of initiate_write_inodeblock that handles UFS1 dinodes.
 * Note that any bug fixes made to this routine must be done in the
 * version found below.
 *
 * Called from within the procedure above to deal with unsatisfied
 * allocation dependencies in an inodeblock. The buffer must be
 * locked, thus, no I/O completion operations can occur while we
 * are manipulating its associated dependencies.
 */
static void
initiate_write_inodeblock_ufs1(inodedep, bp)
	struct inodedep *inodedep;
	struct buf *bp;			/* The inode block */
{
	struct allocdirect *adp, *lastadp;
	struct ufs1_dinode *dp;
	struct ufs1_dinode *sip;	/* saved copy for full rollback */
	struct inoref *inoref;
	struct fs *fs;
	ufs_lbn_t i;
#ifdef INVARIANTS
	ufs_lbn_t prevlbn = 0;
#endif
	int deplist;			/* bitmask of rolled-back offsets */

	if (inodedep->id_state & IOSTARTED)
		panic("initiate_write_inodeblock_ufs1: already started");
	inodedep->id_state |= IOSTARTED;
	fs = inodedep->id_fs;
	dp = (struct ufs1_dinode *)bp->b_data +
	    ino_to_fsbo(fs, inodedep->id_ino);

	/*
	 * If we're on the unlinked list but have not yet written our
	 * next pointer initialize it here.
	 */
	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
		struct inodedep *inon;

		inon = TAILQ_NEXT(inodedep, id_unlinked);
		dp->di_freelink = inon ? inon->id_ino : 0;
	}
	/*
	 * If the bitmap is not yet written, then the allocated
	 * inode cannot be written to disk.  Save the entire dinode
	 * and write zeros in its place (preserving only the generation
	 * and unlinked-list link); it is restored at I/O completion.
	 */
	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
		if (inodedep->id_savedino1 != NULL)
			panic("initiate_write_inodeblock_ufs1: I/O underway");
		/* lk is dropped for the allocation; buf lock protects dp. */
		FREE_LOCK(&lk);
		sip = malloc(sizeof(struct ufs1_dinode),
		    M_SAVEDINO, M_SOFTDEP_FLAGS);
		ACQUIRE_LOCK(&lk);
		inodedep->id_savedino1 = sip;
		*inodedep->id_savedino1 = *dp;
		bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
		dp->di_gen = inodedep->id_savedino1->di_gen;
		dp->di_freelink = inodedep->id_savedino1->di_freelink;
		return;
	}
	/*
	 * If no dependencies, then there is nothing to roll back.
	 */
	inodedep->id_savedsize = dp->di_size;
	inodedep->id_savedextsize = 0;
	inodedep->id_savednlink = dp->di_nlink;
	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
	    TAILQ_EMPTY(&inodedep->id_inoreflst))
		return;
	/*
	 * Revert the link count to that of the first unwritten journal entry.
	 */
	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
	if (inoref)
		dp->di_nlink = inoref->if_nlink;
	/*
	 * Set the dependencies to busy.
	 */
	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
	     adp = TAILQ_NEXT(adp, ad_next)) {
#ifdef INVARIANTS
		if (deplist != 0 && prevlbn >= adp->ad_offset)
			panic("softdep_write_inodeblock: lbn order");
		prevlbn = adp->ad_offset;
		if (adp->ad_offset < NDADDR &&
		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
			panic("%s: direct pointer #%jd mismatch %d != %jd",
			    "softdep_write_inodeblock",
			    (intmax_t)adp->ad_offset,
			    dp->di_db[adp->ad_offset],
			    (intmax_t)adp->ad_newblkno);
		if (adp->ad_offset >= NDADDR &&
		    dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
			panic("%s: indirect pointer #%jd mismatch %d != %jd",
			    "softdep_write_inodeblock",
			    (intmax_t)adp->ad_offset - NDADDR,
			    dp->di_ib[adp->ad_offset - NDADDR],
			    (intmax_t)adp->ad_newblkno);
		deplist |= 1 << adp->ad_offset;
		if ((adp->ad_state & ATTACHED) == 0)
			panic("softdep_write_inodeblock: Unknown state 0x%x",
			    adp->ad_state);
#endif /* INVARIANTS */
		adp->ad_state &= ~ATTACHED;
		adp->ad_state |= UNDONE;
	}
	/*
	 * The on-disk inode cannot claim to be any larger than the last
	 * fragment that has been written. Otherwise, the on-disk inode
	 * might have fragments that were not the last block in the file
	 * which would corrupt the filesystem.
	 */
	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
		if (adp->ad_offset >= NDADDR)
			break;
		dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
		/* keep going until hitting a rollback to a frag */
		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
			continue;
		dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
		/* Clear all pointers beyond the fragment rollback point. */
		for (i = adp->ad_offset + 1; i < NDADDR; i++) {
#ifdef INVARIANTS
			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
				panic("softdep_write_inodeblock: lost dep1");
#endif /* INVARIANTS */
			dp->di_db[i] = 0;
		}
		for (i = 0; i < NIADDR; i++) {
#ifdef INVARIANTS
			if (dp->di_ib[i] != 0 &&
			    (deplist & ((1 << NDADDR) << i)) == 0)
				panic("softdep_write_inodeblock: lost dep2");
#endif /* INVARIANTS */
			dp->di_ib[i] = 0;
		}
		return;
	}
	/*
	 * If we have zero'ed out the last allocated block of the file,
	 * roll back the size to the last currently allocated block.
	 * We know that this last allocated block is a full-sized as
	 * we already checked for fragments in the loop above.
	 */
	if (lastadp != NULL &&
	    dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
		for (i = lastadp->ad_offset; i >= 0; i--)
			if (dp->di_db[i] != 0)
				break;
		dp->di_size = (i + 1) * fs->fs_bsize;
	}
	/*
	 * The only dependencies are for indirect blocks.
	 *
	 * The file size for indirect block additions is not guaranteed.
	 * Such a guarantee would be non-trivial to achieve. The conventional
	 * synchronous write implementation also does not make this guarantee.
	 * Fsck should catch and fix discrepancies. Arguably, the file size
	 * can be over-estimated without destroying integrity when the file
	 * moves into the indirect blocks (i.e., is large). If we want to
	 * postpone fsck, we are stuck with this argument.
	 */
	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
		dp->di_ib[adp->ad_offset - NDADDR] = 0;
}

/*
 * Version of initiate_write_inodeblock that handles UFS2 dinodes.
 * Note that any bug fixes made to this routine must be done in the
 * version found above.
 *
 * Called from within the procedure above to deal with unsatisfied
 * allocation dependencies in an inodeblock. The buffer must be
 * locked, thus, no I/O completion operations can occur while we
 * are manipulating its associated dependencies.
 */
static void
initiate_write_inodeblock_ufs2(inodedep, bp)
	struct inodedep *inodedep;
	struct buf *bp;			/* The inode block */
{
	struct allocdirect *adp, *lastadp;
	struct ufs2_dinode *dp;
	struct ufs2_dinode *sip;	/* saved copy for full rollback */
	struct inoref *inoref;
	struct fs *fs;
	ufs_lbn_t i;
#ifdef INVARIANTS
	ufs_lbn_t prevlbn = 0;
#endif
	int deplist;			/* bitmask of rolled-back offsets */

	if (inodedep->id_state & IOSTARTED)
		panic("initiate_write_inodeblock_ufs2: already started");
	inodedep->id_state |= IOSTARTED;
	fs = inodedep->id_fs;
	dp = (struct ufs2_dinode *)bp->b_data +
	    ino_to_fsbo(fs, inodedep->id_ino);

	/*
	 * If we're on the unlinked list but have not yet written our
	 * next pointer initialize it here.
	 */
	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
		struct inodedep *inon;

		inon = TAILQ_NEXT(inodedep, id_unlinked);
		dp->di_freelink = inon ? inon->id_ino : 0;
	}
	/*
	 * If the next pointer was already written, verify that it still
	 * matches the on-disk freelink (consistency check only).
	 */
	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) ==
	    (UNLINKED | UNLINKNEXT)) {
		struct inodedep *inon;
		ino_t freelink;

		inon = TAILQ_NEXT(inodedep, id_unlinked);
		freelink = inon ? inon->id_ino : 0;
		if (freelink != dp->di_freelink)
			panic("ino %p(0x%X) %d, %d != %d",
			    inodedep, inodedep->id_state, inodedep->id_ino,
			    freelink, dp->di_freelink);
	}
	/*
	 * If the bitmap is not yet written, then the allocated
	 * inode cannot be written to disk.  Save the entire dinode
	 * and write zeros in its place (preserving only the generation
	 * and unlinked-list link); it is restored at I/O completion.
	 */
	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
		if (inodedep->id_savedino2 != NULL)
			panic("initiate_write_inodeblock_ufs2: I/O underway");
		/* lk is dropped for the allocation; buf lock protects dp. */
		FREE_LOCK(&lk);
		sip = malloc(sizeof(struct ufs2_dinode),
		    M_SAVEDINO, M_SOFTDEP_FLAGS);
		ACQUIRE_LOCK(&lk);
		inodedep->id_savedino2 = sip;
		*inodedep->id_savedino2 = *dp;
		bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
		dp->di_gen = inodedep->id_savedino2->di_gen;
		dp->di_freelink = inodedep->id_savedino2->di_freelink;
		return;
	}
	/*
	 * If no dependencies, then there is nothing to roll back.
	 */
	inodedep->id_savedsize = dp->di_size;
	inodedep->id_savedextsize = dp->di_extsize;
	inodedep->id_savednlink = dp->di_nlink;
	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
	    TAILQ_EMPTY(&inodedep->id_extupdt) &&
	    TAILQ_EMPTY(&inodedep->id_inoreflst))
		return;
	/*
	 * Revert the link count to that of the first unwritten journal entry.
	 */
	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
	if (inoref)
		dp->di_nlink = inoref->if_nlink;

	/*
	 * Set the ext data dependencies to busy.
	 */
	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
	     adp = TAILQ_NEXT(adp, ad_next)) {
#ifdef INVARIANTS
		if (deplist != 0 && prevlbn >= adp->ad_offset)
			panic("softdep_write_inodeblock: lbn order");
		prevlbn = adp->ad_offset;
		if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno)
			panic("%s: direct pointer #%jd mismatch %jd != %jd",
			    "softdep_write_inodeblock",
			    (intmax_t)adp->ad_offset,
			    (intmax_t)dp->di_extb[adp->ad_offset],
			    (intmax_t)adp->ad_newblkno);
		deplist |= 1 << adp->ad_offset;
		if ((adp->ad_state & ATTACHED) == 0)
			panic("softdep_write_inodeblock: Unknown state 0x%x",
			    adp->ad_state);
#endif /* INVARIANTS */
		adp->ad_state &= ~ATTACHED;
		adp->ad_state |= UNDONE;
	}
	/*
	 * The on-disk inode cannot claim to be any larger than the last
	 * fragment that has been written. Otherwise, the on-disk inode
	 * might have fragments that were not the last block in the ext
	 * data which would corrupt the filesystem.
	 */
	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
		dp->di_extb[adp->ad_offset] = adp->ad_oldblkno;
		/* keep going until hitting a rollback to a frag */
		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
			continue;
		dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
		for (i = adp->ad_offset + 1; i < NXADDR; i++) {
#ifdef INVARIANTS
			if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0)
				panic("softdep_write_inodeblock: lost dep1");
#endif /* INVARIANTS */
			dp->di_extb[i] = 0;
		}
		/*
		 * A fragment rollback already fixed di_extsize, so skip
		 * the size adjustment below by clearing lastadp.
		 */
		lastadp = NULL;
		break;
	}
	/*
	 * If we have zero'ed out the last allocated block of the ext
	 * data, roll back the size to the last currently allocated block.
	 * We know that this last allocated block is a full-sized as
	 * we already checked for fragments in the loop above.
	 */
	if (lastadp != NULL &&
	    dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
		for (i = lastadp->ad_offset; i >= 0; i--)
			if (dp->di_extb[i] != 0)
				break;
		dp->di_extsize = (i + 1) * fs->fs_bsize;
	}
	/*
	 * Set the file data dependencies to busy.
	 */
	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
	     adp = TAILQ_NEXT(adp, ad_next)) {
#ifdef INVARIANTS
		if (deplist != 0 && prevlbn >= adp->ad_offset)
			panic("softdep_write_inodeblock: lbn order");
		prevlbn = adp->ad_offset;
		if (adp->ad_offset < NDADDR &&
		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
			panic("%s: direct pointer #%jd mismatch %jd != %jd",
			    "softdep_write_inodeblock",
			    (intmax_t)adp->ad_offset,
			    (intmax_t)dp->di_db[adp->ad_offset],
			    (intmax_t)adp->ad_newblkno);
		if (adp->ad_offset >= NDADDR &&
		    dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
			panic("%s indirect pointer #%jd mismatch %jd != %jd",
			    "softdep_write_inodeblock:",
			    (intmax_t)adp->ad_offset - NDADDR,
			    (intmax_t)dp->di_ib[adp->ad_offset - NDADDR],
			    (intmax_t)adp->ad_newblkno);
		deplist |= 1 << adp->ad_offset;
		if ((adp->ad_state & ATTACHED) == 0)
			panic("softdep_write_inodeblock: Unknown state 0x%x",
			    adp->ad_state);
#endif /* INVARIANTS */
		adp->ad_state &= ~ATTACHED;
		adp->ad_state |= UNDONE;
	}
	/*
	 * The on-disk inode cannot claim to be any larger than the last
	 * fragment that has been written. Otherwise, the on-disk inode
	 * might have fragments that were not the last block in the file
	 * which would corrupt the filesystem.
	 */
	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
		if (adp->ad_offset >= NDADDR)
			break;
		dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
		/* keep going until hitting a rollback to a frag */
		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
			continue;
		dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
		/* Clear all pointers beyond the fragment rollback point. */
		for (i = adp->ad_offset + 1; i < NDADDR; i++) {
#ifdef INVARIANTS
			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
				panic("softdep_write_inodeblock: lost dep2");
#endif /* INVARIANTS */
			dp->di_db[i] = 0;
		}
		for (i = 0; i < NIADDR; i++) {
#ifdef INVARIANTS
			if (dp->di_ib[i] != 0 &&
			    (deplist & ((1 << NDADDR) << i)) == 0)
				panic("softdep_write_inodeblock: lost dep3");
#endif /* INVARIANTS */
			dp->di_ib[i] = 0;
		}
		return;
	}
	/*
	 * If we have zero'ed out the last allocated block of the file,
	 * roll back the size to the last currently allocated block.
	 * We know that this last allocated block is a full-sized as
	 * we already checked for fragments in the loop above.
	 */
	if (lastadp != NULL &&
	    dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
		for (i = lastadp->ad_offset; i >= 0; i--)
			if (dp->di_db[i] != 0)
				break;
		dp->di_size = (i + 1) * fs->fs_bsize;
	}
	/*
	 * The only dependencies are for indirect blocks.
	 *
	 * The file size for indirect block additions is not guaranteed.
	 * Such a guarantee would be non-trivial to achieve. The conventional
	 * synchronous write implementation also does not make this guarantee.
	 * Fsck should catch and fix discrepancies. Arguably, the file size
	 * can be over-estimated without destroying integrity when the file
	 * moves into the indirect blocks (i.e., is large). If we want to
	 * postpone fsck, we are stuck with this argument.
	 */
	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
		dp->di_ib[adp->ad_offset - NDADDR] = 0;
}

/*
 * Cancel an indirdep as a result of truncation. Release all of the
 * children allocindirs and place their journal work on the appropriate
 * list.
 */
static void
cancel_indirdep(indirdep, bp, inodedep, freeblks)
	struct indirdep *indirdep;	/* indirect-block dependency to cancel */
	struct buf *bp;			/* buffer holding the indirect block */
	struct inodedep *inodedep;
	struct freeblks *freeblks;	/* truncation this cancel is part of */
{
	struct allocindir *aip;

	/*
	 * None of the indirect pointers will ever be visible,
	 * so they can simply be tossed. GOINGAWAY ensures
	 * that allocated pointers will be saved in the buffer
	 * cache until they are freed. Note that they will
	 * only be able to be found by their physical address
	 * since the inode mapping the logical address will
	 * be gone. The save buffer used for the safe copy
	 * was allocated in setup_allocindir_phase2 using
	 * the physical address so it could be used for this
	 * purpose. Hence we swap the safe copy with the real
	 * copy, allowing the safe copy to be freed and holding
	 * on to the real copy for later use in indir_trunc.
	 */
	if (indirdep->ir_state & GOINGAWAY)
		panic("cancel_indirdep: already gone");
	if (indirdep->ir_state & ONDEPLIST) {
		indirdep->ir_state &= ~ONDEPLIST;
		LIST_REMOVE(indirdep, ir_next);
	}
	indirdep->ir_state |= GOINGAWAY;
	VFSTOUFS(indirdep->ir_list.wk_mp)->um_numindirdeps += 1;
	/* Cancel every child allocindir, whatever stage it has reached. */
	while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
		cancel_allocindir(aip, inodedep, freeblks);
	while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0)
		cancel_allocindir(aip, inodedep, freeblks);
	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0)
		cancel_allocindir(aip, inodedep, freeblks);
	while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != 0)
		cancel_allocindir(aip, inodedep, freeblks);
	bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount);
	WORKLIST_REMOVE(&indirdep->ir_list);
	WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list);
	indirdep->ir_savebp = NULL;
}

/*
 * Free an indirdep once it no longer has new pointers to track.
 */
static void
free_indirdep(indirdep)
	struct indirdep *indirdep;
{

	/* All dependency and journal lists must already be drained. */
	KASSERT(LIST_EMPTY(&indirdep->ir_jwork),
	    ("free_indirdep: Journal work not empty."));
	KASSERT(LIST_EMPTY(&indirdep->ir_completehd),
	    ("free_indirdep: Complete head not empty."));
	KASSERT(LIST_EMPTY(&indirdep->ir_writehd),
	    ("free_indirdep: write head not empty."));
	KASSERT(LIST_EMPTY(&indirdep->ir_donehd),
	    ("free_indirdep: done head not empty."));
	KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd),
	    ("free_indirdep: deplist head not empty."));
	KASSERT(indirdep->ir_savebp == NULL,
	    ("free_indirdep: %p ir_savebp != NULL", indirdep));
	KASSERT((indirdep->ir_state & ONDEPLIST) == 0,
	    ("free_indirdep: %p still on deplist.", indirdep));
	if (indirdep->ir_state & ONWORKLIST)
		WORKLIST_REMOVE(&indirdep->ir_list);
	WORKITEM_FREE(indirdep, D_INDIRDEP);
}

/*
 * Called before a write to an indirdep. This routine is responsible for
 * rolling back pointers to a safe state which includes only those
 * allocindirs which have been completed.
 */
static void
initiate_write_indirdep(indirdep, bp)
	struct indirdep *indirdep;
	struct buf *bp;			/* indirect block being written */
{

	if (indirdep->ir_state & GOINGAWAY)
		panic("disk_io_initiation: indirdep gone");

	/*
	 * If there are no remaining dependencies, this will be writing
	 * the real pointers.
	 */
	if (LIST_EMPTY(&indirdep->ir_deplisthd))
		return;
	/*
	 * Replace up-to-date version with safe version.  The current
	 * contents are stashed in ir_saveddata so they can be restored
	 * when the write completes.  lk is dropped around the malloc;
	 * the locked buf keeps the dependency from going away.
	 */
	FREE_LOCK(&lk);
	indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
	    M_SOFTDEP_FLAGS);
	ACQUIRE_LOCK(&lk);
	indirdep->ir_state &= ~ATTACHED;
	indirdep->ir_state |= UNDONE;
	bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
	bcopy(indirdep->ir_savebp->b_data, bp->b_data,
	    bp->b_bcount);
}

/*
 * Called when an inode has been cleared in a cg bitmap. This finally
 * eliminates any canceled jaddrefs
 */
void
softdep_setup_inofree(mp, bp, ino, wkhd)
	struct mount *mp;
	struct buf *bp;			/* cg bitmap buffer */
	ino_t ino;			/* inode just cleared in the bitmap */
	struct workhead *wkhd;		/* dependencies to move to the buf */
{
	struct worklist *wk, *wkn;
	struct inodedep *inodedep;
	uint8_t *inosused;
	struct cg *cgp;
	struct fs *fs;

	ACQUIRE_LOCK(&lk);
	fs = VFSTOUFS(mp)->um_fs;
	cgp = (struct cg *)bp->b_data;
	inosused = cg_inosused(cgp);
	/* The caller must have already cleared the inode in the bitmap. */
	if (isset(inosused, ino % fs->fs_ipg))
		panic("softdep_setup_inofree: inode %d not freed.", ino);
	if (inodedep_lookup(mp, ino, 0, &inodedep))
		panic("softdep_setup_inofree: ino %d has existing inodedep %p",
		    ino, inodedep);
	if (wkhd) {
		LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) {
			if (wk->wk_type != D_JADDREF)
				continue;
			WORKLIST_REMOVE(wk);
			/*
			 * We can free immediately even if the jaddref
			 * isn't attached in a background write as now
			 * the bitmaps are reconciled.
			 */
			wk->wk_state |= COMPLETE | ATTACHED;
			free_jaddref(WK_JADDREF(wk));
		}
		jwork_move(&bp->b_dep, wkhd);
	}
	FREE_LOCK(&lk);
}


/*
 * Called via ffs_blkfree() after a set of frags has been cleared from a cg
 * map. Any dependencies waiting for the write to clear are added to the
 * buf's list and any jnewblks that are being canceled are discarded
 * immediately.
 */
void
softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
	struct mount *mp;
	struct buf *bp;			/* cg bitmap buffer */
	ufs2_daddr_t blkno;		/* first fragment freed */
	int frags;			/* number of fragments freed */
	struct workhead *wkhd;		/* dependencies to move to the buf */
{
	struct jnewblk *jnewblk;
	struct worklist *wk, *wkn;
#ifdef SUJ_DEBUG
	struct bmsafemap *bmsafemap;
	struct fs *fs;
	uint8_t *blksfree;
	struct cg *cgp;
	ufs2_daddr_t jstart;
	ufs2_daddr_t jend;
	ufs2_daddr_t end;
	long bno;
	int i;
#endif

	ACQUIRE_LOCK(&lk);
	/*
	 * Detach any jnewblks which have been canceled. They must linger
	 * until the bitmap is cleared again by ffs_blkfree() to prevent
	 * an unjournaled allocation from hitting the disk.
	 */
	if (wkhd) {
		LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) {
			if (wk->wk_type != D_JNEWBLK)
				continue;
			jnewblk = WK_JNEWBLK(wk);
			KASSERT(jnewblk->jn_state & GOINGAWAY,
			    ("softdep_setup_blkfree: jnewblk not canceled."));
			WORKLIST_REMOVE(wk);
#ifdef SUJ_DEBUG
			/*
			 * Assert that this block is free in the bitmap
			 * before we discard the jnewblk.
			 */
			fs = VFSTOUFS(mp)->um_fs;
			cgp = (struct cg *)bp->b_data;
			blksfree = cg_blksfree(cgp);
			bno = dtogd(fs, jnewblk->jn_blkno);
			for (i = jnewblk->jn_oldfrags;
			    i < jnewblk->jn_frags; i++) {
				if (isset(blksfree, bno + i))
					continue;
				panic("softdep_setup_blkfree: not free");
			}
#endif
			/*
			 * Even if it's not attached we can free immediately
			 * as the new bitmap is correct.
			 */
			wk->wk_state |= COMPLETE | ATTACHED;
			free_jnewblk(jnewblk);
		}
		/*
		 * The buf must be locked by the caller otherwise these could
		 * be added while it's being written and the write would
		 * complete them before they made it to disk.
		 */
		jwork_move(&bp->b_dep, wkhd);
	}

#ifdef SUJ_DEBUG
	/*
	 * Assert that we are not freeing a block which has an outstanding
	 * allocation dependency.
	 */
	fs = VFSTOUFS(mp)->um_fs;
	bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno));
	end = blkno + frags;
	LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
		/*
		 * Don't match against blocks that will be freed when the
		 * background write is done.
		 */
		if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) ==
		    (COMPLETE | DEPCOMPLETE))
			continue;
		jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags;
		jend = jnewblk->jn_blkno + jnewblk->jn_frags;
		if ((blkno >= jstart && blkno < jend) ||
		    (end > jstart && end <= jend)) {
			printf("state 0x%X %jd - %d %d dep %p\n",
			    jnewblk->jn_state, jnewblk->jn_blkno,
			    jnewblk->jn_oldfrags, jnewblk->jn_frags,
			    jnewblk->jn_newblk);
			panic("softdep_setup_blkfree: "
			    "%jd-%jd(%d) overlaps with %jd-%jd",
			    blkno, end, frags, jstart, jend);
		}
	}
#endif
	FREE_LOCK(&lk);
}

/*
 * Called just before a cylinder-group bitmap block is written.  Any
 * inode or block allocations whose journal records have not yet hit
 * the disk are rolled back out of the bitmap so an unjournaled
 * allocation can never be visible after a crash.
 */
static void
initiate_write_bmsafemap(bmsafemap, bp)
	struct bmsafemap *bmsafemap;
	struct buf *bp;			/* The cg block. */
{
	struct jaddref *jaddref;
	struct jnewblk *jnewblk;
	uint8_t *inosused;
	uint8_t *blksfree;
	struct cg *cgp;
	struct fs *fs;
	int cleared;
	ino_t ino;
	long bno;
	int i;

	if (bmsafemap->sm_state & IOSTARTED)
		panic("initiate_write_bmsafemap: Already started\n");
	bmsafemap->sm_state |= IOSTARTED;
	/*
	 * Clear any inode allocations which are pending journal writes.
	 */
	if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) {
		cgp = (struct cg *)bp->b_data;
		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
		inosused = cg_inosused(cgp);
		LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) {
			ino = jaddref->ja_ino % fs->fs_ipg;
			/*
			 * If this is a background copy the inode may not
			 * be marked used yet.
			 */
			if (isset(inosused, ino)) {
				if ((jaddref->ja_mode & IFMT) == IFDIR)
					cgp->cg_cs.cs_ndir--;
				cgp->cg_cs.cs_nifree++;
				clrbit(inosused, ino);
				jaddref->ja_state &= ~ATTACHED;
				jaddref->ja_state |= UNDONE;
				stat_jaddref++;
			} else if ((bp->b_xflags & BX_BKGRDMARKER) == 0)
				panic("initiate_write_bmsafemap: inode %d "
				    "marked free", jaddref->ja_ino);
		}
	}
	/*
	 * Clear any block allocations which are pending journal writes.
	 */
	if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
		cgp = (struct cg *)bp->b_data;
		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
		blksfree = cg_blksfree(cgp);
		LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
			bno = dtogd(fs, jnewblk->jn_blkno);
			cleared = 0;
			for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
			    i++) {
				if (isclr(blksfree, bno + i)) {
					cleared = 1;
					setbit(blksfree, bno + i);
				}
			}
			/*
			 * We may not clear the block if it's a background
			 * copy. In that case there is no reason to detach
			 * it.
			 */
			if (cleared) {
				stat_jnewblk++;
				jnewblk->jn_state &= ~ATTACHED;
				jnewblk->jn_state |= UNDONE;
			} else if ((bp->b_xflags & BX_BKGRDMARKER) == 0)
				panic("initiate_write_bmsafemap: block %jd "
				    "marked free", jnewblk->jn_blkno);
		}
	}
	/*
	 * Move allocation lists to the written lists so they can be
	 * cleared once the block write is complete.
	 */
	LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr,
	    inodedep, id_deps);
	LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
	    newblk, nb_deps);
}

/*
 * This routine is called during the completion interrupt
 * service routine for a disk write (from the procedure called
 * by the device driver to inform the filesystem caches of
 * a request completion). It should be called early in this
 * procedure, before the block is made available to other
 * processes or other routines are called.
 *
 */
static void
softdep_disk_write_complete(bp)
	struct buf *bp;		/* describes the completed disk write */
{
	struct worklist *wk;
	struct worklist *owk;		/* previous item; catches list loops */
	struct workhead reattach;	/* items that must be redone */
	struct buf *sbp;		/* saved buf to release after unlock */

	/*
	 * If an error occurred while doing the write, then the data
	 * has not hit the disk and the dependencies cannot be unrolled.
	 */
	if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0)
		return;
	LIST_INIT(&reattach);
	/*
	 * This lock must not be released anywhere in this code segment.
	 */
	sbp = NULL;
	owk = NULL;
	ACQUIRE_LOCK(&lk);
	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
		WORKLIST_REMOVE(wk);
		if (wk == owk)
			panic("duplicate worklist: %p\n", wk);
		owk = wk;
		switch (wk->wk_type) {

		case D_PAGEDEP:
			if (handle_written_filepage(WK_PAGEDEP(wk), bp))
				WORKLIST_INSERT(&reattach, wk);
			continue;

		case D_INODEDEP:
			if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
				WORKLIST_INSERT(&reattach, wk);
			continue;

		case D_BMSAFEMAP:
			if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp))
				WORKLIST_INSERT(&reattach, wk);
			continue;

		case D_MKDIR:
			handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
			continue;

		case D_ALLOCDIRECT:
			wk->wk_state |= COMPLETE;
			handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL);
			continue;

		case D_ALLOCINDIR:
			wk->wk_state |= COMPLETE;
			handle_allocindir_partdone(WK_ALLOCINDIR(wk));
			continue;

		case D_INDIRDEP:
			if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp))
				WORKLIST_INSERT(&reattach, wk);
			continue;

		case D_FREEBLKS:
			wk->wk_state |= COMPLETE;
			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE)
				add_to_worklist(wk, 1);
			continue;

		case D_FREEWORK:
			handle_written_freework(WK_FREEWORK(wk));
			break;

		case D_FREEDEP:
			free_freedep(WK_FREEDEP(wk));
			continue;

		case D_JSEGDEP:
			free_jsegdep(WK_JSEGDEP(wk));
			continue;

		case D_JSEG:
			handle_written_jseg(WK_JSEG(wk), bp);
			continue;

		case D_SBDEP:
			if (handle_written_sbdep(WK_SBDEP(wk), bp))
				WORKLIST_INSERT(&reattach, wk);
			continue;

		default:
			panic("handle_disk_write_complete: Unknown type %s",
			    TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
	}
	/*
	 * Reattach any requests that must be redone.
	 */
	while ((wk = LIST_FIRST(&reattach)) != NULL) {
		WORKLIST_REMOVE(wk);
		WORKLIST_INSERT(&bp->b_dep, wk);
	}
	FREE_LOCK(&lk);
	/* brelse() may sleep, so it must happen after lk is released. */
	if (sbp)
		brelse(sbp);
}

/*
 * Called from within softdep_disk_write_complete above. Note that
 * this routine is always called from interrupt level with further
 * splbio interrupts blocked.
 */
static void
handle_allocdirect_partdone(adp, wkhd)
	struct allocdirect *adp;	/* the completed allocdirect */
	struct workhead *wkhd;		/* Work to do when inode is writtne. */
{
	struct allocdirectlst *listhead;
	struct allocdirect *listadp;
	struct inodedep *inodedep;
	long bsize;

	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
		return;
	/*
	 * The on-disk inode cannot claim to be any larger than the last
	 * fragment that has been written. Otherwise, the on-disk inode
	 * might have fragments that were not the last block in the file
	 * which would corrupt the filesystem. Thus, we cannot free any
	 * allocdirects after one whose ad_oldblkno claims a fragment as
	 * these blocks must be rolled back to zero before writing the inode.
	 * We check the currently active set of allocdirects in id_inoupdt
	 * or id_extupdt as appropriate.
	 */
	inodedep = adp->ad_inodedep;
	bsize = inodedep->id_fs->fs_bsize;
	if (adp->ad_state & EXTDATA)
		listhead = &inodedep->id_extupdt;
	else
		listhead = &inodedep->id_inoupdt;
	TAILQ_FOREACH(listadp, listhead, ad_next) {
		/* found our block */
		if (listadp == adp)
			break;
		/* continue if ad_oldlbn is not a fragment */
		if (listadp->ad_oldsize == 0 ||
		    listadp->ad_oldsize == bsize)
			continue;
		/* hit a fragment */
		return;
	}
	/*
	 * If we have reached the end of the current list without
	 * finding the just finished dependency, then it must be
	 * on the future dependency list. Future dependencies cannot
	 * be freed until they are moved to the current list.
	 */
	if (listadp == NULL) {
#ifdef DEBUG
		if (adp->ad_state & EXTDATA)
			listhead = &inodedep->id_newextupdt;
		else
			listhead = &inodedep->id_newinoupdt;
		TAILQ_FOREACH(listadp, listhead, ad_next)
			/* found our block */
			if (listadp == adp)
				break;
		if (listadp == NULL)
			panic("handle_allocdirect_partdone: lost dep");
#endif /* DEBUG */
		return;
	}
	/*
	 * If we have found the just finished dependency, then queue
	 * it along with anything that follows it that is complete.
	 * Since the pointer has not yet been written in the inode
	 * as the dependency prevents it, place the allocdirect on the
	 * bufwait list where it will be freed once the pointer is
	 * valid.
	 */
	if (wkhd == NULL)
		wkhd = &inodedep->id_bufwait;
	for (; adp; adp = listadp) {
		listadp = TAILQ_NEXT(adp, ad_next);
		if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
			return;
		TAILQ_REMOVE(listhead, adp, ad_next);
		WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list);
	}
}

/*
 * Called from within softdep_disk_write_complete above. This routine
 * completes successfully written allocindirs.
 */
static void
handle_allocindir_partdone(aip)
	struct allocindir *aip;		/* the completed allocindir */
{
	struct indirdep *indirdep;

	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
		return;
	indirdep = aip->ai_indirdep;
	LIST_REMOVE(aip, ai_next);
	/*
	 * If the indirect block is currently rolled back, defer the
	 * pointer update until the rollback is undone.
	 */
	if (indirdep->ir_state & UNDONE) {
		LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
		return;
	}
	if (indirdep->ir_state & UFS1FMT)
		((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
		    aip->ai_newblkno;
	else
		((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
		    aip->ai_newblkno;
	/*
	 * Await the pointer write before freeing the allocindir.
	 */
	LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next);
}

/*
 * Release segments held on a jwork list.
 */
static void
handle_jwork(wkhd)
	struct workhead *wkhd;
{
	struct worklist *wk;

	while ((wk = LIST_FIRST(wkhd)) != NULL) {
		WORKLIST_REMOVE(wk);
		switch (wk->wk_type) {
		case D_JSEGDEP:
			free_jsegdep(WK_JSEGDEP(wk));
			continue;
		default:
			panic("handle_jwork: Unknown type %s\n",
			    TYPENAME(wk->wk_type));
		}
	}
}

/*
 * Handle the bufwait list on an inode when it is safe to release items
 * held there.  This normally happens after an inode block is written but
 * may be delayed and handled later if there are pending journal items that
 * are not yet safe to be released.
 */
static struct freefile *
handle_bufwait(inodedep, refhd)
	struct inodedep *inodedep;
	struct workhead *refhd;
{
	struct jaddref *jaddref;
	struct freefile *freefile;
	struct worklist *wk;

	freefile = NULL;
	while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
		WORKLIST_REMOVE(wk);
		switch (wk->wk_type) {
		case D_FREEFILE:
			/*
			 * We defer adding freefile to the worklist
			 * until all other additions have been made to
			 * ensure that it will be done after all the
			 * old blocks have been freed.
			 */
			if (freefile != NULL)
				panic("handle_bufwait: freefile");
			freefile = WK_FREEFILE(wk);
			continue;

		case D_MKDIR:
			handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
			continue;

		case D_DIRADD:
			diradd_inode_written(WK_DIRADD(wk), inodedep);
			continue;

		case D_FREEFRAG:
			wk->wk_state |= COMPLETE;
			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE)
				add_to_worklist(wk, 0);
			continue;

		case D_DIRREM:
			wk->wk_state |= COMPLETE;
			add_to_worklist(wk, 0);
			continue;

		case D_ALLOCDIRECT:
		case D_ALLOCINDIR:
			free_newblk(WK_NEWBLK(wk));
			continue;

		case D_JNEWBLK:
			wk->wk_state |= COMPLETE;
			free_jnewblk(WK_JNEWBLK(wk));
			continue;

		/*
		 * Save freed journal segments and add references on
		 * the supplied list which will delay their release
		 * until the cg bitmap is cleared on disk.
		 */
		case D_JSEGDEP:
			if (refhd == NULL)
				free_jsegdep(WK_JSEGDEP(wk));
			else
				WORKLIST_INSERT(refhd, wk);
			continue;

		case D_JADDREF:
			jaddref = WK_JADDREF(wk);
			TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
			    if_deps);
			/*
			 * Transfer any jaddrefs to the list to be freed with
			 * the bitmap if we're handling a removed file.
			 */
			if (refhd == NULL) {
				wk->wk_state |= COMPLETE;
				free_jaddref(jaddref);
			} else
				WORKLIST_INSERT(refhd, wk);
			continue;

		default:
			panic("handle_bufwait: Unknown type %p(%s)",
			    wk, TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
	}
	return (freefile);
}
/*
 * Called from within softdep_disk_write_complete above to restore
 * in-memory inode block contents to their most up-to-date state.  Note
 * that this routine is always called from interrupt level with further
 * splbio interrupts blocked.  Returns non-zero if the buffer must be
 * redirtied and its dependency reattached.
 */
static int 
handle_written_inodeblock(inodedep, bp)
	struct inodedep *inodedep;
	struct buf *bp;		/* buffer containing the inode block */
{
	struct freefile *freefile;
	struct allocdirect *adp, *nextadp;
	struct ufs1_dinode *dp1 = NULL;
	struct ufs2_dinode *dp2 = NULL;
	struct workhead wkhd;
	int hadchanges, fstype;
	ino_t freelink;

	LIST_INIT(&wkhd);
	hadchanges = 0;
	freefile = NULL;
	if ((inodedep->id_state & IOSTARTED) == 0)
		panic("handle_written_inodeblock: not started");
	inodedep->id_state &= ~IOSTARTED;
	if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
		fstype = UFS1;
		dp1 = (struct ufs1_dinode *)bp->b_data +
		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
		freelink = dp1->di_freelink;
	} else {
		fstype = UFS2;
		dp2 = (struct ufs2_dinode *)bp->b_data +
		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
		freelink = dp2->di_freelink;
	}
	/*
	 * If we wrote a valid freelink pointer during the last write
	 * record it here.
	 */
	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
		struct inodedep *inon;

		inon = TAILQ_NEXT(inodedep, id_unlinked);
		if ((inon == NULL && freelink == 0) ||
		    (inon && inon->id_ino == freelink)) {
			if (inon)
				inon->id_state |= UNLINKPREV;
			inodedep->id_state |= UNLINKNEXT;
		} else
			hadchanges = 1;
	}
	/* Leave this inodeblock dirty until it's in the list. */
	if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED)
		hadchanges = 1;
	/*
	 * If we had to rollback the inode allocation because of
	 * bitmaps being incomplete, then simply restore it.
	 * Keep the block dirty so that it will not be reclaimed until
	 * all associated dependencies have been cleared and the
	 * corresponding updates written to disk.
	 */
	if (inodedep->id_savedino1 != NULL) {
		hadchanges = 1;
		if (fstype == UFS1)
			*dp1 = *inodedep->id_savedino1;
		else
			*dp2 = *inodedep->id_savedino2;
		free(inodedep->id_savedino1, M_SAVEDINO);
		inodedep->id_savedino1 = NULL;
		if ((bp->b_flags & B_DELWRI) == 0)
			stat_inode_bitmap++;
		bdirty(bp);
		/*
		 * If the inode is clear here and GOINGAWAY it will never
		 * be written.  Process the bufwait and clear any pending
		 * work which may include the freefile.
		 */
		if (inodedep->id_state & GOINGAWAY)
			goto bufwait;
		return (1);
	}
	inodedep->id_state |= COMPLETE;
	/*
	 * Roll forward anything that had to be rolled back before 
	 * the inode could be updated.
	 */
	for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
		nextadp = TAILQ_NEXT(adp, ad_next);
		if (adp->ad_state & ATTACHED)
			panic("handle_written_inodeblock: new entry");
		if (fstype == UFS1) {
			if (adp->ad_offset < NDADDR) {
				if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno)
					panic("%s %s #%jd mismatch %d != %jd",
					    "handle_written_inodeblock:",
					    "direct pointer",
					    (intmax_t)adp->ad_offset,
					    dp1->di_db[adp->ad_offset],
					    (intmax_t)adp->ad_oldblkno);
				dp1->di_db[adp->ad_offset] = adp->ad_newblkno;
			} else {
				if (dp1->di_ib[adp->ad_offset - NDADDR] != 0)
					panic("%s: %s #%jd allocated as %d",
					    "handle_written_inodeblock",
					    "indirect pointer",
					    (intmax_t)adp->ad_offset - NDADDR,
					    dp1->di_ib[adp->ad_offset - NDADDR]);
				dp1->di_ib[adp->ad_offset - NDADDR] =
				    adp->ad_newblkno;
			}
		} else {
			if (adp->ad_offset < NDADDR) {
				if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno)
					panic("%s: %s #%jd %s %jd != %jd",
					    "handle_written_inodeblock",
					    "direct pointer",
					    (intmax_t)adp->ad_offset, "mismatch",
					    (intmax_t)dp2->di_db[adp->ad_offset],
					    (intmax_t)adp->ad_oldblkno);
				dp2->di_db[adp->ad_offset] = adp->ad_newblkno;
			} else {
				if (dp2->di_ib[adp->ad_offset - NDADDR] != 0)
					panic("%s: %s #%jd allocated as %jd",
					    "handle_written_inodeblock",
					    "indirect pointer",
					    (intmax_t)adp->ad_offset - NDADDR,
					    (intmax_t)
					    dp2->di_ib[adp->ad_offset - NDADDR]);
				dp2->di_ib[adp->ad_offset - NDADDR] =
				    adp->ad_newblkno;
			}
		}
		adp->ad_state &= ~UNDONE;
		adp->ad_state |= ATTACHED;
		hadchanges = 1;
	}
	/* Same roll-forward for the extended-attribute block pointers. */
	for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) {
		nextadp = TAILQ_NEXT(adp, ad_next);
		if (adp->ad_state & ATTACHED)
			panic("handle_written_inodeblock: new entry");
		if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno)
			panic("%s: direct pointers #%jd %s %jd != %jd",
			    "handle_written_inodeblock",
			    (intmax_t)adp->ad_offset, "mismatch",
			    (intmax_t)dp2->di_extb[adp->ad_offset],
			    (intmax_t)adp->ad_oldblkno);
		dp2->di_extb[adp->ad_offset] = adp->ad_newblkno;
		adp->ad_state &= ~UNDONE;
		adp->ad_state |= ATTACHED;
		hadchanges = 1;
	}
	if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
		stat_direct_blk_ptrs++;
	/*
	 * Reset the file size to its most up-to-date value.
	 */
	if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1)
		panic("handle_written_inodeblock: bad size");
	if (inodedep->id_savednlink > LINK_MAX)
		panic("handle_written_inodeblock: Invalid link count "
		    "%d for inodedep %p", inodedep->id_savednlink, inodedep);
	if (fstype == UFS1) {
		if (dp1->di_nlink != inodedep->id_savednlink) { 
			dp1->di_nlink = inodedep->id_savednlink;
			hadchanges = 1;
		}
		if (dp1->di_size != inodedep->id_savedsize) {
			dp1->di_size = inodedep->id_savedsize;
			hadchanges = 1;
		}
	} else {
		if (dp2->di_nlink != inodedep->id_savednlink) { 
			dp2->di_nlink = inodedep->id_savednlink;
			hadchanges = 1;
		}
		if (dp2->di_size != inodedep->id_savedsize) {
			dp2->di_size = inodedep->id_savedsize;
			hadchanges = 1;
		}
		if (dp2->di_extsize != inodedep->id_savedextsize) {
			dp2->di_extsize = inodedep->id_savedextsize;
			hadchanges = 1;
		}
	}
	inodedep->id_savedsize = -1;
	inodedep->id_savedextsize = -1;
	inodedep->id_savednlink = -1;
	/*
	 * If there were any rollbacks in the inode block, then it must be
	 * marked dirty so that it will eventually get written back in
	 * its correct form.
	 */
	if (hadchanges)
		bdirty(bp);
bufwait:
	/*
	 * Process any allocdirects that completed during the update.
	 */
	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
		handle_allocdirect_partdone(adp, &wkhd);
	if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
		handle_allocdirect_partdone(adp, &wkhd);
	/*
	 * Process deallocations that were held pending until the
	 * inode had been written to disk.  Freeing of the inode
	 * is delayed until after all blocks have been freed to
	 * avoid creation of new <vfsid, inum, lbn> triples
	 * before the old ones have been deleted.  Completely
	 * unlinked inodes are not processed until the unlinked
	 * inode list is written or the last reference is removed.
	 */
	if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) {
		freefile = handle_bufwait(inodedep, NULL);
		if (freefile && !LIST_EMPTY(&wkhd)) {
			WORKLIST_INSERT(&wkhd, &freefile->fx_list);
			freefile = NULL;
		}
	}
	/*
	 * Move rolled forward dependency completions to the bufwait list
	 * now that those that were already written have been processed.
	 */
	if (!LIST_EMPTY(&wkhd) && hadchanges == 0)
		panic("handle_written_inodeblock: bufwait but no changes");
	jwork_move(&inodedep->id_bufwait, &wkhd);

	if (freefile != NULL) {
		/*
		 * If the inode is goingaway it was never written.  Fake up
		 * the state here so free_inodedep() can succeed.
		 */
		if (inodedep->id_state & GOINGAWAY)
			inodedep->id_state |= COMPLETE | DEPCOMPLETE;
		if (free_inodedep(inodedep) == 0)
			panic("handle_written_inodeblock: live inodedep %p",
			    inodedep);
		add_to_worklist(&freefile->fx_list, 0);
		return (0);
	}

	/*
	 * If no outstanding dependencies, free it.
	 */
	if (free_inodedep(inodedep) ||
	    (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 &&
	     TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
	     TAILQ_FIRST(&inodedep->id_extupdt) == 0 &&
	     LIST_FIRST(&inodedep->id_bufwait) == 0))
		return (0);
	return (hadchanges);
}

/*
 * Called from within softdep_disk_write_complete above to handle the
 * completed write of an indirect block.  Returns non-zero (chgs) when
 * rollbacks were present and the dependency must stay attached to the
 * buffer; may hand a saved buffer back to the caller via bpp.
 */
static int 
handle_written_indirdep(indirdep, bp, bpp)
	struct indirdep *indirdep;
	struct buf *bp;
	struct buf **bpp;
{
	struct allocindir *aip;
	int chgs;

	if (indirdep->ir_state & GOINGAWAY)
		panic("disk_write_complete: indirdep gone");
	chgs = 0;
	/*
	 * If there were rollbacks revert them here.
	 */
	if (indirdep->ir_saveddata) {
		bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
		free(indirdep->ir_saveddata, M_INDIRDEP);
		indirdep->ir_saveddata = 0;
		chgs = 1;
	}
	indirdep->ir_state &= ~UNDONE;
	indirdep->ir_state |= ATTACHED;
	/*
	 * Move allocindirs with written pointers to the completehd if
	 * the indirdep's pointer is not yet written.  Otherwise
	 * free them here.
	 */
	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0) {
		LIST_REMOVE(aip, ai_next);
		if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
			LIST_INSERT_HEAD(&indirdep->ir_completehd, aip,
			    ai_next);
			continue;
		}
		free_newblk(&aip->ai_block);
	}
	/*
	 * Move allocindirs that have finished dependency processing from
	 * the done list to the write list after updating the pointers.
	 */
	while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
		handle_allocindir_partdone(aip);
		if (aip == LIST_FIRST(&indirdep->ir_donehd))
			panic("disk_write_complete: not gone");
		chgs = 1;
	}
	/*
	 * If this indirdep has been detached from its newblk during
	 * I/O we need to keep this dep attached to the buffer so
	 * deallocate_dependencies can find it and properly resolve
	 * any outstanding dependencies.
	 */
	if ((indirdep->ir_state & (ONDEPLIST | DEPCOMPLETE)) == 0)
		chgs = 1;
	if ((bp->b_flags & B_DELWRI) == 0)
		stat_indir_blk_ptrs++;
	/*
	 * If there were no changes we can discard the savedbp and detach
	 * ourselves from the buf.  We are only carrying completed pointers
	 * in this case.
	 */
	if (chgs == 0) {
		struct buf *sbp;

		sbp = indirdep->ir_savebp;
		sbp->b_flags |= B_INVAL | B_NOCACHE;
		indirdep->ir_savebp = NULL;
		if (*bpp != NULL)
			panic("handle_written_indirdep: bp already exists.");
		*bpp = sbp;
	} else
		bdirty(bp);
	/*
	 * If there are no fresh dependencies and none waiting on writes
	 * we can free the indirdep. 
	 */
	if ((indirdep->ir_state & DEPCOMPLETE) && chgs == 0) {
		if (indirdep->ir_state & ONDEPLIST)
			LIST_REMOVE(indirdep, ir_next);
		free_indirdep(indirdep);
		return (0);
	}

	return (chgs);
}

/*
 * Process a diradd entry after its dependent inode has been written.
 * This routine must be called with splbio interrupts blocked.
 */
static void
diradd_inode_written(dap, inodedep)
	struct diradd *dap;
	struct inodedep *inodedep;
{

	dap->da_state |= COMPLETE;
	complete_diradd(dap);
	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
}

/*
 * Returns true if the bmsafemap will have rollbacks when written.  Must
 * only be called with lk and the buf lock on the cg held.
 */
static int
bmsafemap_rollbacks(bmsafemap)
	struct bmsafemap *bmsafemap;
{

	return (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd) |
	    !LIST_EMPTY(&bmsafemap->sm_jnewblkhd));
}

/*
 * Complete a write to a bmsafemap structure.  Roll forward any bitmap
 * changes if it's not a background write.  Set all written dependencies
 * to DEPCOMPLETE and free the structure if possible.
 */
static int
handle_written_bmsafemap(bmsafemap, bp)
	struct bmsafemap *bmsafemap;
	struct buf *bp;
{
	struct newblk *newblk;
	struct inodedep *inodedep;
	struct jaddref *jaddref, *jatmp;
	struct jnewblk *jnewblk, *jntmp;
	uint8_t *inosused;
	uint8_t *blksfree;
	struct cg *cgp;
	struct fs *fs;
	ino_t ino;
	long bno;
	int chgs;
	int i;

	/* XXX(review): panic message names the wrong function. */
	if ((bmsafemap->sm_state & IOSTARTED) == 0)
		panic("initiate_write_bmsafemap: Not started\n");
	chgs = 0;
	bmsafemap->sm_state &= ~IOSTARTED;
	/*
	 * Restore unwritten inode allocation pending jaddref writes.
	 */
	if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) {
		cgp = (struct cg *)bp->b_data;
		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
		inosused = cg_inosused(cgp);
		LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd,
		    ja_bmdeps, jatmp) {
			if ((jaddref->ja_state & UNDONE) == 0)
				continue;
			ino = jaddref->ja_ino % fs->fs_ipg;
			if (isset(inosused, ino))
				panic("handle_written_bmsafemap: "
				    "re-allocated inode");
			/* Do the roll-forward only if it's a real copy. */
			if ((bp->b_xflags & BX_BKGRDMARKER) == 0) {
				if ((jaddref->ja_mode & IFMT) == IFDIR)
					cgp->cg_cs.cs_ndir++;
				cgp->cg_cs.cs_nifree--;
				setbit(inosused, ino);
				chgs = 1;
			}
			jaddref->ja_state &= ~UNDONE;
			jaddref->ja_state |= ATTACHED;
			free_jaddref(jaddref);
		}
	}
	/*
	 * Restore any block allocations which are pending journal writes.
	 */
	if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
		cgp = (struct cg *)bp->b_data;
		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
		blksfree = cg_blksfree(cgp);
		LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps,
		    jntmp) {
			if ((jnewblk->jn_state & UNDONE) == 0)
				continue;
			bno = dtogd(fs, jnewblk->jn_blkno);
			for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
			    i++) {
				if (bp->b_xflags & BX_BKGRDMARKER)
					break;
				if ((jnewblk->jn_state & NEWBLOCK) == 0 &&
				    isclr(blksfree, bno + i))
					panic("handle_written_bmsafemap: "
					    "re-allocated fragment");
				clrbit(blksfree, bno + i);
				chgs = 1;
			}
			jnewblk->jn_state &= ~(UNDONE | NEWBLOCK);
			jnewblk->jn_state |= ATTACHED;
			free_jnewblk(jnewblk);
		}
	}
	/* Mark all dependencies covered by this write as complete. */
	while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) {
		newblk->nb_state |= DEPCOMPLETE;
		newblk->nb_state &= ~ONDEPLIST;
		newblk->nb_bmsafemap = NULL;
		LIST_REMOVE(newblk, nb_deps);
		if (newblk->nb_list.wk_type == D_ALLOCDIRECT)
			handle_allocdirect_partdone(
			    WK_ALLOCDIRECT(&newblk->nb_list), NULL);
		else if (newblk->nb_list.wk_type == D_ALLOCINDIR)
			handle_allocindir_partdone(
			    WK_ALLOCINDIR(&newblk->nb_list));
		else if (newblk->nb_list.wk_type != D_NEWBLK)
			panic("handle_written_bmsafemap: Unexpected type: %s",
			    TYPENAME(newblk->nb_list.wk_type));
	}
	while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) {
		inodedep->id_state |= DEPCOMPLETE;
		inodedep->id_state &= ~ONDEPLIST;
		LIST_REMOVE(inodedep, id_deps);
		inodedep->id_bmsafemap = NULL;
	}
	if (LIST_EMPTY(&bmsafemap->sm_jaddrefhd) &&
	    LIST_EMPTY(&bmsafemap->sm_jnewblkhd) &&
	    LIST_EMPTY(&bmsafemap->sm_newblkhd) &&
	    LIST_EMPTY(&bmsafemap->sm_inodedephd)) {
		if (chgs)
			bdirty(bp);
		LIST_REMOVE(bmsafemap, sm_hash);
		WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
		return (0);
	}
	bdirty(bp);
	return (1);
}

/*
 * Try to free a mkdir dependency.
 */
static void
complete_mkdir(mkdir)
	struct mkdir *mkdir;
{
	struct diradd *dap;

	if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE)
		return;
	LIST_REMOVE(mkdir, md_mkdirs);
	dap = mkdir->md_diradd;
	dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) {
		dap->da_state |= DEPCOMPLETE;
		complete_diradd(dap);
	}
	WORKITEM_FREE(mkdir, D_MKDIR);
}

/*
 * Handle the completion of a mkdir dependency.
 */
static void
handle_written_mkdir(mkdir, type)
	struct mkdir *mkdir;
	int type;
{

	if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type)
		panic("handle_written_mkdir: bad type");
	mkdir->md_state |= COMPLETE;
	complete_mkdir(mkdir);
}

/*
 * Free a pagedep if it is no longer on a worklist and no longer tracks
 * any diradd, jmvref, dirrem, or pending dependencies.
 */
static void
free_pagedep(pagedep)
	struct pagedep *pagedep;
{
	int i;

	if (pagedep->pd_state & (NEWBLOCK | ONWORKLIST))
		return;
	for (i = 0; i < DAHASHSZ; i++)
		if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
			return;
	if (!LIST_EMPTY(&pagedep->pd_jmvrefhd))
		return;
	if (!LIST_EMPTY(&pagedep->pd_dirremhd))
		return;
	if (!LIST_EMPTY(&pagedep->pd_pendinghd))
		return;
	LIST_REMOVE(pagedep, pd_hash);
	WORKITEM_FREE(pagedep, D_PAGEDEP);
}

/*
 * Called from within softdep_disk_write_complete above.
 * A write operation was just completed.  Removed inodes can
 * now be freed and associated block pointers may be committed.
 * Note that this routine is always called from interrupt level
 * with further splbio interrupts blocked.
 */
static int 
handle_written_filepage(pagedep, bp)
	struct pagedep *pagedep;
	struct buf *bp;		/* buffer containing the written page */
{
	struct dirrem *dirrem;
	struct diradd *dap, *nextdap;
	struct direct *ep;
	int i, chgs;

	if ((pagedep->pd_state & IOSTARTED) == 0)
		panic("handle_written_filepage: not started");
	pagedep->pd_state &= ~IOSTARTED;
	/*
	 * Process any directory removals that have been committed.
	 */
	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
		LIST_REMOVE(dirrem, dm_next);
		dirrem->dm_state |= COMPLETE;
		dirrem->dm_dirinum = pagedep->pd_ino;
		KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
		    ("handle_written_filepage: Journal entries not written."));
		add_to_worklist(&dirrem->dm_list, 0);
	}
	/* 
	 * Free any directory additions that have been committed.
	 * If it is a newly allocated block, we have to wait until
	 * the on-disk directory inode claims the new block.
	 */
	if ((pagedep->pd_state & NEWBLOCK) == 0)
		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
			free_diradd(dap, NULL);
	/*
	 * Uncommitted directory entries must be restored.
	 */
	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
		     dap = nextdap) {
			nextdap = LIST_NEXT(dap, da_pdlist);
			if (dap->da_state & ATTACHED)
				panic("handle_written_filepage: attached");
			ep = (struct direct *)
			    ((char *)bp->b_data + dap->da_offset);
			ep->d_ino = dap->da_newinum;
			dap->da_state &= ~UNDONE;
			dap->da_state |= ATTACHED;
			chgs = 1;
			/*
			 * If the inode referenced by the directory has
			 * been written out, then the dependency can be
			 * moved to the pending list.
			 */
			if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
				LIST_REMOVE(dap, da_pdlist);
				LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
				    da_pdlist);
			}
		}
	}
	/*
	 * If there were any rollbacks in the directory, then it must be
	 * marked dirty so that it will eventually get written back in
	 * its correct form.
	 */
	if (chgs) {
		if ((bp->b_flags & B_DELWRI) == 0)
			stat_dir_entry++;
		bdirty(bp);
		return (1);
	}
	/*
	 * If we are not waiting for a new directory block to be
	 * claimed by its inode, then the pagedep will be freed.
	 * Otherwise it will remain to track any new entries on
	 * the page in case they are fsync'ed.
	 */
	if ((pagedep->pd_state & NEWBLOCK) == 0 &&
	    LIST_EMPTY(&pagedep->pd_jmvrefhd)) {
		LIST_REMOVE(pagedep, pd_hash);
		WORKITEM_FREE(pagedep, D_PAGEDEP);
	}
	return (0);
}

/*
 * Writing back in-core inode structures.
 * 
 * The filesystem only accesses an inode's contents when it occupies an
 * "in-core" inode structure.  These "in-core" structures are separate from
 * the page frames used to cache inode blocks.  Only the latter are
 * transferred to/from the disk.  So, when the updated contents of the
 * "in-core" inode structure are copied to the corresponding in-memory inode
 * block, the dependencies are also transferred.  The following procedure is
 * called when copying a dirty "in-core" inode to a cached inode block.
 */

/*
 * Called when an inode is loaded from disk.  If the effective link count
 * differed from the actual link count when it was last flushed, then we
 * need to ensure that the correct effective link count is put back.
 */
void 
softdep_load_inodeblock(ip)
	struct inode *ip;	/* the "in_core" copy of the inode */
{
	struct inodedep *inodedep;

	/*
	 * Check for alternate nlink count.
	 */
	ip->i_effnlink = ip->i_nlink;
	ACQUIRE_LOCK(&lk);
	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
	    &inodedep) == 0) {
		FREE_LOCK(&lk);
		return;
	}
	ip->i_effnlink -= inodedep->id_nlinkdelta;
	FREE_LOCK(&lk);
}

/*
 * This routine is called just before the "in-core" inode
 * information is to be copied to the in-memory inode block.
 * Recall that an inode block contains several inodes.  If
 * the force flag is set, then the dependencies will be
 * cleared so that the update can always be made.  Note that
 * the buffer is locked when this routine is called, so we
 * will never be in the middle of writing the inode block 
 * to disk.
 */
void 
softdep_update_inodeblock(ip, bp, waitfor)
	struct inode *ip;	/* the "in_core" copy of the inode */
	struct buf *bp;		/* the buffer containing the inode block */
	int waitfor;		/* nonzero => update must be allowed */
{
	struct inodedep *inodedep;
	struct inoref *inoref;
	struct worklist *wk;
	struct mount *mp;
	struct buf *ibp;
	struct fs *fs;
	int error;

	mp = UFSTOVFS(ip->i_ump);
	fs = ip->i_fs;
	/*
	 * Preserve the freelink that is on disk.  clear_unlinked_inodedep()
	 * does not have access to the in-core ip so must write directly into
	 * the inode block buffer when setting freelink.
	 */
	if (fs->fs_magic == FS_UFS1_MAGIC)
		DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data +
		    ino_to_fsbo(fs, ip->i_number))->di_freelink);
	else
		DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data +
		    ino_to_fsbo(fs, ip->i_number))->di_freelink);
	/*
	 * If the effective link count is not equal to the actual link
	 * count, then we must track the difference in an inodedep while
	 * the inode is (potentially) tossed out of the cache.  Otherwise,
	 * if there is no existing inodedep, then there are no dependencies
	 * to track.
	 */
	ACQUIRE_LOCK(&lk);
again:
	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
		FREE_LOCK(&lk);
		if (ip->i_effnlink != ip->i_nlink)
			panic("softdep_update_inodeblock: bad link count");
		return;
	}
	if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
		panic("softdep_update_inodeblock: bad delta");
	/*
	 * If we're flushing all dependencies we must also move any waiting
	 * for journal writes onto the bufwait list prior to I/O.
	 */
	if (waitfor) {
		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
			    == DEPCOMPLETE) {
				stat_jwait_inode++;
				/* jwait drops lk; restart the scan. */
				jwait(&inoref->if_list);
				goto again;
			}
		}
	}
	/*
	 * Changes have been initiated.  Anything depending on these
	 * changes cannot occur until this inode has been written.
	 */
	inodedep->id_state &= ~COMPLETE;
	if ((inodedep->id_state & ONWORKLIST) == 0)
		WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
	/*
	 * Any new dependencies associated with the incore inode must 
	 * now be moved to the list associated with the buffer holding
	 * the in-memory copy of the inode.  Once merged process any
	 * allocdirects that are completed by the merger.
	 */
	merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
	if (!TAILQ_EMPTY(&inodedep->id_inoupdt))
		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt),
		    NULL);
	merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
	if (!TAILQ_EMPTY(&inodedep->id_extupdt))
		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt),
		    NULL);
	/*
	 * Now that the inode has been pushed into the buffer, the
	 * operations dependent on the inode being written to disk
	 * can be moved to the id_bufwait so that they will be
	 * processed when the buffer I/O completes.
	 */
	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
		WORKLIST_REMOVE(wk);
		WORKLIST_INSERT(&inodedep->id_bufwait, wk);
	}
	/*
	 * Newly allocated inodes cannot be written until the bitmap
	 * that allocates them have been written (indicated by
	 * DEPCOMPLETE being set in id_state).  If we are doing a
	 * forced sync (e.g., an fsync on a file), we force the bitmap
	 * to be written so that the update can be done.
	 */
	if (waitfor == 0) {
		FREE_LOCK(&lk);
		return;
	}
retry:
	if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) {
		FREE_LOCK(&lk);
		return;
	}
	ibp = inodedep->id_bmsafemap->sm_buf;
	ibp = getdirtybuf(ibp, &lk, MNT_WAIT);
	if (ibp == NULL) {
		/*
		 * If ibp came back as NULL, the dependency could have been
		 * freed while we slept.  Look it up again, and check to see
		 * that it has completed.
		 */
		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
			goto retry;
		FREE_LOCK(&lk);
		return;
	}
	FREE_LOCK(&lk);
	if ((error = bwrite(ibp)) != 0)
		softdep_error("softdep_update_inodeblock: bwrite", error);
}

/*
 * Merge a new inode dependency list (such as id_newinoupdt) into an
 * old inode dependency list (such as id_inoupdt).  This routine must be
 * called with splbio interrupts blocked.
 */
static void
merge_inode_lists(newlisthead, oldlisthead)
	struct allocdirectlst *newlisthead;
	struct allocdirectlst *oldlisthead;
{
	struct allocdirect *listadp, *newadp;

	newadp = TAILQ_FIRST(newlisthead);
	/* Merge by ascending ad_offset; equal offsets are coalesced. */
	for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) {
		if (listadp->ad_offset < newadp->ad_offset) {
			listadp = TAILQ_NEXT(listadp, ad_next);
			continue;
		}
		TAILQ_REMOVE(newlisthead, newadp, ad_next);
		TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
		if (listadp->ad_offset == newadp->ad_offset) {
			allocdirect_merge(oldlisthead, newadp,
			    listadp);
			listadp = newadp;
		}
		newadp = TAILQ_FIRST(newlisthead);
	}
	/* Any remaining new entries sort after the old list's tail. */
	while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) {
		TAILQ_REMOVE(newlisthead, newadp, ad_next);
		TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next);
	}
}

/*
 * If we are doing an fsync, then we must ensure that any directory
 * entries for the inode have been written after the inode gets to disk.
10060 */ 10061 int 10062 softdep_fsync(vp) 10063 struct vnode *vp; /* the "in_core" copy of the inode */ 10064 { 10065 struct inodedep *inodedep; 10066 struct pagedep *pagedep; 10067 struct inoref *inoref; 10068 struct worklist *wk; 10069 struct diradd *dap; 10070 struct mount *mp; 10071 struct vnode *pvp; 10072 struct inode *ip; 10073 struct buf *bp; 10074 struct fs *fs; 10075 struct thread *td = curthread; 10076 int error, flushparent, pagedep_new_block; 10077 ino_t parentino; 10078 ufs_lbn_t lbn; 10079 10080 ip = VTOI(vp); 10081 fs = ip->i_fs; 10082 mp = vp->v_mount; 10083 ACQUIRE_LOCK(&lk); 10084 restart: 10085 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) { 10086 FREE_LOCK(&lk); 10087 return (0); 10088 } 10089 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { 10090 if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) 10091 == DEPCOMPLETE) { 10092 stat_jwait_inode++; 10093 jwait(&inoref->if_list); 10094 goto restart; 10095 } 10096 } 10097 if (!LIST_EMPTY(&inodedep->id_inowait) || 10098 !TAILQ_EMPTY(&inodedep->id_extupdt) || 10099 !TAILQ_EMPTY(&inodedep->id_newextupdt) || 10100 !TAILQ_EMPTY(&inodedep->id_inoupdt) || 10101 !TAILQ_EMPTY(&inodedep->id_newinoupdt)) 10102 panic("softdep_fsync: pending ops %p", inodedep); 10103 for (error = 0, flushparent = 0; ; ) { 10104 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL) 10105 break; 10106 if (wk->wk_type != D_DIRADD) 10107 panic("softdep_fsync: Unexpected type %s", 10108 TYPENAME(wk->wk_type)); 10109 dap = WK_DIRADD(wk); 10110 /* 10111 * Flush our parent if this directory entry has a MKDIR_PARENT 10112 * dependency or is contained in a newly allocated block. 
10113 */ 10114 if (dap->da_state & DIRCHG) 10115 pagedep = dap->da_previous->dm_pagedep; 10116 else 10117 pagedep = dap->da_pagedep; 10118 parentino = pagedep->pd_ino; 10119 lbn = pagedep->pd_lbn; 10120 if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE) 10121 panic("softdep_fsync: dirty"); 10122 if ((dap->da_state & MKDIR_PARENT) || 10123 (pagedep->pd_state & NEWBLOCK)) 10124 flushparent = 1; 10125 else 10126 flushparent = 0; 10127 /* 10128 * If we are being fsync'ed as part of vgone'ing this vnode, 10129 * then we will not be able to release and recover the 10130 * vnode below, so we just have to give up on writing its 10131 * directory entry out. It will eventually be written, just 10132 * not now, but then the user was not asking to have it 10133 * written, so we are not breaking any promises. 10134 */ 10135 if (vp->v_iflag & VI_DOOMED) 10136 break; 10137 /* 10138 * We prevent deadlock by always fetching inodes from the 10139 * root, moving down the directory tree. Thus, when fetching 10140 * our parent directory, we first try to get the lock. If 10141 * that fails, we must unlock ourselves before requesting 10142 * the lock on our parent. See the comment in ufs_lookup 10143 * for details on possible races. 
10144 */ 10145 FREE_LOCK(&lk); 10146 if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp, 10147 FFSV_FORCEINSMQ)) { 10148 error = vfs_busy(mp, MBF_NOWAIT); 10149 if (error != 0) { 10150 vfs_ref(mp); 10151 VOP_UNLOCK(vp, 0); 10152 error = vfs_busy(mp, 0); 10153 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 10154 vfs_rel(mp); 10155 if (error != 0) 10156 return (ENOENT); 10157 if (vp->v_iflag & VI_DOOMED) { 10158 vfs_unbusy(mp); 10159 return (ENOENT); 10160 } 10161 } 10162 VOP_UNLOCK(vp, 0); 10163 error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE, 10164 &pvp, FFSV_FORCEINSMQ); 10165 vfs_unbusy(mp); 10166 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 10167 if (vp->v_iflag & VI_DOOMED) { 10168 if (error == 0) 10169 vput(pvp); 10170 error = ENOENT; 10171 } 10172 if (error != 0) 10173 return (error); 10174 } 10175 /* 10176 * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps 10177 * that are contained in direct blocks will be resolved by 10178 * doing a ffs_update. Pagedeps contained in indirect blocks 10179 * may require a complete sync'ing of the directory. So, we 10180 * try the cheap and fast ffs_update first, and if that fails, 10181 * then we do the slower ffs_syncvnode of the directory. 
10182 */ 10183 if (flushparent) { 10184 int locked; 10185 10186 if ((error = ffs_update(pvp, 1)) != 0) { 10187 vput(pvp); 10188 return (error); 10189 } 10190 ACQUIRE_LOCK(&lk); 10191 locked = 1; 10192 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) { 10193 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) { 10194 if (wk->wk_type != D_DIRADD) 10195 panic("softdep_fsync: Unexpected type %s", 10196 TYPENAME(wk->wk_type)); 10197 dap = WK_DIRADD(wk); 10198 if (dap->da_state & DIRCHG) 10199 pagedep = dap->da_previous->dm_pagedep; 10200 else 10201 pagedep = dap->da_pagedep; 10202 pagedep_new_block = pagedep->pd_state & NEWBLOCK; 10203 FREE_LOCK(&lk); 10204 locked = 0; 10205 if (pagedep_new_block && 10206 (error = ffs_syncvnode(pvp, MNT_WAIT))) { 10207 vput(pvp); 10208 return (error); 10209 } 10210 } 10211 } 10212 if (locked) 10213 FREE_LOCK(&lk); 10214 } 10215 /* 10216 * Flush directory page containing the inode's name. 10217 */ 10218 error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred, 10219 &bp); 10220 if (error == 0) 10221 error = bwrite(bp); 10222 else 10223 brelse(bp); 10224 vput(pvp); 10225 if (error != 0) 10226 return (error); 10227 ACQUIRE_LOCK(&lk); 10228 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) 10229 break; 10230 } 10231 FREE_LOCK(&lk); 10232 return (0); 10233 } 10234 10235 /* 10236 * Flush all the dirty bitmaps associated with the block device 10237 * before flushing the rest of the dirty blocks so as to reduce 10238 * the number of dependencies that will have to be rolled back. 
10239 */ 10240 void 10241 softdep_fsync_mountdev(vp) 10242 struct vnode *vp; 10243 { 10244 struct buf *bp, *nbp; 10245 struct worklist *wk; 10246 struct bufobj *bo; 10247 10248 if (!vn_isdisk(vp, NULL)) 10249 panic("softdep_fsync_mountdev: vnode not a disk"); 10250 bo = &vp->v_bufobj; 10251 restart: 10252 BO_LOCK(bo); 10253 ACQUIRE_LOCK(&lk); 10254 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 10255 /* 10256 * If it is already scheduled, skip to the next buffer. 10257 */ 10258 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) 10259 continue; 10260 10261 if ((bp->b_flags & B_DELWRI) == 0) 10262 panic("softdep_fsync_mountdev: not dirty"); 10263 /* 10264 * We are only interested in bitmaps with outstanding 10265 * dependencies. 10266 */ 10267 if ((wk = LIST_FIRST(&bp->b_dep)) == NULL || 10268 wk->wk_type != D_BMSAFEMAP || 10269 (bp->b_vflags & BV_BKGRDINPROG)) { 10270 BUF_UNLOCK(bp); 10271 continue; 10272 } 10273 FREE_LOCK(&lk); 10274 BO_UNLOCK(bo); 10275 bremfree(bp); 10276 (void) bawrite(bp); 10277 goto restart; 10278 } 10279 FREE_LOCK(&lk); 10280 drain_output(vp); 10281 BO_UNLOCK(bo); 10282 } 10283 10284 /* 10285 * This routine is called when we are trying to synchronously flush a 10286 * file. This routine must eliminate any filesystem metadata dependencies 10287 * so that the syncing routine can succeed by pushing the dirty blocks 10288 * associated with the file. If any I/O errors occur, they are returned. 10289 */ 10290 int 10291 softdep_sync_metadata(struct vnode *vp) 10292 { 10293 struct pagedep *pagedep; 10294 struct allocindir *aip; 10295 struct newblk *newblk; 10296 struct buf *bp, *nbp; 10297 struct worklist *wk; 10298 struct bufobj *bo; 10299 int i, error, waitfor; 10300 10301 if (!DOINGSOFTDEP(vp)) 10302 return (0); 10303 /* 10304 * Ensure that any direct block dependencies have been cleared. 
10305 */ 10306 ACQUIRE_LOCK(&lk); 10307 if ((error = flush_inodedep_deps(vp->v_mount, VTOI(vp)->i_number))) { 10308 FREE_LOCK(&lk); 10309 return (error); 10310 } 10311 FREE_LOCK(&lk); 10312 /* 10313 * For most files, the only metadata dependencies are the 10314 * cylinder group maps that allocate their inode or blocks. 10315 * The block allocation dependencies can be found by traversing 10316 * the dependency lists for any buffers that remain on their 10317 * dirty buffer list. The inode allocation dependency will 10318 * be resolved when the inode is updated with MNT_WAIT. 10319 * This work is done in two passes. The first pass grabs most 10320 * of the buffers and begins asynchronously writing them. The 10321 * only way to wait for these asynchronous writes is to sleep 10322 * on the filesystem vnode which may stay busy for a long time 10323 * if the filesystem is active. So, instead, we make a second 10324 * pass over the dependencies blocking on each write. In the 10325 * usual case we will be blocking against a write that we 10326 * initiated, so when it is done the dependency will have been 10327 * resolved. Thus the second pass is expected to end quickly. 10328 */ 10329 waitfor = MNT_NOWAIT; 10330 bo = &vp->v_bufobj; 10331 10332 top: 10333 /* 10334 * We must wait for any I/O in progress to finish so that 10335 * all potential buffers on the dirty list will be visible. 10336 */ 10337 BO_LOCK(bo); 10338 drain_output(vp); 10339 while ((bp = TAILQ_FIRST(&bo->bo_dirty.bv_hd)) != NULL) { 10340 bp = getdirtybuf(bp, BO_MTX(bo), MNT_WAIT); 10341 if (bp) 10342 break; 10343 } 10344 BO_UNLOCK(bo); 10345 if (bp == NULL) 10346 return (0); 10347 loop: 10348 /* While syncing snapshots, we must allow recursive lookups */ 10349 BUF_AREC(bp); 10350 ACQUIRE_LOCK(&lk); 10351 /* 10352 * As we hold the buffer locked, none of its dependencies 10353 * will disappear. 
10354 */ 10355 LIST_FOREACH(wk, &bp->b_dep, wk_list) { 10356 switch (wk->wk_type) { 10357 10358 case D_ALLOCDIRECT: 10359 case D_ALLOCINDIR: 10360 newblk = WK_NEWBLK(wk); 10361 if (newblk->nb_jnewblk != NULL) { 10362 stat_jwait_newblk++; 10363 jwait(&newblk->nb_jnewblk->jn_list); 10364 goto restart; 10365 } 10366 if (newblk->nb_state & DEPCOMPLETE) 10367 continue; 10368 nbp = newblk->nb_bmsafemap->sm_buf; 10369 nbp = getdirtybuf(nbp, &lk, waitfor); 10370 if (nbp == NULL) 10371 continue; 10372 FREE_LOCK(&lk); 10373 if (waitfor == MNT_NOWAIT) { 10374 bawrite(nbp); 10375 } else if ((error = bwrite(nbp)) != 0) { 10376 break; 10377 } 10378 ACQUIRE_LOCK(&lk); 10379 continue; 10380 10381 case D_INDIRDEP: 10382 restart: 10383 10384 LIST_FOREACH(aip, 10385 &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) { 10386 newblk = (struct newblk *)aip; 10387 if (newblk->nb_jnewblk != NULL) { 10388 stat_jwait_newblk++; 10389 jwait(&newblk->nb_jnewblk->jn_list); 10390 goto restart; 10391 } 10392 if (newblk->nb_state & DEPCOMPLETE) 10393 continue; 10394 nbp = newblk->nb_bmsafemap->sm_buf; 10395 nbp = getdirtybuf(nbp, &lk, MNT_WAIT); 10396 if (nbp == NULL) 10397 goto restart; 10398 FREE_LOCK(&lk); 10399 if ((error = bwrite(nbp)) != 0) { 10400 goto loop_end; 10401 } 10402 ACQUIRE_LOCK(&lk); 10403 goto restart; 10404 } 10405 continue; 10406 10407 case D_PAGEDEP: 10408 /* 10409 * We are trying to sync a directory that may 10410 * have dependencies on both its own metadata 10411 * and/or dependencies on the inodes of any 10412 * recently allocated files. We walk its diradd 10413 * lists pushing out the associated inode. 
10414 */ 10415 pagedep = WK_PAGEDEP(wk); 10416 for (i = 0; i < DAHASHSZ; i++) { 10417 if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0) 10418 continue; 10419 if ((error = 10420 flush_pagedep_deps(vp, wk->wk_mp, 10421 &pagedep->pd_diraddhd[i]))) { 10422 FREE_LOCK(&lk); 10423 goto loop_end; 10424 } 10425 } 10426 continue; 10427 10428 default: 10429 panic("softdep_sync_metadata: Unknown type %s", 10430 TYPENAME(wk->wk_type)); 10431 /* NOTREACHED */ 10432 } 10433 loop_end: 10434 /* We reach here only in error and unlocked */ 10435 if (error == 0) 10436 panic("softdep_sync_metadata: zero error"); 10437 BUF_NOREC(bp); 10438 bawrite(bp); 10439 return (error); 10440 } 10441 FREE_LOCK(&lk); 10442 BO_LOCK(bo); 10443 while ((nbp = TAILQ_NEXT(bp, b_bobufs)) != NULL) { 10444 nbp = getdirtybuf(nbp, BO_MTX(bo), MNT_WAIT); 10445 if (nbp) 10446 break; 10447 } 10448 BO_UNLOCK(bo); 10449 BUF_NOREC(bp); 10450 bawrite(bp); 10451 if (nbp != NULL) { 10452 bp = nbp; 10453 goto loop; 10454 } 10455 /* 10456 * The brief unlock is to allow any pent up dependency 10457 * processing to be done. Then proceed with the second pass. 10458 */ 10459 if (waitfor == MNT_NOWAIT) { 10460 waitfor = MNT_WAIT; 10461 goto top; 10462 } 10463 10464 /* 10465 * If we have managed to get rid of all the dirty buffers, 10466 * then we are done. For certain directories and block 10467 * devices, we may need to do further work. 10468 * 10469 * We must wait for any I/O in progress to finish so that 10470 * all potential buffers on the dirty list will be visible. 10471 */ 10472 BO_LOCK(bo); 10473 drain_output(vp); 10474 BO_UNLOCK(bo); 10475 return ffs_update(vp, 1); 10476 /* return (0); */ 10477 } 10478 10479 /* 10480 * Flush the dependencies associated with an inodedep. 10481 * Called with splbio blocked. 
10482 */ 10483 static int 10484 flush_inodedep_deps(mp, ino) 10485 struct mount *mp; 10486 ino_t ino; 10487 { 10488 struct inodedep *inodedep; 10489 struct inoref *inoref; 10490 int error, waitfor; 10491 10492 /* 10493 * This work is done in two passes. The first pass grabs most 10494 * of the buffers and begins asynchronously writing them. The 10495 * only way to wait for these asynchronous writes is to sleep 10496 * on the filesystem vnode which may stay busy for a long time 10497 * if the filesystem is active. So, instead, we make a second 10498 * pass over the dependencies blocking on each write. In the 10499 * usual case we will be blocking against a write that we 10500 * initiated, so when it is done the dependency will have been 10501 * resolved. Thus the second pass is expected to end quickly. 10502 * We give a brief window at the top of the loop to allow 10503 * any pending I/O to complete. 10504 */ 10505 for (error = 0, waitfor = MNT_NOWAIT; ; ) { 10506 if (error) 10507 return (error); 10508 FREE_LOCK(&lk); 10509 ACQUIRE_LOCK(&lk); 10510 restart: 10511 if (inodedep_lookup(mp, ino, 0, &inodedep) == 0) 10512 return (0); 10513 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { 10514 if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) 10515 == DEPCOMPLETE) { 10516 stat_jwait_inode++; 10517 jwait(&inoref->if_list); 10518 goto restart; 10519 } 10520 } 10521 if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) || 10522 flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) || 10523 flush_deplist(&inodedep->id_extupdt, waitfor, &error) || 10524 flush_deplist(&inodedep->id_newextupdt, waitfor, &error)) 10525 continue; 10526 /* 10527 * If pass2, we are done, otherwise do pass 2. 10528 */ 10529 if (waitfor == MNT_WAIT) 10530 break; 10531 waitfor = MNT_WAIT; 10532 } 10533 /* 10534 * Try freeing inodedep in case all dependencies have been removed. 
10535 */ 10536 if (inodedep_lookup(mp, ino, 0, &inodedep) != 0) 10537 (void) free_inodedep(inodedep); 10538 return (0); 10539 } 10540 10541 /* 10542 * Flush an inode dependency list. 10543 * Called with splbio blocked. 10544 */ 10545 static int 10546 flush_deplist(listhead, waitfor, errorp) 10547 struct allocdirectlst *listhead; 10548 int waitfor; 10549 int *errorp; 10550 { 10551 struct allocdirect *adp; 10552 struct newblk *newblk; 10553 struct buf *bp; 10554 10555 mtx_assert(&lk, MA_OWNED); 10556 TAILQ_FOREACH(adp, listhead, ad_next) { 10557 newblk = (struct newblk *)adp; 10558 if (newblk->nb_jnewblk != NULL) { 10559 stat_jwait_newblk++; 10560 jwait(&newblk->nb_jnewblk->jn_list); 10561 return (1); 10562 } 10563 if (newblk->nb_state & DEPCOMPLETE) 10564 continue; 10565 bp = newblk->nb_bmsafemap->sm_buf; 10566 bp = getdirtybuf(bp, &lk, waitfor); 10567 if (bp == NULL) { 10568 if (waitfor == MNT_NOWAIT) 10569 continue; 10570 return (1); 10571 } 10572 FREE_LOCK(&lk); 10573 if (waitfor == MNT_NOWAIT) { 10574 bawrite(bp); 10575 } else if ((*errorp = bwrite(bp)) != 0) { 10576 ACQUIRE_LOCK(&lk); 10577 return (1); 10578 } 10579 ACQUIRE_LOCK(&lk); 10580 return (1); 10581 } 10582 return (0); 10583 } 10584 10585 /* 10586 * Flush dependencies associated with an allocdirect block. 10587 */ 10588 static int 10589 flush_newblk_dep(vp, mp, lbn) 10590 struct vnode *vp; 10591 struct mount *mp; 10592 ufs_lbn_t lbn; 10593 { 10594 struct newblk *newblk; 10595 struct bufobj *bo; 10596 struct inode *ip; 10597 struct buf *bp; 10598 ufs2_daddr_t blkno; 10599 int error; 10600 10601 error = 0; 10602 bo = &vp->v_bufobj; 10603 ip = VTOI(vp); 10604 blkno = DIP(ip, i_db[lbn]); 10605 if (blkno == 0) 10606 panic("flush_newblk_dep: Missing block"); 10607 ACQUIRE_LOCK(&lk); 10608 /* 10609 * Loop until all dependencies related to this block are satisfied. 10610 * We must be careful to restart after each sleep in case a write 10611 * completes some part of this process for us. 
10612 */ 10613 for (;;) { 10614 if (newblk_lookup(mp, blkno, 0, &newblk) == 0) { 10615 FREE_LOCK(&lk); 10616 break; 10617 } 10618 if (newblk->nb_list.wk_type != D_ALLOCDIRECT) 10619 panic("flush_newblk_deps: Bad newblk %p", newblk); 10620 /* 10621 * Flush the journal. 10622 */ 10623 if (newblk->nb_jnewblk != NULL) { 10624 stat_jwait_newblk++; 10625 jwait(&newblk->nb_jnewblk->jn_list); 10626 continue; 10627 } 10628 /* 10629 * Write the bitmap dependency. 10630 */ 10631 if ((newblk->nb_state & DEPCOMPLETE) == 0) { 10632 bp = newblk->nb_bmsafemap->sm_buf; 10633 bp = getdirtybuf(bp, &lk, MNT_WAIT); 10634 if (bp == NULL) 10635 continue; 10636 FREE_LOCK(&lk); 10637 error = bwrite(bp); 10638 if (error) 10639 break; 10640 ACQUIRE_LOCK(&lk); 10641 continue; 10642 } 10643 /* 10644 * Write the buffer. 10645 */ 10646 FREE_LOCK(&lk); 10647 BO_LOCK(bo); 10648 bp = gbincore(bo, lbn); 10649 if (bp != NULL) { 10650 error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | 10651 LK_INTERLOCK, BO_MTX(bo)); 10652 if (error == ENOLCK) { 10653 ACQUIRE_LOCK(&lk); 10654 continue; /* Slept, retry */ 10655 } 10656 if (error != 0) 10657 break; /* Failed */ 10658 if (bp->b_flags & B_DELWRI) { 10659 bremfree(bp); 10660 error = bwrite(bp); 10661 if (error) 10662 break; 10663 } else 10664 BUF_UNLOCK(bp); 10665 } else 10666 BO_UNLOCK(bo); 10667 /* 10668 * We have to wait for the direct pointers to 10669 * point at the newdirblk before the dependency 10670 * will go away. 10671 */ 10672 error = ffs_update(vp, MNT_WAIT); 10673 if (error) 10674 break; 10675 ACQUIRE_LOCK(&lk); 10676 } 10677 return (error); 10678 } 10679 10680 /* 10681 * Eliminate a pagedep dependency by flushing out all its diradd dependencies. 10682 * Called with splbio blocked. 
10683 */ 10684 static int 10685 flush_pagedep_deps(pvp, mp, diraddhdp) 10686 struct vnode *pvp; 10687 struct mount *mp; 10688 struct diraddhd *diraddhdp; 10689 { 10690 struct inodedep *inodedep; 10691 struct inoref *inoref; 10692 struct ufsmount *ump; 10693 struct diradd *dap; 10694 struct vnode *vp; 10695 int error = 0; 10696 struct buf *bp; 10697 ino_t inum; 10698 10699 ump = VFSTOUFS(mp); 10700 restart: 10701 while ((dap = LIST_FIRST(diraddhdp)) != NULL) { 10702 /* 10703 * Flush ourselves if this directory entry 10704 * has a MKDIR_PARENT dependency. 10705 */ 10706 if (dap->da_state & MKDIR_PARENT) { 10707 FREE_LOCK(&lk); 10708 if ((error = ffs_update(pvp, MNT_WAIT)) != 0) 10709 break; 10710 ACQUIRE_LOCK(&lk); 10711 /* 10712 * If that cleared dependencies, go on to next. 10713 */ 10714 if (dap != LIST_FIRST(diraddhdp)) 10715 continue; 10716 if (dap->da_state & MKDIR_PARENT) 10717 panic("flush_pagedep_deps: MKDIR_PARENT"); 10718 } 10719 /* 10720 * A newly allocated directory must have its "." and 10721 * ".." entries written out before its name can be 10722 * committed in its parent. 10723 */ 10724 inum = dap->da_newinum; 10725 if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0) 10726 panic("flush_pagedep_deps: lost inode1"); 10727 /* 10728 * Wait for any pending journal adds to complete so we don't 10729 * cause rollbacks while syncing. 10730 */ 10731 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { 10732 if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) 10733 == DEPCOMPLETE) { 10734 stat_jwait_inode++; 10735 jwait(&inoref->if_list); 10736 goto restart; 10737 } 10738 } 10739 if (dap->da_state & MKDIR_BODY) { 10740 FREE_LOCK(&lk); 10741 if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp, 10742 FFSV_FORCEINSMQ))) 10743 break; 10744 error = flush_newblk_dep(vp, mp, 0); 10745 /* 10746 * If we still have the dependency we might need to 10747 * update the vnode to sync the new link count to 10748 * disk. 
10749 */ 10750 if (error == 0 && dap == LIST_FIRST(diraddhdp)) 10751 error = ffs_update(vp, MNT_WAIT); 10752 vput(vp); 10753 if (error != 0) 10754 break; 10755 ACQUIRE_LOCK(&lk); 10756 /* 10757 * If that cleared dependencies, go on to next. 10758 */ 10759 if (dap != LIST_FIRST(diraddhdp)) 10760 continue; 10761 if (dap->da_state & MKDIR_BODY) { 10762 inodedep_lookup(UFSTOVFS(ump), inum, 0, 10763 &inodedep); 10764 panic("flush_pagedep_deps: MKDIR_BODY " 10765 "inodedep %p dap %p vp %p", 10766 inodedep, dap, vp); 10767 } 10768 } 10769 /* 10770 * Flush the inode on which the directory entry depends. 10771 * Having accounted for MKDIR_PARENT and MKDIR_BODY above, 10772 * the only remaining dependency is that the updated inode 10773 * count must get pushed to disk. The inode has already 10774 * been pushed into its inode buffer (via VOP_UPDATE) at 10775 * the time of the reference count change. So we need only 10776 * locate that buffer, ensure that there will be no rollback 10777 * caused by a bitmap dependency, then write the inode buffer. 10778 */ 10779 retry: 10780 if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0) 10781 panic("flush_pagedep_deps: lost inode"); 10782 /* 10783 * If the inode still has bitmap dependencies, 10784 * push them to disk. 10785 */ 10786 if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) { 10787 bp = inodedep->id_bmsafemap->sm_buf; 10788 bp = getdirtybuf(bp, &lk, MNT_WAIT); 10789 if (bp == NULL) 10790 goto retry; 10791 FREE_LOCK(&lk); 10792 if ((error = bwrite(bp)) != 0) 10793 break; 10794 ACQUIRE_LOCK(&lk); 10795 if (dap != LIST_FIRST(diraddhdp)) 10796 continue; 10797 } 10798 /* 10799 * If the inode is still sitting in a buffer waiting 10800 * to be written or waiting for the link count to be 10801 * adjusted update it here to flush it to disk. 
10802 */ 10803 if (dap == LIST_FIRST(diraddhdp)) { 10804 FREE_LOCK(&lk); 10805 if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp, 10806 FFSV_FORCEINSMQ))) 10807 break; 10808 error = ffs_update(vp, MNT_WAIT); 10809 vput(vp); 10810 if (error) 10811 break; 10812 ACQUIRE_LOCK(&lk); 10813 } 10814 /* 10815 * If we have failed to get rid of all the dependencies 10816 * then something is seriously wrong. 10817 */ 10818 if (dap == LIST_FIRST(diraddhdp)) { 10819 inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep); 10820 panic("flush_pagedep_deps: failed to flush " 10821 "inodedep %p ino %d dap %p", inodedep, inum, dap); 10822 } 10823 } 10824 if (error) 10825 ACQUIRE_LOCK(&lk); 10826 return (error); 10827 } 10828 10829 /* 10830 * A large burst of file addition or deletion activity can drive the 10831 * memory load excessively high. First attempt to slow things down 10832 * using the techniques below. If that fails, this routine requests 10833 * the offending operations to fall back to running synchronously 10834 * until the memory load returns to a reasonable level. 10835 */ 10836 int 10837 softdep_slowdown(vp) 10838 struct vnode *vp; 10839 { 10840 struct ufsmount *ump; 10841 int jlow; 10842 int max_softdeps_hard; 10843 10844 ACQUIRE_LOCK(&lk); 10845 jlow = 0; 10846 /* 10847 * Check for journal space if needed. 
10848 */ 10849 if (DOINGSUJ(vp)) { 10850 ump = VFSTOUFS(vp->v_mount); 10851 if (journal_space(ump, 0) == 0) 10852 jlow = 1; 10853 } 10854 max_softdeps_hard = max_softdeps * 11 / 10; 10855 if (num_dirrem < max_softdeps_hard / 2 && 10856 num_inodedep < max_softdeps_hard && 10857 VFSTOUFS(vp->v_mount)->um_numindirdeps < maxindirdeps && 10858 num_freeblkdep < max_softdeps_hard && jlow == 0) { 10859 FREE_LOCK(&lk); 10860 return (0); 10861 } 10862 if (VFSTOUFS(vp->v_mount)->um_numindirdeps >= maxindirdeps || jlow) 10863 softdep_speedup(); 10864 stat_sync_limit_hit += 1; 10865 FREE_LOCK(&lk); 10866 return (1); 10867 } 10868 10869 /* 10870 * Called by the allocation routines when they are about to fail 10871 * in the hope that we can free up some disk space. 10872 * 10873 * First check to see if the work list has anything on it. If it has, 10874 * clean up entries until we successfully free some space. Because this 10875 * process holds inodes locked, we cannot handle any remove requests 10876 * that might block on a locked inode as that could lead to deadlock. 10877 * If the worklist yields no free space, encourage the syncer daemon 10878 * to help us. In no event will we try for longer than tickdelay seconds. 10879 */ 10880 int 10881 softdep_request_cleanup(fs, vp) 10882 struct fs *fs; 10883 struct vnode *vp; 10884 { 10885 struct ufsmount *ump; 10886 long starttime; 10887 ufs2_daddr_t needed; 10888 int error; 10889 10890 ump = VTOI(vp)->i_ump; 10891 mtx_assert(UFS_MTX(ump), MA_OWNED); 10892 needed = fs->fs_cstotal.cs_nbfree + fs->fs_contigsumsize; 10893 starttime = time_second + tickdelay; 10894 /* 10895 * If we are being called because of a process doing a 10896 * copy-on-write, then it is not safe to update the vnode 10897 * as we may recurse into the copy-on-write routine. 
10898 */ 10899 if (!(curthread->td_pflags & TDP_COWINPROGRESS)) { 10900 UFS_UNLOCK(ump); 10901 error = ffs_update(vp, 1); 10902 UFS_LOCK(ump); 10903 if (error != 0) 10904 return (0); 10905 } 10906 while (fs->fs_pendingblocks > 0 && fs->fs_cstotal.cs_nbfree <= needed) { 10907 if (time_second > starttime) 10908 return (0); 10909 UFS_UNLOCK(ump); 10910 ACQUIRE_LOCK(&lk); 10911 process_removes(vp); 10912 if (ump->softdep_on_worklist > 0 && 10913 process_worklist_item(UFSTOVFS(ump), LK_NOWAIT) != -1) { 10914 stat_worklist_push += 1; 10915 FREE_LOCK(&lk); 10916 UFS_LOCK(ump); 10917 continue; 10918 } 10919 request_cleanup(UFSTOVFS(ump), FLUSH_REMOVE_WAIT); 10920 FREE_LOCK(&lk); 10921 UFS_LOCK(ump); 10922 } 10923 return (1); 10924 } 10925 10926 /* 10927 * If memory utilization has gotten too high, deliberately slow things 10928 * down and speed up the I/O processing. 10929 */ 10930 extern struct thread *syncertd; 10931 static int 10932 request_cleanup(mp, resource) 10933 struct mount *mp; 10934 int resource; 10935 { 10936 struct thread *td = curthread; 10937 struct ufsmount *ump; 10938 10939 mtx_assert(&lk, MA_OWNED); 10940 /* 10941 * We never hold up the filesystem syncer or buf daemon. 10942 */ 10943 if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF)) 10944 return (0); 10945 ump = VFSTOUFS(mp); 10946 /* 10947 * First check to see if the work list has gotten backlogged. 10948 * If it has, co-opt this process to help clean up two entries. 10949 * Because this process may hold inodes locked, we cannot 10950 * handle any remove requests that might block on a locked 10951 * inode as that could lead to deadlock. We set TDP_SOFTDEP 10952 * to avoid recursively processing the worklist. 
10953 */ 10954 if (ump->softdep_on_worklist > max_softdeps / 10) { 10955 td->td_pflags |= TDP_SOFTDEP; 10956 process_worklist_item(mp, LK_NOWAIT); 10957 process_worklist_item(mp, LK_NOWAIT); 10958 td->td_pflags &= ~TDP_SOFTDEP; 10959 stat_worklist_push += 2; 10960 return(1); 10961 } 10962 /* 10963 * Next, we attempt to speed up the syncer process. If that 10964 * is successful, then we allow the process to continue. 10965 */ 10966 if (softdep_speedup() && resource != FLUSH_REMOVE_WAIT) 10967 return(0); 10968 /* 10969 * If we are resource constrained on inode dependencies, try 10970 * flushing some dirty inodes. Otherwise, we are constrained 10971 * by file deletions, so try accelerating flushes of directories 10972 * with removal dependencies. We would like to do the cleanup 10973 * here, but we probably hold an inode locked at this point and 10974 * that might deadlock against one that we try to clean. So, 10975 * the best that we can do is request the syncer daemon to do 10976 * the cleanup for us. 10977 */ 10978 switch (resource) { 10979 10980 case FLUSH_INODES: 10981 stat_ino_limit_push += 1; 10982 req_clear_inodedeps += 1; 10983 stat_countp = &stat_ino_limit_hit; 10984 break; 10985 10986 case FLUSH_REMOVE: 10987 case FLUSH_REMOVE_WAIT: 10988 stat_blk_limit_push += 1; 10989 req_clear_remove += 1; 10990 stat_countp = &stat_blk_limit_hit; 10991 break; 10992 10993 default: 10994 panic("request_cleanup: unknown type"); 10995 } 10996 /* 10997 * Hopefully the syncer daemon will catch up and awaken us. 10998 * We wait at most tickdelay before proceeding in any case. 10999 */ 11000 proc_waiting += 1; 11001 if (callout_pending(&softdep_callout) == FALSE) 11002 callout_reset(&softdep_callout, tickdelay > 2 ? 
tickdelay : 2, 11003 pause_timer, 0); 11004 11005 msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0); 11006 proc_waiting -= 1; 11007 return (1); 11008 } 11009 11010 /* 11011 * Awaken processes pausing in request_cleanup and clear proc_waiting 11012 * to indicate that there is no longer a timer running. 11013 */ 11014 static void 11015 pause_timer(arg) 11016 void *arg; 11017 { 11018 11019 /* 11020 * The callout_ API has acquired mtx and will hold it around this 11021 * function call. 11022 */ 11023 *stat_countp += 1; 11024 wakeup_one(&proc_waiting); 11025 if (proc_waiting > 0) 11026 callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2, 11027 pause_timer, 0); 11028 } 11029 11030 /* 11031 * Flush out a directory with at least one removal dependency in an effort to 11032 * reduce the number of dirrem, freefile, and freeblks dependency structures. 11033 */ 11034 static void 11035 clear_remove(td) 11036 struct thread *td; 11037 { 11038 struct pagedep_hashhead *pagedephd; 11039 struct pagedep *pagedep; 11040 static int next = 0; 11041 struct mount *mp; 11042 struct vnode *vp; 11043 struct bufobj *bo; 11044 int error, cnt; 11045 ino_t ino; 11046 11047 mtx_assert(&lk, MA_OWNED); 11048 11049 for (cnt = 0; cnt < pagedep_hash; cnt++) { 11050 pagedephd = &pagedep_hashtbl[next++]; 11051 if (next >= pagedep_hash) 11052 next = 0; 11053 LIST_FOREACH(pagedep, pagedephd, pd_hash) { 11054 if (LIST_EMPTY(&pagedep->pd_dirremhd)) 11055 continue; 11056 mp = pagedep->pd_list.wk_mp; 11057 ino = pagedep->pd_ino; 11058 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) 11059 continue; 11060 FREE_LOCK(&lk); 11061 11062 /* 11063 * Let unmount clear deps 11064 */ 11065 error = vfs_busy(mp, MBF_NOWAIT); 11066 if (error != 0) 11067 goto finish_write; 11068 error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp, 11069 FFSV_FORCEINSMQ); 11070 vfs_unbusy(mp); 11071 if (error != 0) { 11072 softdep_error("clear_remove: vget", error); 11073 goto finish_write; 11074 } 11075 if ((error = 
			    ffs_syncvnode(vp, MNT_NOWAIT)))
				softdep_error("clear_remove: fsync", error);
			bo = &vp->v_bufobj;
			BO_LOCK(bo);
			drain_output(vp);	/* wait for in-flight writes */
			BO_UNLOCK(bo);
			vput(vp);
		finish_write:
			vn_finished_write(mp);
			ACQUIRE_LOCK(&lk);
			/* One directory flushed per call is enough. */
			return;
		}
	}
}

/*
 * Clear out a block of dirty inodes in an effort to reduce
 * the number of inodedep dependency structures.
 * Entered and exited with the softdep lock (lk) held.
 */
static void
clear_inodedeps(td)
	struct thread *td;
{
	struct inodedep_hashhead *inodedephd;
	struct inodedep *inodedep;
	static int next = 0;	/* rotor: resume the scan where we left off */
	struct mount *mp;
	struct vnode *vp;
	struct fs *fs;
	int error, cnt;
	ino_t firstino, lastino, ino;

	mtx_assert(&lk, MA_OWNED);
	/*
	 * Pick a random inode dependency to be cleared.
	 * We will then gather up all the inodes in its block
	 * that have dependencies and flush them out.
	 */
	for (cnt = 0; cnt < inodedep_hash; cnt++) {
		inodedephd = &inodedep_hashtbl[next++];
		if (next >= inodedep_hash)
			next = 0;
		if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
			break;
	}
	if (inodedep == NULL)
		return;
	fs = inodedep->id_fs;
	mp = inodedep->id_list.wk_mp;
	/*
	 * Find the last inode in the block with dependencies.
	 * (The mask below assumes INOPB(fs) is a power of two, so it
	 * rounds id_ino down to the first inode of its block.)
	 */
	firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
		if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0)
			break;
	/*
	 * Asynchronously push all but the last inode with dependencies.
	 * Synchronously push the last inode with dependencies to ensure
	 * that the inode block gets written to free up the inodedeps.
	 */
	for (ino = firstino; ino <= lastino; ino++) {
		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
			continue;
		if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
			continue;
		FREE_LOCK(&lk);
		error = vfs_busy(mp, MBF_NOWAIT); /* Let unmount clear deps */
		if (error != 0) {
			vn_finished_write(mp);
			ACQUIRE_LOCK(&lk);
			return;
		}
		if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
		    FFSV_FORCEINSMQ)) != 0) {
			softdep_error("clear_inodedeps: vget", error);
			vfs_unbusy(mp);
			vn_finished_write(mp);
			ACQUIRE_LOCK(&lk);
			return;
		}
		vfs_unbusy(mp);
		if (ino == lastino) {
			/* Last inode: wait for the write to force the
			   inode block to disk. */
			if ((error = ffs_syncvnode(vp, MNT_WAIT)))
				softdep_error("clear_inodedeps: fsync1", error);
		} else {
			/* Others: start the write but do not wait for it;
			   just drain any output already in flight. */
			if ((error = ffs_syncvnode(vp, MNT_NOWAIT)))
				softdep_error("clear_inodedeps: fsync2", error);
			BO_LOCK(&vp->v_bufobj);
			drain_output(vp);
			BO_UNLOCK(&vp->v_bufobj);
		}
		vput(vp);
		vn_finished_write(mp);
		ACQUIRE_LOCK(&lk);
	}
}

/*
 * Function to determine if the buffer has outstanding dependencies
 * that will cause a roll-back if the buffer is written. If wantcount
 * is set, return number of dependencies, otherwise just yes or no.
 */
static int
softdep_count_dependencies(bp, wantcount)
	struct buf *bp;
	int wantcount;	/* 0: stop as soon as one dependency is found */
{
	struct worklist *wk;
	struct bmsafemap *bmsafemap;
	struct inodedep *inodedep;
	struct indirdep *indirdep;
	struct freeblks *freeblks;
	struct allocindir *aip;
	struct pagedep *pagedep;
	struct dirrem *dirrem;
	struct newblk *newblk;
	struct mkdir *mkdir;
	struct diradd *dap;
	int i, retval;

	retval = 0;
	ACQUIRE_LOCK(&lk);
	/*
	 * Examine each worklist item hung off the buffer; the item's type
	 * determines which of its lists can hold roll-back dependencies.
	 */
	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
		switch (wk->wk_type) {

		case D_INODEDEP:
			inodedep = WK_INODEDEP(wk);
			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
				/* bitmap allocation dependency */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
				/* direct block pointer dependency */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			if (TAILQ_FIRST(&inodedep->id_extupdt)) {
				/* ext-attr direct block pointer dependency */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			if (TAILQ_FIRST(&inodedep->id_inoreflst)) {
				/* Add reference dependency. */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_INDIRDEP:
			indirdep = WK_INDIRDEP(wk);

			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
				/* indirect block pointer dependency */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_PAGEDEP:
			pagedep = WK_PAGEDEP(wk);
			LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
				if (LIST_FIRST(&dirrem->dm_jremrefhd)) {
					/* Journal remove ref dependency.
					 */
					retval += 1;
					if (!wantcount)
						goto out;
				}
			}
			/* Scan every hash chain of pending diradds. */
			for (i = 0; i < DAHASHSZ; i++) {

				LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
					/* directory entry dependency */
					retval += 1;
					if (!wantcount)
						goto out;
				}
			}
			continue;

		case D_BMSAFEMAP:
			bmsafemap = WK_BMSAFEMAP(wk);
			if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) {
				/* Add reference dependency. */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) {
				/* Allocate block dependency. */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_FREEBLKS:
			freeblks = WK_FREEBLKS(wk);
			if (LIST_FIRST(&freeblks->fb_jfreeblkhd)) {
				/* Freeblk journal dependency. */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_ALLOCDIRECT:
		case D_ALLOCINDIR:
			newblk = WK_NEWBLK(wk);
			if (newblk->nb_jnewblk) {
				/* Journal allocate dependency. */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_MKDIR:
			mkdir = WK_MKDIR(wk);
			if (mkdir->md_jaddref) {
				/* Journal reference dependency. */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_FREEWORK:
		case D_FREEDEP:
		case D_JSEGDEP:
		case D_JSEG:
		case D_SBDEP:
			/* never a dependency on these blocks */
			continue;

		default:
			panic("softdep_count_dependencies: Unexpected type %s",
			    TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
	}
out:
	FREE_LOCK(&lk);
	return retval;
}

/*
 * Acquire exclusive access to a buffer.
 * Must be called with a locked mtx parameter.
 * Return acquired buffer or NULL on failure.
 */
static struct buf *
getdirtybuf(bp, mtx, waitfor)
	struct buf *bp;
	struct mtx *mtx;	/* held on entry and on every return path */
	int waitfor;		/* MNT_WAIT: sleep for the buffer lock */
{
	int error;

	mtx_assert(mtx, MA_OWNED);
	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
		if (waitfor != MNT_WAIT)
			return (NULL);
		error = BUF_LOCK(bp,
		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, mtx);
		/*
		 * Even if we successfully acquire bp here, we have dropped
		 * mtx, which may violate our guarantee.
		 */
		if (error == 0)
			BUF_UNLOCK(bp);
		else if (error != ENOLCK)
			panic("getdirtybuf: inconsistent lock: %d", error);
		mtx_lock(mtx);
		return (NULL);
	}
	if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
		/*
		 * A background write is in progress on this buffer; either
		 * wait for it to complete (MNT_WAIT) or bail out, in both
		 * cases reporting failure so the caller re-evaluates.
		 */
		if (mtx == &lk && waitfor == MNT_WAIT) {
			mtx_unlock(mtx);
			BO_LOCK(bp->b_bufobj);
			BUF_UNLOCK(bp);
			if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
				bp->b_vflags |= BV_BKGRDWAIT;
				msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj),
				    PRIBIO | PDROP, "getbuf", 0);
			} else
				BO_UNLOCK(bp->b_bufobj);
			mtx_lock(mtx);
			return (NULL);
		}
		BUF_UNLOCK(bp);
		if (waitfor != MNT_WAIT)
			return (NULL);
		/*
		 * The mtx argument must be bp->b_vp's mutex in
		 * this case.
		 */
#ifdef DEBUG_VFS_LOCKS
		if (bp->b_vp->v_type != VCHR)
			ASSERT_BO_LOCKED(bp->b_bufobj);
#endif
		bp->b_vflags |= BV_BKGRDWAIT;
		msleep(&bp->b_xflags, mtx, PRIBIO, "getbuf", 0);
		return (NULL);
	}
	if ((bp->b_flags & B_DELWRI) == 0) {
		/* Not a delayed-write buffer: nothing for the caller. */
		BUF_UNLOCK(bp);
		return (NULL);
	}
	bremfree(bp);	/* remove from its queue; ownership passes to caller */
	return (bp);
}


/*
 * Check if it is safe to suspend the file system now. On entry,
 * the vnode interlock for devvp should be held. Return 0 with
 * the mount interlock held if the file system can be suspended now,
 * otherwise return EAGAIN with the mount interlock held.
 */
int
softdep_check_suspend(struct mount *mp,
		      struct vnode *devvp,
		      int softdep_deps,
		      int softdep_accdeps,
		      int secondary_writes,
		      int secondary_accwrites)
{
	struct bufobj *bo;
	struct ufsmount *ump;
	int error;

	ump = VFSTOUFS(mp);
	bo = &devvp->v_bufobj;
	ASSERT_BO_LOCKED(bo);

	/*
	 * Acquire the softdep lock and the mount interlock without
	 * sleeping while the bufobj lock is held: if either would block,
	 * back off and retry rather than risk a lock-order reversal.
	 */
	for (;;) {
		if (!TRY_ACQUIRE_LOCK(&lk)) {
			BO_UNLOCK(bo);
			ACQUIRE_LOCK(&lk);
			FREE_LOCK(&lk);
			BO_LOCK(bo);
			continue;
		}
		MNT_ILOCK(mp);
		if (mp->mnt_secondary_writes != 0) {
			/* Wait for secondary writers to drain, then retry. */
			FREE_LOCK(&lk);
			BO_UNLOCK(bo);
			msleep(&mp->mnt_secondary_writes,
			       MNT_MTX(mp),
			       (PUSER - 1) | PDROP, "secwr", 0);
			BO_LOCK(bo);
			continue;
		}
		break;
	}

	/*
	 * Reasons for needing more work before suspend:
	 * - Dirty buffers on devvp.
	 * - Softdep activity occurred after start of vnode sync loop
	 * - Secondary writes occurred after start of vnode sync loop
	 */
	error = 0;
	if (bo->bo_numoutput > 0 ||
	    bo->bo_dirty.bv_cnt > 0 ||
	    softdep_deps != 0 ||
	    ump->softdep_deps != 0 ||
	    softdep_accdeps != ump->softdep_accdeps ||
	    secondary_writes != 0 ||
	    mp->mnt_secondary_writes != 0 ||
	    secondary_accwrites != mp->mnt_secondary_accwrites)
		error = EAGAIN;
	FREE_LOCK(&lk);
	BO_UNLOCK(bo);
	return (error);
}


/*
 * Get the number of dependency structures for the file system, both
 * the current number and the total number allocated. These will
 * later be used to detect that softdep processing has occurred.
 */
void
softdep_get_depcounts(struct mount *mp,
		      int *softdep_depsp,
		      int *softdep_accdepsp)
{
	struct ufsmount *ump;

	ump = VFSTOUFS(mp);
	ACQUIRE_LOCK(&lk);
	/* Snapshot both counters under the softdep lock so they agree. */
	*softdep_depsp = ump->softdep_deps;
	*softdep_accdepsp = ump->softdep_accdeps;
	FREE_LOCK(&lk);
}

/*
 * Wait for pending output on a vnode to complete.
 * Must be called with vnode lock and interlock locked.
 *
 * XXX: Should just be a call to bufobj_wwait().
 */
static void
drain_output(vp)
	struct vnode *vp;
{
	struct bufobj *bo;

	bo = &vp->v_bufobj;
	ASSERT_VOP_LOCKED(vp, "drain_output");
	ASSERT_BO_LOCKED(bo);

	/* Sleep until every write in flight on this vnode has completed. */
	while (bo->bo_numoutput) {
		bo->bo_flag |= BO_WWAIT;
		msleep((caddr_t)&bo->bo_numoutput,
		    BO_MTX(bo), PRIBIO + 1, "drainvp", 0);
	}
}

/*
 * Called whenever a buffer that is being invalidated or reallocated
 * contains dependencies. This should only happen if an I/O error has
 * occurred. The routine is called with the buffer locked.
 */
static void
softdep_deallocate_dependencies(bp)
	struct buf *bp;
{

	/* Losing deps without an I/O error means the dependency code erred. */
	if ((bp->b_ioflags & BIO_ERROR) == 0)
		panic("softdep_deallocate_dependencies: dangling deps");
	softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
	panic("softdep_deallocate_dependencies: unrecovered I/O error");
}

/*
 * Function to handle asynchronous write errors in the filesystem.
 */
static void
softdep_error(func, error)
	char *func;	/* name of the failing operation, for the log */
	int error;	/* errno-style error code */
{

	/* XXX should do something better!
	 */
	printf("%s: got error %d while accessing filesystem\n", func, error);
}

#ifdef DDB

/*
 * Dump one inodedep for the DDB "show inodedep"/"show inodedeps"
 * commands; verbose additionally prints the dependency list heads.
 */
static void
inodedep_print(struct inodedep *inodedep, int verbose)
{
	db_printf("%p fs %p st %x ino %jd inoblk %jd delta %d nlink %d"
	    " saveino %p\n",
	    inodedep, inodedep->id_fs, inodedep->id_state,
	    (intmax_t)inodedep->id_ino,
	    (intmax_t)fsbtodb(inodedep->id_fs,
	    ino_to_fsba(inodedep->id_fs, inodedep->id_ino)),
	    inodedep->id_nlinkdelta, inodedep->id_savednlink,
	    inodedep->id_savedino1);

	if (verbose == 0)
		return;

	db_printf("\tpendinghd %p, bufwait %p, inowait %p, inoreflst %p, "
	    "mkdiradd %p\n",
	    LIST_FIRST(&inodedep->id_pendinghd),
	    LIST_FIRST(&inodedep->id_bufwait),
	    LIST_FIRST(&inodedep->id_inowait),
	    TAILQ_FIRST(&inodedep->id_inoreflst),
	    inodedep->id_mkdiradd);
	db_printf("\tinoupdt %p, newinoupdt %p, extupdt %p, newextupdt %p\n",
	    TAILQ_FIRST(&inodedep->id_inoupdt),
	    TAILQ_FIRST(&inodedep->id_newinoupdt),
	    TAILQ_FIRST(&inodedep->id_extupdt),
	    TAILQ_FIRST(&inodedep->id_newextupdt));
}

/* DDB: print one inodedep given its address. */
DB_SHOW_COMMAND(inodedep, db_show_inodedep)
{

	if (have_addr == 0) {
		db_printf("Address required\n");
		return;
	}
	inodedep_print((struct inodedep*)addr, 1);
}

/* DDB: walk the inodedep hash table, optionally filtered by fs address. */
DB_SHOW_COMMAND(inodedeps, db_show_inodedeps)
{
	struct inodedep_hashhead *inodedephd;
	struct inodedep *inodedep;
	struct fs *fs;
	int cnt;

	fs = have_addr ?
	    (struct fs *)addr : NULL;
	for (cnt = 0; cnt < inodedep_hash; cnt++) {
		inodedephd = &inodedep_hashtbl[cnt];
		LIST_FOREACH(inodedep, inodedephd, id_hash) {
			if (fs != NULL && fs != inodedep->id_fs)
				continue;
			inodedep_print(inodedep, 0);
		}
	}
}

/* DDB: print one worklist item given its address. */
DB_SHOW_COMMAND(worklist, db_show_worklist)
{
	struct worklist *wk;

	if (have_addr == 0) {
		db_printf("Address required\n");
		return;
	}
	wk = (struct worklist *)addr;
	/* NOTE(review): plain printf here while the sibling DDB commands
	   use db_printf -- confirm whether this is intentional. */
	printf("worklist: %p type %s state 0x%X\n",
	    wk, TYPENAME(wk->wk_type), wk->wk_state);
}

/* DDB: print up to 100 items hanging off a workhead. */
DB_SHOW_COMMAND(workhead, db_show_workhead)
{
	struct workhead *wkhd;
	struct worklist *wk;
	int i;

	if (have_addr == 0) {
		db_printf("Address required\n");
		return;
	}
	wkhd = (struct workhead *)addr;
	wk = LIST_FIRST(wkhd);
	/* Cap at 100 entries in case the list is corrupt in the debugger. */
	for (i = 0; i < 100 && wk != NULL; i++, wk = LIST_NEXT(wk, wk_list))
		db_printf("worklist: %p type %s state 0x%X",
		    wk, TYPENAME(wk->wk_type), wk->wk_state);
	if (i == 100)
		db_printf("workhead overflow");
	printf("\n");
}


/* DDB: list every mkdir dependency with its diradd and jaddref linkage. */
DB_SHOW_COMMAND(mkdirs, db_show_mkdirs)
{
	struct jaddref *jaddref;
	struct diradd *diradd;
	struct mkdir *mkdir;

	LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) {
		diradd = mkdir->md_diradd;
		db_printf("mkdir: %p state 0x%X dap %p state 0x%X",
		    mkdir, mkdir->md_state, diradd, diradd->da_state);
		if ((jaddref = mkdir->md_jaddref) != NULL)
			db_printf(" jaddref %p jaddref state 0x%X",
			    jaddref, jaddref->ja_state);
		db_printf("\n");
	}
}

#endif /* DDB */

#endif /* SOFTUPDATES */