1 /* 2 * Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved. 3 * 4 * The soft updates code is derived from the appendix of a University 5 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt, 6 * "Soft Updates: A Solution to the Metadata Update Problem in File 7 * Systems", CSE-TR-254-95, August 1995). 8 * 9 * Further information about soft updates can be obtained from: 10 * 11 * Marshall Kirk McKusick http://www.mckusick.com/softdep/ 12 * 1614 Oxford Street mckusick@mckusick.com 13 * Berkeley, CA 94709-1608 +1-510-843-9542 14 * USA 15 * 16 * Redistribution and use in source and binary forms, with or without 17 * modification, are permitted provided that the following conditions 18 * are met: 19 * 20 * 1. Redistributions of source code must retain the above copyright 21 * notice, this list of conditions and the following disclaimer. 22 * 2. Redistributions in binary form must reproduce the above copyright 23 * notice, this list of conditions and the following disclaimer in the 24 * documentation and/or other materials provided with the distribution. 25 * 26 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY 27 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 28 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 29 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR 30 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 
37 * 38 * from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00 39 * $FreeBSD: src/sys/ufs/ffs/ffs_softdep.c,v 1.57.2.11 2002/02/05 18:46:53 dillon Exp $ 40 */ 41 42 /* 43 * For now we want the safety net that the DIAGNOSTIC and DEBUG flags provide. 44 */ 45 #ifndef DIAGNOSTIC 46 #define DIAGNOSTIC 47 #endif 48 #ifndef DEBUG 49 #define DEBUG 50 #endif 51 52 #include <sys/param.h> 53 #include <sys/kernel.h> 54 #include <sys/systm.h> 55 #include <sys/buf.h> 56 #include <sys/malloc.h> 57 #include <sys/mount.h> 58 #include <sys/proc.h> 59 #include <sys/syslog.h> 60 #include <sys/vnode.h> 61 #include <sys/conf.h> 62 #include <machine/inttypes.h> 63 #include "dir.h" 64 #include "quota.h" 65 #include "inode.h" 66 #include "ufsmount.h" 67 #include "fs.h" 68 #include "softdep.h" 69 #include "ffs_extern.h" 70 #include "ufs_extern.h" 71 72 #include <sys/buf2.h> 73 #include <sys/thread2.h> 74 #include <sys/lock.h> 75 76 /* 77 * These definitions need to be adapted to the system to which 78 * this file is being ported. 79 */ 80 /* 81 * malloc types defined for the softdep system. 
82 */ 83 MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies"); 84 MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies"); 85 MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation"); 86 MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map"); 87 MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode"); 88 MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies"); 89 MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block"); 90 MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode"); 91 MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode"); 92 MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated"); 93 MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry"); 94 MALLOC_DEFINE(M_MKDIR, "mkdir","New directory"); 95 MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted"); 96 97 #define M_SOFTDEP_FLAGS (M_WAITOK | M_USE_RESERVE) 98 99 #define D_PAGEDEP 0 100 #define D_INODEDEP 1 101 #define D_NEWBLK 2 102 #define D_BMSAFEMAP 3 103 #define D_ALLOCDIRECT 4 104 #define D_INDIRDEP 5 105 #define D_ALLOCINDIR 6 106 #define D_FREEFRAG 7 107 #define D_FREEBLKS 8 108 #define D_FREEFILE 9 109 #define D_DIRADD 10 110 #define D_MKDIR 11 111 #define D_DIRREM 12 112 #define D_LAST D_DIRREM 113 114 /* 115 * translate from workitem type to memory type 116 * MUST match the defines above, such that memtype[D_XXX] == M_XXX 117 */ 118 static struct malloc_type *memtype[] = { 119 M_PAGEDEP, 120 M_INODEDEP, 121 M_NEWBLK, 122 M_BMSAFEMAP, 123 M_ALLOCDIRECT, 124 M_INDIRDEP, 125 M_ALLOCINDIR, 126 M_FREEFRAG, 127 M_FREEBLKS, 128 M_FREEFILE, 129 M_DIRADD, 130 M_MKDIR, 131 M_DIRREM 132 }; 133 134 #define DtoM(type) (memtype[type]) 135 136 /* 137 * Names of malloc types. 138 */ 139 #define TYPENAME(type) \ 140 ((unsigned)(type) < D_LAST ? memtype[type]->ks_shortdesc : "???") 141 /* 142 * End system adaptaion definitions. 
143 */ 144 145 /* 146 * Internal function prototypes. 147 */ 148 static void softdep_error(char *, int); 149 static void drain_output(struct vnode *, int); 150 static int getdirtybuf(struct buf **, int); 151 static void clear_remove(struct thread *); 152 static void clear_inodedeps(struct thread *); 153 static int flush_pagedep_deps(struct vnode *, struct mount *, 154 struct diraddhd *); 155 static int flush_inodedep_deps(struct fs *, ino_t); 156 static int handle_written_filepage(struct pagedep *, struct buf *); 157 static void diradd_inode_written(struct diradd *, struct inodedep *); 158 static int handle_written_inodeblock(struct inodedep *, struct buf *); 159 static void handle_allocdirect_partdone(struct allocdirect *); 160 static void handle_allocindir_partdone(struct allocindir *); 161 static void initiate_write_filepage(struct pagedep *, struct buf *); 162 static void handle_written_mkdir(struct mkdir *, int); 163 static void initiate_write_inodeblock(struct inodedep *, struct buf *); 164 static void handle_workitem_freefile(struct freefile *); 165 static void handle_workitem_remove(struct dirrem *); 166 static struct dirrem *newdirrem(struct buf *, struct inode *, 167 struct inode *, int, struct dirrem **); 168 static void free_diradd(struct diradd *); 169 static void free_allocindir(struct allocindir *, struct inodedep *); 170 static int indir_trunc (struct inode *, off_t, int, ufs_lbn_t, long *); 171 static void deallocate_dependencies(struct buf *, struct inodedep *); 172 static void free_allocdirect(struct allocdirectlst *, 173 struct allocdirect *, int); 174 static int check_inode_unwritten(struct inodedep *); 175 static int free_inodedep(struct inodedep *); 176 static void handle_workitem_freeblocks(struct freeblks *); 177 static void merge_inode_lists(struct inodedep *); 178 static void setup_allocindir_phase2(struct buf *, struct inode *, 179 struct allocindir *); 180 static struct allocindir *newallocindir(struct inode *, int, ufs_daddr_t, 181 
ufs_daddr_t); 182 static void handle_workitem_freefrag(struct freefrag *); 183 static struct freefrag *newfreefrag(struct inode *, ufs_daddr_t, long); 184 static void allocdirect_merge(struct allocdirectlst *, 185 struct allocdirect *, struct allocdirect *); 186 static struct bmsafemap *bmsafemap_lookup(struct buf *); 187 static int newblk_lookup(struct fs *, ufs_daddr_t, int, 188 struct newblk **); 189 static int inodedep_lookup(struct fs *, ino_t, int, struct inodedep **); 190 static int pagedep_lookup(struct inode *, ufs_lbn_t, int, 191 struct pagedep **); 192 static int request_cleanup(int, int); 193 static int process_worklist_item(struct mount *, int); 194 static void add_to_worklist(struct worklist *); 195 196 /* 197 * Exported softdep operations. 198 */ 199 static void softdep_disk_io_initiation(struct buf *); 200 static void softdep_disk_write_complete(struct buf *); 201 static void softdep_deallocate_dependencies(struct buf *); 202 static int softdep_fsync(struct vnode *); 203 static int softdep_process_worklist(struct mount *); 204 static void softdep_move_dependencies(struct buf *, struct buf *); 205 static int softdep_count_dependencies(struct buf *bp, int); 206 static int softdep_checkread(struct buf *bp); 207 static int softdep_checkwrite(struct buf *bp); 208 209 static struct bio_ops softdep_bioops = { 210 .io_start = softdep_disk_io_initiation, 211 .io_complete = softdep_disk_write_complete, 212 .io_deallocate = softdep_deallocate_dependencies, 213 .io_fsync = softdep_fsync, 214 .io_sync = softdep_process_worklist, 215 .io_movedeps = softdep_move_dependencies, 216 .io_countdeps = softdep_count_dependencies, 217 .io_checkread = softdep_checkread, 218 .io_checkwrite = softdep_checkwrite 219 }; 220 221 /* 222 * Locking primitives. 
223 */ 224 static void acquire_lock(struct lock *); 225 static void free_lock(struct lock *); 226 #ifdef INVARIANTS 227 static int lock_held(struct lock *); 228 #endif 229 230 static struct lock lk; 231 232 #define ACQUIRE_LOCK(lkp) acquire_lock(lkp) 233 #define FREE_LOCK(lkp) free_lock(lkp) 234 235 static void 236 acquire_lock(struct lock *lkp) 237 { 238 lockmgr(lkp, LK_EXCLUSIVE); 239 } 240 241 static void 242 free_lock(struct lock *lkp) 243 { 244 lockmgr(lkp, LK_RELEASE); 245 } 246 247 #ifdef INVARIANTS 248 static int 249 lock_held(struct lock *lkp) 250 { 251 return lockcountnb(lkp); 252 } 253 #endif 254 255 /* 256 * Place holder for real semaphores. 257 */ 258 struct sema { 259 int value; 260 thread_t holder; 261 char *name; 262 int timo; 263 }; 264 static void sema_init(struct sema *, char *, int); 265 static int sema_get(struct sema *, struct lock *); 266 static void sema_release(struct sema *); 267 268 #define NOHOLDER ((struct thread *) -1) 269 270 static void 271 sema_init(struct sema *semap, char *name, int timo) 272 { 273 semap->holder = NOHOLDER; 274 semap->value = 0; 275 semap->name = name; 276 semap->timo = timo; 277 } 278 279 static int 280 sema_get(struct sema *semap, struct lock *interlock) 281 { 282 if (semap->value++ > 0) { 283 if (interlock) 284 lksleep(semap, interlock, 0, semap->name, semap->timo); 285 else 286 tsleep(semap, 0, semap->name, semap->timo); 287 return (0); 288 } 289 semap->holder = curthread; 290 return (1); 291 } 292 293 static void 294 sema_release(struct sema *semap) 295 { 296 if (semap->value <= 0 || semap->holder != curthread) { 297 panic("sema_release: not held"); 298 } 299 if (--semap->value > 0) { 300 semap->value = 0; 301 wakeup(semap); 302 } 303 semap->holder = NOHOLDER; 304 } 305 306 /* 307 * Worklist queue management. 308 * These routines require that the lock be held. 
309 */ 310 static void worklist_insert(struct workhead *, struct worklist *); 311 static void worklist_remove(struct worklist *); 312 static void workitem_free(struct worklist *, int); 313 314 #define WORKLIST_INSERT_BP(bp, item) do { \ 315 (bp)->b_ops = &softdep_bioops; \ 316 worklist_insert(&(bp)->b_dep, item); \ 317 } while (0) 318 319 #define WORKLIST_INSERT(head, item) worklist_insert(head, item) 320 #define WORKLIST_REMOVE(item) worklist_remove(item) 321 #define WORKITEM_FREE(item, type) workitem_free((struct worklist *)item, type) 322 323 static void 324 worklist_insert(struct workhead *head, struct worklist *item) 325 { 326 KKASSERT(lock_held(&lk) > 0); 327 328 if (item->wk_state & ONWORKLIST) { 329 panic("worklist_insert: already on list"); 330 } 331 item->wk_state |= ONWORKLIST; 332 LIST_INSERT_HEAD(head, item, wk_list); 333 } 334 335 static void 336 worklist_remove(struct worklist *item) 337 { 338 339 KKASSERT(lock_held(&lk)); 340 if ((item->wk_state & ONWORKLIST) == 0) 341 panic("worklist_remove: not on list"); 342 343 item->wk_state &= ~ONWORKLIST; 344 LIST_REMOVE(item, wk_list); 345 } 346 347 static void 348 workitem_free(struct worklist *item, int type) 349 { 350 351 if (item->wk_state & ONWORKLIST) 352 panic("workitem_free: still on list"); 353 if (item->wk_type != type) 354 panic("workitem_free: type mismatch"); 355 356 kfree(item, DtoM(type)); 357 } 358 359 /* 360 * Workitem queue management 361 */ 362 static struct workhead softdep_workitem_pending; 363 static int num_on_worklist; /* number of worklist items to be processed */ 364 static int softdep_worklist_busy; /* 1 => trying to do unmount */ 365 static int softdep_worklist_req; /* serialized waiters */ 366 static int max_softdeps; /* maximum number of structs before slowdown */ 367 static int tickdelay = 2; /* number of ticks to pause during slowdown */ 368 static int *stat_countp; /* statistic to count in proc_waiting timeout */ 369 static int proc_waiting; /* tracks whether we have a 
timeout posted */ 370 static struct thread *filesys_syncer; /* proc of filesystem syncer process */ 371 static int req_clear_inodedeps; /* syncer process flush some inodedeps */ 372 #define FLUSH_INODES 1 373 static int req_clear_remove; /* syncer process flush some freeblks */ 374 #define FLUSH_REMOVE 2 375 /* 376 * runtime statistics 377 */ 378 static int stat_worklist_push; /* number of worklist cleanups */ 379 static int stat_blk_limit_push; /* number of times block limit neared */ 380 static int stat_ino_limit_push; /* number of times inode limit neared */ 381 static int stat_blk_limit_hit; /* number of times block slowdown imposed */ 382 static int stat_ino_limit_hit; /* number of times inode slowdown imposed */ 383 static int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */ 384 static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */ 385 static int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */ 386 static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */ 387 static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */ 388 #ifdef DEBUG 389 #include <vm/vm.h> 390 #include <sys/sysctl.h> 391 SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, 392 "Maximum soft dependencies before slowdown occurs"); 393 SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, 394 "Ticks to delay before allocating during slowdown"); 395 SYSCTL_INT(_debug, OID_AUTO, worklist_push, CTLFLAG_RW, &stat_worklist_push, 0, 396 "Number of worklist cleanups"); 397 SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0, 398 "Number of times block limit neared"); 399 SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0, 400 "Number of times inode limit neared"); 401 SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0, 402 "Number of times block slowdown imposed"); 403 SYSCTL_INT(_debug, 
OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0, 404 "Number of times inode slowdown imposed "); 405 SYSCTL_INT(_debug, OID_AUTO, sync_limit_hit, CTLFLAG_RW, &stat_sync_limit_hit, 0, 406 "Number of synchronous slowdowns imposed"); 407 SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0, 408 "Bufs redirtied as indir ptrs not written"); 409 SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, 410 "Bufs redirtied as inode bitmap not written"); 411 SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, 412 "Bufs redirtied as direct ptrs not written"); 413 SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, 414 "Bufs redirtied as dir entry cannot write"); 415 #endif /* DEBUG */ 416 417 /* 418 * Add an item to the end of the work queue. 419 * This routine requires that the lock be held. 420 * This is the only routine that adds items to the list. 421 * The following routine is the only one that removes items 422 * and does so in order from first to last. 423 */ 424 static void 425 add_to_worklist(struct worklist *wk) 426 { 427 static struct worklist *worklist_tail; 428 429 if (wk->wk_state & ONWORKLIST) { 430 panic("add_to_worklist: already on list"); 431 } 432 wk->wk_state |= ONWORKLIST; 433 if (LIST_FIRST(&softdep_workitem_pending) == NULL) 434 LIST_INSERT_HEAD(&softdep_workitem_pending, wk, wk_list); 435 else 436 LIST_INSERT_AFTER(worklist_tail, wk, wk_list); 437 worklist_tail = wk; 438 num_on_worklist += 1; 439 } 440 441 /* 442 * Process that runs once per second to handle items in the background queue. 443 * 444 * Note that we ensure that everything is done in the order in which they 445 * appear in the queue. The code below depends on this property to ensure 446 * that blocks of a file are freed before the inode itself is freed. 
This 447 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated 448 * until all the old ones have been purged from the dependency lists. 449 * 450 * bioops callback - hold io_token 451 */ 452 static int 453 softdep_process_worklist(struct mount *matchmnt) 454 { 455 thread_t td = curthread; 456 int matchcnt, loopcount; 457 long starttime; 458 459 ACQUIRE_LOCK(&lk); 460 461 /* 462 * Record the process identifier of our caller so that we can give 463 * this process preferential treatment in request_cleanup below. 464 */ 465 filesys_syncer = td; 466 matchcnt = 0; 467 468 /* 469 * There is no danger of having multiple processes run this 470 * code, but we have to single-thread it when softdep_flushfiles() 471 * is in operation to get an accurate count of the number of items 472 * related to its mount point that are in the list. 473 */ 474 if (matchmnt == NULL) { 475 if (softdep_worklist_busy < 0) { 476 matchcnt = -1; 477 goto done; 478 } 479 softdep_worklist_busy += 1; 480 } 481 482 /* 483 * If requested, try removing inode or removal dependencies. 484 */ 485 if (req_clear_inodedeps) { 486 clear_inodedeps(td); 487 req_clear_inodedeps -= 1; 488 wakeup_one(&proc_waiting); 489 } 490 if (req_clear_remove) { 491 clear_remove(td); 492 req_clear_remove -= 1; 493 wakeup_one(&proc_waiting); 494 } 495 loopcount = 1; 496 starttime = time_second; 497 while (num_on_worklist > 0) { 498 matchcnt += process_worklist_item(matchmnt, 0); 499 500 /* 501 * If a umount operation wants to run the worklist 502 * accurately, abort. 503 */ 504 if (softdep_worklist_req && matchmnt == NULL) { 505 matchcnt = -1; 506 break; 507 } 508 509 /* 510 * If requested, try removing inode or removal dependencies. 
511 */ 512 if (req_clear_inodedeps) { 513 clear_inodedeps(td); 514 req_clear_inodedeps -= 1; 515 wakeup_one(&proc_waiting); 516 } 517 if (req_clear_remove) { 518 clear_remove(td); 519 req_clear_remove -= 1; 520 wakeup_one(&proc_waiting); 521 } 522 /* 523 * We do not generally want to stop for buffer space, but if 524 * we are really being a buffer hog, we will stop and wait. 525 */ 526 if (loopcount++ % 128 == 0) { 527 FREE_LOCK(&lk); 528 bwillinode(1); 529 ACQUIRE_LOCK(&lk); 530 } 531 532 /* 533 * Never allow processing to run for more than one 534 * second. Otherwise the other syncer tasks may get 535 * excessively backlogged. 536 */ 537 if (starttime != time_second && matchmnt == NULL) { 538 matchcnt = -1; 539 break; 540 } 541 } 542 if (matchmnt == NULL) { 543 --softdep_worklist_busy; 544 if (softdep_worklist_req && softdep_worklist_busy == 0) 545 wakeup(&softdep_worklist_req); 546 } 547 done: 548 FREE_LOCK(&lk); 549 return (matchcnt); 550 } 551 552 /* 553 * Process one item on the worklist. 554 */ 555 static int 556 process_worklist_item(struct mount *matchmnt, int flags) 557 { 558 struct worklist *wk; 559 struct dirrem *dirrem; 560 struct fs *matchfs; 561 struct vnode *vp; 562 int matchcnt = 0; 563 564 matchfs = NULL; 565 if (matchmnt != NULL) 566 matchfs = VFSTOUFS(matchmnt)->um_fs; 567 568 /* 569 * Normally we just process each item on the worklist in order. 570 * However, if we are in a situation where we cannot lock any 571 * inodes, we have to skip over any dirrem requests whose 572 * vnodes are resident and locked. 
573 */ 574 LIST_FOREACH(wk, &softdep_workitem_pending, wk_list) { 575 if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM) 576 break; 577 dirrem = WK_DIRREM(wk); 578 vp = ufs_ihashlookup(VFSTOUFS(dirrem->dm_mnt)->um_dev, 579 dirrem->dm_oldinum); 580 if (vp == NULL || !vn_islocked(vp)) 581 break; 582 } 583 if (wk == NULL) { 584 return (0); 585 } 586 WORKLIST_REMOVE(wk); 587 num_on_worklist -= 1; 588 FREE_LOCK(&lk); 589 switch (wk->wk_type) { 590 case D_DIRREM: 591 /* removal of a directory entry */ 592 if (WK_DIRREM(wk)->dm_mnt == matchmnt) 593 matchcnt += 1; 594 handle_workitem_remove(WK_DIRREM(wk)); 595 break; 596 597 case D_FREEBLKS: 598 /* releasing blocks and/or fragments from a file */ 599 if (WK_FREEBLKS(wk)->fb_fs == matchfs) 600 matchcnt += 1; 601 handle_workitem_freeblocks(WK_FREEBLKS(wk)); 602 break; 603 604 case D_FREEFRAG: 605 /* releasing a fragment when replaced as a file grows */ 606 if (WK_FREEFRAG(wk)->ff_fs == matchfs) 607 matchcnt += 1; 608 handle_workitem_freefrag(WK_FREEFRAG(wk)); 609 break; 610 611 case D_FREEFILE: 612 /* releasing an inode when its link count drops to 0 */ 613 if (WK_FREEFILE(wk)->fx_fs == matchfs) 614 matchcnt += 1; 615 handle_workitem_freefile(WK_FREEFILE(wk)); 616 break; 617 618 default: 619 panic("%s_process_worklist: Unknown type %s", 620 "softdep", TYPENAME(wk->wk_type)); 621 /* NOTREACHED */ 622 } 623 ACQUIRE_LOCK(&lk); 624 return (matchcnt); 625 } 626 627 /* 628 * Move dependencies from one buffer to another. 
629 * 630 * bioops callback - hold io_token 631 */ 632 static void 633 softdep_move_dependencies(struct buf *oldbp, struct buf *newbp) 634 { 635 struct worklist *wk, *wktail; 636 637 if (LIST_FIRST(&newbp->b_dep) != NULL) 638 panic("softdep_move_dependencies: need merge code"); 639 wktail = NULL; 640 ACQUIRE_LOCK(&lk); 641 while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) { 642 LIST_REMOVE(wk, wk_list); 643 if (wktail == NULL) 644 LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list); 645 else 646 LIST_INSERT_AFTER(wktail, wk, wk_list); 647 wktail = wk; 648 newbp->b_ops = &softdep_bioops; 649 } 650 FREE_LOCK(&lk); 651 } 652 653 /* 654 * Purge the work list of all items associated with a particular mount point. 655 */ 656 int 657 softdep_flushfiles(struct mount *oldmnt, int flags) 658 { 659 struct vnode *devvp; 660 int error, loopcnt; 661 662 /* 663 * Await our turn to clear out the queue, then serialize access. 664 */ 665 ACQUIRE_LOCK(&lk); 666 while (softdep_worklist_busy != 0) { 667 softdep_worklist_req += 1; 668 lksleep(&softdep_worklist_req, &lk, 0, "softflush", 0); 669 softdep_worklist_req -= 1; 670 } 671 softdep_worklist_busy = -1; 672 FREE_LOCK(&lk); 673 674 if ((error = ffs_flushfiles(oldmnt, flags)) != 0) { 675 softdep_worklist_busy = 0; 676 if (softdep_worklist_req) 677 wakeup(&softdep_worklist_req); 678 return (error); 679 } 680 /* 681 * Alternately flush the block device associated with the mount 682 * point and process any dependencies that the flushing 683 * creates. In theory, this loop can happen at most twice, 684 * but we give it a few extra just to be sure. 685 */ 686 devvp = VFSTOUFS(oldmnt)->um_devvp; 687 for (loopcnt = 10; loopcnt > 0; ) { 688 if (softdep_process_worklist(oldmnt) == 0) { 689 loopcnt--; 690 /* 691 * Do another flush in case any vnodes were brought in 692 * as part of the cleanup operations. 693 */ 694 if ((error = ffs_flushfiles(oldmnt, flags)) != 0) 695 break; 696 /* 697 * If we still found nothing to do, we are really done. 
698 */ 699 if (softdep_process_worklist(oldmnt) == 0) 700 break; 701 } 702 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); 703 error = VOP_FSYNC(devvp, MNT_WAIT, 0); 704 vn_unlock(devvp); 705 if (error) 706 break; 707 } 708 ACQUIRE_LOCK(&lk); 709 softdep_worklist_busy = 0; 710 if (softdep_worklist_req) 711 wakeup(&softdep_worklist_req); 712 FREE_LOCK(&lk); 713 714 /* 715 * If we are unmounting then it is an error to fail. If we 716 * are simply trying to downgrade to read-only, then filesystem 717 * activity can keep us busy forever, so we just fail with EBUSY. 718 */ 719 if (loopcnt == 0) { 720 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) 721 panic("softdep_flushfiles: looping"); 722 error = EBUSY; 723 } 724 return (error); 725 } 726 727 /* 728 * Structure hashing. 729 * 730 * There are three types of structures that can be looked up: 731 * 1) pagedep structures identified by mount point, inode number, 732 * and logical block. 733 * 2) inodedep structures identified by mount point and inode number. 734 * 3) newblk structures identified by mount point and 735 * physical block number. 736 * 737 * The "pagedep" and "inodedep" dependency structures are hashed 738 * separately from the file blocks and inodes to which they correspond. 739 * This separation helps when the in-memory copy of an inode or 740 * file block must be replaced. It also obviates the need to access 741 * an inode or file page when simply updating (or de-allocating) 742 * dependency structures. Lookup of newblk structures is needed to 743 * find newly allocated blocks when trying to associate them with 744 * their allocdirect or allocindir structure. 745 * 746 * The lookup routines optionally create and hash a new instance when 747 * an existing entry is not found. 748 */ 749 #define DEPALLOC 0x0001 /* allocate structure if lookup fails */ 750 #define NODELAY 0x0002 /* cannot do background work */ 751 752 /* 753 * Structures and routines associated with pagedep caching. 
754 */ 755 LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl; 756 u_long pagedep_hash; /* size of hash table - 1 */ 757 #define PAGEDEP_HASH(mp, inum, lbn) \ 758 (&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \ 759 pagedep_hash]) 760 static struct sema pagedep_in_progress; 761 762 /* 763 * Helper routine for pagedep_lookup() 764 */ 765 static __inline 766 struct pagedep * 767 pagedep_find(struct pagedep_hashhead *pagedephd, ino_t ino, ufs_lbn_t lbn, 768 struct mount *mp) 769 { 770 struct pagedep *pagedep; 771 772 LIST_FOREACH(pagedep, pagedephd, pd_hash) { 773 if (ino == pagedep->pd_ino && 774 lbn == pagedep->pd_lbn && 775 mp == pagedep->pd_mnt) { 776 return (pagedep); 777 } 778 } 779 return(NULL); 780 } 781 782 /* 783 * Look up a pagedep. Return 1 if found, 0 if not found. 784 * If not found, allocate if DEPALLOC flag is passed. 785 * Found or allocated entry is returned in pagedeppp. 786 * This routine must be called with splbio interrupts blocked. 787 */ 788 static int 789 pagedep_lookup(struct inode *ip, ufs_lbn_t lbn, int flags, 790 struct pagedep **pagedeppp) 791 { 792 struct pagedep *pagedep; 793 struct pagedep_hashhead *pagedephd; 794 struct mount *mp; 795 int i; 796 797 KKASSERT(lock_held(&lk) > 0); 798 799 mp = ITOV(ip)->v_mount; 800 pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn); 801 top: 802 *pagedeppp = pagedep_find(pagedephd, ip->i_number, lbn, mp); 803 if (*pagedeppp) 804 return(1); 805 if ((flags & DEPALLOC) == 0) 806 return (0); 807 if (sema_get(&pagedep_in_progress, &lk) == 0) 808 goto top; 809 810 FREE_LOCK(&lk); 811 pagedep = kmalloc(sizeof(struct pagedep), M_PAGEDEP, 812 M_SOFTDEP_FLAGS | M_ZERO); 813 ACQUIRE_LOCK(&lk); 814 if (pagedep_find(pagedephd, ip->i_number, lbn, mp)) { 815 kprintf("pagedep_lookup: blocking race avoided\n"); 816 sema_release(&pagedep_in_progress); 817 kfree(pagedep, M_PAGEDEP); 818 goto top; 819 } 820 821 pagedep->pd_list.wk_type = D_PAGEDEP; 822 pagedep->pd_mnt = mp; 823 pagedep->pd_ino = 
ip->i_number; 824 pagedep->pd_lbn = lbn; 825 LIST_INIT(&pagedep->pd_dirremhd); 826 LIST_INIT(&pagedep->pd_pendinghd); 827 for (i = 0; i < DAHASHSZ; i++) 828 LIST_INIT(&pagedep->pd_diraddhd[i]); 829 LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash); 830 sema_release(&pagedep_in_progress); 831 *pagedeppp = pagedep; 832 return (0); 833 } 834 835 /* 836 * Structures and routines associated with inodedep caching. 837 */ 838 LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl; 839 static u_long inodedep_hash; /* size of hash table - 1 */ 840 static long num_inodedep; /* number of inodedep allocated */ 841 #define INODEDEP_HASH(fs, inum) \ 842 (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash]) 843 static struct sema inodedep_in_progress; 844 845 /* 846 * Helper routine for inodedep_lookup() 847 */ 848 static __inline 849 struct inodedep * 850 inodedep_find(struct inodedep_hashhead *inodedephd, struct fs *fs, ino_t inum) 851 { 852 struct inodedep *inodedep; 853 854 LIST_FOREACH(inodedep, inodedephd, id_hash) { 855 if (inum == inodedep->id_ino && fs == inodedep->id_fs) 856 return(inodedep); 857 } 858 return (NULL); 859 } 860 861 /* 862 * Look up a inodedep. Return 1 if found, 0 if not found. 863 * If not found, allocate if DEPALLOC flag is passed. 864 * Found or allocated entry is returned in inodedeppp. 865 * This routine must be called with splbio interrupts blocked. 866 */ 867 static int 868 inodedep_lookup(struct fs *fs, ino_t inum, int flags, 869 struct inodedep **inodedeppp) 870 { 871 struct inodedep *inodedep; 872 struct inodedep_hashhead *inodedephd; 873 int firsttry; 874 875 KKASSERT(lock_held(&lk) > 0); 876 877 firsttry = 1; 878 inodedephd = INODEDEP_HASH(fs, inum); 879 top: 880 *inodedeppp = inodedep_find(inodedephd, fs, inum); 881 if (*inodedeppp) 882 return (1); 883 if ((flags & DEPALLOC) == 0) 884 return (0); 885 /* 886 * If we are over our limit, try to improve the situation. 
887 */ 888 if (num_inodedep > max_softdeps && firsttry && 889 speedup_syncer() == 0 && (flags & NODELAY) == 0 && 890 request_cleanup(FLUSH_INODES, 1)) { 891 firsttry = 0; 892 goto top; 893 } 894 if (sema_get(&inodedep_in_progress, &lk) == 0) 895 goto top; 896 897 FREE_LOCK(&lk); 898 inodedep = kmalloc(sizeof(struct inodedep), M_INODEDEP, 899 M_SOFTDEP_FLAGS | M_ZERO); 900 ACQUIRE_LOCK(&lk); 901 if (inodedep_find(inodedephd, fs, inum)) { 902 kprintf("inodedep_lookup: blocking race avoided\n"); 903 sema_release(&inodedep_in_progress); 904 kfree(inodedep, M_INODEDEP); 905 goto top; 906 } 907 inodedep->id_list.wk_type = D_INODEDEP; 908 inodedep->id_fs = fs; 909 inodedep->id_ino = inum; 910 inodedep->id_state = ALLCOMPLETE; 911 inodedep->id_nlinkdelta = 0; 912 inodedep->id_savedino = NULL; 913 inodedep->id_savedsize = -1; 914 inodedep->id_buf = NULL; 915 LIST_INIT(&inodedep->id_pendinghd); 916 LIST_INIT(&inodedep->id_inowait); 917 LIST_INIT(&inodedep->id_bufwait); 918 TAILQ_INIT(&inodedep->id_inoupdt); 919 TAILQ_INIT(&inodedep->id_newinoupdt); 920 num_inodedep += 1; 921 LIST_INSERT_HEAD(inodedephd, inodedep, id_hash); 922 sema_release(&inodedep_in_progress); 923 *inodedeppp = inodedep; 924 return (0); 925 } 926 927 /* 928 * Structures and routines associated with newblk caching. 929 */ 930 LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl; 931 u_long newblk_hash; /* size of hash table - 1 */ 932 #define NEWBLK_HASH(fs, inum) \ 933 (&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash]) 934 static struct sema newblk_in_progress; 935 936 /* 937 * Helper routine for newblk_lookup() 938 */ 939 static __inline 940 struct newblk * 941 newblk_find(struct newblk_hashhead *newblkhd, struct fs *fs, 942 ufs_daddr_t newblkno) 943 { 944 struct newblk *newblk; 945 946 LIST_FOREACH(newblk, newblkhd, nb_hash) { 947 if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs) 948 return (newblk); 949 } 950 return(NULL); 951 } 952 953 /* 954 * Look up a newblk. 
* Return 1 if found, 0 if not found.
 * If not found, allocate if DEPALLOC flag is passed.
 * Found or allocated entry is returned in newblkpp.
 *
 * Creation is serialized by newblk_in_progress: whoever obtains it
 * allocates the entry, everyone else sleeps in sema_get() and then
 * repeats the lookup.  No interlock is handed to sema_get() here.
 */
static int
newblk_lookup(struct fs *fs, ufs_daddr_t newblkno, int flags,
	      struct newblk **newblkpp)
{
	struct newblk *newblk;
	struct newblk_hashhead *newblkhd;

	newblkhd = NEWBLK_HASH(fs, newblkno);
top:
	*newblkpp = newblk_find(newblkhd, fs, newblkno);
	if (*newblkpp)
		return(1);
	if ((flags & DEPALLOC) == 0)
		return (0);
	/* A return of 0 means we slept without getting the semaphore. */
	if (sema_get(&newblk_in_progress, NULL) == 0)
		goto top;

	newblk = kmalloc(sizeof(struct newblk), M_NEWBLK,
			 M_SOFTDEP_FLAGS | M_ZERO);

	if (newblk_find(newblkhd, fs, newblkno)) {
		kprintf("newblk_lookup: blocking race avoided\n");
		/*
		 * Release the semaphore we actually hold.  Releasing
		 * pagedep_in_progress here would either panic in
		 * sema_release() or steal the pagedep semaphore from its
		 * holder, and would leave newblk_in_progress held forever.
		 */
		sema_release(&newblk_in_progress);
		kfree(newblk, M_NEWBLK);
		goto top;
	}
	newblk->nb_state = 0;
	newblk->nb_fs = fs;
	newblk->nb_newblkno = newblkno;
	LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
	sema_release(&newblk_in_progress);
	*newblkpp = newblk;
	return (0);
}

/*
 * Executed during filesystem system initialization before
 * mounting any filesystems.
996 */ 997 void 998 softdep_initialize(void) 999 { 1000 LIST_INIT(&mkdirlisthd); 1001 LIST_INIT(&softdep_workitem_pending); 1002 max_softdeps = min(desiredvnodes * 8, 1003 M_INODEDEP->ks_limit / (2 * sizeof(struct inodedep))); 1004 pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, 1005 &pagedep_hash); 1006 lockinit(&lk, "ffs_softdep", 0, LK_CANRECURSE); 1007 sema_init(&pagedep_in_progress, "pagedep", 0); 1008 inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash); 1009 sema_init(&inodedep_in_progress, "inodedep", 0); 1010 newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash); 1011 sema_init(&newblk_in_progress, "newblk", 0); 1012 add_bio_ops(&softdep_bioops); 1013 } 1014 1015 /* 1016 * Called at mount time to notify the dependency code that a 1017 * filesystem wishes to use it. 1018 */ 1019 int 1020 softdep_mount(struct vnode *devvp, struct mount *mp, struct fs *fs) 1021 { 1022 struct csum cstotal; 1023 struct cg *cgp; 1024 struct buf *bp; 1025 int error, cyl; 1026 1027 mp->mnt_flag &= ~MNT_ASYNC; 1028 mp->mnt_flag |= MNT_SOFTDEP; 1029 mp->mnt_bioops = &softdep_bioops; 1030 /* 1031 * When doing soft updates, the counters in the 1032 * superblock may have gotten out of sync, so we have 1033 * to scan the cylinder groups and recalculate them. 
1034 */ 1035 if (fs->fs_clean != 0) 1036 return (0); 1037 bzero(&cstotal, sizeof cstotal); 1038 for (cyl = 0; cyl < fs->fs_ncg; cyl++) { 1039 if ((error = bread(devvp, fsbtodoff(fs, cgtod(fs, cyl)), 1040 fs->fs_cgsize, &bp)) != 0) { 1041 brelse(bp); 1042 return (error); 1043 } 1044 cgp = (struct cg *)bp->b_data; 1045 cstotal.cs_nffree += cgp->cg_cs.cs_nffree; 1046 cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree; 1047 cstotal.cs_nifree += cgp->cg_cs.cs_nifree; 1048 cstotal.cs_ndir += cgp->cg_cs.cs_ndir; 1049 fs->fs_cs(fs, cyl) = cgp->cg_cs; 1050 brelse(bp); 1051 } 1052 #ifdef DEBUG 1053 if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal)) 1054 kprintf("ffs_mountfs: superblock updated for soft updates\n"); 1055 #endif 1056 bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal); 1057 return (0); 1058 } 1059 1060 /* 1061 * Protecting the freemaps (or bitmaps). 1062 * 1063 * To eliminate the need to execute fsck before mounting a filesystem 1064 * after a power failure, one must (conservatively) guarantee that the 1065 * on-disk copy of the bitmaps never indicate that a live inode or block is 1066 * free. So, when a block or inode is allocated, the bitmap should be 1067 * updated (on disk) before any new pointers. When a block or inode is 1068 * freed, the bitmap should not be updated until all pointers have been 1069 * reset. The latter dependency is handled by the delayed de-allocation 1070 * approach described below for block and inode de-allocation. The former 1071 * dependency is handled by calling the following procedure when a block or 1072 * inode is allocated. When an inode is allocated an "inodedep" is created 1073 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk. 1074 * Each "inodedep" is also inserted into the hash indexing structure so 1075 * that any additional link additions can be made dependent on the inode 1076 * allocation. 
1077 * 1078 * The ufs filesystem maintains a number of free block counts (e.g., per 1079 * cylinder group, per cylinder and per <cylinder, rotational position> pair) 1080 * in addition to the bitmaps. These counts are used to improve efficiency 1081 * during allocation and therefore must be consistent with the bitmaps. 1082 * There is no convenient way to guarantee post-crash consistency of these 1083 * counts with simple update ordering, for two main reasons: (1) The counts 1084 * and bitmaps for a single cylinder group block are not in the same disk 1085 * sector. If a disk write is interrupted (e.g., by power failure), one may 1086 * be written and the other not. (2) Some of the counts are located in the 1087 * superblock rather than the cylinder group block. So, we focus our soft 1088 * updates implementation on protecting the bitmaps. When mounting a 1089 * filesystem, we recompute the auxiliary counts from the bitmaps. 1090 */ 1091 1092 /* 1093 * Called just after updating the cylinder group block to allocate an inode. 1094 * 1095 * Parameters: 1096 * bp: buffer for cylgroup block with inode map 1097 * ip: inode related to allocation 1098 * newinum: new inode number being allocated 1099 */ 1100 void 1101 softdep_setup_inomapdep(struct buf *bp, struct inode *ip, ino_t newinum) 1102 { 1103 struct inodedep *inodedep; 1104 struct bmsafemap *bmsafemap; 1105 1106 /* 1107 * Create a dependency for the newly allocated inode. 1108 * Panic if it already exists as something is seriously wrong. 1109 * Otherwise add it to the dependency list for the buffer holding 1110 * the cylinder group map from which it was allocated. 
1111 */ 1112 ACQUIRE_LOCK(&lk); 1113 if ((inodedep_lookup(ip->i_fs, newinum, DEPALLOC|NODELAY, &inodedep))) { 1114 panic("softdep_setup_inomapdep: found inode"); 1115 } 1116 inodedep->id_buf = bp; 1117 inodedep->id_state &= ~DEPCOMPLETE; 1118 bmsafemap = bmsafemap_lookup(bp); 1119 LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps); 1120 FREE_LOCK(&lk); 1121 } 1122 1123 /* 1124 * Called just after updating the cylinder group block to 1125 * allocate block or fragment. 1126 * 1127 * Parameters: 1128 * bp: buffer for cylgroup block with block map 1129 * fs: filesystem doing allocation 1130 * newblkno: number of newly allocated block 1131 */ 1132 void 1133 softdep_setup_blkmapdep(struct buf *bp, struct fs *fs, 1134 ufs_daddr_t newblkno) 1135 { 1136 struct newblk *newblk; 1137 struct bmsafemap *bmsafemap; 1138 1139 /* 1140 * Create a dependency for the newly allocated block. 1141 * Add it to the dependency list for the buffer holding 1142 * the cylinder group map from which it was allocated. 1143 */ 1144 if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0) 1145 panic("softdep_setup_blkmapdep: found block"); 1146 ACQUIRE_LOCK(&lk); 1147 newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(bp); 1148 LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps); 1149 FREE_LOCK(&lk); 1150 } 1151 1152 /* 1153 * Find the bmsafemap associated with a cylinder group buffer. 1154 * If none exists, create one. The buffer must be locked when 1155 * this routine is called and this routine must be called with 1156 * splbio interrupts blocked. 
1157 */ 1158 static struct bmsafemap * 1159 bmsafemap_lookup(struct buf *bp) 1160 { 1161 struct bmsafemap *bmsafemap; 1162 struct worklist *wk; 1163 1164 KKASSERT(lock_held(&lk) > 0); 1165 1166 LIST_FOREACH(wk, &bp->b_dep, wk_list) { 1167 if (wk->wk_type == D_BMSAFEMAP) 1168 return (WK_BMSAFEMAP(wk)); 1169 } 1170 FREE_LOCK(&lk); 1171 bmsafemap = kmalloc(sizeof(struct bmsafemap), M_BMSAFEMAP, 1172 M_SOFTDEP_FLAGS); 1173 bmsafemap->sm_list.wk_type = D_BMSAFEMAP; 1174 bmsafemap->sm_list.wk_state = 0; 1175 bmsafemap->sm_buf = bp; 1176 LIST_INIT(&bmsafemap->sm_allocdirecthd); 1177 LIST_INIT(&bmsafemap->sm_allocindirhd); 1178 LIST_INIT(&bmsafemap->sm_inodedephd); 1179 LIST_INIT(&bmsafemap->sm_newblkhd); 1180 ACQUIRE_LOCK(&lk); 1181 WORKLIST_INSERT_BP(bp, &bmsafemap->sm_list); 1182 return (bmsafemap); 1183 } 1184 1185 /* 1186 * Direct block allocation dependencies. 1187 * 1188 * When a new block is allocated, the corresponding disk locations must be 1189 * initialized (with zeros or new data) before the on-disk inode points to 1190 * them. Also, the freemap from which the block was allocated must be 1191 * updated (on disk) before the inode's pointer. These two dependencies are 1192 * independent of each other and are needed for all file blocks and indirect 1193 * blocks that are pointed to directly by the inode. Just before the 1194 * "in-core" version of the inode is updated with a newly allocated block 1195 * number, a procedure (below) is called to setup allocation dependency 1196 * structures. These structures are removed when the corresponding 1197 * dependencies are satisfied or when the block allocation becomes obsolete 1198 * (i.e., the file is deleted, the block is de-allocated, or the block is a 1199 * fragment that gets upgraded). All of these cases are handled in 1200 * procedures described later. 
1201 * 1202 * When a file extension causes a fragment to be upgraded, either to a larger 1203 * fragment or to a full block, the on-disk location may change (if the 1204 * previous fragment could not simply be extended). In this case, the old 1205 * fragment must be de-allocated, but not until after the inode's pointer has 1206 * been updated. In most cases, this is handled by later procedures, which 1207 * will construct a "freefrag" structure to be added to the workitem queue 1208 * when the inode update is complete (or obsolete). The main exception to 1209 * this is when an allocation occurs while a pending allocation dependency 1210 * (for the same block pointer) remains. This case is handled in the main 1211 * allocation dependency setup procedure by immediately freeing the 1212 * unreferenced fragments. 1213 * 1214 * Parameters: 1215 * ip: inode to which block is being added 1216 * lbn: block pointer within inode 1217 * newblkno: disk block number being added 1218 * oldblkno: previous block number, 0 unless frag 1219 * newsize: size of new block 1220 * oldsize: size of new block 1221 * bp: bp for allocated block 1222 */ 1223 void 1224 softdep_setup_allocdirect(struct inode *ip, ufs_lbn_t lbn, ufs_daddr_t newblkno, 1225 ufs_daddr_t oldblkno, long newsize, long oldsize, 1226 struct buf *bp) 1227 { 1228 struct allocdirect *adp, *oldadp; 1229 struct allocdirectlst *adphead; 1230 struct bmsafemap *bmsafemap; 1231 struct inodedep *inodedep; 1232 struct pagedep *pagedep; 1233 struct newblk *newblk; 1234 1235 adp = kmalloc(sizeof(struct allocdirect), M_ALLOCDIRECT, 1236 M_SOFTDEP_FLAGS | M_ZERO); 1237 adp->ad_list.wk_type = D_ALLOCDIRECT; 1238 adp->ad_lbn = lbn; 1239 adp->ad_newblkno = newblkno; 1240 adp->ad_oldblkno = oldblkno; 1241 adp->ad_newsize = newsize; 1242 adp->ad_oldsize = oldsize; 1243 adp->ad_state = ATTACHED; 1244 if (newblkno == oldblkno) 1245 adp->ad_freefrag = NULL; 1246 else 1247 adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize); 1248 1249 if 
(newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0) 1250 panic("softdep_setup_allocdirect: lost block"); 1251 1252 ACQUIRE_LOCK(&lk); 1253 inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC | NODELAY, &inodedep); 1254 adp->ad_inodedep = inodedep; 1255 1256 if (newblk->nb_state == DEPCOMPLETE) { 1257 adp->ad_state |= DEPCOMPLETE; 1258 adp->ad_buf = NULL; 1259 } else { 1260 bmsafemap = newblk->nb_bmsafemap; 1261 adp->ad_buf = bmsafemap->sm_buf; 1262 LIST_REMOVE(newblk, nb_deps); 1263 LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps); 1264 } 1265 LIST_REMOVE(newblk, nb_hash); 1266 kfree(newblk, M_NEWBLK); 1267 1268 WORKLIST_INSERT_BP(bp, &adp->ad_list); 1269 if (lbn >= NDADDR) { 1270 /* allocating an indirect block */ 1271 if (oldblkno != 0) { 1272 panic("softdep_setup_allocdirect: non-zero indir"); 1273 } 1274 } else { 1275 /* 1276 * Allocating a direct block. 1277 * 1278 * If we are allocating a directory block, then we must 1279 * allocate an associated pagedep to track additions and 1280 * deletions. 1281 */ 1282 if ((ip->i_mode & IFMT) == IFDIR && 1283 pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0) { 1284 WORKLIST_INSERT_BP(bp, &pagedep->pd_list); 1285 } 1286 } 1287 /* 1288 * The list of allocdirects must be kept in sorted and ascending 1289 * order so that the rollback routines can quickly determine the 1290 * first uncommitted block (the size of the file stored on disk 1291 * ends at the end of the lowest committed fragment, or if there 1292 * are no fragments, at the end of the highest committed block). 1293 * Since files generally grow, the typical case is that the new 1294 * block is to be added at the end of the list. We speed this 1295 * special case by checking against the last allocdirect in the 1296 * list before laboriously traversing the list looking for the 1297 * insertion point. 
1298 */ 1299 adphead = &inodedep->id_newinoupdt; 1300 oldadp = TAILQ_LAST(adphead, allocdirectlst); 1301 if (oldadp == NULL || oldadp->ad_lbn <= lbn) { 1302 /* insert at end of list */ 1303 TAILQ_INSERT_TAIL(adphead, adp, ad_next); 1304 if (oldadp != NULL && oldadp->ad_lbn == lbn) 1305 allocdirect_merge(adphead, adp, oldadp); 1306 FREE_LOCK(&lk); 1307 return; 1308 } 1309 TAILQ_FOREACH(oldadp, adphead, ad_next) { 1310 if (oldadp->ad_lbn >= lbn) 1311 break; 1312 } 1313 if (oldadp == NULL) { 1314 panic("softdep_setup_allocdirect: lost entry"); 1315 } 1316 /* insert in middle of list */ 1317 TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); 1318 if (oldadp->ad_lbn == lbn) 1319 allocdirect_merge(adphead, adp, oldadp); 1320 FREE_LOCK(&lk); 1321 } 1322 1323 /* 1324 * Replace an old allocdirect dependency with a newer one. 1325 * This routine must be called with splbio interrupts blocked. 1326 * 1327 * Parameters: 1328 * adphead: head of list holding allocdirects 1329 * newadp: allocdirect being added 1330 * oldadp: existing allocdirect being checked 1331 */ 1332 static void 1333 allocdirect_merge(struct allocdirectlst *adphead, 1334 struct allocdirect *newadp, 1335 struct allocdirect *oldadp) 1336 { 1337 struct freefrag *freefrag; 1338 1339 KKASSERT(lock_held(&lk) > 0); 1340 1341 if (newadp->ad_oldblkno != oldadp->ad_newblkno || 1342 newadp->ad_oldsize != oldadp->ad_newsize || 1343 newadp->ad_lbn >= NDADDR) { 1344 panic("allocdirect_check: old %d != new %d || lbn %ld >= %d", 1345 newadp->ad_oldblkno, oldadp->ad_newblkno, newadp->ad_lbn, 1346 NDADDR); 1347 } 1348 newadp->ad_oldblkno = oldadp->ad_oldblkno; 1349 newadp->ad_oldsize = oldadp->ad_oldsize; 1350 /* 1351 * If the old dependency had a fragment to free or had never 1352 * previously had a block allocated, then the new dependency 1353 * can immediately post its freefrag and adopt the old freefrag. 1354 * This action is done by swapping the freefrag dependencies. 
1355 * The new dependency gains the old one's freefrag, and the 1356 * old one gets the new one and then immediately puts it on 1357 * the worklist when it is freed by free_allocdirect. It is 1358 * not possible to do this swap when the old dependency had a 1359 * non-zero size but no previous fragment to free. This condition 1360 * arises when the new block is an extension of the old block. 1361 * Here, the first part of the fragment allocated to the new 1362 * dependency is part of the block currently claimed on disk by 1363 * the old dependency, so cannot legitimately be freed until the 1364 * conditions for the new dependency are fulfilled. 1365 */ 1366 if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) { 1367 freefrag = newadp->ad_freefrag; 1368 newadp->ad_freefrag = oldadp->ad_freefrag; 1369 oldadp->ad_freefrag = freefrag; 1370 } 1371 free_allocdirect(adphead, oldadp, 0); 1372 } 1373 1374 /* 1375 * Allocate a new freefrag structure if needed. 1376 */ 1377 static struct freefrag * 1378 newfreefrag(struct inode *ip, ufs_daddr_t blkno, long size) 1379 { 1380 struct freefrag *freefrag; 1381 struct fs *fs; 1382 1383 if (blkno == 0) 1384 return (NULL); 1385 fs = ip->i_fs; 1386 if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag) 1387 panic("newfreefrag: frag size"); 1388 freefrag = kmalloc(sizeof(struct freefrag), M_FREEFRAG, 1389 M_SOFTDEP_FLAGS); 1390 freefrag->ff_list.wk_type = D_FREEFRAG; 1391 freefrag->ff_state = ip->i_uid & ~ONWORKLIST; /* XXX - used below */ 1392 freefrag->ff_inum = ip->i_number; 1393 freefrag->ff_fs = fs; 1394 freefrag->ff_devvp = ip->i_devvp; 1395 freefrag->ff_blkno = blkno; 1396 freefrag->ff_fragsize = size; 1397 return (freefrag); 1398 } 1399 1400 /* 1401 * This workitem de-allocates fragments that were replaced during 1402 * file block allocation. 
1403 */ 1404 static void 1405 handle_workitem_freefrag(struct freefrag *freefrag) 1406 { 1407 struct inode tip; 1408 1409 tip.i_fs = freefrag->ff_fs; 1410 tip.i_devvp = freefrag->ff_devvp; 1411 tip.i_dev = freefrag->ff_devvp->v_rdev; 1412 tip.i_number = freefrag->ff_inum; 1413 tip.i_uid = freefrag->ff_state & ~ONWORKLIST; /* XXX - set above */ 1414 ffs_blkfree(&tip, freefrag->ff_blkno, freefrag->ff_fragsize); 1415 kfree(freefrag, M_FREEFRAG); 1416 } 1417 1418 /* 1419 * Indirect block allocation dependencies. 1420 * 1421 * The same dependencies that exist for a direct block also exist when 1422 * a new block is allocated and pointed to by an entry in a block of 1423 * indirect pointers. The undo/redo states described above are also 1424 * used here. Because an indirect block contains many pointers that 1425 * may have dependencies, a second copy of the entire in-memory indirect 1426 * block is kept. The buffer cache copy is always completely up-to-date. 1427 * The second copy, which is used only as a source for disk writes, 1428 * contains only the safe pointers (i.e., those that have no remaining 1429 * update dependencies). The second copy is freed when all pointers 1430 * are safe. The cache is not allowed to replace indirect blocks with 1431 * pending update dependencies. If a buffer containing an indirect 1432 * block with dependencies is written, these routines will mark it 1433 * dirty again. It can only be successfully written once all the 1434 * dependencies are removed. The ffs_fsync routine in conjunction with 1435 * softdep_sync_metadata work together to get all the dependencies 1436 * removed so that a file can be successfully written to disk. Three 1437 * procedures are used when setting up indirect block pointer 1438 * dependencies. The division is necessary because of the organization 1439 * of the "balloc" routine and because of the distinction between file 1440 * pages and file metadata blocks. 
1441 */ 1442 1443 /* 1444 * Allocate a new allocindir structure. 1445 * 1446 * Parameters: 1447 * ip: inode for file being extended 1448 * ptrno: offset of pointer in indirect block 1449 * newblkno: disk block number being added 1450 * oldblkno: previous block number, 0 if none 1451 */ 1452 static struct allocindir * 1453 newallocindir(struct inode *ip, int ptrno, ufs_daddr_t newblkno, 1454 ufs_daddr_t oldblkno) 1455 { 1456 struct allocindir *aip; 1457 1458 aip = kmalloc(sizeof(struct allocindir), M_ALLOCINDIR, 1459 M_SOFTDEP_FLAGS | M_ZERO); 1460 aip->ai_list.wk_type = D_ALLOCINDIR; 1461 aip->ai_state = ATTACHED; 1462 aip->ai_offset = ptrno; 1463 aip->ai_newblkno = newblkno; 1464 aip->ai_oldblkno = oldblkno; 1465 aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize); 1466 return (aip); 1467 } 1468 1469 /* 1470 * Called just before setting an indirect block pointer 1471 * to a newly allocated file page. 1472 * 1473 * Parameters: 1474 * ip: inode for file being extended 1475 * lbn: allocated block number within file 1476 * bp: buffer with indirect blk referencing page 1477 * ptrno: offset of pointer in indirect block 1478 * newblkno: disk block number being added 1479 * oldblkno: previous block number, 0 if none 1480 * nbp: buffer holding allocated page 1481 */ 1482 void 1483 softdep_setup_allocindir_page(struct inode *ip, ufs_lbn_t lbn, 1484 struct buf *bp, int ptrno, 1485 ufs_daddr_t newblkno, ufs_daddr_t oldblkno, 1486 struct buf *nbp) 1487 { 1488 struct allocindir *aip; 1489 struct pagedep *pagedep; 1490 1491 aip = newallocindir(ip, ptrno, newblkno, oldblkno); 1492 ACQUIRE_LOCK(&lk); 1493 /* 1494 * If we are allocating a directory page, then we must 1495 * allocate an associated pagedep to track additions and 1496 * deletions. 
1497 */ 1498 if ((ip->i_mode & IFMT) == IFDIR && 1499 pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0) 1500 WORKLIST_INSERT_BP(nbp, &pagedep->pd_list); 1501 WORKLIST_INSERT_BP(nbp, &aip->ai_list); 1502 FREE_LOCK(&lk); 1503 setup_allocindir_phase2(bp, ip, aip); 1504 } 1505 1506 /* 1507 * Called just before setting an indirect block pointer to a 1508 * newly allocated indirect block. 1509 * Parameters: 1510 * nbp: newly allocated indirect block 1511 * ip: inode for file being extended 1512 * bp: indirect block referencing allocated block 1513 * ptrno: offset of pointer in indirect block 1514 * newblkno: disk block number being added 1515 */ 1516 void 1517 softdep_setup_allocindir_meta(struct buf *nbp, struct inode *ip, 1518 struct buf *bp, int ptrno, 1519 ufs_daddr_t newblkno) 1520 { 1521 struct allocindir *aip; 1522 1523 aip = newallocindir(ip, ptrno, newblkno, 0); 1524 ACQUIRE_LOCK(&lk); 1525 WORKLIST_INSERT_BP(nbp, &aip->ai_list); 1526 FREE_LOCK(&lk); 1527 setup_allocindir_phase2(bp, ip, aip); 1528 } 1529 1530 /* 1531 * Called to finish the allocation of the "aip" allocated 1532 * by one of the two routines above. 
1533 * 1534 * Parameters: 1535 * bp: in-memory copy of the indirect block 1536 * ip: inode for file being extended 1537 * aip: allocindir allocated by the above routines 1538 */ 1539 static void 1540 setup_allocindir_phase2(struct buf *bp, struct inode *ip, 1541 struct allocindir *aip) 1542 { 1543 struct worklist *wk; 1544 struct indirdep *indirdep, *newindirdep; 1545 struct bmsafemap *bmsafemap; 1546 struct allocindir *oldaip; 1547 struct freefrag *freefrag; 1548 struct newblk *newblk; 1549 1550 if (bp->b_loffset >= 0) 1551 panic("setup_allocindir_phase2: not indir blk"); 1552 for (indirdep = NULL, newindirdep = NULL; ; ) { 1553 ACQUIRE_LOCK(&lk); 1554 LIST_FOREACH(wk, &bp->b_dep, wk_list) { 1555 if (wk->wk_type != D_INDIRDEP) 1556 continue; 1557 indirdep = WK_INDIRDEP(wk); 1558 break; 1559 } 1560 if (indirdep == NULL && newindirdep) { 1561 indirdep = newindirdep; 1562 WORKLIST_INSERT_BP(bp, &indirdep->ir_list); 1563 newindirdep = NULL; 1564 } 1565 FREE_LOCK(&lk); 1566 if (indirdep) { 1567 if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0, 1568 &newblk) == 0) 1569 panic("setup_allocindir: lost block"); 1570 ACQUIRE_LOCK(&lk); 1571 if (newblk->nb_state == DEPCOMPLETE) { 1572 aip->ai_state |= DEPCOMPLETE; 1573 aip->ai_buf = NULL; 1574 } else { 1575 bmsafemap = newblk->nb_bmsafemap; 1576 aip->ai_buf = bmsafemap->sm_buf; 1577 LIST_REMOVE(newblk, nb_deps); 1578 LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd, 1579 aip, ai_deps); 1580 } 1581 LIST_REMOVE(newblk, nb_hash); 1582 kfree(newblk, M_NEWBLK); 1583 aip->ai_indirdep = indirdep; 1584 /* 1585 * Check to see if there is an existing dependency 1586 * for this block. If there is, merge the old 1587 * dependency into the new one. 
1588 */ 1589 if (aip->ai_oldblkno == 0) 1590 oldaip = NULL; 1591 else 1592 1593 LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next) 1594 if (oldaip->ai_offset == aip->ai_offset) 1595 break; 1596 if (oldaip != NULL) { 1597 if (oldaip->ai_newblkno != aip->ai_oldblkno) { 1598 panic("setup_allocindir_phase2: blkno"); 1599 } 1600 aip->ai_oldblkno = oldaip->ai_oldblkno; 1601 freefrag = oldaip->ai_freefrag; 1602 oldaip->ai_freefrag = aip->ai_freefrag; 1603 aip->ai_freefrag = freefrag; 1604 free_allocindir(oldaip, NULL); 1605 } 1606 LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next); 1607 ((ufs_daddr_t *)indirdep->ir_savebp->b_data) 1608 [aip->ai_offset] = aip->ai_oldblkno; 1609 FREE_LOCK(&lk); 1610 } 1611 if (newindirdep) { 1612 /* 1613 * Avoid any possibility of data corruption by 1614 * ensuring that our old version is thrown away. 1615 */ 1616 newindirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE; 1617 brelse(newindirdep->ir_savebp); 1618 WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP); 1619 } 1620 if (indirdep) 1621 break; 1622 newindirdep = kmalloc(sizeof(struct indirdep), M_INDIRDEP, 1623 M_SOFTDEP_FLAGS); 1624 newindirdep->ir_list.wk_type = D_INDIRDEP; 1625 newindirdep->ir_state = ATTACHED; 1626 LIST_INIT(&newindirdep->ir_deplisthd); 1627 LIST_INIT(&newindirdep->ir_donehd); 1628 if (bp->b_bio2.bio_offset == NOOFFSET) { 1629 VOP_BMAP(bp->b_vp, bp->b_bio1.bio_offset, 1630 &bp->b_bio2.bio_offset, NULL, NULL, 1631 BUF_CMD_WRITE); 1632 } 1633 KKASSERT(bp->b_bio2.bio_offset != NOOFFSET); 1634 newindirdep->ir_savebp = getblk(ip->i_devvp, 1635 bp->b_bio2.bio_offset, 1636 bp->b_bcount, 0, 0); 1637 BUF_KERNPROC(newindirdep->ir_savebp); 1638 bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount); 1639 } 1640 } 1641 1642 /* 1643 * Block de-allocation dependencies. 1644 * 1645 * When blocks are de-allocated, the on-disk pointers must be nullified before 1646 * the blocks are made available for use by other files. 
(The true 1647 * requirement is that old pointers must be nullified before new on-disk 1648 * pointers are set. We chose this slightly more stringent requirement to 1649 * reduce complexity.) Our implementation handles this dependency by updating 1650 * the inode (or indirect block) appropriately but delaying the actual block 1651 * de-allocation (i.e., freemap and free space count manipulation) until 1652 * after the updated versions reach stable storage. After the disk is 1653 * updated, the blocks can be safely de-allocated whenever it is convenient. 1654 * This implementation handles only the common case of reducing a file's 1655 * length to zero. Other cases are handled by the conventional synchronous 1656 * write approach. 1657 * 1658 * The ffs implementation with which we worked double-checks 1659 * the state of the block pointers and file size as it reduces 1660 * a file's length. Some of this code is replicated here in our 1661 * soft updates implementation. The freeblks->fb_chkcnt field is 1662 * used to transfer a part of this information to the procedure 1663 * that eventually de-allocates the blocks. 1664 * 1665 * This routine should be called from the routine that shortens 1666 * a file's length, before the inode's size or block pointers 1667 * are modified. It will save the block pointer information for 1668 * later release and zero the inode so that the calling routine 1669 * can release it. 
1670 */ 1671 struct softdep_setup_freeblocks_info { 1672 struct fs *fs; 1673 struct inode *ip; 1674 }; 1675 1676 static int softdep_setup_freeblocks_bp(struct buf *bp, void *data); 1677 1678 /* 1679 * Parameters: 1680 * ip: The inode whose length is to be reduced 1681 * length: The new length for the file 1682 */ 1683 void 1684 softdep_setup_freeblocks(struct inode *ip, off_t length) 1685 { 1686 struct softdep_setup_freeblocks_info info; 1687 struct freeblks *freeblks; 1688 struct inodedep *inodedep; 1689 struct allocdirect *adp; 1690 struct vnode *vp; 1691 struct buf *bp; 1692 struct fs *fs; 1693 int i, error, delay; 1694 int count; 1695 1696 fs = ip->i_fs; 1697 if (length != 0) 1698 panic("softde_setup_freeblocks: non-zero length"); 1699 freeblks = kmalloc(sizeof(struct freeblks), M_FREEBLKS, 1700 M_SOFTDEP_FLAGS | M_ZERO); 1701 freeblks->fb_list.wk_type = D_FREEBLKS; 1702 freeblks->fb_state = ATTACHED; 1703 freeblks->fb_uid = ip->i_uid; 1704 freeblks->fb_previousinum = ip->i_number; 1705 freeblks->fb_devvp = ip->i_devvp; 1706 freeblks->fb_fs = fs; 1707 freeblks->fb_oldsize = ip->i_size; 1708 freeblks->fb_newsize = length; 1709 freeblks->fb_chkcnt = ip->i_blocks; 1710 for (i = 0; i < NDADDR; i++) { 1711 freeblks->fb_dblks[i] = ip->i_db[i]; 1712 ip->i_db[i] = 0; 1713 } 1714 for (i = 0; i < NIADDR; i++) { 1715 freeblks->fb_iblks[i] = ip->i_ib[i]; 1716 ip->i_ib[i] = 0; 1717 } 1718 ip->i_blocks = 0; 1719 ip->i_size = 0; 1720 /* 1721 * Push the zero'ed inode to to its disk buffer so that we are free 1722 * to delete its dependencies below. Once the dependencies are gone 1723 * the buffer can be safely released. 1724 */ 1725 if ((error = bread(ip->i_devvp, 1726 fsbtodoff(fs, ino_to_fsba(fs, ip->i_number)), 1727 (int)fs->fs_bsize, &bp)) != 0) 1728 softdep_error("softdep_setup_freeblocks", error); 1729 *((struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) = 1730 ip->i_din; 1731 /* 1732 * Find and eliminate any inode dependencies. 
1733 */ 1734 ACQUIRE_LOCK(&lk); 1735 (void) inodedep_lookup(fs, ip->i_number, DEPALLOC, &inodedep); 1736 if ((inodedep->id_state & IOSTARTED) != 0) { 1737 panic("softdep_setup_freeblocks: inode busy"); 1738 } 1739 /* 1740 * Add the freeblks structure to the list of operations that 1741 * must await the zero'ed inode being written to disk. If we 1742 * still have a bitmap dependency (delay == 0), then the inode 1743 * has never been written to disk, so we can process the 1744 * freeblks below once we have deleted the dependencies. 1745 */ 1746 delay = (inodedep->id_state & DEPCOMPLETE); 1747 if (delay) 1748 WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list); 1749 /* 1750 * Because the file length has been truncated to zero, any 1751 * pending block allocation dependency structures associated 1752 * with this inode are obsolete and can simply be de-allocated. 1753 * We must first merge the two dependency lists to get rid of 1754 * any duplicate freefrag structures, then purge the merged list. 1755 */ 1756 merge_inode_lists(inodedep); 1757 while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL) 1758 free_allocdirect(&inodedep->id_inoupdt, adp, 1); 1759 FREE_LOCK(&lk); 1760 bdwrite(bp); 1761 /* 1762 * We must wait for any I/O in progress to finish so that 1763 * all potential buffers on the dirty list will be visible. 1764 * Once they are all there, walk the list and get rid of 1765 * any dependencies. 
1766 */ 1767 vp = ITOV(ip); 1768 ACQUIRE_LOCK(&lk); 1769 drain_output(vp, 1); 1770 1771 info.fs = fs; 1772 info.ip = ip; 1773 lwkt_gettoken(&vp->v_token); 1774 do { 1775 count = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL, 1776 softdep_setup_freeblocks_bp, &info); 1777 } while (count != 0); 1778 lwkt_reltoken(&vp->v_token); 1779 1780 if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) != 0) 1781 (void)free_inodedep(inodedep); 1782 1783 if (delay) { 1784 freeblks->fb_state |= DEPCOMPLETE; 1785 /* 1786 * If the inode with zeroed block pointers is now on disk 1787 * we can start freeing blocks. Add freeblks to the worklist 1788 * instead of calling handle_workitem_freeblocks directly as 1789 * it is more likely that additional IO is needed to complete 1790 * the request here than in the !delay case. 1791 */ 1792 if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) 1793 add_to_worklist(&freeblks->fb_list); 1794 } 1795 1796 FREE_LOCK(&lk); 1797 /* 1798 * If the inode has never been written to disk (delay == 0), 1799 * then we can process the freeblks now that we have deleted 1800 * the dependencies. 
1801 */ 1802 if (!delay) 1803 handle_workitem_freeblocks(freeblks); 1804 } 1805 1806 static int 1807 softdep_setup_freeblocks_bp(struct buf *bp, void *data) 1808 { 1809 struct softdep_setup_freeblocks_info *info = data; 1810 struct inodedep *inodedep; 1811 1812 if (getdirtybuf(&bp, MNT_WAIT) == 0) { 1813 kprintf("softdep_setup_freeblocks_bp(1): caught bp %p going away\n", bp); 1814 return(-1); 1815 } 1816 if (bp->b_vp != ITOV(info->ip) || (bp->b_flags & B_DELWRI) == 0) { 1817 kprintf("softdep_setup_freeblocks_bp(2): caught bp %p going away\n", bp); 1818 BUF_UNLOCK(bp); 1819 return(-1); 1820 } 1821 (void) inodedep_lookup(info->fs, info->ip->i_number, 0, &inodedep); 1822 deallocate_dependencies(bp, inodedep); 1823 bp->b_flags |= B_INVAL | B_NOCACHE; 1824 FREE_LOCK(&lk); 1825 brelse(bp); 1826 ACQUIRE_LOCK(&lk); 1827 return(1); 1828 } 1829 1830 /* 1831 * Reclaim any dependency structures from a buffer that is about to 1832 * be reallocated to a new vnode. The buffer must be locked, thus, 1833 * no I/O completion operations can occur while we are manipulating 1834 * its associated dependencies. The mutex is held so that other I/O's 1835 * associated with related dependencies do not occur. 1836 */ 1837 static void 1838 deallocate_dependencies(struct buf *bp, struct inodedep *inodedep) 1839 { 1840 struct worklist *wk; 1841 struct indirdep *indirdep; 1842 struct allocindir *aip; 1843 struct pagedep *pagedep; 1844 struct dirrem *dirrem; 1845 struct diradd *dap; 1846 int i; 1847 1848 while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { 1849 switch (wk->wk_type) { 1850 1851 case D_INDIRDEP: 1852 indirdep = WK_INDIRDEP(wk); 1853 /* 1854 * None of the indirect pointers will ever be visible, 1855 * so they can simply be tossed. GOINGAWAY ensures 1856 * that allocated pointers will be saved in the buffer 1857 * cache until they are freed. 
Note that they will 1858 * only be able to be found by their physical address 1859 * since the inode mapping the logical address will 1860 * be gone. The save buffer used for the safe copy 1861 * was allocated in setup_allocindir_phase2 using 1862 * the physical address so it could be used for this 1863 * purpose. Hence we swap the safe copy with the real 1864 * copy, allowing the safe copy to be freed and holding 1865 * on to the real copy for later use in indir_trunc. 1866 * 1867 * NOTE: ir_savebp is relative to the block device 1868 * so b_bio1 contains the device block number. 1869 */ 1870 if (indirdep->ir_state & GOINGAWAY) { 1871 panic("deallocate_dependencies: already gone"); 1872 } 1873 indirdep->ir_state |= GOINGAWAY; 1874 while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != NULL) 1875 free_allocindir(aip, inodedep); 1876 if (bp->b_bio1.bio_offset >= 0 || 1877 bp->b_bio2.bio_offset != indirdep->ir_savebp->b_bio1.bio_offset) { 1878 panic("deallocate_dependencies: not indir"); 1879 } 1880 bcopy(bp->b_data, indirdep->ir_savebp->b_data, 1881 bp->b_bcount); 1882 WORKLIST_REMOVE(wk); 1883 WORKLIST_INSERT_BP(indirdep->ir_savebp, wk); 1884 continue; 1885 1886 case D_PAGEDEP: 1887 pagedep = WK_PAGEDEP(wk); 1888 /* 1889 * None of the directory additions will ever be 1890 * visible, so they can simply be tossed. 1891 */ 1892 for (i = 0; i < DAHASHSZ; i++) 1893 while ((dap = 1894 LIST_FIRST(&pagedep->pd_diraddhd[i]))) 1895 free_diradd(dap); 1896 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) 1897 free_diradd(dap); 1898 /* 1899 * Copy any directory remove dependencies to the list 1900 * to be processed after the zero'ed inode is written. 1901 * If the inode has already been written, then they 1902 * can be dumped directly onto the work list. 
1903 */ 1904 LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) { 1905 LIST_REMOVE(dirrem, dm_next); 1906 dirrem->dm_dirinum = pagedep->pd_ino; 1907 if (inodedep == NULL || 1908 (inodedep->id_state & ALLCOMPLETE) == 1909 ALLCOMPLETE) 1910 add_to_worklist(&dirrem->dm_list); 1911 else 1912 WORKLIST_INSERT(&inodedep->id_bufwait, 1913 &dirrem->dm_list); 1914 } 1915 WORKLIST_REMOVE(&pagedep->pd_list); 1916 LIST_REMOVE(pagedep, pd_hash); 1917 WORKITEM_FREE(pagedep, D_PAGEDEP); 1918 continue; 1919 1920 case D_ALLOCINDIR: 1921 free_allocindir(WK_ALLOCINDIR(wk), inodedep); 1922 continue; 1923 1924 case D_ALLOCDIRECT: 1925 case D_INODEDEP: 1926 panic("deallocate_dependencies: Unexpected type %s", 1927 TYPENAME(wk->wk_type)); 1928 /* NOTREACHED */ 1929 1930 default: 1931 panic("deallocate_dependencies: Unknown type %s", 1932 TYPENAME(wk->wk_type)); 1933 /* NOTREACHED */ 1934 } 1935 } 1936 } 1937 1938 /* 1939 * Free an allocdirect. Generate a new freefrag work request if appropriate. 1940 * This routine must be called with splbio interrupts blocked. 1941 */ 1942 static void 1943 free_allocdirect(struct allocdirectlst *adphead, 1944 struct allocdirect *adp, int delay) 1945 { 1946 KKASSERT(lock_held(&lk) > 0); 1947 1948 if ((adp->ad_state & DEPCOMPLETE) == 0) 1949 LIST_REMOVE(adp, ad_deps); 1950 TAILQ_REMOVE(adphead, adp, ad_next); 1951 if ((adp->ad_state & COMPLETE) == 0) 1952 WORKLIST_REMOVE(&adp->ad_list); 1953 if (adp->ad_freefrag != NULL) { 1954 if (delay) 1955 WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait, 1956 &adp->ad_freefrag->ff_list); 1957 else 1958 add_to_worklist(&adp->ad_freefrag->ff_list); 1959 } 1960 WORKITEM_FREE(adp, D_ALLOCDIRECT); 1961 } 1962 1963 /* 1964 * Prepare an inode to be freed. The actual free operation is not 1965 * done until the zero'ed inode has been written to disk. 
1966 */ 1967 void 1968 softdep_freefile(struct vnode *pvp, ino_t ino, int mode) 1969 { 1970 struct inode *ip = VTOI(pvp); 1971 struct inodedep *inodedep; 1972 struct freefile *freefile; 1973 1974 /* 1975 * This sets up the inode de-allocation dependency. 1976 */ 1977 freefile = kmalloc(sizeof(struct freefile), M_FREEFILE, 1978 M_SOFTDEP_FLAGS); 1979 freefile->fx_list.wk_type = D_FREEFILE; 1980 freefile->fx_list.wk_state = 0; 1981 freefile->fx_mode = mode; 1982 freefile->fx_oldinum = ino; 1983 freefile->fx_devvp = ip->i_devvp; 1984 freefile->fx_fs = ip->i_fs; 1985 1986 /* 1987 * If the inodedep does not exist, then the zero'ed inode has 1988 * been written to disk. If the allocated inode has never been 1989 * written to disk, then the on-disk inode is zero'ed. In either 1990 * case we can free the file immediately. 1991 */ 1992 ACQUIRE_LOCK(&lk); 1993 if (inodedep_lookup(ip->i_fs, ino, 0, &inodedep) == 0 || 1994 check_inode_unwritten(inodedep)) { 1995 FREE_LOCK(&lk); 1996 handle_workitem_freefile(freefile); 1997 return; 1998 } 1999 WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list); 2000 FREE_LOCK(&lk); 2001 } 2002 2003 /* 2004 * Check to see if an inode has never been written to disk. If 2005 * so free the inodedep and return success, otherwise return failure. 2006 * This routine must be called with splbio interrupts blocked. 2007 * 2008 * If we still have a bitmap dependency, then the inode has never 2009 * been written to disk. Drop the dependency as it is no longer 2010 * necessary since the inode is being deallocated. We set the 2011 * ALLCOMPLETE flags since the bitmap now properly shows that the 2012 * inode is not allocated. Even if the inode is actively being 2013 * written, it has been rolled back to its zero'ed state, so we 2014 * are ensured that a zero inode is what is on the disk. For short 2015 * lived files, this change will usually result in removing all the 2016 * dependencies from the inode so that it can be freed immediately. 
2017 */ 2018 static int 2019 check_inode_unwritten(struct inodedep *inodedep) 2020 { 2021 2022 if ((inodedep->id_state & DEPCOMPLETE) != 0 || 2023 LIST_FIRST(&inodedep->id_pendinghd) != NULL || 2024 LIST_FIRST(&inodedep->id_bufwait) != NULL || 2025 LIST_FIRST(&inodedep->id_inowait) != NULL || 2026 TAILQ_FIRST(&inodedep->id_inoupdt) != NULL || 2027 TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL || 2028 inodedep->id_nlinkdelta != 0) 2029 return (0); 2030 2031 /* 2032 * Another process might be in initiate_write_inodeblock 2033 * trying to allocate memory without holding "Softdep Lock". 2034 */ 2035 if ((inodedep->id_state & IOSTARTED) != 0 && 2036 inodedep->id_savedino == NULL) 2037 return(0); 2038 2039 inodedep->id_state |= ALLCOMPLETE; 2040 LIST_REMOVE(inodedep, id_deps); 2041 inodedep->id_buf = NULL; 2042 if (inodedep->id_state & ONWORKLIST) 2043 WORKLIST_REMOVE(&inodedep->id_list); 2044 if (inodedep->id_savedino != NULL) { 2045 kfree(inodedep->id_savedino, M_INODEDEP); 2046 inodedep->id_savedino = NULL; 2047 } 2048 if (free_inodedep(inodedep) == 0) { 2049 panic("check_inode_unwritten: busy inode"); 2050 } 2051 return (1); 2052 } 2053 2054 /* 2055 * Try to free an inodedep structure. Return 1 if it could be freed. 2056 */ 2057 static int 2058 free_inodedep(struct inodedep *inodedep) 2059 { 2060 2061 if ((inodedep->id_state & ONWORKLIST) != 0 || 2062 (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE || 2063 LIST_FIRST(&inodedep->id_pendinghd) != NULL || 2064 LIST_FIRST(&inodedep->id_bufwait) != NULL || 2065 LIST_FIRST(&inodedep->id_inowait) != NULL || 2066 TAILQ_FIRST(&inodedep->id_inoupdt) != NULL || 2067 TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL || 2068 inodedep->id_nlinkdelta != 0 || inodedep->id_savedino != NULL) 2069 return (0); 2070 LIST_REMOVE(inodedep, id_hash); 2071 WORKITEM_FREE(inodedep, D_INODEDEP); 2072 num_inodedep -= 1; 2073 return (1); 2074 } 2075 2076 /* 2077 * This workitem routine performs the block de-allocation. 
2078 * The workitem is added to the pending list after the updated 2079 * inode block has been written to disk. As mentioned above, 2080 * checks regarding the number of blocks de-allocated (compared 2081 * to the number of blocks allocated for the file) are also 2082 * performed in this function. 2083 */ 2084 static void 2085 handle_workitem_freeblocks(struct freeblks *freeblks) 2086 { 2087 struct inode tip; 2088 ufs_daddr_t bn; 2089 struct fs *fs; 2090 int i, level, bsize; 2091 long nblocks, blocksreleased = 0; 2092 int error, allerror = 0; 2093 ufs_lbn_t baselbns[NIADDR], tmpval; 2094 2095 tip.i_number = freeblks->fb_previousinum; 2096 tip.i_devvp = freeblks->fb_devvp; 2097 tip.i_dev = freeblks->fb_devvp->v_rdev; 2098 tip.i_fs = freeblks->fb_fs; 2099 tip.i_size = freeblks->fb_oldsize; 2100 tip.i_uid = freeblks->fb_uid; 2101 fs = freeblks->fb_fs; 2102 tmpval = 1; 2103 baselbns[0] = NDADDR; 2104 for (i = 1; i < NIADDR; i++) { 2105 tmpval *= NINDIR(fs); 2106 baselbns[i] = baselbns[i - 1] + tmpval; 2107 } 2108 nblocks = btodb(fs->fs_bsize); 2109 blocksreleased = 0; 2110 /* 2111 * Indirect blocks first. 2112 */ 2113 for (level = (NIADDR - 1); level >= 0; level--) { 2114 if ((bn = freeblks->fb_iblks[level]) == 0) 2115 continue; 2116 if ((error = indir_trunc(&tip, fsbtodoff(fs, bn), level, 2117 baselbns[level], &blocksreleased)) == 0) 2118 allerror = error; 2119 ffs_blkfree(&tip, bn, fs->fs_bsize); 2120 blocksreleased += nblocks; 2121 } 2122 /* 2123 * All direct blocks or frags. 
2124 */ 2125 for (i = (NDADDR - 1); i >= 0; i--) { 2126 if ((bn = freeblks->fb_dblks[i]) == 0) 2127 continue; 2128 bsize = blksize(fs, &tip, i); 2129 ffs_blkfree(&tip, bn, bsize); 2130 blocksreleased += btodb(bsize); 2131 } 2132 2133 #ifdef DIAGNOSTIC 2134 if (freeblks->fb_chkcnt != blocksreleased) 2135 kprintf("handle_workitem_freeblocks: block count\n"); 2136 if (allerror) 2137 softdep_error("handle_workitem_freeblks", allerror); 2138 #endif /* DIAGNOSTIC */ 2139 WORKITEM_FREE(freeblks, D_FREEBLKS); 2140 } 2141 2142 /* 2143 * Release blocks associated with the inode ip and stored in the indirect 2144 * block at doffset. If level is greater than SINGLE, the block is an 2145 * indirect block and recursive calls to indirtrunc must be used to 2146 * cleanse other indirect blocks. 2147 */ 2148 static int 2149 indir_trunc(struct inode *ip, off_t doffset, int level, ufs_lbn_t lbn, 2150 long *countp) 2151 { 2152 struct buf *bp; 2153 ufs_daddr_t *bap; 2154 ufs_daddr_t nb; 2155 struct fs *fs; 2156 struct worklist *wk; 2157 struct indirdep *indirdep; 2158 int i, lbnadd, nblocks; 2159 int error, allerror = 0; 2160 2161 fs = ip->i_fs; 2162 lbnadd = 1; 2163 for (i = level; i > 0; i--) 2164 lbnadd *= NINDIR(fs); 2165 /* 2166 * Get buffer of block pointers to be freed. This routine is not 2167 * called until the zero'ed inode has been written, so it is safe 2168 * to free blocks as they are encountered. Because the inode has 2169 * been zero'ed, calls to bmap on these blocks will fail. So, we 2170 * have to use the on-disk address and the block device for the 2171 * filesystem to look them up. If the file was deleted before its 2172 * indirect blocks were all written to disk, the routine that set 2173 * us up (deallocate_dependencies) will have arranged to leave 2174 * a complete copy of the indirect block in memory for our use. 2175 * Otherwise we have to read the blocks in from the disk. 
2176 */ 2177 ACQUIRE_LOCK(&lk); 2178 if ((bp = findblk(ip->i_devvp, doffset, FINDBLK_TEST)) != NULL && 2179 (wk = LIST_FIRST(&bp->b_dep)) != NULL) { 2180 /* 2181 * bp must be ir_savebp, which is held locked for our use. 2182 */ 2183 if (wk->wk_type != D_INDIRDEP || 2184 (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp || 2185 (indirdep->ir_state & GOINGAWAY) == 0) { 2186 panic("indir_trunc: lost indirdep"); 2187 } 2188 WORKLIST_REMOVE(wk); 2189 WORKITEM_FREE(indirdep, D_INDIRDEP); 2190 if (LIST_FIRST(&bp->b_dep) != NULL) { 2191 panic("indir_trunc: dangling dep"); 2192 } 2193 FREE_LOCK(&lk); 2194 } else { 2195 FREE_LOCK(&lk); 2196 error = bread(ip->i_devvp, doffset, (int)fs->fs_bsize, &bp); 2197 if (error) 2198 return (error); 2199 } 2200 /* 2201 * Recursively free indirect blocks. 2202 */ 2203 bap = (ufs_daddr_t *)bp->b_data; 2204 nblocks = btodb(fs->fs_bsize); 2205 for (i = NINDIR(fs) - 1; i >= 0; i--) { 2206 if ((nb = bap[i]) == 0) 2207 continue; 2208 if (level != 0) { 2209 if ((error = indir_trunc(ip, fsbtodoff(fs, nb), 2210 level - 1, lbn + (i * lbnadd), countp)) != 0) 2211 allerror = error; 2212 } 2213 ffs_blkfree(ip, nb, fs->fs_bsize); 2214 *countp += nblocks; 2215 } 2216 bp->b_flags |= B_INVAL | B_NOCACHE; 2217 brelse(bp); 2218 return (allerror); 2219 } 2220 2221 /* 2222 * Free an allocindir. 2223 * This routine must be called with splbio interrupts blocked. 
2224 */ 2225 static void 2226 free_allocindir(struct allocindir *aip, struct inodedep *inodedep) 2227 { 2228 struct freefrag *freefrag; 2229 2230 KKASSERT(lock_held(&lk) > 0); 2231 2232 if ((aip->ai_state & DEPCOMPLETE) == 0) 2233 LIST_REMOVE(aip, ai_deps); 2234 if (aip->ai_state & ONWORKLIST) 2235 WORKLIST_REMOVE(&aip->ai_list); 2236 LIST_REMOVE(aip, ai_next); 2237 if ((freefrag = aip->ai_freefrag) != NULL) { 2238 if (inodedep == NULL) 2239 add_to_worklist(&freefrag->ff_list); 2240 else 2241 WORKLIST_INSERT(&inodedep->id_bufwait, 2242 &freefrag->ff_list); 2243 } 2244 WORKITEM_FREE(aip, D_ALLOCINDIR); 2245 } 2246 2247 /* 2248 * Directory entry addition dependencies. 2249 * 2250 * When adding a new directory entry, the inode (with its incremented link 2251 * count) must be written to disk before the directory entry's pointer to it. 2252 * Also, if the inode is newly allocated, the corresponding freemap must be 2253 * updated (on disk) before the directory entry's pointer. These requirements 2254 * are met via undo/redo on the directory entry's pointer, which consists 2255 * simply of the inode number. 2256 * 2257 * As directory entries are added and deleted, the free space within a 2258 * directory block can become fragmented. The ufs filesystem will compact 2259 * a fragmented directory block to make space for a new entry. When this 2260 * occurs, the offsets of previously added entries change. Any "diradd" 2261 * dependency structures corresponding to these entries must be updated with 2262 * the new offsets. 2263 */ 2264 2265 /* 2266 * This routine is called after the in-memory inode's link 2267 * count has been incremented, but before the directory entry's 2268 * pointer to the inode has been set. 
2269 * 2270 * Parameters: 2271 * bp: buffer containing directory block 2272 * dp: inode for directory 2273 * diroffset: offset of new entry in directory 2274 * newinum: inode referenced by new directory entry 2275 * newdirbp: non-NULL => contents of new mkdir 2276 */ 2277 void 2278 softdep_setup_directory_add(struct buf *bp, struct inode *dp, off_t diroffset, 2279 ino_t newinum, struct buf *newdirbp) 2280 { 2281 int offset; /* offset of new entry within directory block */ 2282 ufs_lbn_t lbn; /* block in directory containing new entry */ 2283 struct fs *fs; 2284 struct diradd *dap; 2285 struct pagedep *pagedep; 2286 struct inodedep *inodedep; 2287 struct mkdir *mkdir1, *mkdir2; 2288 2289 /* 2290 * Whiteouts have no dependencies. 2291 */ 2292 if (newinum == WINO) { 2293 if (newdirbp != NULL) 2294 bdwrite(newdirbp); 2295 return; 2296 } 2297 2298 fs = dp->i_fs; 2299 lbn = lblkno(fs, diroffset); 2300 offset = blkoff(fs, diroffset); 2301 dap = kmalloc(sizeof(struct diradd), M_DIRADD, 2302 M_SOFTDEP_FLAGS | M_ZERO); 2303 dap->da_list.wk_type = D_DIRADD; 2304 dap->da_offset = offset; 2305 dap->da_newinum = newinum; 2306 dap->da_state = ATTACHED; 2307 if (newdirbp == NULL) { 2308 dap->da_state |= DEPCOMPLETE; 2309 ACQUIRE_LOCK(&lk); 2310 } else { 2311 dap->da_state |= MKDIR_BODY | MKDIR_PARENT; 2312 mkdir1 = kmalloc(sizeof(struct mkdir), M_MKDIR, 2313 M_SOFTDEP_FLAGS); 2314 mkdir1->md_list.wk_type = D_MKDIR; 2315 mkdir1->md_state = MKDIR_BODY; 2316 mkdir1->md_diradd = dap; 2317 mkdir2 = kmalloc(sizeof(struct mkdir), M_MKDIR, 2318 M_SOFTDEP_FLAGS); 2319 mkdir2->md_list.wk_type = D_MKDIR; 2320 mkdir2->md_state = MKDIR_PARENT; 2321 mkdir2->md_diradd = dap; 2322 /* 2323 * Dependency on "." and ".." being written to disk. 
2324 */ 2325 mkdir1->md_buf = newdirbp; 2326 ACQUIRE_LOCK(&lk); 2327 LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs); 2328 WORKLIST_INSERT_BP(newdirbp, &mkdir1->md_list); 2329 FREE_LOCK(&lk); 2330 bdwrite(newdirbp); 2331 /* 2332 * Dependency on link count increase for parent directory 2333 */ 2334 ACQUIRE_LOCK(&lk); 2335 if (inodedep_lookup(dp->i_fs, dp->i_number, 0, &inodedep) == 0 2336 || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { 2337 dap->da_state &= ~MKDIR_PARENT; 2338 WORKITEM_FREE(mkdir2, D_MKDIR); 2339 } else { 2340 LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs); 2341 WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list); 2342 } 2343 } 2344 /* 2345 * Link into parent directory pagedep to await its being written. 2346 */ 2347 if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0) 2348 WORKLIST_INSERT_BP(bp, &pagedep->pd_list); 2349 dap->da_pagedep = pagedep; 2350 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, 2351 da_pdlist); 2352 /* 2353 * Link into its inodedep. Put it on the id_bufwait list if the inode 2354 * is not yet written. If it is written, do the post-inode write 2355 * processing to put it on the id_pendinghd list. 2356 */ 2357 (void) inodedep_lookup(fs, newinum, DEPALLOC, &inodedep); 2358 if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) 2359 diradd_inode_written(dap, inodedep); 2360 else 2361 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); 2362 FREE_LOCK(&lk); 2363 } 2364 2365 /* 2366 * This procedure is called to change the offset of a directory 2367 * entry when compacting a directory block which must be owned 2368 * exclusively by the caller. Note that the actual entry movement 2369 * must be done in this procedure to ensure that no I/O completions 2370 * occur while the move is in progress. 
2371 * 2372 * Parameters: 2373 * dp: inode for directory 2374 * base: address of dp->i_offset 2375 * oldloc: address of old directory location 2376 * newloc: address of new directory location 2377 * entrysize: size of directory entry 2378 */ 2379 void 2380 softdep_change_directoryentry_offset(struct inode *dp, caddr_t base, 2381 caddr_t oldloc, caddr_t newloc, 2382 int entrysize) 2383 { 2384 int offset, oldoffset, newoffset; 2385 struct pagedep *pagedep; 2386 struct diradd *dap; 2387 ufs_lbn_t lbn; 2388 2389 ACQUIRE_LOCK(&lk); 2390 lbn = lblkno(dp->i_fs, dp->i_offset); 2391 offset = blkoff(dp->i_fs, dp->i_offset); 2392 if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0) 2393 goto done; 2394 oldoffset = offset + (oldloc - base); 2395 newoffset = offset + (newloc - base); 2396 2397 LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) { 2398 if (dap->da_offset != oldoffset) 2399 continue; 2400 dap->da_offset = newoffset; 2401 if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset)) 2402 break; 2403 LIST_REMOVE(dap, da_pdlist); 2404 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)], 2405 dap, da_pdlist); 2406 break; 2407 } 2408 if (dap == NULL) { 2409 2410 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) { 2411 if (dap->da_offset == oldoffset) { 2412 dap->da_offset = newoffset; 2413 break; 2414 } 2415 } 2416 } 2417 done: 2418 bcopy(oldloc, newloc, entrysize); 2419 FREE_LOCK(&lk); 2420 } 2421 2422 /* 2423 * Free a diradd dependency structure. This routine must be called 2424 * with splbio interrupts blocked. 
2425 */ 2426 static void 2427 free_diradd(struct diradd *dap) 2428 { 2429 struct dirrem *dirrem; 2430 struct pagedep *pagedep; 2431 struct inodedep *inodedep; 2432 struct mkdir *mkdir, *nextmd; 2433 2434 KKASSERT(lock_held(&lk) > 0); 2435 2436 WORKLIST_REMOVE(&dap->da_list); 2437 LIST_REMOVE(dap, da_pdlist); 2438 if ((dap->da_state & DIRCHG) == 0) { 2439 pagedep = dap->da_pagedep; 2440 } else { 2441 dirrem = dap->da_previous; 2442 pagedep = dirrem->dm_pagedep; 2443 dirrem->dm_dirinum = pagedep->pd_ino; 2444 add_to_worklist(&dirrem->dm_list); 2445 } 2446 if (inodedep_lookup(VFSTOUFS(pagedep->pd_mnt)->um_fs, dap->da_newinum, 2447 0, &inodedep) != 0) 2448 (void) free_inodedep(inodedep); 2449 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { 2450 for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) { 2451 nextmd = LIST_NEXT(mkdir, md_mkdirs); 2452 if (mkdir->md_diradd != dap) 2453 continue; 2454 dap->da_state &= ~mkdir->md_state; 2455 WORKLIST_REMOVE(&mkdir->md_list); 2456 LIST_REMOVE(mkdir, md_mkdirs); 2457 WORKITEM_FREE(mkdir, D_MKDIR); 2458 } 2459 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { 2460 panic("free_diradd: unfound ref"); 2461 } 2462 } 2463 WORKITEM_FREE(dap, D_DIRADD); 2464 } 2465 2466 /* 2467 * Directory entry removal dependencies. 2468 * 2469 * When removing a directory entry, the entry's inode pointer must be 2470 * zero'ed on disk before the corresponding inode's link count is decremented 2471 * (possibly freeing the inode for re-use). This dependency is handled by 2472 * updating the directory entry but delaying the inode count reduction until 2473 * after the directory block has been written to disk. After this point, the 2474 * inode count can be decremented whenever it is convenient. 2475 */ 2476 2477 /* 2478 * This routine should be called immediately after removing 2479 * a directory entry. 
The inode's link count should not be 2480 * decremented by the calling procedure -- the soft updates 2481 * code will do this task when it is safe. 2482 * 2483 * Parameters: 2484 * bp: buffer containing directory block 2485 * dp: inode for the directory being modified 2486 * ip: inode for directory entry being removed 2487 * isrmdir: indicates if doing RMDIR 2488 */ 2489 void 2490 softdep_setup_remove(struct buf *bp, struct inode *dp, struct inode *ip, 2491 int isrmdir) 2492 { 2493 struct dirrem *dirrem, *prevdirrem; 2494 2495 /* 2496 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK. 2497 */ 2498 dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem); 2499 2500 /* 2501 * If the COMPLETE flag is clear, then there were no active 2502 * entries and we want to roll back to a zeroed entry until 2503 * the new inode is committed to disk. If the COMPLETE flag is 2504 * set then we have deleted an entry that never made it to 2505 * disk. If the entry we deleted resulted from a name change, 2506 * then the old name still resides on disk. We cannot delete 2507 * its inode (returned to us in prevdirrem) until the zeroed 2508 * directory entry gets to disk. The new inode has never been 2509 * referenced on the disk, so can be deleted immediately. 2510 */ 2511 if ((dirrem->dm_state & COMPLETE) == 0) { 2512 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem, 2513 dm_next); 2514 FREE_LOCK(&lk); 2515 } else { 2516 if (prevdirrem != NULL) 2517 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, 2518 prevdirrem, dm_next); 2519 dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino; 2520 FREE_LOCK(&lk); 2521 handle_workitem_remove(dirrem); 2522 } 2523 } 2524 2525 /* 2526 * Allocate a new dirrem if appropriate and return it along with 2527 * its associated pagedep. Called without a lock, returns with lock. 
2528 */ 2529 static long num_dirrem; /* number of dirrem allocated */ 2530 2531 /* 2532 * Parameters: 2533 * bp: buffer containing directory block 2534 * dp: inode for the directory being modified 2535 * ip: inode for directory entry being removed 2536 * isrmdir: indicates if doing RMDIR 2537 * prevdirremp: previously referenced inode, if any 2538 */ 2539 static struct dirrem * 2540 newdirrem(struct buf *bp, struct inode *dp, struct inode *ip, 2541 int isrmdir, struct dirrem **prevdirremp) 2542 { 2543 int offset; 2544 ufs_lbn_t lbn; 2545 struct diradd *dap; 2546 struct dirrem *dirrem; 2547 struct pagedep *pagedep; 2548 2549 /* 2550 * Whiteouts have no deletion dependencies. 2551 */ 2552 if (ip == NULL) 2553 panic("newdirrem: whiteout"); 2554 /* 2555 * If we are over our limit, try to improve the situation. 2556 * Limiting the number of dirrem structures will also limit 2557 * the number of freefile and freeblks structures. 2558 */ 2559 if (num_dirrem > max_softdeps / 2 && speedup_syncer() == 0) 2560 (void) request_cleanup(FLUSH_REMOVE, 0); 2561 num_dirrem += 1; 2562 dirrem = kmalloc(sizeof(struct dirrem), M_DIRREM, 2563 M_SOFTDEP_FLAGS | M_ZERO); 2564 dirrem->dm_list.wk_type = D_DIRREM; 2565 dirrem->dm_state = isrmdir ? RMDIR : 0; 2566 dirrem->dm_mnt = ITOV(ip)->v_mount; 2567 dirrem->dm_oldinum = ip->i_number; 2568 *prevdirremp = NULL; 2569 2570 ACQUIRE_LOCK(&lk); 2571 lbn = lblkno(dp->i_fs, dp->i_offset); 2572 offset = blkoff(dp->i_fs, dp->i_offset); 2573 if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0) 2574 WORKLIST_INSERT_BP(bp, &pagedep->pd_list); 2575 dirrem->dm_pagedep = pagedep; 2576 /* 2577 * Check for a diradd dependency for the same directory entry. 2578 * If present, then both dependencies become obsolete and can 2579 * be de-allocated. Check for an entry on both the pd_dirraddhd 2580 * list and the pd_pendinghd list. 
2581 */ 2582 2583 LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist) 2584 if (dap->da_offset == offset) 2585 break; 2586 if (dap == NULL) { 2587 2588 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) 2589 if (dap->da_offset == offset) 2590 break; 2591 if (dap == NULL) 2592 return (dirrem); 2593 } 2594 /* 2595 * Must be ATTACHED at this point. 2596 */ 2597 if ((dap->da_state & ATTACHED) == 0) { 2598 panic("newdirrem: not ATTACHED"); 2599 } 2600 if (dap->da_newinum != ip->i_number) { 2601 panic("newdirrem: inum %"PRId64" should be %"PRId64, 2602 ip->i_number, dap->da_newinum); 2603 } 2604 /* 2605 * If we are deleting a changed name that never made it to disk, 2606 * then return the dirrem describing the previous inode (which 2607 * represents the inode currently referenced from this entry on disk). 2608 */ 2609 if ((dap->da_state & DIRCHG) != 0) { 2610 *prevdirremp = dap->da_previous; 2611 dap->da_state &= ~DIRCHG; 2612 dap->da_pagedep = pagedep; 2613 } 2614 /* 2615 * We are deleting an entry that never made it to disk. 2616 * Mark it COMPLETE so we can delete its inode immediately. 2617 */ 2618 dirrem->dm_state |= COMPLETE; 2619 free_diradd(dap); 2620 return (dirrem); 2621 } 2622 2623 /* 2624 * Directory entry change dependencies. 2625 * 2626 * Changing an existing directory entry requires that an add operation 2627 * be completed first followed by a deletion. The semantics for the addition 2628 * are identical to the description of adding a new entry above except 2629 * that the rollback is to the old inode number rather than zero. Once 2630 * the addition dependency is completed, the removal is done as described 2631 * in the removal routine above. 2632 */ 2633 2634 /* 2635 * This routine should be called immediately after changing 2636 * a directory entry. The inode's link count should not be 2637 * decremented by the calling procedure -- the soft updates 2638 * code will perform this task when it is safe. 
2639 * 2640 * Parameters: 2641 * bp: buffer containing directory block 2642 * dp: inode for the directory being modified 2643 * ip: inode for directory entry being removed 2644 * newinum: new inode number for changed entry 2645 * isrmdir: indicates if doing RMDIR 2646 */ 2647 void 2648 softdep_setup_directory_change(struct buf *bp, struct inode *dp, 2649 struct inode *ip, ino_t newinum, 2650 int isrmdir) 2651 { 2652 int offset; 2653 struct diradd *dap = NULL; 2654 struct dirrem *dirrem, *prevdirrem; 2655 struct pagedep *pagedep; 2656 struct inodedep *inodedep; 2657 2658 offset = blkoff(dp->i_fs, dp->i_offset); 2659 2660 /* 2661 * Whiteouts do not need diradd dependencies. 2662 */ 2663 if (newinum != WINO) { 2664 dap = kmalloc(sizeof(struct diradd), M_DIRADD, 2665 M_SOFTDEP_FLAGS | M_ZERO); 2666 dap->da_list.wk_type = D_DIRADD; 2667 dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE; 2668 dap->da_offset = offset; 2669 dap->da_newinum = newinum; 2670 } 2671 2672 /* 2673 * Allocate a new dirrem and ACQUIRE_LOCK. 2674 */ 2675 dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem); 2676 pagedep = dirrem->dm_pagedep; 2677 /* 2678 * The possible values for isrmdir: 2679 * 0 - non-directory file rename 2680 * 1 - directory rename within same directory 2681 * inum - directory rename to new directory of given inode number 2682 * When renaming to a new directory, we are both deleting and 2683 * creating a new directory entry, so the link count on the new 2684 * directory should not change. Thus we do not need the followup 2685 * dirrem which is usually done in handle_workitem_remove. We set 2686 * the DIRCHG flag to tell handle_workitem_remove to skip the 2687 * followup dirrem. 2688 */ 2689 if (isrmdir > 1) 2690 dirrem->dm_state |= DIRCHG; 2691 2692 /* 2693 * Whiteouts have no additional dependencies, 2694 * so just put the dirrem on the correct list. 
2695 */ 2696 if (newinum == WINO) { 2697 if ((dirrem->dm_state & COMPLETE) == 0) { 2698 LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem, 2699 dm_next); 2700 } else { 2701 dirrem->dm_dirinum = pagedep->pd_ino; 2702 add_to_worklist(&dirrem->dm_list); 2703 } 2704 FREE_LOCK(&lk); 2705 return; 2706 } 2707 2708 /* 2709 * If the COMPLETE flag is clear, then there were no active 2710 * entries and we want to roll back to the previous inode until 2711 * the new inode is committed to disk. If the COMPLETE flag is 2712 * set, then we have deleted an entry that never made it to disk. 2713 * If the entry we deleted resulted from a name change, then the old 2714 * inode reference still resides on disk. Any rollback that we do 2715 * needs to be to that old inode (returned to us in prevdirrem). If 2716 * the entry we deleted resulted from a create, then there is 2717 * no entry on the disk, so we want to roll back to zero rather 2718 * than the uncommitted inode. In either of the COMPLETE cases we 2719 * want to immediately free the unwritten and unreferenced inode. 2720 */ 2721 if ((dirrem->dm_state & COMPLETE) == 0) { 2722 dap->da_previous = dirrem; 2723 } else { 2724 if (prevdirrem != NULL) { 2725 dap->da_previous = prevdirrem; 2726 } else { 2727 dap->da_state &= ~DIRCHG; 2728 dap->da_pagedep = pagedep; 2729 } 2730 dirrem->dm_dirinum = pagedep->pd_ino; 2731 add_to_worklist(&dirrem->dm_list); 2732 } 2733 /* 2734 * Link into its inodedep. Put it on the id_bufwait list if the inode 2735 * is not yet written. If it is written, do the post-inode write 2736 * processing to put it on the id_pendinghd list. 
*/
	if (inodedep_lookup(dp->i_fs, newinum, DEPALLOC, &inodedep) == 0 ||
	    (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
		dap->da_state |= COMPLETE;
		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
		WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
	} else {
		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
		    dap, da_pdlist);
		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
	}
	FREE_LOCK(&lk);
}

/*
 * Called whenever the link count on an inode is changed.
 * It creates an inode dependency so that the new reference(s)
 * to the inode cannot be committed to disk until the updated
 * inode has been written.
 *
 * The difference between the actual link count (i_nlink) and the
 * effective link count (i_effnlink) is recorded in the inodedep's
 * id_nlinkdelta.  The actual count must never be below the effective
 * count; that condition panics.
 *
 * Parameters:
 *	ip:	the inode whose link count has changed
 */
void
softdep_change_linkcnt(struct inode *ip)
{
	struct inodedep *inodedep;

	ACQUIRE_LOCK(&lk);
	/*
	 * The return value is deliberately ignored and inodedep is used
	 * unconditionally below; presumably DEPALLOC makes the lookup
	 * create the inodedep when none exists -- TODO confirm against
	 * inodedep_lookup().
	 */
	(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep);
	if (ip->i_nlink < ip->i_effnlink) {
		panic("softdep_change_linkcnt: bad delta");
	}
	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
	FREE_LOCK(&lk);
}

/*
 * This workitem decrements the inode's link count.
 * If the link count reaches zero, the file is removed.
 *
 * For a plain removal (RMDIR clear in dm_state) the link count of the
 * inode named by dm_oldinum is dropped by one and the dirrem is freed.
 * For a directory removal the count is dropped by two, the directory
 * is truncated to zero length, and (unless DIRCHG is set) the same
 * dirrem is then re-targeted at the parent directory (dm_dirinum) so
 * that the parent's count can be dropped as well.
 *
 * Parameters:
 *	dirrem:	the directory-removal work item to process; it is either
 *		freed here, handed to a recursive call, or queued on the
 *		inodedep's id_inowait list
 */
static void
handle_workitem_remove(struct dirrem *dirrem)
{
	struct inodedep *inodedep;
	struct vnode *vp;
	struct inode *ip;
	ino_t oldinum;
	int error;

	error = VFS_VGET(dirrem->dm_mnt, NULL, dirrem->dm_oldinum, &vp);
	if (error) {
		/*
		 * NOTE(review): on this path the dirrem is neither freed
		 * nor requeued, and num_dirrem is not decremented --
		 * confirm that this is intended.
		 */
		softdep_error("handle_workitem_remove: vget", error);
		return;
	}
	ip = VTOI(vp);
	ACQUIRE_LOCK(&lk);
	if ((inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, 0, &inodedep)) == 0){
		panic("handle_workitem_remove: lost inodedep");
	}
	/*
	 * Normal file deletion.
	 */
	if ((dirrem->dm_state & RMDIR) == 0) {
		ip->i_nlink--;
		ip->i_flag |= IN_CHANGE;
		if (ip->i_nlink < ip->i_effnlink) {
			panic("handle_workitem_remove: bad file delta");
		}
		inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
		FREE_LOCK(&lk);
		vput(vp);
		num_dirrem -= 1;
		WORKITEM_FREE(dirrem, D_DIRREM);
		return;
	}
	/*
	 * Directory deletion. Decrement reference count for both the
	 * just deleted parent directory entry and the reference for ".".
	 * Next truncate the directory to length zero. When the
	 * truncation completes, arrange to have the reference count on
	 * the parent decremented to account for the loss of "..".
	 */
	ip->i_nlink -= 2;
	ip->i_flag |= IN_CHANGE;
	if (ip->i_nlink < ip->i_effnlink) {
		panic("handle_workitem_remove: bad dir delta");
	}
	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
	/* The lock is dropped before the truncate call. */
	FREE_LOCK(&lk);
	if ((error = ffs_truncate(vp, (off_t)0, 0, proc0.p_ucred)) != 0)
		softdep_error("handle_workitem_remove: truncate", error);
	/*
	 * Rename a directory to a new parent. Since we are both deleting
	 * and creating a new directory entry, the link count on the new
	 * directory should not change. Thus we skip the followup dirrem.
	 */
	if (dirrem->dm_state & DIRCHG) {
		vput(vp);
		num_dirrem -= 1;
		WORKITEM_FREE(dirrem, D_DIRREM);
		return;
	}
	/*
	 * If the inodedep does not exist, then the zero'ed inode has
	 * been written to disk. If the allocated inode has never been
	 * written to disk, then the on-disk inode is zero'ed. In either
	 * case we can remove the file immediately.
	 */
	ACQUIRE_LOCK(&lk);
	/*
	 * Re-target the dirrem at the parent directory: with dm_state
	 * cleared (so RMDIR is no longer set) and dm_oldinum replaced by
	 * dm_dirinum, the next pass through this function takes the
	 * "normal file deletion" path for the parent.
	 */
	dirrem->dm_state = 0;
	oldinum = dirrem->dm_oldinum;
	dirrem->dm_oldinum = dirrem->dm_dirinum;
	if (inodedep_lookup(ip->i_fs, oldinum, 0, &inodedep) == 0 ||
	    check_inode_unwritten(inodedep)) {
		FREE_LOCK(&lk);
		vput(vp);
		handle_workitem_remove(dirrem);
		return;
	}
	/*
	 * Otherwise park the dirrem on the removed directory's inodedep
	 * (id_inowait) and push the inode update.
	 */
	WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
	FREE_LOCK(&lk);
	ip->i_flag |= IN_CHANGE;
	ffs_update(vp, 0);
	vput(vp);
}

/*
 * Inode de-allocation dependencies.
 *
 * When an inode's link count is reduced to zero, it can be de-allocated. We
 * found it convenient to postpone de-allocation until after the inode is
 * written to disk with its new link count (zero).  At this point, all of the
 * on-disk inode's block pointers are nullified and, with careful dependency
 * list ordering, all dependencies related to the inode will be satisfied and
 * the corresponding dependency structures de-allocated.  So, if/when the
 * inode is reused, there will be no mixing of old dependencies with new
 * ones.  This artificial dependency is set up by the block de-allocation
 * procedure above (softdep_setup_freeblocks) and completed by the
 * following procedure.
2877 */ 2878 static void 2879 handle_workitem_freefile(struct freefile *freefile) 2880 { 2881 struct vnode vp; 2882 struct inode tip; 2883 struct inodedep *idp; 2884 int error; 2885 2886 #ifdef DEBUG 2887 ACQUIRE_LOCK(&lk); 2888 error = inodedep_lookup(freefile->fx_fs, freefile->fx_oldinum, 0, &idp); 2889 FREE_LOCK(&lk); 2890 if (error) 2891 panic("handle_workitem_freefile: inodedep survived"); 2892 #endif 2893 tip.i_devvp = freefile->fx_devvp; 2894 tip.i_dev = freefile->fx_devvp->v_rdev; 2895 tip.i_fs = freefile->fx_fs; 2896 vp.v_data = &tip; 2897 if ((error = ffs_freefile(&vp, freefile->fx_oldinum, freefile->fx_mode)) != 0) 2898 softdep_error("handle_workitem_freefile", error); 2899 WORKITEM_FREE(freefile, D_FREEFILE); 2900 } 2901 2902 /* 2903 * Helper function which unlinks marker element from work list and returns 2904 * the next element on the list. 2905 */ 2906 static __inline struct worklist * 2907 markernext(struct worklist *marker) 2908 { 2909 struct worklist *next; 2910 2911 next = LIST_NEXT(marker, wk_list); 2912 LIST_REMOVE(marker, wk_list); 2913 return next; 2914 } 2915 2916 /* 2917 * checkread, checkwrite 2918 * 2919 * bioops callback - hold io_token 2920 */ 2921 static int 2922 softdep_checkread(struct buf *bp) 2923 { 2924 /* nothing to do, mp lock not needed */ 2925 return(0); 2926 } 2927 2928 /* 2929 * bioops callback - hold io_token 2930 */ 2931 static int 2932 softdep_checkwrite(struct buf *bp) 2933 { 2934 /* nothing to do, mp lock not needed */ 2935 return(0); 2936 } 2937 2938 /* 2939 * Disk writes. 2940 * 2941 * The dependency structures constructed above are most actively used when file 2942 * system blocks are written to disk. No constraints are placed on when a 2943 * block can be written, but unsatisfied update dependencies are made safe by 2944 * modifying (or replacing) the source memory for the duration of the disk 2945 * write. When the disk write completes, the memory block is again brought 2946 * up-to-date. 
*
 * In-core inode structure reclamation.
 *
 * Because there are a finite number of "in-core" inode structures, they are
 * reused regularly.  By transferring all inode-related dependencies to the
 * in-memory inode block and indexing them separately (via "inodedep"s), we
 * can allow "in-core" inode structures to be reused at any time and avoid
 * any increase in contention.
 *
 * Called just before entering the device driver to initiate a new disk I/O.
 * The buffer must be locked, thus, no I/O completion operations can occur
 * while we are manipulating its associated dependencies.
 *
 * bioops callback - hold io_token
 *
 * Parameters:
 *	bp:	structure describing disk write to occur
 */
static void
softdep_disk_io_initiation(struct buf *bp)
{
	struct worklist *wk;
	struct worklist marker;
	struct indirdep *indirdep;

	/*
	 * We only care about write operations. There should never
	 * be dependencies for reads.
	 */
	if (bp->b_cmd == BUF_CMD_READ)
		panic("softdep_disk_io_initiation: read");

	ACQUIRE_LOCK(&lk);
	marker.wk_type = D_LAST + 1;	/* Not a normal workitem */

	/*
	 * Do any necessary pre-I/O processing.
	 *
	 * A marker is linked in right after the current item so that the
	 * walk can resume from the marker (see markernext()) even when
	 * the handler unlinks the current item, as the D_INDIRDEP case
	 * below does.
	 */
	for (wk = LIST_FIRST(&bp->b_dep); wk; wk = markernext(&marker)) {
		LIST_INSERT_AFTER(wk, &marker, wk_list);

		switch (wk->wk_type) {
		case D_PAGEDEP:
			initiate_write_filepage(WK_PAGEDEP(wk), bp);
			continue;

		case D_INODEDEP:
			initiate_write_inodeblock(WK_INODEDEP(wk), bp);
			continue;

		case D_INDIRDEP:
			indirdep = WK_INDIRDEP(wk);
			if (indirdep->ir_state & GOINGAWAY)
				panic("disk_io_initiation: indirdep gone");
			/*
			 * If there are no remaining dependencies, this
			 * will be writing the real pointers, so the
			 * dependency can be freed.
			 */
			if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) {
				indirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE;
				brelse(indirdep->ir_savebp);
				/* inline expand WORKLIST_REMOVE(wk); */
				wk->wk_state &= ~ONWORKLIST;
				LIST_REMOVE(wk, wk_list);
				WORKITEM_FREE(indirdep, D_INDIRDEP);
				continue;
			}
			/*
			 * Replace up-to-date version with safe version.
			 *
			 * The current buffer contents are stashed in
			 * ir_saveddata and the buffer is overwritten with
			 * the contents of ir_savebp for the duration of
			 * the write.
			 */
			indirdep->ir_saveddata = kmalloc(bp->b_bcount,
							 M_INDIRDEP,
							 M_SOFTDEP_FLAGS);
			/*
			 * NOTE(review): lk is already held here (acquired
			 * at the top of this function); this nested
			 * ACQUIRE/FREE pair presumably relies on lk being
			 * a recursive lock -- confirm.
			 */
			ACQUIRE_LOCK(&lk);
			indirdep->ir_state &= ~ATTACHED;
			indirdep->ir_state |= UNDONE;
			bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
			bcopy(indirdep->ir_savebp->b_data, bp->b_data,
			    bp->b_bcount);
			FREE_LOCK(&lk);
			continue;

		case D_MKDIR:
		case D_BMSAFEMAP:
		case D_ALLOCDIRECT:
		case D_ALLOCINDIR:
			/* Nothing to do before the write for these types. */
			continue;

		default:
			panic("handle_disk_io_initiation: Unexpected type %s",
			    TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
	}
	FREE_LOCK(&lk);
}

/*
 * Called from within the procedure above to deal with unsatisfied
 * allocation dependencies in a directory. The buffer must be locked,
 * thus, no I/O completion operations can occur while we are
 * manipulating its associated dependencies.
 *
 * Every diradd still hashed on the pagedep has its directory entry in
 * the buffer rolled back: to the previous inode number for a changed
 * entry (DIRCHG), otherwise to zero.  Each such diradd is marked UNDONE.
 *
 * Parameters:
 *	pagedep: dependency record for the directory page
 *	bp:	 buffer holding the directory page about to be written
 */
static void
initiate_write_filepage(struct pagedep *pagedep, struct buf *bp)
{
	struct diradd *dap;
	struct direct *ep;
	int i;

	if (pagedep->pd_state & IOSTARTED) {
		/*
		 * This can only happen if there is a driver that does not
		 * understand chaining. Here biodone will reissue the call
		 * to strategy for the incomplete buffers.
		 */
		kprintf("initiate_write_filepage: already started\n");
		return;
	}
	pagedep->pd_state |= IOSTARTED;
	ACQUIRE_LOCK(&lk);
	for (i = 0; i < DAHASHSZ; i++) {
		LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
			/* Locate the directory entry inside the buffer. */
			ep = (struct direct *)
			    ((char *)bp->b_data + dap->da_offset);
			if (ep->d_ino != dap->da_newinum) {
				panic("%s: dir inum %d != new %"PRId64,
				    "initiate_write_filepage",
				    ep->d_ino, dap->da_newinum);
			}
			if (dap->da_state & DIRCHG)
				ep->d_ino = dap->da_previous->dm_oldinum;
			else
				ep->d_ino = 0;
			dap->da_state &= ~ATTACHED;
			dap->da_state |= UNDONE;
		}
	}
	FREE_LOCK(&lk);
}

/*
 * Called from within the procedure above to deal with unsatisfied
 * allocation dependencies in an inodeblock. The buffer must be
 * locked, thus, no I/O completion operations can occur while we
 * are manipulating its associated dependencies.
 *
 * Parameters:
 *	inodedep: dependency record for one inode stored in the block
 *	bp:	  The inode block
 */
static void
initiate_write_inodeblock(struct inodedep *inodedep, struct buf *bp)
{
	struct allocdirect *adp, *lastadp;
	struct ufs1_dinode *dp;
	struct ufs1_dinode *sip;
	struct fs *fs;
	ufs_lbn_t prevlbn = 0;
	int i, deplist;

	if (inodedep->id_state & IOSTARTED)
		panic("initiate_write_inodeblock: already started");
	inodedep->id_state |= IOSTARTED;
	fs = inodedep->id_fs;
	/* Locate this inode's on-disk image inside the block buffer. */
	dp = (struct ufs1_dinode *)bp->b_data +
	    ino_to_fsbo(fs, inodedep->id_ino);
	/*
	 * If the bitmap is not yet written, then the allocated
	 * inode cannot be written to disk.
	 *
	 * The real dinode is saved in id_savedino and a zeroed dinode
	 * (keeping only the generation number) is written in its place.
	 */
	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
		if (inodedep->id_savedino != NULL)
			panic("initiate_write_inodeblock: already doing I/O");
		sip = kmalloc(sizeof(struct ufs1_dinode), M_INODEDEP,
			      M_SOFTDEP_FLAGS);
		inodedep->id_savedino = sip;
		*inodedep->id_savedino = *dp;
		bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
		dp->di_gen = inodedep->id_savedino->di_gen;
		return;
	}
	/*
	 * If no dependencies, then there is nothing to roll back.
	 */
	inodedep->id_savedsize = dp->di_size;
	if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
		return;
	/*
	 * Set the dependencies to busy.
	 */
	ACQUIRE_LOCK(&lk);
	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
	     adp = TAILQ_NEXT(adp, ad_next)) {
#ifdef DIAGNOSTIC
		/*
		 * Sanity checks: the list must be in strictly increasing
		 * lbn order and the dinode must currently hold the new
		 * block numbers.  deplist collects a bitmask of the lbns
		 * that have a dependency, used by the checks further down.
		 */
		if (deplist != 0 && prevlbn >= adp->ad_lbn) {
			panic("softdep_write_inodeblock: lbn order");
		}
		prevlbn = adp->ad_lbn;
		if (adp->ad_lbn < NDADDR &&
		    dp->di_db[adp->ad_lbn] != adp->ad_newblkno) {
			panic("%s: direct pointer #%ld mismatch %d != %d",
			    "softdep_write_inodeblock", adp->ad_lbn,
			    dp->di_db[adp->ad_lbn], adp->ad_newblkno);
		}
		if (adp->ad_lbn >= NDADDR &&
		    dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno) {
			panic("%s: indirect pointer #%ld mismatch %d != %d",
			    "softdep_write_inodeblock", adp->ad_lbn - NDADDR,
			    dp->di_ib[adp->ad_lbn - NDADDR], adp->ad_newblkno);
		}
		deplist |= 1 << adp->ad_lbn;
		if ((adp->ad_state & ATTACHED) == 0) {
			panic("softdep_write_inodeblock: Unknown state 0x%x",
			    adp->ad_state);
		}
#endif /* DIAGNOSTIC */
		adp->ad_state &= ~ATTACHED;
		adp->ad_state |= UNDONE;
	}
	/*
	 * The on-disk inode cannot claim to be any larger than the last
	 * fragment that has been written. Otherwise, the on-disk inode
	 * might have fragments that were not the last block in the file
	 * which would corrupt the filesystem.
	 */
	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
		if (adp->ad_lbn >= NDADDR)
			break;
		dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
		/* keep going until hitting a rollback to a frag */
		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
			continue;
		/*
		 * Rolled back to a fragment: the file must end here, so
		 * clamp the size and clear every later block pointer.
		 */
		dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
		for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
#ifdef DIAGNOSTIC
			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) {
				panic("softdep_write_inodeblock: lost dep1");
			}
#endif /* DIAGNOSTIC */
			dp->di_db[i] = 0;
		}
		for (i = 0; i < NIADDR; i++) {
#ifdef DIAGNOSTIC
			if (dp->di_ib[i] != 0 &&
			    (deplist & ((1 << NDADDR) << i)) == 0) {
				panic("softdep_write_inodeblock: lost dep2");
			}
#endif /* DIAGNOSTIC */
			dp->di_ib[i] = 0;
		}
		FREE_LOCK(&lk);
		return;
	}
	/*
	 * If we have zero'ed out the last allocated block of the file,
	 * roll back the size to the last currently allocated block.
	 * We know that this last allocated block is full-sized as
	 * we already checked for fragments in the loop above.
	 */
	if (lastadp != NULL &&
	    dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
		for (i = lastadp->ad_lbn; i >= 0; i--)
			if (dp->di_db[i] != 0)
				break;
		dp->di_size = (i + 1) * fs->fs_bsize;
	}
	/*
	 * The only dependencies are for indirect blocks.
	 *
	 * The file size for indirect block additions is not guaranteed.
	 * Such a guarantee would be non-trivial to achieve. The conventional
	 * synchronous write implementation also does not make this guarantee.
	 * Fsck should catch and fix discrepancies. Arguably, the file size
	 * can be over-estimated without destroying integrity when the file
	 * moves into the indirect blocks (i.e., is large). If we want to
	 * postpone fsck, we are stuck with this argument.
	 */
	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
		dp->di_ib[adp->ad_lbn - NDADDR] = 0;
	FREE_LOCK(&lk);
}

/*
 * This routine is called during the completion interrupt
 * service routine for a disk write (from the procedure called
 * by the device driver to inform the filesystem caches of
 * a request completion).  It should be called early in this
 * procedure, before the block is made available to other
 * processes or other routines are called.
 *
 * Every work item is taken off the buffer's dependency list and
 * dispatched by type.  Items whose handler reports that they must stay
 * with the buffer are collected on a local "reattach" list and put back
 * on the buffer at the end.
 *
 * bioops callback - hold io_token
 *
 * Parameters:
 *	bp:	describes the completed disk write
 */
static void
softdep_disk_write_complete(struct buf *bp)
{
	struct worklist *wk;
	struct workhead reattach;
	struct newblk *newblk;
	struct allocindir *aip;
	struct allocdirect *adp;
	struct indirdep *indirdep;
	struct inodedep *inodedep;
	struct bmsafemap *bmsafemap;

	ACQUIRE_LOCK(&lk);

	LIST_INIT(&reattach);
	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
		WORKLIST_REMOVE(wk);
		switch (wk->wk_type) {

		case D_PAGEDEP:
			if (handle_written_filepage(WK_PAGEDEP(wk), bp))
				WORKLIST_INSERT(&reattach, wk);
			continue;

		case D_INODEDEP:
			if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
				WORKLIST_INSERT(&reattach, wk);
			continue;

		case D_BMSAFEMAP:
			/*
			 * Mark everything that was waiting on this bitmap
			 * as DEPCOMPLETE, detach it from the bmsafemap, and
			 * then free the bmsafemap itself.
			 */
			bmsafemap = WK_BMSAFEMAP(wk);
			while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) {
				newblk->nb_state |= DEPCOMPLETE;
				newblk->nb_bmsafemap = NULL;
				LIST_REMOVE(newblk, nb_deps);
			}
			while ((adp =
			   LIST_FIRST(&bmsafemap->sm_allocdirecthd))) {
				adp->ad_state |= DEPCOMPLETE;
				adp->ad_buf = NULL;
				LIST_REMOVE(adp, ad_deps);
				handle_allocdirect_partdone(adp);
			}
			while ((aip =
			    LIST_FIRST(&bmsafemap->sm_allocindirhd))) {
				aip->ai_state |= DEPCOMPLETE;
				aip->ai_buf = NULL;
				LIST_REMOVE(aip, ai_deps);
				handle_allocindir_partdone(aip);
			}
			while ((inodedep =
			     LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
				inodedep->id_state |= DEPCOMPLETE;
				LIST_REMOVE(inodedep, id_deps);
				inodedep->id_buf = NULL;
			}
			WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
			continue;

		case D_MKDIR:
			handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
			continue;

		case D_ALLOCDIRECT:
			adp = WK_ALLOCDIRECT(wk);
			adp->ad_state |= COMPLETE;
			handle_allocdirect_partdone(adp);
			continue;

		case D_ALLOCINDIR:
			aip = WK_ALLOCINDIR(wk);
			aip->ai_state |= COMPLETE;
			handle_allocindir_partdone(aip);
			continue;

		case D_INDIRDEP:
			indirdep = WK_INDIRDEP(wk);
			if (indirdep->ir_state & GOINGAWAY) {
				panic("disk_write_complete: indirdep gone");
			}
			/*
			 * Restore the up-to-date contents that were saved
			 * in ir_saveddata, release the saved copy, and flip
			 * the indirdep from UNDONE back to ATTACHED.
			 */
			bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
			kfree(indirdep->ir_saveddata, M_INDIRDEP);
			indirdep->ir_saveddata = 0;
			indirdep->ir_state &= ~UNDONE;
			indirdep->ir_state |= ATTACHED;
			/*
			 * Finish the allocindirs that were queued on
			 * ir_donehd; each call must remove the entry from
			 * the head of the list or we panic.
			 */
			while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != NULL) {
				handle_allocindir_partdone(aip);
				if (aip == LIST_FIRST(&indirdep->ir_donehd)) {
					panic("disk_write_complete: not gone");
				}
			}
			WORKLIST_INSERT(&reattach, wk);
			if ((bp->b_flags & B_DELWRI) == 0)
				stat_indir_blk_ptrs++;
			bdirty(bp);
			continue;

		default:
			panic("handle_disk_write_complete: Unknown type %s",
			    TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
	}
	/*
	 * Reattach any requests that must be redone.
	 */
	while ((wk = LIST_FIRST(&reattach)) != NULL) {
		WORKLIST_REMOVE(wk);
		WORKLIST_INSERT_BP(bp, wk);
	}

	FREE_LOCK(&lk);
}

/*
 * Called from within softdep_disk_write_complete above.
* Note that
 * this routine is always called from interrupt level with further
 * splbio interrupts blocked.
 *
 * Does nothing unless adp is ALLCOMPLETE.  Otherwise frees adp, and any
 * ALLCOMPLETE entries that follow it on id_inoupdt, provided no earlier
 * entry on that list rolls back to a fragment.
 *
 * Parameters:
 *	adp:	the completed allocdirect
 */
static void
handle_allocdirect_partdone(struct allocdirect *adp)
{
	struct allocdirect *listadp;
	struct inodedep *inodedep;
	long bsize;

	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
		return;
	if (adp->ad_buf != NULL)
		panic("handle_allocdirect_partdone: dangling dep");

	/*
	 * The on-disk inode cannot claim to be any larger than the last
	 * fragment that has been written. Otherwise, the on-disk inode
	 * might have fragments that were not the last block in the file
	 * which would corrupt the filesystem. Thus, we cannot free any
	 * allocdirects after one whose ad_oldblkno claims a fragment as
	 * these blocks must be rolled back to zero before writing the inode.
	 * We check the currently active set of allocdirects in id_inoupdt.
	 */
	inodedep = adp->ad_inodedep;
	bsize = inodedep->id_fs->fs_bsize;
	TAILQ_FOREACH(listadp, &inodedep->id_inoupdt, ad_next) {
		/* found our block */
		if (listadp == adp)
			break;
		/* continue if ad_oldsize is not a fragment */
		if (listadp->ad_oldsize == 0 ||
		    listadp->ad_oldsize == bsize)
			continue;
		/* hit a fragment */
		return;
	}
	/*
	 * If we have reached the end of the current list without
	 * finding the just finished dependency, then it must be
	 * on the future dependency list. Future dependencies cannot
	 * be freed until they are moved to the current list.
	 */
	if (listadp == NULL) {
#ifdef DEBUG
		/* Verify that adp really is on id_newinoupdt. */
		TAILQ_FOREACH(listadp, &inodedep->id_newinoupdt, ad_next)
			/* found our block */
			if (listadp == adp)
				break;
		if (listadp == NULL)
			panic("handle_allocdirect_partdone: lost dep");
#endif /* DEBUG */
		return;
	}
	/*
	 * If we have found the just finished dependency, then free
	 * it along with anything that follows it that is complete.
	 * The successor is fetched before the current entry is freed.
	 */
	for (; adp; adp = listadp) {
		listadp = TAILQ_NEXT(adp, ad_next);
		if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
			return;
		free_allocdirect(&inodedep->id_inoupdt, adp, 1);
	}
}

/*
 * Called from within softdep_disk_write_complete above. Note that
 * this routine is always called from interrupt level with further
 * splbio interrupts blocked.
 *
 * Does nothing unless aip is ALLCOMPLETE.  If the owning indirdep is
 * currently rolled back (UNDONE), the allocindir is parked on ir_donehd
 * for later; otherwise the new block number is stored into the saved
 * copy of the indirect block and the allocindir is freed.
 *
 * Parameters:
 *	aip:	the completed allocindir
 */
static void
handle_allocindir_partdone(struct allocindir *aip)
{
	struct indirdep *indirdep;

	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
		return;
	if (aip->ai_buf != NULL)
		panic("handle_allocindir_partdone: dangling dependency");

	indirdep = aip->ai_indirdep;
	if (indirdep->ir_state & UNDONE) {
		/* Move from its current list to the head of ir_donehd. */
		LIST_REMOVE(aip, ai_next);
		LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
		return;
	}
	((ufs_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
	    aip->ai_newblkno;
	LIST_REMOVE(aip, ai_next);
	/* Any attached freefrag work item is queued on the worklist. */
	if (aip->ai_freefrag != NULL)
		add_to_worklist(&aip->ai_freefrag->ff_list);
	WORKITEM_FREE(aip, D_ALLOCINDIR);
}

/*
 * Called from within softdep_disk_write_complete above to restore
 * in-memory inode block contents to their most up-to-date state. Note
 * that this routine is always called from interrupt level with further
 * splbio interrupts blocked.
*
 * Parameters:
 *	inodedep: dependency record for one inode stored in the block
 *	bp:	  buffer containing the inode block
 *
 * Returns 1 when a saved dinode was restored (the block is re-dirtied),
 * 0 when the inodedep was freed or has nothing left on id_inoupdt, and
 * otherwise whether any rolled-back state had to be rolled forward.
 */
static int
handle_written_inodeblock(struct inodedep *inodedep, struct buf *bp)
{
	struct worklist *wk, *filefree;
	struct allocdirect *adp, *nextadp;
	struct ufs1_dinode *dp;
	int hadchanges;

	if ((inodedep->id_state & IOSTARTED) == 0)
		panic("handle_written_inodeblock: not started");

	inodedep->id_state &= ~IOSTARTED;
	/* Locate this inode's on-disk image inside the block buffer. */
	dp = (struct ufs1_dinode *)bp->b_data +
	    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
	/*
	 * If we had to rollback the inode allocation because of
	 * bitmaps being incomplete, then simply restore it.
	 * Keep the block dirty so that it will not be reclaimed until
	 * all associated dependencies have been cleared and the
	 * corresponding updates written to disk.
	 */
	if (inodedep->id_savedino != NULL) {
		*dp = *inodedep->id_savedino;
		kfree(inodedep->id_savedino, M_INODEDEP);
		inodedep->id_savedino = NULL;
		if ((bp->b_flags & B_DELWRI) == 0)
			stat_inode_bitmap++;
		bdirty(bp);
		return (1);
	}
	inodedep->id_state |= COMPLETE;
	/*
	 * Roll forward anything that had to be rolled back before
	 * the inode could be updated.
	 */
	hadchanges = 0;
	for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
		nextadp = TAILQ_NEXT(adp, ad_next);
		if (adp->ad_state & ATTACHED)
			panic("handle_written_inodeblock: new entry");

		if (adp->ad_lbn < NDADDR) {
			/* Direct block: must still hold the old block number. */
			if (dp->di_db[adp->ad_lbn] != adp->ad_oldblkno) {
				panic("%s: %s #%ld mismatch %d != %d",
				    "handle_written_inodeblock",
				    "direct pointer", adp->ad_lbn,
				    dp->di_db[adp->ad_lbn], adp->ad_oldblkno);
			}
			dp->di_db[adp->ad_lbn] = adp->ad_newblkno;
		} else {
			/* Indirect block: must have been rolled back to zero. */
			if (dp->di_ib[adp->ad_lbn - NDADDR] != 0) {
				panic("%s: %s #%ld allocated as %d",
				    "handle_written_inodeblock",
				    "indirect pointer", adp->ad_lbn - NDADDR,
				    dp->di_ib[adp->ad_lbn - NDADDR]);
			}
			dp->di_ib[adp->ad_lbn - NDADDR] = adp->ad_newblkno;
		}
		adp->ad_state &= ~UNDONE;
		adp->ad_state |= ATTACHED;
		hadchanges = 1;
	}
	if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
		stat_direct_blk_ptrs++;
	/*
	 * Reset the file size to its most up-to-date value.
	 * id_savedsize is reset to -1 afterwards; finding -1 here panics.
	 */
	if (inodedep->id_savedsize == -1) {
		panic("handle_written_inodeblock: bad size");
	}
	if (dp->di_size != inodedep->id_savedsize) {
		dp->di_size = inodedep->id_savedsize;
		hadchanges = 1;
	}
	inodedep->id_savedsize = -1;
	/*
	 * If there were any rollbacks in the inode block, then it must be
	 * marked dirty so that it will eventually get written back in
	 * its correct form.
	 */
	if (hadchanges)
		bdirty(bp);
	/*
	 * Process any allocdirects that completed during the update.
	 */
	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
		handle_allocdirect_partdone(adp);
	/*
	 * Process deallocations that were held pending until the
	 * inode had been written to disk. Freeing of the inode
	 * is delayed until after all blocks have been freed to
	 * avoid creation of new <vfsid, inum, lbn> triples
	 * before the old ones have been deleted.
	 */
	filefree = NULL;
	while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
		WORKLIST_REMOVE(wk);
		switch (wk->wk_type) {

		case D_FREEFILE:
			/*
			 * We defer adding filefree to the worklist until
			 * all other additions have been made to ensure
			 * that it will be done after all the old blocks
			 * have been freed.
			 */
			if (filefree != NULL) {
				panic("handle_written_inodeblock: filefree");
			}
			filefree = wk;
			continue;

		case D_MKDIR:
			handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
			continue;

		case D_DIRADD:
			diradd_inode_written(WK_DIRADD(wk), inodedep);
			continue;

		case D_FREEBLKS:
			/* Only queue the freeblks once it is ALLCOMPLETE. */
			wk->wk_state |= COMPLETE;
			if ((wk->wk_state & ALLCOMPLETE) != ALLCOMPLETE)
				continue;
			/* -- fall through -- */
		case D_FREEFRAG:
		case D_DIRREM:
			add_to_worklist(wk);
			continue;

		default:
			panic("handle_written_inodeblock: Unknown type %s",
			    TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
	}
	if (filefree != NULL) {
		if (free_inodedep(inodedep) == 0) {
			panic("handle_written_inodeblock: live inodedep");
		}
		add_to_worklist(filefree);
		return (0);
	}

	/*
	 * If no outstanding dependencies, free it.
	 * The id_inoupdt test is only evaluated when free_inodedep()
	 * returned zero, so the inodedep is not touched after a
	 * successful free.
	 */
	if (free_inodedep(inodedep) || TAILQ_FIRST(&inodedep->id_inoupdt) == 0)
		return (0);
	return (hadchanges);
}

/*
 * Process a diradd entry after its dependent inode has been written.
 * This routine must be called with splbio interrupts blocked.
3625 */ 3626 static void 3627 diradd_inode_written(struct diradd *dap, struct inodedep *inodedep) 3628 { 3629 struct pagedep *pagedep; 3630 3631 dap->da_state |= COMPLETE; 3632 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { 3633 if (dap->da_state & DIRCHG) 3634 pagedep = dap->da_previous->dm_pagedep; 3635 else 3636 pagedep = dap->da_pagedep; 3637 LIST_REMOVE(dap, da_pdlist); 3638 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); 3639 } 3640 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); 3641 } 3642 3643 /* 3644 * Handle the completion of a mkdir dependency. 3645 */ 3646 static void 3647 handle_written_mkdir(struct mkdir *mkdir, int type) 3648 { 3649 struct diradd *dap; 3650 struct pagedep *pagedep; 3651 3652 if (mkdir->md_state != type) { 3653 panic("handle_written_mkdir: bad type"); 3654 } 3655 dap = mkdir->md_diradd; 3656 dap->da_state &= ~type; 3657 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) 3658 dap->da_state |= DEPCOMPLETE; 3659 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { 3660 if (dap->da_state & DIRCHG) 3661 pagedep = dap->da_previous->dm_pagedep; 3662 else 3663 pagedep = dap->da_pagedep; 3664 LIST_REMOVE(dap, da_pdlist); 3665 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); 3666 } 3667 LIST_REMOVE(mkdir, md_mkdirs); 3668 WORKITEM_FREE(mkdir, D_MKDIR); 3669 } 3670 3671 /* 3672 * Called from within softdep_disk_write_complete above. 3673 * A write operation was just completed. Removed inodes can 3674 * now be freed and associated block pointers may be committed. 3675 * Note that this routine is always called from interrupt level 3676 * with further splbio interrupts blocked. 
*
 * Parameters:
 *	pagedep: dependency record for the directory page
 *	bp:	 buffer containing the written page
 *
 * Returns 0 when the pagedep had no remaining dependencies and was
 * freed, 1 when it is still needed.
 */
static int
handle_written_filepage(struct pagedep *pagedep, struct buf *bp)
{
	struct dirrem *dirrem;
	struct diradd *dap, *nextdap;
	struct direct *ep;
	int i, chgs;

	if ((pagedep->pd_state & IOSTARTED) == 0) {
		panic("handle_written_filepage: not started");
	}
	pagedep->pd_state &= ~IOSTARTED;
	/*
	 * Process any directory removals that have been committed.
	 */
	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
		LIST_REMOVE(dirrem, dm_next);
		dirrem->dm_dirinum = pagedep->pd_ino;
		add_to_worklist(&dirrem->dm_list);
	}
	/*
	 * Free any directory additions that have been committed.
	 */
	while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
		free_diradd(dap);
	/*
	 * Uncommitted directory entries must be restored.
	 */
	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
		     dap = nextdap) {
			/* Fetch the successor first; dap may change lists. */
			nextdap = LIST_NEXT(dap, da_pdlist);
			if (dap->da_state & ATTACHED) {
				panic("handle_written_filepage: attached");
			}
			/* Put the new inode number back into the entry. */
			ep = (struct direct *)
			    ((char *)bp->b_data + dap->da_offset);
			ep->d_ino = dap->da_newinum;
			dap->da_state &= ~UNDONE;
			dap->da_state |= ATTACHED;
			chgs = 1;
			/*
			 * If the inode referenced by the directory has
			 * been written out, then the dependency can be
			 * moved to the pending list.
			 */
			if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
				LIST_REMOVE(dap, da_pdlist);
				LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
				    da_pdlist);
			}
		}
	}
	/*
	 * If there were any rollbacks in the directory, then it must be
	 * marked dirty so that it will eventually get written back in
	 * its correct form.
	 */
	if (chgs) {
		if ((bp->b_flags & B_DELWRI) == 0)
			stat_dir_entry++;
		bdirty(bp);
	}
	/*
	 * If no dependencies remain, the pagedep will be freed.
	 * Otherwise it will remain to update the page before it
	 * is written back to disk.
	 */
	if (LIST_FIRST(&pagedep->pd_pendinghd) == 0) {
		for (i = 0; i < DAHASHSZ; i++)
			if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL)
				break;
		/* i == DAHASHSZ means every diradd hash chain is empty. */
		if (i == DAHASHSZ) {
			LIST_REMOVE(pagedep, pd_hash);
			WORKITEM_FREE(pagedep, D_PAGEDEP);
			return (0);
		}
	}
	return (1);
}

/*
 * Writing back in-core inode structures.
 *
 * The filesystem only accesses an inode's contents when it occupies an
 * "in-core" inode structure.  These "in-core" structures are separate from
 * the page frames used to cache inode blocks.  Only the latter are
 * transferred to/from the disk.  So, when the updated contents of the
 * "in-core" inode structure are copied to the corresponding in-memory inode
 * block, the dependencies are also transferred.  The following procedure is
 * called when copying a dirty "in-core" inode to a cached inode block.
 */

/*
 * Called when an inode is loaded from disk. If the effective link count
 * differed from the actual link count when it was last flushed, then we
 * need to ensure that the correct effective link count is put back.
 *
 * Parameters:
 *	ip:	the "in_core" copy of the inode
 */
void
softdep_load_inodeblock(struct inode *ip)
{
	struct inodedep *inodedep;

	/*
	 * Check for alternate nlink count.
	 * Start from the actual count; if an inodedep exists, subtract
	 * the delta it recorded.
	 */
	ip->i_effnlink = ip->i_nlink;
	ACQUIRE_LOCK(&lk);
	if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
		FREE_LOCK(&lk);
		return;
	}
	ip->i_effnlink -= inodedep->id_nlinkdelta;
	FREE_LOCK(&lk);
}

/*
 * This routine is called just before the "in-core" inode
 * information is to be copied to the in-memory inode block.
 * Recall that an inode block contains several inodes. If
 * the force flag is set, then the dependencies will be
 * cleared so that the update can always be made. Note that
 * the buffer is locked when this routine is called, so we
 * will never be in the middle of writing the inode block
 * to disk.
 *
 * Parameters:
 *	ip:	 the "in_core" copy of the inode
 *	bp:	 the buffer containing the inode block
 *	waitfor: nonzero => update must be allowed
 */
void
softdep_update_inodeblock(struct inode *ip, struct buf *bp,
			  int waitfor)
{
	struct inodedep *inodedep;
	struct worklist *wk;
	struct buf *ibp;
	int error, gotit;

	/*
	 * If the effective link count is not equal to the actual link
	 * count, then we must track the difference in an inodedep while
	 * the inode is (potentially) tossed out of the cache. Otherwise,
	 * if there is no existing inodedep, then there are no dependencies
	 * to track.
	 */
	ACQUIRE_LOCK(&lk);
	if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
		FREE_LOCK(&lk);
		if (ip->i_effnlink != ip->i_nlink)
			panic("softdep_update_inodeblock: bad link count");
		return;
	}
	if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink) {
		panic("softdep_update_inodeblock: bad delta");
	}
	/*
	 * Changes have been initiated. Anything depending on these
	 * changes cannot occur until this inode has been written.
	 */
	inodedep->id_state &= ~COMPLETE;
	if ((inodedep->id_state & ONWORKLIST) == 0)
		WORKLIST_INSERT_BP(bp, &inodedep->id_list);
	/*
	 * Any new dependencies associated with the incore inode must
	 * now be moved to the list associated with the buffer holding
	 * the in-memory copy of the inode. Once merged process any
	 * allocdirects that are completed by the merger.
	 */
	merge_inode_lists(inodedep);
	if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL)
		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
	/*
	 * Now that the inode has been pushed into the buffer, the
	 * operations dependent on the inode being written to disk
	 * can be moved to the id_bufwait so that they will be
	 * processed when the buffer I/O completes.
	 */
	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
		WORKLIST_REMOVE(wk);
		WORKLIST_INSERT(&inodedep->id_bufwait, wk);
	}
	/*
	 * Newly allocated inodes cannot be written until the bitmap
	 * that allocates them has been written (indicated by
	 * DEPCOMPLETE being set in id_state). If we are doing a
	 * forced sync (e.g., an fsync on a file), we force the bitmap
	 * to be written so that the update can be done.
	 */
	if (waitfor == 0) {
		FREE_LOCK(&lk);
		return;
	}
retry:
	if ((inodedep->id_state & DEPCOMPLETE) != 0) {
		FREE_LOCK(&lk);
		return;
	}
	gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
	if (gotit == 0) {
		/*
		 * The buffer was not obtained.  Look the inodedep up
		 * again and retry while it still exists (presumably
		 * because getdirtybuf() may have slept -- TODO confirm);
		 * if it is gone there is nothing left to wait for.
		 */
		if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) != 0)
			goto retry;
		FREE_LOCK(&lk);
		return;
	}
	ibp = inodedep->id_buf;
	/* The lock is dropped before the write is issued. */
	FREE_LOCK(&lk);
	if ((error = bwrite(ibp)) != 0)
		softdep_error("softdep_update_inodeblock: bwrite", error);
}

/*
 * Merge the new inode dependency list (id_newinoupdt) into the old
 * inode dependency list (id_inoupdt).
* This routine must be called
 * with splbio interrupts blocked.  (softdep_update_inodeblock() calls it
 * while holding the softdep lock.)
 *
 * Entries are merged in ascending ad_lbn order.  When both lists hold an
 * entry for the same logical block the two are combined through
 * allocdirect_merge().  On return id_newinoupdt is empty.
 */
static void
merge_inode_lists(struct inodedep *inodedep)
{
	struct allocdirect *listadp, *newadp;

	newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
	for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp && newadp;) {
		/* Advance in the old list until we reach the insertion point. */
		if (listadp->ad_lbn < newadp->ad_lbn) {
			listadp = TAILQ_NEXT(listadp, ad_next);
			continue;
		}
		TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
		TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
		if (listadp->ad_lbn == newadp->ad_lbn) {
			/* Same logical block: fold the old entry into the new. */
			allocdirect_merge(&inodedep->id_inoupdt, newadp,
			    listadp);
			listadp = newadp;
		}
		newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
	}
	/* Whatever is left sorts after every old entry; append it. */
	while ((newadp = TAILQ_FIRST(&inodedep->id_newinoupdt)) != NULL) {
		TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
		TAILQ_INSERT_TAIL(&inodedep->id_inoupdt, newadp, ad_next);
	}
}

/*
 * If we are doing an fsync, then we must ensure that any directory
 * entries for the inode have been written after the inode gets to disk.
 *
 * bioops callback - hold io_token
 *
 * Parameters:
 *	vp:	the "in_core" copy of the inode
 *
 * Returns 0 on success (including when there is nothing to do), or the
 * error from VFS_VGET(), ffs_update(), bread() or bwrite().  The softdep
 * lock is not held on return.
 */
static int
softdep_fsync(struct vnode *vp)
{
	struct inodedep *inodedep;
	struct pagedep *pagedep;
	struct worklist *wk;
	struct diradd *dap;
	struct mount *mnt;
	struct vnode *pvp;
	struct inode *ip;
	struct buf *bp;
	struct fs *fs;
	int error, flushparent;
	ino_t parentino;
	ufs_lbn_t lbn;

	/*
	 * Move check from original kernel code, possibly not needed any
	 * more with the per-mount bioops.
	 */
	if ((vp->v_mount->mnt_flag & MNT_SOFTDEP) == 0)
		return (0);

	ip = VTOI(vp);
	fs = ip->i_fs;
	ACQUIRE_LOCK(&lk);
	if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0) {
		FREE_LOCK(&lk);
		return (0);
	}
	/* All of these lists are expected to be empty by the time we run. */
	if (LIST_FIRST(&inodedep->id_inowait) != NULL ||
	    LIST_FIRST(&inodedep->id_bufwait) != NULL ||
	    TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
	    TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL) {
		panic("softdep_fsync: pending ops");
	}
	for (error = 0, flushparent = 0; ; ) {
		if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
			break;
		if (wk->wk_type != D_DIRADD) {
			panic("softdep_fsync: Unexpected type %s",
			    TYPENAME(wk->wk_type));
		}
		dap = WK_DIRADD(wk);
		/*
		 * Flush our parent if this directory entry
		 * has a MKDIR_PARENT dependency.
		 */
		if (dap->da_state & DIRCHG)
			pagedep = dap->da_previous->dm_pagedep;
		else
			pagedep = dap->da_pagedep;
		/*
		 * Copy what we need out of the pagedep now; the softdep
		 * lock is dropped below.
		 */
		mnt = pagedep->pd_mnt;
		parentino = pagedep->pd_ino;
		lbn = pagedep->pd_lbn;
		if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE) {
			panic("softdep_fsync: dirty");
		}
		flushparent = dap->da_state & MKDIR_PARENT;
		/*
		 * If we are being fsync'ed as part of vgone'ing this vnode,
		 * then we will not be able to release and recover the
		 * vnode below, so we just have to give up on writing its
		 * directory entry out. It will eventually be written, just
		 * not now, but then the user was not asking to have it
		 * written, so we are not breaking any promises.
		 */
		if (vp->v_flag & VRECLAIMED)
			break;
		/*
		 * We prevent deadlock by always fetching inodes from the
		 * root, moving down the directory tree. Thus, when fetching
		 * our parent directory, we must unlock ourselves before
		 * requesting the lock on our parent. See the comment in
		 * ufs_lookup for details on possible races.
		 */
		FREE_LOCK(&lk);
		vn_unlock(vp);
		error = VFS_VGET(mnt, NULL, parentino, &pvp);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		if (error != 0) {
			return (error);
		}
		if (flushparent) {
			if ((error = ffs_update(pvp, 1)) != 0) {
				vput(pvp);
				return (error);
			}
		}
		/*
		 * Flush directory page containing the inode's name.
		 */
		error = bread(pvp, lblktodoff(fs, lbn), blksize(fs, VTOI(pvp), lbn), &bp);
		if (error == 0)
			error = bwrite(bp);
		vput(pvp);
		if (error != 0) {
			return (error);
		}
		/* Re-take the lock and re-validate the inodedep before looping. */
		ACQUIRE_LOCK(&lk);
		if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0)
			break;
	}
	FREE_LOCK(&lk);
	return (0);
}

/*
 * Flush all the dirty bitmaps associated with the block device
 * before flushing the rest of the dirty blocks so as to reduce
 * the number of dependencies that will have to be rolled back.
 *
 * Panics if vp is not a disk vnode.  The dirty-buffer tree is scanned
 * with the softdep lock and the vnode token held; the per-buffer work is
 * done by softdep_fsync_mountdev_bp().
 */
static int softdep_fsync_mountdev_bp(struct buf *bp, void *data);

void
softdep_fsync_mountdev(struct vnode *vp)
{
	if (!vn_isdisk(vp, NULL))
		panic("softdep_fsync_mountdev: vnode not a disk");
	ACQUIRE_LOCK(&lk);
	lwkt_gettoken(&vp->v_token);
	RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL,
		softdep_fsync_mountdev_bp, vp);
	lwkt_reltoken(&vp->v_token);
	drain_output(vp, 1);
	FREE_LOCK(&lk);
}

/*
 * RB_SCAN callback for softdep_fsync_mountdev().  `data' is the device
 * vnode.  Starts an asynchronous write of bp when its first dependency is
 * a D_BMSAFEMAP; every other buffer is skipped.  Always returns 0.
 * Entered and exited with the softdep lock held; the lock is dropped
 * around bawrite().
 */
static int
softdep_fsync_mountdev_bp(struct buf *bp, void *data)
{
	struct worklist *wk;
	struct vnode *vp = data;

	/* 
	 * If it is already scheduled, skip to the next buffer.
	 */
	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT))
		return(0);
	if (bp->b_vp != vp || (bp->b_flags & B_DELWRI) == 0) {
		BUF_UNLOCK(bp);
		kprintf("softdep_fsync_mountdev_bp: warning, buffer %p ripped out from under vnode %p\n", bp, vp);
		return(0);
	}
	/*
	 * We are only interested in bitmaps with outstanding
	 * dependencies.
	 */
	if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
	    wk->wk_type != D_BMSAFEMAP) {
		BUF_UNLOCK(bp);
		return(0);
	}
	bremfree(bp);
	FREE_LOCK(&lk);
	(void) bawrite(bp);
	ACQUIRE_LOCK(&lk);
	return(0);
}

/*
 * This routine is called when we are trying to synchronously flush a
 * file. This routine must eliminate any filesystem metadata dependencies
 * so that the syncing routine can succeed by pushing the dirty blocks
 * associated with the file. If any I/O errors occur, they are returned.
 */
/* Argument block handed to softdep_sync_metadata_bp() through RB_SCAN. */
struct softdep_sync_metadata_info {
	struct vnode *vp;	/* vnode whose dirty buffers are scanned */
	int waitfor;		/* MNT_NOWAIT on pass 1, MNT_WAIT on pass 2 */
};

static int softdep_sync_metadata_bp(struct buf *bp, void *data);

/*
 * Returns 0 on success or a positive error code.  The softdep lock is
 * taken internally and released before returning.  The td argument is not
 * referenced.
 */
int
softdep_sync_metadata(struct vnode *vp, struct thread *td)
{
	struct softdep_sync_metadata_info info;
	int error, waitfor;

	/*
	 * Check whether this vnode is involved in a filesystem
	 * that is doing soft dependency processing.
	 */
	if (!vn_isdisk(vp, NULL)) {
		if (!DOINGSOFTDEP(vp))
			return (0);
	} else
		if (vp->v_rdev->si_mountpoint == NULL ||
		    (vp->v_rdev->si_mountpoint->mnt_flag & MNT_SOFTDEP) == 0)
			return (0);
	/*
	 * Ensure that any direct block dependencies have been cleared.
	 */
	ACQUIRE_LOCK(&lk);
	if ((error = flush_inodedep_deps(VTOI(vp)->i_fs, VTOI(vp)->i_number))) {
		FREE_LOCK(&lk);
		return (error);
	}
	/*
	 * For most files, the only metadata dependencies are the
	 * cylinder group maps that allocate their inode or blocks.
	 * The block allocation dependencies can be found by traversing
	 * the dependency lists for any buffers that remain on their
	 * dirty buffer list. The inode allocation dependency will
	 * be resolved when the inode is updated with MNT_WAIT.
	 * This work is done in two passes. The first pass grabs most
	 * of the buffers and begins asynchronously writing them. The
	 * only way to wait for these asynchronous writes is to sleep
	 * on the filesystem vnode which may stay busy for a long time
	 * if the filesystem is active. So, instead, we make a second
	 * pass over the dependencies blocking on each write. In the
	 * usual case we will be blocking against a write that we
	 * initiated, so when it is done the dependency will have been
	 * resolved. Thus the second pass is expected to end quickly.
	 */
	waitfor = MNT_NOWAIT;
top:
	/*
	 * We must wait for any I/O in progress to finish so that
	 * all potential buffers on the dirty list will be visible.
	 */
	drain_output(vp, 1);

	info.vp = vp;
	info.waitfor = waitfor;
	lwkt_gettoken(&vp->v_token);
	error = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL, 
			softdep_sync_metadata_bp, &info);
	lwkt_reltoken(&vp->v_token);
	/* The callback reports I/O errors as negated error codes. */
	if (error < 0) {
		FREE_LOCK(&lk);
		return(-error);	/* error code */
	}

	/*
	 * The brief unlock is to allow any pent up dependency
	 * processing to be done.  Then proceed with the second pass.
	 */
	if (waitfor & MNT_NOWAIT) {
		waitfor = MNT_WAIT;
		FREE_LOCK(&lk);
		ACQUIRE_LOCK(&lk);
		goto top;
	}

	/*
	 * If we have managed to get rid of all the dirty buffers,
	 * then we are done. For certain directories and block
	 * devices, we may need to do further work.
	 *
	 * We must wait for any I/O in progress to finish so that
	 * all potential buffers on the dirty list will be visible.
	 */
	drain_output(vp, 1);
	if (RB_EMPTY(&vp->v_rbdirty_tree)) {
		FREE_LOCK(&lk);
		return (0);
	}

	FREE_LOCK(&lk);
	/*
	 * If we are trying to sync a block device, some of its buffers may
	 * contain metadata that cannot be written until the contents of some
	 * partially written files have been written to disk. The only easy
	 * way to accomplish this is to sync the entire filesystem (luckily
	 * this happens rarely).
	 */
	if (vn_isdisk(vp, NULL) && 
	    vp->v_rdev &&
	    vp->v_rdev->si_mountpoint && !vn_islocked(vp) &&
	    (error = VFS_SYNC(vp->v_rdev->si_mountpoint, MNT_WAIT)) != 0)
		return (error);
	return (0);
}

/*
 * RB_SCAN callback for softdep_sync_metadata().  `data' points to a
 * struct softdep_sync_metadata_info.  Walks the dependency list of one
 * dirty buffer, pushes out whatever each dependency is waiting on, and
 * then starts an asynchronous write of the buffer itself.
 *
 * Returns 0 when the buffer was processed, 1 when the buffer could not be
 * claimed or no longer belongs to the vnode, and a negated error code when
 * a write or flush fails (the caller tests for a negative scan result).
 * Entered and exited with the softdep lock held; the lock is dropped
 * around each bwrite()/bawrite().
 */
static int
softdep_sync_metadata_bp(struct buf *bp, void *data)
{
	struct softdep_sync_metadata_info *info = data;
	struct pagedep *pagedep;
	struct allocdirect *adp;
	struct allocindir *aip;
	struct worklist *wk;
	struct buf *nbp;
	int error;
	int i;

	if (getdirtybuf(&bp, MNT_WAIT) == 0) {
		kprintf("softdep_sync_metadata_bp(1): caught buf %p going away\n", bp);
		return (1);
	}
	if (bp->b_vp != info->vp || (bp->b_flags & B_DELWRI) == 0) {
		kprintf("softdep_sync_metadata_bp(2): caught buf %p going away vp %p\n", bp, info->vp);
		BUF_UNLOCK(bp);
		return(1);
	}

	/*
	 * As we hold the buffer locked, none of its dependencies
	 * will disappear.
	 */
	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
		switch (wk->wk_type) {

		case D_ALLOCDIRECT:
			adp = WK_ALLOCDIRECT(wk);
			if (adp->ad_state & DEPCOMPLETE)
				break;
			nbp = adp->ad_buf;
			if (getdirtybuf(&nbp, info->waitfor) == 0)
				break;
			FREE_LOCK(&lk);
			if (info->waitfor & MNT_NOWAIT) {
				bawrite(nbp);
			} else if ((error = bwrite(nbp)) != 0) {
				/* Hand bp back to the I/O system before bailing. */
				bawrite(bp);
				ACQUIRE_LOCK(&lk);
				return (-error);
			}
			ACQUIRE_LOCK(&lk);
			break;

		case D_ALLOCINDIR:
			aip = WK_ALLOCINDIR(wk);
			if (aip->ai_state & DEPCOMPLETE)
				break;
			nbp = aip->ai_buf;
			if (getdirtybuf(&nbp, info->waitfor) == 0)
				break;
			FREE_LOCK(&lk);
			if (info->waitfor & MNT_NOWAIT) {
				bawrite(nbp);
			} else if ((error = bwrite(nbp)) != 0) {
				bawrite(bp);
				ACQUIRE_LOCK(&lk);
				return (-error);
			}
			ACQUIRE_LOCK(&lk);
			break;

		case D_INDIRDEP:
		restart:

			/*
			 * The list is rescanned from the head after every
			 * attempt because the softdep lock is dropped while
			 * writing (and possibly while waiting for nbp).
			 */
			LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
				if (aip->ai_state & DEPCOMPLETE)
					continue;
				nbp = aip->ai_buf;
				if (getdirtybuf(&nbp, MNT_WAIT) == 0)
					goto restart;
				FREE_LOCK(&lk);
				if ((error = bwrite(nbp)) != 0) {
					bawrite(bp);
					ACQUIRE_LOCK(&lk);
					return (-error);
				}
				ACQUIRE_LOCK(&lk);
				goto restart;
			}
			break;

		case D_INODEDEP:
			if ((error = flush_inodedep_deps(WK_INODEDEP(wk)->id_fs,
			    WK_INODEDEP(wk)->id_ino)) != 0) {
				FREE_LOCK(&lk);
				bawrite(bp);
				ACQUIRE_LOCK(&lk);
				return (-error);
			}
			break;

		case D_PAGEDEP:
			/*
			 * We are trying to sync a directory that may
			 * have dependencies on both its own metadata
			 * and/or dependencies on the inodes of any
			 * recently allocated files. We walk its diradd
			 * lists pushing out the associated inode.
			 */
			pagedep = WK_PAGEDEP(wk);
			for (i = 0; i < DAHASHSZ; i++) {
				if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
					continue;
				if ((error =
				    flush_pagedep_deps(info->vp,
						pagedep->pd_mnt,
						&pagedep->pd_diraddhd[i]))) {
					FREE_LOCK(&lk);
					bawrite(bp);
					ACQUIRE_LOCK(&lk);
					return (-error);
				}
			}
			break;

		case D_MKDIR:
			/*
			 * This case should never happen if the vnode has
			 * been properly sync'ed. However, if this function
			 * is used at a place where the vnode has not yet
			 * been sync'ed, this dependency can show up. So,
			 * rather than panic, just flush it.
			 */
			nbp = WK_MKDIR(wk)->md_buf;
			if (getdirtybuf(&nbp, info->waitfor) == 0)
				break;
			FREE_LOCK(&lk);
			if (info->waitfor & MNT_NOWAIT) {
				bawrite(nbp);
			} else if ((error = bwrite(nbp)) != 0) {
				bawrite(bp);
				ACQUIRE_LOCK(&lk);
				return (-error);
			}
			ACQUIRE_LOCK(&lk);
			break;

		case D_BMSAFEMAP:
			/*
			 * This case should never happen if the vnode has
			 * been properly sync'ed. However, if this function
			 * is used at a place where the vnode has not yet
			 * been sync'ed, this dependency can show up. So,
			 * rather than panic, just flush it.
			 *
			 * nbp can wind up == bp if a device node for the
			 * same filesystem is being fsynced at the same time,
			 * leading to a panic if we don't catch the case.
			 */
			nbp = WK_BMSAFEMAP(wk)->sm_buf;
			if (nbp == bp)
				break;
			if (getdirtybuf(&nbp, info->waitfor) == 0)
				break;
			FREE_LOCK(&lk);
			if (info->waitfor & MNT_NOWAIT) {
				bawrite(nbp);
			} else if ((error = bwrite(nbp)) != 0) {
				bawrite(bp);
				ACQUIRE_LOCK(&lk);
				return (-error);
			}
			ACQUIRE_LOCK(&lk);
			break;

		default:
			panic("softdep_sync_metadata: Unknown type %s",
			    TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
	}
	/* All dependencies handled; start the write of the buffer itself. */
	FREE_LOCK(&lk);
	bawrite(bp);
	ACQUIRE_LOCK(&lk);
	return(0);
}

/*
 * Flush the dependencies associated with an inodedep.
 * Called with splbio blocked.
 *
 * Entered with the softdep lock held and returns with it held on every
 * path; the lock is dropped around each write and briefly at the top of
 * every loop iteration.  Returns 0 on success or the error from bwrite().
 */
static int
flush_inodedep_deps(struct fs *fs, ino_t ino)
{
	struct inodedep *inodedep;
	struct allocdirect *adp;
	int error, waitfor;
	struct buf *bp;

	/*
	 * This work is done in two passes. The first pass grabs most
	 * of the buffers and begins asynchronously writing them. The
	 * only way to wait for these asynchronous writes is to sleep
	 * on the filesystem vnode which may stay busy for a long time
	 * if the filesystem is active. So, instead, we make a second
	 * pass over the dependencies blocking on each write. In the
	 * usual case we will be blocking against a write that we
	 * initiated, so when it is done the dependency will have been
	 * resolved. Thus the second pass is expected to end quickly.
	 * We give a brief window at the top of the loop to allow
	 * any pending I/O to complete.
	 */
	for (waitfor = MNT_NOWAIT; ; ) {
		FREE_LOCK(&lk);
		ACQUIRE_LOCK(&lk);
		if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
			return (0);
		TAILQ_FOREACH(adp, &inodedep->id_inoupdt, ad_next) {
			if (adp->ad_state & DEPCOMPLETE)
				continue;
			bp = adp->ad_buf;
			if (getdirtybuf(&bp, waitfor) == 0) {
				if (waitfor & MNT_NOWAIT)
					continue;
				break;
			}
			FREE_LOCK(&lk);
			if (waitfor & MNT_NOWAIT) {
				bawrite(bp);
			} else if ((error = bwrite(bp)) != 0) {
				ACQUIRE_LOCK(&lk);
				return (error);
			}
			ACQUIRE_LOCK(&lk);
			break;
		}
		/*
		 * A non-NULL adp means the scan above stopped early, so the
		 * whole pass is restarted from the top.
		 */
		if (adp != NULL)
			continue;
		TAILQ_FOREACH(adp, &inodedep->id_newinoupdt, ad_next) {
			if (adp->ad_state & DEPCOMPLETE)
				continue;
			bp = adp->ad_buf;
			if (getdirtybuf(&bp, waitfor) == 0) {
				if (waitfor & MNT_NOWAIT)
					continue;
				break;
			}
			FREE_LOCK(&lk);
			if (waitfor & MNT_NOWAIT) {
				bawrite(bp);
			} else if ((error = bwrite(bp)) != 0) {
				ACQUIRE_LOCK(&lk);
				return (error);
			}
			ACQUIRE_LOCK(&lk);
			break;
		}
		if (adp != NULL)
			continue;
		/*
		 * If pass2, we are done, otherwise do pass 2.
		 */
		if (waitfor == MNT_WAIT)
			break;
		waitfor = MNT_WAIT;
	}
	/*
	 * Try freeing inodedep in case all dependencies have been removed.
	 */
	if (inodedep_lookup(fs, ino, 0, &inodedep) != 0)
		(void) free_inodedep(inodedep);
	return (0);
}

/*
 * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
 * Called with splbio blocked.
4482 */ 4483 static int 4484 flush_pagedep_deps(struct vnode *pvp, struct mount *mp, 4485 struct diraddhd *diraddhdp) 4486 { 4487 struct inodedep *inodedep; 4488 struct ufsmount *ump; 4489 struct diradd *dap; 4490 struct vnode *vp; 4491 int gotit, error = 0; 4492 struct buf *bp; 4493 ino_t inum; 4494 4495 ump = VFSTOUFS(mp); 4496 while ((dap = LIST_FIRST(diraddhdp)) != NULL) { 4497 /* 4498 * Flush ourselves if this directory entry 4499 * has a MKDIR_PARENT dependency. 4500 */ 4501 if (dap->da_state & MKDIR_PARENT) { 4502 FREE_LOCK(&lk); 4503 if ((error = ffs_update(pvp, 1)) != 0) 4504 break; 4505 ACQUIRE_LOCK(&lk); 4506 /* 4507 * If that cleared dependencies, go on to next. 4508 */ 4509 if (dap != LIST_FIRST(diraddhdp)) 4510 continue; 4511 if (dap->da_state & MKDIR_PARENT) { 4512 panic("flush_pagedep_deps: MKDIR_PARENT"); 4513 } 4514 } 4515 /* 4516 * A newly allocated directory must have its "." and 4517 * ".." entries written out before its name can be 4518 * committed in its parent. We do not want or need 4519 * the full semantics of a synchronous VOP_FSYNC as 4520 * that may end up here again, once for each directory 4521 * level in the filesystem. Instead, we push the blocks 4522 * and wait for them to clear. We have to fsync twice 4523 * because the first call may choose to defer blocks 4524 * that still have dependencies, but deferral will 4525 * happen at most once. 4526 */ 4527 inum = dap->da_newinum; 4528 if (dap->da_state & MKDIR_BODY) { 4529 FREE_LOCK(&lk); 4530 if ((error = VFS_VGET(mp, NULL, inum, &vp)) != 0) 4531 break; 4532 if ((error=VOP_FSYNC(vp, MNT_NOWAIT, 0)) || 4533 (error=VOP_FSYNC(vp, MNT_NOWAIT, 0))) { 4534 vput(vp); 4535 break; 4536 } 4537 drain_output(vp, 0); 4538 vput(vp); 4539 ACQUIRE_LOCK(&lk); 4540 /* 4541 * If that cleared dependencies, go on to next. 
4542 */ 4543 if (dap != LIST_FIRST(diraddhdp)) 4544 continue; 4545 if (dap->da_state & MKDIR_BODY) { 4546 panic("flush_pagedep_deps: MKDIR_BODY"); 4547 } 4548 } 4549 /* 4550 * Flush the inode on which the directory entry depends. 4551 * Having accounted for MKDIR_PARENT and MKDIR_BODY above, 4552 * the only remaining dependency is that the updated inode 4553 * count must get pushed to disk. The inode has already 4554 * been pushed into its inode buffer (via VOP_UPDATE) at 4555 * the time of the reference count change. So we need only 4556 * locate that buffer, ensure that there will be no rollback 4557 * caused by a bitmap dependency, then write the inode buffer. 4558 */ 4559 if (inodedep_lookup(ump->um_fs, inum, 0, &inodedep) == 0) { 4560 panic("flush_pagedep_deps: lost inode"); 4561 } 4562 /* 4563 * If the inode still has bitmap dependencies, 4564 * push them to disk. 4565 */ 4566 if ((inodedep->id_state & DEPCOMPLETE) == 0) { 4567 gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT); 4568 FREE_LOCK(&lk); 4569 if (gotit && (error = bwrite(inodedep->id_buf)) != 0) 4570 break; 4571 ACQUIRE_LOCK(&lk); 4572 if (dap != LIST_FIRST(diraddhdp)) 4573 continue; 4574 } 4575 /* 4576 * If the inode is still sitting in a buffer waiting 4577 * to be written, push it to disk. 4578 */ 4579 FREE_LOCK(&lk); 4580 if ((error = bread(ump->um_devvp, 4581 fsbtodoff(ump->um_fs, ino_to_fsba(ump->um_fs, inum)), 4582 (int)ump->um_fs->fs_bsize, &bp)) != 0) 4583 break; 4584 if ((error = bwrite(bp)) != 0) 4585 break; 4586 ACQUIRE_LOCK(&lk); 4587 /* 4588 * If we have failed to get rid of all the dependencies 4589 * then something is seriously wrong. 4590 */ 4591 if (dap == LIST_FIRST(diraddhdp)) { 4592 panic("flush_pagedep_deps: flush failed"); 4593 } 4594 } 4595 if (error) 4596 ACQUIRE_LOCK(&lk); 4597 return (error); 4598 } 4599 4600 /* 4601 * A large burst of file addition or deletion activity can drive the 4602 * memory load excessively high. 
First attempt to slow things down 4603 * using the techniques below. If that fails, this routine requests 4604 * the offending operations to fall back to running synchronously 4605 * until the memory load returns to a reasonable level. 4606 */ 4607 int 4608 softdep_slowdown(struct vnode *vp) 4609 { 4610 int max_softdeps_hard; 4611 4612 max_softdeps_hard = max_softdeps * 11 / 10; 4613 if (num_dirrem < max_softdeps_hard / 2 && 4614 num_inodedep < max_softdeps_hard) 4615 return (0); 4616 stat_sync_limit_hit += 1; 4617 return (1); 4618 } 4619 4620 /* 4621 * If memory utilization has gotten too high, deliberately slow things 4622 * down and speed up the I/O processing. 4623 */ 4624 static int 4625 request_cleanup(int resource, int islocked) 4626 { 4627 struct thread *td = curthread; /* XXX */ 4628 4629 /* 4630 * We never hold up the filesystem syncer process. 4631 */ 4632 if (td == filesys_syncer) 4633 return (0); 4634 /* 4635 * First check to see if the work list has gotten backlogged. 4636 * If it has, co-opt this process to help clean up two entries. 4637 * Because this process may hold inodes locked, we cannot 4638 * handle any remove requests that might block on a locked 4639 * inode as that could lead to deadlock. 4640 */ 4641 if (num_on_worklist > max_softdeps / 10) { 4642 process_worklist_item(NULL, LK_NOWAIT); 4643 process_worklist_item(NULL, LK_NOWAIT); 4644 stat_worklist_push += 2; 4645 return(1); 4646 } 4647 4648 /* 4649 * If we are resource constrained on inode dependencies, try 4650 * flushing some dirty inodes. Otherwise, we are constrained 4651 * by file deletions, so try accelerating flushes of directories 4652 * with removal dependencies. We would like to do the cleanup 4653 * here, but we probably hold an inode locked at this point and 4654 * that might deadlock against one that we try to clean. So, 4655 * the best that we can do is request the syncer daemon to do 4656 * the cleanup for us. 
4657 */ 4658 switch (resource) { 4659 4660 case FLUSH_INODES: 4661 stat_ino_limit_push += 1; 4662 req_clear_inodedeps += 1; 4663 stat_countp = &stat_ino_limit_hit; 4664 break; 4665 4666 case FLUSH_REMOVE: 4667 stat_blk_limit_push += 1; 4668 req_clear_remove += 1; 4669 stat_countp = &stat_blk_limit_hit; 4670 break; 4671 4672 default: 4673 panic("request_cleanup: unknown type"); 4674 } 4675 /* 4676 * Hopefully the syncer daemon will catch up and awaken us. 4677 * We wait at most tickdelay before proceeding in any case. 4678 */ 4679 if (islocked == 0) 4680 ACQUIRE_LOCK(&lk); 4681 lksleep(&proc_waiting, &lk, 0, "softupdate", 4682 tickdelay > 2 ? tickdelay : 2); 4683 if (islocked == 0) 4684 FREE_LOCK(&lk); 4685 return (1); 4686 } 4687 4688 /* 4689 * Flush out a directory with at least one removal dependency in an effort to 4690 * reduce the number of dirrem, freefile, and freeblks dependency structures. 4691 */ 4692 static void 4693 clear_remove(struct thread *td) 4694 { 4695 struct pagedep_hashhead *pagedephd; 4696 struct pagedep *pagedep; 4697 static int next = 0; 4698 struct mount *mp; 4699 struct vnode *vp; 4700 int error, cnt; 4701 ino_t ino; 4702 4703 ACQUIRE_LOCK(&lk); 4704 for (cnt = 0; cnt < pagedep_hash; cnt++) { 4705 pagedephd = &pagedep_hashtbl[next++]; 4706 if (next >= pagedep_hash) 4707 next = 0; 4708 LIST_FOREACH(pagedep, pagedephd, pd_hash) { 4709 if (LIST_FIRST(&pagedep->pd_dirremhd) == NULL) 4710 continue; 4711 mp = pagedep->pd_mnt; 4712 ino = pagedep->pd_ino; 4713 FREE_LOCK(&lk); 4714 if ((error = VFS_VGET(mp, NULL, ino, &vp)) != 0) { 4715 softdep_error("clear_remove: vget", error); 4716 return; 4717 } 4718 if ((error = VOP_FSYNC(vp, MNT_NOWAIT, 0))) 4719 softdep_error("clear_remove: fsync", error); 4720 drain_output(vp, 0); 4721 vput(vp); 4722 return; 4723 } 4724 } 4725 FREE_LOCK(&lk); 4726 } 4727 4728 /* 4729 * Clear out a block of dirty inodes in an effort to reduce 4730 * the number of inodedep dependency structures. 
4731 */ 4732 struct clear_inodedeps_info { 4733 struct fs *fs; 4734 struct mount *mp; 4735 }; 4736 4737 static int 4738 clear_inodedeps_mountlist_callback(struct mount *mp, void *data) 4739 { 4740 struct clear_inodedeps_info *info = data; 4741 4742 if ((mp->mnt_flag & MNT_SOFTDEP) && info->fs == VFSTOUFS(mp)->um_fs) { 4743 info->mp = mp; 4744 return(-1); 4745 } 4746 return(0); 4747 } 4748 4749 static void 4750 clear_inodedeps(struct thread *td) 4751 { 4752 struct clear_inodedeps_info info; 4753 struct inodedep_hashhead *inodedephd; 4754 struct inodedep *inodedep; 4755 static int next = 0; 4756 struct vnode *vp; 4757 struct fs *fs; 4758 int error, cnt; 4759 ino_t firstino, lastino, ino; 4760 4761 ACQUIRE_LOCK(&lk); 4762 /* 4763 * Pick a random inode dependency to be cleared. 4764 * We will then gather up all the inodes in its block 4765 * that have dependencies and flush them out. 4766 */ 4767 for (cnt = 0; cnt < inodedep_hash; cnt++) { 4768 inodedephd = &inodedep_hashtbl[next++]; 4769 if (next >= inodedep_hash) 4770 next = 0; 4771 if ((inodedep = LIST_FIRST(inodedephd)) != NULL) 4772 break; 4773 } 4774 if (inodedep == NULL) { 4775 FREE_LOCK(&lk); 4776 return; 4777 } 4778 /* 4779 * Ugly code to find mount point given pointer to superblock. 4780 */ 4781 fs = inodedep->id_fs; 4782 info.mp = NULL; 4783 info.fs = fs; 4784 mountlist_scan(clear_inodedeps_mountlist_callback, 4785 &info, MNTSCAN_FORWARD|MNTSCAN_NOBUSY); 4786 /* 4787 * Find the last inode in the block with dependencies. 4788 */ 4789 firstino = inodedep->id_ino & ~(INOPB(fs) - 1); 4790 for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--) 4791 if (inodedep_lookup(fs, lastino, 0, &inodedep) != 0) 4792 break; 4793 /* 4794 * Asynchronously push all but the last inode with dependencies. 4795 * Synchronously push the last inode with dependencies to ensure 4796 * that the inode block gets written to free up the inodedeps. 
4797 */ 4798 for (ino = firstino; ino <= lastino; ino++) { 4799 if (inodedep_lookup(fs, ino, 0, &inodedep) == 0) 4800 continue; 4801 FREE_LOCK(&lk); 4802 if ((error = VFS_VGET(info.mp, NULL, ino, &vp)) != 0) { 4803 softdep_error("clear_inodedeps: vget", error); 4804 return; 4805 } 4806 if (ino == lastino) { 4807 if ((error = VOP_FSYNC(vp, MNT_WAIT, 0))) 4808 softdep_error("clear_inodedeps: fsync1", error); 4809 } else { 4810 if ((error = VOP_FSYNC(vp, MNT_NOWAIT, 0))) 4811 softdep_error("clear_inodedeps: fsync2", error); 4812 drain_output(vp, 0); 4813 } 4814 vput(vp); 4815 ACQUIRE_LOCK(&lk); 4816 } 4817 FREE_LOCK(&lk); 4818 } 4819 4820 /* 4821 * Function to determine if the buffer has outstanding dependencies 4822 * that will cause a roll-back if the buffer is written. If wantcount 4823 * is set, return number of dependencies, otherwise just yes or no. 4824 * 4825 * bioops callback - hold io_token 4826 */ 4827 static int 4828 softdep_count_dependencies(struct buf *bp, int wantcount) 4829 { 4830 struct worklist *wk; 4831 struct inodedep *inodedep; 4832 struct indirdep *indirdep; 4833 struct allocindir *aip; 4834 struct pagedep *pagedep; 4835 struct diradd *dap; 4836 int i, retval; 4837 4838 retval = 0; 4839 ACQUIRE_LOCK(&lk); 4840 4841 LIST_FOREACH(wk, &bp->b_dep, wk_list) { 4842 switch (wk->wk_type) { 4843 4844 case D_INODEDEP: 4845 inodedep = WK_INODEDEP(wk); 4846 if ((inodedep->id_state & DEPCOMPLETE) == 0) { 4847 /* bitmap allocation dependency */ 4848 retval += 1; 4849 if (!wantcount) 4850 goto out; 4851 } 4852 if (TAILQ_FIRST(&inodedep->id_inoupdt)) { 4853 /* direct block pointer dependency */ 4854 retval += 1; 4855 if (!wantcount) 4856 goto out; 4857 } 4858 continue; 4859 4860 case D_INDIRDEP: 4861 indirdep = WK_INDIRDEP(wk); 4862 4863 LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) { 4864 /* indirect block pointer dependency */ 4865 retval += 1; 4866 if (!wantcount) 4867 goto out; 4868 } 4869 continue; 4870 4871 case D_PAGEDEP: 4872 pagedep = 
WK_PAGEDEP(wk); 4873 for (i = 0; i < DAHASHSZ; i++) { 4874 4875 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) { 4876 /* directory entry dependency */ 4877 retval += 1; 4878 if (!wantcount) 4879 goto out; 4880 } 4881 } 4882 continue; 4883 4884 case D_BMSAFEMAP: 4885 case D_ALLOCDIRECT: 4886 case D_ALLOCINDIR: 4887 case D_MKDIR: 4888 /* never a dependency on these blocks */ 4889 continue; 4890 4891 default: 4892 panic("softdep_check_for_rollback: Unexpected type %s", 4893 TYPENAME(wk->wk_type)); 4894 /* NOTREACHED */ 4895 } 4896 } 4897 out: 4898 FREE_LOCK(&lk); 4899 4900 return retval; 4901 } 4902 4903 /* 4904 * getdirtybuf: 4905 * 4906 * Acquire exclusive access to a buffer. Requires softdep lock 4907 * to be held on entry. If waitfor is MNT_WAIT, may release/reacquire 4908 * softdep lock. 4909 * 4910 * Returns 1 if the buffer was locked, 0 otherwise. 4911 */ 4912 static int 4913 getdirtybuf(struct buf **bpp, int waitfor) 4914 { 4915 struct buf *bp; 4916 int error; 4917 4918 bp = *bpp; 4919 if (bp == NULL) 4920 return (0); 4921 4922 for (;;) { 4923 /* Must acquire buffer lock with ffs_softdep lock held */ 4924 KKASSERT(lock_held(&lk) > 0); 4925 error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT); 4926 if (error == 0) 4927 break; 4928 4929 if (waitfor != MNT_WAIT) 4930 return (0); 4931 4932 /* 4933 * Release ffs_softdep lock around sleep/wait for buffer lock. 4934 * 4935 * We must acquire buffer lock with softdep lock held, so 4936 * we must retry locking the buffer after we wake. 4937 */ 4938 FREE_LOCK(&lk); 4939 error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL); 4940 ACQUIRE_LOCK(&lk); 4941 if (error == 0) 4942 BUF_UNLOCK(bp); 4943 else if (error == ENOLCK) 4944 ; 4945 else 4946 panic("getdirtybuf: Inconsistent lock"); 4947 } 4948 4949 /* Buffer wasn't dirty */ 4950 if ((bp->b_flags & B_DELWRI) == 0) { 4951 BUF_UNLOCK(bp); 4952 return (0); 4953 } 4954 bremfree(bp); 4955 return (1); 4956 } 4957 4958 /* 4959 * Wait for pending output on a vnode to complete. 
4960 * Must be called with vnode locked. 4961 */ 4962 static void 4963 drain_output(struct vnode *vp, int islocked) 4964 { 4965 4966 if (!islocked) 4967 ACQUIRE_LOCK(&lk); 4968 while (bio_track_active(&vp->v_track_write)) { 4969 FREE_LOCK(&lk); 4970 bio_track_wait(&vp->v_track_write, 0, 0); 4971 ACQUIRE_LOCK(&lk); 4972 } 4973 if (!islocked) 4974 FREE_LOCK(&lk); 4975 } 4976 4977 /* 4978 * Called whenever a buffer that is being invalidated or reallocated 4979 * contains dependencies. This should only happen if an I/O error has 4980 * occurred. The routine is called with the buffer locked. 4981 * 4982 * bioops callback - hold io_token 4983 */ 4984 static void 4985 softdep_deallocate_dependencies(struct buf *bp) 4986 { 4987 /* nothing to do, mp lock not needed */ 4988 if ((bp->b_flags & B_ERROR) == 0) 4989 panic("softdep_deallocate_dependencies: dangling deps"); 4990 softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntfromname, bp->b_error); 4991 panic("softdep_deallocate_dependencies: unrecovered I/O error"); 4992 } 4993 4994 /* 4995 * Function to handle asynchronous write errors in the filesystem. 4996 */ 4997 void 4998 softdep_error(char *func, int error) 4999 { 5000 /* XXX should do something better! */ 5001 kprintf("%s: got error %d while accessing filesystem\n", func, error); 5002 } 5003