xref: /dragonfly/sys/vfs/ufs/ffs_softdep.c (revision 3ff63cda)
1 /*
2  * Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved.
3  *
4  * The soft updates code is derived from the appendix of a University
5  * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
6  * "Soft Updates: A Solution to the Metadata Update Problem in File
7  * Systems", CSE-TR-254-95, August 1995).
8  *
9  * Further information about soft updates can be obtained from:
10  *
11  *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
12  *	1614 Oxford Street		mckusick@mckusick.com
13  *	Berkeley, CA 94709-1608		+1-510-843-9542
14  *	USA
15  *
16  * Redistribution and use in source and binary forms, with or without
17  * modification, are permitted provided that the following conditions
18  * are met:
19  *
20  * 1. Redistributions of source code must retain the above copyright
21  *    notice, this list of conditions and the following disclaimer.
22  * 2. Redistributions in binary form must reproduce the above copyright
23  *    notice, this list of conditions and the following disclaimer in the
24  *    documentation and/or other materials provided with the distribution.
25  *
26  * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
27  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
28  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
29  * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
30  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	from: @(#)ffs_softdep.c	9.59 (McKusick) 6/21/00
39  * $FreeBSD: src/sys/ufs/ffs/ffs_softdep.c,v 1.57.2.11 2002/02/05 18:46:53 dillon Exp $
40  */
41 
42 /*
43  * For now we want the safety net that the DIAGNOSTIC and DEBUG flags provide.
44  */
45 #ifndef DIAGNOSTIC
46 #define DIAGNOSTIC
47 #endif
48 #ifndef DEBUG
49 #define DEBUG
50 #endif
51 
52 #include <sys/param.h>
53 #include <sys/kernel.h>
54 #include <sys/systm.h>
55 #include <sys/buf.h>
56 #include <sys/malloc.h>
57 #include <sys/mount.h>
58 #include <sys/proc.h>
59 #include <sys/syslog.h>
60 #include <sys/vnode.h>
61 #include <sys/conf.h>
62 #include <machine/inttypes.h>
63 #include "dir.h"
64 #include "quota.h"
65 #include "inode.h"
66 #include "ufsmount.h"
67 #include "fs.h"
68 #include "softdep.h"
69 #include "ffs_extern.h"
70 #include "ufs_extern.h"
71 
72 #include <sys/buf2.h>
73 #include <sys/lock.h>
74 
75 /*
76  * These definitions need to be adapted to the system to which
77  * this file is being ported.
78  */
79 /*
80  * malloc types defined for the softdep system.
81  */
82 MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies");
83 MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies");
84 MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation");
85 MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map");
86 MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode");
87 MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies");
88 MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block");
89 MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode");
90 MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode");
91 MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated");
92 MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry");
93 MALLOC_DEFINE(M_MKDIR, "mkdir","New directory");
94 MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted");
95 
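/*
 * Allocation flags used for the softdep kmalloc calls below: allocations
 * may block (M_WAITOK) and are permitted to dip into the reserve pool
 * (M_USE_RESERVE), since they can occur in the middle of I/O operations.
 */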
96 #define M_SOFTDEP_FLAGS		(M_WAITOK | M_USE_RESERVE)
97 
98 #define	D_PAGEDEP	0
99 #define	D_INODEDEP	1
100 #define	D_NEWBLK	2
101 #define	D_BMSAFEMAP	3
102 #define	D_ALLOCDIRECT	4
103 #define	D_INDIRDEP	5
104 #define	D_ALLOCINDIR	6
105 #define	D_FREEFRAG	7
106 #define	D_FREEBLKS	8
107 #define	D_FREEFILE	9
108 #define	D_DIRADD	10
109 #define	D_MKDIR		11
110 #define	D_DIRREM	12
111 #define D_LAST		D_DIRREM
112 
113 /*
114  * translate from workitem type to memory type
115  * MUST match the defines above, such that memtype[D_XXX] == M_XXX
116  */
117 static struct malloc_type *memtype[] = {
118 	M_PAGEDEP,
119 	M_INODEDEP,
120 	M_NEWBLK,
121 	M_BMSAFEMAP,
122 	M_ALLOCDIRECT,
123 	M_INDIRDEP,
124 	M_ALLOCINDIR,
125 	M_FREEFRAG,
126 	M_FREEBLKS,
127 	M_FREEFILE,
128 	M_DIRADD,
129 	M_MKDIR,
130 	M_DIRREM
131 };
132 
133 #define DtoM(type) (memtype[type])
134 
135 /*
136  * Names of malloc types.
137  */
138 #define TYPENAME(type)  \
139 	((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
140 /*
141  * End system adaptation definitions.
142  */
143 
144 /*
145  * Internal function prototypes.
146  */
147 static	void softdep_error(char *, int);
148 static	void drain_output(struct vnode *, int);
149 static	int getdirtybuf(struct buf **, int);
150 static	void clear_remove(struct thread *);
151 static	void clear_inodedeps(struct thread *);
152 static	int flush_pagedep_deps(struct vnode *, struct mount *,
153 	    struct diraddhd *);
154 static	int flush_inodedep_deps(struct fs *, ino_t);
155 static	int handle_written_filepage(struct pagedep *, struct buf *);
156 static  void diradd_inode_written(struct diradd *, struct inodedep *);
157 static	int handle_written_inodeblock(struct inodedep *, struct buf *);
158 static	void handle_allocdirect_partdone(struct allocdirect *);
159 static	void handle_allocindir_partdone(struct allocindir *);
160 static	void initiate_write_filepage(struct pagedep *, struct buf *);
161 static	void handle_written_mkdir(struct mkdir *, int);
162 static	void initiate_write_inodeblock(struct inodedep *, struct buf *);
163 static	void handle_workitem_freefile(struct freefile *);
164 static	void handle_workitem_remove(struct dirrem *);
165 static	struct dirrem *newdirrem(struct buf *, struct inode *,
166 	    struct inode *, int, struct dirrem **);
167 static	void free_diradd(struct diradd *);
168 static	void free_allocindir(struct allocindir *, struct inodedep *);
169 static	int indir_trunc (struct inode *, off_t, int, ufs_lbn_t, long *);
170 static	void deallocate_dependencies(struct buf *, struct inodedep *);
171 static	void free_allocdirect(struct allocdirectlst *,
172 	    struct allocdirect *, int);
173 static	int check_inode_unwritten(struct inodedep *);
174 static	int free_inodedep(struct inodedep *);
175 static	void handle_workitem_freeblocks(struct freeblks *);
176 static	void merge_inode_lists(struct inodedep *);
177 static	void setup_allocindir_phase2(struct buf *, struct inode *,
178 	    struct allocindir *);
179 static	struct allocindir *newallocindir(struct inode *, int, ufs_daddr_t,
180 	    ufs_daddr_t);
181 static	void handle_workitem_freefrag(struct freefrag *);
182 static	struct freefrag *newfreefrag(struct inode *, ufs_daddr_t, long);
183 static	void allocdirect_merge(struct allocdirectlst *,
184 	    struct allocdirect *, struct allocdirect *);
185 static	struct bmsafemap *bmsafemap_lookup(struct buf *);
186 static	int newblk_lookup(struct fs *, ufs_daddr_t, int,
187 	    struct newblk **);
188 static	int inodedep_lookup(struct fs *, ino_t, int, struct inodedep **);
189 static	int pagedep_lookup(struct inode *, ufs_lbn_t, int,
190 	    struct pagedep **);
191 static	int request_cleanup(int);
192 static	int process_worklist_item(struct mount *, int);
193 static	void add_to_worklist(struct worklist *);
194 
195 /*
196  * Exported softdep operations.
197  */
198 static	void softdep_disk_io_initiation(struct buf *);
199 static	void softdep_disk_write_complete(struct buf *);
200 static	void softdep_deallocate_dependencies(struct buf *);
201 static	int softdep_fsync(struct vnode *);
202 static	int softdep_process_worklist(struct mount *);
203 static	void softdep_move_dependencies(struct buf *, struct buf *);
204 static	int softdep_count_dependencies(struct buf *bp, int);
205 static  int softdep_checkread(struct buf *bp);
206 static  int softdep_checkwrite(struct buf *bp);
207 
208 static struct bio_ops softdep_bioops = {
209 	.io_start = softdep_disk_io_initiation,
210 	.io_complete = softdep_disk_write_complete,
211 	.io_deallocate = softdep_deallocate_dependencies,
212 	.io_fsync = softdep_fsync,
213 	.io_sync = softdep_process_worklist,
214 	.io_movedeps = softdep_move_dependencies,
215 	.io_countdeps = softdep_count_dependencies,
216 	.io_checkread = softdep_checkread,
217 	.io_checkwrite = softdep_checkwrite
218 };
219 
220 /*
221  * Locking primitives.
222  */
223 static	void acquire_lock(struct lock *);
224 static	void free_lock(struct lock *);
225 #ifdef INVARIANTS
226 static	int lock_held(struct lock *);
227 #endif
228 
229 static struct lock lk;
230 
231 #define ACQUIRE_LOCK(lkp)		acquire_lock(lkp)
232 #define FREE_LOCK(lkp)			free_lock(lkp)
233 
234 static void
235 acquire_lock(struct lock *lkp)
236 {
237 	lockmgr(lkp, LK_EXCLUSIVE);
238 }
239 
240 static void
241 free_lock(struct lock *lkp)
242 {
243 	lockmgr(lkp, LK_RELEASE);
244 }
245 
246 #ifdef INVARIANTS
247 static int
248 lock_held(struct lock *lkp)
249 {
250 	return lockinuse(lkp);
251 }
252 #endif
253 
254 /*
255  * Place holder for real semaphores.
256  */
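/*
 * sema.value encodes the state: 0 means free, 1 means held, and a value
 * greater than 1 means held with waiters recorded.  sema_get() returns 1
 * when ownership was obtained and 0 when it had to sleep, in which case
 * the caller is expected to redo its lookup and try again.
 */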
257 struct sema {
258 	int	value;
259 	thread_t holder;
260 	char	*name;
261 	int	timo;
262 	struct spinlock spin;
263 };
264 static	void sema_init(struct sema *, char *, int);
265 static	int sema_get(struct sema *, struct lock *);
266 static	void sema_release(struct sema *, struct lock *);
267 
268 #define NOHOLDER	((struct thread *) -1)
269 
270 static void
271 sema_init(struct sema *semap, char *name, int timo)
272 {
273 	semap->holder = NOHOLDER;
274 	semap->value = 0;
275 	semap->name = name;
276 	semap->timo = timo;
277 	spin_init(&semap->spin, "ufssema");
278 }
279 
280 /*
281  * Obtain exclusive access, semaphore is protected by the interlock.
282  * If interlock is NULL we must protect the semaphore ourselves.
283  */
284 static int
285 sema_get(struct sema *semap, struct lock *interlock)
286 {
287 	int rv;
288 
289 	if (interlock) {
290 		if (semap->value > 0) {
291 			++semap->value;		/* serves as wakeup flag */
292 			lksleep(semap, interlock, 0,
293 				semap->name, semap->timo);
294 			rv = 0;
295 		} else {
296 			semap->value = 1;	/* serves as owned flag */
297 			semap->holder = curthread;
298 			rv = 1;
299 		}
300 	} else {
301 		spin_lock(&semap->spin);
302 		if (semap->value > 0) {
303 			++semap->value;		/* serves as wakeup flag */
304 			ssleep(semap, &semap->spin, 0,
305 				semap->name, semap->timo);
306 			spin_unlock(&semap->spin);
307 			rv = 0;
308 		} else {
309 			semap->value = 1;	/* serves as owned flag */
310 			semap->holder = curthread;
311 			spin_unlock(&semap->spin);
312 			rv = 1;
313 		}
314 	}
315 	return (rv);
316 }
317 
318 static void
319 sema_release(struct sema *semap, struct lock *lk)
320 {
321 	if (semap->value <= 0 || semap->holder != curthread)
322 		panic("sema_release: not held");
323 	if (lk) {
324 		semap->holder = NOHOLDER;
325 		if (--semap->value > 0) {
326 			semap->value = 0;
327 			wakeup(semap);
328 		}
329 	} else {
330 		spin_lock(&semap->spin);
331 		semap->holder = NOHOLDER;
332 		if (--semap->value > 0) {
333 			semap->value = 0;
334 			spin_unlock(&semap->spin);
335 			wakeup(semap);
336 		} else {
337 			spin_unlock(&semap->spin);
338 		}
339 	}
340 }
341 
342 /*
343  * Worklist queue management.
344  * These routines require that the lock be held.
345  */
346 static	void worklist_insert(struct workhead *, struct worklist *);
347 static	void worklist_remove(struct worklist *);
348 static	void workitem_free(struct worklist *, int);
349 
350 #define WORKLIST_INSERT_BP(bp, item) do {	\
351 	(bp)->b_ops = &softdep_bioops;		\
352 	worklist_insert(&(bp)->b_dep, item);	\
353 } while (0)
354 
355 #define WORKLIST_INSERT(head, item) worklist_insert(head, item)
356 #define WORKLIST_REMOVE(item) worklist_remove(item)
357 #define WORKITEM_FREE(item, type) workitem_free((struct worklist *)item, type)
358 
359 static void
360 worklist_insert(struct workhead *head, struct worklist *item)
361 {
362 	KKASSERT(lock_held(&lk));
363 
364 	if (item->wk_state & ONWORKLIST) {
365 		panic("worklist_insert: already on list");
366 	}
367 	item->wk_state |= ONWORKLIST;
368 	LIST_INSERT_HEAD(head, item, wk_list);
369 }
370 
371 static void
372 worklist_remove(struct worklist *item)
373 {
374 
375 	KKASSERT(lock_held(&lk));
376 	if ((item->wk_state & ONWORKLIST) == 0)
377 		panic("worklist_remove: not on list");
378 
379 	item->wk_state &= ~ONWORKLIST;
380 	LIST_REMOVE(item, wk_list);
381 }
382 
383 static void
384 workitem_free(struct worklist *item, int type)
385 {
386 
387 	if (item->wk_state & ONWORKLIST)
388 		panic("workitem_free: still on list");
389 	if (item->wk_type != type)
390 		panic("workitem_free: type mismatch");
391 
392 	kfree(item, DtoM(type));
393 }
394 
395 /*
396  * Workitem queue management
397  */
398 static struct workhead softdep_workitem_pending;
399 static int num_on_worklist;	/* number of worklist items to be processed */
400 static int softdep_worklist_busy; /* -1 => flushfiles in progress */
401 static int softdep_worklist_req; /* serialized waiters */
402 static int max_softdeps;	/* maximum number of structs before slowdown */
403 static int tickdelay = 2;	/* number of ticks to pause during slowdown */
404 static int *stat_countp;	/* statistic to count in proc_waiting timeout */
405 static int proc_waiting;	/* tracks whether we have a timeout posted */
406 static struct thread *filesys_syncer; /* proc of filesystem syncer process */
407 static int req_clear_inodedeps;	/* ask syncer to flush some inodedeps */
408 #define FLUSH_INODES	1
409 static int req_clear_remove;	/* ask syncer to flush some freeblks */
410 #define FLUSH_REMOVE	2
411 /*
412  * runtime statistics
413  */
414 static int stat_worklist_push;	/* number of worklist cleanups */
415 static int stat_blk_limit_push;	/* number of times block limit neared */
416 static int stat_ino_limit_push;	/* number of times inode limit neared */
417 static int stat_blk_limit_hit;	/* number of times block slowdown imposed */
418 static int stat_ino_limit_hit;	/* number of times inode slowdown imposed */
419 static int stat_sync_limit_hit;	/* number of synchronous slowdowns imposed */
420 static int stat_indir_blk_ptrs;	/* bufs redirtied as indir ptrs not written */
421 static int stat_inode_bitmap;	/* bufs redirtied as inode bitmap not written */
422 static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
423 static int stat_dir_entry;	/* bufs redirtied as dir entry cannot write */
424 #ifdef DEBUG
425 #include <vm/vm.h>
426 #include <sys/sysctl.h>
427 SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0,
428     "Maximum soft dependencies before slowdown occurs");
429 SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0,
430     "Ticks to delay before allocating during slowdown");
431 SYSCTL_INT(_debug, OID_AUTO, worklist_push, CTLFLAG_RW, &stat_worklist_push, 0,
432     "Number of worklist cleanups");
433 SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,
434     "Number of times block limit neared");
435 SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0,
436     "Number of times inode limit neared");
437 SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0,
438     "Number of times block slowdown imposed");
439 SYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0,
440     "Number of times inode slowdown imposed");
441 SYSCTL_INT(_debug, OID_AUTO, sync_limit_hit, CTLFLAG_RW, &stat_sync_limit_hit, 0,
442     "Number of synchronous slowdowns imposed");
443 SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0,
444     "Bufs redirtied as indir ptrs not written");
445 SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0,
446     "Bufs redirtied as inode bitmap not written");
447 SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0,
448     "Bufs redirtied as direct ptrs not written");
449 SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0,
450     "Bufs redirtied as dir entry cannot write");
451 #endif /* DEBUG */
452 
453 /*
454  * Add an item to the end of the work queue.
455  * This routine requires that the lock be held.
456  * This is the only routine that adds items to the list.
457  * The following routine is the only one that removes items
458  * and does so in order from first to last.
459  */
460 static void
461 add_to_worklist(struct worklist *wk)
462 {
463 	static struct worklist *worklist_tail;
464 
465 	if (wk->wk_state & ONWORKLIST) {
466 		panic("add_to_worklist: already on list");
467 	}
468 	wk->wk_state |= ONWORKLIST;
469 	if (LIST_FIRST(&softdep_workitem_pending) == NULL)
470 		LIST_INSERT_HEAD(&softdep_workitem_pending, wk, wk_list);
471 	else
472 		LIST_INSERT_AFTER(worklist_tail, wk, wk_list);
473 	worklist_tail = wk;
474 	num_on_worklist += 1;
475 }
476 
477 /*
478  * Process that runs once per second to handle items in the background queue.
479  *
480  * Note that we ensure that items are processed in the order in which they
481  * appear in the queue. The code below depends on this property to ensure
482  * that blocks of a file are freed before the inode itself is freed. This
483  * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
484  * until all the old ones have been purged from the dependency lists.
485  *
486  * bioops callback - hold io_token
487  */
488 static int
489 softdep_process_worklist(struct mount *matchmnt)
490 {
491 	thread_t td = curthread;
492 	int matchcnt, loopcount;
493 	int starttime;
494 
495 	ACQUIRE_LOCK(&lk);
496 
497 	/*
498 	 * Record the process identifier of our caller so that we can give
499 	 * this process preferential treatment in request_cleanup below.
500 	 */
501 	filesys_syncer = td;
502 	matchcnt = 0;
503 
504 	/*
505 	 * There is no danger of having multiple processes run this
506 	 * code, but we have to single-thread it when softdep_flushfiles()
507 	 * is in operation to get an accurate count of the number of items
508 	 * related to its mount point that are in the list.
509 	 */
510 	if (matchmnt == NULL) {
511 		if (softdep_worklist_busy < 0) {
512 			matchcnt = -1;
513 			goto done;
514 		}
515 		softdep_worklist_busy += 1;
516 	}
517 
518 	/*
519 	 * If requested, try removing inode or removal dependencies.
520 	 */
521 	if (req_clear_inodedeps) {
522 		clear_inodedeps(td);
523 		req_clear_inodedeps -= 1;
524 		wakeup_one(&proc_waiting);
525 	}
526 	if (req_clear_remove) {
527 		clear_remove(td);
528 		req_clear_remove -= 1;
529 		wakeup_one(&proc_waiting);
530 	}
531 	loopcount = 1;
532 	starttime = ticks;
533 	while (num_on_worklist > 0) {
534 		matchcnt += process_worklist_item(matchmnt, 0);
535 
536 		/*
537 		 * If a umount operation wants to run the worklist
538 		 * accurately, abort.
539 		 */
540 		if (softdep_worklist_req && matchmnt == NULL) {
541 			matchcnt = -1;
542 			break;
543 		}
544 
545 		/*
546 		 * If requested, try removing inode or removal dependencies.
547 		 */
548 		if (req_clear_inodedeps) {
549 			clear_inodedeps(td);
550 			req_clear_inodedeps -= 1;
551 			wakeup_one(&proc_waiting);
552 		}
553 		if (req_clear_remove) {
554 			clear_remove(td);
555 			req_clear_remove -= 1;
556 			wakeup_one(&proc_waiting);
557 		}
558 		/*
559 		 * We do not generally want to stop for buffer space, but if
560 		 * we are really being a buffer hog, we will stop and wait.
561 		 */
562 		if (loopcount++ % 128 == 0) {
563 			FREE_LOCK(&lk);
564 			bwillinode(1);
565 			ACQUIRE_LOCK(&lk);
566 		}
567 
568 		/*
569 		 * Never allow processing to run for more than one
570 		 * second. Otherwise the other syncer tasks may get
571 		 * excessively backlogged.
572 		 *
573 		 * Use ticks to avoid boundary condition w/time_second or
574 		 * time_uptime.
575 		 */
576 		if ((ticks - starttime) > hz && matchmnt == NULL) {
577 			matchcnt = -1;
578 			break;
579 		}
580 	}
581 	if (matchmnt == NULL) {
582 		--softdep_worklist_busy;
583 		if (softdep_worklist_req && softdep_worklist_busy == 0)
584 			wakeup(&softdep_worklist_req);
585 	}
586 done:
587 	FREE_LOCK(&lk);
588 	return (matchcnt);
589 }
590 
591 /*
592  * Process one item on the worklist.
593  */
594 static int
595 process_worklist_item(struct mount *matchmnt, int flags)
596 {
597 	struct ufsmount *ump;
598 	struct worklist *wk;
599 	struct dirrem *dirrem;
600 	struct fs *matchfs;
601 	struct vnode *vp;
602 	int matchcnt = 0;
603 
604 	KKASSERT(lock_held(&lk));
605 
606 	matchfs = NULL;
607 	if (matchmnt != NULL)
608 		matchfs = VFSTOUFS(matchmnt)->um_fs;
609 
610 	/*
611 	 * Normally we just process each item on the worklist in order.
612 	 * However, if we are in a situation where we cannot lock any
613 	 * inodes, we have to skip over any dirrem requests whose
614 	 * vnodes are resident and locked.
615 	 */
616 	LIST_FOREACH(wk, &softdep_workitem_pending, wk_list) {
617 		if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM)
618 			break;
619 		dirrem = WK_DIRREM(wk);
620 		ump = VFSTOUFS(dirrem->dm_mnt);
621 		lwkt_gettoken(&ump->um_mountp->mnt_token);
622 		vp = ufs_ihashlookup(ump, ump->um_dev, dirrem->dm_oldinum);
623 		lwkt_reltoken(&ump->um_mountp->mnt_token);
624 		if (vp == NULL || !vn_islocked(vp))
625 			break;
626 	}
627 	if (wk == NULL) {
628 		return (0);
629 	}
630 	WORKLIST_REMOVE(wk);
631 	num_on_worklist -= 1;
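	/*
	 * The handlers below may block doing I/O and may acquire vnode or
	 * buffer locks, so the softdep lock is dropped while the item is
	 * processed and re-acquired before returning.
	 */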
632 	FREE_LOCK(&lk);
633 	switch (wk->wk_type) {
634 	case D_DIRREM:
635 		/* removal of a directory entry */
636 		if (WK_DIRREM(wk)->dm_mnt == matchmnt)
637 			matchcnt += 1;
638 		handle_workitem_remove(WK_DIRREM(wk));
639 		break;
640 
641 	case D_FREEBLKS:
642 		/* releasing blocks and/or fragments from a file */
643 		if (WK_FREEBLKS(wk)->fb_fs == matchfs)
644 			matchcnt += 1;
645 		handle_workitem_freeblocks(WK_FREEBLKS(wk));
646 		break;
647 
648 	case D_FREEFRAG:
649 		/* releasing a fragment when replaced as a file grows */
650 		if (WK_FREEFRAG(wk)->ff_fs == matchfs)
651 			matchcnt += 1;
652 		handle_workitem_freefrag(WK_FREEFRAG(wk));
653 		break;
654 
655 	case D_FREEFILE:
656 		/* releasing an inode when its link count drops to 0 */
657 		if (WK_FREEFILE(wk)->fx_fs == matchfs)
658 			matchcnt += 1;
659 		handle_workitem_freefile(WK_FREEFILE(wk));
660 		break;
661 
662 	default:
663 		panic("%s_process_worklist: Unknown type %s",
664 		    "softdep", TYPENAME(wk->wk_type));
665 		/* NOTREACHED */
666 	}
667 	ACQUIRE_LOCK(&lk);
668 	return (matchcnt);
669 }
670 
671 /*
672  * Move dependencies from one buffer to another.
673  *
674  * bioops callback - hold io_token
675  */
676 static void
677 softdep_move_dependencies(struct buf *oldbp, struct buf *newbp)
678 {
679 	struct worklist *wk, *wktail;
680 
681 	if (LIST_FIRST(&newbp->b_dep) != NULL)
682 		panic("softdep_move_dependencies: need merge code");
683 	wktail = NULL;
684 	ACQUIRE_LOCK(&lk);
685 	while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
686 		LIST_REMOVE(wk, wk_list);
687 		if (wktail == NULL)
688 			LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
689 		else
690 			LIST_INSERT_AFTER(wktail, wk, wk_list);
691 		wktail = wk;
692 		newbp->b_ops = &softdep_bioops;
693 	}
694 	FREE_LOCK(&lk);
695 }
696 
697 /*
698  * Purge the work list of all items associated with a particular mount point.
699  */
700 int
701 softdep_flushfiles(struct mount *oldmnt, int flags)
702 {
703 	struct vnode *devvp;
704 	int error, loopcnt;
705 
706 	/*
707 	 * Await our turn to clear out the queue, then serialize access.
708 	 */
709 	ACQUIRE_LOCK(&lk);
710 	while (softdep_worklist_busy != 0) {
711 		softdep_worklist_req += 1;
712 		lksleep(&softdep_worklist_req, &lk, 0, "softflush", 0);
713 		softdep_worklist_req -= 1;
714 	}
715 	softdep_worklist_busy = -1;
716 	FREE_LOCK(&lk);
717 
718 	if ((error = ffs_flushfiles(oldmnt, flags)) != 0) {
719 		softdep_worklist_busy = 0;
720 		if (softdep_worklist_req)
721 			wakeup(&softdep_worklist_req);
722 		return (error);
723 	}
724 	/*
725 	 * Alternately flush the block device associated with the mount
726 	 * point and process any dependencies that the flushing
727 	 * creates. In theory, this loop can happen at most twice,
728 	 * but we give it a few extra just to be sure.
729 	 */
730 	devvp = VFSTOUFS(oldmnt)->um_devvp;
731 	for (loopcnt = 10; loopcnt > 0; ) {
732 		if (softdep_process_worklist(oldmnt) == 0) {
733 			loopcnt--;
734 			/*
735 			 * Do another flush in case any vnodes were brought in
736 			 * as part of the cleanup operations.
737 			 */
738 			if ((error = ffs_flushfiles(oldmnt, flags)) != 0)
739 				break;
740 			/*
741 			 * If we still found nothing to do, we are really done.
742 			 */
743 			if (softdep_process_worklist(oldmnt) == 0)
744 				break;
745 		}
746 		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
747 		error = VOP_FSYNC(devvp, MNT_WAIT, 0);
748 		vn_unlock(devvp);
749 		if (error)
750 			break;
751 	}
752 	ACQUIRE_LOCK(&lk);
753 	softdep_worklist_busy = 0;
754 	if (softdep_worklist_req)
755 		wakeup(&softdep_worklist_req);
756 	FREE_LOCK(&lk);
757 
758 	/*
759 	 * If we are unmounting then it is an error to fail. If we
760 	 * are simply trying to downgrade to read-only, then filesystem
761 	 * activity can keep us busy forever, so we just fail with EBUSY.
762 	 */
763 	if (loopcnt == 0) {
764 		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
765 			panic("softdep_flushfiles: looping");
766 		error = EBUSY;
767 	}
768 	return (error);
769 }
770 
771 /*
772  * Structure hashing.
773  *
774  * There are three types of structures that can be looked up:
775  *	1) pagedep structures identified by mount point, inode number,
776  *	   and logical block.
777  *	2) inodedep structures identified by mount point and inode number.
778  *	3) newblk structures identified by mount point and
779  *	   physical block number.
780  *
781  * The "pagedep" and "inodedep" dependency structures are hashed
782  * separately from the file blocks and inodes to which they correspond.
783  * This separation helps when the in-memory copy of an inode or
784  * file block must be replaced. It also obviates the need to access
785  * an inode or file page when simply updating (or de-allocating)
786  * dependency structures. Lookup of newblk structures is needed to
787  * find newly allocated blocks when trying to associate them with
788  * their allocdirect or allocindir structure.
789  *
790  * The lookup routines optionally create and hash a new instance when
791  * an existing entry is not found.
792  */
793 #define DEPALLOC	0x0001	/* allocate structure if lookup fails */
794 #define NODELAY		0x0002	/* cannot do background work */
795 
796 /*
797  * Structures and routines associated with pagedep caching.
798  */
799 LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
800 u_long	pagedep_hash;		/* size of hash table - 1 */
801 #define	PAGEDEP_HASH(mp, inum, lbn) \
802 	(&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
803 	    pagedep_hash])
804 static struct sema pagedep_in_progress;
805 
806 /*
807  * Helper routine for pagedep_lookup()
808  */
809 static __inline
810 struct pagedep *
811 pagedep_find(struct pagedep_hashhead *pagedephd, ino_t ino, ufs_lbn_t lbn,
812 	     struct mount *mp)
813 {
814 	struct pagedep *pagedep;
815 
816 	LIST_FOREACH(pagedep, pagedephd, pd_hash) {
817 		if (ino == pagedep->pd_ino &&
818 		    lbn == pagedep->pd_lbn &&
819 		    mp == pagedep->pd_mnt) {
820 			return (pagedep);
821 		}
822 	}
823 	return(NULL);
824 }
825 
826 /*
827  * Look up a pagedep. Return 1 if found, 0 if not found.
828  * If not found, allocate if DEPALLOC flag is passed.
829  * Found or allocated entry is returned in pagedeppp.
830  * This routine must be called with splbio interrupts blocked.
831  */
832 static int
833 pagedep_lookup(struct inode *ip, ufs_lbn_t lbn, int flags,
834 	       struct pagedep **pagedeppp)
835 {
836 	struct pagedep *pagedep;
837 	struct pagedep_hashhead *pagedephd;
838 	struct mount *mp;
839 	int i;
840 
841 	KKASSERT(lock_held(&lk));
842 
843 	mp = ITOV(ip)->v_mount;
844 	pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn);
845 top:
846 	*pagedeppp = pagedep_find(pagedephd, ip->i_number, lbn, mp);
847 	if (*pagedeppp)
848 		return(1);
849 	if ((flags & DEPALLOC) == 0)
850 		return (0);
851 	if (sema_get(&pagedep_in_progress, &lk) == 0)
852 		goto top;
853 
854 	FREE_LOCK(&lk);
855 	pagedep = kmalloc(sizeof(struct pagedep), M_PAGEDEP,
856 			  M_SOFTDEP_FLAGS | M_ZERO);
857 	ACQUIRE_LOCK(&lk);
858 	if (pagedep_find(pagedephd, ip->i_number, lbn, mp)) {
859 		kprintf("pagedep_lookup: blocking race avoided\n");
860 		sema_release(&pagedep_in_progress, &lk);
861 		kfree(pagedep, M_PAGEDEP);
862 		goto top;
863 	}
864 
865 	pagedep->pd_list.wk_type = D_PAGEDEP;
866 	pagedep->pd_mnt = mp;
867 	pagedep->pd_ino = ip->i_number;
868 	pagedep->pd_lbn = lbn;
869 	LIST_INIT(&pagedep->pd_dirremhd);
870 	LIST_INIT(&pagedep->pd_pendinghd);
871 	for (i = 0; i < DAHASHSZ; i++)
872 		LIST_INIT(&pagedep->pd_diraddhd[i]);
873 	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
874 	sema_release(&pagedep_in_progress, &lk);
875 	*pagedeppp = pagedep;
876 	return (0);
877 }
878 
879 /*
880  * Structures and routines associated with inodedep caching.
881  */
882 LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
883 static u_long	inodedep_hash;	/* size of hash table - 1 */
884 static long	num_inodedep;	/* number of inodedep allocated */
885 #define	INODEDEP_HASH(fs, inum) \
886       (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
887 static struct sema inodedep_in_progress;
888 
889 /*
890  * Helper routine for inodedep_lookup()
891  */
892 static __inline
893 struct inodedep *
894 inodedep_find(struct inodedep_hashhead *inodedephd, struct fs *fs, ino_t inum)
895 {
896 	struct inodedep *inodedep;
897 
898 	LIST_FOREACH(inodedep, inodedephd, id_hash) {
899 		if (inum == inodedep->id_ino && fs == inodedep->id_fs)
900 			return(inodedep);
901 	}
902 	return (NULL);
903 }
904 
905 /*
906  * Look up an inodedep. Return 1 if found, 0 if not found.
907  * If not found, allocate if DEPALLOC flag is passed.
908  * Found or allocated entry is returned in inodedeppp.
909  * This routine must be called with splbio interrupts blocked.
910  */
911 static int
912 inodedep_lookup(struct fs *fs, ino_t inum, int flags,
913 	        struct inodedep **inodedeppp)
914 {
915 	struct inodedep *inodedep;
916 	struct inodedep_hashhead *inodedephd;
917 
918 	KKASSERT(lock_held(&lk));
919 
920 	inodedephd = INODEDEP_HASH(fs, inum);
921 top:
922 	*inodedeppp = inodedep_find(inodedephd, fs, inum);
923 	if (*inodedeppp)
924 		return (1);
925 	if ((flags & DEPALLOC) == 0)
926 		return (0);
927 
928 	/*
929 	 * If we are over our limit, try to improve the situation.
930 	 */
931 	if (num_inodedep > max_softdeps / 2)
932 		speedup_syncer(NULL);
933 	if (num_inodedep > max_softdeps &&
934 	    (flags & NODELAY) == 0 &&
935 	    request_cleanup(FLUSH_INODES)) {
936 		goto top;
937 	}
938 	if (sema_get(&inodedep_in_progress, &lk) == 0)
939 		goto top;
940 
941 	FREE_LOCK(&lk);
942 	inodedep = kmalloc(sizeof(struct inodedep), M_INODEDEP,
943 			   M_SOFTDEP_FLAGS | M_ZERO);
944 	ACQUIRE_LOCK(&lk);
945 	if (inodedep_find(inodedephd, fs, inum)) {
946 		kprintf("inodedep_lookup: blocking race avoided\n");
947 		sema_release(&inodedep_in_progress, &lk);
948 		kfree(inodedep, M_INODEDEP);
949 		goto top;
950 	}
951 	inodedep->id_list.wk_type = D_INODEDEP;
952 	inodedep->id_fs = fs;
953 	inodedep->id_ino = inum;
954 	inodedep->id_state = ALLCOMPLETE;
955 	inodedep->id_nlinkdelta = 0;
956 	inodedep->id_savedino = NULL;
957 	inodedep->id_savedsize = -1;
958 	inodedep->id_buf = NULL;
959 	LIST_INIT(&inodedep->id_pendinghd);
960 	LIST_INIT(&inodedep->id_inowait);
961 	LIST_INIT(&inodedep->id_bufwait);
962 	TAILQ_INIT(&inodedep->id_inoupdt);
963 	TAILQ_INIT(&inodedep->id_newinoupdt);
964 	num_inodedep += 1;
965 	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
966 	sema_release(&inodedep_in_progress, &lk);
967 	*inodedeppp = inodedep;
968 	return (0);
969 }
970 
971 /*
972  * Structures and routines associated with newblk caching.
973  */
974 LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
975 u_long	newblk_hash;		/* size of hash table - 1 */
976 #define	NEWBLK_HASH(fs, inum) \
977 	(&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
978 static struct sema newblk_in_progress;
979 
980 /*
981  * Helper routine for newblk_lookup()
982  */
983 static __inline
984 struct newblk *
985 newblk_find(struct newblk_hashhead *newblkhd, struct fs *fs,
986 	    ufs_daddr_t newblkno)
987 {
988 	struct newblk *newblk;
989 
990 	LIST_FOREACH(newblk, newblkhd, nb_hash) {
991 		if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs)
992 			return (newblk);
993 	}
994 	return(NULL);
995 }
996 
997 /*
998  * Look up a newblk. Return 1 if found, 0 if not found.
999  * If not found, allocate if DEPALLOC flag is passed.
1000  * Found or allocated entry is returned in newblkpp.
1001  */
1002 static int
1003 newblk_lookup(struct fs *fs, ufs_daddr_t newblkno, int flags,
1004 	      struct newblk **newblkpp)
1005 {
1006 	struct newblk *newblk;
1007 	struct newblk_hashhead *newblkhd;
1008 
1009 	newblkhd = NEWBLK_HASH(fs, newblkno);
1010 top:
1011 	*newblkpp = newblk_find(newblkhd, fs, newblkno);
1012 	if (*newblkpp)
1013 		return(1);
1014 	if ((flags & DEPALLOC) == 0)
1015 		return (0);
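	/*
	 * Unlike pagedep_lookup() and inodedep_lookup(), this routine runs
	 * without the global softdep lock held, so the semaphore is passed
	 * a NULL interlock and protects itself with its own spinlock.
	 */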
1016 	if (sema_get(&newblk_in_progress, NULL) == 0)
1017 		goto top;
1018 
1019 	newblk = kmalloc(sizeof(struct newblk), M_NEWBLK,
1020 			 M_SOFTDEP_FLAGS | M_ZERO);
1021 
1022 	if (newblk_find(newblkhd, fs, newblkno)) {
1023 		kprintf("newblk_lookup: blocking race avoided\n");
1024 		sema_release(&newblk_in_progress, NULL);
1025 		kfree(newblk, M_NEWBLK);
1026 		goto top;
1027 	}
1028 	newblk->nb_state = 0;
1029 	newblk->nb_fs = fs;
1030 	newblk->nb_newblkno = newblkno;
1031 	LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
1032 	sema_release(&newblk_in_progress, NULL);
1033 	*newblkpp = newblk;
1034 	return (0);
1035 }
1036 
1037 /*
1038  * Executed during filesystem subsystem initialization, before
1039  * mounting any filesystems.
1040  */
1041 void
1042 softdep_initialize(void)
1043 {
1044 	size_t idsize = sizeof(struct inodedep);
1045 	int hsize = vfs_inodehashsize();
1046 
1047 	LIST_INIT(&mkdirlisthd);
1048 	LIST_INIT(&softdep_workitem_pending);
1049 	max_softdeps = min(maxvnodes * 8, M_INODEDEP->ks_limit / (2 * idsize));
1050 
1051 	/*
1052 	 * Cap it at 100,000, having more just gets kinda silly.
1053 	 */
1054 	max_softdeps = min(max_softdeps, 100000);
1055 
1056 	pagedep_hashtbl = hashinit(hsize / 4, M_PAGEDEP, &pagedep_hash);
1057 	lockinit(&lk, "ffs_softdep", 0, LK_CANRECURSE);
1058 	sema_init(&pagedep_in_progress, "pagedep", 0);
1059 	inodedep_hashtbl = hashinit(hsize, M_INODEDEP, &inodedep_hash);
1060 	sema_init(&inodedep_in_progress, "inodedep", 0);
1061 	newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash);
1062 	sema_init(&newblk_in_progress, "newblk", 0);
1063 	add_bio_ops(&softdep_bioops);
1064 }
1065 
1066 /*
1067  * Called at mount time to notify the dependency code that a
1068  * filesystem wishes to use it.
1069  */
1070 int
1071 softdep_mount(struct vnode *devvp, struct mount *mp, struct fs *fs)
1072 {
1073 	struct csum cstotal;
1074 	struct cg *cgp;
1075 	struct buf *bp;
1076 	int error, cyl;
1077 
1078 	mp->mnt_flag &= ~MNT_ASYNC;
1079 	mp->mnt_flag |= MNT_SOFTDEP;
1080 	mp->mnt_bioops = &softdep_bioops;
1081 	/*
1082 	 * When doing soft updates, the counters in the
1083 	 * superblock may have gotten out of sync, so we have
1084 	 * to scan the cylinder groups and recalculate them.
1085 	 */
1086 	if (fs->fs_clean != 0)
1087 		return (0);
1088 	bzero(&cstotal, sizeof cstotal);
1089 	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
1090 		if ((error = bread(devvp, fsbtodoff(fs, cgtod(fs, cyl)),
1091 				   fs->fs_cgsize, &bp)) != 0) {
1092 			brelse(bp);
1093 			return (error);
1094 		}
1095 		cgp = (struct cg *)bp->b_data;
1096 		cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
1097 		cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
1098 		cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
1099 		cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
1100 		fs->fs_cs(fs, cyl) = cgp->cg_cs;
1101 		brelse(bp);
1102 	}
1103 #ifdef DEBUG
1104 	if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
1105 		kprintf("ffs_mountfs: superblock updated for soft updates\n");
1106 #endif
1107 	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
1108 	return (0);
1109 }
1110 
1111 /*
1112  * Protecting the freemaps (or bitmaps).
1113  *
1114  * To eliminate the need to execute fsck before mounting a filesystem
1115  * after a power failure, one must (conservatively) guarantee that the
1116  * on-disk copy of the bitmaps never indicate that a live inode or block is
1117  * free.  So, when a block or inode is allocated, the bitmap should be
1118  * updated (on disk) before any new pointers.  When a block or inode is
1119  * freed, the bitmap should not be updated until all pointers have been
1120  * reset.  The latter dependency is handled by the delayed de-allocation
1121  * approach described below for block and inode de-allocation.  The former
1122  * dependency is handled by calling the following procedure when a block or
1123  * inode is allocated. When an inode is allocated an "inodedep" is created
1124  * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
1125  * Each "inodedep" is also inserted into the hash indexing structure so
1126  * that any additional link additions can be made dependent on the inode
1127  * allocation.
1128  *
1129  * The ufs filesystem maintains a number of free block counts (e.g., per
1130  * cylinder group, per cylinder and per <cylinder, rotational position> pair)
1131  * in addition to the bitmaps.  These counts are used to improve efficiency
1132  * during allocation and therefore must be consistent with the bitmaps.
1133  * There is no convenient way to guarantee post-crash consistency of these
1134  * counts with simple update ordering, for two main reasons: (1) The counts
1135  * and bitmaps for a single cylinder group block are not in the same disk
1136  * sector.  If a disk write is interrupted (e.g., by power failure), one may
1137  * be written and the other not.  (2) Some of the counts are located in the
1138  * superblock rather than the cylinder group block. So, we focus our soft
1139  * updates implementation on protecting the bitmaps. When mounting a
1140  * filesystem, we recompute the auxiliary counts from the bitmaps.
1141  */
1142 
1143 /*
1144  * Called just after updating the cylinder group block to allocate an inode.
1145  *
1146  * Parameters:
1147  *	bp:		buffer for cylgroup block with inode map
1148  *	ip:		inode related to allocation
1149  *	newinum:	new inode number being allocated
1150  */
1151 void
1152 softdep_setup_inomapdep(struct buf *bp, struct inode *ip, ino_t newinum)
1153 {
1154 	struct inodedep *inodedep;
1155 	struct bmsafemap *bmsafemap;
1156 
1157 	/*
1158 	 * Create a dependency for the newly allocated inode.
1159 	 * Panic if it already exists as something is seriously wrong.
1160 	 * Otherwise add it to the dependency list for the buffer holding
1161 	 * the cylinder group map from which it was allocated.
1162 	 */
1163 	ACQUIRE_LOCK(&lk);
1164 	if ((inodedep_lookup(ip->i_fs, newinum, DEPALLOC|NODELAY, &inodedep))) {
1165 		panic("softdep_setup_inomapdep: found inode");
1166 	}
1167 	inodedep->id_buf = bp;
1168 	inodedep->id_state &= ~DEPCOMPLETE;
1169 	bmsafemap = bmsafemap_lookup(bp);
1170 	LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
1171 	FREE_LOCK(&lk);
1172 }
1173 
1174 /*
1175  * Called just after updating the cylinder group block to
1176  * allocate block or fragment.
1177  *
1178  * Parameters:
1179  *	bp:		buffer for cylgroup block with block map
1180  *	fs:		filesystem doing allocation
1181  *	newblkno:	number of newly allocated block
1182  */
1183 void
1184 softdep_setup_blkmapdep(struct buf *bp, struct fs *fs,
1185 			ufs_daddr_t newblkno)
1186 {
1187 	struct newblk *newblk;
1188 	struct bmsafemap *bmsafemap;
1189 
1190 	/*
1191 	 * Create a dependency for the newly allocated block.
1192 	 * Add it to the dependency list for the buffer holding
1193 	 * the cylinder group map from which it was allocated.
1194 	 */
1195 	if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
1196 		panic("softdep_setup_blkmapdep: found block");
1197 	ACQUIRE_LOCK(&lk);
1198 	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(bp);
1199 	LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
1200 	FREE_LOCK(&lk);
1201 }
1202 
1203 /*
1204  * Find the bmsafemap associated with a cylinder group buffer.
1205  * If none exists, create one. The buffer must be locked when
1206  * this routine is called and this routine must be called with
1207  * splbio interrupts blocked.
1208  */
1209 static struct bmsafemap *
1210 bmsafemap_lookup(struct buf *bp)
1211 {
1212 	struct bmsafemap *bmsafemap;
1213 	struct worklist *wk;
1214 
1215 	KKASSERT(lock_held(&lk));
1216 
1217 	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
1218 		if (wk->wk_type == D_BMSAFEMAP)
1219 			return (WK_BMSAFEMAP(wk));
1220 	}
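	/*
	 * None found.  The buffer is locked by the caller, so no competing
	 * bmsafemap can be attached to bp while the lock is dropped for the
	 * allocation below.
	 */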
1221 	FREE_LOCK(&lk);
1222 	bmsafemap = kmalloc(sizeof(struct bmsafemap), M_BMSAFEMAP,
1223 			    M_SOFTDEP_FLAGS);
1224 	bmsafemap->sm_list.wk_type = D_BMSAFEMAP;
1225 	bmsafemap->sm_list.wk_state = 0;
1226 	bmsafemap->sm_buf = bp;
1227 	LIST_INIT(&bmsafemap->sm_allocdirecthd);
1228 	LIST_INIT(&bmsafemap->sm_allocindirhd);
1229 	LIST_INIT(&bmsafemap->sm_inodedephd);
1230 	LIST_INIT(&bmsafemap->sm_newblkhd);
1231 	ACQUIRE_LOCK(&lk);
1232 	WORKLIST_INSERT_BP(bp, &bmsafemap->sm_list);
1233 	return (bmsafemap);
1234 }
1235 
1236 /*
1237  * Direct block allocation dependencies.
1238  *
1239  * When a new block is allocated, the corresponding disk locations must be
1240  * initialized (with zeros or new data) before the on-disk inode points to
1241  * them.  Also, the freemap from which the block was allocated must be
1242  * updated (on disk) before the inode's pointer. These two dependencies are
1243  * independent of each other and are needed for all file blocks and indirect
1244  * blocks that are pointed to directly by the inode.  Just before the
1245  * "in-core" version of the inode is updated with a newly allocated block
1246  * number, a procedure (below) is called to setup allocation dependency
1247  * structures.  These structures are removed when the corresponding
1248  * dependencies are satisfied or when the block allocation becomes obsolete
1249  * (i.e., the file is deleted, the block is de-allocated, or the block is a
1250  * fragment that gets upgraded).  All of these cases are handled in
1251  * procedures described later.
1252  *
1253  * When a file extension causes a fragment to be upgraded, either to a larger
1254  * fragment or to a full block, the on-disk location may change (if the
1255  * previous fragment could not simply be extended). In this case, the old
1256  * fragment must be de-allocated, but not until after the inode's pointer has
1257  * been updated. In most cases, this is handled by later procedures, which
1258  * will construct a "freefrag" structure to be added to the workitem queue
1259  * when the inode update is complete (or obsolete).  The main exception to
1260  * this is when an allocation occurs while a pending allocation dependency
1261  * (for the same block pointer) remains.  This case is handled in the main
1262  * allocation dependency setup procedure by immediately freeing the
1263  * unreferenced fragments.
1264  *
1265  * Parameters:
1266  *	ip:		inode to which block is being added
1267  *	lbn:		block pointer within inode
1268  *	newblkno:	disk block number being added
1269  *	oldblkno:	previous block number, 0 unless frag
1270  *	newsize:	size of new block
1271  *	oldsize:	size of old block
1272  *	bp:		bp for allocated block
1273  */
1274 void
1275 softdep_setup_allocdirect(struct inode *ip, ufs_lbn_t lbn, ufs_daddr_t newblkno,
1276 			  ufs_daddr_t oldblkno, long newsize, long oldsize,
1277 			  struct buf *bp)
1278 {
1279 	struct allocdirect *adp, *oldadp;
1280 	struct allocdirectlst *adphead;
1281 	struct bmsafemap *bmsafemap;
1282 	struct inodedep *inodedep;
1283 	struct pagedep *pagedep;
1284 	struct newblk *newblk;
1285 
1286 	adp = kmalloc(sizeof(struct allocdirect), M_ALLOCDIRECT,
1287 		      M_SOFTDEP_FLAGS | M_ZERO);
1288 	adp->ad_list.wk_type = D_ALLOCDIRECT;
1289 	adp->ad_lbn = lbn;
1290 	adp->ad_newblkno = newblkno;
1291 	adp->ad_oldblkno = oldblkno;
1292 	adp->ad_newsize = newsize;
1293 	adp->ad_oldsize = oldsize;
1294 	adp->ad_state = ATTACHED;
1295 	if (newblkno == oldblkno)
1296 		adp->ad_freefrag = NULL;
1297 	else
1298 		adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
1299 
1300 	if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
1301 		panic("softdep_setup_allocdirect: lost block");
1302 
1303 	ACQUIRE_LOCK(&lk);
1304 	inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC | NODELAY, &inodedep);
1305 	adp->ad_inodedep = inodedep;
1306 
1307 	if (newblk->nb_state == DEPCOMPLETE) {
1308 		adp->ad_state |= DEPCOMPLETE;
1309 		adp->ad_buf = NULL;
1310 	} else {
1311 		bmsafemap = newblk->nb_bmsafemap;
1312 		adp->ad_buf = bmsafemap->sm_buf;
1313 		LIST_REMOVE(newblk, nb_deps);
1314 		LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
1315 	}
1316 	LIST_REMOVE(newblk, nb_hash);
1317 	kfree(newblk, M_NEWBLK);
1318 
1319 	WORKLIST_INSERT_BP(bp, &adp->ad_list);
1320 	if (lbn >= UFS_NDADDR) {
1321 		/* allocating an indirect block */
1322 		if (oldblkno != 0) {
1323 			panic("softdep_setup_allocdirect: non-zero indir");
1324 		}
1325 	} else {
1326 		/*
1327 		 * Allocating a direct block.
1328 		 *
1329 		 * If we are allocating a directory block, then we must
1330 		 * allocate an associated pagedep to track additions and
1331 		 * deletions.
1332 		 */
1333 		if ((ip->i_mode & IFMT) == IFDIR &&
1334 		    pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0) {
1335 			WORKLIST_INSERT_BP(bp, &pagedep->pd_list);
1336 		}
1337 	}
1338 	/*
1339 	 * The list of allocdirects must be kept in sorted and ascending
1340 	 * order so that the rollback routines can quickly determine the
1341 	 * first uncommitted block (the size of the file stored on disk
1342 	 * ends at the end of the lowest committed fragment, or if there
1343 	 * are no fragments, at the end of the highest committed block).
1344 	 * Since files generally grow, the typical case is that the new
1345 	 * block is to be added at the end of the list. We speed this
1346 	 * special case by checking against the last allocdirect in the
1347 	 * list before laboriously traversing the list looking for the
1348 	 * insertion point.
1349 	 */
1350 	adphead = &inodedep->id_newinoupdt;
1351 	oldadp = TAILQ_LAST(adphead, allocdirectlst);
1352 	if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
1353 		/* insert at end of list */
1354 		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
1355 		if (oldadp != NULL && oldadp->ad_lbn == lbn)
1356 			allocdirect_merge(adphead, adp, oldadp);
1357 		FREE_LOCK(&lk);
1358 		return;
1359 	}
1360 	TAILQ_FOREACH(oldadp, adphead, ad_next) {
1361 		if (oldadp->ad_lbn >= lbn)
1362 			break;
1363 	}
1364 	if (oldadp == NULL) {
1365 		panic("softdep_setup_allocdirect: lost entry");
1366 	}
1367 	/* insert in middle of list */
1368 	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
1369 	if (oldadp->ad_lbn == lbn)
1370 		allocdirect_merge(adphead, adp, oldadp);
1371 	FREE_LOCK(&lk);
1372 }
1373 
1374 /*
1375  * Replace an old allocdirect dependency with a newer one.
1376  * This routine must be called with splbio interrupts blocked.
1377  *
1378  * Parameters:
1379  *	adphead:	head of list holding allocdirects
1380  *	newadp:		allocdirect being added
1381  *	oldadp:		existing allocdirect being checked
1382  */
1383 static void
1384 allocdirect_merge(struct allocdirectlst *adphead,
1385 		  struct allocdirect *newadp,
1386 		  struct allocdirect *oldadp)
1387 {
1388 	struct freefrag *freefrag;
1389 
1390 	KKASSERT(lock_held(&lk));
1391 
1392 	if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
1393 	    newadp->ad_oldsize != oldadp->ad_newsize ||
1394 	    newadp->ad_lbn >= UFS_NDADDR) {
1395 		panic("allocdirect_check: old %d != new %d || lbn %ld >= %d",
1396 		    newadp->ad_oldblkno, oldadp->ad_newblkno, newadp->ad_lbn,
1397 		    UFS_NDADDR);
1398 	}
1399 	newadp->ad_oldblkno = oldadp->ad_oldblkno;
1400 	newadp->ad_oldsize = oldadp->ad_oldsize;
1401 	/*
1402 	 * If the old dependency had a fragment to free or had never
1403 	 * previously had a block allocated, then the new dependency
1404 	 * can immediately post its freefrag and adopt the old freefrag.
1405 	 * This action is done by swapping the freefrag dependencies.
1406 	 * The new dependency gains the old one's freefrag, and the
1407 	 * old one gets the new one and then immediately puts it on
1408 	 * the worklist when it is freed by free_allocdirect. It is
1409 	 * not possible to do this swap when the old dependency had a
1410 	 * non-zero size but no previous fragment to free. This condition
1411 	 * arises when the new block is an extension of the old block.
1412 	 * Here, the first part of the fragment allocated to the new
1413 	 * dependency is part of the block currently claimed on disk by
1414 	 * the old dependency, so cannot legitimately be freed until the
1415 	 * conditions for the new dependency are fulfilled.
1416 	 */
1417 	if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
1418 		freefrag = newadp->ad_freefrag;
1419 		newadp->ad_freefrag = oldadp->ad_freefrag;
1420 		oldadp->ad_freefrag = freefrag;
1421 	}
1422 	free_allocdirect(adphead, oldadp, 0);
1423 }
1424 
1425 /*
1426  * Allocate a new freefrag structure if needed.
1427  */
1428 static struct freefrag *
1429 newfreefrag(struct inode *ip, ufs_daddr_t blkno, long size)
1430 {
1431 	struct freefrag *freefrag;
1432 	struct fs *fs;
1433 
1434 	if (blkno == 0)
1435 		return (NULL);
1436 	fs = ip->i_fs;
1437 	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
1438 		panic("newfreefrag: frag size");
1439 	freefrag = kmalloc(sizeof(struct freefrag), M_FREEFRAG,
1440 			   M_SOFTDEP_FLAGS);
1441 	freefrag->ff_list.wk_type = D_FREEFRAG;
1442 	freefrag->ff_state = ip->i_uid & ~ONWORKLIST;	/* XXX - used below */
1443 	freefrag->ff_inum = ip->i_number;
1444 	freefrag->ff_fs = fs;
1445 	freefrag->ff_devvp = ip->i_devvp;
1446 	freefrag->ff_blkno = blkno;
1447 	freefrag->ff_fragsize = size;
1448 	return (freefrag);
1449 }
1450 
1451 /*
1452  * This workitem de-allocates fragments that were replaced during
1453  * file block allocation.
1454  */
1455 static void
1456 handle_workitem_freefrag(struct freefrag *freefrag)
1457 {
1458 	struct inode tip;
1459 
1460 	tip.i_fs = freefrag->ff_fs;
1461 	tip.i_devvp = freefrag->ff_devvp;
1462 	tip.i_dev = freefrag->ff_devvp->v_rdev;
1463 	tip.i_number = freefrag->ff_inum;
1464 	tip.i_uid = freefrag->ff_state & ~ONWORKLIST;	/* XXX - set above */
1465 	ffs_blkfree(&tip, freefrag->ff_blkno, freefrag->ff_fragsize);
1466 	kfree(freefrag, M_FREEFRAG);
1467 }
1468 
1469 /*
1470  * Indirect block allocation dependencies.
1471  *
1472  * The same dependencies that exist for a direct block also exist when
1473  * a new block is allocated and pointed to by an entry in a block of
1474  * indirect pointers. The undo/redo states described above are also
1475  * used here. Because an indirect block contains many pointers that
1476  * may have dependencies, a second copy of the entire in-memory indirect
1477  * block is kept. The buffer cache copy is always completely up-to-date.
1478  * The second copy, which is used only as a source for disk writes,
1479  * contains only the safe pointers (i.e., those that have no remaining
1480  * update dependencies). The second copy is freed when all pointers
1481  * are safe. The cache is not allowed to replace indirect blocks with
1482  * pending update dependencies. If a buffer containing an indirect
1483  * block with dependencies is written, these routines will mark it
1484  * dirty again. It can only be successfully written once all the
1485  * dependencies are removed. The ffs_fsync routine in conjunction with
1486  * softdep_sync_metadata work together to get all the dependencies
1487  * removed so that a file can be successfully written to disk. Three
1488  * procedures are used when setting up indirect block pointer
1489  * dependencies. The division is necessary because of the organization
1490  * of the "balloc" routine and because of the distinction between file
1491  * pages and file metadata blocks.
1492  */
1493 
1494 /*
1495  * Allocate a new allocindir structure.
1496  *
1497  * Parameters:
1498  *	ip:		inode for file being extended
1499  *	ptrno:		offset of pointer in indirect block
1500  *	newblkno:	disk block number being added
1501  *	oldblkno:	previous block number, 0 if none
1502  */
1503 static struct allocindir *
1504 newallocindir(struct inode *ip, int ptrno, ufs_daddr_t newblkno,
1505 	      ufs_daddr_t oldblkno)
1506 {
1507 	struct allocindir *aip;
1508 
1509 	aip = kmalloc(sizeof(struct allocindir), M_ALLOCINDIR,
1510 		      M_SOFTDEP_FLAGS | M_ZERO);
1511 	aip->ai_list.wk_type = D_ALLOCINDIR;
1512 	aip->ai_state = ATTACHED;
1513 	aip->ai_offset = ptrno;
1514 	aip->ai_newblkno = newblkno;
1515 	aip->ai_oldblkno = oldblkno;
1516 	aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
1517 	return (aip);
1518 }
1519 
1520 /*
1521  * Called just before setting an indirect block pointer
1522  * to a newly allocated file page.
1523  *
1524  * Parameters:
1525  *	ip:		inode for file being extended
1526  *	lbn:		allocated block number within file
1527  *	bp:		buffer with indirect blk referencing page
1528  *	ptrno:		offset of pointer in indirect block
1529  *	newblkno:	disk block number being added
1530  *	oldblkno:	previous block number, 0 if none
1531  *	nbp:		buffer holding allocated page
1532  */
1533 void
1534 softdep_setup_allocindir_page(struct inode *ip, ufs_lbn_t lbn,
1535 			      struct buf *bp, int ptrno,
1536 			      ufs_daddr_t newblkno, ufs_daddr_t oldblkno,
1537 			      struct buf *nbp)
1538 {
1539 	struct allocindir *aip;
1540 	struct pagedep *pagedep;
1541 
1542 	aip = newallocindir(ip, ptrno, newblkno, oldblkno);
1543 	ACQUIRE_LOCK(&lk);
1544 	/*
1545 	 * If we are allocating a directory page, then we must
1546 	 * allocate an associated pagedep to track additions and
1547 	 * deletions.
1548 	 */
1549 	if ((ip->i_mode & IFMT) == IFDIR &&
1550 	    pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
1551 		WORKLIST_INSERT_BP(nbp, &pagedep->pd_list);
1552 	WORKLIST_INSERT_BP(nbp, &aip->ai_list);
1553 	FREE_LOCK(&lk);
1554 	setup_allocindir_phase2(bp, ip, aip);
1555 }
1556 
1557 /*
1558  * Called just before setting an indirect block pointer to a
1559  * newly allocated indirect block.
1560  * Parameters:
1561  *	nbp:		newly allocated indirect block
1562  *	ip:		inode for file being extended
1563  *	bp:		indirect block referencing allocated block
1564  *	ptrno:		offset of pointer in indirect block
1565  *	newblkno:	disk block number being added
1566  */
1567 void
1568 softdep_setup_allocindir_meta(struct buf *nbp, struct inode *ip,
1569 			      struct buf *bp, int ptrno,
1570 			      ufs_daddr_t newblkno)
1571 {
1572 	struct allocindir *aip;
1573 
1574 	aip = newallocindir(ip, ptrno, newblkno, 0);
1575 	ACQUIRE_LOCK(&lk);
1576 	WORKLIST_INSERT_BP(nbp, &aip->ai_list);
1577 	FREE_LOCK(&lk);
1578 	setup_allocindir_phase2(bp, ip, aip);
1579 }
1580 
1581 /*
1582  * Called to finish the allocation of the "aip" allocated
1583  * by one of the two routines above.
1584  *
1585  * Parameters:
1586  *	bp:	in-memory copy of the indirect block
1587  *	ip:	inode for file being extended
1588  *	aip:	allocindir allocated by the above routines
1589  */
1590 static void
1591 setup_allocindir_phase2(struct buf *bp, struct inode *ip,
1592 			struct allocindir *aip)
1593 {
1594 	struct worklist *wk;
1595 	struct indirdep *indirdep, *newindirdep;
1596 	struct bmsafemap *bmsafemap;
1597 	struct allocindir *oldaip;
1598 	struct freefrag *freefrag;
1599 	struct newblk *newblk;
1600 
1601 	if (bp->b_loffset >= 0)
1602 		panic("setup_allocindir_phase2: not indir blk");
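	/*
	 * Find an existing indirdep attached to bp or create a new one.
	 * The allocation is done with the softdep lock dropped, so the loop
	 * re-checks bp's dependency list before attaching the new structure.
	 * A new indirdep keeps a private copy of the indirect block
	 * (ir_savebp) so that only dependency-free pointers reach the disk.
	 */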
1603 	for (indirdep = NULL, newindirdep = NULL; ; ) {
1604 		ACQUIRE_LOCK(&lk);
1605 		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
1606 			if (wk->wk_type != D_INDIRDEP)
1607 				continue;
1608 			indirdep = WK_INDIRDEP(wk);
1609 			break;
1610 		}
1611 		if (indirdep == NULL && newindirdep) {
1612 			indirdep = newindirdep;
1613 			WORKLIST_INSERT_BP(bp, &indirdep->ir_list);
1614 			newindirdep = NULL;
1615 		}
1616 		FREE_LOCK(&lk);
1617 		if (indirdep) {
1618 			if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0,
1619 			    &newblk) == 0)
1620 				panic("setup_allocindir: lost block");
1621 			ACQUIRE_LOCK(&lk);
1622 			if (newblk->nb_state == DEPCOMPLETE) {
1623 				aip->ai_state |= DEPCOMPLETE;
1624 				aip->ai_buf = NULL;
1625 			} else {
1626 				bmsafemap = newblk->nb_bmsafemap;
1627 				aip->ai_buf = bmsafemap->sm_buf;
1628 				LIST_REMOVE(newblk, nb_deps);
1629 				LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd,
1630 				    aip, ai_deps);
1631 			}
1632 			LIST_REMOVE(newblk, nb_hash);
1633 			kfree(newblk, M_NEWBLK);
1634 			aip->ai_indirdep = indirdep;
1635 			/*
1636 			 * Check to see if there is an existing dependency
1637 			 * for this block. If there is, merge the old
1638 			 * dependency into the new one.
1639 			 */
1640 			if (aip->ai_oldblkno == 0)
1641 				oldaip = NULL;
1642 			else
1643 
1644 				LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next)
1645 					if (oldaip->ai_offset == aip->ai_offset)
1646 						break;
1647 			if (oldaip != NULL) {
1648 				if (oldaip->ai_newblkno != aip->ai_oldblkno) {
1649 					panic("setup_allocindir_phase2: blkno");
1650 				}
1651 				aip->ai_oldblkno = oldaip->ai_oldblkno;
1652 				freefrag = oldaip->ai_freefrag;
1653 				oldaip->ai_freefrag = aip->ai_freefrag;
1654 				aip->ai_freefrag = freefrag;
1655 				free_allocindir(oldaip, NULL);
1656 			}
1657 			LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
1658 			((ufs_daddr_t *)indirdep->ir_savebp->b_data)
1659 			    [aip->ai_offset] = aip->ai_oldblkno;
1660 			FREE_LOCK(&lk);
1661 		}
1662 		if (newindirdep) {
1663 			/*
1664 			 * Avoid any possibility of data corruption by
1665 			 * ensuring that our old version is thrown away.
1666 			 */
1667 			newindirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE;
1668 			brelse(newindirdep->ir_savebp);
1669 			WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP);
1670 		}
1671 		if (indirdep)
1672 			break;
1673 		newindirdep = kmalloc(sizeof(struct indirdep), M_INDIRDEP,
1674 				      M_SOFTDEP_FLAGS);
1675 		newindirdep->ir_list.wk_type = D_INDIRDEP;
1676 		newindirdep->ir_state = ATTACHED;
1677 		LIST_INIT(&newindirdep->ir_deplisthd);
1678 		LIST_INIT(&newindirdep->ir_donehd);
1679 		if (bp->b_bio2.bio_offset == NOOFFSET) {
1680 			VOP_BMAP(bp->b_vp, bp->b_bio1.bio_offset,
1681 				 &bp->b_bio2.bio_offset, NULL, NULL,
1682 				 BUF_CMD_WRITE);
1683 		}
1684 		KKASSERT(bp->b_bio2.bio_offset != NOOFFSET);
1685 		newindirdep->ir_savebp = getblk(ip->i_devvp,
1686 						bp->b_bio2.bio_offset,
1687 					        bp->b_bcount, 0, 0);
1688 		BUF_KERNPROC(newindirdep->ir_savebp);
1689 		bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
1690 	}
1691 }
1692 
1693 /*
1694  * Block de-allocation dependencies.
1695  *
1696  * When blocks are de-allocated, the on-disk pointers must be nullified before
1697  * the blocks are made available for use by other files.  (The true
1698  * requirement is that old pointers must be nullified before new on-disk
1699  * pointers are set.  We chose this slightly more stringent requirement to
1700  * reduce complexity.) Our implementation handles this dependency by updating
1701  * the inode (or indirect block) appropriately but delaying the actual block
1702  * de-allocation (i.e., freemap and free space count manipulation) until
1703  * after the updated versions reach stable storage.  After the disk is
1704  * updated, the blocks can be safely de-allocated whenever it is convenient.
1705  * This implementation handles only the common case of reducing a file's
1706  * length to zero. Other cases are handled by the conventional synchronous
1707  * write approach.
1708  *
1709  * The ffs implementation with which we worked double-checks
1710  * the state of the block pointers and file size as it reduces
1711  * a file's length.  Some of this code is replicated here in our
1712  * soft updates implementation.  The freeblks->fb_chkcnt field is
1713  * used to transfer a part of this information to the procedure
1714  * that eventually de-allocates the blocks.
1715  *
1716  * This routine should be called from the routine that shortens
1717  * a file's length, before the inode's size or block pointers
1718  * are modified. It will save the block pointer information for
1719  * later release and zero the inode so that the calling routine
1720  * can release it.
1721  */
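
/*
 * Illustrative sketch added to this listing (not part of the original
 * source): a truncate-to-zero caller might hand the work to soft updates
 * roughly as below.  The helper name example_truncate_to_zero() and the
 * DOINGSOFTDEP() mount test are assumptions for illustration; only
 * softdep_setup_freeblocks() itself is implemented in this file.
 */
#if 0	/* example only -- never compiled */
static int
example_truncate_to_zero(struct vnode *vp)
{
	struct inode *ip = VTOI(vp);

	if (!DOINGSOFTDEP(vp)) {
		/* Conventional synchronous-write path. */
		return (ffs_truncate(vp, (off_t)0, 0, proc0.p_ucred));
	}
	/*
	 * Save the old block pointers and zero the in-core inode; the
	 * saved blocks are released only after the zero'ed inode block
	 * reaches stable storage.
	 */
	softdep_setup_freeblocks(ip, (off_t)0);
	ip->i_flag |= IN_CHANGE;
	return (ffs_update(vp, 0));
}
#endif
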
1722 struct softdep_setup_freeblocks_info {
1723 	struct fs *fs;
1724 	struct inode *ip;
1725 };
1726 
1727 static int softdep_setup_freeblocks_bp(struct buf *bp, void *data);
1728 
1729 /*
1730  * Parameters:
1731  *	ip:	The inode whose length is to be reduced
1732  *	length:	The new length for the file
1733  */
1734 void
1735 softdep_setup_freeblocks(struct inode *ip, off_t length)
1736 {
1737 	struct softdep_setup_freeblocks_info info;
1738 	struct freeblks *freeblks;
1739 	struct inodedep *inodedep;
1740 	struct allocdirect *adp;
1741 	struct vnode *vp;
1742 	struct buf *bp;
1743 	struct fs *fs;
1744 	int i, error, delay;
1745 	int count;
1746 
1747 	fs = ip->i_fs;
1748 	if (length != 0)
1749 		panic("softdep_setup_freeblocks: non-zero length");
1750 	freeblks = kmalloc(sizeof(struct freeblks), M_FREEBLKS,
1751 			   M_SOFTDEP_FLAGS | M_ZERO);
1752 	freeblks->fb_list.wk_type = D_FREEBLKS;
1753 	freeblks->fb_state = ATTACHED;
1754 	freeblks->fb_uid = ip->i_uid;
1755 	freeblks->fb_previousinum = ip->i_number;
1756 	freeblks->fb_devvp = ip->i_devvp;
1757 	freeblks->fb_fs = fs;
1758 	freeblks->fb_oldsize = ip->i_size;
1759 	freeblks->fb_newsize = length;
1760 	freeblks->fb_chkcnt = ip->i_blocks;
1761 	for (i = 0; i < UFS_NDADDR; i++) {
1762 		freeblks->fb_dblks[i] = ip->i_db[i];
1763 		ip->i_db[i] = 0;
1764 	}
1765 	for (i = 0; i < UFS_NIADDR; i++) {
1766 		freeblks->fb_iblks[i] = ip->i_ib[i];
1767 		ip->i_ib[i] = 0;
1768 	}
1769 	ip->i_blocks = 0;
1770 	ip->i_size = 0;
1771 	/*
1772 	 * Push the zero'ed inode to its disk buffer so that we are free
1773 	 * to delete its dependencies below. Once the dependencies are gone
1774 	 * the buffer can be safely released.
1775 	 */
1776 	if ((error = bread(ip->i_devvp,
1777 			    fsbtodoff(fs, ino_to_fsba(fs, ip->i_number)),
1778 	    (int)fs->fs_bsize, &bp)) != 0)
1779 		softdep_error("softdep_setup_freeblocks", error);
1780 	*((struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) =
1781 	    ip->i_din;
1782 	/*
1783 	 * Find and eliminate any inode dependencies.
1784 	 */
1785 	ACQUIRE_LOCK(&lk);
1786 	(void) inodedep_lookup(fs, ip->i_number, DEPALLOC, &inodedep);
1787 	if ((inodedep->id_state & IOSTARTED) != 0) {
1788 		panic("softdep_setup_freeblocks: inode busy");
1789 	}
1790 	/*
1791 	 * Add the freeblks structure to the list of operations that
1792 	 * must await the zero'ed inode being written to disk. If we
1793 	 * still have a bitmap dependency (delay == 0), then the inode
1794 	 * has never been written to disk, so we can process the
1795 	 * freeblks below once we have deleted the dependencies.
1796 	 */
1797 	delay = (inodedep->id_state & DEPCOMPLETE);
1798 	if (delay)
1799 		WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
1800 	/*
1801 	 * Because the file length has been truncated to zero, any
1802 	 * pending block allocation dependency structures associated
1803 	 * with this inode are obsolete and can simply be de-allocated.
1804 	 * We must first merge the two dependency lists to get rid of
1805 	 * any duplicate freefrag structures, then purge the merged list.
1806 	 */
1807 	merge_inode_lists(inodedep);
1808 	while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
1809 		free_allocdirect(&inodedep->id_inoupdt, adp, 1);
1810 	FREE_LOCK(&lk);
1811 	bdwrite(bp);
1812 	/*
1813 	 * We must wait for any I/O in progress to finish so that
1814 	 * all potential buffers on the dirty list will be visible.
1815 	 * Once they are all there, walk the list and get rid of
1816 	 * any dependencies.
1817 	 */
1818 	vp = ITOV(ip);
1819 	ACQUIRE_LOCK(&lk);
1820 	drain_output(vp, 1);
1821 
1822 	info.fs = fs;
1823 	info.ip = ip;
1824 	lwkt_gettoken(&vp->v_token);
1825 	do {
1826 		count = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL,
1827 				softdep_setup_freeblocks_bp, &info);
1828 	} while (count != 0);
1829 	lwkt_reltoken(&vp->v_token);
1830 
1831 	if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) != 0)
1832 		(void)free_inodedep(inodedep);
1833 
1834 	if (delay) {
1835 		freeblks->fb_state |= DEPCOMPLETE;
1836 		/*
1837 		 * If the inode with zeroed block pointers is now on disk
1838 		 * we can start freeing blocks. Add freeblks to the worklist
1839 		 * instead of calling handle_workitem_freeblocks directly, as
1840 		 * it is more likely that additional I/O is needed to complete
1841 		 * the request here than in the !delay case.
1842 		 */
1843 		if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
1844 			add_to_worklist(&freeblks->fb_list);
1845 	}
1846 
1847 	FREE_LOCK(&lk);
1848 	/*
1849 	 * If the inode has never been written to disk (delay == 0),
1850 	 * then we can process the freeblks now that we have deleted
1851 	 * the dependencies.
1852 	 */
1853 	if (!delay)
1854 		handle_workitem_freeblocks(freeblks);
1855 }
1856 
1857 static int
1858 softdep_setup_freeblocks_bp(struct buf *bp, void *data)
1859 {
1860 	struct softdep_setup_freeblocks_info *info = data;
1861 	struct inodedep *inodedep;
1862 
1863 	if (getdirtybuf(&bp, MNT_WAIT) == 0) {
1864 		kprintf("softdep_setup_freeblocks_bp(1): caught bp %p going away\n", bp);
1865 		return(-1);
1866 	}
1867 	if (bp->b_vp != ITOV(info->ip) || (bp->b_flags & B_DELWRI) == 0) {
1868 		kprintf("softdep_setup_freeblocks_bp(2): caught bp %p going away\n", bp);
1869 		BUF_UNLOCK(bp);
1870 		return(-1);
1871 	}
1872 	(void) inodedep_lookup(info->fs, info->ip->i_number, 0, &inodedep);
1873 	deallocate_dependencies(bp, inodedep);
1874 	bp->b_flags |= B_INVAL | B_NOCACHE;
1875 	FREE_LOCK(&lk);
1876 	brelse(bp);
1877 	ACQUIRE_LOCK(&lk);
1878 	return(1);
1879 }
1880 
1881 /*
1882  * Reclaim any dependency structures from a buffer that is about to
1883  * be reallocated to a new vnode. The buffer must be locked, thus,
1884  * no I/O completion operations can occur while we are manipulating
1885  * its associated dependencies. The mutex is held so that other I/O's
1886  * associated with related dependencies do not occur.
1887  */
1888 static void
1889 deallocate_dependencies(struct buf *bp, struct inodedep *inodedep)
1890 {
1891 	struct worklist *wk;
1892 	struct indirdep *indirdep;
1893 	struct allocindir *aip;
1894 	struct pagedep *pagedep;
1895 	struct dirrem *dirrem;
1896 	struct diradd *dap;
1897 	int i;
1898 
1899 	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
1900 		switch (wk->wk_type) {
1901 
1902 		case D_INDIRDEP:
1903 			indirdep = WK_INDIRDEP(wk);
1904 			/*
1905 			 * None of the indirect pointers will ever be visible,
1906 			 * so they can simply be tossed. GOINGAWAY ensures
1907 			 * that allocated pointers will be saved in the buffer
1908 			 * cache until they are freed. Note that they will
1909 			 * only be able to be found by their physical address
1910 			 * since the inode mapping the logical address will
1911 			 * be gone. The save buffer used for the safe copy
1912 			 * was allocated in setup_allocindir_phase2 using
1913 			 * the physical address so it could be used for this
1914 			 * purpose. Hence we swap the safe copy with the real
1915 			 * copy, allowing the safe copy to be freed and holding
1916 			 * on to the real copy for later use in indir_trunc.
1917 			 *
1918 			 * NOTE: ir_savebp is relative to the block device
1919 			 * so b_bio1 contains the device block number.
1920 			 */
1921 			if (indirdep->ir_state & GOINGAWAY) {
1922 				panic("deallocate_dependencies: already gone");
1923 			}
1924 			indirdep->ir_state |= GOINGAWAY;
1925 			while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != NULL)
1926 				free_allocindir(aip, inodedep);
1927 			if (bp->b_bio1.bio_offset >= 0 ||
1928 			    bp->b_bio2.bio_offset != indirdep->ir_savebp->b_bio1.bio_offset) {
1929 				panic("deallocate_dependencies: not indir");
1930 			}
1931 			bcopy(bp->b_data, indirdep->ir_savebp->b_data,
1932 			    bp->b_bcount);
1933 			WORKLIST_REMOVE(wk);
1934 			WORKLIST_INSERT_BP(indirdep->ir_savebp, wk);
1935 			continue;
1936 
1937 		case D_PAGEDEP:
1938 			pagedep = WK_PAGEDEP(wk);
1939 			/*
1940 			 * None of the directory additions will ever be
1941 			 * visible, so they can simply be tossed.
1942 			 */
1943 			for (i = 0; i < DAHASHSZ; i++)
1944 				while ((dap =
1945 				    LIST_FIRST(&pagedep->pd_diraddhd[i])))
1946 					free_diradd(dap);
1947 			while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
1948 				free_diradd(dap);
1949 			/*
1950 			 * Copy any directory remove dependencies to the list
1951 			 * to be processed after the zero'ed inode is written.
1952 			 * If the inode has already been written, then they
1953 			 * can be dumped directly onto the work list.
1954 			 */
1955 			LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
1956 				LIST_REMOVE(dirrem, dm_next);
1957 				dirrem->dm_dirinum = pagedep->pd_ino;
1958 				if (inodedep == NULL ||
1959 				    (inodedep->id_state & ALLCOMPLETE) ==
1960 				     ALLCOMPLETE)
1961 					add_to_worklist(&dirrem->dm_list);
1962 				else
1963 					WORKLIST_INSERT(&inodedep->id_bufwait,
1964 					    &dirrem->dm_list);
1965 			}
1966 			WORKLIST_REMOVE(&pagedep->pd_list);
1967 			LIST_REMOVE(pagedep, pd_hash);
1968 			WORKITEM_FREE(pagedep, D_PAGEDEP);
1969 			continue;
1970 
1971 		case D_ALLOCINDIR:
1972 			free_allocindir(WK_ALLOCINDIR(wk), inodedep);
1973 			continue;
1974 
1975 		case D_ALLOCDIRECT:
1976 		case D_INODEDEP:
1977 			panic("deallocate_dependencies: Unexpected type %s",
1978 			    TYPENAME(wk->wk_type));
1979 			/* NOTREACHED */
1980 
1981 		default:
1982 			panic("deallocate_dependencies: Unknown type %s",
1983 			    TYPENAME(wk->wk_type));
1984 			/* NOTREACHED */
1985 		}
1986 	}
1987 }
1988 
1989 /*
1990  * Free an allocdirect. Generate a new freefrag work request if appropriate.
1991  * This routine must be called with splbio interrupts blocked.
1992  */
1993 static void
1994 free_allocdirect(struct allocdirectlst *adphead,
1995 		 struct allocdirect *adp, int delay)
1996 {
1997 	KKASSERT(lock_held(&lk));
1998 
1999 	if ((adp->ad_state & DEPCOMPLETE) == 0)
2000 		LIST_REMOVE(adp, ad_deps);
2001 	TAILQ_REMOVE(adphead, adp, ad_next);
2002 	if ((adp->ad_state & COMPLETE) == 0)
2003 		WORKLIST_REMOVE(&adp->ad_list);
2004 	if (adp->ad_freefrag != NULL) {
2005 		if (delay)
2006 			WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
2007 			    &adp->ad_freefrag->ff_list);
2008 		else
2009 			add_to_worklist(&adp->ad_freefrag->ff_list);
2010 	}
2011 	WORKITEM_FREE(adp, D_ALLOCDIRECT);
2012 }
2013 
2014 /*
2015  * Prepare an inode to be freed. The actual free operation is not
2016  * done until the zero'ed inode has been written to disk.
2017  */
2018 void
2019 softdep_freefile(struct vnode *pvp, ino_t ino, int mode)
2020 {
2021 	struct inode *ip = VTOI(pvp);
2022 	struct inodedep *inodedep;
2023 	struct freefile *freefile;
2024 
2025 	/*
2026 	 * This sets up the inode de-allocation dependency.
2027 	 */
2028 	freefile = kmalloc(sizeof(struct freefile), M_FREEFILE,
2029 			   M_SOFTDEP_FLAGS);
2030 	freefile->fx_list.wk_type = D_FREEFILE;
2031 	freefile->fx_list.wk_state = 0;
2032 	freefile->fx_mode = mode;
2033 	freefile->fx_oldinum = ino;
2034 	freefile->fx_devvp = ip->i_devvp;
2035 	freefile->fx_fs = ip->i_fs;
2036 
2037 	/*
2038 	 * If the inodedep does not exist, then the zero'ed inode has
2039 	 * been written to disk. If the allocated inode has never been
2040 	 * written to disk, then the on-disk inode is zero'ed. In either
2041 	 * case we can free the file immediately.
2042 	 */
2043 	ACQUIRE_LOCK(&lk);
2044 	if (inodedep_lookup(ip->i_fs, ino, 0, &inodedep) == 0 ||
2045 	    check_inode_unwritten(inodedep)) {
2046 		FREE_LOCK(&lk);
2047 		handle_workitem_freefile(freefile);
2048 		return;
2049 	}
2050 	WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
2051 	FREE_LOCK(&lk);
2052 }
2053 
2054 /*
2055  * Check to see if an inode has never been written to disk. If
2056  * so free the inodedep and return success, otherwise return failure.
2057  * This routine must be called with splbio interrupts blocked.
2058  *
2059  * If we still have a bitmap dependency, then the inode has never
2060  * been written to disk. Drop the dependency as it is no longer
2061  * necessary since the inode is being deallocated. We set the
2062  * ALLCOMPLETE flags since the bitmap now properly shows that the
2063  * inode is not allocated. Even if the inode is actively being
2064  * written, it has been rolled back to its zero'ed state, so we
2065  * are ensured that a zero inode is what is on the disk. For short
2066  * lived files, this change will usually result in removing all the
2067  * dependencies from the inode so that it can be freed immediately.
2068  */
2069 static int
2070 check_inode_unwritten(struct inodedep *inodedep)
2071 {
2072 
2073 	if ((inodedep->id_state & DEPCOMPLETE) != 0 ||
2074 	    LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
2075 	    LIST_FIRST(&inodedep->id_bufwait) != NULL ||
2076 	    LIST_FIRST(&inodedep->id_inowait) != NULL ||
2077 	    TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
2078 	    TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
2079 	    inodedep->id_nlinkdelta != 0)
2080 		return (0);
2081 
2082 	/*
2083 	 * Another process might be in initiate_write_inodeblock
2084 	 * trying to allocate memory without holding "Softdep Lock".
2085 	 */
2086 	if ((inodedep->id_state & IOSTARTED) != 0 &&
2087 	    inodedep->id_savedino == NULL)
2088 		return(0);
2089 
2090 	inodedep->id_state |= ALLCOMPLETE;
2091 	LIST_REMOVE(inodedep, id_deps);
2092 	inodedep->id_buf = NULL;
2093 	if (inodedep->id_state & ONWORKLIST)
2094 		WORKLIST_REMOVE(&inodedep->id_list);
2095 	if (inodedep->id_savedino != NULL) {
2096 		kfree(inodedep->id_savedino, M_INODEDEP);
2097 		inodedep->id_savedino = NULL;
2098 	}
2099 	if (free_inodedep(inodedep) == 0) {
2100 		panic("check_inode_unwritten: busy inode");
2101 	}
2102 	return (1);
2103 }
2104 
2105 /*
2106  * Try to free an inodedep structure. Return 1 if it could be freed.
2107  */
2108 static int
2109 free_inodedep(struct inodedep *inodedep)
2110 {
2111 
2112 	if ((inodedep->id_state & ONWORKLIST) != 0 ||
2113 	    (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
2114 	    LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
2115 	    LIST_FIRST(&inodedep->id_bufwait) != NULL ||
2116 	    LIST_FIRST(&inodedep->id_inowait) != NULL ||
2117 	    TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
2118 	    TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
2119 	    inodedep->id_nlinkdelta != 0 || inodedep->id_savedino != NULL)
2120 		return (0);
2121 	LIST_REMOVE(inodedep, id_hash);
2122 	WORKITEM_FREE(inodedep, D_INODEDEP);
2123 	num_inodedep -= 1;
2124 	return (1);
2125 }
2126 
2127 /*
2128  * This workitem routine performs the block de-allocation.
2129  * The workitem is added to the pending list after the updated
2130  * inode block has been written to disk.  As mentioned above,
2131  * checks regarding the number of blocks de-allocated (compared
2132  * to the number of blocks allocated for the file) are also
2133  * performed in this function.
2134  */
2135 static void
2136 handle_workitem_freeblocks(struct freeblks *freeblks)
2137 {
2138 	struct inode tip;
2139 	ufs_daddr_t bn;
2140 	struct fs *fs;
2141 	int i, level, bsize;
2142 	long nblocks, blocksreleased = 0;
2143 	int error, allerror = 0;
2144 	ufs_lbn_t baselbns[UFS_NIADDR], tmpval;
2145 
2146 	tip.i_number = freeblks->fb_previousinum;
2147 	tip.i_devvp = freeblks->fb_devvp;
2148 	tip.i_dev = freeblks->fb_devvp->v_rdev;
2149 	tip.i_fs = freeblks->fb_fs;
2150 	tip.i_size = freeblks->fb_oldsize;
2151 	tip.i_uid = freeblks->fb_uid;
2152 	fs = freeblks->fb_fs;
2153 	tmpval = 1;
2154 	baselbns[0] = UFS_NDADDR;
2155 	for (i = 1; i < UFS_NIADDR; i++) {
2156 		tmpval *= NINDIR(fs);
2157 		baselbns[i] = baselbns[i - 1] + tmpval;
2158 	}
2159 	nblocks = btodb(fs->fs_bsize);
2160 	blocksreleased = 0;
2161 	/*
2162 	 * Indirect blocks first.
2163 	 */
2164 	for (level = (UFS_NIADDR - 1); level >= 0; level--) {
2165 		if ((bn = freeblks->fb_iblks[level]) == 0)
2166 			continue;
2167 		if ((error = indir_trunc(&tip, fsbtodoff(fs, bn), level,
2168 		    baselbns[level], &blocksreleased)) != 0)
2169 			allerror = error;
2170 		ffs_blkfree(&tip, bn, fs->fs_bsize);
2171 		blocksreleased += nblocks;
2172 	}
2173 	/*
2174 	 * All direct blocks or frags.
2175 	 */
2176 	for (i = (UFS_NDADDR - 1); i >= 0; i--) {
2177 		if ((bn = freeblks->fb_dblks[i]) == 0)
2178 			continue;
2179 		bsize = blksize(fs, &tip, i);
2180 		ffs_blkfree(&tip, bn, bsize);
2181 		blocksreleased += btodb(bsize);
2182 	}
2183 
2184 #ifdef DIAGNOSTIC
2185 	if (freeblks->fb_chkcnt != blocksreleased)
2186 		kprintf("handle_workitem_freeblocks: block count\n");
2187 	if (allerror)
2188 		softdep_error("handle_workitem_freeblocks", allerror);
2189 #endif /* DIAGNOSTIC */
2190 	WORKITEM_FREE(freeblks, D_FREEBLKS);
2191 }
2192 
2193 /*
2194  * Release blocks associated with the inode ip and stored in the indirect
2195  * block at doffset. If level is greater than SINGLE, the block is an
2196  * indirect block and recursive calls to indir_trunc must be used to
2197  * cleanse other indirect blocks.
2198  */
2199 static int
2200 indir_trunc(struct inode *ip, off_t doffset, int level, ufs_lbn_t lbn,
2201 	    long *countp)
2202 {
2203 	struct buf *bp;
2204 	ufs_daddr_t *bap;
2205 	ufs_daddr_t nb;
2206 	struct fs *fs;
2207 	struct worklist *wk;
2208 	struct indirdep *indirdep;
2209 	int i, lbnadd, nblocks;
2210 	int error, allerror = 0;
2211 
2212 	fs = ip->i_fs;
2213 	lbnadd = 1;
2214 	for (i = level; i > 0; i--)
2215 		lbnadd *= NINDIR(fs);
2216 	/*
2217 	 * Get buffer of block pointers to be freed. This routine is not
2218 	 * called until the zero'ed inode has been written, so it is safe
2219 	 * to free blocks as they are encountered. Because the inode has
2220 	 * been zero'ed, calls to bmap on these blocks will fail. So, we
2221 	 * have to use the on-disk address and the block device for the
2222 	 * filesystem to look them up. If the file was deleted before its
2223 	 * indirect blocks were all written to disk, the routine that set
2224 	 * us up (deallocate_dependencies) will have arranged to leave
2225 	 * a complete copy of the indirect block in memory for our use.
2226 	 * Otherwise we have to read the blocks in from the disk.
2227 	 */
2228 	ACQUIRE_LOCK(&lk);
2229 	if ((bp = findblk(ip->i_devvp, doffset, FINDBLK_TEST)) != NULL &&
2230 	    (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
2231 		/*
2232 		 * bp must be ir_savebp, which is held locked for our use.
2233 		 */
2234 		if (wk->wk_type != D_INDIRDEP ||
2235 		    (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp ||
2236 		    (indirdep->ir_state & GOINGAWAY) == 0) {
2237 			panic("indir_trunc: lost indirdep");
2238 		}
2239 		WORKLIST_REMOVE(wk);
2240 		WORKITEM_FREE(indirdep, D_INDIRDEP);
2241 		if (LIST_FIRST(&bp->b_dep) != NULL) {
2242 			panic("indir_trunc: dangling dep");
2243 		}
2244 		FREE_LOCK(&lk);
2245 	} else {
2246 		FREE_LOCK(&lk);
2247 		error = bread(ip->i_devvp, doffset, (int)fs->fs_bsize, &bp);
2248 		if (error)
2249 			return (error);
2250 	}
2251 	/*
2252 	 * Recursively free indirect blocks.
2253 	 */
2254 	bap = (ufs_daddr_t *)bp->b_data;
2255 	nblocks = btodb(fs->fs_bsize);
2256 	for (i = NINDIR(fs) - 1; i >= 0; i--) {
2257 		if ((nb = bap[i]) == 0)
2258 			continue;
2259 		if (level != 0) {
2260 			if ((error = indir_trunc(ip, fsbtodoff(fs, nb),
2261 			     level - 1, lbn + (i * lbnadd), countp)) != 0)
2262 				allerror = error;
2263 		}
2264 		ffs_blkfree(ip, nb, fs->fs_bsize);
2265 		*countp += nblocks;
2266 	}
2267 	bp->b_flags |= B_INVAL | B_NOCACHE;
2268 	brelse(bp);
2269 	return (allerror);
2270 }
2271 
2272 /*
2273  * Free an allocindir.
2274  * This routine must be called with splbio interrupts blocked.
2275  */
2276 static void
2277 free_allocindir(struct allocindir *aip, struct inodedep *inodedep)
2278 {
2279 	struct freefrag *freefrag;
2280 
2281 	KKASSERT(lock_held(&lk));
2282 
2283 	if ((aip->ai_state & DEPCOMPLETE) == 0)
2284 		LIST_REMOVE(aip, ai_deps);
2285 	if (aip->ai_state & ONWORKLIST)
2286 		WORKLIST_REMOVE(&aip->ai_list);
2287 	LIST_REMOVE(aip, ai_next);
2288 	if ((freefrag = aip->ai_freefrag) != NULL) {
2289 		if (inodedep == NULL)
2290 			add_to_worklist(&freefrag->ff_list);
2291 		else
2292 			WORKLIST_INSERT(&inodedep->id_bufwait,
2293 			    &freefrag->ff_list);
2294 	}
2295 	WORKITEM_FREE(aip, D_ALLOCINDIR);
2296 }
2297 
2298 /*
2299  * Directory entry addition dependencies.
2300  *
2301  * When adding a new directory entry, the inode (with its incremented link
2302  * count) must be written to disk before the directory entry's pointer to it.
2303  * Also, if the inode is newly allocated, the corresponding freemap must be
2304  * updated (on disk) before the directory entry's pointer. These requirements
2305  * are met via undo/redo on the directory entry's pointer, which consists
2306  * simply of the inode number.
2307  *
2308  * As directory entries are added and deleted, the free space within a
2309  * directory block can become fragmented.  The ufs filesystem will compact
2310  * a fragmented directory block to make space for a new entry. When this
2311  * occurs, the offsets of previously added entries change. Any "diradd"
2312  * dependency structures corresponding to these entries must be updated with
2313  * the new offsets.
2314  */
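
/*
 * Illustrative sketch added to this listing (not part of the original
 * source): the undo half of the undo/redo mentioned above is essentially
 * what initiate_write_filepage() does later in this file, condensed here
 * for a single new entry.  The redo (restoring d_ino once the referenced
 * inode is safely on disk) happens in the I/O completion path, which is
 * outside this portion of the listing; the function name is illustrative.
 */
#if 0	/* example only -- never compiled */
static void
example_diradd_undo(struct buf *bp, struct diradd *dap)
{
	struct direct *ep;

	ep = (struct direct *)((char *)bp->b_data + dap->da_offset);
	if (dap->da_state & DIRCHG)
		ep->d_ino = dap->da_previous->dm_oldinum; /* old inode number */
	else
		ep->d_ino = 0;			/* entry not yet valid */
	dap->da_state &= ~ATTACHED;
	dap->da_state |= UNDONE;
}
#endif
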
2315 
2316 /*
2317  * This routine is called after the in-memory inode's link
2318  * count has been incremented, but before the directory entry's
2319  * pointer to the inode has been set.
2320  *
2321  * Parameters:
2322  *	bp:		buffer containing directory block
2323  *	dp:		inode for directory
2324  *	diroffset:	offset of new entry in directory
2325  *	newinum:	inode referenced by new directory entry
2326  *	newdirbp:	non-NULL => contents of new mkdir
2327  */
2328 void
2329 softdep_setup_directory_add(struct buf *bp, struct inode *dp, off_t diroffset,
2330 			    ino_t newinum, struct buf *newdirbp)
2331 {
2332 	int offset;		/* offset of new entry within directory block */
2333 	ufs_lbn_t lbn;		/* block in directory containing new entry */
2334 	struct fs *fs;
2335 	struct diradd *dap;
2336 	struct pagedep *pagedep;
2337 	struct inodedep *inodedep;
2338 	struct mkdir *mkdir1, *mkdir2;
2339 
2340 	/*
2341 	 * Whiteouts have no dependencies.
2342 	 */
2343 	if (newinum == UFS_WINO) {
2344 		if (newdirbp != NULL)
2345 			bdwrite(newdirbp);
2346 		return;
2347 	}
2348 
2349 	fs = dp->i_fs;
2350 	lbn = lblkno(fs, diroffset);
2351 	offset = blkoff(fs, diroffset);
2352 	dap = kmalloc(sizeof(struct diradd), M_DIRADD,
2353 		      M_SOFTDEP_FLAGS | M_ZERO);
2354 	dap->da_list.wk_type = D_DIRADD;
2355 	dap->da_offset = offset;
2356 	dap->da_newinum = newinum;
2357 	dap->da_state = ATTACHED;
2358 	if (newdirbp == NULL) {
2359 		dap->da_state |= DEPCOMPLETE;
2360 		ACQUIRE_LOCK(&lk);
2361 	} else {
2362 		dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
2363 		mkdir1 = kmalloc(sizeof(struct mkdir), M_MKDIR,
2364 				 M_SOFTDEP_FLAGS);
2365 		mkdir1->md_list.wk_type = D_MKDIR;
2366 		mkdir1->md_state = MKDIR_BODY;
2367 		mkdir1->md_diradd = dap;
2368 		mkdir2 = kmalloc(sizeof(struct mkdir), M_MKDIR,
2369 				 M_SOFTDEP_FLAGS);
2370 		mkdir2->md_list.wk_type = D_MKDIR;
2371 		mkdir2->md_state = MKDIR_PARENT;
2372 		mkdir2->md_diradd = dap;
2373 		/*
2374 		 * Dependency on "." and ".." being written to disk.
2375 		 */
2376 		mkdir1->md_buf = newdirbp;
2377 		ACQUIRE_LOCK(&lk);
2378 		LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
2379 		WORKLIST_INSERT_BP(newdirbp, &mkdir1->md_list);
2380 		FREE_LOCK(&lk);
2381 		bdwrite(newdirbp);
2382 		/*
2383 		 * Dependency on link count increase for parent directory
2384 		 */
2385 		ACQUIRE_LOCK(&lk);
2386 		if (inodedep_lookup(dp->i_fs, dp->i_number, 0, &inodedep) == 0
2387 		    || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
2388 			dap->da_state &= ~MKDIR_PARENT;
2389 			WORKITEM_FREE(mkdir2, D_MKDIR);
2390 		} else {
2391 			LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
2392 			WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
2393 		}
2394 	}
2395 	/*
2396 	 * Link into parent directory pagedep to await its being written.
2397 	 */
2398 	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
2399 		WORKLIST_INSERT_BP(bp, &pagedep->pd_list);
2400 	dap->da_pagedep = pagedep;
2401 	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
2402 	    da_pdlist);
2403 	/*
2404 	 * Link into its inodedep. Put it on the id_bufwait list if the inode
2405 	 * is not yet written. If it is written, do the post-inode write
2406 	 * processing to put it on the id_pendinghd list.
2407 	 */
2408 	(void) inodedep_lookup(fs, newinum, DEPALLOC, &inodedep);
2409 	if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
2410 		diradd_inode_written(dap, inodedep);
2411 	else
2412 		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
2413 	FREE_LOCK(&lk);
2414 }
2415 
2416 /*
2417  * This procedure is called to change the offset of a directory
2418  * entry when compacting a directory block which must be owned
2419  * exclusively by the caller. Note that the actual entry movement
2420  * must be done in this procedure to ensure that no I/O completions
2421  * occur while the move is in progress.
2422  *
2423  * Parameters:
2424  *	dp:		inode for directory
2425  *	base:		address of dp->i_offset
2426  *	oldloc:		address of old directory location
2427  *	newloc:		address of new directory location
2428  *	entrysize:	size of directory entry
2429  */
2430 void
2431 softdep_change_directoryentry_offset(struct inode *dp, caddr_t base,
2432 				     caddr_t oldloc, caddr_t newloc,
2433 				     int entrysize)
2434 {
2435 	int offset, oldoffset, newoffset;
2436 	struct pagedep *pagedep;
2437 	struct diradd *dap;
2438 	ufs_lbn_t lbn;
2439 
2440 	ACQUIRE_LOCK(&lk);
2441 	lbn = lblkno(dp->i_fs, dp->i_offset);
2442 	offset = blkoff(dp->i_fs, dp->i_offset);
2443 	if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0)
2444 		goto done;
2445 	oldoffset = offset + (oldloc - base);
2446 	newoffset = offset + (newloc - base);
2447 
2448 	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) {
2449 		if (dap->da_offset != oldoffset)
2450 			continue;
2451 		dap->da_offset = newoffset;
2452 		if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset))
2453 			break;
2454 		LIST_REMOVE(dap, da_pdlist);
2455 		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)],
2456 		    dap, da_pdlist);
2457 		break;
2458 	}
2459 	if (dap == NULL) {
2460 
2461 		LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) {
2462 			if (dap->da_offset == oldoffset) {
2463 				dap->da_offset = newoffset;
2464 				break;
2465 			}
2466 		}
2467 	}
2468 done:
2469 	bcopy(oldloc, newloc, entrysize);
2470 	FREE_LOCK(&lk);
2471 }
2472 
2473 /*
2474  * Free a diradd dependency structure. This routine must be called
2475  * with splbio interrupts blocked.
2476  */
2477 static void
2478 free_diradd(struct diradd *dap)
2479 {
2480 	struct dirrem *dirrem;
2481 	struct pagedep *pagedep;
2482 	struct inodedep *inodedep;
2483 	struct mkdir *mkdir, *nextmd;
2484 
2485 	KKASSERT(lock_held(&lk));
2486 
2487 	WORKLIST_REMOVE(&dap->da_list);
2488 	LIST_REMOVE(dap, da_pdlist);
2489 	if ((dap->da_state & DIRCHG) == 0) {
2490 		pagedep = dap->da_pagedep;
2491 	} else {
2492 		dirrem = dap->da_previous;
2493 		pagedep = dirrem->dm_pagedep;
2494 		dirrem->dm_dirinum = pagedep->pd_ino;
2495 		add_to_worklist(&dirrem->dm_list);
2496 	}
2497 	if (inodedep_lookup(VFSTOUFS(pagedep->pd_mnt)->um_fs, dap->da_newinum,
2498 	    0, &inodedep) != 0)
2499 		(void) free_inodedep(inodedep);
2500 	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
2501 		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
2502 			nextmd = LIST_NEXT(mkdir, md_mkdirs);
2503 			if (mkdir->md_diradd != dap)
2504 				continue;
2505 			dap->da_state &= ~mkdir->md_state;
2506 			WORKLIST_REMOVE(&mkdir->md_list);
2507 			LIST_REMOVE(mkdir, md_mkdirs);
2508 			WORKITEM_FREE(mkdir, D_MKDIR);
2509 		}
2510 		if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
2511 			panic("free_diradd: unfound ref");
2512 		}
2513 	}
2514 	WORKITEM_FREE(dap, D_DIRADD);
2515 }
2516 
2517 /*
2518  * Directory entry removal dependencies.
2519  *
2520  * When removing a directory entry, the entry's inode pointer must be
2521  * zero'ed on disk before the corresponding inode's link count is decremented
2522  * (possibly freeing the inode for re-use). This dependency is handled by
2523  * updating the directory entry but delaying the inode count reduction until
2524  * after the directory block has been written to disk. After this point, the
2525  * inode count can be decremented whenever it is convenient.
2526  */
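
/*
 * Illustrative sketch added to this listing (not part of the original
 * source): a softdep-aware removal path would blank the entry in the
 * directory buffer, schedule a delayed write, and leave the link-count
 * decrement to softdep_setup_remove() below rather than doing it
 * synchronously.  The helper name and the DOINGSOFTDEP() test are
 * assumptions for illustration.
 */
#if 0	/* example only -- never compiled */
static void
example_remove_entry(struct buf *bp, struct inode *dp, struct inode *ip)
{
	/* The directory entry in bp has already been cleared by the caller. */
	if (DOINGSOFTDEP(ITOV(dp))) {
		softdep_setup_remove(bp, dp, ip, 0);
		bdwrite(bp);
	} else {
		bwrite(bp);		/* entry must be zero'ed on disk first */
		ip->i_nlink--;
		ip->i_flag |= IN_CHANGE;
	}
}
#endif
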
2527 
2528 /*
2529  * This routine should be called immediately after removing
2530  * a directory entry.  The inode's link count should not be
2531  * decremented by the calling procedure -- the soft updates
2532  * code will do this task when it is safe.
2533  *
2534  * Parameters:
2535  *	bp:		buffer containing directory block
2536  *	dp:		inode for the directory being modified
2537  *	ip:		inode for directory entry being removed
2538  *	isrmdir:	indicates if doing RMDIR
2539  */
2540 void
2541 softdep_setup_remove(struct buf *bp, struct inode *dp, struct inode *ip,
2542 		     int isrmdir)
2543 {
2544 	struct dirrem *dirrem, *prevdirrem;
2545 
2546 	/*
2547 	 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
2548 	 */
2549 	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
2550 
2551 	/*
2552 	 * If the COMPLETE flag is clear, then there were no active
2553 	 * entries and we want to roll back to a zeroed entry until
2554 	 * the new inode is committed to disk. If the COMPLETE flag is
2555 	 * set then we have deleted an entry that never made it to
2556 	 * disk. If the entry we deleted resulted from a name change,
2557 	 * then the old name still resides on disk. We cannot delete
2558 	 * its inode (returned to us in prevdirrem) until the zeroed
2559 	 * directory entry gets to disk. The new inode has never been
2560 	 * referenced on the disk, so can be deleted immediately.
2561 	 */
2562 	if ((dirrem->dm_state & COMPLETE) == 0) {
2563 		LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
2564 		    dm_next);
2565 		FREE_LOCK(&lk);
2566 	} else {
2567 		if (prevdirrem != NULL)
2568 			LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
2569 			    prevdirrem, dm_next);
2570 		dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
2571 		FREE_LOCK(&lk);
2572 		handle_workitem_remove(dirrem);
2573 	}
2574 }
2575 
2576 /*
2577  * Allocate a new dirrem if appropriate and return it along with
2578  * its associated pagedep. Called without a lock, returns with lock.
2579  */
2580 static long num_dirrem;		/* number of dirrem allocated */
2581 
2582 /*
2583  * Parameters:
2584  *	bp:		buffer containing directory block
2585  *	dp:		inode for the directory being modified
2586  *	ip:		inode for directory entry being removed
2587  *	isrmdir:	indicates if doing RMDIR
2588  *	prevdirremp:	previously referenced inode, if any
2589  */
2590 static struct dirrem *
2591 newdirrem(struct buf *bp, struct inode *dp, struct inode *ip,
2592 	  int isrmdir, struct dirrem **prevdirremp)
2593 {
2594 	int offset;
2595 	ufs_lbn_t lbn;
2596 	struct diradd *dap;
2597 	struct dirrem *dirrem;
2598 	struct pagedep *pagedep;
2599 
2600 	/*
2601 	 * Whiteouts have no deletion dependencies.
2602 	 */
2603 	if (ip == NULL)
2604 		panic("newdirrem: whiteout");
2605 	/*
2606 	 * If we are over our limit, try to improve the situation.
2607 	 * Limiting the number of dirrem structures will also limit
2608 	 * the number of freefile and freeblks structures.
2609 	 */
2610 	if (num_dirrem > max_softdeps / 4)
2611 		speedup_syncer(NULL);
2612 	if (num_dirrem > max_softdeps / 2) {
2613 		ACQUIRE_LOCK(&lk);
2614 		request_cleanup(FLUSH_REMOVE);
2615 		FREE_LOCK(&lk);
2616 	}
2617 
2618 	num_dirrem += 1;
2619 	dirrem = kmalloc(sizeof(struct dirrem), M_DIRREM,
2620 			 M_SOFTDEP_FLAGS | M_ZERO);
2621 	dirrem->dm_list.wk_type = D_DIRREM;
2622 	dirrem->dm_state = isrmdir ? RMDIR : 0;
2623 	dirrem->dm_mnt = ITOV(ip)->v_mount;
2624 	dirrem->dm_oldinum = ip->i_number;
2625 	*prevdirremp = NULL;
2626 
2627 	ACQUIRE_LOCK(&lk);
2628 	lbn = lblkno(dp->i_fs, dp->i_offset);
2629 	offset = blkoff(dp->i_fs, dp->i_offset);
2630 	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
2631 		WORKLIST_INSERT_BP(bp, &pagedep->pd_list);
2632 	dirrem->dm_pagedep = pagedep;
2633 	/*
2634 	 * Check for a diradd dependency for the same directory entry.
2635 	 * If present, then both dependencies become obsolete and can
2636 	 * be de-allocated. Check for an entry on both the pd_diraddhd
2637 	 * list and the pd_pendinghd list.
2638 	 */
2639 
2640 	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
2641 		if (dap->da_offset == offset)
2642 			break;
2643 	if (dap == NULL) {
2644 
2645 		LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
2646 			if (dap->da_offset == offset)
2647 				break;
2648 		if (dap == NULL)
2649 			return (dirrem);
2650 	}
2651 	/*
2652 	 * Must be ATTACHED at this point.
2653 	 */
2654 	if ((dap->da_state & ATTACHED) == 0) {
2655 		panic("newdirrem: not ATTACHED");
2656 	}
2657 	if (dap->da_newinum != ip->i_number) {
2658 		panic("newdirrem: inum %"PRId64" should be %"PRId64,
2659 		    ip->i_number, dap->da_newinum);
2660 	}
2661 	/*
2662 	 * If we are deleting a changed name that never made it to disk,
2663 	 * then return the dirrem describing the previous inode (which
2664 	 * represents the inode currently referenced from this entry on disk).
2665 	 */
2666 	if ((dap->da_state & DIRCHG) != 0) {
2667 		*prevdirremp = dap->da_previous;
2668 		dap->da_state &= ~DIRCHG;
2669 		dap->da_pagedep = pagedep;
2670 	}
2671 	/*
2672 	 * We are deleting an entry that never made it to disk.
2673 	 * Mark it COMPLETE so we can delete its inode immediately.
2674 	 */
2675 	dirrem->dm_state |= COMPLETE;
2676 	free_diradd(dap);
2677 	return (dirrem);
2678 }
2679 
2680 /*
2681  * Directory entry change dependencies.
2682  *
2683  * Changing an existing directory entry requires that an add operation
2684  * be completed first followed by a deletion. The semantics for the addition
2685  * are identical to the description of adding a new entry above except
2686  * that the rollback is to the old inode number rather than zero. Once
2687  * the addition dependency is completed, the removal is done as described
2688  * in the removal routine above.
2689  */
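
/*
 * Illustrative sketch added to this listing (not part of the original
 * source): when a rename overwrites an existing entry in place, the
 * caller stores the new inode number in the entry and then lets
 * softdep_setup_directory_change() below record both the addition (with
 * rollback to the old inode number) and the removal of the old
 * reference.  The helper name and argument sources are assumptions.
 */
#if 0	/* example only -- never compiled */
static void
example_change_entry(struct buf *bp, struct inode *dp, struct inode *oip,
		     ino_t newinum, int isrmdir)
{
	struct direct *ep;

	ep = (struct direct *)
	    ((char *)bp->b_data + blkoff(dp->i_fs, dp->i_offset));
	ep->d_ino = newinum;
	softdep_setup_directory_change(bp, dp, oip, newinum, isrmdir);
	bdwrite(bp);
}
#endif
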
2690 
2691 /*
2692  * This routine should be called immediately after changing
2693  * a directory entry.  The inode's link count should not be
2694  * decremented by the calling procedure -- the soft updates
2695  * code will perform this task when it is safe.
2696  *
2697  * Parameters:
2698  *	bp:		buffer containing directory block
2699  *	dp:		inode for the directory being modified
2700  *	ip:		inode for directory entry being removed
2701  *	newinum:	new inode number for changed entry
2702  *	isrmdir:	indicates if doing RMDIR
2703  */
2704 void
2705 softdep_setup_directory_change(struct buf *bp, struct inode *dp,
2706 			       struct inode *ip, ino_t newinum,
2707 			       int isrmdir)
2708 {
2709 	int offset;
2710 	struct diradd *dap = NULL;
2711 	struct dirrem *dirrem, *prevdirrem;
2712 	struct pagedep *pagedep;
2713 	struct inodedep *inodedep;
2714 
2715 	offset = blkoff(dp->i_fs, dp->i_offset);
2716 
2717 	/*
2718 	 * Whiteouts do not need diradd dependencies.
2719 	 */
2720 	if (newinum != UFS_WINO) {
2721 		dap = kmalloc(sizeof(struct diradd), M_DIRADD,
2722 			      M_SOFTDEP_FLAGS | M_ZERO);
2723 		dap->da_list.wk_type = D_DIRADD;
2724 		dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
2725 		dap->da_offset = offset;
2726 		dap->da_newinum = newinum;
2727 	}
2728 
2729 	/*
2730 	 * Allocate a new dirrem and ACQUIRE_LOCK.
2731 	 */
2732 	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
2733 	pagedep = dirrem->dm_pagedep;
2734 	/*
2735 	 * The possible values for isrmdir:
2736 	 *	0 - non-directory file rename
2737 	 *	1 - directory rename within same directory
2738 	 *   inum - directory rename to new directory of given inode number
2739 	 * When renaming to a new directory, we are both deleting and
2740 	 * creating a new directory entry, so the link count on the new
2741 	 * directory should not change. Thus we do not need the followup
2742 	 * dirrem which is usually done in handle_workitem_remove. We set
2743 	 * the DIRCHG flag to tell handle_workitem_remove to skip the
2744 	 * followup dirrem.
2745 	 */
2746 	if (isrmdir > 1)
2747 		dirrem->dm_state |= DIRCHG;
2748 
2749 	/*
2750 	 * Whiteouts have no additional dependencies,
2751 	 * so just put the dirrem on the correct list.
2752 	 */
2753 	if (newinum == UFS_WINO) {
2754 		if ((dirrem->dm_state & COMPLETE) == 0) {
2755 			LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
2756 			    dm_next);
2757 		} else {
2758 			dirrem->dm_dirinum = pagedep->pd_ino;
2759 			add_to_worklist(&dirrem->dm_list);
2760 		}
2761 		FREE_LOCK(&lk);
2762 		return;
2763 	}
2764 
2765 	/*
2766 	 * If the COMPLETE flag is clear, then there were no active
2767 	 * entries and we want to roll back to the previous inode until
2768 	 * the new inode is committed to disk. If the COMPLETE flag is
2769 	 * set, then we have deleted an entry that never made it to disk.
2770 	 * If the entry we deleted resulted from a name change, then the old
2771 	 * inode reference still resides on disk. Any rollback that we do
2772 	 * needs to be to that old inode (returned to us in prevdirrem). If
2773 	 * the entry we deleted resulted from a create, then there is
2774 	 * no entry on the disk, so we want to roll back to zero rather
2775 	 * than the uncommitted inode. In either of the COMPLETE cases we
2776 	 * want to immediately free the unwritten and unreferenced inode.
2777 	 */
2778 	if ((dirrem->dm_state & COMPLETE) == 0) {
2779 		dap->da_previous = dirrem;
2780 	} else {
2781 		if (prevdirrem != NULL) {
2782 			dap->da_previous = prevdirrem;
2783 		} else {
2784 			dap->da_state &= ~DIRCHG;
2785 			dap->da_pagedep = pagedep;
2786 		}
2787 		dirrem->dm_dirinum = pagedep->pd_ino;
2788 		add_to_worklist(&dirrem->dm_list);
2789 	}
2790 	/*
2791 	 * Link into its inodedep. Put it on the id_bufwait list if the inode
2792 	 * is not yet written. If it is written, do the post-inode write
2793 	 * processing to put it on the id_pendinghd list.
2794 	 */
2795 	if (inodedep_lookup(dp->i_fs, newinum, DEPALLOC, &inodedep) == 0 ||
2796 	    (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
2797 		dap->da_state |= COMPLETE;
2798 		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
2799 		WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
2800 	} else {
2801 		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
2802 		    dap, da_pdlist);
2803 		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
2804 	}
2805 	FREE_LOCK(&lk);
2806 }
2807 
2808 /*
2809  * Called whenever the link count on an inode is changed.
2810  * It creates an inode dependency so that the new reference(s)
2811  * to the inode cannot be committed to disk until the updated
2812  * inode has been written.
2813  *
2814  * Parameters:
2815  *	ip:	the inode with the increased link count
2816  */
2817 void
2818 softdep_change_linkcnt(struct inode *ip)
2819 {
2820 	struct inodedep *inodedep;
2821 
2822 	ACQUIRE_LOCK(&lk);
2823 	(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep);
2824 	if (ip->i_nlink < ip->i_effnlink) {
2825 		panic("softdep_change_linkcnt: bad delta");
2826 	}
2827 	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
2828 	FREE_LOCK(&lk);
2829 }
2830 
2831 /*
2832  * This workitem decrements the inode's link count.
2833  * If the link count reaches zero, the file is removed.
2834  */
2835 static void
2836 handle_workitem_remove(struct dirrem *dirrem)
2837 {
2838 	struct inodedep *inodedep;
2839 	struct vnode *vp;
2840 	struct inode *ip;
2841 	ino_t oldinum;
2842 	int error;
2843 
2844 	error = VFS_VGET(dirrem->dm_mnt, NULL, dirrem->dm_oldinum, &vp);
2845 	if (error) {
2846 		softdep_error("handle_workitem_remove: vget", error);
2847 		return;
2848 	}
2849 	ip = VTOI(vp);
2850 	ACQUIRE_LOCK(&lk);
2851 	if ((inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, 0, &inodedep)) == 0){
2852 		panic("handle_workitem_remove: lost inodedep");
2853 	}
2854 	/*
2855 	 * Normal file deletion.
2856 	 */
2857 	if ((dirrem->dm_state & RMDIR) == 0) {
2858 		ip->i_nlink--;
2859 		ip->i_flag |= IN_CHANGE;
2860 		if (ip->i_nlink < ip->i_effnlink) {
2861 			panic("handle_workitem_remove: bad file delta");
2862 		}
2863 		inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
2864 		FREE_LOCK(&lk);
2865 		vput(vp);
2866 		num_dirrem -= 1;
2867 		WORKITEM_FREE(dirrem, D_DIRREM);
2868 		return;
2869 	}
2870 	/*
2871 	 * Directory deletion. Decrement reference count for both the
2872 	 * just deleted parent directory entry and the reference for ".".
2873 	 * Next truncate the directory to length zero. When the
2874 	 * truncation completes, arrange to have the reference count on
2875 	 * the parent decremented to account for the loss of "..".
2876 	 */
2877 	ip->i_nlink -= 2;
2878 	ip->i_flag |= IN_CHANGE;
2879 	if (ip->i_nlink < ip->i_effnlink) {
2880 		panic("handle_workitem_remove: bad dir delta");
2881 	}
2882 	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
2883 	FREE_LOCK(&lk);
2884 	if ((error = ffs_truncate(vp, (off_t)0, 0, proc0.p_ucred)) != 0)
2885 		softdep_error("handle_workitem_remove: truncate", error);
2886 	/*
2887 	 * Rename a directory to a new parent. Since we are both deleting
2888 	 * and creating a new directory entry, the link count on the new
2889 	 * directory should not change. Thus we skip the followup dirrem.
2890 	 */
2891 	if (dirrem->dm_state & DIRCHG) {
2892 		vput(vp);
2893 		num_dirrem -= 1;
2894 		WORKITEM_FREE(dirrem, D_DIRREM);
2895 		return;
2896 	}
2897 	/*
2898 	 * If the inodedep does not exist, then the zero'ed inode has
2899 	 * been written to disk. If the allocated inode has never been
2900 	 * written to disk, then the on-disk inode is zero'ed. In either
2901 	 * case we can remove the file immediately.
2902 	 */
2903 	ACQUIRE_LOCK(&lk);
2904 	dirrem->dm_state = 0;
2905 	oldinum = dirrem->dm_oldinum;
2906 	dirrem->dm_oldinum = dirrem->dm_dirinum;
2907 	if (inodedep_lookup(ip->i_fs, oldinum, 0, &inodedep) == 0 ||
2908 	    check_inode_unwritten(inodedep)) {
2909 		FREE_LOCK(&lk);
2910 		vput(vp);
2911 		handle_workitem_remove(dirrem);
2912 		return;
2913 	}
2914 	WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
2915 	FREE_LOCK(&lk);
2916 	ip->i_flag |= IN_CHANGE;
2917 	ffs_update(vp, 0);
2918 	vput(vp);
2919 }
2920 
2921 /*
2922  * Inode de-allocation dependencies.
2923  *
2924  * When an inode's link count is reduced to zero, it can be de-allocated. We
2925  * found it convenient to postpone de-allocation until after the inode is
2926  * written to disk with its new link count (zero).  At this point, all of the
2927  * on-disk inode's block pointers are nullified and, with careful dependency
2928  * list ordering, all dependencies related to the inode will be satisfied and
2929  * the corresponding dependency structures de-allocated.  So, if/when the
2930  * inode is reused, there will be no mixing of old dependencies with new
2931  * ones.  This artificial dependency is set up by the block de-allocation
2932  * procedure above (softdep_setup_freeblocks) and completed by the
2933  * following procedure.
2934  */
2935 static void
2936 handle_workitem_freefile(struct freefile *freefile)
2937 {
2938 	struct vnode vp;
2939 	struct inode tip;
2940 	struct inodedep *idp;
2941 	int error;
2942 
2943 #ifdef DEBUG
2944 	ACQUIRE_LOCK(&lk);
2945 	error = inodedep_lookup(freefile->fx_fs, freefile->fx_oldinum, 0, &idp);
2946 	FREE_LOCK(&lk);
2947 	if (error)
2948 		panic("handle_workitem_freefile: inodedep survived");
2949 #endif
2950 	tip.i_devvp = freefile->fx_devvp;
2951 	tip.i_dev = freefile->fx_devvp->v_rdev;
2952 	tip.i_fs = freefile->fx_fs;
2953 	vp.v_data = &tip;
2954 	if ((error = ffs_freefile(&vp, freefile->fx_oldinum, freefile->fx_mode)) != 0)
2955 		softdep_error("handle_workitem_freefile", error);
2956 	WORKITEM_FREE(freefile, D_FREEFILE);
2957 }
2958 
2959 /*
2960  * Helper function which unlinks the marker element from the work list
2961  * and returns the next element on the list.
2962  */
2963 static __inline struct worklist *
2964 markernext(struct worklist *marker)
2965 {
2966 	struct worklist *next;
2967 
2968 	next = LIST_NEXT(marker, wk_list);
2969 	LIST_REMOVE(marker, wk_list);
2970 	return next;
2971 }
2972 
2973 /*
2974  * checkread, checkwrite
2975  *
2976  * bioops callback - hold io_token
2977  */
2978 static  int
2979 softdep_checkread(struct buf *bp)
2980 {
2981 	/* nothing to do, mp lock not needed */
2982 	return(0);
2983 }
2984 
2985 /*
2986  * bioops callback - hold io_token
2987  */
2988 static  int
2989 softdep_checkwrite(struct buf *bp)
2990 {
2991 	/* nothing to do, mp lock not needed */
2992 	return(0);
2993 }
2994 
2995 /*
2996  * Disk writes.
2997  *
2998  * The dependency structures constructed above are most actively used when file
2999  * system blocks are written to disk.  No constraints are placed on when a
3000  * block can be written, but unsatisfied update dependencies are made safe by
3001  * modifying (or replacing) the source memory for the duration of the disk
3002  * write.  When the disk write completes, the memory block is again brought
3003  * up-to-date.
3004  *
3005  * In-core inode structure reclamation.
3006  *
3007  * Because there are a finite number of "in-core" inode structures, they are
3008  * reused regularly.  By transferring all inode-related dependencies to the
3009  * in-memory inode block and indexing them separately (via "inodedep"s), we
3010  * can allow "in-core" inode structures to be reused at any time and avoid
3011  * any increase in contention.
3012  *
3013  * Called just before entering the device driver to initiate a new disk I/O.
3014  * The buffer must be locked, thus, no I/O completion operations can occur
3015  * while we are manipulating its associated dependencies.
3016  *
3017  * bioops callback - hold io_token
3018  *
3019  * Parameters:
3020  *	bp:	structure describing disk write to occur
3021  */
3022 static void
3023 softdep_disk_io_initiation(struct buf *bp)
3024 {
3025 	struct worklist *wk;
3026 	struct worklist marker;
3027 	struct indirdep *indirdep;
3028 
3029 	/*
3030 	 * We only care about write operations. There should never
3031 	 * be dependencies for reads.
3032 	 */
3033 	if (bp->b_cmd == BUF_CMD_READ)
3034 		panic("softdep_disk_io_initiation: read");
3035 
3036 	ACQUIRE_LOCK(&lk);
3037 	marker.wk_type = D_LAST + 1;	/* Not a normal workitem */
3038 
3039 	/*
3040 	 * Do any necessary pre-I/O processing.
3041 	 */
3042 	for (wk = LIST_FIRST(&bp->b_dep); wk; wk = markernext(&marker)) {
3043 		LIST_INSERT_AFTER(wk, &marker, wk_list);
3044 
3045 		switch (wk->wk_type) {
3046 		case D_PAGEDEP:
3047 			initiate_write_filepage(WK_PAGEDEP(wk), bp);
3048 			continue;
3049 
3050 		case D_INODEDEP:
3051 			initiate_write_inodeblock(WK_INODEDEP(wk), bp);
3052 			continue;
3053 
3054 		case D_INDIRDEP:
3055 			indirdep = WK_INDIRDEP(wk);
3056 			if (indirdep->ir_state & GOINGAWAY)
3057 				panic("disk_io_initiation: indirdep gone");
3058 			/*
3059 			 * If there are no remaining dependencies, this
3060 			 * will be writing the real pointers, so the
3061 			 * dependency can be freed.
3062 			 */
3063 			if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) {
3064 				indirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE;
3065 				brelse(indirdep->ir_savebp);
3066 				/* inline expand WORKLIST_REMOVE(wk); */
3067 				wk->wk_state &= ~ONWORKLIST;
3068 				LIST_REMOVE(wk, wk_list);
3069 				WORKITEM_FREE(indirdep, D_INDIRDEP);
3070 				continue;
3071 			}
3072 			/*
3073 			 * Replace up-to-date version with safe version.
3074 			 */
3075 			indirdep->ir_saveddata = kmalloc(bp->b_bcount,
3076 							 M_INDIRDEP,
3077 							 M_SOFTDEP_FLAGS);
3078 			ACQUIRE_LOCK(&lk);
3079 			indirdep->ir_state &= ~ATTACHED;
3080 			indirdep->ir_state |= UNDONE;
3081 			bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
3082 			bcopy(indirdep->ir_savebp->b_data, bp->b_data,
3083 			    bp->b_bcount);
3084 			FREE_LOCK(&lk);
3085 			continue;
3086 
3087 		case D_MKDIR:
3088 		case D_BMSAFEMAP:
3089 		case D_ALLOCDIRECT:
3090 		case D_ALLOCINDIR:
3091 			continue;
3092 
3093 		default:
3094 			panic("handle_disk_io_initiation: Unexpected type %s",
3095 			    TYPENAME(wk->wk_type));
3096 			/* NOTREACHED */
3097 		}
3098 	}
3099 	FREE_LOCK(&lk);
3100 }
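
/*
 * Illustrative sketch added to this listing (not part of the original
 * source): the D_INDIRDEP case above writes the safe copy in place of the
 * up-to-date data.  When the write completes, the I/O completion path
 * (not shown in this portion of the listing) copies the saved data back,
 * roughly as condensed below; the function name is illustrative only.
 */
#if 0	/* example only -- never compiled */
static void
example_indirdep_write_complete(struct indirdep *indirdep, struct buf *bp)
{
	if ((indirdep->ir_state & UNDONE) == 0)
		return;
	indirdep->ir_state &= ~UNDONE;
	bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
	kfree(indirdep->ir_saveddata, M_INDIRDEP);
	indirdep->ir_saveddata = NULL;
}
#endif
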
3101 
3102 /*
3103  * Called from within the procedure above to deal with unsatisfied
3104  * allocation dependencies in a directory. The buffer must be locked,
3105  * thus, no I/O completion operations can occur while we are
3106  * manipulating its associated dependencies.
3107  */
3108 static void
3109 initiate_write_filepage(struct pagedep *pagedep, struct buf *bp)
3110 {
3111 	struct diradd *dap;
3112 	struct direct *ep;
3113 	int i;
3114 
3115 	if (pagedep->pd_state & IOSTARTED) {
3116 		/*
3117 		 * This can only happen if there is a driver that does not
3118 		 * understand chaining. Here biodone will reissue the call
3119 		 * to strategy for the incomplete buffers.
3120 		 */
3121 		kprintf("initiate_write_filepage: already started\n");
3122 		return;
3123 	}
3124 	pagedep->pd_state |= IOSTARTED;
3125 	ACQUIRE_LOCK(&lk);
3126 	for (i = 0; i < DAHASHSZ; i++) {
3127 		LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
3128 			ep = (struct direct *)
3129 			    ((char *)bp->b_data + dap->da_offset);
3130 			if (ep->d_ino != dap->da_newinum) {
3131 				panic("%s: dir inum %d != new %"PRId64,
3132 				    "initiate_write_filepage",
3133 				    ep->d_ino, dap->da_newinum);
3134 			}
3135 			if (dap->da_state & DIRCHG)
3136 				ep->d_ino = dap->da_previous->dm_oldinum;
3137 			else
3138 				ep->d_ino = 0;
3139 			dap->da_state &= ~ATTACHED;
3140 			dap->da_state |= UNDONE;
3141 		}
3142 	}
3143 	FREE_LOCK(&lk);
3144 }
3145 
3146 /*
3147  * Called from within the procedure above to deal with unsatisfied
3148  * allocation dependencies in an inodeblock. The buffer must be
3149  * locked, thus, no I/O completion operations can occur while we
3150  * are manipulating its associated dependencies.
3151  *
3152  * Parameters:
3153  *	bp:	The inode block
3154  */
3155 static void
3156 initiate_write_inodeblock(struct inodedep *inodedep, struct buf *bp)
3157 {
3158 	struct allocdirect *adp, *lastadp;
3159 	struct ufs1_dinode *dp;
3160 	struct ufs1_dinode *sip;
3161 	struct fs *fs;
3162 	ufs_lbn_t prevlbn = 0;
3163 	int i, deplist;
3164 
3165 	if (inodedep->id_state & IOSTARTED)
3166 		panic("initiate_write_inodeblock: already started");
3167 	inodedep->id_state |= IOSTARTED;
3168 	fs = inodedep->id_fs;
3169 	dp = (struct ufs1_dinode *)bp->b_data +
3170 	    ino_to_fsbo(fs, inodedep->id_ino);
3171 	/*
3172 	 * If the bitmap is not yet written, then the allocated
3173 	 * inode cannot be written to disk.
3174 	 */
3175 	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
3176 		if (inodedep->id_savedino != NULL)
3177 			panic("initiate_write_inodeblock: already doing I/O");
3178 		sip = kmalloc(sizeof(struct ufs1_dinode), M_INODEDEP,
3179 			      M_SOFTDEP_FLAGS);
3180 		inodedep->id_savedino = sip;
3181 		*inodedep->id_savedino = *dp;
3182 		bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
3183 		dp->di_gen = inodedep->id_savedino->di_gen;
3184 		return;
3185 	}
3186 	/*
3187 	 * If no dependencies, then there is nothing to roll back.
3188 	 */
3189 	inodedep->id_savedsize = dp->di_size;
3190 	if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
3191 		return;
3192 	/*
3193 	 * Set the dependencies to busy.
3194 	 */
3195 	ACQUIRE_LOCK(&lk);
3196 	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3197 	     adp = TAILQ_NEXT(adp, ad_next)) {
3198 #ifdef DIAGNOSTIC
3199 		if (deplist != 0 && prevlbn >= adp->ad_lbn) {
3200 			panic("softdep_write_inodeblock: lbn order");
3201 		}
3202 		prevlbn = adp->ad_lbn;
3203 		if (adp->ad_lbn < UFS_NDADDR &&
3204 		    dp->di_db[adp->ad_lbn] != adp->ad_newblkno) {
3205 			panic("%s: direct pointer #%ld mismatch %d != %d",
3206 			    "softdep_write_inodeblock", adp->ad_lbn,
3207 			    dp->di_db[adp->ad_lbn], adp->ad_newblkno);
3208 		}
3209 		if (adp->ad_lbn >= UFS_NDADDR &&
3210 		    dp->di_ib[adp->ad_lbn - UFS_NDADDR] != adp->ad_newblkno) {
3211 			panic("%s: indirect pointer #%ld mismatch %d != %d",
3212 			    "softdep_write_inodeblock",
3213 			    adp->ad_lbn - UFS_NDADDR,
3214 			    dp->di_ib[adp->ad_lbn - UFS_NDADDR],
3215 			    adp->ad_newblkno);
3216 		}
3217 		deplist |= 1 << adp->ad_lbn;
3218 		if ((adp->ad_state & ATTACHED) == 0) {
3219 			panic("softdep_write_inodeblock: Unknown state 0x%x",
3220 			    adp->ad_state);
3221 		}
3222 #endif /* DIAGNOSTIC */
3223 		adp->ad_state &= ~ATTACHED;
3224 		adp->ad_state |= UNDONE;
3225 	}
3226 	/*
3227 	 * The on-disk inode cannot claim to be any larger than the last
3228 	 * fragment that has been written. Otherwise, the on-disk inode
3229 	 * might have fragments that were not the last block in the file
3230 	 * which would corrupt the filesystem.
3231 	 */
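	/*
	 * Illustrative example (hypothetical numbers, not from the original
	 * sources): assuming fs_bsize is 8192 and the allocdirect for lbn 3
	 * rolls back to an old 2048-byte fragment, the loop below clamps the
	 * on-disk size to 3 * 8192 + 2048 = 26624 and zeroes the remaining
	 * direct and all indirect block pointers.
	 */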
3232 	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3233 	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
3234 		if (adp->ad_lbn >= UFS_NDADDR)
3235 			break;
3236 		dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
3237 		/* keep going until hitting a rollback to a frag */
3238 		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
3239 			continue;
3240 		dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
3241 		for (i = adp->ad_lbn + 1; i < UFS_NDADDR; i++) {
3242 #ifdef DIAGNOSTIC
3243 			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) {
3244 				panic("softdep_write_inodeblock: lost dep1");
3245 			}
3246 #endif /* DIAGNOSTIC */
3247 			dp->di_db[i] = 0;
3248 		}
3249 		for (i = 0; i < UFS_NIADDR; i++) {
3250 #ifdef DIAGNOSTIC
3251 			if (dp->di_ib[i] != 0 &&
3252 			    (deplist & ((1 << UFS_NDADDR) << i)) == 0) {
3253 				panic("softdep_write_inodeblock: lost dep2");
3254 			}
3255 #endif /* DIAGNOSTIC */
3256 			dp->di_ib[i] = 0;
3257 		}
3258 		FREE_LOCK(&lk);
3259 		return;
3260 	}
3261 	/*
3262 	 * If we have zero'ed out the last allocated block of the file,
3263 	 * roll back the size to the last currently allocated block.
3264 	 * We know that this last allocated block is full-sized as
3265 	 * we already checked for fragments in the loop above.
3266 	 */
3267 	if (lastadp != NULL &&
3268 	    dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
3269 		for (i = lastadp->ad_lbn; i >= 0; i--)
3270 			if (dp->di_db[i] != 0)
3271 				break;
3272 		dp->di_size = (i + 1) * fs->fs_bsize;
3273 	}
3274 	/*
3275 	 * The only dependencies are for indirect blocks.
3276 	 *
3277 	 * The file size for indirect block additions is not guaranteed.
3278 	 * Such a guarantee would be non-trivial to achieve. The conventional
3279 	 * synchronous write implementation also does not make this guarantee.
3280 	 * Fsck should catch and fix discrepancies. Arguably, the file size
3281 	 * can be over-estimated without destroying integrity when the file
3282 	 * moves into the indirect blocks (i.e., is large). If we want to
3283 	 * postpone fsck, we are stuck with this argument.
3284 	 */
3285 	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
3286 		dp->di_ib[adp->ad_lbn - UFS_NDADDR] = 0;
3287 	FREE_LOCK(&lk);
3288 }
3289 
3290 /*
3291  * This routine is called during the completion interrupt
3292  * service routine for a disk write (from the procedure called
3293  * by the device driver to inform the filesystem caches of
3294  * a request completion).  It should be called early in this
3295  * procedure, before the block is made available to other
3296  * processes or other routines are called.
3297  *
3298  * bioops callback - hold io_token
3299  *
3300  * Parameters:
3301  *	bp:	describes the completed disk write
3302  */
3303 static void
3304 softdep_disk_write_complete(struct buf *bp)
3305 {
3306 	struct worklist *wk;
3307 	struct workhead reattach;
3308 	struct newblk *newblk;
3309 	struct allocindir *aip;
3310 	struct allocdirect *adp;
3311 	struct indirdep *indirdep;
3312 	struct inodedep *inodedep;
3313 	struct bmsafemap *bmsafemap;
3314 
3315 	ACQUIRE_LOCK(&lk);
3316 
3317 	LIST_INIT(&reattach);
3318 	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
3319 		WORKLIST_REMOVE(wk);
3320 		switch (wk->wk_type) {
3321 
3322 		case D_PAGEDEP:
3323 			if (handle_written_filepage(WK_PAGEDEP(wk), bp))
3324 				WORKLIST_INSERT(&reattach, wk);
3325 			continue;
3326 
3327 		case D_INODEDEP:
3328 			if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
3329 				WORKLIST_INSERT(&reattach, wk);
3330 			continue;
3331 
3332 		case D_BMSAFEMAP:
3333 			bmsafemap = WK_BMSAFEMAP(wk);
3334 			while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) {
3335 				newblk->nb_state |= DEPCOMPLETE;
3336 				newblk->nb_bmsafemap = NULL;
3337 				LIST_REMOVE(newblk, nb_deps);
3338 			}
3339 			while ((adp =
3340 			   LIST_FIRST(&bmsafemap->sm_allocdirecthd))) {
3341 				adp->ad_state |= DEPCOMPLETE;
3342 				adp->ad_buf = NULL;
3343 				LIST_REMOVE(adp, ad_deps);
3344 				handle_allocdirect_partdone(adp);
3345 			}
3346 			while ((aip =
3347 			    LIST_FIRST(&bmsafemap->sm_allocindirhd))) {
3348 				aip->ai_state |= DEPCOMPLETE;
3349 				aip->ai_buf = NULL;
3350 				LIST_REMOVE(aip, ai_deps);
3351 				handle_allocindir_partdone(aip);
3352 			}
3353 			while ((inodedep =
3354 			     LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
3355 				inodedep->id_state |= DEPCOMPLETE;
3356 				LIST_REMOVE(inodedep, id_deps);
3357 				inodedep->id_buf = NULL;
3358 			}
3359 			WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
3360 			continue;
3361 
3362 		case D_MKDIR:
3363 			handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
3364 			continue;
3365 
3366 		case D_ALLOCDIRECT:
3367 			adp = WK_ALLOCDIRECT(wk);
3368 			adp->ad_state |= COMPLETE;
3369 			handle_allocdirect_partdone(adp);
3370 			continue;
3371 
3372 		case D_ALLOCINDIR:
3373 			aip = WK_ALLOCINDIR(wk);
3374 			aip->ai_state |= COMPLETE;
3375 			handle_allocindir_partdone(aip);
3376 			continue;
3377 
3378 		case D_INDIRDEP:
3379 			indirdep = WK_INDIRDEP(wk);
3380 			if (indirdep->ir_state & GOINGAWAY) {
3381 				panic("disk_write_complete: indirdep gone");
3382 			}
3383 			bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
3384 			kfree(indirdep->ir_saveddata, M_INDIRDEP);
3385 			indirdep->ir_saveddata = NULL;
3386 			indirdep->ir_state &= ~UNDONE;
3387 			indirdep->ir_state |= ATTACHED;
3388 			while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != NULL) {
3389 				handle_allocindir_partdone(aip);
3390 				if (aip == LIST_FIRST(&indirdep->ir_donehd)) {
3391 					panic("disk_write_complete: not gone");
3392 				}
3393 			}
3394 			WORKLIST_INSERT(&reattach, wk);
3395 			if ((bp->b_flags & B_DELWRI) == 0)
3396 				stat_indir_blk_ptrs++;
3397 			bdirty(bp);
3398 			continue;
3399 
3400 		default:
3401 			panic("handle_disk_write_complete: Unknown type %s",
3402 			    TYPENAME(wk->wk_type));
3403 			/* NOTREACHED */
3404 		}
3405 	}
3406 	/*
3407 	 * Reattach any requests that must be redone.
3408 	 */
3409 	while ((wk = LIST_FIRST(&reattach)) != NULL) {
3410 		WORKLIST_REMOVE(wk);
3411 		WORKLIST_INSERT_BP(bp, wk);
3412 	}
3413 
3414 	FREE_LOCK(&lk);
3415 }
3416 
3417 /*
3418  * Called from within softdep_disk_write_complete above. Note that
3419  * this routine is always called from interrupt level with further
3420  * splbio interrupts blocked.
3421  *
3422  * Parameters:
3423  *	adp:	the completed allocdirect
3424  */
3425 static void
3426 handle_allocdirect_partdone(struct allocdirect *adp)
3427 {
3428 	struct allocdirect *listadp;
3429 	struct inodedep *inodedep;
3430 	long bsize;
3431 
3432 	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
3433 		return;
3434 	if (adp->ad_buf != NULL)
3435 		panic("handle_allocdirect_partdone: dangling dep");
3436 
3437 	/*
3438 	 * The on-disk inode cannot claim to be any larger than the last
3439 	 * fragment that has been written. Otherwise, the on-disk inode
3440 	 * might have fragments that were not the last block in the file
3441 	 * which would corrupt the filesystem. Thus, we cannot free any
3442 	 * allocdirects after one whose ad_oldblkno claims a fragment as
3443 	 * these blocks must be rolled back to zero before writing the inode.
3444 	 * We check the currently active set of allocdirects in id_inoupdt.
3445 	 */
3446 	inodedep = adp->ad_inodedep;
3447 	bsize = inodedep->id_fs->fs_bsize;
3448 	TAILQ_FOREACH(listadp, &inodedep->id_inoupdt, ad_next) {
3449 		/* found our block */
3450 		if (listadp == adp)
3451 			break;
3452 		/* continue if the old block (ad_oldsize) is not a fragment */
3453 		if (listadp->ad_oldsize == 0 ||
3454 		    listadp->ad_oldsize == bsize)
3455 			continue;
3456 		/* hit a fragment */
3457 		return;
3458 	}
3459 	/*
3460 	 * If we have reached the end of the current list without
3461 	 * finding the just finished dependency, then it must be
3462 	 * on the future dependency list. Future dependencies cannot
3463 	 * be freed until they are moved to the current list.
3464 	 */
3465 	if (listadp == NULL) {
3466 #ifdef DEBUG
3467 		TAILQ_FOREACH(listadp, &inodedep->id_newinoupdt, ad_next)
3468 			/* found our block */
3469 			if (listadp == adp)
3470 				break;
3471 		if (listadp == NULL)
3472 			panic("handle_allocdirect_partdone: lost dep");
3473 #endif /* DEBUG */
3474 		return;
3475 	}
3476 	/*
3477 	 * If we have found the just finished dependency, then free
3478 	 * it along with anything that follows it that is complete.
3479 	 */
3480 	for (; adp; adp = listadp) {
3481 		listadp = TAILQ_NEXT(adp, ad_next);
3482 		if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
3483 			return;
3484 		free_allocdirect(&inodedep->id_inoupdt, adp, 1);
3485 	}
3486 }
3487 
3488 /*
3489  * Called from within softdep_disk_write_complete above. Note that
3490  * this routine is always called from interrupt level with further
3491  * splbio interrupts blocked.
3492  *
3493  * Parameters:
3494  *	aip:	the completed allocindir
3495  */
3496 static void
3497 handle_allocindir_partdone(struct allocindir *aip)
3498 {
3499 	struct indirdep *indirdep;
3500 
3501 	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
3502 		return;
3503 	if (aip->ai_buf != NULL)
3504 		panic("handle_allocindir_partdone: dangling dependency");
3505 
3506 	indirdep = aip->ai_indirdep;
3507 	if (indirdep->ir_state & UNDONE) {
3508 		LIST_REMOVE(aip, ai_next);
3509 		LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
3510 		return;
3511 	}
3512 	((ufs_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
3513 	    aip->ai_newblkno;
3514 	LIST_REMOVE(aip, ai_next);
3515 	if (aip->ai_freefrag != NULL)
3516 		add_to_worklist(&aip->ai_freefrag->ff_list);
3517 	WORKITEM_FREE(aip, D_ALLOCINDIR);
3518 }
3519 
3520 /*
3521  * Called from within softdep_disk_write_complete above to restore
3522  * in-memory inode block contents to their most up-to-date state. Note
3523  * that this routine is always called from interrupt level with further
3524  * splbio interrupts blocked.
3525  *
3526  * Parameters:
3527  *	bp:	buffer containing the inode block
3528  */
3529 static int
3530 handle_written_inodeblock(struct inodedep *inodedep, struct buf *bp)
3531 {
3532 	struct worklist *wk, *filefree;
3533 	struct allocdirect *adp, *nextadp;
3534 	struct ufs1_dinode *dp;
3535 	int hadchanges;
3536 
3537 	if ((inodedep->id_state & IOSTARTED) == 0)
3538 		panic("handle_written_inodeblock: not started");
3539 
3540 	inodedep->id_state &= ~IOSTARTED;
3541 	dp = (struct ufs1_dinode *)bp->b_data +
3542 	    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
3543 	/*
3544 	 * If we had to roll back the inode allocation because of
3545 	 * bitmaps being incomplete, then simply restore it.
3546 	 * Keep the block dirty so that it will not be reclaimed until
3547 	 * all associated dependencies have been cleared and the
3548 	 * corresponding updates written to disk.
3549 	 */
3550 	if (inodedep->id_savedino != NULL) {
3551 		*dp = *inodedep->id_savedino;
3552 		kfree(inodedep->id_savedino, M_INODEDEP);
3553 		inodedep->id_savedino = NULL;
3554 		if ((bp->b_flags & B_DELWRI) == 0)
3555 			stat_inode_bitmap++;
3556 		bdirty(bp);
3557 		return (1);
3558 	}
3559 	inodedep->id_state |= COMPLETE;
3560 	/*
3561 	 * Roll forward anything that had to be rolled back before
3562 	 * the inode could be updated.
3563 	 */
3564 	hadchanges = 0;
3565 	for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
3566 		nextadp = TAILQ_NEXT(adp, ad_next);
3567 		if (adp->ad_state & ATTACHED)
3568 			panic("handle_written_inodeblock: new entry");
3569 
3570 		if (adp->ad_lbn < UFS_NDADDR) {
3571 			if (dp->di_db[adp->ad_lbn] != adp->ad_oldblkno) {
3572 				panic("%s: %s #%ld mismatch %d != %d",
3573 				    "handle_written_inodeblock",
3574 				    "direct pointer", adp->ad_lbn,
3575 				    dp->di_db[adp->ad_lbn], adp->ad_oldblkno);
3576 			}
3577 			dp->di_db[adp->ad_lbn] = adp->ad_newblkno;
3578 		} else {
3579 			if (dp->di_ib[adp->ad_lbn - UFS_NDADDR] != 0) {
3580 				panic("%s: %s #%ld allocated as %d",
3581 				    "handle_written_inodeblock",
3582 				    "indirect pointer",
3583 				    adp->ad_lbn - UFS_NDADDR,
3584 				    dp->di_ib[adp->ad_lbn - UFS_NDADDR]);
3585 			}
3586 			dp->di_ib[adp->ad_lbn - UFS_NDADDR] = adp->ad_newblkno;
3587 		}
3588 		adp->ad_state &= ~UNDONE;
3589 		adp->ad_state |= ATTACHED;
3590 		hadchanges = 1;
3591 	}
3592 	if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
3593 		stat_direct_blk_ptrs++;
3594 	/*
3595 	 * Reset the file size to its most up-to-date value.
3596 	 */
3597 	if (inodedep->id_savedsize == -1) {
3598 		panic("handle_written_inodeblock: bad size");
3599 	}
3600 	if (dp->di_size != inodedep->id_savedsize) {
3601 		dp->di_size = inodedep->id_savedsize;
3602 		hadchanges = 1;
3603 	}
3604 	inodedep->id_savedsize = -1;
3605 	/*
3606 	 * If there were any rollbacks in the inode block, then it must be
3607 	 * marked dirty so that it will eventually get written back in
3608 	 * its correct form.
3609 	 */
3610 	if (hadchanges)
3611 		bdirty(bp);
3612 	/*
3613 	 * Process any allocdirects that completed during the update.
3614 	 */
3615 	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
3616 		handle_allocdirect_partdone(adp);
3617 	/*
3618 	 * Process deallocations that were held pending until the
3619 	 * inode had been written to disk. Freeing of the inode
3620 	 * is delayed until after all blocks have been freed to
3621 	 * avoid creation of new <vfsid, inum, lbn> triples
3622 	 * before the old ones have been deleted.
3623 	 */
3624 	filefree = NULL;
3625 	while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
3626 		WORKLIST_REMOVE(wk);
3627 		switch (wk->wk_type) {
3628 
3629 		case D_FREEFILE:
3630 			/*
3631 			 * We defer adding filefree to the worklist until
3632 			 * all other additions have been made to ensure
3633 			 * that it will be done after all the old blocks
3634 			 * have been freed.
3635 			 */
3636 			if (filefree != NULL) {
3637 				panic("handle_written_inodeblock: filefree");
3638 			}
3639 			filefree = wk;
3640 			continue;
3641 
3642 		case D_MKDIR:
3643 			handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
3644 			continue;
3645 
3646 		case D_DIRADD:
3647 			diradd_inode_written(WK_DIRADD(wk), inodedep);
3648 			continue;
3649 
3650 		case D_FREEBLKS:
3651 			wk->wk_state |= COMPLETE;
3652 			if ((wk->wk_state  & ALLCOMPLETE) != ALLCOMPLETE)
3653 				continue;
3654 			/* -- fall through -- */
3655 		case D_FREEFRAG:
3656 		case D_DIRREM:
3657 			add_to_worklist(wk);
3658 			continue;
3659 
3660 		default:
3661 			panic("handle_written_inodeblock: Unknown type %s",
3662 			    TYPENAME(wk->wk_type));
3663 			/* NOTREACHED */
3664 		}
3665 	}
3666 	if (filefree != NULL) {
3667 		if (free_inodedep(inodedep) == 0) {
3668 			panic("handle_written_inodeblock: live inodedep");
3669 		}
3670 		add_to_worklist(filefree);
3671 		return (0);
3672 	}
3673 
3674 	/*
3675 	 * If no outstanding dependencies, free it.
3676 	 */
3677 	if (free_inodedep(inodedep) || TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
3678 		return (0);
3679 	return (hadchanges);
3680 }
3681 
3682 /*
3683  * Process a diradd entry after its dependent inode has been written.
3684  * This routine must be called with splbio interrupts blocked.
3685  */
3686 static void
3687 diradd_inode_written(struct diradd *dap, struct inodedep *inodedep)
3688 {
3689 	struct pagedep *pagedep;
3690 
3691 	dap->da_state |= COMPLETE;
3692 	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3693 		if (dap->da_state & DIRCHG)
3694 			pagedep = dap->da_previous->dm_pagedep;
3695 		else
3696 			pagedep = dap->da_pagedep;
3697 		LIST_REMOVE(dap, da_pdlist);
3698 		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
3699 	}
3700 	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
3701 }
3702 
3703 /*
3704  * Handle the completion of a mkdir dependency.
3705  */
3706 static void
3707 handle_written_mkdir(struct mkdir *mkdir, int type)
3708 {
3709 	struct diradd *dap;
3710 	struct pagedep *pagedep;
3711 
3712 	if (mkdir->md_state != type) {
3713 		panic("handle_written_mkdir: bad type");
3714 	}
3715 	dap = mkdir->md_diradd;
3716 	dap->da_state &= ~type;
3717 	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
3718 		dap->da_state |= DEPCOMPLETE;
3719 	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3720 		if (dap->da_state & DIRCHG)
3721 			pagedep = dap->da_previous->dm_pagedep;
3722 		else
3723 			pagedep = dap->da_pagedep;
3724 		LIST_REMOVE(dap, da_pdlist);
3725 		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
3726 	}
3727 	LIST_REMOVE(mkdir, md_mkdirs);
3728 	WORKITEM_FREE(mkdir, D_MKDIR);
3729 }
3730 
3731 /*
3732  * Called from within softdep_disk_write_complete above.
3733  * A write operation was just completed. Removed inodes can
3734  * now be freed and associated block pointers may be committed.
3735  * Note that this routine is always called from interrupt level
3736  * with further splbio interrupts blocked.
3737  *
3738  * Parameters:
3739  *	bp:	buffer containing the written page
3740  */
3741 static int
3742 handle_written_filepage(struct pagedep *pagedep, struct buf *bp)
3743 {
3744 	struct dirrem *dirrem;
3745 	struct diradd *dap, *nextdap;
3746 	struct direct *ep;
3747 	int i, chgs;
3748 
3749 	if ((pagedep->pd_state & IOSTARTED) == 0) {
3750 		panic("handle_written_filepage: not started");
3751 	}
3752 	pagedep->pd_state &= ~IOSTARTED;
3753 	/*
3754 	 * Process any directory removals that have been committed.
3755 	 */
3756 	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
3757 		LIST_REMOVE(dirrem, dm_next);
3758 		dirrem->dm_dirinum = pagedep->pd_ino;
3759 		add_to_worklist(&dirrem->dm_list);
3760 	}
3761 	/*
3762 	 * Free any directory additions that have been committed.
3763 	 */
3764 	while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
3765 		free_diradd(dap);
3766 	/*
3767 	 * Uncommitted directory entries must be restored.
3768 	 */
3769 	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
3770 		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
3771 		     dap = nextdap) {
3772 			nextdap = LIST_NEXT(dap, da_pdlist);
3773 			if (dap->da_state & ATTACHED) {
3774 				panic("handle_written_filepage: attached");
3775 			}
3776 			ep = (struct direct *)
3777 			    ((char *)bp->b_data + dap->da_offset);
3778 			ep->d_ino = dap->da_newinum;
3779 			dap->da_state &= ~UNDONE;
3780 			dap->da_state |= ATTACHED;
3781 			chgs = 1;
3782 			/*
3783 			 * If the inode referenced by the directory has
3784 			 * been written out, then the dependency can be
3785 			 * moved to the pending list.
3786 			 */
3787 			if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3788 				LIST_REMOVE(dap, da_pdlist);
3789 				LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
3790 				    da_pdlist);
3791 			}
3792 		}
3793 	}
3794 	/*
3795 	 * If there were any rollbacks in the directory, then it must be
3796 	 * marked dirty so that it will eventually get written back in
3797 	 * its correct form.
3798 	 */
3799 	if (chgs) {
3800 		if ((bp->b_flags & B_DELWRI) == 0)
3801 			stat_dir_entry++;
3802 		bdirty(bp);
3803 	}
3804 	/*
3805 	 * If no dependencies remain, the pagedep will be freed.
3806 	 * Otherwise it will remain to update the page before it
3807 	 * is written back to disk.
3808 	 */
3809 	if (LIST_FIRST(&pagedep->pd_pendinghd) == NULL) {
3810 		for (i = 0; i < DAHASHSZ; i++)
3811 			if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL)
3812 				break;
3813 		if (i == DAHASHSZ) {
3814 			LIST_REMOVE(pagedep, pd_hash);
3815 			WORKITEM_FREE(pagedep, D_PAGEDEP);
3816 			return (0);
3817 		}
3818 	}
3819 	return (1);
3820 }
3821 
3822 /*
3823  * Writing back in-core inode structures.
3824  *
3825  * The filesystem only accesses an inode's contents when it occupies an
3826  * "in-core" inode structure.  These "in-core" structures are separate from
3827  * the page frames used to cache inode blocks.  Only the latter are
3828  * transferred to/from the disk.  So, when the updated contents of the
3829  * "in-core" inode structure are copied to the corresponding in-memory inode
3830  * block, the dependencies are also transferred.  The following procedure is
3831  * called when copying a dirty "in-core" inode to a cached inode block.
3832  */
3833 
3834 /*
3835  * Called when an inode is loaded from disk. If the effective link count
3836  * differed from the actual link count when it was last flushed, then we
3837  * need to ensure that the correct effective link count is put back.
3838  *
3839  * Parameters:
3840  *	ip:	the "in_core" copy of the inode
3841  */
3842 void
3843 softdep_load_inodeblock(struct inode *ip)
3844 {
3845 	struct inodedep *inodedep;
3846 
3847 	/*
3848 	 * Check for alternate nlink count.
3849 	 */
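	/*
	 * Illustrative example (hypothetical values): if the inode was last
	 * flushed with i_nlink == 2 while one directory removal was still
	 * pending (id_nlinkdelta == 1), the code below restores the
	 * effective link count to i_effnlink = 2 - 1 = 1.
	 */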
3850 	ip->i_effnlink = ip->i_nlink;
3851 	ACQUIRE_LOCK(&lk);
3852 	if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
3853 		FREE_LOCK(&lk);
3854 		return;
3855 	}
3856 	ip->i_effnlink -= inodedep->id_nlinkdelta;
3857 	FREE_LOCK(&lk);
3858 }
3859 
3860 /*
3861  * This routine is called just before the "in-core" inode
3862  * information is to be copied to the in-memory inode block.
3863  * Recall that an inode block contains several inodes. If
3864  * the force flag (waitfor) is set, then the dependencies will be
3865  * cleared so that the update can always be made. Note that
3866  * the buffer is locked when this routine is called, so we
3867  * will never be in the middle of writing the inode block
3868  * to disk.
3869  *
3870  * Parameters:
3871  *	ip:		the "in_core" copy of the inode
3872  *	bp:		the buffer containing the inode block
3873  *	waitfor:	nonzero => update must be allowed
3874  */
3875 void
3876 softdep_update_inodeblock(struct inode *ip, struct buf *bp,
3877 			  int waitfor)
3878 {
3879 	struct inodedep *inodedep;
3880 	struct worklist *wk;
3881 	struct buf *ibp;
3882 	int error, gotit;
3883 
3884 	/*
3885 	 * If the effective link count is not equal to the actual link
3886 	 * count, then we must track the difference in an inodedep while
3887 	 * the inode is (potentially) tossed out of the cache. Otherwise,
3888 	 * if there is no existing inodedep, then there are no dependencies
3889 	 * to track.
3890 	 */
3891 	ACQUIRE_LOCK(&lk);
3892 	if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
3893 		FREE_LOCK(&lk);
3894 		if (ip->i_effnlink != ip->i_nlink)
3895 			panic("softdep_update_inodeblock: bad link count");
3896 		return;
3897 	}
3898 	if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink) {
3899 		panic("softdep_update_inodeblock: bad delta");
3900 	}
3901 	/*
3902 	 * Changes have been initiated. Anything depending on these
3903 	 * changes cannot occur until this inode has been written.
3904 	 */
3905 	inodedep->id_state &= ~COMPLETE;
3906 	if ((inodedep->id_state & ONWORKLIST) == 0)
3907 		WORKLIST_INSERT_BP(bp, &inodedep->id_list);
3908 	/*
3909 	 * Any new dependencies associated with the incore inode must
3910 	 * now be moved to the list associated with the buffer holding
3911 	 * the in-memory copy of the inode. Once merged process any
3912 	 * allocdirects that are completed by the merger.
3913 	 */
3914 	merge_inode_lists(inodedep);
3915 	if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL)
3916 		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
3917 	/*
3918 	 * Now that the inode has been pushed into the buffer, the
3919 	 * operations dependent on the inode being written to disk
3920 	 * can be moved to the id_bufwait so that they will be
3921 	 * processed when the buffer I/O completes.
3922 	 */
3923 	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
3924 		WORKLIST_REMOVE(wk);
3925 		WORKLIST_INSERT(&inodedep->id_bufwait, wk);
3926 	}
3927 	/*
3928 	 * Newly allocated inodes cannot be written until the bitmap
3929 	 * that allocates them has been written (indicated by
3930 	 * DEPCOMPLETE being set in id_state). If we are doing a
3931 	 * forced sync (e.g., an fsync on a file), we force the bitmap
3932 	 * to be written so that the update can be done.
3933 	 */
3934 	if (waitfor == 0) {
3935 		FREE_LOCK(&lk);
3936 		return;
3937 	}
3938 retry:
3939 	if ((inodedep->id_state & DEPCOMPLETE) != 0) {
3940 		FREE_LOCK(&lk);
3941 		return;
3942 	}
3943 	gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
3944 	if (gotit == 0) {
3945 		if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) != 0)
3946 			goto retry;
3947 		FREE_LOCK(&lk);
3948 		return;
3949 	}
3950 	ibp = inodedep->id_buf;
3951 	FREE_LOCK(&lk);
3952 	if ((error = bwrite(ibp)) != 0)
3953 		softdep_error("softdep_update_inodeblock: bwrite", error);
3954 }
3955 
3956 /*
3957  * Merge the new inode dependency list (id_newinoupdt) into the old
3958  * inode dependency list (id_inoupdt). This routine must be called
3959  * with splbio interrupts blocked.
3960  */
3961 static void
3962 merge_inode_lists(struct inodedep *inodedep)
3963 {
3964 	struct allocdirect *listadp, *newadp;
3965 
3966 	newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
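	/*
	 * Illustrative sketch (hypothetical lbn values): both lists are kept
	 * sorted by ad_lbn, so this is an ordinary sorted-list merge.  With
	 * id_inoupdt covering lbns {1, 4, 7} and id_newinoupdt covering
	 * {2, 4}, the entry for lbn 2 is inserted before 4, the duplicate
	 * entries for lbn 4 are combined via allocdirect_merge(), and any
	 * remaining new entries are appended at the tail.
	 */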
3967 	for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp && newadp;) {
3968 		if (listadp->ad_lbn < newadp->ad_lbn) {
3969 			listadp = TAILQ_NEXT(listadp, ad_next);
3970 			continue;
3971 		}
3972 		TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
3973 		TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
3974 		if (listadp->ad_lbn == newadp->ad_lbn) {
3975 			allocdirect_merge(&inodedep->id_inoupdt, newadp,
3976 			    listadp);
3977 			listadp = newadp;
3978 		}
3979 		newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
3980 	}
3981 	while ((newadp = TAILQ_FIRST(&inodedep->id_newinoupdt)) != NULL) {
3982 		TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
3983 		TAILQ_INSERT_TAIL(&inodedep->id_inoupdt, newadp, ad_next);
3984 	}
3985 }
3986 
3987 /*
3988  * If we are doing an fsync, then we must ensure that any directory
3989  * entries for the inode have been written after the inode gets to disk.
3990  *
3991  * bioops callback - hold io_token
3992  *
3993  * Parameters:
3994  *	vp:	the vnode of the file being fsync'ed
3995  */
3996 static int
3997 softdep_fsync(struct vnode *vp)
3998 {
3999 	struct inodedep *inodedep;
4000 	struct pagedep *pagedep;
4001 	struct worklist *wk;
4002 	struct diradd *dap;
4003 	struct mount *mnt;
4004 	struct vnode *pvp;
4005 	struct inode *ip;
4006 	struct buf *bp;
4007 	struct fs *fs;
4008 	int error, flushparent;
4009 	ino_t parentino;
4010 	ufs_lbn_t lbn;
4011 
4012 	/*
4013 	 * This check was moved from the original kernel code; it is
4014 	 * possibly no longer needed with the per-mount bioops.
4015 	 */
4016 	if ((vp->v_mount->mnt_flag & MNT_SOFTDEP) == 0)
4017 		return (0);
4018 
4019 	ip = VTOI(vp);
4020 	fs = ip->i_fs;
4021 	ACQUIRE_LOCK(&lk);
4022 	if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0) {
4023 		FREE_LOCK(&lk);
4024 		return (0);
4025 	}
4026 	if (LIST_FIRST(&inodedep->id_inowait) != NULL ||
4027 	    LIST_FIRST(&inodedep->id_bufwait) != NULL ||
4028 	    TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
4029 	    TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL) {
4030 		panic("softdep_fsync: pending ops");
4031 	}
4032 	for (error = 0, flushparent = 0; ; ) {
4033 		if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
4034 			break;
4035 		if (wk->wk_type != D_DIRADD) {
4036 			panic("softdep_fsync: Unexpected type %s",
4037 			    TYPENAME(wk->wk_type));
4038 		}
4039 		dap = WK_DIRADD(wk);
4040 		/*
4041 		 * Flush our parent if this directory entry
4042 		 * has a MKDIR_PARENT dependency.
4043 		 */
4044 		if (dap->da_state & DIRCHG)
4045 			pagedep = dap->da_previous->dm_pagedep;
4046 		else
4047 			pagedep = dap->da_pagedep;
4048 		mnt = pagedep->pd_mnt;
4049 		parentino = pagedep->pd_ino;
4050 		lbn = pagedep->pd_lbn;
4051 		if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE) {
4052 			panic("softdep_fsync: dirty");
4053 		}
4054 		flushparent = dap->da_state & MKDIR_PARENT;
4055 		/*
4056 		 * If we are being fsync'ed as part of vgone'ing this vnode,
4057 		 * then we will not be able to release and recover the
4058 		 * vnode below, so we just have to give up on writing its
4059 		 * directory entry out. It will eventually be written, just
4060 		 * not now, but then the user was not asking to have it
4061 		 * written, so we are not breaking any promises.
4062 		 */
4063 		if (vp->v_flag & VRECLAIMED)
4064 			break;
4065 		/*
4066 		 * We prevent deadlock by always fetching inodes from the
4067 		 * root, moving down the directory tree. Thus, when fetching
4068 		 * our parent directory, we must unlock ourselves before
4069 		 * requesting the lock on our parent. See the comment in
4070 		 * ufs_lookup for details on possible races.
4071 		 */
4072 		FREE_LOCK(&lk);
4073 		vn_unlock(vp);
4074 		error = VFS_VGET(mnt, NULL, parentino, &pvp);
4075 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4076 		if (error != 0) {
4077 			return (error);
4078 		}
4079 		if (flushparent) {
4080 			if ((error = ffs_update(pvp, 1)) != 0) {
4081 				vput(pvp);
4082 				return (error);
4083 			}
4084 		}
4085 		/*
4086 		 * Flush directory page containing the inode's name.
4087 		 */
4088 		error = bread(pvp, lblktodoff(fs, lbn), blksize(fs, VTOI(pvp), lbn), &bp);
4089 		if (error == 0)
4090 			error = bwrite(bp);
4091 		vput(pvp);
4092 		if (error != 0) {
4093 			return (error);
4094 		}
4095 		ACQUIRE_LOCK(&lk);
4096 		if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0)
4097 			break;
4098 	}
4099 	FREE_LOCK(&lk);
4100 	return (0);
4101 }
4102 
4103 /*
4104  * Flush all the dirty bitmaps associated with the block device
4105  * before flushing the rest of the dirty blocks so as to reduce
4106  * the number of dependencies that will have to be rolled back.
4107  */
4108 static int softdep_fsync_mountdev_bp(struct buf *bp, void *data);
4109 
4110 void
4111 softdep_fsync_mountdev(struct vnode *vp)
4112 {
4113 	if (!vn_isdisk(vp, NULL))
4114 		panic("softdep_fsync_mountdev: vnode not a disk");
4115 	ACQUIRE_LOCK(&lk);
4116 	lwkt_gettoken(&vp->v_token);
4117 	RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL,
4118 		softdep_fsync_mountdev_bp, vp);
4119 	lwkt_reltoken(&vp->v_token);
4120 	drain_output(vp, 1);
4121 	FREE_LOCK(&lk);
4122 }
4123 
4124 static int
4125 softdep_fsync_mountdev_bp(struct buf *bp, void *data)
4126 {
4127 	struct worklist *wk;
4128 	struct vnode *vp = data;
4129 
4130 	/*
4131 	 * If it is already scheduled, skip to the next buffer.
4132 	 */
4133 	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT))
4134 		return(0);
4135 	if (bp->b_vp != vp || (bp->b_flags & B_DELWRI) == 0) {
4136 		BUF_UNLOCK(bp);
4137 		kprintf("softdep_fsync_mountdev_bp: warning, buffer %p ripped out from under vnode %p\n", bp, vp);
4138 		return(0);
4139 	}
4140 	/*
4141 	 * We are only interested in bitmaps with outstanding
4142 	 * dependencies.
4143 	 */
4144 	if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
4145 	    wk->wk_type != D_BMSAFEMAP) {
4146 		BUF_UNLOCK(bp);
4147 		return(0);
4148 	}
4149 	bremfree(bp);
4150 	FREE_LOCK(&lk);
4151 	(void) bawrite(bp);
4152 	ACQUIRE_LOCK(&lk);
4153 	return(0);
4154 }
4155 
4156 /*
4157  * This routine is called when we are trying to synchronously flush a
4158  * file. This routine must eliminate any filesystem metadata dependencies
4159  * so that the syncing routine can succeed by pushing the dirty blocks
4160  * associated with the file. If any I/O errors occur, they are returned.
4161  */
4162 struct softdep_sync_metadata_info {
4163 	struct vnode *vp;
4164 	int waitfor;
4165 };
4166 
4167 static int softdep_sync_metadata_bp(struct buf *bp, void *data);
4168 
4169 int
4170 softdep_sync_metadata(struct vnode *vp, struct thread *td)
4171 {
4172 	struct softdep_sync_metadata_info info;
4173 	int error, waitfor;
4174 
4175 	/*
4176 	 * Check whether this vnode is involved in a filesystem
4177 	 * that is doing soft dependency processing.
4178 	 */
4179 	if (!vn_isdisk(vp, NULL)) {
4180 		if (!DOINGSOFTDEP(vp))
4181 			return (0);
4182 	} else
4183 		if (vp->v_rdev->si_mountpoint == NULL ||
4184 		    (vp->v_rdev->si_mountpoint->mnt_flag & MNT_SOFTDEP) == 0)
4185 			return (0);
4186 	/*
4187 	 * Ensure that any direct block dependencies have been cleared.
4188 	 */
4189 	ACQUIRE_LOCK(&lk);
4190 	if ((error = flush_inodedep_deps(VTOI(vp)->i_fs, VTOI(vp)->i_number))) {
4191 		FREE_LOCK(&lk);
4192 		return (error);
4193 	}
4194 	/*
4195 	 * For most files, the only metadata dependencies are the
4196 	 * cylinder group maps that allocate their inode or blocks.
4197 	 * The block allocation dependencies can be found by traversing
4198 	 * the dependency lists for any buffers that remain on their
4199 	 * dirty buffer list. The inode allocation dependency will
4200 	 * be resolved when the inode is updated with MNT_WAIT.
4201 	 * This work is done in two passes. The first pass grabs most
4202 	 * of the buffers and begins asynchronously writing them. The
4203 	 * only way to wait for these asynchronous writes is to sleep
4204 	 * on the filesystem vnode which may stay busy for a long time
4205 	 * if the filesystem is active. So, instead, we make a second
4206 	 * pass over the dependencies blocking on each write. In the
4207 	 * usual case we will be blocking against a write that we
4208 	 * initiated, so when it is done the dependency will have been
4209 	 * resolved. Thus the second pass is expected to end quickly.
4210 	 */
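	/*
	 * Rough shape of the two passes described above (sketch only; the
	 * per-buffer work is done by softdep_sync_metadata_bp() below):
	 *
	 *	waitfor = MNT_NOWAIT;	(pass 1: start asynchronous writes)
	 *	scan v_rbdirty_tree with softdep_sync_metadata_bp;
	 *	waitfor = MNT_WAIT;	(pass 2: block on each write)
	 *	scan v_rbdirty_tree with softdep_sync_metadata_bp;
	 */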
4211 	waitfor = MNT_NOWAIT;
4212 top:
4213 	/*
4214 	 * We must wait for any I/O in progress to finish so that
4215 	 * all potential buffers on the dirty list will be visible.
4216 	 */
4217 	drain_output(vp, 1);
4218 
4219 	info.vp = vp;
4220 	info.waitfor = waitfor;
4221 	lwkt_gettoken(&vp->v_token);
4222 	error = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL,
4223 			softdep_sync_metadata_bp, &info);
4224 	lwkt_reltoken(&vp->v_token);
4225 	if (error < 0) {
4226 		FREE_LOCK(&lk);
4227 		return(-error);	/* error code */
4228 	}
4229 
4230 	/*
4231 	 * The brief unlock is to allow any pent up dependency
4232 	 * processing to be done.  Then proceed with the second pass.
4233 	 */
4234 	if (waitfor & MNT_NOWAIT) {
4235 		waitfor = MNT_WAIT;
4236 		FREE_LOCK(&lk);
4237 		ACQUIRE_LOCK(&lk);
4238 		goto top;
4239 	}
4240 
4241 	/*
4242 	 * If we have managed to get rid of all the dirty buffers,
4243 	 * then we are done. For certain directories and block
4244 	 * devices, we may need to do further work.
4245 	 *
4246 	 * We must wait for any I/O in progress to finish so that
4247 	 * all potential buffers on the dirty list will be visible.
4248 	 */
4249 	drain_output(vp, 1);
4250 	if (RB_EMPTY(&vp->v_rbdirty_tree)) {
4251 		FREE_LOCK(&lk);
4252 		return (0);
4253 	}
4254 
4255 	FREE_LOCK(&lk);
4256 	/*
4257 	 * If we are trying to sync a block device, some of its buffers may
4258 	 * contain metadata that cannot be written until the contents of some
4259 	 * partially written files have been written to disk. The only easy
4260 	 * way to accomplish this is to sync the entire filesystem (luckily
4261 	 * this happens rarely).
4262 	 */
4263 	if (vn_isdisk(vp, NULL) &&
4264 	    vp->v_rdev &&
4265 	    vp->v_rdev->si_mountpoint && !vn_islocked(vp) &&
4266 	    (error = VFS_SYNC(vp->v_rdev->si_mountpoint, MNT_WAIT)) != 0)
4267 		return (error);
4268 	return (0);
4269 }
4270 
4271 static int
4272 softdep_sync_metadata_bp(struct buf *bp, void *data)
4273 {
4274 	struct softdep_sync_metadata_info *info = data;
4275 	struct pagedep *pagedep;
4276 	struct allocdirect *adp;
4277 	struct allocindir *aip;
4278 	struct worklist *wk;
4279 	struct buf *nbp;
4280 	int error;
4281 	int i;
4282 
4283 	if (getdirtybuf(&bp, MNT_WAIT) == 0) {
4284 		kprintf("softdep_sync_metadata_bp(1): caught buf %p going away\n", bp);
4285 		return (1);
4286 	}
4287 	if (bp->b_vp != info->vp || (bp->b_flags & B_DELWRI) == 0) {
4288 		kprintf("softdep_sync_metadata_bp(2): caught buf %p going away vp %p\n", bp, info->vp);
4289 		BUF_UNLOCK(bp);
4290 		return(1);
4291 	}
4292 
4293 	/*
4294 	 * As we hold the buffer locked, none of its dependencies
4295 	 * will disappear.
4296 	 */
4297 	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
4298 		switch (wk->wk_type) {
4299 
4300 		case D_ALLOCDIRECT:
4301 			adp = WK_ALLOCDIRECT(wk);
4302 			if (adp->ad_state & DEPCOMPLETE)
4303 				break;
4304 			nbp = adp->ad_buf;
4305 			if (getdirtybuf(&nbp, info->waitfor) == 0)
4306 				break;
4307 			FREE_LOCK(&lk);
4308 			if (info->waitfor & MNT_NOWAIT) {
4309 				bawrite(nbp);
4310 			} else if ((error = bwrite(nbp)) != 0) {
4311 				bawrite(bp);
4312 				ACQUIRE_LOCK(&lk);
4313 				return (-error);
4314 			}
4315 			ACQUIRE_LOCK(&lk);
4316 			break;
4317 
4318 		case D_ALLOCINDIR:
4319 			aip = WK_ALLOCINDIR(wk);
4320 			if (aip->ai_state & DEPCOMPLETE)
4321 				break;
4322 			nbp = aip->ai_buf;
4323 			if (getdirtybuf(&nbp, info->waitfor) == 0)
4324 				break;
4325 			FREE_LOCK(&lk);
4326 			if (info->waitfor & MNT_NOWAIT) {
4327 				bawrite(nbp);
4328 			} else if ((error = bwrite(nbp)) != 0) {
4329 				bawrite(bp);
4330 				ACQUIRE_LOCK(&lk);
4331 				return (-error);
4332 			}
4333 			ACQUIRE_LOCK(&lk);
4334 			break;
4335 
4336 		case D_INDIRDEP:
4337 		restart:
4338 
4339 			LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
4340 				if (aip->ai_state & DEPCOMPLETE)
4341 					continue;
4342 				nbp = aip->ai_buf;
4343 				if (getdirtybuf(&nbp, MNT_WAIT) == 0)
4344 					goto restart;
4345 				FREE_LOCK(&lk);
4346 				if ((error = bwrite(nbp)) != 0) {
4347 					bawrite(bp);
4348 					ACQUIRE_LOCK(&lk);
4349 					return (-error);
4350 				}
4351 				ACQUIRE_LOCK(&lk);
4352 				goto restart;
4353 			}
4354 			break;
4355 
4356 		case D_INODEDEP:
4357 			if ((error = flush_inodedep_deps(WK_INODEDEP(wk)->id_fs,
4358 			    WK_INODEDEP(wk)->id_ino)) != 0) {
4359 				FREE_LOCK(&lk);
4360 				bawrite(bp);
4361 				ACQUIRE_LOCK(&lk);
4362 				return (-error);
4363 			}
4364 			break;
4365 
4366 		case D_PAGEDEP:
4367 			/*
4368 			 * We are trying to sync a directory that may
4369 			 * have dependencies on both its own metadata
4370 			 * and/or dependencies on the inodes of any
4371 			 * recently allocated files. We walk its diradd
4372 			 * lists pushing out the associated inode.
4373 			 */
4374 			pagedep = WK_PAGEDEP(wk);
4375 			for (i = 0; i < DAHASHSZ; i++) {
4376 				if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL)
4377 					continue;
4378 				if ((error =
4379 				    flush_pagedep_deps(info->vp,
4380 						pagedep->pd_mnt,
4381 						&pagedep->pd_diraddhd[i]))) {
4382 					FREE_LOCK(&lk);
4383 					bawrite(bp);
4384 					ACQUIRE_LOCK(&lk);
4385 					return (-error);
4386 				}
4387 			}
4388 			break;
4389 
4390 		case D_MKDIR:
4391 			/*
4392 			 * This case should never happen if the vnode has
4393 			 * been properly sync'ed. However, if this function
4394 			 * is used at a place where the vnode has not yet
4395 			 * been sync'ed, this dependency can show up. So,
4396 			 * rather than panic, just flush it.
4397 			 */
4398 			nbp = WK_MKDIR(wk)->md_buf;
4399 			if (getdirtybuf(&nbp, info->waitfor) == 0)
4400 				break;
4401 			FREE_LOCK(&lk);
4402 			if (info->waitfor & MNT_NOWAIT) {
4403 				bawrite(nbp);
4404 			} else if ((error = bwrite(nbp)) != 0) {
4405 				bawrite(bp);
4406 				ACQUIRE_LOCK(&lk);
4407 				return (-error);
4408 			}
4409 			ACQUIRE_LOCK(&lk);
4410 			break;
4411 
4412 		case D_BMSAFEMAP:
4413 			/*
4414 			 * This case should never happen if the vnode has
4415 			 * been properly sync'ed. However, if this function
4416 			 * is used at a place where the vnode has not yet
4417 			 * been sync'ed, this dependency can show up. So,
4418 			 * rather than panic, just flush it.
4419 			 *
4420 			 * nbp can wind up == bp if a device node for the
4421 			 * same filesystem is being fsynced at the same time,
4422 			 * leading to a panic if we don't catch the case.
4423 			 */
4424 			nbp = WK_BMSAFEMAP(wk)->sm_buf;
4425 			if (nbp == bp)
4426 				break;
4427 			if (getdirtybuf(&nbp, info->waitfor) == 0)
4428 				break;
4429 			FREE_LOCK(&lk);
4430 			if (info->waitfor & MNT_NOWAIT) {
4431 				bawrite(nbp);
4432 			} else if ((error = bwrite(nbp)) != 0) {
4433 				bawrite(bp);
4434 				ACQUIRE_LOCK(&lk);
4435 				return (-error);
4436 			}
4437 			ACQUIRE_LOCK(&lk);
4438 			break;
4439 
4440 		default:
4441 			panic("softdep_sync_metadata: Unknown type %s",
4442 			    TYPENAME(wk->wk_type));
4443 			/* NOTREACHED */
4444 		}
4445 	}
4446 	FREE_LOCK(&lk);
4447 	bawrite(bp);
4448 	ACQUIRE_LOCK(&lk);
4449 	return(0);
4450 }
4451 
4452 /*
4453  * Flush the dependencies associated with an inodedep.
4454  * Called with splbio blocked.
4455  */
4456 static int
4457 flush_inodedep_deps(struct fs *fs, ino_t ino)
4458 {
4459 	struct inodedep *inodedep;
4460 	struct allocdirect *adp;
4461 	int error, waitfor;
4462 	struct buf *bp;
4463 
4464 	/*
4465 	 * This work is done in two passes. The first pass grabs most
4466 	 * of the buffers and begins asynchronously writing them. The
4467 	 * only way to wait for these asynchronous writes is to sleep
4468 	 * on the filesystem vnode which may stay busy for a long time
4469 	 * if the filesystem is active. So, instead, we make a second
4470 	 * pass over the dependencies blocking on each write. In the
4471 	 * usual case we will be blocking against a write that we
4472 	 * initiated, so when it is done the dependency will have been
4473 	 * resolved. Thus the second pass is expected to end quickly.
4474 	 * We give a brief window at the top of the loop to allow
4475 	 * any pending I/O to complete.
4476 	 */
4477 	for (waitfor = MNT_NOWAIT; ; ) {
4478 		FREE_LOCK(&lk);
4479 		ACQUIRE_LOCK(&lk);
4480 		if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
4481 			return (0);
4482 		TAILQ_FOREACH(adp, &inodedep->id_inoupdt, ad_next) {
4483 			if (adp->ad_state & DEPCOMPLETE)
4484 				continue;
4485 			bp = adp->ad_buf;
4486 			if (getdirtybuf(&bp, waitfor) == 0) {
4487 				if (waitfor & MNT_NOWAIT)
4488 					continue;
4489 				break;
4490 			}
4491 			FREE_LOCK(&lk);
4492 			if (waitfor & MNT_NOWAIT) {
4493 				bawrite(bp);
4494 			} else if ((error = bwrite(bp)) != 0) {
4495 				ACQUIRE_LOCK(&lk);
4496 				return (error);
4497 			}
4498 			ACQUIRE_LOCK(&lk);
4499 			break;
4500 		}
4501 		if (adp != NULL)
4502 			continue;
4503 		TAILQ_FOREACH(adp, &inodedep->id_newinoupdt, ad_next) {
4504 			if (adp->ad_state & DEPCOMPLETE)
4505 				continue;
4506 			bp = adp->ad_buf;
4507 			if (getdirtybuf(&bp, waitfor) == 0) {
4508 				if (waitfor & MNT_NOWAIT)
4509 					continue;
4510 				break;
4511 			}
4512 			FREE_LOCK(&lk);
4513 			if (waitfor & MNT_NOWAIT) {
4514 				bawrite(bp);
4515 			} else if ((error = bwrite(bp)) != 0) {
4516 				ACQUIRE_LOCK(&lk);
4517 				return (error);
4518 			}
4519 			ACQUIRE_LOCK(&lk);
4520 			break;
4521 		}
4522 		if (adp != NULL)
4523 			continue;
4524 		/*
4525 		 * If this was pass 2, we are done; otherwise start pass 2.
4526 		 */
4527 		if (waitfor == MNT_WAIT)
4528 			break;
4529 		waitfor = MNT_WAIT;
4530 	}
4531 	/*
4532 	 * Try freeing inodedep in case all dependencies have been removed.
4533 	 */
4534 	if (inodedep_lookup(fs, ino, 0, &inodedep) != 0)
4535 		(void) free_inodedep(inodedep);
4536 	return (0);
4537 }
4538 
4539 /*
4540  * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
4541  * Called with splbio blocked.
4542  */
4543 static int
4544 flush_pagedep_deps(struct vnode *pvp, struct mount *mp,
4545 		   struct diraddhd *diraddhdp)
4546 {
4547 	struct inodedep *inodedep;
4548 	struct ufsmount *ump;
4549 	struct diradd *dap;
4550 	struct worklist *wk;
4551 	struct vnode *vp;
4552 	int gotit, error = 0;
4553 	struct buf *bp;
4554 	ino_t inum;
4555 
4556 	ump = VFSTOUFS(mp);
4557 	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
4558 		/*
4559 		 * Flush ourselves if this directory entry
4560 		 * has a MKDIR_PARENT dependency.
4561 		 */
4562 		if (dap->da_state & MKDIR_PARENT) {
4563 			FREE_LOCK(&lk);
4564 			if ((error = ffs_update(pvp, 1)) != 0)
4565 				break;
4566 			ACQUIRE_LOCK(&lk);
4567 			/*
4568 			 * If that cleared dependencies, go on to next.
4569 			 */
4570 			if (dap != LIST_FIRST(diraddhdp))
4571 				continue;
4572 			if (dap->da_state & MKDIR_PARENT) {
4573 				panic("flush_pagedep_deps: MKDIR_PARENT");
4574 			}
4575 		}
4576 		/*
4577 		 * A newly allocated directory must have its "." and
4578 		 * ".." entries written out before its name can be
4579 		 * committed in its parent. We do not want or need
4580 		 * the full semantics of a synchronous VOP_FSYNC as
4581 		 * that may end up here again, once for each directory
4582 		 * level in the filesystem. Instead, we push the blocks
4583 		 * and wait for them to clear. We have to fsync twice
4584 		 * because the first call may choose to defer blocks
4585 		 * that still have dependencies, but deferral will
4586 		 * happen at most once.
4587 		 */
4588 		inum = dap->da_newinum;
4589 		if (dap->da_state & MKDIR_BODY) {
4590 			FREE_LOCK(&lk);
4591 			if ((error = VFS_VGET(mp, NULL, inum, &vp)) != 0)
4592 				break;
4593 			if ((error=VOP_FSYNC(vp, MNT_NOWAIT, 0)) ||
4594 			    (error=VOP_FSYNC(vp, MNT_NOWAIT, 0))) {
4595 				vput(vp);
4596 				break;
4597 			}
4598 			drain_output(vp, 0);
4599 			/*
4600 			 * If the first block is still dirty with a D_MKDIR
4601 			 * dependency, then it needs to be written now.
4602 			 */
4603 			error = 0;
4604 			ACQUIRE_LOCK(&lk);
4605 			bp = findblk(vp, 0, FINDBLK_TEST);
4606 			if (bp == NULL) {
4607 				FREE_LOCK(&lk);
4608 				goto mkdir_body_continue;
4609 			}
4610 			LIST_FOREACH(wk, &bp->b_dep, wk_list)
4611 				if (wk->wk_type == D_MKDIR) {
4612 					gotit = getdirtybuf(&bp, MNT_WAIT);
4613 					FREE_LOCK(&lk);
4614 					if (gotit && (error = bwrite(bp)) != 0)
4615 						goto mkdir_body_continue;
4616 					break;
4617 				}
4618 			if (wk == NULL)
4619 				FREE_LOCK(&lk);
4620 		mkdir_body_continue:
4621 			vput(vp);
4622 			/* Flushing of first block failed. */
4623 			if (error)
4624 				break;
4625 			ACQUIRE_LOCK(&lk);
4626 			/*
4627 			 * If that cleared dependencies, go on to next.
4628 			 */
4629 			if (dap != LIST_FIRST(diraddhdp))
4630 				continue;
4631 			if (dap->da_state & MKDIR_BODY) {
4632 				panic("flush_pagedep_deps: %p MKDIR_BODY", dap);
4633 			}
4634 		}
4635 		/*
4636 		 * Flush the inode on which the directory entry depends.
4637 		 * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
4638 		 * the only remaining dependency is that the updated inode
4639 		 * count must get pushed to disk. The inode has already
4640 		 * been pushed into its inode buffer (via VOP_UPDATE) at
4641 		 * the time of the reference count change. So we need only
4642 		 * locate that buffer, ensure that there will be no rollback
4643 		 * caused by a bitmap dependency, then write the inode buffer.
4644 		 */
4645 retry_lookup:
4646 		if (inodedep_lookup(ump->um_fs, inum, 0, &inodedep) == 0) {
4647 			panic("flush_pagedep_deps: lost inode");
4648 		}
4649 		/*
4650 		 * If the inode still has bitmap dependencies,
4651 		 * push them to disk.
4652 		 */
4653 		if ((inodedep->id_state & DEPCOMPLETE) == 0) {
4654 			gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
4655 			if (gotit == 0)
4656 				goto retry_lookup;
4657 			FREE_LOCK(&lk);
4658 			if (gotit && (error = bwrite(inodedep->id_buf)) != 0)
4659 				break;
4660 			ACQUIRE_LOCK(&lk);
4661 			if (dap != LIST_FIRST(diraddhdp))
4662 				continue;
4663 		}
4664 		/*
4665 		 * If the inode is still sitting in a buffer waiting
4666 		 * to be written, push it to disk.
4667 		 */
4668 		FREE_LOCK(&lk);
4669 		if ((error = bread(ump->um_devvp,
4670 			fsbtodoff(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
4671 		    (int)ump->um_fs->fs_bsize, &bp)) != 0)
4672 			break;
4673 		if ((error = bwrite(bp)) != 0)
4674 			break;
4675 		ACQUIRE_LOCK(&lk);
4676 		/*
4677 		 * If we have failed to get rid of all the dependencies
4678 		 * then something is seriously wrong.
4679 		 */
4680 		if (dap == LIST_FIRST(diraddhdp)) {
4681 			panic("flush_pagedep_deps: flush failed");
4682 		}
4683 	}
4684 	if (error)
4685 		ACQUIRE_LOCK(&lk);
4686 	return (error);
4687 }
4688 
4689 /*
4690  * A large burst of file addition or deletion activity can drive the
4691  * memory load excessively high. First attempt to slow things down
4692  * using the techniques below. If that fails, this routine requests
4693  * the offending operations to fall back to running synchronously
4694  * until the memory load returns to a reasonable level.
4695  */
4696 int
4697 softdep_slowdown(struct vnode *vp)
4698 {
4699 	int max_softdeps_hard;
4700 
4701 	max_softdeps_hard = max_softdeps * 11 / 10;
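	/*
	 * Worked example (hypothetical tunable value): with max_softdeps
	 * set to 8000, max_softdeps_hard is 8800, so the synchronous
	 * fallback below kicks in once num_dirrem reaches 4400 or
	 * num_inodedep reaches 8800.
	 */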
4702 	if (num_dirrem < max_softdeps_hard / 2 &&
4703 	    num_inodedep < max_softdeps_hard)
4704 		return (0);
4705 	stat_sync_limit_hit += 1;
4706 	return (1);
4707 }
4708 
4709 /*
4710  * If memory utilization has gotten too high, deliberately slow things
4711  * down and speed up the I/O processing.
4712  */
4713 static int
4714 request_cleanup(int resource)
4715 {
4716 	struct thread *td = curthread;		/* XXX */
4717 
4718 	KKASSERT(lock_held(&lk));
4719 
4720 	/*
4721 	 * We never hold up the filesystem syncer process.
4722 	 */
4723 	if (td == filesys_syncer)
4724 		return (0);
4725 	/*
4726 	 * First check to see if the work list has gotten backlogged.
4727 	 * If it has, co-opt this process to help clean up two entries.
4728 	 * Because this process may hold inodes locked, we cannot
4729 	 * handle any remove requests that might block on a locked
4730 	 * inode as that could lead to deadlock.
4731 	 */
4732 	if (num_on_worklist > max_softdeps / 10) {
4733 		process_worklist_item(NULL, LK_NOWAIT);
4734 		process_worklist_item(NULL, LK_NOWAIT);
4735 		stat_worklist_push += 2;
4736 		return(1);
4737 	}
4738 
4739 	/*
4740 	 * If we are resource constrained on inode dependencies, try
4741 	 * flushing some dirty inodes. Otherwise, we are constrained
4742 	 * by file deletions, so try accelerating flushes of directories
4743 	 * with removal dependencies. We would like to do the cleanup
4744 	 * here, but we probably hold an inode locked at this point and
4745 	 * that might deadlock against one that we try to clean. So,
4746 	 * the best that we can do is request the syncer daemon to do
4747 	 * the cleanup for us.
4748 	 */
4749 	switch (resource) {
4750 
4751 	case FLUSH_INODES:
4752 		stat_ino_limit_push += 1;
4753 		req_clear_inodedeps += 1;
4754 		stat_countp = &stat_ino_limit_hit;
4755 		break;
4756 
4757 	case FLUSH_REMOVE:
4758 		stat_blk_limit_push += 1;
4759 		req_clear_remove += 1;
4760 		stat_countp = &stat_blk_limit_hit;
4761 		break;
4762 
4763 	default:
4764 		panic("request_cleanup: unknown type");
4765 	}
4766 	/*
4767 	 * Hopefully the syncer daemon will catch up and awaken us.
4768 	 * We wait at most tickdelay before proceeding in any case.
4769 	 */
4770 	lksleep(&proc_waiting, &lk, 0, "softupdate",
4771 		tickdelay > 2 ? tickdelay : 2);
4772 	return (1);
4773 }
4774 
4775 /*
4776  * Flush out a directory with at least one removal dependency in an effort to
4777  * reduce the number of dirrem, freefile, and freeblks dependency structures.
4778  */
4779 static void
4780 clear_remove(struct thread *td)
4781 {
4782 	struct pagedep_hashhead *pagedephd;
4783 	struct pagedep *pagedep;
4784 	static int next = 0;
4785 	struct mount *mp;
4786 	struct vnode *vp;
4787 	int error, cnt;
4788 	ino_t ino;
4789 
4790 	ACQUIRE_LOCK(&lk);
4791 	for (cnt = 0; cnt < pagedep_hash; cnt++) {
4792 		pagedephd = &pagedep_hashtbl[next++];
4793 		if (next >= pagedep_hash)
4794 			next = 0;
4795 		LIST_FOREACH(pagedep, pagedephd, pd_hash) {
4796 			if (LIST_FIRST(&pagedep->pd_dirremhd) == NULL)
4797 				continue;
4798 			mp = pagedep->pd_mnt;
4799 			ino = pagedep->pd_ino;
4800 			FREE_LOCK(&lk);
4801 			if ((error = VFS_VGET(mp, NULL, ino, &vp)) != 0) {
4802 				softdep_error("clear_remove: vget", error);
4803 				return;
4804 			}
4805 			if ((error = VOP_FSYNC(vp, MNT_NOWAIT, 0)))
4806 				softdep_error("clear_remove: fsync", error);
4807 			drain_output(vp, 0);
4808 			vput(vp);
4809 			return;
4810 		}
4811 	}
4812 	FREE_LOCK(&lk);
4813 }
4814 
4815 /*
4816  * Clear out a block of dirty inodes in an effort to reduce
4817  * the number of inodedep dependency structures.
4818  */
4819 struct clear_inodedeps_info {
4820 	struct fs *fs;
4821 	struct mount *mp;
4822 };
4823 
4824 static int
4825 clear_inodedeps_mountlist_callback(struct mount *mp, void *data)
4826 {
4827 	struct clear_inodedeps_info *info = data;
4828 
4829 	if ((mp->mnt_flag & MNT_SOFTDEP) && info->fs == VFSTOUFS(mp)->um_fs) {
4830 		info->mp = mp;
4831 		return(-1);
4832 	}
4833 	return(0);
4834 }
4835 
4836 static void
4837 clear_inodedeps(struct thread *td)
4838 {
4839 	struct clear_inodedeps_info info;
4840 	struct inodedep_hashhead *inodedephd;
4841 	struct inodedep *inodedep;
4842 	static int next = 0;
4843 	struct vnode *vp;
4844 	struct fs *fs;
4845 	int error, cnt;
4846 	ino_t firstino, lastino, ino;
4847 
4848 	ACQUIRE_LOCK(&lk);
4849 	/*
4850 	 * Pick a random inode dependency to be cleared.
4851 	 * We will then gather up all the inodes in its block
4852 	 * that have dependencies and flush them out.
4853 	 */
4854 	inodedep = NULL;	/* avoid gcc warnings */
4855 	for (cnt = 0; cnt < inodedep_hash; cnt++) {
4856 		inodedephd = &inodedep_hashtbl[next++];
4857 		if (next >= inodedep_hash)
4858 			next = 0;
4859 		if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
4860 			break;
4861 	}
4862 	if (inodedep == NULL) {
4863 		FREE_LOCK(&lk);
4864 		return;
4865 	}
4866 	/*
4867 	 * Ugly code to find mount point given pointer to superblock.
4868 	 */
4869 	fs = inodedep->id_fs;
4870 	info.mp = NULL;
4871 	info.fs = fs;
4872 	mountlist_scan(clear_inodedeps_mountlist_callback,
4873 			&info, MNTSCAN_FORWARD|MNTSCAN_NOBUSY);
4874 	/*
4875 	 * Find the last inode in the block with dependencies.
4876 	 */
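	/*
	 * Illustrative example (hypothetical numbers): with INOPB(fs) == 64
	 * and id_ino == 1000, the mask below yields firstino == 960; lastino
	 * starts at 1023 and walks down until an inode with a dependency is
	 * found.
	 */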
4877 	firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
4878 	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
4879 		if (inodedep_lookup(fs, lastino, 0, &inodedep) != 0)
4880 			break;
4881 	/*
4882 	 * Asynchronously push all but the last inode with dependencies.
4883 	 * Synchronously push the last inode with dependencies to ensure
4884 	 * that the inode block gets written to free up the inodedeps.
4885 	 */
4886 	for (ino = firstino; ino <= lastino; ino++) {
4887 		if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
4888 			continue;
4889 		FREE_LOCK(&lk);
4890 		if ((error = VFS_VGET(info.mp, NULL, ino, &vp)) != 0) {
4891 			softdep_error("clear_inodedeps: vget", error);
4892 			return;
4893 		}
4894 		if (ino == lastino) {
4895 			if ((error = VOP_FSYNC(vp, MNT_WAIT, 0)))
4896 				softdep_error("clear_inodedeps: fsync1", error);
4897 		} else {
4898 			if ((error = VOP_FSYNC(vp, MNT_NOWAIT, 0)))
4899 				softdep_error("clear_inodedeps: fsync2", error);
4900 			drain_output(vp, 0);
4901 		}
4902 		vput(vp);
4903 		ACQUIRE_LOCK(&lk);
4904 	}
4905 	FREE_LOCK(&lk);
4906 }
4907 
4908 /*
4909  * Function to determine if the buffer has outstanding dependencies
4910  * that will cause a roll-back if the buffer is written. If wantcount
4911  * is set, return number of dependencies, otherwise just yes or no.
4912  *
4913  * bioops callback - hold io_token
4914  */
4915 static int
4916 softdep_count_dependencies(struct buf *bp, int wantcount)
4917 {
4918 	struct worklist *wk;
4919 	struct inodedep *inodedep;
4920 	struct indirdep *indirdep;
4921 	struct allocindir *aip;
4922 	struct pagedep *pagedep;
4923 	struct diradd *dap;
4924 	int i, retval;
4925 
4926 	retval = 0;
4927 	ACQUIRE_LOCK(&lk);
4928 
4929 	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
4930 		switch (wk->wk_type) {
4931 
4932 		case D_INODEDEP:
4933 			inodedep = WK_INODEDEP(wk);
4934 			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
4935 				/* bitmap allocation dependency */
4936 				retval += 1;
4937 				if (!wantcount)
4938 					goto out;
4939 			}
4940 			if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
4941 				/* direct block pointer dependency */
4942 				retval += 1;
4943 				if (!wantcount)
4944 					goto out;
4945 			}
4946 			continue;
4947 
4948 		case D_INDIRDEP:
4949 			indirdep = WK_INDIRDEP(wk);
4950 
4951 			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
4952 				/* indirect block pointer dependency */
4953 				retval += 1;
4954 				if (!wantcount)
4955 					goto out;
4956 			}
4957 			continue;
4958 
4959 		case D_PAGEDEP:
4960 			pagedep = WK_PAGEDEP(wk);
4961 			for (i = 0; i < DAHASHSZ; i++) {
4962 
4963 				LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
4964 					/* directory entry dependency */
4965 					retval += 1;
4966 					if (!wantcount)
4967 						goto out;
4968 				}
4969 			}
4970 			continue;
4971 
4972 		case D_BMSAFEMAP:
4973 		case D_ALLOCDIRECT:
4974 		case D_ALLOCINDIR:
4975 		case D_MKDIR:
4976 			/* never a dependency on these blocks */
4977 			continue;
4978 
4979 		default:
4980 			panic("softdep_count_dependencies: Unexpected type %s",
4981 			    TYPENAME(wk->wk_type));
4982 			/* NOTREACHED */
4983 		}
4984 	}
4985 out:
4986 	FREE_LOCK(&lk);
4987 
4988 	return retval;
4989 }
4990 
4991 /*
4992  * Acquire exclusive access to a buffer. Requires softdep lock
4993  * to be held on entry. If waitfor is MNT_WAIT, may release/reacquire
4994  * softdep lock.
4995  *
4996  * Returns 1 if the buffer was locked, 0 if it was not locked or
4997  * if we had to block.
4998  *
4999  * NOTE!  In order to return 1 we must acquire the buffer lock prior
5000  *	  to any release of &lk.  Once we release &lk it's all over.
5001  *	  We may still have to block on the (type-stable) bp in that
5002  *	  case, but we must then unlock it and return 0.
5003  */
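/*
 * Illustrative (hypothetical) caller pattern, assuming &lk is already held
 * and 'bp' was taken from a dependency list protected by that lock:
 *
 *	if (getdirtybuf(&bp, MNT_WAIT) == 0)
 *		...rescan; the dependency lists may have changed...
 *	else
 *		...bp is locked, off the free queue, and still dirty...
 */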
5004 static int
5005 getdirtybuf(struct buf **bpp, int waitfor)
5006 {
5007 	struct buf *bp;
5008 	int error;
5009 
5010 	/*
5011 	 * If *bpp is NULL, the caller presumably lost a race.
5012 	 */
5013 	bp = *bpp;
5014 	if (bp == NULL)
5015 		return (0);
5016 
5017 	/*
5018 	 * Try to obtain the buffer lock without deadlocking on &lk.
5019 	 */
5020 	KKASSERT(lock_held(&lk));
5021 	error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT);
5022 	if (error == 0) {
5023 		/*
5024 		 * If the buffer is no longer dirty, the OS has already
5025 		 * written it out; return failure.
5026 		 */
5027 		if ((bp->b_flags & B_DELWRI) == 0) {
5028 			BUF_UNLOCK(bp);
5029 			return (0);
5030 		}
5031 
5032 		/*
5033 		 * Finish the nominal buffer locking sequence and return success.
5034 		 *
5035 		 * Since we are not using a normal getblk(), and UFS
5036 		 * isn't KVABIO aware, we must make sure that the bp
5037 		 * is synchronized before returning it.
5038 		 */
5039 		bremfree(bp);
5040 		bkvasync_all(bp);
5041 		return (1);
5042 	}
5043 
5044 	/*
5045 	 * Failure case.
5046 	 *
5047 	 * If we are not being asked to wait, return 0 immediately.
5048 	 */
5049 	if (waitfor != MNT_WAIT)
5050 		return (0);
5051 
5052 	/*
5053 	 * Once we release the softdep lock we can never return success,
5054 	 * but we still have to block on the type-stable buf for the caller
5055 	 * to be able to retry without livelocking the system.
5056 	 *
5057 	 * The caller will normally retry in this case.
5058 	 */
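	/*
	 * LK_SLEEPFAIL makes BUF_LOCK() sleep for the lock but report
	 * failure rather than hold it; if the lock happened to be granted
	 * immediately (error == 0) it is released again below, since &lk
	 * has been dropped and success can no longer be returned.
	 */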
5059 	FREE_LOCK(&lk);
5060 	error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL);
5061 	ACQUIRE_LOCK(&lk);
5062 	if (error == 0)
5063 		BUF_UNLOCK(bp);
5064 	return (0);
5065 }
5066 
5067 /*
5068  * Wait for pending output on a vnode to complete.
5069  * Must be called with the vnode locked; islocked indicates whether &lk is held.
5070  */
5071 static void
5072 drain_output(struct vnode *vp, int islocked)
5073 {
5074 
5075 	if (!islocked)
5076 		ACQUIRE_LOCK(&lk);
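	/*
	 * &lk is dropped across bio_track_wait(), which may block; the
	 * active test is simply repeated after the lock is reacquired.
	 */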
5077 	while (bio_track_active(&vp->v_track_write)) {
5078 		FREE_LOCK(&lk);
5079 		bio_track_wait(&vp->v_track_write, 0, 0);
5080 		ACQUIRE_LOCK(&lk);
5081 	}
5082 	if (!islocked)
5083 		FREE_LOCK(&lk);
5084 }
5085 
5086 /*
5087  * Called whenever a buffer that is being invalidated or reallocated
5088  * contains dependencies. This should only happen if an I/O error has
5089  * occurred. The routine is called with the buffer locked.
5090  *
5091  * bioops callback - hold io_token
5092  */
5093 static void
5094 softdep_deallocate_dependencies(struct buf *bp)
5095 {
5096 	/* nothing to do, mp lock not needed */
5097 	if ((bp->b_flags & B_ERROR) == 0)
5098 		panic("softdep_deallocate_dependencies: dangling deps");
5099 	softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntfromname, bp->b_error);
5100 	panic("softdep_deallocate_dependencies: unrecovered I/O error");
5101 }
5102 
5103 /*
5104  * Function to handle asynchronous write errors in the filesystem.
5105  */
5106 void
5107 softdep_error(char *func, int error)
5108 {
5109 	/* XXX should do something better! */
5110 	kprintf("%s: got error %d while accessing filesystem\n", func, error);
5111 }
5112