/*
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * %sccs.include.redist.c%
 *
 *	@(#)lfs_segment.c	8.8 (Berkeley) 05/08/95
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/kernel.h>
#include <sys/resourcevar.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/conf.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/mount.h>

#include <miscfs/specfs/specdev.h>
#include <miscfs/fifofs/fifo.h>

#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/dir.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>

#include <ufs/lfs/lfs.h>
#include <ufs/lfs/lfs_extern.h>

extern int count_lock_queue __P((void));

#define MAX_ACTIVE	10
/*
 * Determine if it's OK to start a partial segment in this segment, or if
 * we need to go on to a new segment.
 */
#define	LFS_PARTIAL_FITS(fs) \
	((fs)->lfs_dbpseg - ((fs)->lfs_offset - (fs)->lfs_curseg) > \
	1 << (fs)->lfs_fsbtodb)
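
/*
 * Reading the macro above: lfs_dbpseg is the segment size in DEV_BSIZE
 * disk blocks, and lfs_offset/lfs_curseg are disk addresses, so the left
 * side is the number of disk blocks still free in the current segment.
 * The right side appears to be one file-system block expressed in disk
 * blocks (lfs_fsbtodb is the shift used by fsbtodb()), i.e. a partial
 * segment is started only if at least one full block still fits.
 */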

void	 lfs_callback __P((struct buf *));
void	 lfs_gather __P((struct lfs *, struct segment *,
	     struct vnode *, int (*) __P((struct lfs *, struct buf *))));
int	 lfs_gatherblock __P((struct segment *, struct buf *, int *));
void	 lfs_iset __P((struct inode *, ufs_daddr_t, time_t));
int	 lfs_match_data __P((struct lfs *, struct buf *));
int	 lfs_match_dindir __P((struct lfs *, struct buf *));
int	 lfs_match_indir __P((struct lfs *, struct buf *));
int	 lfs_match_tindir __P((struct lfs *, struct buf *));
void	 lfs_newseg __P((struct lfs *));
void	 lfs_shellsort __P((struct buf **, ufs_daddr_t *, register int));
void	 lfs_supercallback __P((struct buf *));
void	 lfs_updatemeta __P((struct segment *));
int	 lfs_vref __P((struct vnode *));
void	 lfs_vunref __P((struct vnode *));
void	 lfs_writefile __P((struct lfs *, struct segment *, struct vnode *));
int	 lfs_writeinode __P((struct lfs *, struct segment *, struct inode *));
int	 lfs_writeseg __P((struct lfs *, struct segment *));
void	 lfs_writesuper __P((struct lfs *));
void	 lfs_writevnodes __P((struct lfs *fs, struct mount *mp,
	    struct segment *sp, int dirops));

int	lfs_allclean_wakeup;		/* Cleaner wakeup address. */

/* Statistics Counters */
#define DOSTATS
struct lfs_stats lfs_stats;

/* op values to lfs_writevnodes */
#define	VN_REG	0
#define	VN_DIROP	1
#define	VN_EMPTY	2

/*
 * Ifile and meta data blocks are not marked busy, so segment writes MUST be
 * single threaded.  Currently, there are two paths into lfs_segwrite, sync()
 * and getnewbuf().  They both mark the file system busy.  Lfs_vflush()
 * explicitly marks the file system busy.  So lfs_segwrite is safe.  I think.
 */

int
lfs_vflush(vp)
	struct vnode *vp;
{
	struct inode *ip;
	struct lfs *fs;
	struct segment *sp;

	fs = VFSTOUFS(vp->v_mount)->um_lfs;
	if (fs->lfs_nactive > MAX_ACTIVE)
		return (lfs_segwrite(vp->v_mount, SEGM_SYNC|SEGM_CKP));
	lfs_seglock(fs, SEGM_SYNC);
	sp = fs->lfs_sp;

	ip = VTOI(vp);
	if (vp->v_dirtyblkhd.lh_first == NULL)
		lfs_writevnodes(fs, vp->v_mount, sp, VN_EMPTY);

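	/*
	 * As I read the loops below: the inner loop rewrites the file and
	 * its inode until lfs_writeinode no longer reports that the ifile
	 * must be redone; the outer loop repeats the partial-segment
	 * write while lfs_writeseg says more work is pending and the
	 * vnode being flushed is the ifile itself.
	 */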
	do {
		do {
			if (vp->v_dirtyblkhd.lh_first != NULL)
				lfs_writefile(fs, sp, vp);
		} while (lfs_writeinode(fs, sp, ip));

	} while (lfs_writeseg(fs, sp) && ip->i_number == LFS_IFILE_INUM);

#ifdef DOSTATS
	++lfs_stats.nwrites;
	if (sp->seg_flags & SEGM_SYNC)
		++lfs_stats.nsync_writes;
	if (sp->seg_flags & SEGM_CKP)
		++lfs_stats.ncheckpoints;
#endif
	lfs_segunlock(fs);
	return (0);
}

void
lfs_writevnodes(fs, mp, sp, op)
	struct lfs *fs;
	struct mount *mp;
	struct segment *sp;
	int op;
{
	struct inode *ip;
	struct vnode *vp;

/* BEGIN HACK */
#define	VN_OFFSET	(((void *)&vp->v_mntvnodes.le_next) - (void *)vp)
#define	BACK_VP(VP)	((struct vnode *)(((void *)VP->v_mntvnodes.le_prev) - VN_OFFSET))
#define	BEG_OF_VLIST	((struct vnode *)(((void *)&mp->mnt_vnodelist.lh_first) - VN_OFFSET))

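/*
 * The hack above walks the mount's vnode list from tail to head: in a
 * LIST, le_prev points at the previous element's le_next field (or at
 * the list head's lh_first), so subtracting VN_OFFSET turns that field
 * pointer back into the enclosing vnode.  BEG_OF_VLIST is the phony
 * "vnode" that corresponds to the list head itself and terminates the
 * backward walk.
 */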
/* Find last vnode. */
loop:   for (vp = mp->mnt_vnodelist.lh_first;
	     vp && vp->v_mntvnodes.le_next != NULL;
	     vp = vp->v_mntvnodes.le_next);
	for (; vp && vp != BEG_OF_VLIST; vp = BACK_VP(vp)) {
/* END HACK */
/*
loop:
	for (vp = mp->mnt_vnodelist.lh_first;
	     vp != NULL;
	     vp = vp->v_mntvnodes.le_next) {
*/
		/*
		 * If the vnode that we are about to sync is no longer
		 * associated with this mount point, start over.
		 */
		if (vp->v_mount != mp)
			goto loop;

		/* XXX ignore dirops for now
		if (op == VN_DIROP && !(vp->v_flag & VDIROP) ||
		    op != VN_DIROP && (vp->v_flag & VDIROP))
			continue;
		*/

		if (op == VN_EMPTY && vp->v_dirtyblkhd.lh_first)
			continue;

		if (vp->v_type == VNON)
			continue;

		if (lfs_vref(vp))
			continue;

		/*
		 * Write the inode/file if dirty and it's not the IFILE.
		 */
		ip = VTOI(vp);
		if ((ip->i_flag &
		    (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE) ||
		    vp->v_dirtyblkhd.lh_first != NULL) &&
		    ip->i_number != LFS_IFILE_INUM) {
			if (vp->v_dirtyblkhd.lh_first != NULL)
				lfs_writefile(fs, sp, vp);
			(void) lfs_writeinode(fs, sp, ip);
		}
		vp->v_flag &= ~VDIROP;
		lfs_vunref(vp);
	}
}

int
lfs_segwrite(mp, flags)
	struct mount *mp;
	int flags;			/* Do a checkpoint. */
{
	struct buf *bp;
	struct inode *ip;
	struct lfs *fs;
	struct segment *sp;
	struct vnode *vp;
	SEGUSE *segusep;
	ufs_daddr_t ibno;
	CLEANERINFO *cip;
	int clean, do_ckp, error, i;

	fs = VFSTOUFS(mp)->um_lfs;

	/*
	 * If we have fewer than 2 clean segments, wait until the cleaner
	 * writes.
	 */
	do {
		LFS_CLEANERINFO(cip, fs, bp);
		clean = cip->clean;
		brelse(bp);
		if (clean <= 2) {
			/* printf ("segs clean: %d\n", clean); */
			wakeup(&lfs_allclean_wakeup);
			if (error = tsleep(&fs->lfs_avail, PRIBIO + 1,
			    "lfs writer", 0))
				return (error);
		}
	} while (clean <= 2);

	/*
	 * Allocate a segment structure and enough space to hold pointers to
	 * the maximum possible number of buffers which can be described in a
	 * single summary block.
	 */
	do_ckp = flags & SEGM_CKP || fs->lfs_nactive > MAX_ACTIVE;
	lfs_seglock(fs, flags | (do_ckp ? SEGM_CKP : 0));
	sp = fs->lfs_sp;

	lfs_writevnodes(fs, mp, sp, VN_REG);

	/* XXX ignore ordering of dirops for now */
	/* XXX
	fs->lfs_writer = 1;
	if (fs->lfs_dirops && (error =
	    tsleep(&fs->lfs_writer, PRIBIO + 1, "lfs writer", 0))) {
		free(sp->bpp, M_SEGMENT);
		free(sp, M_SEGMENT);
		fs->lfs_writer = 0;
		return (error);
	}

	lfs_writevnodes(fs, mp, sp, VN_DIROP);
	*/

	/*
	 * If we are doing a checkpoint, mark everything since the
	 * last checkpoint as no longer ACTIVE.
	 */
	if (do_ckp)
		for (ibno = fs->lfs_cleansz + fs->lfs_segtabsz;
		     --ibno >= fs->lfs_cleansz; ) {
			if (bread(fs->lfs_ivnode, ibno, fs->lfs_bsize,
			    NOCRED, &bp))
				panic("lfs: ifile read");
			segusep = (SEGUSE *)bp->b_data;
			for (i = fs->lfs_sepb; i--; segusep++)
				segusep->su_flags &= ~SEGUSE_ACTIVE;

			error = VOP_BWRITE(bp);
		}

	if (do_ckp || fs->lfs_doifile) {
redo:
		vp = fs->lfs_ivnode;
		while (vget(vp, 1));
		ip = VTOI(vp);
		if (vp->v_dirtyblkhd.lh_first != NULL)
			lfs_writefile(fs, sp, vp);
		(void)lfs_writeinode(fs, sp, ip);
		vput(vp);
		if (lfs_writeseg(fs, sp) && do_ckp)
			goto redo;
	} else
		(void) lfs_writeseg(fs, sp);

	/*
	 * If the I/O count is non-zero, sleep until it reaches zero.  At the
	 * moment, the user's process hangs around so we can sleep.
	 */
	/* XXX ignore dirops for now
	fs->lfs_writer = 0;
	fs->lfs_doifile = 0;
	wakeup(&fs->lfs_dirops);
	*/

#ifdef DOSTATS
	++lfs_stats.nwrites;
	if (sp->seg_flags & SEGM_SYNC)
		++lfs_stats.nsync_writes;
	if (sp->seg_flags & SEGM_CKP)
		++lfs_stats.ncheckpoints;
#endif
	lfs_segunlock(fs);
	return (0);
}

/*
 * Write the dirty blocks associated with a vnode.
 */
void
lfs_writefile(fs, sp, vp)
	struct lfs *fs;
	struct segment *sp;
	struct vnode *vp;
{
	struct buf *bp;
	struct finfo *fip;
	IFILE *ifp;

	if (sp->seg_bytes_left < fs->lfs_bsize ||
	    sp->sum_bytes_left < sizeof(struct finfo))
		(void) lfs_writeseg(fs, sp);

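	/*
	 * Summary-space accounting: struct finfo already contains one
	 * fi_blocks[] slot, so opening a new FINFO charges the summary
	 * sizeof(struct finfo) minus that first slot; every block
	 * gathered afterwards pays sizeof(ufs_daddr_t) as it is added.
	 * The same amount is refunded below if no blocks get gathered.
	 */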
	sp->sum_bytes_left -= sizeof(struct finfo) - sizeof(ufs_daddr_t);
	++((SEGSUM *)(sp->segsum))->ss_nfinfo;

	fip = sp->fip;
	fip->fi_nblocks = 0;
	fip->fi_ino = VTOI(vp)->i_number;
	LFS_IENTRY(ifp, fs, fip->fi_ino, bp);
	fip->fi_version = ifp->if_version;
	brelse(bp);

	/*
	 * It may not be necessary to write the meta-data blocks at this point,
	 * as the roll-forward recovery code should be able to reconstruct the
	 * list.
	 */
	lfs_gather(fs, sp, vp, lfs_match_data);
	lfs_gather(fs, sp, vp, lfs_match_indir);
	lfs_gather(fs, sp, vp, lfs_match_dindir);
#ifdef TRIPLE
	lfs_gather(fs, sp, vp, lfs_match_tindir);
#endif

	fip = sp->fip;
	if (fip->fi_nblocks != 0) {
		sp->fip =
		    (struct finfo *)((caddr_t)fip + sizeof(struct finfo) +
		    sizeof(ufs_daddr_t) * (fip->fi_nblocks - 1));
		sp->start_lbp = &sp->fip->fi_blocks[0];
	} else {
		sp->sum_bytes_left += sizeof(struct finfo) - sizeof(ufs_daddr_t);
		--((SEGSUM *)(sp->segsum))->ss_nfinfo;
	}
}

int
lfs_writeinode(fs, sp, ip)
	struct lfs *fs;
	struct segment *sp;
	struct inode *ip;
{
	struct buf *bp, *ibp;
	IFILE *ifp;
	SEGUSE *sup;
	ufs_daddr_t daddr;
	ino_t ino;
	int error, i, ndx;
	int redo_ifile = 0;

	if (!(ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)))
		return (0);

	/* Allocate a new inode block if necessary. */
	if (sp->ibp == NULL) {
		/* Allocate a new segment if necessary. */
		if (sp->seg_bytes_left < fs->lfs_bsize ||
		    sp->sum_bytes_left < sizeof(ufs_daddr_t))
			(void) lfs_writeseg(fs, sp);

		/* Get next inode block. */
		daddr = fs->lfs_offset;
		fs->lfs_offset += fsbtodb(fs, 1);
		sp->ibp = *sp->cbpp++ =
		    lfs_newbuf(VTOI(fs->lfs_ivnode)->i_devvp, daddr,
		    fs->lfs_bsize);
		/* Zero out inode numbers */
		for (i = 0; i < INOPB(fs); ++i)
			((struct dinode *)sp->ibp->b_data)[i].di_inumber = 0;
		++sp->start_bpp;
		fs->lfs_avail -= fsbtodb(fs, 1);
		/* Set remaining space counters. */
		sp->seg_bytes_left -= fs->lfs_bsize;
		sp->sum_bytes_left -= sizeof(ufs_daddr_t);
		ndx = LFS_SUMMARY_SIZE / sizeof(ufs_daddr_t) -
		    sp->ninodes / INOPB(fs) - 1;
		((ufs_daddr_t *)(sp->segsum))[ndx] = daddr;
	}
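
	/*
	 * Note on the ndx computation above: the disk addresses of inode
	 * blocks are recorded in a table that starts at the very end of
	 * the summary block and grows downward, one ufs_daddr_t per inode
	 * block, while the FINFO array grows upward from the front;
	 * sum_bytes_left keeps the two from colliding.
	 */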

	/* Update the inode times and copy the inode onto the inode page. */
	if (ip->i_flag & IN_MODIFIED)
		--fs->lfs_uinodes;
	ITIMES(ip, &time, &time);
	ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE);
	bp = sp->ibp;
	((struct dinode *)bp->b_data)[sp->ninodes % INOPB(fs)] = ip->i_din;
	/* Increment inode count in segment summary block. */
	++((SEGSUM *)(sp->segsum))->ss_ninos;

	/* If this page is full, set flag to allocate a new page. */
	if (++sp->ninodes % INOPB(fs) == 0)
		sp->ibp = NULL;

	/*
	 * If updating the ifile, update the super-block.  Update the disk
	 * address and access times for this inode in the ifile.
	 */
	ino = ip->i_number;
	if (ino == LFS_IFILE_INUM) {
		daddr = fs->lfs_idaddr;
		fs->lfs_idaddr = bp->b_blkno;
	} else {
		LFS_IENTRY(ifp, fs, ino, ibp);
		daddr = ifp->if_daddr;
		ifp->if_daddr = bp->b_blkno;
		error = VOP_BWRITE(ibp);
	}

	/*
	 * No need to update segment usage if there was no former inode address
	 * or if the last inode address is in the current partial segment.
	 */
	if (daddr != LFS_UNUSED_DADDR &&
	    !(daddr >= fs->lfs_lastpseg && daddr <= bp->b_blkno)) {
		LFS_SEGENTRY(sup, fs, datosn(fs, daddr), bp);
#ifdef DIAGNOSTIC
		if (sup->su_nbytes < sizeof(struct dinode)) {
			/* XXX -- Change to a panic. */
			printf("lfs: negative bytes (segment %d)\n",
			    datosn(fs, daddr));
			panic("negative bytes");
		}
#endif
		sup->su_nbytes -= sizeof(struct dinode);
		redo_ifile =
		    (ino == LFS_IFILE_INUM && !(bp->b_flags & B_GATHERED));
		error = VOP_BWRITE(bp);
	}
	return (redo_ifile);
}

int
lfs_gatherblock(sp, bp, sptr)
	struct segment *sp;
	struct buf *bp;
	int *sptr;
{
	struct lfs *fs;
	int version;

	/*
	 * If full, finish this segment.  We may be doing I/O, so drop and
	 * reacquire splbio() around the segment write.
	 */
#ifdef DIAGNOSTIC
	if (sp->vp == NULL)
		panic("lfs_gatherblock: Null vp in segment");
#endif
	fs = sp->fs;
	if (sp->sum_bytes_left < sizeof(ufs_daddr_t) ||
	    sp->seg_bytes_left < bp->b_bcount) {
		if (sptr)
			splx(*sptr);
		lfs_updatemeta(sp);

		version = sp->fip->fi_version;
		(void) lfs_writeseg(fs, sp);

		sp->fip->fi_version = version;
		sp->fip->fi_ino = VTOI(sp->vp)->i_number;
		/* Add the current file to the segment summary. */
		++((SEGSUM *)(sp->segsum))->ss_nfinfo;
		sp->sum_bytes_left -=
		    sizeof(struct finfo) - sizeof(ufs_daddr_t);

		if (sptr)
			*sptr = splbio();
		return (1);
	}

	/* Insert into the buffer list, update the FINFO block. */
	bp->b_flags |= B_GATHERED;
	*sp->cbpp++ = bp;
	sp->fip->fi_blocks[sp->fip->fi_nblocks++] = bp->b_lblkno;

	sp->sum_bytes_left -= sizeof(ufs_daddr_t);
	sp->seg_bytes_left -= bp->b_bcount;
	return (0);
}

void
lfs_gather(fs, sp, vp, match)
	struct lfs *fs;
	struct segment *sp;
	struct vnode *vp;
	int (*match) __P((struct lfs *, struct buf *));
{
	struct buf *bp;
	int s;

	sp->vp = vp;
	s = splbio();
/* This is a hack to see if ordering the blocks in LFS makes a difference. */
/* BEGIN HACK */
#define	BUF_OFFSET	(((void *)&bp->b_vnbufs.le_next) - (void *)bp)
#define	BACK_BUF(BP)	((struct buf *)(((void *)BP->b_vnbufs.le_prev) - BUF_OFFSET))
#define	BEG_OF_LIST	((struct buf *)(((void *)&vp->v_dirtyblkhd.lh_first) - BUF_OFFSET))
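/*
 * Same back-pointer trick as the vnode-list hack in lfs_writevnodes:
 * walk to the tail of the dirty-buffer LIST, then iterate backwards by
 * converting each le_prev field pointer into its enclosing buffer.
 */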
/*loop:	for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = bp->b_vnbufs.le_next) {*/
/* Find last buffer. */
loop:   for (bp = vp->v_dirtyblkhd.lh_first; bp && bp->b_vnbufs.le_next != NULL;
	    bp = bp->b_vnbufs.le_next);
	for (; bp && bp != BEG_OF_LIST; bp = BACK_BUF(bp)) {
/* END HACK */
		if (bp->b_flags & B_BUSY || !match(fs, bp) ||
		    bp->b_flags & B_GATHERED)
			continue;
#ifdef DIAGNOSTIC
		if (!(bp->b_flags & B_DELWRI))
			panic("lfs_gather: bp not B_DELWRI");
		if (!(bp->b_flags & B_LOCKED))
			panic("lfs_gather: bp not B_LOCKED");
#endif
		if (lfs_gatherblock(sp, bp, &s))
			goto loop;
	}
	splx(s);
	lfs_updatemeta(sp);
	sp->vp = NULL;
}

/*
 * Update the metadata that points to the blocks listed in the FINFO
 * array.
 */
void
lfs_updatemeta(sp)
	struct segment *sp;
{
	SEGUSE *sup;
	struct buf *bp;
	struct lfs *fs;
	struct vnode *vp;
	struct indir a[NIADDR + 2], *ap;
	struct inode *ip;
	ufs_daddr_t daddr, lbn, off;
	int error, i, nblocks, num;

	vp = sp->vp;
	nblocks = &sp->fip->fi_blocks[sp->fip->fi_nblocks] - sp->start_lbp;
	if (nblocks < 0)
		panic("lfs_updatemeta: negative nblocks");
	if (vp == NULL || nblocks == 0)
		return;

	/* Sort the blocks. */
	if (!(sp->seg_flags & SEGM_CLEAN))
		lfs_shellsort(sp->start_bpp, sp->start_lbp, nblocks);

	/*
	 * Record the length of the last block in case it's a fragment.
	 * If there are indirect blocks present, they sort last.  An
	 * indirect block will be lfs_bsize and its presence indicates
	 * that you cannot have fragments.
	 */
	sp->fip->fi_lastlength = sp->start_bpp[nblocks - 1]->b_bcount;

	/*
	 * Assign disk addresses, and update references to the logical
	 * block and the segment usage information.
	 */
	fs = sp->fs;
	for (i = nblocks; i--; ++sp->start_bpp) {
		lbn = *sp->start_lbp++;
		(*sp->start_bpp)->b_blkno = off = fs->lfs_offset;
		fs->lfs_offset +=
		    fragstodb(fs, numfrags(fs, (*sp->start_bpp)->b_bcount));

		if (error = ufs_bmaparray(vp, lbn, &daddr, a, &num, NULL))
			panic("lfs_updatemeta: ufs_bmaparray %d", error);
		ip = VTOI(vp);
		switch (num) {
		case 0:
			ip->i_db[lbn] = off;
			break;
		case 1:
			ip->i_ib[a[0].in_off] = off;
			break;
		default:
			ap = &a[num - 1];
			if (bread(vp, ap->in_lbn, fs->lfs_bsize, NOCRED, &bp))
				panic("lfs_updatemeta: bread bno %d",
				    ap->in_lbn);
			/*
			 * Bread may create a new indirect block which needs
			 * to get counted for the inode.
			 */
			if (bp->b_blkno == -1 && !(bp->b_flags & B_CACHE)) {
				ip->i_blocks += fsbtodb(fs, 1);
				fs->lfs_bfree -= fragstodb(fs, fs->lfs_frag);
			}
			((ufs_daddr_t *)bp->b_data)[ap->in_off] = off;
			VOP_BWRITE(bp);
		}

		/* Update segment usage information. */
		if (daddr != UNASSIGNED &&
		    !(daddr >= fs->lfs_lastpseg && daddr <= off)) {
			LFS_SEGENTRY(sup, fs, datosn(fs, daddr), bp);
#ifdef DIAGNOSTIC
			if (sup->su_nbytes < (*sp->start_bpp)->b_bcount) {
				/* XXX -- Change to a panic. */
				printf("lfs: negative bytes (segment %d)\n",
				    datosn(fs, daddr));
				panic("negative bytes");
			}
#endif
			sup->su_nbytes -= (*sp->start_bpp)->b_bcount;
			error = VOP_BWRITE(bp);
		}
	}
}

/*
 * Start a new segment.
 */
int
lfs_initseg(fs)
	struct lfs *fs;
{
	struct segment *sp;
	SEGUSE *sup;
	SEGSUM *ssp;
	struct buf *bp;
	int repeat;

	sp = fs->lfs_sp;

	repeat = 0;
	/* Advance to the next segment. */
	if (!LFS_PARTIAL_FITS(fs)) {
		/* Wake up any cleaning procs waiting on this file system. */
		wakeup(&lfs_allclean_wakeup);

		lfs_newseg(fs);
		repeat = 1;
		fs->lfs_offset = fs->lfs_curseg;
		sp->seg_number = datosn(fs, fs->lfs_curseg);
		sp->seg_bytes_left = fs->lfs_dbpseg * DEV_BSIZE;

		/*
		 * If the segment contains a superblock, update the offset
		 * and summary address to skip over it.
		 */
		LFS_SEGENTRY(sup, fs, sp->seg_number, bp);
		if (sup->su_flags & SEGUSE_SUPERBLOCK) {
			fs->lfs_offset += LFS_SBPAD / DEV_BSIZE;
			sp->seg_bytes_left -= LFS_SBPAD;
		}
		brelse(bp);
	} else {
		sp->seg_number = datosn(fs, fs->lfs_curseg);
		sp->seg_bytes_left = (fs->lfs_dbpseg -
		    (fs->lfs_offset - fs->lfs_curseg)) * DEV_BSIZE;
	}
	fs->lfs_lastpseg = fs->lfs_offset;

	sp->fs = fs;
	sp->ibp = NULL;
	sp->ninodes = 0;

	/* Get a new buffer for SEGSUM and enter it into the buffer list. */
	sp->cbpp = sp->bpp;
	*sp->cbpp = lfs_newbuf(VTOI(fs->lfs_ivnode)->i_devvp, fs->lfs_offset,
	     LFS_SUMMARY_SIZE);
	sp->segsum = (*sp->cbpp)->b_data;
	bzero(sp->segsum, LFS_SUMMARY_SIZE);
	sp->start_bpp = ++sp->cbpp;
	fs->lfs_offset += LFS_SUMMARY_SIZE / DEV_BSIZE;

	/* Set pointer to SEGSUM, initialize it. */
	ssp = sp->segsum;
	ssp->ss_next = fs->lfs_nextseg;
	ssp->ss_nfinfo = ssp->ss_ninos = 0;
	ssp->ss_magic = SS_MAGIC;

	/* Set pointer to first FINFO, initialize it. */
	sp->fip = (struct finfo *)((caddr_t)sp->segsum + sizeof(SEGSUM));
	sp->fip->fi_nblocks = 0;
	sp->start_lbp = &sp->fip->fi_blocks[0];
	sp->fip->fi_lastlength = 0;

	sp->seg_bytes_left -= LFS_SUMMARY_SIZE;
	sp->sum_bytes_left = LFS_SUMMARY_SIZE - sizeof(SEGSUM);

	return (repeat);
}

/*
 * Return the next segment to write.
 */
void
lfs_newseg(fs)
	struct lfs *fs;
{
	CLEANERINFO *cip;
	SEGUSE *sup;
	struct buf *bp;
	int curseg, isdirty, sn;

	LFS_SEGENTRY(sup, fs, datosn(fs, fs->lfs_nextseg), bp);
	sup->su_flags |= SEGUSE_DIRTY | SEGUSE_ACTIVE;
	sup->su_nbytes = 0;
	sup->su_nsums = 0;
	sup->su_ninos = 0;
	(void) VOP_BWRITE(bp);

	LFS_CLEANERINFO(cip, fs, bp);
	--cip->clean;
	++cip->dirty;
	(void) VOP_BWRITE(bp);

	fs->lfs_lastseg = fs->lfs_curseg;
	fs->lfs_curseg = fs->lfs_nextseg;
	for (sn = curseg = datosn(fs, fs->lfs_curseg);;) {
		sn = (sn + 1) % fs->lfs_nseg;
		if (sn == curseg)
			panic("lfs_nextseg: no clean segments");
		LFS_SEGENTRY(sup, fs, sn, bp);
		isdirty = sup->su_flags & SEGUSE_DIRTY;
		brelse(bp);
		if (!isdirty)
			break;
	}

	++fs->lfs_nactive;
	fs->lfs_nextseg = sntoda(fs, sn);
#ifdef DOSTATS
	++lfs_stats.segsused;
#endif
}

int
lfs_writeseg(fs, sp)
	struct lfs *fs;
	struct segment *sp;
{
	extern int locked_queue_count;
	struct buf **bpp, *bp, *cbp;
	SEGUSE *sup;
	SEGSUM *ssp;
	dev_t i_dev;
	u_long *datap, *dp;
	int do_again, i, nblocks, s;
	int (*strategy)__P((struct vop_strategy_args *));
	struct vop_strategy_args vop_strategy_a;
	u_short ninos;
	char *p;
	long *lp;

	/*
	 * If there are no buffers other than the segment summary to write
	 * and it is not a checkpoint, don't do anything.  On a checkpoint,
	 * even if there aren't any buffers, you need to write the superblock.
	 */
	if ((nblocks = sp->cbpp - sp->bpp) == 1)
		return (0);

	/* Update the segment usage information. */
	LFS_SEGENTRY(sup, fs, sp->seg_number, bp);

	/* Loop through all blocks, except the segment summary. */
	for (bpp = sp->bpp; ++bpp < sp->cbpp; )
		sup->su_nbytes += (*bpp)->b_bcount;

	ssp = (SEGSUM *)sp->segsum;

	ninos = (ssp->ss_ninos + INOPB(fs) - 1) / INOPB(fs);
	sup->su_nbytes += ssp->ss_ninos * sizeof(struct dinode);
	sup->su_nbytes += LFS_SUMMARY_SIZE;
	sup->su_lastmod = time.tv_sec;
	sup->su_ninos += ninos;
	++sup->su_nsums;
	do_again = !(bp->b_flags & B_GATHERED);
	(void)VOP_BWRITE(bp);
	/*
	 * Compute checksum across data and then across summary; the first
	 * block (the summary block) is skipped.  Set the create time here
	 * so that it's guaranteed to be later than the inode mod times.
	 *
	 * XXX
	 * Fix this to do it inline, instead of malloc/copy.
	 */
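	/*
	 * Note that ss_datasum only covers the first u_long of each data
	 * block, not the whole block, so it is a sanity check rather than
	 * a full-strength checksum; ss_sumsum then covers the rest of the
	 * summary block starting at the ss_datasum field.
	 */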
	datap = dp = malloc(nblocks * sizeof(u_long), M_SEGMENT, M_WAITOK);
	for (bpp = sp->bpp, i = nblocks - 1; i--;) {
		if ((*++bpp)->b_flags & B_INVAL) {
			if (copyin((*bpp)->b_saveaddr, dp++, sizeof(u_long)))
				panic("lfs_writeseg: copyin failed");
		} else
			*dp++ = ((u_long *)(*bpp)->b_data)[0];
	}
	ssp->ss_create = time.tv_sec;
	ssp->ss_datasum = cksum(datap, (nblocks - 1) * sizeof(u_long));
	ssp->ss_sumsum =
	    cksum(&ssp->ss_datasum, LFS_SUMMARY_SIZE - sizeof(ssp->ss_sumsum));
	free(datap, M_SEGMENT);
#ifdef DIAGNOSTIC
	if (fs->lfs_bfree < fsbtodb(fs, ninos) + LFS_SUMMARY_SIZE / DEV_BSIZE)
		panic("lfs_writeseg: No diskspace for summary");
#endif
	fs->lfs_bfree -= (fsbtodb(fs, ninos) + LFS_SUMMARY_SIZE / DEV_BSIZE);

	i_dev = VTOI(fs->lfs_ivnode)->i_dev;
	strategy = VTOI(fs->lfs_ivnode)->i_devvp->v_op[VOFFSET(vop_strategy)];

	/*
	 * When we simply write the blocks we lose a rotation for every block
	 * written.  To avoid this problem, we allocate memory in chunks, copy
	 * the buffers into the chunk and write the chunk.  MAXPHYS is the
	 * largest size I/O devices can handle.
	 * When the data is copied to the chunk, turn off the B_LOCKED bit
	 * and brelse the buffer (which will move them to the LRU list).  Add
	 * the B_CALL flag to the buffer header so we can count I/O's for the
	 * checkpoints and so we can release the allocated memory.
	 *
	 * XXX
	 * This should be removed if the new virtual memory system allows us to
	 * easily make the buffers contiguous in kernel memory and if that's
	 * fast enough.
	 */
	for (bpp = sp->bpp, i = nblocks; i;) {
		cbp = lfs_newbuf(VTOI(fs->lfs_ivnode)->i_devvp,
		    (*bpp)->b_blkno, MAXPHYS);
		cbp->b_dev = i_dev;
		cbp->b_flags |= B_ASYNC | B_BUSY;
		cbp->b_bcount = 0;

		s = splbio();
		++fs->lfs_iocount;
		for (p = cbp->b_data; i && cbp->b_bcount < MAXPHYS; i--) {
			bp = *bpp;
			if (bp->b_bcount > (MAXPHYS - cbp->b_bcount))
				break;
			bpp++;

			/*
			 * Fake buffers from the cleaner are marked as
			 * B_INVAL.  We need to copy the data from user
			 * space rather than from the buffer indicated.
			 * XXX -- what do I do on an error?
			 */
			if (bp->b_flags & B_INVAL) {
				if (copyin(bp->b_saveaddr, p, bp->b_bcount))
					panic("lfs_writeseg: copyin failed");
			} else
				bcopy(bp->b_data, p, bp->b_bcount);

			p += bp->b_bcount;
			cbp->b_bcount += bp->b_bcount;
			if (bp->b_flags & B_LOCKED)
				--locked_queue_count;
			bp->b_flags &= ~(B_ERROR | B_READ | B_DELWRI |
			     B_LOCKED | B_GATHERED);
			if (bp->b_flags & B_CALL) {
				/* if B_CALL, it was created with newbuf */
				brelvp(bp);
				if (!(bp->b_flags & B_INVAL))
					free(bp->b_data, M_SEGMENT);
				free(bp, M_SEGMENT);
			} else {
				bremfree(bp);
				bp->b_flags |= B_DONE;
				reassignbuf(bp, bp->b_vp);
				brelse(bp);
			}
		}
		++cbp->b_vp->v_numoutput;
		splx(s);
		/*
		 * XXXX This is a gross and disgusting hack.  Since these
		 * buffers are physically addressed, they hang off the
		 * device vnode (devvp).  As a result, they have no way
		 * of getting to the LFS superblock or lfs structure to
		 * keep track of the number of I/O's pending.  So, I am
		 * going to stuff the fs into the saveaddr field of
		 * the buffer (yuk).
		 */
		cbp->b_saveaddr = (caddr_t)fs;
		vop_strategy_a.a_desc = VDESC(vop_strategy);
		vop_strategy_a.a_bp = cbp;
		(strategy)(&vop_strategy_a);
	}
	/*
	 * XXX
	 * Vinvalbuf can move locked buffers off the locked queue
	 * and we have no way of knowing about this.  So, after
	 * doing a big write, we recalculate how many buffers are
	 * really still left on the locked queue.
	 */
	locked_queue_count = count_lock_queue();
	wakeup(&locked_queue_count);
#ifdef DOSTATS
	++lfs_stats.psegwrites;
	lfs_stats.blocktot += nblocks - 1;
	if (fs->lfs_sp->seg_flags & SEGM_SYNC)
		++lfs_stats.psyncwrites;
	if (fs->lfs_sp->seg_flags & SEGM_CLEAN) {
		++lfs_stats.pcleanwrites;
		lfs_stats.cleanblocks += nblocks - 1;
	}
#endif
	return (lfs_initseg(fs) || do_again);
}
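
/*
 * On the return value above: lfs_initseg() reports whether it had to
 * advance to a brand-new segment, and do_again is set when the segment
 * usage block was not gathered into this write.  Either condition makes
 * lfs_writeseg() return nonzero, which is what drives the redo loops in
 * lfs_segwrite() and lfs_vflush().
 */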

void
lfs_writesuper(fs)
	struct lfs *fs;
{
	struct buf *bp;
	dev_t i_dev;
	int (*strategy) __P((struct vop_strategy_args *));
	int s;
	struct vop_strategy_args vop_strategy_a;

	i_dev = VTOI(fs->lfs_ivnode)->i_dev;
	strategy = VTOI(fs->lfs_ivnode)->i_devvp->v_op[VOFFSET(vop_strategy)];

	/* Checksum the superblock and copy it into a buffer. */
	fs->lfs_cksum = cksum(fs, sizeof(struct lfs) - sizeof(fs->lfs_cksum));
	bp = lfs_newbuf(VTOI(fs->lfs_ivnode)->i_devvp, fs->lfs_sboffs[0],
	    LFS_SBPAD);
	*(struct lfs *)bp->b_data = *fs;

	/* XXX Toggle between first two superblocks; for now just write first */
	bp->b_dev = i_dev;
	bp->b_flags |= B_BUSY | B_CALL | B_ASYNC;
	bp->b_flags &= ~(B_DONE | B_ERROR | B_READ | B_DELWRI);
	bp->b_iodone = lfs_supercallback;
	vop_strategy_a.a_desc = VDESC(vop_strategy);
	vop_strategy_a.a_bp = bp;
	s = splbio();
	++bp->b_vp->v_numoutput;
	splx(s);
	(strategy)(&vop_strategy_a);
}

/*
 * Logical block number match routines used when traversing the dirty block
 * chain.
 */
int
lfs_match_data(fs, bp)
	struct lfs *fs;
	struct buf *bp;
{
	return (bp->b_lblkno >= 0);
}

int
lfs_match_indir(fs, bp)
	struct lfs *fs;
	struct buf *bp;
{
	int lbn;

	lbn = bp->b_lblkno;
	return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 0);
}

int
lfs_match_dindir(fs, bp)
	struct lfs *fs;
	struct buf *bp;
{
	int lbn;

	lbn = bp->b_lblkno;
	return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 1);
}

int
lfs_match_tindir(fs, bp)
	struct lfs *fs;
	struct buf *bp;
{
	int lbn;

	lbn = bp->b_lblkno;
	return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 2);
}
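
/*
 * The tests above rely on the UFS convention for naming meta-data
 * blocks: indirect blocks carry negative logical block numbers, and
 * (as I read ufs_bmap) the single, double and triple indirect blocks
 * for a given range land at consecutive negative lbns, so reducing
 * -lbn - NDADDR modulo NINDIR(fs) distinguishes the three levels.
 */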

/*
 * Allocate a new buffer header.
 */
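/*
 * These are throwaway headers: the buffer and its data are malloc'd
 * here rather than taken from the buffer cache, marked B_CALL, and
 * freed again in lfs_callback or lfs_supercallback once the write
 * completes.
 */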
struct buf *
lfs_newbuf(vp, daddr, size)
	struct vnode *vp;
	ufs_daddr_t daddr;
	size_t size;
{
	struct buf *bp;
	size_t nbytes;

	nbytes = roundup(size, DEV_BSIZE);
	bp = malloc(sizeof(struct buf), M_SEGMENT, M_WAITOK);
	bzero(bp, sizeof(struct buf));
	if (nbytes)
		bp->b_data = malloc(nbytes, M_SEGMENT, M_WAITOK);
	bgetvp(vp, bp);
	bp->b_bufsize = size;
	bp->b_bcount = size;
	bp->b_lblkno = daddr;
	bp->b_blkno = daddr;
	bp->b_error = 0;
	bp->b_resid = 0;
	bp->b_iodone = lfs_callback;
	bp->b_flags |= B_BUSY | B_CALL | B_NOCACHE;
	return (bp);
}

void
lfs_callback(bp)
	struct buf *bp;
{
	struct lfs *fs;

	fs = (struct lfs *)bp->b_saveaddr;
#ifdef DIAGNOSTIC
	if (fs->lfs_iocount == 0)
		panic("lfs_callback: zero iocount");
#endif
	if (--fs->lfs_iocount == 0)
		wakeup(&fs->lfs_iocount);

	brelvp(bp);
	free(bp->b_data, M_SEGMENT);
	free(bp, M_SEGMENT);
}

void
lfs_supercallback(bp)
	struct buf *bp;
{
	brelvp(bp);
	free(bp->b_data, M_SEGMENT);
	free(bp, M_SEGMENT);
}

/*
 * Shellsort (diminishing increment sort) from Data Structures and
 * Algorithms, Aho, Hopcroft and Ullman, 1983 Edition, page 290;
 * see also Knuth Vol. 3, page 84.  The increments are selected from
 * formula (8), page 95.  Roughly O(N^3/2).
 */
/*
 * This is our own private copy of shellsort because we want to sort
 * two parallel arrays (the array of buffer pointers and the array of
 * logical block numbers) simultaneously.  Note that we compare the
 * logical block numbers as unsigned in this routine so that the
 * negative block numbers (meta data blocks) sort AFTER the data blocks.
 */
void
lfs_shellsort(bp_array, lb_array, nmemb)
	struct buf **bp_array;
	ufs_daddr_t *lb_array;
	register int nmemb;
{
	static int __rsshell_increments[] = { 4, 1, 0 };
	register int incr, *incrp, t1, t2;
	struct buf *bp_temp;
	u_long lb_temp;

	for (incrp = __rsshell_increments; incr = *incrp++;)
		for (t1 = incr; t1 < nmemb; ++t1)
			for (t2 = t1 - incr; t2 >= 0;)
				/*
				 * Compare as unsigned so that negative
				 * (meta-data) lbns sort last, as the
				 * comment above promises.
				 */
				if ((u_long)lb_array[t2] >
				    (u_long)lb_array[t2 + incr]) {
					lb_temp = lb_array[t2];
					lb_array[t2] = lb_array[t2 + incr];
					lb_array[t2 + incr] = lb_temp;
					bp_temp = bp_array[t2];
					bp_array[t2] = bp_array[t2 + incr];
					bp_array[t2 + incr] = bp_temp;
					t2 -= incr;
				} else
					break;
}

/*
 * Check VXLOCK.  Return 1 if the vnode is locked.  Otherwise, vget it.
 */
int
lfs_vref(vp)
	register struct vnode *vp;
{

	if (vp->v_flag & VXLOCK)
		return (1);
	return (vget(vp, 0));
}

void
lfs_vunref(vp)
	register struct vnode *vp;
{
	extern int lfs_no_inactive;

	/*
	 * This is vrele except that we do not want to VOP_INACTIVE
	 * this vnode.  Rather than inline vrele here, we use a global
	 * flag to tell lfs_inactive not to run.  Yes, it's gross.
	 */
	lfs_no_inactive = 1;
	vrele(vp);
	lfs_no_inactive = 0;
}
1125