xref: /original-bsd/sys/ufs/lfs/lfs_segment.c (revision 860e07fc)
/*
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 *
 * %sccs.include.redist.c%
 *
 *	@(#)lfs_segment.c	7.35 (Berkeley) 09/03/92
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/kernel.h>
#include <sys/resourcevar.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/conf.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/mount.h>

#include <miscfs/specfs/specdev.h>
#include <miscfs/fifofs/fifo.h>

#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/dir.h>
#include <ufs/ufs/ufsmount.h>

#include <ufs/lfs/lfs.h>
#include <ufs/lfs/lfs_extern.h>

#define MAX_ACTIVE	10
/*
 * Determine if it's OK to start a partial in this segment, or if we need
 * to go on to a new segment.
 */
#define	LFS_PARTIAL_FITS(fs) \
	((fs)->lfs_dbpseg - ((fs)->lfs_offset - (fs)->lfs_curseg) > \
	1 << (fs)->lfs_fsbtodb)
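
/*
 * A worked reading of LFS_PARTIAL_FITS (a sketch; exact field meanings
 * are defined in lfs.h): lfs_dbpseg is the segment size in disk blocks,
 * lfs_offset - lfs_curseg is how far the write pointer has advanced into
 * the current segment, and 1 << lfs_fsbtodb is one file-system block
 * expressed in disk blocks.  A partial segment is started here only if at
 * least one full fs block of room remains past the current offset.
 */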

void	 lfs_callback __P((struct buf *));
void	 lfs_gather __P((struct lfs *, struct segment *,
	     struct vnode *, int (*) __P((struct lfs *, struct buf *))));
int	 lfs_gatherblock __P((struct segment *, struct buf *, int *));
void	 lfs_initseg __P((struct lfs *, struct segment *));
void	 lfs_iset __P((struct inode *, daddr_t, time_t));
int	 lfs_match_data __P((struct lfs *, struct buf *));
int	 lfs_match_dindir __P((struct lfs *, struct buf *));
int	 lfs_match_indir __P((struct lfs *, struct buf *));
int	 lfs_match_tindir __P((struct lfs *, struct buf *));
void	 lfs_newseg __P((struct lfs *));
void	 lfs_shellsort __P((struct buf **, daddr_t *, register int));
void	 lfs_supercallback __P((struct buf *));
void	 lfs_updatemeta __P((struct segment *));
void	 lfs_writefile __P((struct lfs *, struct segment *, struct vnode *));
int	 lfs_writeinode __P((struct lfs *, struct segment *, struct inode *));
int	 lfs_writeseg __P((struct lfs *, struct segment *));
void	 lfs_writesuper __P((struct lfs *, struct segment *));
void	 lfs_writevnodes __P((struct lfs *fs, struct mount *mp,
	    struct segment *sp, int dirops));

int	lfs_allclean_wakeup;		/* Cleaner wakeup address. */

/*
 * Ifile and meta data blocks are not marked busy, so segment writes MUST be
 * single threaded.  Currently, there are two paths into lfs_segwrite, sync()
 * and getnewbuf().  They both mark the file system busy.  Lfs_vflush()
 * explicitly marks the file system busy.  So lfs_segwrite is safe.  I think.
 */

int
lfs_vflush(vp)
	struct vnode *vp;
{
	struct inode *ip;
	struct lfs *fs;
	struct segment *sp;
	int error, s;

	fs = VFSTOUFS(vp->v_mount)->um_lfs;
	if (fs->lfs_nactive > MAX_ACTIVE)
		return (lfs_segwrite(vp->v_mount, 1));

	lfs_seglock(fs);

	/*
	 * Allocate a segment structure and enough space to hold pointers to
	 * the maximum possible number of buffers which can be described in a
	 * single summary block.
	 */
	sp = malloc(sizeof(struct segment), M_SEGMENT, M_WAITOK);
	sp->bpp = malloc(((LFS_SUMMARY_SIZE - sizeof(SEGSUM)) /
	    sizeof(daddr_t) + 1) * sizeof(struct buf *), M_SEGMENT, M_WAITOK);
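	/*
	 * Sizing sketch: a summary block holds a SEGSUM followed by at most
	 * (LFS_SUMMARY_SIZE - sizeof(SEGSUM)) / sizeof(daddr_t) block
	 * addresses, so that many buffers can appear in one partial
	 * segment; the extra "+ 1" slot in bpp is for the summary buffer
	 * itself, which lfs_initseg also stores in this array.
	 */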
	sp->seg_flags = SEGM_CKP;
	sp->vp = NULL;

	/*
	 * Keep a cumulative count of the outstanding I/O operations.  If the
	 * disk drive catches up with us it could go to zero before we finish,
	 * so we artificially increment it by one until we've scheduled all of
	 * the writes we intend to do.
	 */
	s = splbio();
	++fs->lfs_iocount;
	splx(s);

	ip = VTOI(vp);
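	/*
	 * Loop structure, roughly: write the file's dirty blocks, then its
	 * inode; lfs_writeinode and lfs_writeseg each return nonzero when
	 * the write itself re-dirtied the ifile, in which case the pass is
	 * repeated (the outer test matters only when the vnode being
	 * flushed is the ifile itself).
	 */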
	do {
		lfs_initseg(fs, sp);
		do {
			if (vp->v_dirtyblkhd != NULL)
				lfs_writefile(fs, sp, vp);
		} while (lfs_writeinode(fs, sp, ip));
	} while (lfs_writeseg(fs, sp) && ip->i_number == LFS_IFILE_INUM);

	/*
	 * If the I/O count is non-zero, sleep until it reaches zero.  At the
	 * moment, the user's process hangs around so we can sleep.
	 */
	s = splbio();
	if (--fs->lfs_iocount && (error =
	    tsleep(&fs->lfs_iocount, PRIBIO + 1, "lfs vflush", 0))) {
		/* Restore the ipl and release the segment lock on error. */
		splx(s);
		lfs_segunlock(fs);
		free(sp->bpp, M_SEGMENT);
		free(sp, M_SEGMENT);
		return (error);
	}
	splx(s);
	lfs_segunlock(fs);

	/*
	 * XXX
	 * Should we be writing a checkpoint here?
	 */
	free(sp->bpp, M_SEGMENT);
	free(sp, M_SEGMENT);

	return (0);
}

void
lfs_writevnodes(fs, mp, sp, dirops)
	struct lfs *fs;
	struct mount *mp;
	struct segment *sp;
	int dirops;
{
	struct inode *ip;
	struct vnode *vp;
	int error, s;

loop:	for (vp = mp->mnt_mounth; vp; vp = vp->v_mountf) {
		/*
		 * If the vnode that we are about to sync is no longer
		 * associated with this mount point, start over.
		 */
		if (vp->v_mount != mp)
			goto loop;

		if ((dirops && !(vp->v_flag & VDIROP)) ||
		    (!dirops && (vp->v_flag & VDIROP)))
			continue;
		/*
		 * XXX
		 * Up the ref count so we don't get tossed out of
		 * memory.
		 */
		VREF(vp);

		/*
		 * Write the inode/file if dirty and it's not the
		 * IFILE.
		 */
		ip = VTOI(vp);
		if ((ip->i_flag & (IMOD | IACC | IUPD | ICHG) ||
		    vp->v_dirtyblkhd != NULL) &&
		    ip->i_number != LFS_IFILE_INUM) {
			if (vp->v_dirtyblkhd != NULL)
				lfs_writefile(fs, sp, vp);
			(void) lfs_writeinode(fs, sp, ip);
		}
		vp->v_flag &= ~VDIROP;
		vrele(vp);
	}
}

int
lfs_segwrite(mp, do_ckp)
	struct mount *mp;
	int do_ckp;			/* Do a checkpoint. */
{
	struct buf *bp;
	struct inode *ip;
	struct lfs *fs;
	struct segment *sp;
	struct vnode *vp;
	SEGUSE *segusep;
	daddr_t ibno;
	CLEANERINFO *cip;
	int clean, error, i, s;

	fs = VFSTOUFS(mp)->um_lfs;

	/*
	 * If we have two or fewer clean segments, wait until the cleaner
	 * writes some out.
	 */
	do {
		LFS_CLEANERINFO(cip, fs, bp);
		clean = cip->clean;
		brelse(bp);
		if (clean <= 2) {
			printf("segs clean: %d\n", clean);
			wakeup(&lfs_allclean_wakeup);
			if (error = tsleep(&fs->lfs_avail, PRIBIO + 1,
			    "lfs writer", 0))
				return (error);
		}
	} while (clean <= 2);
	lfs_seglock(fs);

	/*
	 * Allocate a segment structure and enough space to hold pointers to
	 * the maximum possible number of buffers which can be described in a
	 * single summary block.
	 */
	do_ckp = do_ckp || fs->lfs_nactive > MAX_ACTIVE;
	sp = malloc(sizeof(struct segment), M_SEGMENT, M_WAITOK);
	sp->bpp = malloc(((LFS_SUMMARY_SIZE - sizeof(SEGSUM)) /
	    sizeof(daddr_t) + 1) * sizeof(struct buf *), M_SEGMENT, M_WAITOK);
	sp->seg_flags = do_ckp ? SEGM_CKP : 0;
	sp->vp = NULL;
	lfs_initseg(fs, sp);

	/*
	 * Keep a cumulative count of the outstanding I/O operations.  If the
	 * disk drive catches up with us it could go to zero before we finish,
	 * so we artificially increment it by one until we've scheduled all of
	 * the writes we intend to do.  If not a checkpoint, we never do the
	 * final decrement, avoiding the wakeup in the callback routine.
	 */
	s = splbio();
	++fs->lfs_iocount;
	splx(s);

	lfs_writevnodes(fs, mp, sp, 0);
	fs->lfs_writer = 1;
	if (fs->lfs_dirops && (error =
	    tsleep(&fs->lfs_writer, PRIBIO + 1, "lfs writer", 0))) {
		/* Release the segment lock on the error path. */
		lfs_segunlock(fs);
		free(sp->bpp, M_SEGMENT);
		free(sp, M_SEGMENT);
		fs->lfs_writer = 0;
		return (error);
	}

	lfs_writevnodes(fs, mp, sp, 1);

	/*
	 * If we are doing a checkpoint, mark everything since the
	 * last checkpoint as no longer ACTIVE.
	 */
	if (do_ckp)
		for (ibno = fs->lfs_cleansz + fs->lfs_segtabsz;
		     --ibno >= fs->lfs_cleansz; ) {
			if (bread(fs->lfs_ivnode, ibno, fs->lfs_bsize,
			    NOCRED, &bp))
				panic("lfs: ifile read");
			segusep = (SEGUSE *)bp->b_un.b_addr;
			for (i = fs->lfs_sepb; i--; segusep++)
				segusep->su_flags &= ~SEGUSE_ACTIVE;

			error = VOP_BWRITE(bp);
		}

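	/*
	 * Writing the ifile can itself dirty the ifile again (inode and
	 * segment usage updates), so on a checkpoint we loop until
	 * lfs_writeseg reports that no further rewrite is needed.
	 */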
	if (do_ckp || fs->lfs_doifile) {
redo:
		vp = fs->lfs_ivnode;
		while (vget(vp));
		ip = VTOI(vp);
		if (vp->v_dirtyblkhd != NULL)
			lfs_writefile(fs, sp, vp);
		(void)lfs_writeinode(fs, sp, ip);
		vput(vp);
		if (lfs_writeseg(fs, sp) && do_ckp) {
			lfs_initseg(fs, sp);
			goto redo;
		}
	} else
		(void) lfs_writeseg(fs, sp);

	/*
	 * If the I/O count is non-zero, sleep until it reaches zero.  At the
	 * moment, the user's process hangs around so we can sleep.
	 */
	fs->lfs_writer = 0;
	fs->lfs_doifile = 0;
	wakeup(&fs->lfs_dirops);

	s = splbio();
	--fs->lfs_iocount;
	if (do_ckp) {
		if (fs->lfs_iocount && (error =
		    tsleep(&fs->lfs_iocount, PRIBIO + 1, "lfs sync", 0))) {
			/* Restore the ipl and release the lock on error. */
			splx(s);
			lfs_segunlock(fs);
			free(sp->bpp, M_SEGMENT);
			free(sp, M_SEGMENT);
			return (error);
		}
		splx(s);
		fs->lfs_nactive = 0;
		lfs_writesuper(fs, sp);
	} else
		splx(s);

	lfs_segunlock(fs);

	free(sp->bpp, M_SEGMENT);
	free(sp, M_SEGMENT);

	return (0);
}

/*
 * Write the dirty blocks associated with a vnode.
 */
void
lfs_writefile(fs, sp, vp)
	struct lfs *fs;
	struct segment *sp;
	struct vnode *vp;
{
	struct buf *bp;
	struct finfo *fip;
	IFILE *ifp;

	if (sp->seg_bytes_left < fs->lfs_bsize ||
	    sp->sum_bytes_left < sizeof(struct finfo)) {
		(void) lfs_writeseg(fs, sp);
		lfs_initseg(fs, sp);
	}
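	/*
	 * Reserve summary space for this file's FINFO header.  The struct
	 * finfo already includes fi_blocks[0], so one daddr_t worth of
	 * space is given back here and charged per block as the blocks are
	 * gathered.
	 */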
	sp->sum_bytes_left -= sizeof(struct finfo) - sizeof(daddr_t);

	fip = sp->fip;
	fip->fi_nblocks = 0;
	fip->fi_ino = VTOI(vp)->i_number;
	LFS_IENTRY(ifp, fs, fip->fi_ino, bp);
	fip->fi_version = ifp->if_version;
	brelse(bp);

	/*
	 * It may not be necessary to write the meta-data blocks at this point,
	 * as the roll-forward recovery code should be able to reconstruct the
	 * list.
	 */
	lfs_gather(fs, sp, vp, lfs_match_data);
	lfs_gather(fs, sp, vp, lfs_match_indir);
	lfs_gather(fs, sp, vp, lfs_match_dindir);
#ifdef TRIPLE
	lfs_gather(fs, sp, vp, lfs_match_tindir);
#endif

	fip = sp->fip;
#ifdef META
	printf("lfs_writefile: adding %d blocks\n", fip->fi_nblocks);
#endif
	if (fip->fi_nblocks != 0) {
		++((SEGSUM *)(sp->segsum))->ss_nfinfo;
		sp->fip =
		    (struct finfo *)((caddr_t)fip + sizeof(struct finfo) +
		    sizeof(daddr_t) * (fip->fi_nblocks - 1));
		sp->start_lbp = &sp->fip->fi_blocks[0];
	} else
		sp->sum_bytes_left += sizeof(struct finfo) - sizeof(daddr_t);
}

int
lfs_writeinode(fs, sp, ip)
	struct lfs *fs;
	struct segment *sp;
	struct inode *ip;
{
	struct buf *bp, *ibp;
	IFILE *ifp;
	SEGUSE *sup;
	daddr_t daddr;
	ino_t ino;
	int error, ndx;
	int redo_ifile = 0;

	if (!(ip->i_flag & (IMOD | IACC | IUPD | ICHG)))
		return (0);

	/* Allocate a new inode block if necessary. */
	if (sp->ibp == NULL) {
		/* Allocate a new segment if necessary. */
		if (sp->seg_bytes_left < fs->lfs_bsize ||
		    sp->sum_bytes_left < sizeof(daddr_t)) {
			(void) lfs_writeseg(fs, sp);
			lfs_initseg(fs, sp);
		}

		/* Get next inode block. */
		daddr = fs->lfs_offset;
		fs->lfs_offset += fsbtodb(fs, 1);
		sp->ibp = *sp->cbpp++ =
		    lfs_newbuf(VTOI(fs->lfs_ivnode)->i_devvp, daddr,
		    fs->lfs_bsize);
		++sp->start_bpp;
		fs->lfs_avail -= fsbtodb(fs, 1);
		/* Set remaining space counters. */
		sp->seg_bytes_left -= fs->lfs_bsize;
		sp->sum_bytes_left -= sizeof(daddr_t);
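		/*
		 * Inode-block addresses are kept as a table growing
		 * backward from the end of the summary block, one daddr_t
		 * per inode block; ndx below indexes that table.  (Sketch:
		 * the first inode block's address lands in the last
		 * daddr_t slot of the summary, the second in the slot
		 * before it, and so on.)
		 */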
		ndx = LFS_SUMMARY_SIZE / sizeof(daddr_t) -
		    sp->ninodes / INOPB(fs) - 1;
		((daddr_t *)(sp->segsum))[ndx] = daddr;
	}

	/* Update the inode times and copy the inode onto the inode page. */
	if (ip->i_flag & IMOD)
		--fs->lfs_uinodes;
	ITIMES(ip, &time, &time);
	ip->i_flag &= ~(IMOD | IACC | IUPD | ICHG);
	bp = sp->ibp;
	bp->b_un.b_dino[sp->ninodes % INOPB(fs)] = ip->i_din;
	/* Increment inode count in segment summary block. */
	++((SEGSUM *)(sp->segsum))->ss_ninos;

	/* If this page is full, set flag to allocate a new page. */
	if (++sp->ninodes % INOPB(fs) == 0)
		sp->ibp = NULL;

	/*
	 * If updating the ifile, update the super-block.  Update the disk
	 * address and access times for this inode in the ifile.
	 */
	ino = ip->i_number;
	if (ino == LFS_IFILE_INUM) {
		daddr = fs->lfs_idaddr;
		fs->lfs_idaddr = bp->b_blkno;
	} else {
		LFS_IENTRY(ifp, fs, ino, ibp);
		daddr = ifp->if_daddr;
		ifp->if_daddr = bp->b_blkno;
		error = VOP_BWRITE(ibp);
	}

	/*
	 * No need to update segment usage if there was no former inode address
	 * or if the last inode address is in the current partial segment.
	 */
	if (daddr != LFS_UNUSED_DADDR &&
	    !(daddr >= fs->lfs_lastpseg && daddr <= bp->b_blkno)) {
		LFS_SEGENTRY(sup, fs, datosn(fs, daddr), bp);
#ifdef DIAGNOSTIC
		if (sup->su_nbytes < sizeof(struct dinode)) {
			/* XXX -- Change to a panic. */
			printf("lfs: negative bytes (segment %d)\n",
			    datosn(fs, daddr));
			panic("negative bytes");
		}
#endif
		sup->su_nbytes -= sizeof(struct dinode);
		redo_ifile =
		    (ino == LFS_IFILE_INUM && !(bp->b_flags & B_GATHERED));
		error = VOP_BWRITE(bp);
	}
	return (redo_ifile);
}

int
lfs_gatherblock(sp, bp, sptr)
	struct segment *sp;
	struct buf *bp;
	int *sptr;
{
	struct lfs *fs;
	int version;

	/*
	 * If full, finish this segment.  We may be doing I/O, so
	 * release and reacquire the splbio().
	 */
#ifdef DIAGNOSTIC
	if (sp->vp == NULL)
		panic("lfs_gatherblock: Null vp in segment");
#endif
	fs = sp->fs;
	if (sp->sum_bytes_left < sizeof(daddr_t) ||
	    sp->seg_bytes_left < fs->lfs_bsize) {
		if (sptr)
			splx(*sptr);
		lfs_updatemeta(sp);

		/* Add the current file to the segment summary. */
		++((SEGSUM *)(sp->segsum))->ss_nfinfo;

		version = sp->fip->fi_version;
		(void) lfs_writeseg(fs, sp);
		lfs_initseg(fs, sp);

		sp->fip->fi_version = version;
		sp->fip->fi_ino = VTOI(sp->vp)->i_number;

		sp->sum_bytes_left -=
		    sizeof(struct finfo) - sizeof(daddr_t);

		if (sptr)
			*sptr = splbio();
		return (1);
	}

	/* Insert into the buffer list, update the FINFO block. */
	bp->b_flags |= B_GATHERED;
	*sp->cbpp++ = bp;
	sp->fip->fi_blocks[sp->fip->fi_nblocks++] = bp->b_lblkno;

	sp->sum_bytes_left -= sizeof(daddr_t);
	sp->seg_bytes_left -= bp->b_bufsize;
	return (0);
}

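/*
 * Gather the dirty blocks of one vnode that satisfy the match function.
 * When lfs_gatherblock fills a partial segment it returns nonzero, and
 * the scan of the dirty list is restarted from the top, since the list
 * may have changed while I/O was in progress.
 */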
void
lfs_gather(fs, sp, vp, match)
	struct lfs *fs;
	struct segment *sp;
	struct vnode *vp;
	int (*match) __P((struct lfs *, struct buf *));
{
	struct buf *bp;
	int s;

	sp->vp = vp;
	s = splbio();
loop:	for (bp = vp->v_dirtyblkhd; bp; bp = bp->b_blockf) {
		if (bp->b_flags & B_BUSY || !match(fs, bp) ||
		    bp->b_flags & B_GATHERED)
			continue;
#ifdef DIAGNOSTIC
		if (!(bp->b_flags & B_DELWRI))
			panic("lfs_gather: bp not B_DELWRI");
		if (!(bp->b_flags & B_LOCKED))
			panic("lfs_gather: bp not B_LOCKED");
#endif
		if (lfs_gatherblock(sp, bp, &s))
			goto loop;
	}
	splx(s);
	lfs_updatemeta(sp);
	sp->vp = NULL;
}

/*
 * Update the metadata that points to the blocks listed in the FINFO
 * array.
 */
void
lfs_updatemeta(sp)
	struct segment *sp;
{
	SEGUSE *sup;
	struct buf *bp;
	struct lfs *fs;
	struct vnode *vp;
	INDIR a[NIADDR], *ap;
	struct inode *ip;
	daddr_t daddr, lbn, off;
	int db_per_fsb, error, i, nblocks, num;

	vp = sp->vp;
	nblocks = &sp->fip->fi_blocks[sp->fip->fi_nblocks] - sp->start_lbp;
	if (vp == NULL || nblocks == 0)
		return;

	/* Sort the blocks. */
	if (!(sp->seg_flags & SEGM_CLEAN))
		lfs_shellsort(sp->start_bpp, sp->start_lbp, nblocks);

	/*
	 * Assign disk addresses, and update references to the logical
	 * block and the segment usage information.
	 */
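	/*
	 * For each block, lfs_bmaparray reports how the logical block is
	 * reached: num == 0 means a direct block (pointer lives in i_db),
	 * num == 1 means a top-level indirect pointer in i_ib, and deeper
	 * chains require patching the parent indirect block on disk.
	 */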
	fs = sp->fs;
	db_per_fsb = fsbtodb(fs, 1);
	for (i = nblocks; i--; ++sp->start_bpp) {
		lbn = *sp->start_lbp++;
		(*sp->start_bpp)->b_blkno = off = fs->lfs_offset;
		fs->lfs_offset += db_per_fsb;

		if (error = lfs_bmaparray(vp, lbn, &daddr, a, &num))
			panic("lfs_updatemeta: lfs_bmaparray %d", error);
		ip = VTOI(vp);
		switch (num) {
		case 0:
			ip->i_db[lbn] = off;
			break;
		case 1:
			ip->i_ib[a[0].in_off] = off;
			break;
		default:
			ap = &a[num - 1];
			if (bread(vp, ap->in_lbn, fs->lfs_bsize, NOCRED, &bp))
				panic("lfs_updatemeta: bread bno %d",
				    ap->in_lbn);
			/*
			 * Bread may create a new indirect block which needs
			 * to get counted for the inode.
			 */
			if (bp->b_blkno == -1 && !(bp->b_flags & B_CACHE)) {
				printf("Updatemeta allocating indirect block: shouldn't happen\n");
				ip->i_blocks += btodb(fs->lfs_bsize);
				fs->lfs_bfree -= btodb(fs->lfs_bsize);
			}
			bp->b_un.b_daddr[ap->in_off] = off;
			VOP_BWRITE(bp);
		}

		/* Update segment usage information. */
		if (daddr != UNASSIGNED) {
			LFS_SEGENTRY(sup, fs, datosn(fs, daddr), bp);
#ifdef DIAGNOSTIC
			if (sup->su_nbytes < fs->lfs_bsize) {
				/* XXX -- Change to a panic. */
				printf("lfs: negative bytes (segment %d)\n",
				    datosn(fs, daddr));
				panic("negative bytes");
			}
#endif
			sup->su_nbytes -= fs->lfs_bsize;
			error = VOP_BWRITE(bp);
		}
	}
}

/*
 * Start a new segment.
 */
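/*
 * Layout sketch of a partial segment as built here: the summary block
 * holds a SEGSUM header, followed by FINFO records growing forward,
 * while the disk addresses of inode blocks grow backward from the end
 * of the block; sum_bytes_left tracks the space between the two.
 */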
void
lfs_initseg(fs, sp)
	struct lfs *fs;
	struct segment *sp;
{
	SEGUSE *sup;
	SEGSUM *ssp;
	struct buf *bp;
	daddr_t lbn, *lbnp;

	/* Advance to the next segment. */
	if (!LFS_PARTIAL_FITS(fs)) {
		/* Wake up any cleaning procs waiting on this file system. */
		wakeup(&fs->lfs_nextseg);
		wakeup(&lfs_allclean_wakeup);

		lfs_newseg(fs);
		fs->lfs_offset = fs->lfs_curseg;
		sp->seg_number = datosn(fs, fs->lfs_curseg);
		sp->seg_bytes_left = fs->lfs_dbpseg * DEV_BSIZE;

		/*
		 * If the segment contains a superblock, update the offset
		 * and summary address to skip over it.
		 */
		LFS_SEGENTRY(sup, fs, sp->seg_number, bp);
		if (sup->su_flags & SEGUSE_SUPERBLOCK) {
			fs->lfs_offset += LFS_SBPAD / DEV_BSIZE;
			sp->seg_bytes_left -= LFS_SBPAD;
		}
		brelse(bp);
	} else {
		sp->seg_number = datosn(fs, fs->lfs_curseg);
		sp->seg_bytes_left = (fs->lfs_dbpseg -
		    (fs->lfs_offset - fs->lfs_curseg)) * DEV_BSIZE;
	}
	fs->lfs_lastpseg = fs->lfs_offset;

	sp->fs = fs;
	sp->ibp = NULL;
	sp->ninodes = 0;

	/* Get a new buffer for SEGSUM and enter it into the buffer list. */
	sp->cbpp = sp->bpp;
	*sp->cbpp = lfs_newbuf(VTOI(fs->lfs_ivnode)->i_devvp, fs->lfs_offset,
	    LFS_SUMMARY_SIZE);
	sp->segsum = (*sp->cbpp)->b_un.b_addr;
	sp->start_bpp = ++sp->cbpp;
	fs->lfs_offset += LFS_SUMMARY_SIZE / DEV_BSIZE;

	/* Set pointer to SEGSUM, initialize it. */
	ssp = sp->segsum;
	ssp->ss_next = fs->lfs_nextseg;
	ssp->ss_nfinfo = ssp->ss_ninos = 0;

	/* Set pointer to first FINFO, initialize it. */
	sp->fip = (struct finfo *)(sp->segsum + sizeof(SEGSUM));
	sp->fip->fi_nblocks = 0;
	sp->start_lbp = &sp->fip->fi_blocks[0];

	sp->seg_bytes_left -= LFS_SUMMARY_SIZE;
	sp->sum_bytes_left = LFS_SUMMARY_SIZE - sizeof(SEGSUM);
}

/*
 * Return the next segment to write.
 */
void
lfs_newseg(fs)
	struct lfs *fs;
{
	CLEANERINFO *cip;
	SEGUSE *sup;
	struct buf *bp;
	int curseg, error, isdirty, sn;

	LFS_SEGENTRY(sup, fs, datosn(fs, fs->lfs_nextseg), bp);
	sup->su_flags |= SEGUSE_DIRTY | SEGUSE_ACTIVE;
	sup->su_nbytes = 0;
	sup->su_nsums = 0;
	sup->su_ninos = 0;
	(void) VOP_BWRITE(bp);

	LFS_CLEANERINFO(cip, fs, bp);
	--cip->clean;
	++cip->dirty;
	(void) VOP_BWRITE(bp);

	fs->lfs_lastseg = fs->lfs_curseg;
	fs->lfs_curseg = fs->lfs_nextseg;
	for (sn = curseg = datosn(fs, fs->lfs_curseg);;) {
		sn = (sn + 1) % fs->lfs_nseg;
		if (sn == curseg)
			panic("lfs_nextseg: no clean segments");
		LFS_SEGENTRY(sup, fs, sn, bp);
		isdirty = sup->su_flags & SEGUSE_DIRTY;
		brelse(bp);
		if (!isdirty)
			break;
	}

	++fs->lfs_nactive;
	fs->lfs_nextseg = sntoda(fs, sn);
}

int
lfs_writeseg(fs, sp)
	struct lfs *fs;
	struct segment *sp;
{
	extern int locked_queue_count;
	struct buf **bpp, *bp, *cbp;
	SEGUSE *sup;
	SEGSUM *ssp;
	dev_t i_dev;
	size_t size;
	u_long *datap, *dp;
	int ch_per_blk, do_again, error, i, nblocks, num, s;
	int (*strategy) __P((struct vop_strategy_args *));
	struct vop_strategy_args vop_strategy_a;
	u_short ninos;
	char *p;

	/*
	 * If there are no buffers other than the segment summary to write
	 * and it is not a checkpoint, don't do anything.  On a checkpoint,
	 * even if there aren't any buffers, you need to write the superblock.
	 */
	if ((nblocks = sp->cbpp - sp->bpp) == 1 && !(sp->seg_flags & SEGM_CKP))
		return (0);

	ssp = (SEGSUM *)sp->segsum;

	/* Update the segment usage information. */
	LFS_SEGENTRY(sup, fs, sp->seg_number, bp);
	ninos = (ssp->ss_ninos + INOPB(fs) - 1) / INOPB(fs);
	sup->su_nbytes += (nblocks - 1 - ninos) << fs->lfs_bshift;
	sup->su_nbytes += ssp->ss_ninos * sizeof(struct dinode);
	sup->su_nbytes += LFS_SUMMARY_SIZE;
	sup->su_lastmod = time.tv_sec;
	sup->su_ninos += ninos;
	++sup->su_nsums;
	do_again = !(bp->b_flags & B_GATHERED);
	(void)VOP_BWRITE(bp);
	/*
	 * Compute checksum across data and then across summary; the first
	 * block (the summary block) is skipped.  Set the create time here
	 * so that it's guaranteed to be later than the inode mod times.
	 *
	 * XXX
	 * Fix this to do it inline, instead of malloc/copy.
	 */
	datap = dp = malloc(nblocks * sizeof(u_long), M_SEGMENT, M_WAITOK);
	for (bpp = sp->bpp, i = nblocks - 1; i--;) {
		if ((*++bpp)->b_flags & B_INVAL) {
			if (copyin((*bpp)->b_saveaddr, dp++, sizeof(u_long)))
				panic("lfs_writeseg: copyin failed");
		} else
			*dp++ = (*bpp)->b_un.b_words[0];
	}
	ssp->ss_create = time.tv_sec;
	ssp->ss_datasum = cksum(datap, (nblocks - 1) * sizeof(u_long));
	ssp->ss_sumsum =
	    cksum(&ssp->ss_datasum, LFS_SUMMARY_SIZE - sizeof(ssp->ss_sumsum));
	free(datap, M_SEGMENT);
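	/*
	 * Note that the data checksum covers only the first word of each
	 * data block, which is presumably enough for roll-forward to detect
	 * a partial segment write: a block that never made it to disk is
	 * unlikely to match the word recorded in the summary.
	 */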
#ifdef DIAGNOSTIC
	if (fs->lfs_bfree < fsbtodb(fs, ninos) + LFS_SUMMARY_SIZE / DEV_BSIZE)
		panic("lfs_writeseg: No diskspace for summary");
#endif
	fs->lfs_bfree -= (fsbtodb(fs, ninos) + LFS_SUMMARY_SIZE / DEV_BSIZE);

	i_dev = VTOI(fs->lfs_ivnode)->i_dev;
	strategy = VTOI(fs->lfs_ivnode)->i_devvp->v_op[VOFFSET(vop_strategy)];

	/*
	 * When we simply write the blocks we lose a rotation for every block
	 * written.  To avoid this problem, we allocate memory in chunks, copy
	 * the buffers into the chunk and write the chunk.  56K was chosen
	 * because some drivers/controllers can't handle transfers that don't
	 * fit in an unsigned 16-bit count.  When the data is copied to the
	 * chunk, turn off the B_LOCKED bit and brelse the buffer (which will
	 * move them to the LRU list).  Add the B_CALL flag to the buffer
	 * header so we can count I/O's for the checkpoints and so we can
	 * release the allocated memory.
	 *
	 * XXX
	 * This should be removed if the new virtual memory system allows us to
	 * easily make the buffers contiguous in kernel memory and if that's
	 * fast enough.
	 */
#define	LFS_CHUNKSIZE	(56 * 1024)
	ch_per_blk = LFS_CHUNKSIZE / fs->lfs_bsize;
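	/*
	 * For illustration (assuming an 8K fs block size): ch_per_blk is
	 * 57344 / 8192 = 7, so each pass below copies up to seven blocks
	 * into one contiguous chunk and issues a single write for them.
	 */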
	for (bpp = sp->bpp, i = nblocks; i;) {
		num = ch_per_blk;
		if (num > i)
			num = i;
		i -= num;
		size = num * fs->lfs_bsize;

		cbp = lfs_newbuf(VTOI(fs->lfs_ivnode)->i_devvp,
		    (*bpp)->b_blkno, size);
		cbp->b_dev = i_dev;
		cbp->b_flags |= B_ASYNC | B_BUSY;

		s = splbio();
		++fs->lfs_iocount;
		for (p = cbp->b_un.b_addr; num--;) {
			bp = *bpp++;
			/*
			 * Fake buffers from the cleaner are marked as B_INVAL.
			 * We need to copy the data from user space rather than
			 * from the buffer indicated.
			 * XXX -- what do I do on an error?
			 */
			if (bp->b_flags & B_INVAL) {
				if (copyin(bp->b_saveaddr, p, bp->b_bcount))
					panic("lfs_writeseg: copyin failed");
			} else
				bcopy(bp->b_un.b_addr, p, bp->b_bcount);
			p += bp->b_bcount;
			if (bp->b_flags & B_LOCKED)
				--locked_queue_count;
			bp->b_flags &= ~(B_ERROR | B_READ | B_DELWRI |
			     B_LOCKED | B_GATHERED);
			if (bp->b_flags & B_CALL) {
				/* If B_CALL, it was created with newbuf. */
				brelvp(bp);
				free(bp, M_SEGMENT);
			} else {
				bremfree(bp);
				reassignbuf(bp, bp->b_vp);
				brelse(bp);
			}
		}
		++cbp->b_vp->v_numoutput;
		splx(s);
		cbp->b_bcount = p - cbp->b_un.b_addr;
		/*
		 * XXXX This is a gross and disgusting hack.  Since these
		 * buffers are physically addressed, they hang off the
		 * device vnode (devvp).  As a result, they have no way
		 * of getting to the LFS superblock or lfs structure to
		 * keep track of the number of I/O's pending.  So, I am
		 * going to stuff the fs into the saveaddr field of
		 * the buffer (yuk).
		 */
		cbp->b_saveaddr = (caddr_t)fs;
		vop_strategy_a.a_desc = VDESC(vop_strategy);
		vop_strategy_a.a_bp = cbp;
		(strategy)(&vop_strategy_a);
	}
	return (do_again);
}

void
lfs_writesuper(fs, sp)
	struct lfs *fs;
	struct segment *sp;
{
	struct buf *bp;
	dev_t i_dev;
	int (*strategy) __P((struct vop_strategy_args *));
	int s;
	struct vop_strategy_args vop_strategy_a;

	i_dev = VTOI(fs->lfs_ivnode)->i_dev;
	strategy = VTOI(fs->lfs_ivnode)->i_devvp->v_op[VOFFSET(vop_strategy)];

	/* Checksum the superblock and copy it into a buffer. */
	fs->lfs_cksum = cksum(fs, sizeof(struct lfs) - sizeof(fs->lfs_cksum));
	bp = lfs_newbuf(VTOI(fs->lfs_ivnode)->i_devvp, fs->lfs_sboffs[0],
	    LFS_SBPAD);
	*bp->b_un.b_lfs = *fs;

	/* Write the first superblock (wait). */
	bp->b_dev = i_dev;
	bp->b_flags |= B_BUSY;
	bp->b_flags &= ~(B_DONE | B_CALL | B_ERROR | B_READ | B_DELWRI);
	vop_strategy_a.a_desc = VDESC(vop_strategy);
	vop_strategy_a.a_bp = bp;
	s = splbio();
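	/*
	 * Bump v_numoutput by two: once for the synchronous write issued
	 * here and once for the asynchronous copy issued below.
	 */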
	bp->b_vp->v_numoutput += 2;
	splx(s);
	(strategy)(&vop_strategy_a);
	biowait(bp);

	/* Write the second superblock (don't wait). */
	bp->b_blkno = bp->b_lblkno = fs->lfs_sboffs[1];
	bp->b_flags |= B_CALL | B_ASYNC | B_BUSY;
	bp->b_flags &= ~(B_DONE | B_ERROR | B_READ | B_DELWRI);
	bp->b_iodone = lfs_supercallback;
	(strategy)(&vop_strategy_a);
}

/*
 * Logical block number match routines used when traversing the dirty block
 * chain.
 */
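/*
 * Background for the arithmetic below: in this code indirect blocks carry
 * negative logical block numbers, and the expression (-lbn - NDADDR)
 * taken modulo NINDIR(fs) distinguishes the level -- 0 for single
 * indirect, 1 for double indirect and 2 for triple indirect blocks.
 */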
int
lfs_match_data(fs, bp)
	struct lfs *fs;
	struct buf *bp;
{
	return (bp->b_lblkno >= 0);
}

int
lfs_match_indir(fs, bp)
	struct lfs *fs;
	struct buf *bp;
{
	int lbn;

	lbn = bp->b_lblkno;
	return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 0);
}

int
lfs_match_dindir(fs, bp)
	struct lfs *fs;
	struct buf *bp;
{
	int lbn;

	lbn = bp->b_lblkno;
	return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 1);
}

int
lfs_match_tindir(fs, bp)
	struct lfs *fs;
	struct buf *bp;
{
	int lbn;

	lbn = bp->b_lblkno;
	return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 2);
}

/*
 * Allocate a new buffer header.
 */
struct buf *
lfs_newbuf(vp, daddr, size)
	struct vnode *vp;
	daddr_t daddr;
	size_t size;
{
	struct buf *bp;
	size_t nbytes;

	nbytes = roundup(size, DEV_BSIZE);
	bp = malloc(sizeof(struct buf) + nbytes, M_SEGMENT, M_WAITOK);
	bzero(bp, sizeof(struct buf) + nbytes);
	bgetvp(vp, bp);
	bp->b_un.b_addr = (caddr_t)(bp + 1);
	bp->b_bufsize = size;
	bp->b_bcount = size;
	bp->b_lblkno = daddr;
	bp->b_blkno = daddr;
	bp->b_error = 0;
	bp->b_resid = 0;
	bp->b_iodone = lfs_callback;
	bp->b_flags |= B_BUSY | B_CALL | B_NOCACHE;
	return (bp);
}

void
lfs_callback(bp)
	struct buf *bp;
{
	struct lfs *fs;

	fs = (struct lfs *)bp->b_saveaddr;
#ifdef DIAGNOSTIC
	if (fs->lfs_iocount == 0)
		panic("lfs_callback: zero iocount");
#endif
	if (--fs->lfs_iocount == 0)
		wakeup(&fs->lfs_iocount);

	brelvp(bp);
	free(bp, M_SEGMENT);
}

void
lfs_supercallback(bp)
	struct buf *bp;
{
	brelvp(bp);
	free(bp, M_SEGMENT);
}

/*
 * Shellsort (diminishing increment sort) from Data Structures and
 * Algorithms, Aho, Hopcroft and Ullman, 1983 Edition, page 290;
 * see also Knuth Vol. 3, page 84.  The increments are selected from
 * formula (8), page 95.  Roughly O(N^3/2).
 */
/*
 * This is our own private copy of shellsort because we want to sort
 * two parallel arrays (the array of buffer pointers and the array of
 * logical block numbers) simultaneously.  Note that we cast the array
 * of logical block numbers to an unsigned in this routine so that the
 * negative block numbers (meta data blocks) sort AFTER the data blocks.
 */
void
lfs_shellsort(bp_array, lb_array, nmemb)
	struct buf **bp_array;
	daddr_t *lb_array;
	register int nmemb;
{
	static int __rsshell_increments[] = { 4, 1, 0 };
	register int incr, *incrp, t1, t2;
	struct buf *bp_temp;
	u_long lb_temp;

	for (incrp = __rsshell_increments; incr = *incrp++;)
		for (t1 = incr; t1 < nmemb; ++t1)
			for (t2 = t1 - incr; t2 >= 0;)
				if ((u_long)lb_array[t2] >
				    (u_long)lb_array[t2 + incr]) {
					lb_temp = lb_array[t2];
					lb_array[t2] = lb_array[t2 + incr];
					lb_array[t2 + incr] = lb_temp;
					bp_temp = bp_array[t2];
					bp_array[t2] = bp_array[t2 + incr];
					bp_array[t2 + incr] = bp_temp;
					t2 -= incr;
				} else
					break;
}