xref: /freebsd/sys/fs/ext2fs/ext2_alloc.c (revision aa0a1e58)
1 /*-
2  *  modified for Lites 1.1
3  *
4  *  Aug 1995, Godmar Back (gback@cs.utah.edu)
5  *  University of Utah, Department of Computer Science
6  */
7 /*-
8  * Copyright (c) 1982, 1986, 1989, 1993
9  *	The Regents of the University of California.  All rights reserved.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  * 4. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  *
35  *	@(#)ffs_alloc.c	8.8 (Berkeley) 2/21/94
36  * $FreeBSD$
37  */
38 
39 #include <sys/param.h>
40 #include <sys/systm.h>
41 #include <sys/conf.h>
42 #include <sys/vnode.h>
43 #include <sys/stat.h>
44 #include <sys/mount.h>
45 #include <sys/syslog.h>
46 #include <sys/buf.h>
47 
48 #include <fs/ext2fs/inode.h>
49 #include <fs/ext2fs/ext2_mount.h>
50 #include <fs/ext2fs/ext2fs.h>
51 #include <fs/ext2fs/fs.h>
52 #include <fs/ext2fs/ext2_extern.h>
53 
/*
 * Forward declarations for the helpers that are private to this file.
 */
static daddr_t	ext2_alloccg(struct inode *, int, daddr_t, int);
static u_long	ext2_dirpref(struct inode *);
static void	ext2_fserr(struct m_ext2fs *, uid_t, char *);
/*
 * The last argument of ext2_hashalloc() is the per-group allocator that is
 * invoked for each cylinder group tried (ext2_alloccg or ext2_nodealloccg).
 */
static u_long	ext2_hashalloc(struct inode *, int, long, int,
				daddr_t (*)(struct inode *, int, daddr_t,
						int));
static daddr_t	ext2_nodealloccg(struct inode *, int, daddr_t, int);
static daddr_t  ext2_mapsearch(struct m_ext2fs *, char *, daddr_t);
/*
 * NOTE(review): this declaration is static, while the definition of
 * ext2_reallocblks() further down carries no storage class -- confirm the
 * intended linkage before enabling FANCY_REALLOC.
 */
#ifdef FANCY_REALLOC
static int	ext2_reallocblks(struct vop_reallocblks_args *);
#endif
65 
66 /*
67  * Allocate a block in the file system.
68  *
69  * A preference may be optionally specified. If a preference is given
70  * the following hierarchy is used to allocate a block:
71  *   1) allocate the requested block.
72  *   2) allocate a rotationally optimal block in the same cylinder.
73  *   3) allocate a block in the same cylinder group.
74  *   4) quadradically rehash into other cylinder groups, until an
75  *        available block is located.
76  * If no block preference is given the following hierarchy is used
77  * to allocate a block:
78  *   1) allocate a block in the cylinder group that contains the
79  *        inode for the file.
80  *   2) quadradically rehash into other cylinder groups, until an
81  *        available block is located.
82  */
83 int
84 ext2_alloc(ip, lbn, bpref, size, cred, bnp)
85 	struct inode *ip;
86 	int32_t lbn, bpref;
87 	int size;
88 	struct ucred *cred;
89 	int32_t *bnp;
90 {
91 	struct m_ext2fs *fs;
92 	struct ext2mount *ump;
93 	int32_t bno;
94 	int cg;
95 	*bnp = 0;
96 	fs = ip->i_e2fs;
97 	ump = ip->i_ump;
98 	mtx_assert(EXT2_MTX(ump), MA_OWNED);
99 #ifdef DIAGNOSTIC
100 	if ((u_int)size > fs->e2fs_bsize || blkoff(fs, size) != 0) {
101 		vn_printf(ip->i_devvp, "bsize = %lu, size = %d, fs = %s\n",
102 		    (long unsigned int)fs->e2fs_bsize, size, fs->e2fs_fsmnt);
103 		panic("ext2_alloc: bad size");
104 	}
105 	if (cred == NOCRED)
106 		panic("ext2_alloc: missing credential");
107 #endif /* DIAGNOSTIC */
108 	if (size == fs->e2fs_bsize && fs->e2fs->e2fs_fbcount == 0)
109 		goto nospace;
110 	if (cred->cr_uid != 0 &&
111 		fs->e2fs->e2fs_fbcount < fs->e2fs->e2fs_rbcount)
112 		goto nospace;
113 	if (bpref >= fs->e2fs->e2fs_bcount)
114 		bpref = 0;
115 	if (bpref == 0)
116                 cg = ino_to_cg(fs, ip->i_number);
117         else
118                 cg = dtog(fs, bpref);
119         bno = (daddr_t)ext2_hashalloc(ip, cg, bpref, fs->e2fs_bsize,
120                                                  ext2_alloccg);
121         if (bno > 0) {
122 		/* set next_alloc fields as done in block_getblk */
123 		ip->i_next_alloc_block = lbn;
124 		ip->i_next_alloc_goal = bno;
125 
126                 ip->i_blocks += btodb(fs->e2fs_bsize);
127                 ip->i_flag |= IN_CHANGE | IN_UPDATE;
128                 *bnp = bno;
129                 return (0);
130         }
131 nospace:
132 	EXT2_UNLOCK(ump);
133 	ext2_fserr(fs, cred->cr_uid, "file system full");
134 	uprintf("\n%s: write failed, file system is full\n", fs->e2fs_fsmnt);
135 	return (ENOSPC);
136 }
137 
138 /*
139  * Reallocate a sequence of blocks into a contiguous sequence of blocks.
140  *
141  * The vnode and an array of buffer pointers for a range of sequential
142  * logical blocks to be made contiguous is given. The allocator attempts
143  * to find a range of sequential blocks starting as close as possible to
144  * an fs_rotdelay offset from the end of the allocation for the logical
145  * block immediately preceding the current range. If successful, the
146  * physical block numbers in the buffer pointers and in the inode are
147  * changed to reflect the new allocation. If unsuccessful, the allocation
148  * is left unchanged. The success in doing the reallocation is returned.
149  * Note that the error return is not reflected back to the user. Rather
150  * the previous block allocation will be used.
151  */
152 
153 #ifdef FANCY_REALLOC
154 SYSCTL_NODE(_vfs, OID_AUTO, ext2fs, CTLFLAG_RW, 0, "EXT2FS filesystem");
155 
156 static int doasyncfree = 1;
157 SYSCTL_INT(_vfs_ext2fs, OID_AUTO, doasyncfree, CTLFLAG_RW, &doasyncfree, 0,
158     "Use asychronous writes to update block pointers when freeing blocks");
159 
160 static int doreallocblks = 1;
161 SYSCTL_INT(_vfs_ext2fs, OID_AUTO, doreallocblks, CTLFLAG_RW, &doreallocblks, 0, "");
162 #endif
163 
164 int
165 ext2_reallocblks(ap)
166 	struct vop_reallocblks_args /* {
167 		struct vnode *a_vp;
168 		struct cluster_save *a_buflist;
169 	} */ *ap;
170 {
171 #ifndef FANCY_REALLOC
172 /* printf("ext2_reallocblks not implemented\n"); */
173 return ENOSPC;
174 #else
175 
176 	struct m_ext2fs *fs;
177 	struct inode *ip;
178 	struct vnode *vp;
179 	struct buf *sbp, *ebp;
180 	int32_t *bap, *sbap, *ebap = 0;
181 	struct ext2mount *ump;
182 	struct cluster_save *buflist;
183 	struct indir start_ap[NIADDR + 1], end_ap[NIADDR + 1], *idp;
184 	int32_t start_lbn, end_lbn, soff, newblk, blkno =0;
185 	int i, len, start_lvl, end_lvl, pref, ssize;
186 
187 	vp = ap->a_vp;
188 	ip = VTOI(vp);
189 	fs = ip->i_e2fs;
190 	ump = ip->i_ump;
191 #ifdef UNKLAR
192 	if (fs->fs_contigsumsize <= 0)
193 		return (ENOSPC);
194 #endif
195 	buflist = ap->a_buflist;
196 	len = buflist->bs_nchildren;
197 	start_lbn = buflist->bs_children[0]->b_lblkno;
198 	end_lbn = start_lbn + len - 1;
199 #ifdef DIAGNOSTIC
200 	for (i = 1; i < len; i++)
201 		if (buflist->bs_children[i]->b_lblkno != start_lbn + i)
202 			panic("ext2_reallocblks: non-cluster");
203 #endif
204 	/*
205 	 * If the latest allocation is in a new cylinder group, assume that
206 	 * the filesystem has decided to move and do not force it back to
207 	 * the previous cylinder group.
208 	 */
209 	if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) !=
210 	    dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno)))
211 		return (ENOSPC);
212 	if (ext2_getlbns(vp, start_lbn, start_ap, &start_lvl) ||
213 	    ext2_getlbns(vp, end_lbn, end_ap, &end_lvl))
214 		return (ENOSPC);
215 	/*
216 	 * Get the starting offset and block map for the first block.
217 	 */
218 	if (start_lvl == 0) {
219 		sbap = &ip->i_db[0];
220 		soff = start_lbn;
221 	} else {
222 		idp = &start_ap[start_lvl - 1];
223 		if (bread(vp, idp->in_lbn, (int)fs->e2fs_bsize, NOCRED, &sbp)) {
224 			brelse(sbp);
225 			return (ENOSPC);
226 		}
227 		sbap = (int32_t *)sbp->b_data;
228 		soff = idp->in_off;
229 	}
230 	/*
231 	 * Find the preferred location for the cluster.
232 	 */
233 	EXT2_LOCK(ump);
234 	pref = ext2_blkpref(ip, start_lbn, soff, sbap, blkno);
235 	/*
236 	 * If the block range spans two block maps, get the second map.
237 	 */
238 	if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) {
239 		ssize = len;
240 	} else {
241 #ifdef DIAGNOSTIC
242 		if (start_ap[start_lvl-1].in_lbn == idp->in_lbn)
243 			panic("ext2_reallocblk: start == end");
244 #endif
245 		ssize = len - (idp->in_off + 1);
246 		if (bread(vp, idp->in_lbn, (int)fs->e2fs_bsize, NOCRED, &ebp)){
247 			EXT2_UNLOCK(ump);
248 			goto fail;
249 		}
250 		ebap = (int32_t *)ebp->b_data;
251 	}
252 	/*
253 	 * Search the block map looking for an allocation of the desired size.
254 	 */
255 	if ((newblk = (int32_t)ext2_hashalloc(ip, dtog(fs, pref), pref,
256 	    len, ext2_clusteralloc)) == 0){
257 		EXT2_UNLOCK(ump);
258 		goto fail;
259 	}
260 	/*
261 	 * We have found a new contiguous block.
262 	 *
263 	 * First we have to replace the old block pointers with the new
264 	 * block pointers in the inode and indirect blocks associated
265 	 * with the file.
266 	 */
267 	blkno = newblk;
268 	for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->e2fs_fpb) {
269 		if (i == ssize)
270 			bap = ebap;
271 			soff = -i;
272 #ifdef DIAGNOSTIC
273 		if (buflist->bs_children[i]->b_blkno != fsbtodb(fs, *bap))
274 			panic("ext2_reallocblks: alloc mismatch");
275 #endif
276 		*bap++ = blkno;
277 	}
278 	/*
279 	 * Next we must write out the modified inode and indirect blocks.
280 	 * For strict correctness, the writes should be synchronous since
281 	 * the old block values may have been written to disk. In practise
282 	 * they are almost never written, but if we are concerned about
283 	 * strict correctness, the `doasyncfree' flag should be set to zero.
284 	 *
285 	 * The test on `doasyncfree' should be changed to test a flag
286 	 * that shows whether the associated buffers and inodes have
287 	 * been written. The flag should be set when the cluster is
288 	 * started and cleared whenever the buffer or inode is flushed.
289 	 * We can then check below to see if it is set, and do the
290 	 * synchronous write only when it has been cleared.
291 	 */
292 	if (sbap != &ip->i_db[0]) {
293 		if (doasyncfree)
294 			bdwrite(sbp);
295 		else
296 			bwrite(sbp);
297 	} else {
298 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
299 		if (!doasyncfree)
300 			ext2_update(vp, 1);
301 	}
302 	if (ssize < len) {
303 		if (doasyncfree)
304 			bdwrite(ebp);
305 		else
306 			bwrite(ebp);
307 	}
308 	/*
309 	 * Last, free the old blocks and assign the new blocks to the buffers.
310 	 */
311 	for (blkno = newblk, i = 0; i < len; i++, blkno += fs->e2fs_fpb) {
312 		ext2_blkfree(ip, dbtofsb(fs, buflist->bs_children[i]->b_blkno),
313 		    fs->e2fs_bsize);
314 		buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno);
315 	}
316 	return (0);
317 
318 fail:
319 	if (ssize < len)
320 		brelse(ebp);
321 	if (sbap != &ip->i_db[0])
322 		brelse(sbp);
323 	return (ENOSPC);
324 
325 #endif /* FANCY_REALLOC */
326 }
327 
328 /*
329  * Allocate an inode in the file system.
330  *
331  */
332 int
333 ext2_valloc(pvp, mode, cred, vpp)
334 	struct vnode *pvp;
335 	int mode;
336 	struct ucred *cred;
337 	struct vnode **vpp;
338 {
339 	struct inode *pip;
340 	struct m_ext2fs *fs;
341 	struct inode *ip;
342 	struct ext2mount *ump;
343 	ino_t ino, ipref;
344 	int i, error, cg;
345 
346 	*vpp = NULL;
347 	pip = VTOI(pvp);
348 	fs = pip->i_e2fs;
349 	ump = pip->i_ump;
350 
351 	EXT2_LOCK(ump);
352 	if (fs->e2fs->e2fs_ficount == 0)
353 		goto noinodes;
354 	/*
355 	 * If it is a directory then obtain a cylinder group based on
356 	 * ext2_dirpref else obtain it using ino_to_cg. The preferred inode is
357 	 * always the next inode.
358 	 */
359 	if((mode & IFMT) == IFDIR) {
360 		cg = ext2_dirpref(pip);
361 		if (fs->e2fs_contigdirs[cg] < 255)
362 			fs->e2fs_contigdirs[cg]++;
363 	} else {
364 		cg = ino_to_cg(fs, pip->i_number);
365 		if (fs->e2fs_contigdirs[cg] > 0)
366 			fs->e2fs_contigdirs[cg]--;
367 	}
368 	ipref = cg * fs->e2fs->e2fs_ipg + 1;
369 	ino = (ino_t)ext2_hashalloc(pip, cg, (long)ipref, mode, ext2_nodealloccg);
370 
371 	if (ino == 0)
372 		goto noinodes;
373 	error = VFS_VGET(pvp->v_mount, ino, LK_EXCLUSIVE, vpp);
374 	if (error) {
375 		ext2_vfree(pvp, ino, mode);
376 		return (error);
377 	}
378 	ip = VTOI(*vpp);
379 
380 	/*
381 	  the question is whether using VGET was such good idea at all -
382 	  Linux doesn't read the old inode in when it's allocating a
383 	  new one. I will set at least i_size & i_blocks the zero.
384 	*/
385 	ip->i_mode = 0;
386 	ip->i_size = 0;
387 	ip->i_blocks = 0;
388 	ip->i_flags = 0;
389         /* now we want to make sure that the block pointers are zeroed out */
390         for (i = 0; i < NDADDR; i++)
391                 ip->i_db[i] = 0;
392         for (i = 0; i < NIADDR; i++)
393                 ip->i_ib[i] = 0;
394 
395 	/*
396 	 * Set up a new generation number for this inode.
397 	 * XXX check if this makes sense in ext2
398 	 */
399 	if (ip->i_gen == 0 || ++ip->i_gen == 0)
400 		ip->i_gen = random() / 2 + 1;
401 /*
402 printf("ext2_valloc: allocated inode %d\n", ino);
403 */
404 	return (0);
405 noinodes:
406 	EXT2_UNLOCK(ump);
407 	ext2_fserr(fs, cred->cr_uid, "out of inodes");
408 	uprintf("\n%s: create/symlink failed, no inodes free\n", fs->e2fs_fsmnt);
409 	return (ENOSPC);
410 }
411 
412 /*
413  * Find a cylinder to place a directory.
414  *
415  * The policy implemented by this algorithm is to allocate a
416  * directory inode in the same cylinder group as its parent
417  * directory, but also to reserve space for its files inodes
418  * and data. Restrict the number of directories which may be
419  * allocated one after another in the same cylinder group
420  * without intervening allocation of files.
421  *
422  * If we allocate a first level directory then force allocation
423  * in another cylinder group.
424  *
425  */
426 static u_long
427 ext2_dirpref(struct inode *pip)
428 {
429 	struct m_ext2fs *fs;
430         int cg, prefcg, dirsize, cgsize;
431 	int avgifree, avgbfree, avgndir, curdirsize;
432 	int minifree, minbfree, maxndir;
433 	int mincg, minndir;
434 	int maxcontigdirs;
435 
436 	mtx_assert(EXT2_MTX(pip->i_ump), MA_OWNED);
437 	fs = pip->i_e2fs;
438 
439  	avgifree = fs->e2fs->e2fs_ficount / fs->e2fs_gcount;
440 	avgbfree = fs->e2fs->e2fs_fbcount / fs->e2fs_gcount;
441 	avgndir  = fs->e2fs_total_dir / fs->e2fs_gcount;
442 
443 	/*
444 	 * Force allocation in another cg if creating a first level dir.
445 	 */
446 	ASSERT_VOP_LOCKED(ITOV(pip), "ext2fs_dirpref");
447 	if (ITOV(pip)->v_vflag & VV_ROOT) {
448 		prefcg = arc4random() % fs->e2fs_gcount;
449 		mincg = prefcg;
450 		minndir = fs->e2fs_ipg;
451 		for (cg = prefcg; cg < fs->e2fs_gcount; cg++)
452 			if (fs->e2fs_gd[cg].ext2bgd_ndirs < minndir &&
453 			    fs->e2fs_gd[cg].ext2bgd_nifree >= avgifree &&
454 			    fs->e2fs_gd[cg].ext2bgd_nbfree >= avgbfree) {
455 				mincg = cg;
456 				minndir = fs->e2fs_gd[cg].ext2bgd_ndirs;
457 			}
458 		for (cg = 0; cg < prefcg; cg++)
459 			if (fs->e2fs_gd[cg].ext2bgd_ndirs < minndir &&
460                             fs->e2fs_gd[cg].ext2bgd_nifree >= avgifree &&
461                             fs->e2fs_gd[cg].ext2bgd_nbfree >= avgbfree) {
462                                 mincg = cg;
463                                 minndir = fs->e2fs_gd[cg].ext2bgd_ndirs;
464                         }
465 
466 		return (mincg);
467 	}
468 
469 	/*
470 	 * Count various limits which used for
471 	 * optimal allocation of a directory inode.
472 	 */
473 	maxndir = min(avgndir + fs->e2fs_ipg / 16, fs->e2fs_ipg);
474 	minifree = avgifree - avgifree / 4;
475 	if (minifree < 1)
476 		minifree = 1;
477 	minbfree = avgbfree - avgbfree / 4;
478 	if (minbfree < 1)
479 		minbfree = 1;
480 	cgsize = fs->e2fs_fsize * fs->e2fs_fpg;
481 	dirsize = AVGDIRSIZE;
482 	curdirsize = avgndir ? (cgsize - avgbfree * fs->e2fs_bsize) / avgndir : 0;
483 	if (dirsize < curdirsize)
484 		dirsize = curdirsize;
485 	if (dirsize <= 0)
486 		maxcontigdirs = 0;		/* dirsize overflowed */
487 	else
488 		maxcontigdirs = min((avgbfree * fs->e2fs_bsize) / dirsize, 255);
489 	maxcontigdirs = min(maxcontigdirs, fs->e2fs_ipg / AFPDIR);
490 	if (maxcontigdirs == 0)
491 		maxcontigdirs = 1;
492 
493 	/*
494 	 * Limit number of dirs in one cg and reserve space for
495 	 * regular files, but only if we have no deficit in
496 	 * inodes or space.
497 	 */
498 	prefcg = ino_to_cg(fs, pip->i_number);
499 	for (cg = prefcg; cg < fs->e2fs_gcount; cg++)
500 		if (fs->e2fs_gd[cg].ext2bgd_ndirs < maxndir &&
501 		    fs->e2fs_gd[cg].ext2bgd_nifree >= minifree &&
502 	    	    fs->e2fs_gd[cg].ext2bgd_nbfree >= minbfree) {
503 			if (fs->e2fs_contigdirs[cg] < maxcontigdirs)
504 				return (cg);
505 		}
506 	for (cg = 0; cg < prefcg; cg++)
507 		if (fs->e2fs_gd[cg].ext2bgd_ndirs < maxndir &&
508 		    fs->e2fs_gd[cg].ext2bgd_nifree >= minifree &&
509 	    	    fs->e2fs_gd[cg].ext2bgd_nbfree >= minbfree) {
510 			if (fs->e2fs_contigdirs[cg] < maxcontigdirs)
511 				return (cg);
512 		}
513 	/*
514 	 * This is a backstop when we have deficit in space.
515 	 */
516 	for (cg = prefcg; cg < fs->e2fs_gcount; cg++)
517 		if (fs->e2fs_gd[cg].ext2bgd_nifree >= avgifree)
518 			return (cg);
519 	for (cg = 0; cg < prefcg; cg++)
520 		if (fs->e2fs_gd[cg].ext2bgd_nifree >= avgifree)
521 			break;
522 	return (cg);
523 }
524 
525 /*
526  * Select the desired position for the next block in a file.
527  *
528  * we try to mimic what Remy does in inode_getblk/block_getblk
529  *
530  * we note: blocknr == 0 means that we're about to allocate either
531  * a direct block or a pointer block at the first level of indirection
532  * (In other words, stuff that will go in i_db[] or i_ib[])
533  *
534  * blocknr != 0 means that we're allocating a block that is none
535  * of the above. Then, blocknr tells us the number of the block
536  * that will hold the pointer
537  */
538 int32_t
539 ext2_blkpref(ip, lbn, indx, bap, blocknr)
540 	struct inode *ip;
541 	int32_t lbn;
542 	int indx;
543 	int32_t *bap;
544 	int32_t blocknr;
545 {
546 	int	tmp;
547 	mtx_assert(EXT2_MTX(ip->i_ump), MA_OWNED);
548 
549 	/* if the next block is actually what we thought it is,
550 	   then set the goal to what we thought it should be
551 	*/
552 	if(ip->i_next_alloc_block == lbn && ip->i_next_alloc_goal != 0)
553 		return ip->i_next_alloc_goal;
554 
555 	/* now check whether we were provided with an array that basically
556 	   tells us previous blocks to which we want to stay closeby
557 	*/
558 	if(bap)
559                 for (tmp = indx - 1; tmp >= 0; tmp--)
560 			if (bap[tmp])
561 				return bap[tmp];
562 
563 	/* else let's fall back to the blocknr, or, if there is none,
564 	   follow the rule that a block should be allocated near its inode
565 	*/
566 	return blocknr ? blocknr :
567 			(int32_t)(ip->i_block_group *
568 			EXT2_BLOCKS_PER_GROUP(ip->i_e2fs)) +
569 			ip->i_e2fs->e2fs->e2fs_first_dblock;
570 }
571 
572 /*
573  * Implement the cylinder overflow algorithm.
574  *
575  * The policy implemented by this algorithm is:
576  *   1) allocate the block in its requested cylinder group.
577  *   2) quadradically rehash on the cylinder group number.
578  *   3) brute force search for a free block.
579  */
580 static u_long
581 ext2_hashalloc(struct inode *ip, int cg, long pref, int size,
582                 daddr_t (*allocator)(struct inode *, int, daddr_t, int))
583 {
584 	struct m_ext2fs *fs;
585 	ino_t result;
586 	int i, icg = cg;
587 
588 	mtx_assert(EXT2_MTX(ip->i_ump), MA_OWNED);
589 	fs = ip->i_e2fs;
590 	/*
591 	 * 1: preferred cylinder group
592 	 */
593 	result = (*allocator)(ip, cg, pref, size);
594 	if (result)
595 		return (result);
596 	/*
597 	 * 2: quadratic rehash
598 	 */
599 	for (i = 1; i < fs->e2fs_gcount; i *= 2) {
600 		cg += i;
601 		if (cg >= fs->e2fs_gcount)
602 			cg -= fs->e2fs_gcount;
603 		result = (*allocator)(ip, cg, 0, size);
604 		if (result)
605 			return (result);
606 	}
607 	/*
608 	 * 3: brute force search
609 	 * Note that we start at i == 2, since 0 was checked initially,
610 	 * and 1 is always checked in the quadratic rehash.
611 	 */
612 	cg = (icg + 2) % fs->e2fs_gcount;
613 	for (i = 2; i < fs->e2fs_gcount; i++) {
614 		result = (*allocator)(ip, cg, 0, size);
615 		if (result)
616 			return (result);
617 		cg++;
618 		if (cg == fs->e2fs_gcount)
619 			cg = 0;
620 	}
621 	return (0);
622 }
623 
624 /*
625  * Determine whether a block can be allocated.
626  *
627  * Check to see if a block of the appropriate size is available,
628  * and if it is, allocate it.
629  */
630 static daddr_t
631 ext2_alloccg(struct inode *ip, int cg, daddr_t bpref, int size)
632 {
633 	struct m_ext2fs *fs;
634 	struct buf *bp;
635 	struct ext2mount *ump;
636 	daddr_t bno, runstart, runlen;
637 	int bit, loc, end, error, start;
638 	char *bbp;
639 	/* XXX ondisk32 */
640 	fs = ip->i_e2fs;
641 	ump = ip->i_ump;
642 	if (fs->e2fs_gd[cg].ext2bgd_nbfree == 0)
643 		return (0);
644 	EXT2_UNLOCK(ump);
645 	error = bread(ip->i_devvp, fsbtodb(fs,
646 		fs->e2fs_gd[cg].ext2bgd_b_bitmap),
647 		(int)fs->e2fs_bsize, NOCRED, &bp);
648 	if (error) {
649 		brelse(bp);
650 		EXT2_LOCK(ump);
651 		return (0);
652 	}
653 	if (fs->e2fs_gd[cg].ext2bgd_nbfree == 0) {
654 		/*
655 		 * Another thread allocated the last block in this
656 		 * group while we were waiting for the buffer.
657 		 */
658 		brelse(bp);
659 		EXT2_LOCK(ump);
660 		return (0);
661 	}
662 	bbp = (char *)bp->b_data;
663 
664 	if (dtog(fs, bpref) != cg)
665 		bpref = 0;
666 	if (bpref != 0) {
667 		bpref = dtogd(fs, bpref);
668 		/*
669 		 * if the requested block is available, use it
670 		 */
671 		if (isclr(bbp, bpref)) {
672 			bno = bpref;
673 			goto gotit;
674 		}
675 	}
676 	/*
677 	 * no blocks in the requested cylinder, so take next
678 	 * available one in this cylinder group.
679 	 * first try to get 8 contigous blocks, then fall back to a single
680 	 * block.
681 	 */
682 	if (bpref)
683 		start = dtogd(fs, bpref) / NBBY;
684 	else
685 		start = 0;
686 	end = howmany(fs->e2fs->e2fs_fpg, NBBY) - start;
687 retry:
688 	runlen = 0;
689 	runstart = 0;
690 	for (loc = start; loc < end; loc++) {
691 		if (bbp[loc] == (char)0xff) {
692 			runlen = 0;
693 			continue;
694 		}
695 
696 		/* Start of a run, find the number of high clear bits. */
697 		if (runlen == 0) {
698 			bit = fls(bbp[loc]);
699 			runlen = NBBY - bit;
700 			runstart = loc * NBBY + bit;
701 		} else if (bbp[loc] == 0) {
702 			/* Continue a run. */
703 			runlen += NBBY;
704 		} else {
705 			/*
706 			 * Finish the current run.  If it isn't long
707 			 * enough, start a new one.
708 			 */
709 			bit = ffs(bbp[loc]) - 1;
710 			runlen += bit;
711 			if (runlen >= 8) {
712 				bno = runstart;
713 				goto gotit;
714 			}
715 
716 			/* Run was too short, start a new one. */
717 			bit = fls(bbp[loc]);
718 			runlen = NBBY - bit;
719 			runstart = loc * NBBY + bit;
720 		}
721 
722 		/* If the current run is long enough, use it. */
723 		if (runlen >= 8) {
724 			bno = runstart;
725 			goto gotit;
726 		}
727 	}
728 	if (start != 0) {
729 		end = start;
730 		start = 0;
731 		goto retry;
732 	}
733 
734 	bno = ext2_mapsearch(fs, bbp, bpref);
735 	if (bno < 0){
736 		brelse(bp);
737 		EXT2_LOCK(ump);
738 		return (0);
739 	}
740 gotit:
741 #ifdef DIAGNOSTIC
742 	if (isset(bbp, bno)) {
743 		printf("ext2fs_alloccgblk: cg=%d bno=%jd fs=%s\n",
744 			cg, (intmax_t)bno, fs->e2fs_fsmnt);
745 		panic("ext2fs_alloccg: dup alloc");
746 	}
747 #endif
748 	setbit(bbp, bno);
749 	EXT2_LOCK(ump);
750 	fs->e2fs->e2fs_fbcount--;
751 	fs->e2fs_gd[cg].ext2bgd_nbfree--;
752 	fs->e2fs_fmod = 1;
753 	EXT2_UNLOCK(ump);
754 	bdwrite(bp);
755 	return (cg * fs->e2fs->e2fs_fpg + fs->e2fs->e2fs_first_dblock + bno);
756 }
757 
758 /*
759  * Determine whether an inode can be allocated.
760  *
761  * Check to see if an inode is available, and if it is,
762  * allocate it using tode in the specified cylinder group.
763  */
764 static daddr_t
765 ext2_nodealloccg(struct inode *ip, int cg, daddr_t ipref, int mode)
766 {
767 	struct m_ext2fs *fs;
768 	struct buf *bp;
769 	struct ext2mount *ump;
770 	int error, start, len, loc, map, i;
771 	char *ibp;
772 	ipref--; /* to avoid a lot of (ipref -1) */
773 	if (ipref == -1)
774 		ipref = 0;
775 	fs = ip->i_e2fs;
776 	ump = ip->i_ump;
777 	if (fs->e2fs_gd[cg].ext2bgd_nifree == 0)
778 		return (0);
779 	EXT2_UNLOCK(ump);
780 	error = bread(ip->i_devvp, fsbtodb(fs,
781 		fs->e2fs_gd[cg].ext2bgd_i_bitmap),
782 		(int)fs->e2fs_bsize, NOCRED, &bp);
783 	if (error) {
784 		brelse(bp);
785 		EXT2_LOCK(ump);
786 		return (0);
787 	}
788 	if (fs->e2fs_gd[cg].ext2bgd_nifree == 0) {
789 		/*
790 		 * Another thread allocated the last i-node in this
791 		 * group while we were waiting for the buffer.
792 		 */
793 		brelse(bp);
794 		EXT2_LOCK(ump);
795 		return (0);
796 	}
797 	ibp = (char *)bp->b_data;
798 	if (ipref) {
799 		ipref %= fs->e2fs->e2fs_ipg;
800 		if (isclr(ibp, ipref))
801 			goto gotit;
802 	}
803 	start = ipref / NBBY;
804 	len = howmany(fs->e2fs->e2fs_ipg - ipref, NBBY);
805 	loc = skpc(0xff, len, &ibp[start]);
806 	if (loc == 0) {
807 		len = start + 1;
808 		start = 0;
809 		loc = skpc(0xff, len, &ibp[0]);
810 		if (loc == 0) {
811 			printf("cg = %d, ipref = %lld, fs = %s\n",
812 				cg, (long long)ipref, fs->e2fs_fsmnt);
813 			panic("ext2fs_nodealloccg: map corrupted");
814 			/* NOTREACHED */
815 		}
816 	}
817 	i = start + len - loc;
818 	map = ibp[i] ^ 0xff;
819 	if (map == 0) {
820 		printf("fs = %s\n", fs->e2fs_fsmnt);
821 		panic("ext2fs_nodealloccg: block not in map");
822 	}
823 	ipref = i * NBBY + ffs(map) - 1;
824 gotit:
825 	setbit(ibp, ipref);
826 	EXT2_LOCK(ump);
827 	fs->e2fs_gd[cg].ext2bgd_nifree--;
828 	fs->e2fs->e2fs_ficount--;
829 	fs->e2fs_fmod = 1;
830 	if ((mode & IFMT) == IFDIR) {
831 		fs->e2fs_gd[cg].ext2bgd_ndirs++;
832 		fs->e2fs_total_dir++;
833 	}
834 	EXT2_UNLOCK(ump);
835 	bdwrite(bp);
836 	return (cg * fs->e2fs->e2fs_ipg + ipref +1);
837 }
838 
839 /*
840  * Free a block or fragment.
841  *
842  */
843 void
844 ext2_blkfree(ip, bno, size)
845 	struct inode *ip;
846 	int32_t bno;
847 	long size;
848 {
849 	struct m_ext2fs *fs;
850 	struct buf *bp;
851 	struct ext2mount *ump;
852 	int cg, error;
853 	char *bbp;
854 
855 	fs = ip->i_e2fs;
856 	ump = ip->i_ump;
857 	cg = dtog(fs, bno);
858 	if ((u_int)bno >= fs->e2fs->e2fs_bcount) {
859                 printf("bad block %lld, ino %llu\n", (long long)bno,
860                     (unsigned long long)ip->i_number);
861                 ext2_fserr(fs, ip->i_uid, "bad block");
862                 return;
863         }
864         error = bread(ip->i_devvp,
865                 fsbtodb(fs, fs->e2fs_gd[cg].ext2bgd_b_bitmap),
866                 (int)fs->e2fs_bsize, NOCRED, &bp);
867         if (error) {
868                 brelse(bp);
869                 return;
870         }
871         bbp = (char *)bp->b_data;
872         bno = dtogd(fs, bno);
873         if (isclr(bbp, bno)) {
874                 printf("block = %lld, fs = %s\n",
875                      (long long)bno, fs->e2fs_fsmnt);
876                 panic("blkfree: freeing free block");
877         }
878         clrbit(bbp, bno);
879 	EXT2_LOCK(ump);
880         fs->e2fs->e2fs_fbcount++;
881         fs->e2fs_gd[cg].ext2bgd_nbfree++;
882         fs->e2fs_fmod = 1;
883 	EXT2_UNLOCK(ump);
884         bdwrite(bp);
885 }
886 
887 /*
888  * Free an inode.
889  *
890  */
891 int
892 ext2_vfree(pvp, ino, mode)
893 	struct vnode *pvp;
894 	ino_t ino;
895 	int mode;
896 {
897 	struct m_ext2fs *fs;
898 	struct inode *pip;
899 	struct buf *bp;
900 	struct ext2mount *ump;
901 	int error, cg;
902 	char * ibp;
903 /*	mode_t save_i_mode; */
904 
905 	pip = VTOI(pvp);
906 	fs = pip->i_e2fs;
907 	ump = pip->i_ump;
908 	if ((u_int)ino > fs->e2fs_ipg * fs->e2fs_gcount)
909 		panic("ext2_vfree: range: devvp = %p, ino = %d, fs = %s",
910 		    pip->i_devvp, ino, fs->e2fs_fsmnt);
911 
912 	cg = ino_to_cg(fs, ino);
913 	error = bread(pip->i_devvp,
914 		fsbtodb(fs, fs->e2fs_gd[cg].ext2bgd_i_bitmap),
915 		(int)fs->e2fs_bsize, NOCRED, &bp);
916 	if (error) {
917 		brelse(bp);
918 		return (0);
919 	}
920 	ibp = (char *)bp->b_data;
921 	ino = (ino - 1) % fs->e2fs->e2fs_ipg;
922 	if (isclr(ibp, ino)) {
923 		printf("ino = %llu, fs = %s\n",
924 			 (unsigned long long)ino, fs->e2fs_fsmnt);
925 		if (fs->e2fs_ronly == 0)
926 			panic("ifree: freeing free inode");
927 	}
928 	clrbit(ibp, ino);
929 	EXT2_LOCK(ump);
930 	fs->e2fs->e2fs_ficount++;
931 	fs->e2fs_gd[cg].ext2bgd_nifree++;
932 	if ((mode & IFMT) == IFDIR) {
933 		fs->e2fs_gd[cg].ext2bgd_ndirs--;
934 		fs->e2fs_total_dir--;
935 	}
936 	fs->e2fs_fmod = 1;
937 	EXT2_UNLOCK(ump);
938 	bdwrite(bp);
939 	return (0);
940 }
941 
942 /*
943  * Find a block in the specified cylinder group.
944  *
945  * It is a panic if a request is made to find a block if none are
946  * available.
947  */
948 static daddr_t
949 ext2_mapsearch(struct m_ext2fs *fs, char *bbp, daddr_t bpref)
950 {
951 	int start, len, loc, i, map;
952 
953 	/*
954 	 * find the fragment by searching through the free block
955 	 * map for an appropriate bit pattern
956 	 */
957 	if (bpref)
958 		start = dtogd(fs, bpref) / NBBY;
959 	else
960 		start = 0;
961 	len = howmany(fs->e2fs->e2fs_fpg, NBBY) - start;
962 	loc = skpc(0xff, len, &bbp[start]);
963 	if (loc == 0) {
964 		len = start + 1;
965 		start = 0;
966 		loc = skpc(0xff, len, &bbp[start]);
967 		if (loc == 0) {
968 			printf("start = %d, len = %d, fs = %s\n",
969 				start, len, fs->e2fs_fsmnt);
970 			panic("ext2fs_alloccg: map corrupted");
971 			/* NOTREACHED */
972 		}
973 	}
974 	i = start + len - loc;
975 	map = bbp[i] ^ 0xff;
976 	if (map == 0) {
977 		printf("fs = %s\n", fs->e2fs_fsmnt);
978 		panic("ext2fs_mapsearch: block not in map");
979 	}
980 	return (i * NBBY + ffs(map) - 1);
981 }
982 
983 /*
984  * Fserr prints the name of a file system with an error diagnostic.
985  *
986  * The form of the error message is:
987  *	fs: error message
988  */
989 static void
990 ext2_fserr(fs, uid, cp)
991 	struct m_ext2fs *fs;
992 	uid_t uid;
993 	char *cp;
994 {
995 
996 	log(LOG_ERR, "uid %u on %s: %s\n", uid, fs->e2fs_fsmnt, cp);
997 }
998 
/*
 * Return non-zero when `n' is a positive integral power of `base'
 * (base, base^2, base^3, ...).  The factor is divided out of `n' instead
 * of being multiplied up, so the test cannot overflow however large `n'
 * is.  `base' must be greater than 1.
 */
static int
cg_is_power_of(int n, int base)
{

	if (n < base)
		return (0);
	while (n % base == 0)
		n /= base;
	return (n == 1);
}

/*
 * Tell whether cylinder group `i' is one of the selected groups: group 0,
 * group 1, and every group whose number is a power of 3, 5 or 7.
 * (Going by the name these are presumably the groups that carry a
 * superblock copy -- confirm against the callers.)
 *
 * Returns 1 for such a group and 0 for any other value, including
 * negative ones.
 */
int
cg_has_sb(int i)
{

	if (i == 0 || i == 1)
		return (1);
	return (cg_is_power_of(i, 3) || cg_is_power_of(i, 5) ||
	    cg_is_power_of(i, 7));
}
1013