xref: /illumos-gate/usr/src/uts/common/fs/ufs/ufs_alloc.c (revision 257873cf)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 /*
30  * University Copyright- Copyright (c) 1982, 1986, 1988
31  * The Regents of the University of California
32  * All Rights Reserved
33  *
34  * University Acknowledgment- Portions of this document are derived from
35  * software developed by the University of California, Berkeley, and its
36  * contributors.
37  */
38 
39 #include <sys/condvar_impl.h>
40 #include <sys/types.h>
41 #include <sys/t_lock.h>
42 #include <sys/debug.h>
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/signal.h>
46 #include <sys/cred.h>
47 #include <sys/proc.h>
48 #include <sys/disp.h>
49 #include <sys/user.h>
50 #include <sys/buf.h>
51 #include <sys/vfs.h>
52 #include <sys/vnode.h>
53 #include <sys/acl.h>
54 #include <sys/fs/ufs_fs.h>
55 #include <sys/fs/ufs_inode.h>
56 #include <sys/fs/ufs_acl.h>
57 #include <sys/fs/ufs_bio.h>
58 #include <sys/fs/ufs_quota.h>
59 #include <sys/kmem.h>
60 #include <sys/fs/ufs_trans.h>
61 #include <sys/fs/ufs_panic.h>
62 #include <sys/errno.h>
63 #include <sys/time.h>
64 #include <sys/sysmacros.h>
65 #include <sys/file.h>
66 #include <sys/fcntl.h>
67 #include <sys/flock.h>
68 #include <fs/fs_subr.h>
69 #include <sys/cmn_err.h>
70 #include <sys/policy.h>
71 
72 static ino_t	hashalloc();
73 static daddr_t	fragextend();
74 static daddr_t	alloccg();
75 static daddr_t	alloccgblk();
76 static ino_t	ialloccg();
77 static daddr_t	mapsearch();
78 
79 extern int	inside[], around[];
80 extern uchar_t	*fragtbl[];
81 void delay();
82 
83 /*
84  * Allocate a block in the file system.
85  *
86  * The size of the requested block is given, which must be some
87  * multiple of fs_fsize and <= fs_bsize.
88  * A preference may be optionally specified. If a preference is given
89  * the following hierarchy is used to allocate a block:
90  *   1) allocate the requested block.
91  *   2) allocate a rotationally optimal block in the same cylinder.
92  *   3) allocate a block in the same cylinder group.
93  *   4) quadratically rehash into other cylinder groups, until an
94  *	available block is located.
95  * If no block preference is given the following hierarchy is used
96  * to allocate a block:
97  *   1) allocate a block in the cylinder group that contains the
98  *	inode for the file.
99  *   2) quadratically rehash into other cylinder groups, until an
100  *	available block is located.
101  */
102 int
103 alloc(struct inode *ip, daddr_t bpref, int size, daddr_t *bnp, cred_t *cr)
104 {
105 	struct fs *fs;
106 	struct ufsvfs *ufsvfsp;
107 	daddr_t bno;
108 	int cg;
109 	int err;
110 	char *errmsg = NULL;
111 	size_t len;
112 
113 	ufsvfsp = ip->i_ufsvfs;
114 	fs = ufsvfsp->vfs_fs;
115 	if ((unsigned)size > fs->fs_bsize || fragoff(fs, size) != 0) {
116 		err = ufs_fault(ITOV(ip), "alloc: bad size, dev = 0x%lx,"
117 		    " bsize = %d, size = %d, fs = %s\n",
118 		    ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt);
119 		return (err);
120 	}
121 	if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
122 		goto nospace;
123 	if (freespace(fs, ufsvfsp) <= 0 &&
124 	    secpolicy_fs_minfree(cr, ufsvfsp->vfs_vfs) != 0)
125 		goto nospace;
126 	err = chkdq(ip, (long)btodb(size), 0, cr, &errmsg, &len);
127 	/* Note that may not have err, but may have errmsg */
128 	if (errmsg != NULL) {
129 		uprintf(errmsg);
130 		kmem_free(errmsg, len);
131 		errmsg = NULL;
132 	}
133 	if (err)
134 		return (err);
135 	if (bpref >= fs->fs_size)
136 		bpref = 0;
137 	if (bpref == 0)
138 		cg = (int)itog(fs, ip->i_number);
139 	else
140 		cg = dtog(fs, bpref);
141 
142 	bno = (daddr_t)hashalloc(ip, cg, (long)bpref, size,
143 	    (ulong_t (*)())alloccg);
144 	if (bno > 0) {
145 		*bnp = bno;
146 		return (0);
147 	}
148 
149 	/*
150 	 * hashalloc() failed because some other thread grabbed
151 	 * the last block so unwind the quota operation.  We can
152 	 * ignore the return because subtractions don't fail and
153 	 * size is guaranteed to be >= zero by our caller.
154 	 */
155 	(void) chkdq(ip, -(long)btodb(size), 0, cr, (char **)NULL,
156 	    (size_t *)NULL);
157 
158 nospace:
159 	mutex_enter(&ufsvfsp->vfs_lock);
160 	if ((lbolt - ufsvfsp->vfs_lastwhinetime) > (hz << 2) &&
161 	    (!(TRANS_ISTRANS(ufsvfsp)) || !(ip->i_flag & IQUIET))) {
162 		ufsvfsp->vfs_lastwhinetime = lbolt;
163 		cmn_err(CE_NOTE, "alloc: %s: file system full", fs->fs_fsmnt);
164 	}
165 	mutex_exit(&ufsvfsp->vfs_lock);
166 	return (ENOSPC);
167 }
168 
169 /*
170  * Reallocate a fragment to a bigger size
171  *
172  * The number and size of the old block is given, and a preference
173  * and new size is also specified.  The allocator attempts to extend
174  * the original block.  Failing that, the regular block allocator is
175  * invoked to get an appropriate block.
176  */
177 int
178 realloccg(struct inode *ip, daddr_t bprev, daddr_t bpref, int osize,
179     int nsize, daddr_t *bnp, cred_t *cr)
180 {
181 	daddr_t bno;
182 	struct fs *fs;
183 	struct ufsvfs *ufsvfsp;
184 	int cg, request;
185 	int err;
186 	char *errmsg = NULL;
187 	size_t len;
188 
189 	ufsvfsp = ip->i_ufsvfs;
190 	fs = ufsvfsp->vfs_fs;
191 	if ((unsigned)osize > fs->fs_bsize || fragoff(fs, osize) != 0 ||
192 	    (unsigned)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) {
193 		err = ufs_fault(ITOV(ip),
194 		    "realloccg: bad size, dev=0x%lx, bsize=%d, "
195 		    "osize=%d, nsize=%d, fs=%s\n",
196 		    ip->i_dev, fs->fs_bsize, osize, nsize, fs->fs_fsmnt);
197 		return (err);
198 	}
199 	if (freespace(fs, ufsvfsp) <= 0 &&
200 	    secpolicy_fs_minfree(cr, ufsvfsp->vfs_vfs) != 0)
201 		goto nospace;
202 	if (bprev == 0) {
203 		err = ufs_fault(ITOV(ip),
204 		    "realloccg: bad bprev, dev = 0x%lx, bsize = %d,"
205 		    " bprev = %ld, fs = %s\n", ip->i_dev, fs->fs_bsize, bprev,
206 		    fs->fs_fsmnt);
207 		return (err);
208 	}
209 	err = chkdq(ip, (long)btodb(nsize - osize), 0, cr, &errmsg, &len);
210 	/* Note that may not have err, but may have errmsg */
211 	if (errmsg != NULL) {
212 		uprintf(errmsg);
213 		kmem_free(errmsg, len);
214 		errmsg = NULL;
215 	}
216 	if (err)
217 		return (err);
218 	cg = dtog(fs, bprev);
219 	bno = fragextend(ip, cg, (long)bprev, osize, nsize);
220 	if (bno != 0) {
221 		*bnp = bno;
222 		return (0);
223 	}
224 	if (bpref >= fs->fs_size)
225 		bpref = 0;
226 
227 	/*
228 	 * When optimizing for time we allocate a full block and
229 	 * then only use the upper portion for this request. When
230 	 * this file grows again it will grow into the unused portion
231 	 * of the block (See fragextend() above).  This saves time
232 	 * because an extra disk write would be needed if the frags
233 	 * following the current allocation were not free. The extra
234 	 * disk write is needed to move the data from its current
235 	 * location into the newly allocated position.
236 	 *
237 	 * When optimizing for space we allocate a run of frags
238 	 * that is just the right size for this request.
239 	 */
240 	request = (fs->fs_optim == FS_OPTTIME) ? fs->fs_bsize : nsize;
241 	bno = (daddr_t)hashalloc(ip, cg, (long)bpref, request,
242 	    (ulong_t (*)())alloccg);
243 	if (bno > 0) {
244 		*bnp = bno;
245 		if (nsize < request)
246 			(void) free(ip, bno + numfrags(fs, nsize),
247 			    (off_t)(request - nsize), I_NOCANCEL);
248 		return (0);
249 	}
250 
251 	/*
252 	 * hashalloc() failed because some other thread grabbed
253 	 * the last block so unwind the quota operation.  We can
254 	 * ignore the return because subtractions don't fail, and
255 	 * our caller guarantees nsize >= osize.
256 	 */
257 	(void) chkdq(ip, -(long)btodb(nsize - osize), 0, cr, (char **)NULL,
258 	    (size_t *)NULL);
259 
260 nospace:
261 	mutex_enter(&ufsvfsp->vfs_lock);
262 	if ((lbolt - ufsvfsp->vfs_lastwhinetime) > (hz << 2) &&
263 	    (!(TRANS_ISTRANS(ufsvfsp)) || !(ip->i_flag & IQUIET))) {
264 		ufsvfsp->vfs_lastwhinetime = lbolt;
265 		cmn_err(CE_NOTE,
266 		    "realloccg %s: file system full", fs->fs_fsmnt);
267 	}
268 	mutex_exit(&ufsvfsp->vfs_lock);
269 	return (ENOSPC);
270 }
271 
272 /*
273  * Allocate an inode in the file system.
274  *
275  * A preference may be optionally specified. If a preference is given
276  * the following hierarchy is used to allocate an inode:
277  *   1) allocate the requested inode.
278  *   2) allocate an inode in the same cylinder group.
279  *   3) quadratically rehash into other cylinder groups, until an
280  *	available inode is located.
281  * If no inode preference is given the following hierarchy is used
282  * to allocate an inode:
283  *   1) allocate an inode in cylinder group 0.
284  *   2) quadratically rehash into other cylinder groups, until an
285  *	available inode is located.
286  */
287 int
288 ufs_ialloc(struct inode *pip,
289     ino_t ipref, mode_t mode, struct inode **ipp, cred_t *cr)
290 {
291 	struct inode *ip;
292 	struct fs *fs;
293 	int cg;
294 	ino_t ino;
295 	int err;
296 	int nifree;
297 	struct ufsvfs *ufsvfsp = pip->i_ufsvfs;
298 	char *errmsg = NULL;
299 	size_t len;
300 
301 	ASSERT(RW_WRITE_HELD(&pip->i_rwlock));
302 	fs = pip->i_fs;
303 loop:
304 	nifree = fs->fs_cstotal.cs_nifree;
305 
306 	if (nifree == 0)
307 		goto noinodes;
308 	/*
309 	 * Shadow inodes don't count against a user's inode allocation.
310 	 * They are an implementation method and not a resource.
311 	 */
312 	if ((mode != IFSHAD) && (mode != IFATTRDIR)) {
313 		err = chkiq((struct ufsvfs *)ITOV(pip)->v_vfsp->vfs_data,
314 		    /* change */ 1, (struct inode *)NULL, crgetuid(cr), 0,
315 		    cr, &errmsg, &len);
316 		/*
317 		 * As we haven't acquired any locks yet, dump the message
318 		 * now.
319 		 */
320 		if (errmsg != NULL) {
321 			uprintf(errmsg);
322 			kmem_free(errmsg, len);
323 			errmsg = NULL;
324 		}
325 		if (err)
326 			return (err);
327 	}
328 
329 	if (ipref >= (ulong_t)(fs->fs_ncg * fs->fs_ipg))
330 		ipref = 0;
331 	cg = (int)itog(fs, ipref);
332 	ino = (ino_t)hashalloc(pip, cg, (long)ipref, (int)mode,
333 	    (ulong_t (*)())ialloccg);
334 	if (ino == 0) {
335 		if ((mode != IFSHAD) && (mode != IFATTRDIR)) {
336 			/*
337 			 * We can safely ignore the return from chkiq()
338 			 * because deallocations can only fail if we
339 			 * can't get the user's quota info record off
340 			 * the disk due to an I/O error.  In that case,
341 			 * the quota subsystem is already messed up.
342 			 */
343 			(void) chkiq(ufsvfsp, /* change */ -1,
344 			    (struct inode *)NULL, crgetuid(cr), 0, cr,
345 			    (char **)NULL, (size_t *)NULL);
346 		}
347 		goto noinodes;
348 	}
349 	err = ufs_iget(pip->i_vfs, ino, ipp, cr);
350 	if (err) {
351 		if ((mode != IFSHAD) && (mode != IFATTRDIR)) {
352 			/*
353 			 * See above comment about why it is safe to ignore an
354 			 * error return here.
355 			 */
356 			(void) chkiq(ufsvfsp, /* change */ -1,
357 			    (struct inode *)NULL, crgetuid(cr), 0, cr,
358 			    (char **)NULL, (size_t *)NULL);
359 		}
360 		ufs_ifree(pip, ino, 0);
361 		return (err);
362 	}
363 	ip = *ipp;
364 	ASSERT(!ip->i_ufs_acl);
365 	ASSERT(!ip->i_dquot);
366 	rw_enter(&ip->i_contents, RW_WRITER);
367 
368 	/*
369 	 * Check if we really got a free inode, if not then complain
370 	 * and mark the inode ISTALE so that it will be freed by the
371 	 * ufs idle thread eventually and will not be sent to ufs_delete().
372 	 */
373 	if (ip->i_mode || (ip->i_nlink > 0)) {
374 		ip->i_flag |= ISTALE;
375 		rw_exit(&ip->i_contents);
376 		VN_RELE(ITOV(ip));
377 		cmn_err(CE_WARN,
378 		    "%s: unexpected allocated inode %d, run fsck(1M)%s",
379 		    fs->fs_fsmnt, (int)ino,
380 		    (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
381 		goto loop;
382 	}
383 
384 	/*
385 	 * Check the inode has no size or data blocks.
386 	 * This could have happened if the truncation failed when
387 	 * deleting the inode. It used to be possible for this to occur
388 	 * if a block allocation failed when iteratively truncating a
389 	 * large file using logging and with a full file system.
390 	 * This was fixed with bug fix 4348738. However, truncation may
391 	 * still fail on an IO error. So in all cases for safety and
392 	 * security we clear out the size; the blocks allocated; and
393 	 * pointers to the blocks. This will ultimately cause a fsck
394 	 * error of un-accounted for blocks, but its a fairly benign error,
395 	 * and possibly the correct thing to do anyway as accesssing those
396 	 * blocks agains may lead to more IO errors.
397 	 */
398 	if (ip->i_size || ip->i_blocks) {
399 		int i;
400 
401 		if (ip->i_size) {
402 			cmn_err(CE_WARN,
403 			    "%s: free inode %d had size 0x%llx, run fsck(1M)%s",
404 			    fs->fs_fsmnt, (int)ino, ip->i_size,
405 			    (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
406 		}
407 		/*
408 		 * Clear any garbage left behind.
409 		 */
410 		ip->i_size = (u_offset_t)0;
411 		ip->i_blocks = 0;
412 		for (i = 0; i < NDADDR; i++)
413 			ip->i_db[i] = 0;
414 		for (i = 0; i < NIADDR; i++)
415 			ip->i_ib[i] = 0;
416 	}
417 
418 	/*
419 	 * Initialize the link count
420 	 */
421 	ip->i_nlink = 0;
422 
423 	/*
424 	 * Clear the old flags
425 	 */
426 	ip->i_flag &= IREF;
427 
428 	/*
429 	 * Access times are not really defined if the fs is mounted
430 	 * with 'noatime'. But it can cause nfs clients to fail
431 	 * open() if the atime is not a legal value. Set a legal value
432 	 * here when the inode is allocated.
433 	 */
434 	if (ufsvfsp->vfs_noatime) {
435 		mutex_enter(&ufs_iuniqtime_lock);
436 		ip->i_atime = iuniqtime;
437 		mutex_exit(&ufs_iuniqtime_lock);
438 	}
439 	rw_exit(&ip->i_contents);
440 	return (0);
441 noinodes:
442 	if (!(TRANS_ISTRANS(ufsvfsp)) || !(pip->i_flag & IQUIET))
443 		cmn_err(CE_NOTE, "%s: out of inodes\n", fs->fs_fsmnt);
444 	return (ENOSPC);
445 }
446 
447 /*
448  * Find a cylinder group to place a directory.
449  * Returns an inumber within the selected cylinder group.
450  * Note, the vfs_lock is not needed as we don't require exact cg summary info.
451  *
452  * If the switch ufs_close_dirs is set, then the policy is to use
453  * the current cg if it has more than 25% free inodes and more
454  * than 25% free blocks. Otherwise the cgs are searched from
455  * the beginning and the first cg with the same criteria is
456  * used. If that is also null then we revert to the old algorithm.
457  * This tends to cluster files at the beginning of the disk
458  * until the disk gets full.
459  *
460  * Otherwise if ufs_close_dirs is not set then the original policy is
461  * used which is to select from among those cylinder groups with
462  * above the average number of free inodes, the one with the smallest
463  * number of directories.
464  */
465 
466 int ufs_close_dirs = 1;	/* allocate directories close as possible */
467 
468 ino_t
469 dirpref(inode_t *dp)
470 {
471 	int cg, minndir, mincg, avgifree, mininode, minbpg, ifree;
472 	struct fs *fs = dp->i_fs;
473 
474 	cg = itog(fs, dp->i_number);
475 	mininode = fs->fs_ipg >> 2;
476 	minbpg = fs->fs_maxbpg >> 2;
477 	if (ufs_close_dirs &&
478 	    (fs->fs_cs(fs, cg).cs_nifree > mininode) &&
479 	    (fs->fs_cs(fs, cg).cs_nbfree > minbpg)) {
480 		return (dp->i_number);
481 	}
482 
483 	avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg;
484 	minndir = fs->fs_ipg;
485 	mincg = 0;
486 	for (cg = 0; cg < fs->fs_ncg; cg++) {
487 		ifree = fs->fs_cs(fs, cg).cs_nifree;
488 		if (ufs_close_dirs &&
489 		    (ifree > mininode) &&
490 		    (fs->fs_cs(fs, cg).cs_nbfree > minbpg)) {
491 			return ((ino_t)(fs->fs_ipg * cg));
492 		}
493 		if ((fs->fs_cs(fs, cg).cs_ndir < minndir) &&
494 		    (ifree >= avgifree)) {
495 			mincg = cg;
496 			minndir = fs->fs_cs(fs, cg).cs_ndir;
497 		}
498 	}
499 	return ((ino_t)(fs->fs_ipg * mincg));
500 }
501 
502 /*
503  * Select the desired position for the next block in a file.  The file is
504  * logically divided into sections. The first section is composed of the
505  * direct blocks. Each additional section contains fs_maxbpg blocks.
506  *
507  * If no blocks have been allocated in the first section, the policy is to
508  * request a block in the same cylinder group as the inode that describes
509  * the file. If no blocks have been allocated in any other section, the
510  * policy is to place the section in a cylinder group with a greater than
511  * average number of free blocks.  An appropriate cylinder group is found
512  * by using a rotor that sweeps the cylinder groups. When a new group of
513  * blocks is needed, the sweep begins in the cylinder group following the
514  * cylinder group from which the previous allocation was made. The sweep
515  * continues until a cylinder group with greater than the average number
516  * of free blocks is found. If the allocation is for the first block in an
517  * indirect block, the information on the previous allocation is unavailable;
518  * here a best guess is made based upon the logical block number being
519  * allocated.
520  *
521  * If a section is already partially allocated, the policy is to
522  * contiguously allocate fs_maxcontig blocks.  The end of one of these
523  * contiguous blocks and the beginning of the next is physically separated
524  * so that the disk head will be in transit between them for at least
525  * fs_rotdelay milliseconds.  This is to allow time for the processor to
526  * schedule another I/O transfer.
527  */
528 daddr_t
529 blkpref(struct inode *ip, daddr_t lbn, int indx, daddr32_t *bap)
530 {
531 	struct fs *fs;
532 	struct ufsvfs *ufsvfsp;
533 	int cg;
534 	int avgbfree, startcg;
535 	daddr_t nextblk;
536 
537 	ufsvfsp = ip->i_ufsvfs;
538 	fs = ip->i_fs;
539 	if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
540 		if (lbn < NDADDR) {
541 			cg = itog(fs, ip->i_number);
542 			return (fs->fs_fpg * cg + fs->fs_frag);
543 		}
544 		/*
545 		 * Find a cylinder with greater than average
546 		 * number of unused data blocks.
547 		 */
548 		if (indx == 0 || bap[indx - 1] == 0)
549 			startcg = itog(fs, ip->i_number) + lbn / fs->fs_maxbpg;
550 		else
551 			startcg = dtog(fs, bap[indx - 1]) + 1;
552 		startcg %= fs->fs_ncg;
553 
554 		mutex_enter(&ufsvfsp->vfs_lock);
555 		avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
556 		/*
557 		 * used for computing log space for writes/truncs
558 		 */
559 		ufsvfsp->vfs_avgbfree = avgbfree;
560 		for (cg = startcg; cg < fs->fs_ncg; cg++)
561 			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
562 				fs->fs_cgrotor = cg;
563 				mutex_exit(&ufsvfsp->vfs_lock);
564 				return (fs->fs_fpg * cg + fs->fs_frag);
565 			}
566 		for (cg = 0; cg <= startcg; cg++)
567 			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
568 				fs->fs_cgrotor = cg;
569 				mutex_exit(&ufsvfsp->vfs_lock);
570 				return (fs->fs_fpg * cg + fs->fs_frag);
571 			}
572 		mutex_exit(&ufsvfsp->vfs_lock);
573 		return (NULL);
574 	}
575 	/*
576 	 * One or more previous blocks have been laid out. If less
577 	 * than fs_maxcontig previous blocks are contiguous, the
578 	 * next block is requested contiguously, otherwise it is
579 	 * requested rotationally delayed by fs_rotdelay milliseconds.
580 	 */
581 
582 	nextblk = bap[indx - 1];
583 	/*
584 	 * Provision for fallocate to return positive
585 	 * blk preference based on last allocation
586 	 */
587 	if (nextblk < 0 && nextblk != UFS_HOLE) {
588 		nextblk = (-bap[indx - 1]) + fs->fs_frag;
589 	} else {
590 		nextblk = bap[indx - 1] + fs->fs_frag;
591 	}
592 
593 	if (indx > fs->fs_maxcontig && bap[indx - fs->fs_maxcontig] +
594 	    blkstofrags(fs, fs->fs_maxcontig) != nextblk) {
595 		return (nextblk);
596 	}
597 	if (fs->fs_rotdelay != 0)
598 		/*
599 		 * Here we convert ms of delay to frags as:
600 		 * (frags) = (ms) * (rev/sec) * (sect/rev) /
601 		 * 	((sect/frag) * (ms/sec))
602 		 * then round up to the next block.
603 		 */
604 		nextblk += roundup(fs->fs_rotdelay * fs->fs_rps * fs->fs_nsect /
605 		    (NSPF(fs) * 1000), fs->fs_frag);
606 	return (nextblk);
607 }
608 
609 /*
610  * Free a block or fragment.
611  *
612  * The specified block or fragment is placed back in the
613  * free map. If a fragment is deallocated, a possible
614  * block reassembly is checked.
615  */
616 void
617 free(struct inode *ip, daddr_t bno, off_t size, int flags)
618 {
619 	struct fs *fs = ip->i_fs;
620 	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
621 	struct ufs_q *delq = &ufsvfsp->vfs_delete;
622 	struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;
623 	struct cg *cgp;
624 	struct buf *bp;
625 	int cg, bmap, bbase;
626 	int i;
627 	uchar_t *blksfree;
628 	int *blktot;
629 	short *blks;
630 	daddr_t blkno, cylno, rpos;
631 
632 	/*
633 	 * fallocate'd files will have negative block address.
634 	 * So negate it again to get original block address.
635 	 */
636 	if (bno < 0 && (bno % fs->fs_frag == 0) && bno != UFS_HOLE) {
637 		bno = -bno;
638 	}
639 
640 	if ((unsigned long)size > fs->fs_bsize || fragoff(fs, size) != 0) {
641 		(void) ufs_fault(ITOV(ip),
642 		    "free: bad size, dev = 0x%lx, bsize = %d, size = %d, "
643 		    "fs = %s\n", ip->i_dev, fs->fs_bsize,
644 		    (int)size, fs->fs_fsmnt);
645 		return;
646 	}
647 	cg = dtog(fs, bno);
648 	ASSERT(!ufs_badblock(ip, bno));
649 	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
650 	    (int)fs->fs_cgsize);
651 
652 	cgp = bp->b_un.b_cg;
653 	if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
654 		brelse(bp);
655 		return;
656 	}
657 
658 	if (!(flags & I_NOCANCEL))
659 		TRANS_CANCEL(ufsvfsp, ldbtob(fsbtodb(fs, bno)), size, flags);
660 	if (flags & (I_DIR|I_IBLK|I_SHAD|I_QUOTA)) {
661 		TRANS_MATA_FREE(ufsvfsp, ldbtob(fsbtodb(fs, bno)), size);
662 	}
663 	blksfree = cg_blksfree(cgp);
664 	blktot = cg_blktot(cgp);
665 	mutex_enter(&ufsvfsp->vfs_lock);
666 	cgp->cg_time = gethrestime_sec();
667 	bno = dtogd(fs, bno);
668 	if (size == fs->fs_bsize) {
669 		blkno = fragstoblks(fs, bno);
670 		cylno = cbtocylno(fs, bno);
671 		rpos = cbtorpos(ufsvfsp, bno);
672 		blks = cg_blks(ufsvfsp, cgp, cylno);
673 		if (!isclrblock(fs, blksfree, blkno)) {
674 			mutex_exit(&ufsvfsp->vfs_lock);
675 			brelse(bp);
676 			(void) ufs_fault(ITOV(ip), "free: freeing free block, "
677 			    "dev:0x%lx, block:%ld, ino:%lu, fs:%s",
678 			    ip->i_dev, bno, ip->i_number, fs->fs_fsmnt);
679 			return;
680 		}
681 		setblock(fs, blksfree, blkno);
682 		blks[rpos]++;
683 		blktot[cylno]++;
684 		cgp->cg_cs.cs_nbfree++;		/* Log below */
685 		fs->fs_cstotal.cs_nbfree++;
686 		fs->fs_cs(fs, cg).cs_nbfree++;
687 		if (TRANS_ISTRANS(ufsvfsp) && (flags & I_ACCT)) {
688 			mutex_enter(&delq->uq_mutex);
689 			delq_info->delq_unreclaimed_blocks -=
690 			    btodb(fs->fs_bsize);
691 			mutex_exit(&delq->uq_mutex);
692 		}
693 	} else {
694 		bbase = bno - fragnum(fs, bno);
695 		/*
696 		 * Decrement the counts associated with the old frags
697 		 */
698 		bmap = blkmap(fs, blksfree, bbase);
699 		fragacct(fs, bmap, cgp->cg_frsum, -1);
700 		/*
701 		 * Deallocate the fragment
702 		 */
703 		for (i = 0; i < numfrags(fs, size); i++) {
704 			if (isset(blksfree, bno + i)) {
705 				brelse(bp);
706 				mutex_exit(&ufsvfsp->vfs_lock);
707 				(void) ufs_fault(ITOV(ip),
708 				    "free: freeing free frag, "
709 				    "dev:0x%lx, blk:%ld, cg:%d, "
710 				    "ino:%lu, fs:%s",
711 				    ip->i_dev,
712 				    bno + i,
713 				    cgp->cg_cgx,
714 				    ip->i_number,
715 				    fs->fs_fsmnt);
716 				return;
717 			}
718 			setbit(blksfree, bno + i);
719 		}
720 		cgp->cg_cs.cs_nffree += i;
721 		fs->fs_cstotal.cs_nffree += i;
722 		fs->fs_cs(fs, cg).cs_nffree += i;
723 		if (TRANS_ISTRANS(ufsvfsp) && (flags & I_ACCT)) {
724 			mutex_enter(&delq->uq_mutex);
725 			delq_info->delq_unreclaimed_blocks -=
726 			    btodb(i * fs->fs_fsize);
727 			mutex_exit(&delq->uq_mutex);
728 		}
729 		/*
730 		 * Add back in counts associated with the new frags
731 		 */
732 		bmap = blkmap(fs, blksfree, bbase);
733 		fragacct(fs, bmap, cgp->cg_frsum, 1);
734 		/*
735 		 * If a complete block has been reassembled, account for it
736 		 */
737 		blkno = fragstoblks(fs, bbase);
738 		if (isblock(fs, blksfree, blkno)) {
739 			cylno = cbtocylno(fs, bbase);
740 			rpos = cbtorpos(ufsvfsp, bbase);
741 			blks = cg_blks(ufsvfsp, cgp, cylno);
742 			blks[rpos]++;
743 			blktot[cylno]++;
744 			cgp->cg_cs.cs_nffree -= fs->fs_frag;
745 			fs->fs_cstotal.cs_nffree -= fs->fs_frag;
746 			fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
747 			cgp->cg_cs.cs_nbfree++;
748 			fs->fs_cstotal.cs_nbfree++;
749 			fs->fs_cs(fs, cg).cs_nbfree++;
750 		}
751 	}
752 	fs->fs_fmod = 1;
753 	ufs_notclean(ufsvfsp);
754 	TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
755 	TRANS_SI(ufsvfsp, fs, cg);
756 	bdrwrite(bp);
757 }
758 
759 /*
760  * Free an inode.
761  *
762  * The specified inode is placed back in the free map.
763  */
764 void
765 ufs_ifree(struct inode *ip, ino_t ino, mode_t mode)
766 {
767 	struct fs *fs = ip->i_fs;
768 	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
769 	struct cg *cgp;
770 	struct buf *bp;
771 	unsigned int inot;
772 	int cg;
773 	char *iused;
774 
775 	if (ip->i_number == ino && ip->i_mode != 0) {
776 		(void) ufs_fault(ITOV(ip),
777 		    "ufs_ifree: illegal mode: (imode) %o, (omode) %o, ino %d, "
778 		    "fs = %s\n",
779 		    ip->i_mode, mode, (int)ip->i_number, fs->fs_fsmnt);
780 		return;
781 	}
782 	if (ino >= fs->fs_ipg * fs->fs_ncg) {
783 		(void) ufs_fault(ITOV(ip),
784 		    "ifree: range, dev = 0x%x, ino = %d, fs = %s\n",
785 		    (int)ip->i_dev, (int)ino, fs->fs_fsmnt);
786 		return;
787 	}
788 	cg = (int)itog(fs, ino);
789 	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
790 	    (int)fs->fs_cgsize);
791 
792 	cgp = bp->b_un.b_cg;
793 	if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
794 		brelse(bp);
795 		return;
796 	}
797 	mutex_enter(&ufsvfsp->vfs_lock);
798 	cgp->cg_time = gethrestime_sec();
799 	iused = cg_inosused(cgp);
800 	inot = (unsigned int)(ino % (ulong_t)fs->fs_ipg);
801 	if (isclr(iused, inot)) {
802 		mutex_exit(&ufsvfsp->vfs_lock);
803 		brelse(bp);
804 		(void) ufs_fault(ITOV(ip), "ufs_ifree: freeing free inode, "
805 		    "mode: (imode) %o, (omode) %o, ino:%d, "
806 		    "fs:%s",
807 		    ip->i_mode, mode, (int)ino, fs->fs_fsmnt);
808 		return;
809 	}
810 	clrbit(iused, inot);
811 
812 	if (inot < (ulong_t)cgp->cg_irotor)
813 		cgp->cg_irotor = inot;
814 	cgp->cg_cs.cs_nifree++;
815 	fs->fs_cstotal.cs_nifree++;
816 	fs->fs_cs(fs, cg).cs_nifree++;
817 	if (((mode & IFMT) == IFDIR) || ((mode & IFMT) == IFATTRDIR)) {
818 		cgp->cg_cs.cs_ndir--;
819 		fs->fs_cstotal.cs_ndir--;
820 		fs->fs_cs(fs, cg).cs_ndir--;
821 	}
822 	fs->fs_fmod = 1;
823 	ufs_notclean(ufsvfsp);
824 	TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
825 	TRANS_SI(ufsvfsp, fs, cg);
826 	bdrwrite(bp);
827 }
828 
829 /*
830  * Implement the cylinder overflow algorithm.
831  *
832  * The policy implemented by this algorithm is:
833  *   1) allocate the block in its requested cylinder group.
834  *   2) quadratically rehash on the cylinder group number.
835  *   3) brute force search for a free block.
836  * The size parameter means size for data blocks, mode for inodes.
837  */
838 static ino_t
839 hashalloc(struct inode *ip, int cg, long pref, int size, ulong_t (*allocator)())
840 {
841 	struct fs *fs;
842 	int i;
843 	long result;
844 	int icg = cg;
845 
846 	fs = ip->i_fs;
847 	/*
848 	 * 1: preferred cylinder group
849 	 */
850 	result = (*allocator)(ip, cg, pref, size);
851 	if (result)
852 		return (result);
853 	/*
854 	 * 2: quadratic rehash
855 	 */
856 	for (i = 1; i < fs->fs_ncg; i *= 2) {
857 		cg += i;
858 		if (cg >= fs->fs_ncg)
859 			cg -= fs->fs_ncg;
860 		result = (*allocator)(ip, cg, 0, size);
861 		if (result)
862 			return (result);
863 	}
864 	/*
865 	 * 3: brute force search
866 	 * Note that we start at i == 2, since 0 was checked initially,
867 	 * and 1 is always checked in the quadratic rehash.
868 	 */
869 	cg = (icg + 2) % fs->fs_ncg;
870 	for (i = 2; i < fs->fs_ncg; i++) {
871 		result = (*allocator)(ip, cg, 0, size);
872 		if (result)
873 			return (result);
874 		cg++;
875 		if (cg == fs->fs_ncg)
876 			cg = 0;
877 	}
878 	return (NULL);
879 }
880 
881 /*
882  * Determine whether a fragment can be extended.
883  *
884  * Check to see if the necessary fragments are available, and
885  * if they are, allocate them.
886  */
887 static daddr_t
888 fragextend(struct inode *ip, int cg, long bprev, int osize, int nsize)
889 {
890 	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
891 	struct fs *fs = ip->i_fs;
892 	struct buf *bp;
893 	struct cg *cgp;
894 	uchar_t *blksfree;
895 	long bno;
896 	int frags, bbase;
897 	int i, j;
898 
899 	if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize))
900 		return (NULL);
901 	frags = numfrags(fs, nsize);
902 	bbase = (int)fragnum(fs, bprev);
903 	if (bbase > fragnum(fs, (bprev + frags - 1))) {
904 		/* cannot extend across a block boundary */
905 		return (NULL);
906 	}
907 
908 	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
909 	    (int)fs->fs_cgsize);
910 	cgp = bp->b_un.b_cg;
911 	if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
912 		brelse(bp);
913 		return (NULL);
914 	}
915 
916 	blksfree = cg_blksfree(cgp);
917 	mutex_enter(&ufsvfsp->vfs_lock);
918 	bno = dtogd(fs, bprev);
919 	for (i = numfrags(fs, osize); i < frags; i++) {
920 		if (isclr(blksfree, bno + i)) {
921 			mutex_exit(&ufsvfsp->vfs_lock);
922 			brelse(bp);
923 			return (NULL);
924 		}
925 		if ((TRANS_ISCANCEL(ufsvfsp, ldbtob(fsbtodb(fs, bprev + i)),
926 		    fs->fs_fsize))) {
927 			mutex_exit(&ufsvfsp->vfs_lock);
928 			brelse(bp);
929 			return (NULL);
930 		}
931 	}
932 
933 	cgp->cg_time = gethrestime_sec();
934 	/*
935 	 * The current fragment can be extended,
936 	 * deduct the count on fragment being extended into
937 	 * increase the count on the remaining fragment (if any)
938 	 * allocate the extended piece.
939 	 */
940 	for (i = frags; i < fs->fs_frag - bbase; i++)
941 		if (isclr(blksfree, bno + i))
942 			break;
943 	j = i - numfrags(fs, osize);
944 	cgp->cg_frsum[j]--;
945 	ASSERT(cgp->cg_frsum[j] >= 0);
946 	if (i != frags)
947 		cgp->cg_frsum[i - frags]++;
948 	for (i = numfrags(fs, osize); i < frags; i++) {
949 		clrbit(blksfree, bno + i);
950 		cgp->cg_cs.cs_nffree--;
951 		fs->fs_cs(fs, cg).cs_nffree--;
952 		fs->fs_cstotal.cs_nffree--;
953 	}
954 	fs->fs_fmod = 1;
955 	ufs_notclean(ufsvfsp);
956 	TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
957 	TRANS_SI(ufsvfsp, fs, cg);
958 	bdrwrite(bp);
959 	return ((daddr_t)bprev);
960 }
961 
962 /*
963  * Determine whether a block can be allocated.
964  *
965  * Check to see if a block of the apprpriate size
966  * is available, and if it is, allocate it.
967  */
968 static daddr_t
969 alloccg(struct inode *ip, int cg, daddr_t bpref, int size)
970 {
971 	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
972 	struct fs *fs = ip->i_fs;
973 	struct buf *bp;
974 	struct cg *cgp;
975 	uchar_t *blksfree;
976 	int bno, frags;
977 	int allocsiz;
978 	int i;
979 
980 	/*
981 	 * Searching for space could be time expensive so do some
982 	 * up front checking to verify that there is actually space
983 	 * available (free blocks or free frags).
984 	 */
985 	if (fs->fs_cs(fs, cg).cs_nbfree == 0) {
986 		if (size == fs->fs_bsize)
987 			return (0);
988 
989 		/*
990 		 * If there are not enough free frags then return.
991 		 */
992 		if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, size))
993 			return (0);
994 	}
995 
996 	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
997 	    (int)fs->fs_cgsize);
998 
999 	cgp = bp->b_un.b_cg;
1000 	if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp) ||
1001 	    (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) {
1002 		brelse(bp);
1003 		return (0);
1004 	}
1005 	blksfree = cg_blksfree(cgp);
1006 	mutex_enter(&ufsvfsp->vfs_lock);
1007 	cgp->cg_time = gethrestime_sec();
1008 	if (size == fs->fs_bsize) {
1009 		if ((bno = alloccgblk(ufsvfsp, cgp, bpref, bp)) == 0)
1010 			goto errout;
1011 		fs->fs_fmod = 1;
1012 		ufs_notclean(ufsvfsp);
1013 		TRANS_SI(ufsvfsp, fs, cg);
1014 		bdrwrite(bp);
1015 		return (bno);
1016 	}
1017 	/*
1018 	 * Check fragment bitmap to see if any fragments are already available.
1019 	 * mapsearch() may fail because the fragment that fits this request
1020 	 * might still be on the cancel list and not available for re-use yet.
1021 	 * Look for a bigger sized fragment to allocate first before we have
1022 	 * to give up and fragment a whole new block eventually.
1023 	 */
1024 	frags = numfrags(fs, size);
1025 	allocsiz = frags;
1026 next_size:
1027 	for (; allocsiz < fs->fs_frag; allocsiz++)
1028 		if (cgp->cg_frsum[allocsiz] != 0)
1029 			break;
1030 
1031 	if (allocsiz != fs->fs_frag) {
1032 		bno = mapsearch(ufsvfsp, cgp, bpref, allocsiz);
1033 		if (bno < 0 && allocsiz < (fs->fs_frag - 1)) {
1034 			allocsiz++;
1035 			goto next_size;
1036 		}
1037 	}
1038 
1039 	if (allocsiz == fs->fs_frag || bno < 0) {
1040 		/*
1041 		 * No fragments were available, so a block
1042 		 * will be allocated and hacked up.
1043 		 */
1044 		if (cgp->cg_cs.cs_nbfree == 0)
1045 			goto errout;
1046 		if ((bno = alloccgblk(ufsvfsp, cgp, bpref, bp)) == 0)
1047 			goto errout;
1048 		bpref = dtogd(fs, bno);
1049 		for (i = frags; i < fs->fs_frag; i++)
1050 			setbit(blksfree, bpref + i);
1051 		i = fs->fs_frag - frags;
1052 		cgp->cg_cs.cs_nffree += i;
1053 		fs->fs_cstotal.cs_nffree += i;
1054 		fs->fs_cs(fs, cg).cs_nffree += i;
1055 		cgp->cg_frsum[i]++;
1056 		fs->fs_fmod = 1;
1057 		ufs_notclean(ufsvfsp);
1058 		TRANS_SI(ufsvfsp, fs, cg);
1059 		bdrwrite(bp);
1060 		return (bno);
1061 	}
1062 
1063 	for (i = 0; i < frags; i++)
1064 		clrbit(blksfree, bno + i);
1065 	cgp->cg_cs.cs_nffree -= frags;
1066 	fs->fs_cstotal.cs_nffree -= frags;
1067 	fs->fs_cs(fs, cg).cs_nffree -= frags;
1068 	cgp->cg_frsum[allocsiz]--;
1069 	ASSERT(cgp->cg_frsum[allocsiz] >= 0);
1070 	if (frags != allocsiz) {
1071 		cgp->cg_frsum[allocsiz - frags]++;
1072 	}
1073 	fs->fs_fmod = 1;
1074 	ufs_notclean(ufsvfsp);
1075 	TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
1076 	TRANS_SI(ufsvfsp, fs, cg);
1077 	bdrwrite(bp);
1078 	return (cg * fs->fs_fpg + bno);
1079 errout:
1080 	mutex_exit(&ufsvfsp->vfs_lock);
1081 	brelse(bp);
1082 	return (0);
1083 }
1084 
1085 /*
1086  * Allocate a block in a cylinder group.
1087  *
1088  * This algorithm implements the following policy:
1089  *   1) allocate the requested block.
1090  *   2) allocate a rotationally optimal block in the same cylinder.
1091  *   3) allocate the next available block on the block rotor for the
1092  *	specified cylinder group.
1093  * Note that this routine only allocates fs_bsize blocks; these
1094  * blocks may be fragmented by the routine that allocates them.
1095  */
1096 static daddr_t
1097 alloccgblk(
1098 	struct ufsvfs *ufsvfsp,
1099 	struct cg *cgp,
1100 	daddr_t bpref,
1101 	struct buf *bp)
1102 {
1103 	daddr_t bno;
1104 	int cylno, pos, delta, rotbl_size;
1105 	short *cylbp;
1106 	int i;
1107 	struct fs *fs;
1108 	uchar_t *blksfree;
1109 	daddr_t blkno, rpos, frag;
1110 	short *blks;
1111 	int32_t *blktot;
1112 
1113 	ASSERT(MUTEX_HELD(&ufsvfsp->vfs_lock));
1114 	fs = ufsvfsp->vfs_fs;
1115 	blksfree = cg_blksfree(cgp);
1116 	if (bpref == 0) {
1117 		bpref = cgp->cg_rotor;
1118 		goto norot;
1119 	}
1120 	bpref = blknum(fs, bpref);
1121 	bpref = dtogd(fs, bpref);
1122 	/*
1123 	 * If the requested block is available, use it.
1124 	 */
1125 	if (isblock(fs, blksfree, (daddr_t)fragstoblks(fs, bpref))) {
1126 		bno = bpref;
1127 		goto gotit;
1128 	}
1129 	/*
1130 	 * Check for a block available on the same cylinder.
1131 	 */
1132 	cylno = cbtocylno(fs, bpref);
1133 	if (cg_blktot(cgp)[cylno] == 0)
1134 		goto norot;
1135 	if (fs->fs_cpc == 0) {
1136 		/*
1137 		 * Block layout info is not available, so just
1138 		 * have to take any block in this cylinder.
1139 		 */
1140 		bpref = howmany(fs->fs_spc * cylno, NSPF(fs));
1141 		goto norot;
1142 	}
1143 	/*
1144 	 * Check the summary information to see if a block is
1145 	 * available in the requested cylinder starting at the
1146 	 * requested rotational position and proceeding around.
1147 	 */
1148 	cylbp = cg_blks(ufsvfsp, cgp, cylno);
1149 	pos = cbtorpos(ufsvfsp, bpref);
1150 	for (i = pos; i < ufsvfsp->vfs_nrpos; i++)
1151 		if (cylbp[i] > 0)
1152 			break;
1153 	if (i == ufsvfsp->vfs_nrpos)
1154 		for (i = 0; i < pos; i++)
1155 			if (cylbp[i] > 0)
1156 				break;
1157 	if (cylbp[i] > 0) {
1158 		/*
1159 		 * Found a rotational position, now find the actual
1160 		 * block.  A "panic" if none is actually there.
1161 		 */
1162 
1163 		/*
1164 		 * Up to this point, "pos" has referred to the rotational
1165 		 * position of the desired block.  From now on, it holds
1166 		 * the offset of the current cylinder within a cylinder
1167 		 * cycle.  (A cylinder cycle refers to a set of cylinders
1168 		 * which are described by a single rotational table; the
1169 		 * size of the cycle is fs_cpc.)
1170 		 *
1171 		 * bno is set to the block number of the first block within
1172 		 * the current cylinder cycle.
1173 		 */
1174 
1175 		pos = cylno % fs->fs_cpc;
1176 		bno = (cylno - pos) * fs->fs_spc / NSPB(fs);
1177 
1178 		/*
1179 		 * The blocks within a cylinder are grouped into equivalence
1180 		 * classes according to their "rotational position."  There
1181 		 * are two tables used to determine these classes.
1182 		 *
1183 		 * The positional offset table (fs_postbl) has an entry for
1184 		 * each rotational position of each cylinder in a cylinder
1185 		 * cycle.  This entry contains the relative block number
1186 		 * (counting from the start of the cylinder cycle) of the
1187 		 * first block in the equivalence class for that position
1188 		 * and that cylinder.  Positions for which no blocks exist
1189 		 * are indicated by a -1.
1190 		 *
1191 		 * The rotational delta table (fs_rotbl) has an entry for
1192 		 * each block in a cylinder cycle.  This entry contains
1193 		 * the offset from that block to the next block in the
1194 		 * same equivalence class.  The last block in the class
1195 		 * is indicated by a zero in the table.
1196 		 *
1197 		 * The following code, then, walks through all of the blocks
1198 		 * in the cylinder (cylno) which we're allocating within
1199 		 * which are in the equivalence class for the rotational
1200 		 * position (i) which we're allocating within.
1201 		 */
1202 
1203 		if (fs_postbl(ufsvfsp, pos)[i] == -1) {
1204 			(void) ufs_fault(ufsvfsp->vfs_root,
1205 			    "alloccgblk: cyl groups corrupted, pos = %d, "
1206 			    "i = %d, fs = %s\n", pos, i, fs->fs_fsmnt);
1207 			return (0);
1208 		}
1209 
1210 		/*
1211 		 * There is one entry in the rotational table for each block
1212 		 * in the cylinder cycle.  These are whole blocks, not frags.
1213 		 */
1214 
1215 		rotbl_size = (fs->fs_cpc * fs->fs_spc) >>
1216 		    (fs->fs_fragshift + fs->fs_fsbtodb);
1217 
1218 		/*
1219 		 * As we start, "i" is the rotational position within which
1220 		 * we're searching.  After the next line, it will be a block
1221 		 * number (relative to the start of the cylinder cycle)
1222 		 * within the equivalence class of that rotational position.
1223 		 */
1224 
1225 		i = fs_postbl(ufsvfsp, pos)[i];
1226 
1227 		for (;;) {
1228 			if (isblock(fs, blksfree, (daddr_t)(bno + i))) {
1229 				bno = blkstofrags(fs, (bno + i));
1230 				goto gotit;
1231 			}
1232 			delta = fs_rotbl(fs)[i];
1233 			if (delta <= 0 ||		/* End of chain, or */
1234 			    delta + i > rotbl_size)	/* end of table? */
1235 				break;			/* If so, panic. */
1236 			i += delta;
1237 		}
1238 		(void) ufs_fault(ufsvfsp->vfs_root,
1239 		    "alloccgblk: can't find blk in cyl, pos:%d, i:%d, "
1240 		    "fs:%s bno: %x\n", pos, i, fs->fs_fsmnt, (int)bno);
1241 		return (0);
1242 	}
1243 norot:
1244 	/*
1245 	 * No blocks in the requested cylinder, so take
1246 	 * next available one in this cylinder group.
1247 	 */
1248 	bno = mapsearch(ufsvfsp, cgp, bpref, (int)fs->fs_frag);
1249 	if (bno < 0)
1250 		return (0);
1251 	cgp->cg_rotor = bno;
1252 gotit:
1253 	blkno = fragstoblks(fs, bno);
1254 	frag = (cgp->cg_cgx * fs->fs_fpg) + bno;
1255 	if (TRANS_ISCANCEL(ufsvfsp, ldbtob(fsbtodb(fs, frag)), fs->fs_bsize))
1256 		goto norot;
1257 	clrblock(fs, blksfree, (long)blkno);
1258 	/*
1259 	 * the other cg/sb/si fields are TRANS'ed by the caller
1260 	 */
1261 	cgp->cg_cs.cs_nbfree--;
1262 	fs->fs_cstotal.cs_nbfree--;
1263 	fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--;
1264 	cylno = cbtocylno(fs, bno);
1265 	blks = cg_blks(ufsvfsp, cgp, cylno);
1266 	rpos = cbtorpos(ufsvfsp, bno);
1267 	blktot = cg_blktot(cgp);
1268 	blks[rpos]--;
1269 	blktot[cylno]--;
1270 	TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
1271 	fs->fs_fmod = 1;
1272 	return (frag);
1273 }
1274 
1275 /*
1276  * Determine whether an inode can be allocated.
1277  *
1278  * Check to see if an inode is available, and if it is,
1279  * allocate it using the following policy:
1280  *   1) allocate the requested inode.
1281  *   2) allocate the next available inode after the requested
1282  *	inode in the specified cylinder group.
1283  */
1284 static ino_t
1285 ialloccg(struct inode *ip, int cg, daddr_t ipref, int mode)
1286 {
1287 	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
1288 	struct fs *fs = ip->i_fs;
1289 	struct cg *cgp;
1290 	struct buf *bp;
1291 	int start, len, loc, map, i;
1292 	char *iused;
1293 
1294 	if (fs->fs_cs(fs, cg).cs_nifree == 0)
1295 		return (0);
1296 	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
1297 	    (int)fs->fs_cgsize);
1298 
1299 	cgp = bp->b_un.b_cg;
1300 	if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp) ||
1301 	    cgp->cg_cs.cs_nifree == 0) {
1302 		brelse(bp);
1303 		return (0);
1304 	}
1305 	iused = cg_inosused(cgp);
1306 	mutex_enter(&ufsvfsp->vfs_lock);
1307 	/*
1308 	 * While we are waiting for the mutex, someone may have taken
1309 	 * the last available inode.  Need to recheck.
1310 	 */
1311 	if (cgp->cg_cs.cs_nifree == 0) {
1312 		mutex_exit(&ufsvfsp->vfs_lock);
1313 		brelse(bp);
1314 		return (0);
1315 	}
1316 
1317 	cgp->cg_time = gethrestime_sec();
1318 	if (ipref) {
1319 		ipref %= fs->fs_ipg;
1320 		if (isclr(iused, ipref))
1321 			goto gotit;
1322 	}
1323 	start = cgp->cg_irotor / NBBY;
1324 	len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY);
1325 	loc = skpc(0xff, (uint_t)len, &iused[start]);
1326 	if (loc == 0) {
1327 		len = start + 1;
1328 		start = 0;
1329 		loc = skpc(0xff, (uint_t)len, &iused[0]);
1330 		if (loc == 0) {
1331 			mutex_exit(&ufsvfsp->vfs_lock);
1332 			(void) ufs_fault(ITOV(ip),
1333 			    "ialloccg: map corrupted, cg = %d, irotor = %d, "
1334 			    "fs = %s\n", cg, (int)cgp->cg_irotor, fs->fs_fsmnt);
1335 			return (0);
1336 		}
1337 	}
1338 	i = start + len - loc;
1339 	map = iused[i];
1340 	ipref = i * NBBY;
1341 	for (i = 1; i < (1 << NBBY); i <<= 1, ipref++) {
1342 		if ((map & i) == 0) {
1343 			cgp->cg_irotor = ipref;
1344 			goto gotit;
1345 		}
1346 	}
1347 
1348 	mutex_exit(&ufsvfsp->vfs_lock);
1349 	(void) ufs_fault(ITOV(ip), "ialloccg: block not in mapfs = %s",
1350 	    fs->fs_fsmnt);
1351 	return (0);
1352 gotit:
1353 	setbit(iused, ipref);
1354 	cgp->cg_cs.cs_nifree--;
1355 	fs->fs_cstotal.cs_nifree--;
1356 	fs->fs_cs(fs, cg).cs_nifree--;
1357 	if (((mode & IFMT) == IFDIR) || ((mode & IFMT) == IFATTRDIR)) {
1358 		cgp->cg_cs.cs_ndir++;
1359 		fs->fs_cstotal.cs_ndir++;
1360 		fs->fs_cs(fs, cg).cs_ndir++;
1361 	}
1362 	fs->fs_fmod = 1;
1363 	ufs_notclean(ufsvfsp);
1364 	TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
1365 	TRANS_SI(ufsvfsp, fs, cg);
1366 	bdrwrite(bp);
1367 	return (cg * fs->fs_ipg + ipref);
1368 }
1369 
1370 /*
1371  * Find a block of the specified size in the specified cylinder group.
1372  *
1373  * It is a panic if a request is made to find a block if none are
1374  * available.
1375  */
1376 static daddr_t
1377 mapsearch(struct ufsvfs *ufsvfsp, struct cg *cgp, daddr_t bpref,
1378 	int allocsiz)
1379 {
1380 	struct fs *fs	= ufsvfsp->vfs_fs;
1381 	daddr_t bno, cfrag;
1382 	int start, len, loc, i, last, first, secondtime;
1383 	int blk, field, subfield, pos;
1384 	int gotit;
1385 
1386 	/*
1387 	 * ufsvfs->vfs_lock is held when calling this.
1388 	 */
1389 	/*
1390 	 * Find the fragment by searching through the
1391 	 * free block map for an appropriate bit pattern.
1392 	 */
1393 	if (bpref)
1394 		start = dtogd(fs, bpref) / NBBY;
1395 	else
1396 		start = cgp->cg_frotor / NBBY;
1397 	/*
1398 	 * the following loop performs two scans -- the first scan
1399 	 * searches the bottom half of the array for a match and the
1400 	 * second scan searches the top half of the array.  The loops
1401 	 * have been merged just to make things difficult.
1402 	 */
1403 	first = start;
1404 	last = howmany(fs->fs_fpg, NBBY);
1405 	secondtime = 0;
1406 	cfrag = cgp->cg_cgx * fs->fs_fpg;
1407 	while (first < last) {
1408 		len = last - first;
1409 		/*
1410 		 * search the array for a match
1411 		 */
1412 		loc = scanc((unsigned)len, (uchar_t *)&cg_blksfree(cgp)[first],
1413 		    (uchar_t *)fragtbl[fs->fs_frag],
1414 		    (int)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
1415 		/*
1416 		 * match found
1417 		 */
1418 		if (loc) {
1419 			bno = (last - loc) * NBBY;
1420 
1421 			/*
1422 			 * Found the byte in the map, sift
1423 			 * through the bits to find the selected frag
1424 			 */
1425 			cgp->cg_frotor = bno;
1426 			gotit = 0;
1427 			for (i = bno + NBBY; bno < i; bno += fs->fs_frag) {
1428 				blk = blkmap(fs, cg_blksfree(cgp), bno);
1429 				blk <<= 1;
1430 				field = around[allocsiz];
1431 				subfield = inside[allocsiz];
1432 				for (pos = 0;
1433 				    pos <= fs->fs_frag - allocsiz;
1434 				    pos++) {
1435 					if ((blk & field) == subfield) {
1436 						gotit++;
1437 						break;
1438 					}
1439 					field <<= 1;
1440 					subfield <<= 1;
1441 				}
1442 				if (gotit)
1443 					break;
1444 			}
1445 			bno += pos;
1446 
1447 			/*
1448 			 * success if block is *not* being converted from
1449 			 * metadata into userdata (harpy).  If so, ignore.
1450 			 */
1451 			if (!TRANS_ISCANCEL(ufsvfsp,
1452 			    ldbtob(fsbtodb(fs, (cfrag+bno))),
1453 			    allocsiz * fs->fs_fsize))
1454 				return (bno);
1455 
1456 			/*
1457 			 * keep looking -- this block is being converted
1458 			 */
1459 			first = (last - loc) + 1;
1460 			loc = 0;
1461 			if (first < last)
1462 				continue;
1463 		}
1464 		/*
1465 		 * no usable matches in bottom half -- now search the top half
1466 		 */
1467 		if (secondtime)
1468 			/*
1469 			 * no usable matches in top half -- all done
1470 			 */
1471 			break;
1472 		secondtime = 1;
1473 		last = start + 1;
1474 		first = 0;
1475 	}
1476 	/*
1477 	 * no usable matches
1478 	 */
1479 	return ((daddr_t)-1);
1480 }
1481 
1482 #define	UFSNADDR (NDADDR + NIADDR)	/* NADDR applies to (obsolete) S5FS */
1483 #define	IB(i)	(NDADDR + (i))	/* index of i'th indirect block ptr */
1484 #define	SINGLE	0		/* single indirect block ptr */
1485 #define	DOUBLE	1		/* double indirect block ptr */
1486 #define	TRIPLE	2		/* triple indirect block ptr */
1487 
1488 /*
1489  * Acquire a write lock, and keep trying till we get it
1490  */
1491 static int
1492 allocsp_wlockfs(struct vnode *vp, struct lockfs *lf)
1493 {
1494 	int err = 0;
1495 
1496 lockagain:
1497 	do {
1498 		err = ufs_fiolfss(vp, lf);
1499 		if (err)
1500 			return (err);
1501 	} while (!LOCKFS_IS_ULOCK(lf));
1502 
1503 	lf->lf_lock = LOCKFS_WLOCK;
1504 	lf->lf_flags = 0;
1505 	lf->lf_comment = NULL;
1506 	err = ufs__fiolfs(vp, lf, 1, 0);
1507 
1508 	if (err == EBUSY || err == EINVAL)
1509 		goto lockagain;
1510 
1511 	return (err);
1512 }
1513 
1514 /*
1515  * Release the write lock
1516  */
1517 static int
1518 allocsp_unlockfs(struct vnode *vp, struct lockfs *lf)
1519 {
1520 	int err = 0;
1521 
1522 	lf->lf_lock = LOCKFS_ULOCK;
1523 	lf->lf_flags = 0;
1524 	err = ufs__fiolfs(vp, lf, 1, 0);
1525 	return (err);
1526 }
1527 
1528 struct allocsp_undo {
1529 	daddr_t offset;
1530 	daddr_t blk;
1531 	struct allocsp_undo *next;
1532 };
1533 
1534 /*
1535  * ufs_allocsp() can be used to pre-allocate blocks for a file on a given
1536  * file system. For direct blocks, the blocks are allocated from the offset
1537  * requested to the block boundary, then any full blocks are allocated,
1538  * and finally any remainder.
1539  * For indirect blocks the blocks are not initialized and are
1540  * only marked as allocated. These addresses are then stored as negative
1541  * block numbers in the inode to imply special handling. UFS has been modified
1542  * where necessary to understand this new notion.
1543  * Successfully fallocated files will have IFALLOCATE cflag set in the inode.
1544  */
1545 int
1546 ufs_allocsp(struct vnode *vp, struct flock64 *lp, cred_t *cr)
1547 {
1548 	struct lockfs lf;
1549 	int berr, err, resv, issync;
1550 	off_t istart, len; /* istart, special for idb */
1551 	struct inode *ip;
1552 	struct fs *fs;
1553 	struct ufsvfs *ufsvfsp;
1554 	u_offset_t resid, i, uoff;
1555 	daddr32_t db_undo[NDADDR];	/* old direct blocks */
1556 	struct allocsp_undo *ib_undo = NULL;	/* ib undo */
1557 	struct allocsp_undo *undo = NULL;
1558 	u_offset_t osz;			/* old file size */
1559 	int chunkblks = 0;		/* # of blocks in 1 allocation */
1560 	int cnt = 0;
1561 	daddr_t allocblk;
1562 	daddr_t totblks = 0;
1563 	struct ulockfs	*ulp;
1564 	size_t done_len;
1565 	int nbytes, offsetn;
1566 
1567 
1568 	ASSERT(vp->v_type == VREG);
1569 
1570 	ip = VTOI(vp);
1571 	fs = ip->i_fs;
1572 	if ((ufsvfsp = ip->i_ufsvfs) == NULL) {
1573 		err = EIO;
1574 		goto out_allocsp;
1575 	}
1576 
1577 	istart = blkroundup(fs, (lp->l_start));
1578 	len = blkroundup(fs, (lp->l_len));
1579 	chunkblks = blkroundup(fs, ufsvfsp->vfs_iotransz) / fs->fs_bsize;
1580 	ulp = &ufsvfsp->vfs_ulockfs;
1581 
1582 	if (lp->l_start < 0 || lp->l_len <= 0)
1583 		return (EINVAL);
1584 
1585 	/* Quickly check to make sure we have space before we proceed */
1586 	if (lblkno(fs, len) > fs->fs_cstotal.cs_nbfree) {
1587 		if (TRANS_ISTRANS(ufsvfsp)) {
1588 			ufs_delete_drain_wait(ufsvfsp, 1);
1589 			if (lblkno(fs, len) > fs->fs_cstotal.cs_nbfree)
1590 				return (ENOSPC);
1591 		} else
1592 			return (ENOSPC);
1593 	}
1594 
1595 	/*
1596 	 * We will keep i_rwlock locked as WRITER through out the function
1597 	 * since we don't want anyone else reading or writing to the inode
1598 	 * while we are in the middle of fallocating the file.
1599 	 */
1600 	rw_enter(&ip->i_rwlock, RW_WRITER);
1601 
1602 	/* Back up the direct block list, used for undo later if necessary */
1603 	rw_enter(&ip->i_contents, RW_READER);
1604 	for (i = 0; i < NDADDR; i++)
1605 		db_undo[i] = ip->i_db[i];
1606 	osz = ip->i_size;
1607 	rw_exit(&ip->i_contents);
1608 
1609 	/* Write lock the file system */
1610 	if (err = allocsp_wlockfs(vp, &lf))
1611 		goto exit;
1612 
1613 	/*
1614 	 * Allocate any direct blocks now.
1615 	 * Blocks are allocated from the offset requested to the block
1616 	 * boundary, then any full blocks are allocated, and finally any
1617 	 * remainder.
1618 	 */
1619 	if (lblkno(fs, lp->l_start) < NDADDR) {
1620 		ufs_trans_trunc_resv(ip, ip->i_size + (NDADDR * fs->fs_bsize),
1621 		    &resv, &resid);
1622 		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv);
1623 
1624 		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1625 		rw_enter(&ip->i_contents, RW_WRITER);
1626 
1627 		done_len = 0;
1628 		while ((done_len < lp->l_len) &&
1629 		    (lblkno(fs, lp->l_start + done_len) < NDADDR)) {
1630 			uoff = (offset_t)(lp->l_start + done_len);
1631 			offsetn = (int)blkoff(fs, uoff);
1632 			nbytes = (int)MIN(fs->fs_bsize - offsetn,
1633 			    lp->l_len - done_len);
1634 
1635 			berr = bmap_write(ip, uoff, offsetn + nbytes,
1636 			    BI_FALLOCATE, &allocblk, cr);
1637 			/* Yikes error, quit */
1638 			if (berr) {
1639 				TRANS_INODE(ufsvfsp, ip);
1640 				rw_exit(&ip->i_contents);
1641 				rw_exit(&ufsvfsp->vfs_dqrwlock);
1642 				TRANS_END_CSYNC(ufsvfsp, err, issync,
1643 				    TOP_ALLOCSP, resv);
1644 				err = allocsp_unlockfs(vp, &lf);
1645 				goto exit;
1646 			}
1647 
1648 			if (allocblk) {
1649 				totblks++;
1650 				if ((uoff + nbytes) > ip->i_size)
1651 					ip->i_size = (uoff + nbytes);
1652 			}
1653 			done_len += nbytes;
1654 		}
1655 
1656 		TRANS_INODE(ufsvfsp, ip);
1657 		rw_exit(&ip->i_contents);
1658 		rw_exit(&ufsvfsp->vfs_dqrwlock);
1659 		TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv);
1660 
1661 		/* start offset for indirect allocation */
1662 		istart =  (uoff + nbytes);
1663 	}
1664 
1665 	/* Break the transactions into vfs_iotransz units */
1666 	ufs_trans_trunc_resv(ip, ip->i_size +
1667 	    blkroundup(fs, ufsvfsp->vfs_iotransz), &resv, &resid);
1668 	TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv);
1669 
1670 	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1671 	rw_enter(&ip->i_contents, RW_WRITER);
1672 
1673 	/* Now go about fallocating necessary indirect blocks */
1674 	for (i = istart; i < (lp->l_start + lp->l_len); i += fs->fs_bsize) {
1675 		berr = bmap_write(ip, i, fs->fs_bsize, BI_FALLOCATE,
1676 		    &allocblk, cr);
1677 		if (berr) {
1678 			TRANS_INODE(ufsvfsp, ip);
1679 			rw_exit(&ip->i_contents);
1680 			rw_exit(&ufsvfsp->vfs_dqrwlock);
1681 			TRANS_END_CSYNC(ufsvfsp, err, issync,
1682 			    TOP_ALLOCSP, resv);
1683 			err = allocsp_unlockfs(vp, &lf);
1684 			goto exit;
1685 		}
1686 
1687 		/* Update the blk counter only if new block was added */
1688 		if (allocblk) {
1689 			/* Save undo information */
1690 			undo = kmem_alloc(sizeof (struct allocsp_undo),
1691 			    KM_SLEEP);
1692 			undo->offset = i;
1693 			undo->blk = allocblk;
1694 			undo->next = ib_undo;
1695 			ib_undo = undo;
1696 			totblks++;
1697 
1698 			if (i >= ip->i_size)
1699 				ip->i_size += fs->fs_bsize;
1700 		}
1701 		cnt++;
1702 
1703 		/* Being a good UFS citizen, let others get a share */
1704 		if (cnt == chunkblks) {
1705 			/*
1706 			 * If there are waiters or the fs is hard locked,
1707 			 * error locked, or read-only error locked,
1708 			 * quit with EIO
1709 			 */
1710 			if (ULOCKFS_IS_HLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp) ||
1711 			    ULOCKFS_IS_ROELOCK(ulp)) {
1712 				ip->i_cflags |= IFALLOCATE;
1713 				TRANS_INODE(ufsvfsp, ip);
1714 				rw_exit(&ip->i_contents);
1715 				rw_exit(&ufsvfsp->vfs_dqrwlock);
1716 
1717 				TRANS_END_CSYNC(ufsvfsp, err, issync,
1718 				    TOP_ALLOCSP, resv);
1719 				rw_exit(&ip->i_rwlock);
1720 				(void) allocsp_unlockfs(vp, &lf);
1721 				return (EIO);
1722 			}
1723 
1724 			TRANS_INODE(ufsvfsp, ip);
1725 			rw_exit(&ip->i_contents);
1726 			rw_exit(&ufsvfsp->vfs_dqrwlock);
1727 
1728 			/* End the current transaction */
1729 			TRANS_END_CSYNC(ufsvfsp, err, issync,
1730 			    TOP_ALLOCSP, resv);
1731 
1732 			if (CV_HAS_WAITERS(&ulp->ul_cv)) {
1733 				/* Release the write lock */
1734 				if (err = allocsp_unlockfs(vp, &lf))
1735 					goto exit;
1736 
1737 				/* Wake up others waiting to do operations */
1738 				mutex_enter(&ulp->ul_lock);
1739 				cv_broadcast(&ulp->ul_cv);
1740 				mutex_exit(&ulp->ul_lock);
1741 
1742 				/* Grab the write lock again */
1743 				if (err = allocsp_wlockfs(vp, &lf))
1744 					goto exit;
1745 			} /* end of CV_HAS_WAITERS(&ulp->ul_cv) */
1746 
1747 			/* Reserve more space in log for this file */
1748 			ufs_trans_trunc_resv(ip,
1749 			    ip->i_size + blkroundup(fs, ufsvfsp->vfs_iotransz),
1750 			    &resv, &resid);
1751 			TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv);
1752 
1753 			rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1754 			rw_enter(&ip->i_contents, RW_WRITER);
1755 
1756 			cnt = 0;	/* reset cnt b/c of new transaction */
1757 		}
1758 	}
1759 
1760 	if (!err && !berr)
1761 		ip->i_cflags |= IFALLOCATE;
1762 
1763 	/* If the file has grown then correct the file size */
1764 	if (osz < (lp->l_start + lp->l_len))
1765 		ip->i_size = (lp->l_start + lp->l_len);
1766 
1767 	/* Release locks, end log transaction and unlock fs */
1768 	TRANS_INODE(ufsvfsp, ip);
1769 	rw_exit(&ip->i_contents);
1770 	rw_exit(&ufsvfsp->vfs_dqrwlock);
1771 
1772 	TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv);
1773 	err = allocsp_unlockfs(vp, &lf);
1774 
1775 	/*
1776 	 * @ exit label, we should no longer be holding the fs write lock, and
1777 	 * all logging transactions should have been ended. We still hold
1778 	 * ip->i_rwlock.
1779 	 */
1780 exit:
1781 	/*
1782 	 * File has grown larger than 2GB. Set flag
1783 	 * in superblock to indicate this, if it
1784 	 * is not already set.
1785 	 */
1786 	if ((ip->i_size > MAXOFF32_T) &&
1787 	    !(fs->fs_flags & FSLARGEFILES)) {
1788 		ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
1789 		mutex_enter(&ufsvfsp->vfs_lock);
1790 		fs->fs_flags |= FSLARGEFILES;
1791 		ufs_sbwrite(ufsvfsp);
1792 		mutex_exit(&ufsvfsp->vfs_lock);
1793 	}
1794 
1795 	/*
1796 	 * Since we couldn't allocate completely, we will undo the allocations.
1797 	 */
1798 	if (berr) {
1799 		ufs_trans_trunc_resv(ip, totblks * fs->fs_bsize, &resv, &resid);
1800 		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv);
1801 
1802 		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1803 		rw_enter(&ip->i_contents, RW_WRITER);
1804 
1805 		/* Direct blocks */
1806 		for (i = 0; i < NDADDR; i++) {
1807 			/*
1808 			 * Only free the block if they are not same, and
1809 			 * the old one isn't zero (the fragment was
1810 			 * re-allocated).
1811 			 */
1812 			if (db_undo[i] != ip->i_db[i] && db_undo[i] == 0) {
1813 				free(ip, ip->i_db[i], fs->fs_bsize, 0);
1814 				ip->i_db[i] = 0;
1815 			}
1816 		}
1817 
1818 		/* Undo the indirect blocks */
1819 		while (ib_undo != NULL) {
1820 			undo = ib_undo;
1821 			err = bmap_set_bn(vp, undo->offset, 0);
1822 			if (err)
1823 				cmn_err(CE_PANIC, "ufs_allocsp(): failed to "
1824 				    "undo allocation of block %ld",
1825 				    undo->offset);
1826 			free(ip, undo->blk, fs->fs_bsize, I_IBLK);
1827 			ib_undo = undo->next;
1828 			kmem_free(undo, sizeof (struct allocsp_undo));
1829 		}
1830 
1831 		ip->i_size = osz;
1832 		TRANS_INODE(ufsvfsp, ip);
1833 
1834 		rw_exit(&ip->i_contents);
1835 		rw_exit(&ufsvfsp->vfs_dqrwlock);
1836 
1837 		TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv);
1838 
1839 		rw_exit(&ip->i_rwlock);
1840 		return (berr);
1841 	}
1842 
1843 	/*
1844 	 * Don't forget to free the undo chain :)
1845 	 */
1846 	while (ib_undo != NULL) {
1847 		undo = ib_undo;
1848 		ib_undo = undo->next;
1849 		kmem_free(undo, sizeof (struct allocsp_undo));
1850 	}
1851 
1852 	rw_exit(&ip->i_rwlock);
1853 
1854 out_allocsp:
1855 	return (err);
1856 }
1857 
1858 /*
1859  * Free storage space associated with the specified inode.  The portion
1860  * to be freed is specified by lp->l_start and lp->l_len (already
1861  * normalized to a "whence" of 0).
1862  *
1863  * This is an experimental facility whose continued existence is not
1864  * guaranteed.  Currently, we only support the special case
1865  * of l_len == 0, meaning free to end of file.
1866  *
1867  * Blocks are freed in reverse order.  This FILO algorithm will tend to
1868  * maintain a contiguous free list much longer than FIFO.
1869  * See also ufs_itrunc() in ufs_inode.c.
1870  *
1871  * Bug: unused bytes in the last retained block are not cleared.
1872  * This may result in a "hole" in the file that does not read as zeroes.
1873  */
1874 /* ARGSUSED */
1875 int
1876 ufs_freesp(struct vnode *vp, struct flock64 *lp, int flag, cred_t *cr)
1877 {
1878 	int i;
1879 	struct inode *ip = VTOI(vp);
1880 	int error;
1881 
1882 	ASSERT(vp->v_type == VREG);
1883 	ASSERT(lp->l_start >= 0);	/* checked by convoff */
1884 
1885 	if (lp->l_len != 0)
1886 		return (EINVAL);
1887 
1888 	rw_enter(&ip->i_contents, RW_READER);
1889 	if (ip->i_size == (u_offset_t)lp->l_start) {
1890 		rw_exit(&ip->i_contents);
1891 		return (0);
1892 	}
1893 
1894 	/*
1895 	 * Check if there is any active mandatory lock on the
1896 	 * range that will be truncated/expanded.
1897 	 */
1898 	if (MANDLOCK(vp, ip->i_mode)) {
1899 		offset_t save_start;
1900 
1901 		save_start = lp->l_start;
1902 
1903 		if (ip->i_size < lp->l_start) {
1904 			/*
1905 			 * "Truncate up" case: need to make sure there
1906 			 * is no lock beyond current end-of-file. To
1907 			 * do so, we need to set l_start to the size
1908 			 * of the file temporarily.
1909 			 */
1910 			lp->l_start = ip->i_size;
1911 		}
1912 		lp->l_type = F_WRLCK;
1913 		lp->l_sysid = 0;
1914 		lp->l_pid = ttoproc(curthread)->p_pid;
1915 		i = (flag & (FNDELAY|FNONBLOCK)) ? 0 : SLPFLCK;
1916 		rw_exit(&ip->i_contents);
1917 		if ((i = reclock(vp, lp, i, 0, lp->l_start, NULL)) != 0 ||
1918 		    lp->l_type != F_UNLCK) {
1919 			return (i ? i : EAGAIN);
1920 		}
1921 		rw_enter(&ip->i_contents, RW_READER);
1922 
1923 		lp->l_start = save_start;
1924 	}
1925 
1926 	/*
1927 	 * Make sure a write isn't in progress (allocating blocks)
1928 	 * by acquiring i_rwlock (we promised ufs_bmap we wouldn't
1929 	 * truncate while it was allocating blocks).
1930 	 * Grab the locks in the right order.
1931 	 */
1932 	rw_exit(&ip->i_contents);
1933 	rw_enter(&ip->i_rwlock, RW_WRITER);
1934 	error = TRANS_ITRUNC(ip, (u_offset_t)lp->l_start, 0, cr);
1935 	rw_exit(&ip->i_rwlock);
1936 	return (error);
1937 }
1938 
1939 /*
1940  * Find a cg with as close to nb contiguous bytes as possible
1941  *	THIS MAY TAKE MANY DISK READS!
1942  *
1943  * Implemented in an attempt to allocate contiguous blocks for
1944  * writing the ufs log file to, minimizing future disk head seeking
1945  */
1946 daddr_t
1947 contigpref(ufsvfs_t *ufsvfsp, size_t nb)
1948 {
1949 	struct fs	*fs	= ufsvfsp->vfs_fs;
1950 	daddr_t		nblk	= lblkno(fs, blkroundup(fs, nb));
1951 	daddr_t		savebno, curbno, cgbno;
1952 	int		cg, cgblks, savecg, savenblk, curnblk;
1953 	uchar_t		*blksfree;
1954 	buf_t		*bp;
1955 	struct cg	*cgp;
1956 
1957 	savenblk = 0;
1958 	savecg = 0;
1959 	savebno = 0;
1960 	for (cg = 0; cg < fs->fs_ncg; ++cg) {
1961 
1962 		/* not enough free blks for a contig check */
1963 		if (fs->fs_cs(fs, cg).cs_nbfree < nblk)
1964 			continue;
1965 
1966 		/*
1967 		 * find the largest contiguous range in this cg
1968 		 */
1969 		bp = UFS_BREAD(ufsvfsp, ufsvfsp->vfs_dev,
1970 		    (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
1971 		    (int)fs->fs_cgsize);
1972 		cgp = bp->b_un.b_cg;
1973 		if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
1974 			brelse(bp);
1975 			continue;
1976 		}
1977 		blksfree = cg_blksfree(cgp);	    /* free array */
1978 		cgblks = fragstoblks(fs, fs->fs_fpg); /* blks in free array */
1979 		cgbno = 0;
1980 		while (cgbno < cgblks && savenblk < nblk) {
1981 			/* find a free block */
1982 			for (; cgbno < cgblks; ++cgbno)
1983 				if (isblock(fs, blksfree, cgbno))
1984 					break;
1985 			curbno = cgbno;
1986 			/* count the number of free blocks */
1987 			for (curnblk = 0; cgbno < cgblks; ++cgbno) {
1988 				if (!isblock(fs, blksfree, cgbno))
1989 					break;
1990 				if (++curnblk >= nblk)
1991 					break;
1992 			}
1993 			if (curnblk > savenblk) {
1994 				savecg = cg;
1995 				savenblk = curnblk;
1996 				savebno = curbno;
1997 			}
1998 		}
1999 		brelse(bp);
2000 		if (savenblk >= nblk)
2001 			break;
2002 	}
2003 
2004 	/* convert block offset in cg to frag offset in cg */
2005 	savebno = blkstofrags(fs, savebno);
2006 
2007 	/* convert frag offset in cg to frag offset in fs */
2008 	savebno += (savecg * fs->fs_fpg);
2009 
2010 	return (savebno);
2011 }
2012