1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
27 /* All Rights Reserved */
28
29 /*
30 * University Copyright- Copyright (c) 1982, 1986, 1988
31 * The Regents of the University of California
32 * All Rights Reserved
33 *
34 * University Acknowledgment- Portions of this document are derived from
35 * software developed by the University of California, Berkeley, and its
36 * contributors.
37 */
38
39 #include <sys/condvar_impl.h>
40 #include <sys/types.h>
41 #include <sys/t_lock.h>
42 #include <sys/debug.h>
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/signal.h>
46 #include <sys/cred.h>
47 #include <sys/proc.h>
48 #include <sys/disp.h>
49 #include <sys/user.h>
50 #include <sys/buf.h>
51 #include <sys/vfs.h>
52 #include <sys/vnode.h>
53 #include <sys/acl.h>
54 #include <sys/fs/ufs_fs.h>
55 #include <sys/fs/ufs_inode.h>
56 #include <sys/fs/ufs_acl.h>
57 #include <sys/fs/ufs_bio.h>
58 #include <sys/fs/ufs_quota.h>
59 #include <sys/kmem.h>
60 #include <sys/fs/ufs_trans.h>
61 #include <sys/fs/ufs_panic.h>
62 #include <sys/errno.h>
63 #include <sys/time.h>
64 #include <sys/sysmacros.h>
65 #include <sys/file.h>
66 #include <sys/fcntl.h>
67 #include <sys/flock.h>
68 #include <fs/fs_subr.h>
69 #include <sys/cmn_err.h>
70 #include <sys/policy.h>
71 #include <sys/fs/ufs_log.h>
72
73 static ino_t hashalloc();
74 static daddr_t fragextend();
75 static daddr_t alloccg();
76 static daddr_t alloccgblk();
77 static ino_t ialloccg();
78 static daddr_t mapsearch();
79 static int findlogstartcg();
80
81 extern int inside[], around[];
82 extern uchar_t *fragtbl[];
83 void delay();
84
85 /*
86 * Allocate a block in the file system.
87 *
88 * The size of the requested block is given, which must be some
89 * multiple of fs_fsize and <= fs_bsize.
90 * A preference may be optionally specified. If a preference is given
91 * the following hierarchy is used to allocate a block:
92 * 1) allocate the requested block.
93 * 2) allocate a rotationally optimal block in the same cylinder.
94 * 3) allocate a block in the same cylinder group.
95 * 4) quadratically rehash into other cylinder groups, until an
96 * available block is located.
97 * If no block preference is given the following hierarchy is used
98 * to allocate a block:
99 * 1) allocate a block in the cylinder group that contains the
100 * inode for the file.
101 * 2) quadratically rehash into other cylinder groups, until an
102 * available block is located.
103 */
104 int
alloc(struct inode * ip,daddr_t bpref,int size,daddr_t * bnp,cred_t * cr)105 alloc(struct inode *ip, daddr_t bpref, int size, daddr_t *bnp, cred_t *cr)
106 {
107 struct fs *fs;
108 struct ufsvfs *ufsvfsp;
109 daddr_t bno;
110 int cg;
111 int err;
112 char *errmsg = NULL;
113 size_t len;
114 clock_t now;
115
116 ufsvfsp = ip->i_ufsvfs;
117 fs = ufsvfsp->vfs_fs;
118 if ((unsigned)size > fs->fs_bsize || fragoff(fs, size) != 0) {
119 err = ufs_fault(ITOV(ip), "alloc: bad size, dev = 0x%lx,"
120 " bsize = %d, size = %d, fs = %s\n",
121 ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt);
122 return (err);
123 }
124 if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
125 goto nospace;
126 if (freespace(fs, ufsvfsp) <= 0 &&
127 secpolicy_fs_minfree(cr, ufsvfsp->vfs_vfs) != 0)
128 goto nospace;
129 err = chkdq(ip, (long)btodb(size), 0, cr, &errmsg, &len);
130 /* Note that may not have err, but may have errmsg */
131 if (errmsg != NULL) {
132 uprintf(errmsg);
133 kmem_free(errmsg, len);
134 errmsg = NULL;
135 }
136 if (err)
137 return (err);
138 if (bpref >= fs->fs_size)
139 bpref = 0;
140 if (bpref == 0)
141 cg = (int)itog(fs, ip->i_number);
142 else
143 cg = dtog(fs, bpref);
144
145 bno = (daddr_t)hashalloc(ip, cg, (long)bpref, size,
146 (ulong_t (*)())alloccg);
147 if (bno > 0) {
148 *bnp = bno;
149 return (0);
150 }
151
152 /*
153 * hashalloc() failed because some other thread grabbed
154 * the last block so unwind the quota operation. We can
155 * ignore the return because subtractions don't fail and
156 * size is guaranteed to be >= zero by our caller.
157 */
158 (void) chkdq(ip, -(long)btodb(size), 0, cr, (char **)NULL,
159 (size_t *)NULL);
160
161 nospace:
162 now = ddi_get_lbolt();
163 mutex_enter(&ufsvfsp->vfs_lock);
164 if ((now - ufsvfsp->vfs_lastwhinetime) > (hz << 2) &&
165 (!(TRANS_ISTRANS(ufsvfsp)) || !(ip->i_flag & IQUIET))) {
166 ufsvfsp->vfs_lastwhinetime = now;
167 cmn_err(CE_NOTE, "alloc: %s: file system full", fs->fs_fsmnt);
168 }
169 mutex_exit(&ufsvfsp->vfs_lock);
170 return (ENOSPC);
171 }
172
173 /*
174 * Reallocate a fragment to a bigger size
175 *
176 * The number and size of the old block is given, and a preference
177 * and new size is also specified. The allocator attempts to extend
178 * the original block. Failing that, the regular block allocator is
179 * invoked to get an appropriate block.
180 */
181 int
realloccg(struct inode * ip,daddr_t bprev,daddr_t bpref,int osize,int nsize,daddr_t * bnp,cred_t * cr)182 realloccg(struct inode *ip, daddr_t bprev, daddr_t bpref, int osize,
183 int nsize, daddr_t *bnp, cred_t *cr)
184 {
185 daddr_t bno;
186 struct fs *fs;
187 struct ufsvfs *ufsvfsp;
188 int cg, request;
189 int err;
190 char *errmsg = NULL;
191 size_t len;
192 clock_t now;
193
194 ufsvfsp = ip->i_ufsvfs;
195 fs = ufsvfsp->vfs_fs;
196 if ((unsigned)osize > fs->fs_bsize || fragoff(fs, osize) != 0 ||
197 (unsigned)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) {
198 err = ufs_fault(ITOV(ip),
199 "realloccg: bad size, dev=0x%lx, bsize=%d, "
200 "osize=%d, nsize=%d, fs=%s\n",
201 ip->i_dev, fs->fs_bsize, osize, nsize, fs->fs_fsmnt);
202 return (err);
203 }
204 if (freespace(fs, ufsvfsp) <= 0 &&
205 secpolicy_fs_minfree(cr, ufsvfsp->vfs_vfs) != 0)
206 goto nospace;
207 if (bprev == 0) {
208 err = ufs_fault(ITOV(ip),
209 "realloccg: bad bprev, dev = 0x%lx, bsize = %d,"
210 " bprev = %ld, fs = %s\n", ip->i_dev, fs->fs_bsize, bprev,
211 fs->fs_fsmnt);
212 return (err);
213 }
214 err = chkdq(ip, (long)btodb(nsize - osize), 0, cr, &errmsg, &len);
215 /* Note that may not have err, but may have errmsg */
216 if (errmsg != NULL) {
217 uprintf(errmsg);
218 kmem_free(errmsg, len);
219 errmsg = NULL;
220 }
221 if (err)
222 return (err);
223 cg = dtog(fs, bprev);
224 bno = fragextend(ip, cg, (long)bprev, osize, nsize);
225 if (bno != 0) {
226 *bnp = bno;
227 return (0);
228 }
229 if (bpref >= fs->fs_size)
230 bpref = 0;
231
232 /*
233 * When optimizing for time we allocate a full block and
234 * then only use the upper portion for this request. When
235 * this file grows again it will grow into the unused portion
236 * of the block (See fragextend() above). This saves time
237 * because an extra disk write would be needed if the frags
238 * following the current allocation were not free. The extra
239 * disk write is needed to move the data from its current
240 * location into the newly allocated position.
241 *
242 * When optimizing for space we allocate a run of frags
243 * that is just the right size for this request.
244 */
245 request = (fs->fs_optim == FS_OPTTIME) ? fs->fs_bsize : nsize;
246 bno = (daddr_t)hashalloc(ip, cg, (long)bpref, request,
247 (ulong_t (*)())alloccg);
248 if (bno > 0) {
249 *bnp = bno;
250 if (nsize < request)
251 (void) free(ip, bno + numfrags(fs, nsize),
252 (off_t)(request - nsize), I_NOCANCEL);
253 return (0);
254 }
255
256 /*
257 * hashalloc() failed because some other thread grabbed
258 * the last block so unwind the quota operation. We can
259 * ignore the return because subtractions don't fail, and
260 * our caller guarantees nsize >= osize.
261 */
262 (void) chkdq(ip, -(long)btodb(nsize - osize), 0, cr, (char **)NULL,
263 (size_t *)NULL);
264
265 nospace:
266 now = ddi_get_lbolt();
267 mutex_enter(&ufsvfsp->vfs_lock);
268 if ((now - ufsvfsp->vfs_lastwhinetime) > (hz << 2) &&
269 (!(TRANS_ISTRANS(ufsvfsp)) || !(ip->i_flag & IQUIET))) {
270 ufsvfsp->vfs_lastwhinetime = now;
271 cmn_err(CE_NOTE,
272 "realloccg %s: file system full", fs->fs_fsmnt);
273 }
274 mutex_exit(&ufsvfsp->vfs_lock);
275 return (ENOSPC);
276 }
277
278 /*
279 * Allocate an inode in the file system.
280 *
281 * A preference may be optionally specified. If a preference is given
282 * the following hierarchy is used to allocate an inode:
283 * 1) allocate the requested inode.
284 * 2) allocate an inode in the same cylinder group.
285 * 3) quadratically rehash into other cylinder groups, until an
286 * available inode is located.
287 * If no inode preference is given the following hierarchy is used
288 * to allocate an inode:
289 * 1) allocate an inode in cylinder group 0.
290 * 2) quadratically rehash into other cylinder groups, until an
291 * available inode is located.
292 */
293 int
ufs_ialloc(struct inode * pip,ino_t ipref,mode_t mode,struct inode ** ipp,cred_t * cr)294 ufs_ialloc(struct inode *pip,
295 ino_t ipref, mode_t mode, struct inode **ipp, cred_t *cr)
296 {
297 struct inode *ip;
298 struct fs *fs;
299 int cg;
300 ino_t ino;
301 int err;
302 int nifree;
303 struct ufsvfs *ufsvfsp = pip->i_ufsvfs;
304 char *errmsg = NULL;
305 size_t len;
306
307 ASSERT(RW_WRITE_HELD(&pip->i_rwlock));
308 fs = pip->i_fs;
309 loop:
310 nifree = fs->fs_cstotal.cs_nifree;
311
312 if (nifree == 0)
313 goto noinodes;
314 /*
315 * Shadow inodes don't count against a user's inode allocation.
316 * They are an implementation method and not a resource.
317 */
318 if ((mode != IFSHAD) && (mode != IFATTRDIR)) {
319 err = chkiq((struct ufsvfs *)ITOV(pip)->v_vfsp->vfs_data,
320 /* change */ 1, (struct inode *)NULL, crgetuid(cr), 0,
321 cr, &errmsg, &len);
322 /*
323 * As we haven't acquired any locks yet, dump the message
324 * now.
325 */
326 if (errmsg != NULL) {
327 uprintf(errmsg);
328 kmem_free(errmsg, len);
329 errmsg = NULL;
330 }
331 if (err)
332 return (err);
333 }
334
335 if (ipref >= (ulong_t)(fs->fs_ncg * fs->fs_ipg))
336 ipref = 0;
337 cg = (int)itog(fs, ipref);
338 ino = (ino_t)hashalloc(pip, cg, (long)ipref, (int)mode,
339 (ulong_t (*)())ialloccg);
340 if (ino == 0) {
341 if ((mode != IFSHAD) && (mode != IFATTRDIR)) {
342 /*
343 * We can safely ignore the return from chkiq()
344 * because deallocations can only fail if we
345 * can't get the user's quota info record off
346 * the disk due to an I/O error. In that case,
347 * the quota subsystem is already messed up.
348 */
349 (void) chkiq(ufsvfsp, /* change */ -1,
350 (struct inode *)NULL, crgetuid(cr), 0, cr,
351 (char **)NULL, (size_t *)NULL);
352 }
353 goto noinodes;
354 }
355 err = ufs_iget(pip->i_vfs, ino, ipp, cr);
356 if (err) {
357 if ((mode != IFSHAD) && (mode != IFATTRDIR)) {
358 /*
359 * See above comment about why it is safe to ignore an
360 * error return here.
361 */
362 (void) chkiq(ufsvfsp, /* change */ -1,
363 (struct inode *)NULL, crgetuid(cr), 0, cr,
364 (char **)NULL, (size_t *)NULL);
365 }
366 ufs_ifree(pip, ino, 0);
367 return (err);
368 }
369 ip = *ipp;
370 ASSERT(!ip->i_ufs_acl);
371 ASSERT(!ip->i_dquot);
372 rw_enter(&ip->i_contents, RW_WRITER);
373
374 /*
375 * Check if we really got a free inode, if not then complain
376 * and mark the inode ISTALE so that it will be freed by the
377 * ufs idle thread eventually and will not be sent to ufs_delete().
378 */
379 if (ip->i_mode || (ip->i_nlink > 0)) {
380 ip->i_flag |= ISTALE;
381 rw_exit(&ip->i_contents);
382 VN_RELE(ITOV(ip));
383 cmn_err(CE_WARN,
384 "%s: unexpected allocated inode %d, run fsck(8)%s",
385 fs->fs_fsmnt, (int)ino,
386 (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
387 goto loop;
388 }
389
390 /*
391 * Check the inode has no size or data blocks.
392 * This could have happened if the truncation failed when
393 * deleting the inode. It used to be possible for this to occur
394 * if a block allocation failed when iteratively truncating a
395 * large file using logging and with a full file system.
396 * This was fixed with bug fix 4348738. However, truncation may
397 * still fail on an IO error. So in all cases for safety and
398 * security we clear out the size; the blocks allocated; and
399 * pointers to the blocks. This will ultimately cause a fsck
400 * error of un-accounted for blocks, but its a fairly benign error,
401 * and possibly the correct thing to do anyway as accesssing those
402 * blocks agains may lead to more IO errors.
403 */
404 if (ip->i_size || ip->i_blocks) {
405 int i;
406
407 if (ip->i_size) {
408 cmn_err(CE_WARN,
409 "%s: free inode %d had size 0x%llx, run fsck(8)%s",
410 fs->fs_fsmnt, (int)ino, ip->i_size,
411 (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
412 }
413 /*
414 * Clear any garbage left behind.
415 */
416 ip->i_size = (u_offset_t)0;
417 ip->i_blocks = 0;
418 for (i = 0; i < NDADDR; i++)
419 ip->i_db[i] = 0;
420 for (i = 0; i < NIADDR; i++)
421 ip->i_ib[i] = 0;
422 }
423
424 /*
425 * Initialize the link count
426 */
427 ip->i_nlink = 0;
428
429 /*
430 * Clear the old flags
431 */
432 ip->i_flag &= IREF;
433
434 /*
435 * Access times are not really defined if the fs is mounted
436 * with 'noatime'. But it can cause nfs clients to fail
437 * open() if the atime is not a legal value. Set a legal value
438 * here when the inode is allocated.
439 */
440 if (ufsvfsp->vfs_noatime) {
441 mutex_enter(&ufs_iuniqtime_lock);
442 ip->i_atime = iuniqtime;
443 mutex_exit(&ufs_iuniqtime_lock);
444 }
445 rw_exit(&ip->i_contents);
446 return (0);
447 noinodes:
448 if (!(TRANS_ISTRANS(ufsvfsp)) || !(pip->i_flag & IQUIET))
449 cmn_err(CE_NOTE, "%s: out of inodes\n", fs->fs_fsmnt);
450 return (ENOSPC);
451 }
452
453 /*
454 * Find a cylinder group to place a directory.
455 * Returns an inumber within the selected cylinder group.
456 * Note, the vfs_lock is not needed as we don't require exact cg summary info.
457 *
458 * If the switch ufs_close_dirs is set, then the policy is to use
459 * the current cg if it has more than 25% free inodes and more
460 * than 25% free blocks. Otherwise the cgs are searched from
461 * the beginning and the first cg with the same criteria is
462 * used. If that is also null then we revert to the old algorithm.
463 * This tends to cluster files at the beginning of the disk
464 * until the disk gets full.
465 *
466 * Otherwise if ufs_close_dirs is not set then the original policy is
467 * used which is to select from among those cylinder groups with
468 * above the average number of free inodes, the one with the smallest
469 * number of directories.
470 */
471
472 int ufs_close_dirs = 1; /* allocate directories close as possible */
473
474 ino_t
dirpref(inode_t * dp)475 dirpref(inode_t *dp)
476 {
477 int cg, minndir, mincg, avgifree, mininode, minbpg, ifree;
478 struct fs *fs = dp->i_fs;
479
480 cg = itog(fs, dp->i_number);
481 mininode = fs->fs_ipg >> 2;
482 minbpg = fs->fs_maxbpg >> 2;
483 if (ufs_close_dirs &&
484 (fs->fs_cs(fs, cg).cs_nifree > mininode) &&
485 (fs->fs_cs(fs, cg).cs_nbfree > minbpg)) {
486 return (dp->i_number);
487 }
488
489 avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg;
490 minndir = fs->fs_ipg;
491 mincg = 0;
492 for (cg = 0; cg < fs->fs_ncg; cg++) {
493 ifree = fs->fs_cs(fs, cg).cs_nifree;
494 if (ufs_close_dirs &&
495 (ifree > mininode) &&
496 (fs->fs_cs(fs, cg).cs_nbfree > minbpg)) {
497 return ((ino_t)(fs->fs_ipg * cg));
498 }
499 if ((fs->fs_cs(fs, cg).cs_ndir < minndir) &&
500 (ifree >= avgifree)) {
501 mincg = cg;
502 minndir = fs->fs_cs(fs, cg).cs_ndir;
503 }
504 }
505 return ((ino_t)(fs->fs_ipg * mincg));
506 }
507
508 /*
509 * Select the desired position for the next block in a file. The file is
510 * logically divided into sections. The first section is composed of the
511 * direct blocks. Each additional section contains fs_maxbpg blocks.
512 *
513 * If no blocks have been allocated in the first section, the policy is to
514 * request a block in the same cylinder group as the inode that describes
515 * the file. If no blocks have been allocated in any other section, the
516 * policy is to place the section in a cylinder group with a greater than
517 * average number of free blocks. An appropriate cylinder group is found
518 * by using a rotor that sweeps the cylinder groups. When a new group of
519 * blocks is needed, the sweep begins in the cylinder group following the
520 * cylinder group from which the previous allocation was made. The sweep
521 * continues until a cylinder group with greater than the average number
522 * of free blocks is found. If the allocation is for the first block in an
523 * indirect block, the information on the previous allocation is unavailable;
524 * here a best guess is made based upon the logical block number being
525 * allocated.
526 *
527 * If a section is already partially allocated, the policy is to
528 * contiguously allocate fs_maxcontig blocks. The end of one of these
529 * contiguous blocks and the beginning of the next is physically separated
530 * so that the disk head will be in transit between them for at least
531 * fs_rotdelay milliseconds. This is to allow time for the processor to
532 * schedule another I/O transfer.
533 */
534 daddr_t
blkpref(struct inode * ip,daddr_t lbn,int indx,daddr32_t * bap)535 blkpref(struct inode *ip, daddr_t lbn, int indx, daddr32_t *bap)
536 {
537 struct fs *fs;
538 struct ufsvfs *ufsvfsp;
539 int cg;
540 int avgbfree, startcg;
541 daddr_t nextblk;
542
543 ufsvfsp = ip->i_ufsvfs;
544 fs = ip->i_fs;
545 if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
546 if (lbn < NDADDR) {
547 cg = itog(fs, ip->i_number);
548 return (fs->fs_fpg * cg + fs->fs_frag);
549 }
550 /*
551 * Find a cylinder with greater than average
552 * number of unused data blocks.
553 */
554 if (indx == 0 || bap[indx - 1] == 0)
555 startcg = itog(fs, ip->i_number) + lbn / fs->fs_maxbpg;
556 else
557 startcg = dtog(fs, bap[indx - 1]) + 1;
558 startcg %= fs->fs_ncg;
559
560 mutex_enter(&ufsvfsp->vfs_lock);
561 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
562 /*
563 * used for computing log space for writes/truncs
564 */
565 ufsvfsp->vfs_avgbfree = avgbfree;
566 for (cg = startcg; cg < fs->fs_ncg; cg++)
567 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
568 fs->fs_cgrotor = cg;
569 mutex_exit(&ufsvfsp->vfs_lock);
570 return (fs->fs_fpg * cg + fs->fs_frag);
571 }
572 for (cg = 0; cg <= startcg; cg++)
573 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
574 fs->fs_cgrotor = cg;
575 mutex_exit(&ufsvfsp->vfs_lock);
576 return (fs->fs_fpg * cg + fs->fs_frag);
577 }
578 mutex_exit(&ufsvfsp->vfs_lock);
579 return (0);
580 }
581 /*
582 * One or more previous blocks have been laid out. If less
583 * than fs_maxcontig previous blocks are contiguous, the
584 * next block is requested contiguously, otherwise it is
585 * requested rotationally delayed by fs_rotdelay milliseconds.
586 */
587
588 nextblk = bap[indx - 1];
589 /*
590 * Provision for fallocate to return positive
591 * blk preference based on last allocation
592 */
593 if (nextblk < 0 && nextblk != UFS_HOLE) {
594 nextblk = (-bap[indx - 1]) + fs->fs_frag;
595 } else {
596 nextblk = bap[indx - 1] + fs->fs_frag;
597 }
598
599 if (indx > fs->fs_maxcontig && bap[indx - fs->fs_maxcontig] +
600 blkstofrags(fs, fs->fs_maxcontig) != nextblk) {
601 return (nextblk);
602 }
603 if (fs->fs_rotdelay != 0)
604 /*
605 * Here we convert ms of delay to frags as:
606 * (frags) = (ms) * (rev/sec) * (sect/rev) /
607 * ((sect/frag) * (ms/sec))
608 * then round up to the next block.
609 */
610 nextblk += roundup(fs->fs_rotdelay * fs->fs_rps * fs->fs_nsect /
611 (NSPF(fs) * 1000), fs->fs_frag);
612 return (nextblk);
613 }
614
615 /*
616 * Free a block or fragment.
617 *
618 * The specified block or fragment is placed back in the
619 * free map. If a fragment is deallocated, a possible
620 * block reassembly is checked.
621 */
622 void
free(struct inode * ip,daddr_t bno,off_t size,int flags)623 free(struct inode *ip, daddr_t bno, off_t size, int flags)
624 {
625 struct fs *fs = ip->i_fs;
626 struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
627 struct ufs_q *delq = &ufsvfsp->vfs_delete;
628 struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;
629 struct cg *cgp;
630 struct buf *bp;
631 int cg, bmap, bbase;
632 int i;
633 uchar_t *blksfree;
634 int *blktot;
635 short *blks;
636 daddr_t blkno, cylno, rpos;
637
638 /*
639 * fallocate'd files will have negative block address.
640 * So negate it again to get original block address.
641 */
642 if (bno < 0 && (bno % fs->fs_frag == 0) && bno != UFS_HOLE) {
643 bno = -bno;
644 }
645
646 if ((unsigned long)size > fs->fs_bsize || fragoff(fs, size) != 0) {
647 (void) ufs_fault(ITOV(ip),
648 "free: bad size, dev = 0x%lx, bsize = %d, size = %d, "
649 "fs = %s\n", ip->i_dev, fs->fs_bsize,
650 (int)size, fs->fs_fsmnt);
651 return;
652 }
653 cg = dtog(fs, bno);
654 ASSERT(!ufs_badblock(ip, bno));
655 bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
656 (int)fs->fs_cgsize);
657
658 cgp = bp->b_un.b_cg;
659 if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
660 brelse(bp);
661 return;
662 }
663
664 if (!(flags & I_NOCANCEL))
665 TRANS_CANCEL(ufsvfsp, ldbtob(fsbtodb(fs, bno)), size, flags);
666 if (flags & (I_DIR|I_IBLK|I_SHAD|I_QUOTA)) {
667 TRANS_MATA_FREE(ufsvfsp, ldbtob(fsbtodb(fs, bno)), size);
668 }
669 blksfree = cg_blksfree(cgp);
670 blktot = cg_blktot(cgp);
671 mutex_enter(&ufsvfsp->vfs_lock);
672 cgp->cg_time = gethrestime_sec();
673 bno = dtogd(fs, bno);
674 if (size == fs->fs_bsize) {
675 blkno = fragstoblks(fs, bno);
676 cylno = cbtocylno(fs, bno);
677 rpos = cbtorpos(ufsvfsp, bno);
678 blks = cg_blks(ufsvfsp, cgp, cylno);
679 if (!isclrblock(fs, blksfree, blkno)) {
680 mutex_exit(&ufsvfsp->vfs_lock);
681 brelse(bp);
682 (void) ufs_fault(ITOV(ip), "free: freeing free block, "
683 "dev:0x%lx, block:%ld, ino:%lu, fs:%s",
684 ip->i_dev, bno, ip->i_number, fs->fs_fsmnt);
685 return;
686 }
687 setblock(fs, blksfree, blkno);
688 blks[rpos]++;
689 blktot[cylno]++;
690 cgp->cg_cs.cs_nbfree++; /* Log below */
691 fs->fs_cstotal.cs_nbfree++;
692 fs->fs_cs(fs, cg).cs_nbfree++;
693 if (TRANS_ISTRANS(ufsvfsp) && (flags & I_ACCT)) {
694 mutex_enter(&delq->uq_mutex);
695 delq_info->delq_unreclaimed_blocks -=
696 btodb(fs->fs_bsize);
697 mutex_exit(&delq->uq_mutex);
698 }
699 } else {
700 bbase = bno - fragnum(fs, bno);
701 /*
702 * Decrement the counts associated with the old frags
703 */
704 bmap = blkmap(fs, blksfree, bbase);
705 fragacct(fs, bmap, cgp->cg_frsum, -1);
706 /*
707 * Deallocate the fragment
708 */
709 for (i = 0; i < numfrags(fs, size); i++) {
710 if (isset(blksfree, bno + i)) {
711 brelse(bp);
712 mutex_exit(&ufsvfsp->vfs_lock);
713 (void) ufs_fault(ITOV(ip),
714 "free: freeing free frag, "
715 "dev:0x%lx, blk:%ld, cg:%d, "
716 "ino:%lu, fs:%s",
717 ip->i_dev,
718 bno + i,
719 cgp->cg_cgx,
720 ip->i_number,
721 fs->fs_fsmnt);
722 return;
723 }
724 setbit(blksfree, bno + i);
725 }
726 cgp->cg_cs.cs_nffree += i;
727 fs->fs_cstotal.cs_nffree += i;
728 fs->fs_cs(fs, cg).cs_nffree += i;
729 if (TRANS_ISTRANS(ufsvfsp) && (flags & I_ACCT)) {
730 mutex_enter(&delq->uq_mutex);
731 delq_info->delq_unreclaimed_blocks -=
732 btodb(i * fs->fs_fsize);
733 mutex_exit(&delq->uq_mutex);
734 }
735 /*
736 * Add back in counts associated with the new frags
737 */
738 bmap = blkmap(fs, blksfree, bbase);
739 fragacct(fs, bmap, cgp->cg_frsum, 1);
740 /*
741 * If a complete block has been reassembled, account for it
742 */
743 blkno = fragstoblks(fs, bbase);
744 if (isblock(fs, blksfree, blkno)) {
745 cylno = cbtocylno(fs, bbase);
746 rpos = cbtorpos(ufsvfsp, bbase);
747 blks = cg_blks(ufsvfsp, cgp, cylno);
748 blks[rpos]++;
749 blktot[cylno]++;
750 cgp->cg_cs.cs_nffree -= fs->fs_frag;
751 fs->fs_cstotal.cs_nffree -= fs->fs_frag;
752 fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
753 cgp->cg_cs.cs_nbfree++;
754 fs->fs_cstotal.cs_nbfree++;
755 fs->fs_cs(fs, cg).cs_nbfree++;
756 }
757 }
758 fs->fs_fmod = 1;
759 ufs_notclean(ufsvfsp);
760 TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
761 TRANS_SI(ufsvfsp, fs, cg);
762 bdrwrite(bp);
763 }
764
765 /*
766 * Free an inode.
767 *
768 * The specified inode is placed back in the free map.
769 */
770 void
ufs_ifree(struct inode * ip,ino_t ino,mode_t mode)771 ufs_ifree(struct inode *ip, ino_t ino, mode_t mode)
772 {
773 struct fs *fs = ip->i_fs;
774 struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
775 struct cg *cgp;
776 struct buf *bp;
777 unsigned int inot;
778 int cg;
779 char *iused;
780
781 if (ip->i_number == ino && ip->i_mode != 0) {
782 (void) ufs_fault(ITOV(ip),
783 "ufs_ifree: illegal mode: (imode) %o, (omode) %o, ino %d, "
784 "fs = %s\n",
785 ip->i_mode, mode, (int)ip->i_number, fs->fs_fsmnt);
786 return;
787 }
788 if (ino >= fs->fs_ipg * fs->fs_ncg) {
789 (void) ufs_fault(ITOV(ip),
790 "ifree: range, dev = 0x%x, ino = %d, fs = %s\n",
791 (int)ip->i_dev, (int)ino, fs->fs_fsmnt);
792 return;
793 }
794 cg = (int)itog(fs, ino);
795 bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
796 (int)fs->fs_cgsize);
797
798 cgp = bp->b_un.b_cg;
799 if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
800 brelse(bp);
801 return;
802 }
803 mutex_enter(&ufsvfsp->vfs_lock);
804 cgp->cg_time = gethrestime_sec();
805 iused = cg_inosused(cgp);
806 inot = (unsigned int)(ino % (ulong_t)fs->fs_ipg);
807 if (isclr(iused, inot)) {
808 mutex_exit(&ufsvfsp->vfs_lock);
809 brelse(bp);
810 (void) ufs_fault(ITOV(ip), "ufs_ifree: freeing free inode, "
811 "mode: (imode) %o, (omode) %o, ino:%d, "
812 "fs:%s",
813 ip->i_mode, mode, (int)ino, fs->fs_fsmnt);
814 return;
815 }
816 clrbit(iused, inot);
817
818 if (inot < (ulong_t)cgp->cg_irotor)
819 cgp->cg_irotor = inot;
820 cgp->cg_cs.cs_nifree++;
821 fs->fs_cstotal.cs_nifree++;
822 fs->fs_cs(fs, cg).cs_nifree++;
823 if (((mode & IFMT) == IFDIR) || ((mode & IFMT) == IFATTRDIR)) {
824 cgp->cg_cs.cs_ndir--;
825 fs->fs_cstotal.cs_ndir--;
826 fs->fs_cs(fs, cg).cs_ndir--;
827 }
828 fs->fs_fmod = 1;
829 ufs_notclean(ufsvfsp);
830 TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
831 TRANS_SI(ufsvfsp, fs, cg);
832 bdrwrite(bp);
833 }
834
835 /*
836 * Implement the cylinder overflow algorithm.
837 *
838 * The policy implemented by this algorithm is:
839 * 1) allocate the block in its requested cylinder group.
840 * 2) quadratically rehash on the cylinder group number.
841 * 3) brute force search for a free block.
842 * The size parameter means size for data blocks, mode for inodes.
843 */
844 static ino_t
hashalloc(struct inode * ip,int cg,long pref,int size,ulong_t (* allocator)())845 hashalloc(struct inode *ip, int cg, long pref, int size, ulong_t (*allocator)())
846 {
847 struct fs *fs;
848 int i;
849 long result;
850 int icg = cg;
851
852 fs = ip->i_fs;
853 /*
854 * 1: preferred cylinder group
855 */
856 result = (*allocator)(ip, cg, pref, size);
857 if (result)
858 return (result);
859 /*
860 * 2: quadratic rehash
861 */
862 for (i = 1; i < fs->fs_ncg; i *= 2) {
863 cg += i;
864 if (cg >= fs->fs_ncg)
865 cg -= fs->fs_ncg;
866 result = (*allocator)(ip, cg, 0, size);
867 if (result)
868 return (result);
869 }
870 /*
871 * 3: brute force search
872 * Note that we start at i == 2, since 0 was checked initially,
873 * and 1 is always checked in the quadratic rehash.
874 */
875 cg = (icg + 2) % fs->fs_ncg;
876 for (i = 2; i < fs->fs_ncg; i++) {
877 result = (*allocator)(ip, cg, 0, size);
878 if (result)
879 return (result);
880 cg++;
881 if (cg == fs->fs_ncg)
882 cg = 0;
883 }
884 return (0);
885 }
886
887 /*
888 * Determine whether a fragment can be extended.
889 *
890 * Check to see if the necessary fragments are available, and
891 * if they are, allocate them.
892 */
893 static daddr_t
fragextend(struct inode * ip,int cg,long bprev,int osize,int nsize)894 fragextend(struct inode *ip, int cg, long bprev, int osize, int nsize)
895 {
896 struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
897 struct fs *fs = ip->i_fs;
898 struct buf *bp;
899 struct cg *cgp;
900 uchar_t *blksfree;
901 long bno;
902 int frags, bbase;
903 int i, j;
904
905 if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize))
906 return (0);
907 frags = numfrags(fs, nsize);
908 bbase = (int)fragnum(fs, bprev);
909 if (bbase > fragnum(fs, (bprev + frags - 1))) {
910 /* cannot extend across a block boundary */
911 return (0);
912 }
913
914 bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
915 (int)fs->fs_cgsize);
916 cgp = bp->b_un.b_cg;
917 if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
918 brelse(bp);
919 return (0);
920 }
921
922 blksfree = cg_blksfree(cgp);
923 mutex_enter(&ufsvfsp->vfs_lock);
924 bno = dtogd(fs, bprev);
925 for (i = numfrags(fs, osize); i < frags; i++) {
926 if (isclr(blksfree, bno + i)) {
927 mutex_exit(&ufsvfsp->vfs_lock);
928 brelse(bp);
929 return (0);
930 }
931 if ((TRANS_ISCANCEL(ufsvfsp, ldbtob(fsbtodb(fs, bprev + i)),
932 fs->fs_fsize))) {
933 mutex_exit(&ufsvfsp->vfs_lock);
934 brelse(bp);
935 return (0);
936 }
937 }
938
939 cgp->cg_time = gethrestime_sec();
940 /*
941 * The current fragment can be extended,
942 * deduct the count on fragment being extended into
943 * increase the count on the remaining fragment (if any)
944 * allocate the extended piece.
945 */
946 for (i = frags; i < fs->fs_frag - bbase; i++)
947 if (isclr(blksfree, bno + i))
948 break;
949 j = i - numfrags(fs, osize);
950 cgp->cg_frsum[j]--;
951 ASSERT(cgp->cg_frsum[j] >= 0);
952 if (i != frags)
953 cgp->cg_frsum[i - frags]++;
954 for (i = numfrags(fs, osize); i < frags; i++) {
955 clrbit(blksfree, bno + i);
956 cgp->cg_cs.cs_nffree--;
957 fs->fs_cs(fs, cg).cs_nffree--;
958 fs->fs_cstotal.cs_nffree--;
959 }
960 fs->fs_fmod = 1;
961 ufs_notclean(ufsvfsp);
962 TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
963 TRANS_SI(ufsvfsp, fs, cg);
964 bdrwrite(bp);
965 return ((daddr_t)bprev);
966 }
967
968 /*
969 * Determine whether a block can be allocated.
970 *
971 * Check to see if a block of the apprpriate size
972 * is available, and if it is, allocate it.
973 */
974 static daddr_t
alloccg(struct inode * ip,int cg,daddr_t bpref,int size)975 alloccg(struct inode *ip, int cg, daddr_t bpref, int size)
976 {
977 struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
978 struct fs *fs = ip->i_fs;
979 struct buf *bp;
980 struct cg *cgp;
981 uchar_t *blksfree;
982 int bno, frags;
983 int allocsiz;
984 int i;
985
986 /*
987 * Searching for space could be time expensive so do some
988 * up front checking to verify that there is actually space
989 * available (free blocks or free frags).
990 */
991 if (fs->fs_cs(fs, cg).cs_nbfree == 0) {
992 if (size == fs->fs_bsize)
993 return (0);
994
995 /*
996 * If there are not enough free frags then return.
997 */
998 if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, size))
999 return (0);
1000 }
1001
1002 bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
1003 (int)fs->fs_cgsize);
1004
1005 cgp = bp->b_un.b_cg;
1006 if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp) ||
1007 (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) {
1008 brelse(bp);
1009 return (0);
1010 }
1011 blksfree = cg_blksfree(cgp);
1012 mutex_enter(&ufsvfsp->vfs_lock);
1013 cgp->cg_time = gethrestime_sec();
1014 if (size == fs->fs_bsize) {
1015 if ((bno = alloccgblk(ufsvfsp, cgp, bpref, bp)) == 0)
1016 goto errout;
1017 fs->fs_fmod = 1;
1018 ufs_notclean(ufsvfsp);
1019 TRANS_SI(ufsvfsp, fs, cg);
1020 bdrwrite(bp);
1021 return (bno);
1022 }
1023 /*
1024 * Check fragment bitmap to see if any fragments are already available.
1025 * mapsearch() may fail because the fragment that fits this request
1026 * might still be on the cancel list and not available for re-use yet.
1027 * Look for a bigger sized fragment to allocate first before we have
1028 * to give up and fragment a whole new block eventually.
1029 */
1030 frags = numfrags(fs, size);
1031 allocsiz = frags;
1032 next_size:
1033 for (; allocsiz < fs->fs_frag; allocsiz++)
1034 if (cgp->cg_frsum[allocsiz] != 0)
1035 break;
1036
1037 if (allocsiz != fs->fs_frag) {
1038 bno = mapsearch(ufsvfsp, cgp, bpref, allocsiz);
1039 if (bno < 0 && allocsiz < (fs->fs_frag - 1)) {
1040 allocsiz++;
1041 goto next_size;
1042 }
1043 }
1044
1045 if (allocsiz == fs->fs_frag || bno < 0) {
1046 /*
1047 * No fragments were available, so a block
1048 * will be allocated and hacked up.
1049 */
1050 if (cgp->cg_cs.cs_nbfree == 0)
1051 goto errout;
1052 if ((bno = alloccgblk(ufsvfsp, cgp, bpref, bp)) == 0)
1053 goto errout;
1054 bpref = dtogd(fs, bno);
1055 for (i = frags; i < fs->fs_frag; i++)
1056 setbit(blksfree, bpref + i);
1057 i = fs->fs_frag - frags;
1058 cgp->cg_cs.cs_nffree += i;
1059 fs->fs_cstotal.cs_nffree += i;
1060 fs->fs_cs(fs, cg).cs_nffree += i;
1061 cgp->cg_frsum[i]++;
1062 fs->fs_fmod = 1;
1063 ufs_notclean(ufsvfsp);
1064 TRANS_SI(ufsvfsp, fs, cg);
1065 bdrwrite(bp);
1066 return (bno);
1067 }
1068
1069 for (i = 0; i < frags; i++)
1070 clrbit(blksfree, bno + i);
1071 cgp->cg_cs.cs_nffree -= frags;
1072 fs->fs_cstotal.cs_nffree -= frags;
1073 fs->fs_cs(fs, cg).cs_nffree -= frags;
1074 cgp->cg_frsum[allocsiz]--;
1075 ASSERT(cgp->cg_frsum[allocsiz] >= 0);
1076 if (frags != allocsiz) {
1077 cgp->cg_frsum[allocsiz - frags]++;
1078 }
1079 fs->fs_fmod = 1;
1080 ufs_notclean(ufsvfsp);
1081 TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
1082 TRANS_SI(ufsvfsp, fs, cg);
1083 bdrwrite(bp);
1084 return (cg * fs->fs_fpg + bno);
1085 errout:
1086 mutex_exit(&ufsvfsp->vfs_lock);
1087 brelse(bp);
1088 return (0);
1089 }
1090
1091 /*
1092 * Allocate a block in a cylinder group.
1093 *
1094 * This algorithm implements the following policy:
1095 * 1) allocate the requested block.
1096 * 2) allocate a rotationally optimal block in the same cylinder.
1097 * 3) allocate the next available block on the block rotor for the
1098 * specified cylinder group.
1099 * Note that this routine only allocates fs_bsize blocks; these
1100 * blocks may be fragmented by the routine that allocates them.
1101 */
1102 static daddr_t
alloccgblk(struct ufsvfs * ufsvfsp,struct cg * cgp,daddr_t bpref,struct buf * bp)1103 alloccgblk(
1104 struct ufsvfs *ufsvfsp,
1105 struct cg *cgp,
1106 daddr_t bpref,
1107 struct buf *bp)
1108 {
1109 daddr_t bno;
1110 int cylno, pos, delta, rotbl_size;
1111 short *cylbp;
1112 int i;
1113 struct fs *fs;
1114 uchar_t *blksfree;
1115 daddr_t blkno, rpos, frag;
1116 short *blks;
1117 int32_t *blktot;
1118
1119 ASSERT(MUTEX_HELD(&ufsvfsp->vfs_lock));
1120 fs = ufsvfsp->vfs_fs;
1121 blksfree = cg_blksfree(cgp);
1122 if (bpref == 0) {
1123 bpref = cgp->cg_rotor;
1124 goto norot;
1125 }
1126 bpref = blknum(fs, bpref);
1127 bpref = dtogd(fs, bpref);
1128 /*
1129 * If the requested block is available, use it.
1130 */
1131 if (isblock(fs, blksfree, (daddr_t)fragstoblks(fs, bpref))) {
1132 bno = bpref;
1133 goto gotit;
1134 }
1135 /*
1136 * Check for a block available on the same cylinder.
1137 */
1138 cylno = cbtocylno(fs, bpref);
1139 if (cg_blktot(cgp)[cylno] == 0)
1140 goto norot;
1141 if (fs->fs_cpc == 0) {
1142 /*
1143 * Block layout info is not available, so just
1144 * have to take any block in this cylinder.
1145 */
1146 bpref = howmany(fs->fs_spc * cylno, NSPF(fs));
1147 goto norot;
1148 }
1149 /*
1150 * Check the summary information to see if a block is
1151 * available in the requested cylinder starting at the
1152 * requested rotational position and proceeding around.
1153 */
1154 cylbp = cg_blks(ufsvfsp, cgp, cylno);
1155 pos = cbtorpos(ufsvfsp, bpref);
1156 for (i = pos; i < ufsvfsp->vfs_nrpos; i++)
1157 if (cylbp[i] > 0)
1158 break;
1159 if (i == ufsvfsp->vfs_nrpos)
1160 for (i = 0; i < pos; i++)
1161 if (cylbp[i] > 0)
1162 break;
1163 if (cylbp[i] > 0) {
1164 /*
1165 * Found a rotational position, now find the actual
1166 * block. A "panic" if none is actually there.
1167 */
1168
1169 /*
1170 * Up to this point, "pos" has referred to the rotational
1171 * position of the desired block. From now on, it holds
1172 * the offset of the current cylinder within a cylinder
1173 * cycle. (A cylinder cycle refers to a set of cylinders
1174 * which are described by a single rotational table; the
1175 * size of the cycle is fs_cpc.)
1176 *
1177 * bno is set to the block number of the first block within
1178 * the current cylinder cycle.
1179 */
1180
1181 pos = cylno % fs->fs_cpc;
1182 bno = (cylno - pos) * fs->fs_spc / NSPB(fs);
1183
1184 /*
1185 * The blocks within a cylinder are grouped into equivalence
1186 * classes according to their "rotational position." There
1187 * are two tables used to determine these classes.
1188 *
1189 * The positional offset table (fs_postbl) has an entry for
1190 * each rotational position of each cylinder in a cylinder
1191 * cycle. This entry contains the relative block number
1192 * (counting from the start of the cylinder cycle) of the
1193 * first block in the equivalence class for that position
1194 * and that cylinder. Positions for which no blocks exist
1195 * are indicated by a -1.
1196 *
1197 * The rotational delta table (fs_rotbl) has an entry for
1198 * each block in a cylinder cycle. This entry contains
1199 * the offset from that block to the next block in the
1200 * same equivalence class. The last block in the class
1201 * is indicated by a zero in the table.
1202 *
1203 * The following code, then, walks through all of the blocks
1204 * in the cylinder (cylno) which we're allocating within
1205 * which are in the equivalence class for the rotational
1206 * position (i) which we're allocating within.
1207 */
1208
1209 if (fs_postbl(ufsvfsp, pos)[i] == -1) {
1210 (void) ufs_fault(ufsvfsp->vfs_root,
1211 "alloccgblk: cyl groups corrupted, pos = %d, "
1212 "i = %d, fs = %s\n", pos, i, fs->fs_fsmnt);
1213 return (0);
1214 }
1215
1216 /*
1217 * There is one entry in the rotational table for each block
1218 * in the cylinder cycle. These are whole blocks, not frags.
1219 */
1220
1221 rotbl_size = (fs->fs_cpc * fs->fs_spc) >>
1222 (fs->fs_fragshift + fs->fs_fsbtodb);
1223
1224 /*
1225 * As we start, "i" is the rotational position within which
1226 * we're searching. After the next line, it will be a block
1227 * number (relative to the start of the cylinder cycle)
1228 * within the equivalence class of that rotational position.
1229 */
1230
1231 i = fs_postbl(ufsvfsp, pos)[i];
1232
1233 for (;;) {
1234 if (isblock(fs, blksfree, (daddr_t)(bno + i))) {
1235 bno = blkstofrags(fs, (bno + i));
1236 goto gotit;
1237 }
1238 delta = fs_rotbl(fs)[i];
1239 if (delta <= 0 || /* End of chain, or */
1240 delta + i > rotbl_size) /* end of table? */
1241 break; /* If so, panic. */
1242 i += delta;
1243 }
1244 (void) ufs_fault(ufsvfsp->vfs_root,
1245 "alloccgblk: can't find blk in cyl, pos:%d, i:%d, "
1246 "fs:%s bno: %x\n", pos, i, fs->fs_fsmnt, (int)bno);
1247 return (0);
1248 }
1249 norot:
1250 /*
1251 * No blocks in the requested cylinder, so take
1252 * next available one in this cylinder group.
1253 */
1254 bno = mapsearch(ufsvfsp, cgp, bpref, (int)fs->fs_frag);
1255 if (bno < 0)
1256 return (0);
1257 cgp->cg_rotor = bno;
1258 gotit:
1259 blkno = fragstoblks(fs, bno);
1260 frag = (cgp->cg_cgx * fs->fs_fpg) + bno;
1261 if (TRANS_ISCANCEL(ufsvfsp, ldbtob(fsbtodb(fs, frag)), fs->fs_bsize))
1262 goto norot;
1263 clrblock(fs, blksfree, (long)blkno);
1264 /*
1265 * the other cg/sb/si fields are TRANS'ed by the caller
1266 */
1267 cgp->cg_cs.cs_nbfree--;
1268 fs->fs_cstotal.cs_nbfree--;
1269 fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--;
1270 cylno = cbtocylno(fs, bno);
1271 blks = cg_blks(ufsvfsp, cgp, cylno);
1272 rpos = cbtorpos(ufsvfsp, bno);
1273 blktot = cg_blktot(cgp);
1274 blks[rpos]--;
1275 blktot[cylno]--;
1276 TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
1277 fs->fs_fmod = 1;
1278 return (frag);
1279 }
1280
1281 /*
1282 * Determine whether an inode can be allocated.
1283 *
1284 * Check to see if an inode is available, and if it is,
1285 * allocate it using the following policy:
1286 * 1) allocate the requested inode.
1287 * 2) allocate the next available inode after the requested
1288 * inode in the specified cylinder group.
1289 */
1290 static ino_t
ialloccg(struct inode * ip,int cg,daddr_t ipref,int mode)1291 ialloccg(struct inode *ip, int cg, daddr_t ipref, int mode)
1292 {
1293 struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
1294 struct fs *fs = ip->i_fs;
1295 struct cg *cgp;
1296 struct buf *bp;
1297 int start, len, loc, map, i;
1298 char *iused;
1299
1300 if (fs->fs_cs(fs, cg).cs_nifree == 0)
1301 return (0);
1302 bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
1303 (int)fs->fs_cgsize);
1304
1305 cgp = bp->b_un.b_cg;
1306 if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp) ||
1307 cgp->cg_cs.cs_nifree == 0) {
1308 brelse(bp);
1309 return (0);
1310 }
1311 iused = cg_inosused(cgp);
1312 mutex_enter(&ufsvfsp->vfs_lock);
1313 /*
1314 * While we are waiting for the mutex, someone may have taken
1315 * the last available inode. Need to recheck.
1316 */
1317 if (cgp->cg_cs.cs_nifree == 0) {
1318 mutex_exit(&ufsvfsp->vfs_lock);
1319 brelse(bp);
1320 return (0);
1321 }
1322
1323 cgp->cg_time = gethrestime_sec();
1324 if (ipref) {
1325 ipref %= fs->fs_ipg;
1326 if (isclr(iused, ipref))
1327 goto gotit;
1328 }
1329 start = cgp->cg_irotor / NBBY;
1330 len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY);
1331 loc = skpc(0xff, (uint_t)len, &iused[start]);
1332 if (loc == 0) {
1333 len = start + 1;
1334 start = 0;
1335 loc = skpc(0xff, (uint_t)len, &iused[0]);
1336 if (loc == 0) {
1337 mutex_exit(&ufsvfsp->vfs_lock);
1338 (void) ufs_fault(ITOV(ip),
1339 "ialloccg: map corrupted, cg = %d, irotor = %d, "
1340 "fs = %s\n", cg, (int)cgp->cg_irotor, fs->fs_fsmnt);
1341 return (0);
1342 }
1343 }
1344 i = start + len - loc;
1345 map = iused[i];
1346 ipref = i * NBBY;
1347 for (i = 1; i < (1 << NBBY); i <<= 1, ipref++) {
1348 if ((map & i) == 0) {
1349 cgp->cg_irotor = ipref;
1350 goto gotit;
1351 }
1352 }
1353
1354 mutex_exit(&ufsvfsp->vfs_lock);
1355 (void) ufs_fault(ITOV(ip), "ialloccg: block not in mapfs = %s",
1356 fs->fs_fsmnt);
1357 return (0);
1358 gotit:
1359 setbit(iused, ipref);
1360 cgp->cg_cs.cs_nifree--;
1361 fs->fs_cstotal.cs_nifree--;
1362 fs->fs_cs(fs, cg).cs_nifree--;
1363 if (((mode & IFMT) == IFDIR) || ((mode & IFMT) == IFATTRDIR)) {
1364 cgp->cg_cs.cs_ndir++;
1365 fs->fs_cstotal.cs_ndir++;
1366 fs->fs_cs(fs, cg).cs_ndir++;
1367 }
1368 fs->fs_fmod = 1;
1369 ufs_notclean(ufsvfsp);
1370 TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
1371 TRANS_SI(ufsvfsp, fs, cg);
1372 bdrwrite(bp);
1373 return (cg * fs->fs_ipg + ipref);
1374 }
1375
1376 /*
1377 * Find a block of the specified size in the specified cylinder group.
1378 *
1379 * It is a panic if a request is made to find a block if none are
1380 * available.
1381 */
1382 static daddr_t
mapsearch(struct ufsvfs * ufsvfsp,struct cg * cgp,daddr_t bpref,int allocsiz)1383 mapsearch(struct ufsvfs *ufsvfsp, struct cg *cgp, daddr_t bpref, int allocsiz)
1384 {
1385 struct fs *fs = ufsvfsp->vfs_fs;
1386 daddr_t bno, cfrag;
1387 int start, len, loc, i, last, first, secondtime;
1388 int blk, field, subfield, pos;
1389 int gotit;
1390
1391 /*
1392 * ufsvfs->vfs_lock is held when calling this.
1393 */
1394 /*
1395 * Find the fragment by searching through the
1396 * free block map for an appropriate bit pattern.
1397 */
1398 if (bpref)
1399 start = dtogd(fs, bpref) / NBBY;
1400 else
1401 start = cgp->cg_frotor / NBBY;
1402 /*
1403 * the following loop performs two scans -- the first scan
1404 * searches the bottom half of the array for a match and the
1405 * second scan searches the top half of the array. The loops
1406 * have been merged just to make things difficult.
1407 */
1408 first = start;
1409 last = howmany(fs->fs_fpg, NBBY);
1410 secondtime = 0;
1411 cfrag = cgp->cg_cgx * fs->fs_fpg;
1412 while (first < last) {
1413 len = last - first;
1414 /*
1415 * search the array for a match
1416 */
1417 loc = scanc((unsigned)len, (uchar_t *)&cg_blksfree(cgp)[first],
1418 (uchar_t *)fragtbl[fs->fs_frag],
1419 (int)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
1420 /*
1421 * match found
1422 */
1423 if (loc) {
1424 bno = (last - loc) * NBBY;
1425
1426 /*
1427 * Found the byte in the map, sift
1428 * through the bits to find the selected frag
1429 */
1430 cgp->cg_frotor = bno;
1431 gotit = 0;
1432 for (i = bno + NBBY; bno < i; bno += fs->fs_frag) {
1433 blk = blkmap(fs, cg_blksfree(cgp), bno);
1434 blk <<= 1;
1435 field = around[allocsiz];
1436 subfield = inside[allocsiz];
1437 for (pos = 0;
1438 pos <= fs->fs_frag - allocsiz;
1439 pos++) {
1440 if ((blk & field) == subfield) {
1441 gotit++;
1442 break;
1443 }
1444 field <<= 1;
1445 subfield <<= 1;
1446 }
1447 if (gotit)
1448 break;
1449 }
1450 bno += pos;
1451
1452 /*
1453 * success if block is *not* being converted from
1454 * metadata into userdata (harpy). If so, ignore.
1455 */
1456 if (!TRANS_ISCANCEL(ufsvfsp,
1457 ldbtob(fsbtodb(fs, (cfrag+bno))),
1458 allocsiz * fs->fs_fsize))
1459 return (bno);
1460
1461 /*
1462 * keep looking -- this block is being converted
1463 */
1464 first = (last - loc) + 1;
1465 loc = 0;
1466 if (first < last)
1467 continue;
1468 }
1469 /*
1470 * no usable matches in bottom half -- now search the top half
1471 */
1472 if (secondtime)
1473 /*
1474 * no usable matches in top half -- all done
1475 */
1476 break;
1477 secondtime = 1;
1478 last = start + 1;
1479 first = 0;
1480 }
1481 /*
1482 * no usable matches
1483 */
1484 return ((daddr_t)-1);
1485 }
1486
1487 #define UFSNADDR (NDADDR + NIADDR) /* NADDR applies to (obsolete) S5FS */
1488 #define IB(i) (NDADDR + (i)) /* index of i'th indirect block ptr */
1489 #define SINGLE 0 /* single indirect block ptr */
1490 #define DOUBLE 1 /* double indirect block ptr */
1491 #define TRIPLE 2 /* triple indirect block ptr */
1492
1493 /*
1494 * Acquire a write lock, and keep trying till we get it
1495 */
1496 static int
allocsp_wlockfs(struct vnode * vp,struct lockfs * lf)1497 allocsp_wlockfs(struct vnode *vp, struct lockfs *lf)
1498 {
1499 int err = 0;
1500
1501 lockagain:
1502 do {
1503 err = ufs_fiolfss(vp, lf);
1504 if (err)
1505 return (err);
1506 } while (!LOCKFS_IS_ULOCK(lf));
1507
1508 lf->lf_lock = LOCKFS_WLOCK;
1509 lf->lf_flags = 0;
1510 lf->lf_comment = NULL;
1511 err = ufs__fiolfs(vp, lf, 1, 0);
1512
1513 if (err == EBUSY || err == EINVAL)
1514 goto lockagain;
1515
1516 return (err);
1517 }
1518
1519 /*
1520 * Release the write lock
1521 */
1522 static int
allocsp_unlockfs(struct vnode * vp,struct lockfs * lf)1523 allocsp_unlockfs(struct vnode *vp, struct lockfs *lf)
1524 {
1525 int err = 0;
1526
1527 lf->lf_lock = LOCKFS_ULOCK;
1528 lf->lf_flags = 0;
1529 err = ufs__fiolfs(vp, lf, 1, 0);
1530 return (err);
1531 }
1532
1533 struct allocsp_undo {
1534 daddr_t offset;
1535 daddr_t blk;
1536 struct allocsp_undo *next;
1537 };
1538
1539 /*
1540 * ufs_allocsp() can be used to pre-allocate blocks for a file on a given
1541 * file system. For direct blocks, the blocks are allocated from the offset
1542 * requested to the block boundary, then any full blocks are allocated,
1543 * and finally any remainder.
1544 * For indirect blocks the blocks are not initialized and are
1545 * only marked as allocated. These addresses are then stored as negative
1546 * block numbers in the inode to imply special handling. UFS has been modified
1547 * where necessary to understand this new notion.
1548 * Successfully fallocated files will have IFALLOCATE cflag set in the inode.
1549 */
1550 int
ufs_allocsp(struct vnode * vp,struct flock64 * lp,cred_t * cr)1551 ufs_allocsp(struct vnode *vp, struct flock64 *lp, cred_t *cr)
1552 {
1553 struct lockfs lf;
1554 int berr, err, resv, issync;
1555 off_t istart, len; /* istart, special for idb */
1556 struct inode *ip;
1557 struct fs *fs;
1558 struct ufsvfs *ufsvfsp;
1559 u_offset_t resid, i, uoff;
1560 daddr32_t db_undo[NDADDR]; /* old direct blocks */
1561 struct allocsp_undo *ib_undo = NULL; /* ib undo */
1562 struct allocsp_undo *undo = NULL;
1563 u_offset_t osz; /* old file size */
1564 int chunkblks = 0; /* # of blocks in 1 allocation */
1565 int cnt = 0;
1566 daddr_t allocblk;
1567 daddr_t totblks = 0;
1568 struct ulockfs *ulp;
1569 size_t done_len;
1570 int nbytes, offsetn;
1571
1572
1573 ASSERT(vp->v_type == VREG);
1574
1575 ip = VTOI(vp);
1576 fs = ip->i_fs;
1577 if ((ufsvfsp = ip->i_ufsvfs) == NULL) {
1578 err = EIO;
1579 goto out_allocsp;
1580 }
1581
1582 istart = blkroundup(fs, (lp->l_start));
1583 len = blkroundup(fs, (lp->l_len));
1584 chunkblks = blkroundup(fs, ufsvfsp->vfs_iotransz) / fs->fs_bsize;
1585 ulp = &ufsvfsp->vfs_ulockfs;
1586
1587 if (lp->l_start < 0 || lp->l_len <= 0)
1588 return (EINVAL);
1589
1590 /* Quickly check to make sure we have space before we proceed */
1591 if (lblkno(fs, len) > fs->fs_cstotal.cs_nbfree) {
1592 if (TRANS_ISTRANS(ufsvfsp)) {
1593 ufs_delete_drain_wait(ufsvfsp, 1);
1594 if (lblkno(fs, len) > fs->fs_cstotal.cs_nbfree)
1595 return (ENOSPC);
1596 } else
1597 return (ENOSPC);
1598 }
1599
1600 /*
1601 * We will keep i_rwlock locked as WRITER through out the function
1602 * since we don't want anyone else reading or writing to the inode
1603 * while we are in the middle of fallocating the file.
1604 */
1605 rw_enter(&ip->i_rwlock, RW_WRITER);
1606
1607 /* Back up the direct block list, used for undo later if necessary */
1608 rw_enter(&ip->i_contents, RW_READER);
1609 for (i = 0; i < NDADDR; i++)
1610 db_undo[i] = ip->i_db[i];
1611 osz = ip->i_size;
1612 rw_exit(&ip->i_contents);
1613
1614 /* Write lock the file system */
1615 if (err = allocsp_wlockfs(vp, &lf))
1616 goto exit;
1617
1618 /*
1619 * Allocate any direct blocks now.
1620 * Blocks are allocated from the offset requested to the block
1621 * boundary, then any full blocks are allocated, and finally any
1622 * remainder.
1623 */
1624 if (lblkno(fs, lp->l_start) < NDADDR) {
1625 ufs_trans_trunc_resv(ip, ip->i_size + (NDADDR * fs->fs_bsize),
1626 &resv, &resid);
1627 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv);
1628
1629 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1630 rw_enter(&ip->i_contents, RW_WRITER);
1631
1632 done_len = 0;
1633 while ((done_len < lp->l_len) &&
1634 (lblkno(fs, lp->l_start + done_len) < NDADDR)) {
1635 uoff = (offset_t)(lp->l_start + done_len);
1636 offsetn = (int)blkoff(fs, uoff);
1637 nbytes = (int)MIN(fs->fs_bsize - offsetn,
1638 lp->l_len - done_len);
1639
1640 berr = bmap_write(ip, uoff, offsetn + nbytes,
1641 BI_FALLOCATE, &allocblk, cr);
1642 /* Yikes error, quit */
1643 if (berr) {
1644 TRANS_INODE(ufsvfsp, ip);
1645 rw_exit(&ip->i_contents);
1646 rw_exit(&ufsvfsp->vfs_dqrwlock);
1647 TRANS_END_CSYNC(ufsvfsp, err, issync,
1648 TOP_ALLOCSP, resv);
1649 err = allocsp_unlockfs(vp, &lf);
1650 goto exit;
1651 }
1652
1653 if (allocblk) {
1654 totblks++;
1655 if ((uoff + nbytes) > ip->i_size)
1656 ip->i_size = (uoff + nbytes);
1657 }
1658 done_len += nbytes;
1659 }
1660
1661 TRANS_INODE(ufsvfsp, ip);
1662 rw_exit(&ip->i_contents);
1663 rw_exit(&ufsvfsp->vfs_dqrwlock);
1664 TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv);
1665
1666 /* start offset for indirect allocation */
1667 istart = (uoff + nbytes);
1668 }
1669
1670 /* Break the transactions into vfs_iotransz units */
1671 ufs_trans_trunc_resv(ip, ip->i_size +
1672 blkroundup(fs, ufsvfsp->vfs_iotransz), &resv, &resid);
1673 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv);
1674
1675 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1676 rw_enter(&ip->i_contents, RW_WRITER);
1677
1678 /* Now go about fallocating necessary indirect blocks */
1679 for (i = istart; i < (lp->l_start + lp->l_len); i += fs->fs_bsize) {
1680 berr = bmap_write(ip, i, fs->fs_bsize, BI_FALLOCATE,
1681 &allocblk, cr);
1682 if (berr) {
1683 TRANS_INODE(ufsvfsp, ip);
1684 rw_exit(&ip->i_contents);
1685 rw_exit(&ufsvfsp->vfs_dqrwlock);
1686 TRANS_END_CSYNC(ufsvfsp, err, issync,
1687 TOP_ALLOCSP, resv);
1688 err = allocsp_unlockfs(vp, &lf);
1689 goto exit;
1690 }
1691
1692 /* Update the blk counter only if new block was added */
1693 if (allocblk) {
1694 /* Save undo information */
1695 undo = kmem_alloc(sizeof (struct allocsp_undo),
1696 KM_SLEEP);
1697 undo->offset = i;
1698 undo->blk = allocblk;
1699 undo->next = ib_undo;
1700 ib_undo = undo;
1701 totblks++;
1702
1703 if (i >= ip->i_size)
1704 ip->i_size += fs->fs_bsize;
1705 }
1706 cnt++;
1707
1708 /* Being a good UFS citizen, let others get a share */
1709 if (cnt == chunkblks) {
1710 /*
1711 * If there are waiters or the fs is hard locked,
1712 * error locked, or read-only error locked,
1713 * quit with EIO
1714 */
1715 if (ULOCKFS_IS_HLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp) ||
1716 ULOCKFS_IS_ROELOCK(ulp)) {
1717 ip->i_cflags |= IFALLOCATE;
1718 TRANS_INODE(ufsvfsp, ip);
1719 rw_exit(&ip->i_contents);
1720 rw_exit(&ufsvfsp->vfs_dqrwlock);
1721
1722 TRANS_END_CSYNC(ufsvfsp, err, issync,
1723 TOP_ALLOCSP, resv);
1724 rw_exit(&ip->i_rwlock);
1725 (void) allocsp_unlockfs(vp, &lf);
1726 return (EIO);
1727 }
1728
1729 TRANS_INODE(ufsvfsp, ip);
1730 rw_exit(&ip->i_contents);
1731 rw_exit(&ufsvfsp->vfs_dqrwlock);
1732
1733 /* End the current transaction */
1734 TRANS_END_CSYNC(ufsvfsp, err, issync,
1735 TOP_ALLOCSP, resv);
1736
1737 if (CV_HAS_WAITERS(&ulp->ul_cv)) {
1738 /* Release the write lock */
1739 if (err = allocsp_unlockfs(vp, &lf))
1740 goto exit;
1741
1742 /* Wake up others waiting to do operations */
1743 mutex_enter(&ulp->ul_lock);
1744 cv_broadcast(&ulp->ul_cv);
1745 mutex_exit(&ulp->ul_lock);
1746
1747 /* Grab the write lock again */
1748 if (err = allocsp_wlockfs(vp, &lf))
1749 goto exit;
1750 } /* end of CV_HAS_WAITERS(&ulp->ul_cv) */
1751
1752 /* Reserve more space in log for this file */
1753 ufs_trans_trunc_resv(ip,
1754 ip->i_size + blkroundup(fs, ufsvfsp->vfs_iotransz),
1755 &resv, &resid);
1756 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv);
1757
1758 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1759 rw_enter(&ip->i_contents, RW_WRITER);
1760
1761 cnt = 0; /* reset cnt b/c of new transaction */
1762 }
1763 }
1764
1765 if (!err && !berr)
1766 ip->i_cflags |= IFALLOCATE;
1767
1768 /* If the file has grown then correct the file size */
1769 if (osz < (lp->l_start + lp->l_len))
1770 ip->i_size = (lp->l_start + lp->l_len);
1771
1772 /* Release locks, end log transaction and unlock fs */
1773 TRANS_INODE(ufsvfsp, ip);
1774 rw_exit(&ip->i_contents);
1775 rw_exit(&ufsvfsp->vfs_dqrwlock);
1776
1777 TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv);
1778 err = allocsp_unlockfs(vp, &lf);
1779
1780 /*
1781 * @ exit label, we should no longer be holding the fs write lock, and
1782 * all logging transactions should have been ended. We still hold
1783 * ip->i_rwlock.
1784 */
1785 exit:
1786 /*
1787 * File has grown larger than 2GB. Set flag
1788 * in superblock to indicate this, if it
1789 * is not already set.
1790 */
1791 if ((ip->i_size > MAXOFF32_T) &&
1792 !(fs->fs_flags & FSLARGEFILES)) {
1793 ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
1794 mutex_enter(&ufsvfsp->vfs_lock);
1795 fs->fs_flags |= FSLARGEFILES;
1796 ufs_sbwrite(ufsvfsp);
1797 mutex_exit(&ufsvfsp->vfs_lock);
1798 }
1799
1800 /*
1801 * Since we couldn't allocate completely, we will undo the allocations.
1802 */
1803 if (berr) {
1804 ufs_trans_trunc_resv(ip, totblks * fs->fs_bsize, &resv, &resid);
1805 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv);
1806
1807 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1808 rw_enter(&ip->i_contents, RW_WRITER);
1809
1810 /* Direct blocks */
1811 for (i = 0; i < NDADDR; i++) {
1812 /*
1813 * Only free the block if they are not same, and
1814 * the old one isn't zero (the fragment was
1815 * re-allocated).
1816 */
1817 if (db_undo[i] != ip->i_db[i] && db_undo[i] == 0) {
1818 free(ip, ip->i_db[i], fs->fs_bsize, 0);
1819 ip->i_db[i] = 0;
1820 }
1821 }
1822
1823 /* Undo the indirect blocks */
1824 while (ib_undo != NULL) {
1825 undo = ib_undo;
1826 err = bmap_set_bn(vp, undo->offset, 0);
1827 if (err)
1828 cmn_err(CE_PANIC, "ufs_allocsp(): failed to "
1829 "undo allocation of block %ld",
1830 undo->offset);
1831 free(ip, undo->blk, fs->fs_bsize, I_IBLK);
1832 ib_undo = undo->next;
1833 kmem_free(undo, sizeof (struct allocsp_undo));
1834 }
1835
1836 ip->i_size = osz;
1837 TRANS_INODE(ufsvfsp, ip);
1838
1839 rw_exit(&ip->i_contents);
1840 rw_exit(&ufsvfsp->vfs_dqrwlock);
1841
1842 TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv);
1843
1844 rw_exit(&ip->i_rwlock);
1845 return (berr);
1846 }
1847
1848 /*
1849 * Don't forget to free the undo chain :)
1850 */
1851 while (ib_undo != NULL) {
1852 undo = ib_undo;
1853 ib_undo = undo->next;
1854 kmem_free(undo, sizeof (struct allocsp_undo));
1855 }
1856
1857 rw_exit(&ip->i_rwlock);
1858
1859 out_allocsp:
1860 return (err);
1861 }
1862
1863 /*
1864 * Free storage space associated with the specified inode. The portion
1865 * to be freed is specified by lp->l_start and lp->l_len (already
1866 * normalized to a "whence" of 0).
1867 *
1868 * This is an experimental facility whose continued existence is not
1869 * guaranteed. Currently, we only support the special case
1870 * of l_len == 0, meaning free to end of file.
1871 *
1872 * Blocks are freed in reverse order. This FILO algorithm will tend to
1873 * maintain a contiguous free list much longer than FIFO.
1874 * See also ufs_itrunc() in ufs_inode.c.
1875 *
1876 * Bug: unused bytes in the last retained block are not cleared.
1877 * This may result in a "hole" in the file that does not read as zeroes.
1878 */
1879 /* ARGSUSED */
1880 int
ufs_freesp(struct vnode * vp,struct flock64 * lp,int flag,cred_t * cr)1881 ufs_freesp(struct vnode *vp, struct flock64 *lp, int flag, cred_t *cr)
1882 {
1883 int i;
1884 struct inode *ip = VTOI(vp);
1885 int error;
1886
1887 ASSERT(vp->v_type == VREG);
1888 ASSERT(lp->l_start >= 0); /* checked by convoff */
1889
1890 if (lp->l_len != 0)
1891 return (EINVAL);
1892
1893 rw_enter(&ip->i_contents, RW_READER);
1894 if (ip->i_size == (u_offset_t)lp->l_start) {
1895 rw_exit(&ip->i_contents);
1896 return (0);
1897 }
1898
1899 /*
1900 * Check if there is any active mandatory lock on the
1901 * range that will be truncated/expanded.
1902 */
1903 if (MANDLOCK(vp, ip->i_mode)) {
1904 offset_t save_start;
1905
1906 save_start = lp->l_start;
1907
1908 if (ip->i_size < lp->l_start) {
1909 /*
1910 * "Truncate up" case: need to make sure there
1911 * is no lock beyond current end-of-file. To
1912 * do so, we need to set l_start to the size
1913 * of the file temporarily.
1914 */
1915 lp->l_start = ip->i_size;
1916 }
1917 lp->l_type = F_WRLCK;
1918 lp->l_sysid = 0;
1919 lp->l_pid = ttoproc(curthread)->p_pid;
1920 i = (flag & (FNDELAY|FNONBLOCK)) ? 0 : SLPFLCK;
1921 rw_exit(&ip->i_contents);
1922 if ((i = reclock(vp, lp, i, 0, lp->l_start, NULL)) != 0 ||
1923 lp->l_type != F_UNLCK) {
1924 return (i ? i : EAGAIN);
1925 }
1926 rw_enter(&ip->i_contents, RW_READER);
1927
1928 lp->l_start = save_start;
1929 }
1930
1931 /*
1932 * Make sure a write isn't in progress (allocating blocks)
1933 * by acquiring i_rwlock (we promised ufs_bmap we wouldn't
1934 * truncate while it was allocating blocks).
1935 * Grab the locks in the right order.
1936 */
1937 rw_exit(&ip->i_contents);
1938 rw_enter(&ip->i_rwlock, RW_WRITER);
1939 error = TRANS_ITRUNC(ip, (u_offset_t)lp->l_start, 0, cr);
1940 rw_exit(&ip->i_rwlock);
1941 return (error);
1942 }
1943
1944 /*
1945 * Find a cg with as close to nb contiguous bytes as possible
1946 * THIS MAY TAKE MANY DISK READS!
1947 *
1948 * Implemented in an attempt to allocate contiguous blocks for
1949 * writing the ufs log file to, minimizing future disk head seeking
1950 */
1951 daddr_t
contigpref(ufsvfs_t * ufsvfsp,size_t nb,size_t minb)1952 contigpref(ufsvfs_t *ufsvfsp, size_t nb, size_t minb)
1953 {
1954 struct fs *fs = ufsvfsp->vfs_fs;
1955 daddr_t nblk = lblkno(fs, blkroundup(fs, nb));
1956 daddr_t minblk = lblkno(fs, blkroundup(fs, minb));
1957 daddr_t savebno, curbno, cgbno;
1958 int cg, cgblks, savecg, savenblk, curnblk, startcg;
1959 uchar_t *blksfree;
1960 buf_t *bp;
1961 struct cg *cgp;
1962
1963 savenblk = 0;
1964 savecg = 0;
1965 savebno = 0;
1966
1967 if ((startcg = findlogstartcg(fs, nblk, minblk)) == -1)
1968 cg = 0; /* Nothing suitable found */
1969 else
1970 cg = startcg;
1971
1972 for (; cg < fs->fs_ncg; ++cg) {
1973 /*
1974 * find the largest contiguous range in this cg
1975 */
1976 bp = UFS_BREAD(ufsvfsp, ufsvfsp->vfs_dev,
1977 (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
1978 (int)fs->fs_cgsize);
1979 cgp = bp->b_un.b_cg;
1980 if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
1981 brelse(bp);
1982 continue;
1983 }
1984 blksfree = cg_blksfree(cgp); /* free array */
1985 cgblks = fragstoblks(fs, fs->fs_fpg); /* blks in free array */
1986 cgbno = 0;
1987 while (cgbno < cgblks && savenblk < nblk) {
1988 /* find a free block */
1989 for (; cgbno < cgblks; ++cgbno) {
1990 if (isblock(fs, blksfree, cgbno)) {
1991 if (startcg != -1) {
1992 brelse(bp);
1993 savecg = startcg;
1994 savebno = cgbno;
1995 goto done;
1996 } else
1997 break;
1998 }
1999 }
2000 curbno = cgbno;
2001 /* count the number of free blocks */
2002 for (curnblk = 0; cgbno < cgblks; ++cgbno) {
2003 if (!isblock(fs, blksfree, cgbno))
2004 break;
2005 if (++curnblk >= nblk)
2006 break;
2007 }
2008 if (curnblk > savenblk) {
2009 savecg = cg;
2010 savenblk = curnblk;
2011 savebno = curbno;
2012 }
2013 }
2014 brelse(bp);
2015 if (savenblk >= nblk)
2016 break;
2017 }
2018
2019 done:
2020
2021 /* convert block offset in cg to frag offset in cg */
2022 savebno = blkstofrags(fs, savebno);
2023
2024 /* convert frag offset in cg to frag offset in fs */
2025 savebno += (savecg * fs->fs_fpg);
2026
2027 return (savebno);
2028 }
2029
2030 /*
2031 * The object of this routine is to find a start point for the UFS log.
2032 * Ideally the space should be allocated from the smallest possible number
2033 * of contiguous cylinder groups. This is found by using a sliding window
2034 * technique. The smallest window of contiguous cylinder groups, which is
2035 * still able to accommodate the target, is found by moving the window
2036 * through the cylinder groups in a single pass. The end of the window is
2037 * advanced until the space is accommodated, then the start is advanced until
2038 * it no longer fits, the end is then advanced again and so on until the
2039 * final cylinder group is reached. The first suitable instance is recorded
2040 * and its starting cg number is returned.
2041 *
2042 * If we are not able to find a minimum amount of space, represented by
2043 * minblk, or to do so uses more than the available extents, then return -1.
2044 */
2045
2046 int
findlogstartcg(struct fs * fs,daddr_t requested,daddr_t minblk)2047 findlogstartcg(struct fs *fs, daddr_t requested, daddr_t minblk)
2048 {
2049 int ncgs; /* number of cylinder groups */
2050 daddr_t target; /* amount of space sought */
2051 int cwidth, ctotal; /* current window width and total */
2052 int bwidth, btotal; /* best window width and total so far */
2053 int s; /* index of the first element in the current window */
2054 int e; /* index of the first element + the width */
2055 /* (i.e. 1 + index of last element) */
2056 int bs; /* index of the first element in the best window so far */
2057 int header, max_extents;
2058
2059 target = requested;
2060 ncgs = fs->fs_ncg;
2061
2062 header = sizeof (extent_block_t) - sizeof (extent_t);
2063 max_extents = ((fs->fs_bsize)-header) / sizeof (extent_t);
2064 cwidth = ctotal = 0;
2065 btotal = -1;
2066 bwidth = ncgs;
2067 s = e = 0;
2068 while (e < ncgs) {
2069 /* Advance the end of the window until it accommodates the target. */
2070 while (ctotal < target && e < ncgs) {
2071 ctotal += fs->fs_cs(fs, e).cs_nbfree;
2072 e++;
2073 }
2074
2075 /*
2076 * Advance the start of the window until it no longer
2077 * accommodates the target.
2078 */
2079 while (ctotal >= target && s < e) {
2080 /* See if this is the smallest window so far. */
2081 cwidth = e - s;
2082 if (cwidth <= bwidth) {
2083 if (cwidth == bwidth && ctotal <= btotal)
2084 goto more;
2085 bwidth = cwidth;
2086 btotal = ctotal;
2087 bs = s;
2088 }
2089 more:
2090 ctotal -= fs->fs_cs(fs, s).cs_nbfree;
2091 s++;
2092 }
2093 }
2094
2095 /*
2096 * If we cannot allocate the minimum required or we use too many
2097 * extents to do so, return -1.
2098 */
2099 if (btotal < minblk || bwidth > max_extents)
2100 bs = -1;
2101
2102 return (bs);
2103 }
2104