1 /* 2 * Copyright (c) 1982, 1986, 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 4. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 * @(#)ffs_balloc.c 8.8 (Berkeley) 6/16/95 30 * $FreeBSD: src/sys/ufs/ffs/ffs_balloc.c,v 1.26.2.1 2002/10/10 19:48:20 dillon Exp $ 31 * $DragonFly: src/sys/vfs/ufs/ffs_balloc.c,v 1.19 2008/05/21 18:49:49 dillon Exp $ 32 */ 33 34 #include <sys/param.h> 35 #include <sys/systm.h> 36 #include <sys/proc.h> 37 #include <sys/buf.h> 38 #include <sys/lock.h> 39 #include <sys/mount.h> 40 #include <sys/vnode.h> 41 42 #include <sys/buf2.h> 43 44 #include "quota.h" 45 #include "inode.h" 46 #include "ufs_extern.h" 47 48 #include "fs.h" 49 #include "ffs_extern.h" 50 51 /* 52 * ffs_balloc(struct vnode *a_vp, ufs_daddr_t a_lbn, int a_size, 53 * struct ucred *a_cred, int a_flags, struct buf *a_bpp) 54 * 55 * Balloc defines the structure of filesystem storage by allocating 56 * the physical blocks on a device given the inode and the logical 57 * block number in a file. 58 * 59 * NOTE: B_CLRBUF - this flag tells balloc to clear invalid portions 60 * of the buffer. However, any dirty bits will override missing 61 * valid bits. This case occurs when writable mmaps are truncated 62 * and then extended. 63 */ 64 int 65 ffs_balloc(struct vop_balloc_args *ap) 66 { 67 struct inode *ip; 68 ufs_daddr_t lbn; 69 int size; 70 struct ucred *cred; 71 int flags; 72 struct fs *fs; 73 ufs_daddr_t nb; 74 struct buf *bp, *nbp, *dbp; 75 struct vnode *vp; 76 struct indir indirs[NIADDR + 2]; 77 ufs_daddr_t newb, *bap, pref; 78 int deallocated, osize, nsize, num, i, error; 79 ufs_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1]; 80 ufs_daddr_t *lbns_remfree, lbns[NIADDR + 1]; 81 int unwindidx; 82 int seqcount; 83 84 vp = ap->a_vp; 85 ip = VTOI(vp); 86 fs = ip->i_fs; 87 lbn = lblkno(fs, ap->a_startoffset); 88 size = blkoff(fs, ap->a_startoffset) + ap->a_size; 89 if (size > fs->fs_bsize) 90 panic("ffs_balloc: blk too big"); 91 *ap->a_bpp = NULL; 92 if (lbn < 0) 93 return (EFBIG); 94 cred = ap->a_cred; 95 flags = ap->a_flags; 96 97 /* 98 * The vnode must be locked for us to be able to safely mess 99 * around with the inode. 100 */ 101 if (vn_islocked(vp) != LK_EXCLUSIVE) { 102 panic("ffs_balloc: vnode %p not exclusively locked!", vp); 103 } 104 105 /* 106 * If the next write will extend the file into a new block, 107 * and the file is currently composed of a fragment 108 * this fragment has to be extended to be a full block. 109 */ 110 nb = lblkno(fs, ip->i_size); 111 if (nb < NDADDR && nb < lbn) { 112 /* 113 * The filesize prior to this write can fit in direct 114 * blocks (ex. fragmentation is possibly done) 115 * we are now extending the file write beyond 116 * the block which has end of the file prior to this write. 117 */ 118 osize = blksize(fs, ip, nb); 119 /* 120 * osize gives disk allocated size in the last block. It is 121 * either in fragments or a file system block size. 122 */ 123 if (osize < fs->fs_bsize && osize > 0) { 124 /* A few fragments are already allocated, since the 125 * current extends beyond this block allocated the 126 * complete block as fragments are on in last block. 127 */ 128 error = ffs_realloccg(ip, nb, 129 ffs_blkpref(ip, nb, (int)nb, &ip->i_db[0]), 130 osize, (int)fs->fs_bsize, cred, &bp); 131 if (error) 132 return (error); 133 if (DOINGSOFTDEP(vp)) 134 softdep_setup_allocdirect(ip, nb, 135 dofftofsb(fs, bp->b_bio2.bio_offset), 136 ip->i_db[nb], fs->fs_bsize, osize, bp); 137 /* adjust the inode size, we just grew */ 138 ip->i_size = smalllblktosize(fs, nb + 1); 139 ip->i_db[nb] = dofftofsb(fs, bp->b_bio2.bio_offset); 140 ip->i_flag |= IN_CHANGE | IN_UPDATE; 141 if (flags & B_SYNC) 142 bwrite(bp); 143 else 144 bawrite(bp); 145 /* bp is already released here */ 146 } 147 } 148 /* 149 * The first NDADDR blocks are direct blocks 150 */ 151 if (lbn < NDADDR) { 152 nb = ip->i_db[lbn]; 153 if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) { 154 error = bread(vp, lblktodoff(fs, lbn), fs->fs_bsize, &bp); 155 if (error) { 156 brelse(bp); 157 return (error); 158 } 159 bp->b_bio2.bio_offset = fsbtodoff(fs, nb); 160 *ap->a_bpp = bp; 161 return (0); 162 } 163 if (nb != 0) { 164 /* 165 * Consider need to reallocate a fragment. 166 */ 167 osize = fragroundup(fs, blkoff(fs, ip->i_size)); 168 nsize = fragroundup(fs, size); 169 if (nsize <= osize) { 170 error = bread(vp, lblktodoff(fs, lbn), 171 osize, &bp); 172 if (error) { 173 brelse(bp); 174 return (error); 175 } 176 bp->b_bio2.bio_offset = fsbtodoff(fs, nb); 177 } else { 178 /* 179 * NOTE: ffs_realloccg() issues a bread(). 180 */ 181 error = ffs_realloccg(ip, lbn, 182 ffs_blkpref(ip, lbn, (int)lbn, 183 &ip->i_db[0]), osize, nsize, cred, &bp); 184 if (error) 185 return (error); 186 if (DOINGSOFTDEP(vp)) 187 softdep_setup_allocdirect(ip, lbn, 188 dofftofsb(fs, bp->b_bio2.bio_offset), 189 nb, nsize, osize, bp); 190 } 191 } else { 192 if (ip->i_size < smalllblktosize(fs, lbn + 1)) 193 nsize = fragroundup(fs, size); 194 else 195 nsize = fs->fs_bsize; 196 error = ffs_alloc(ip, lbn, 197 ffs_blkpref(ip, lbn, (int)lbn, &ip->i_db[0]), 198 nsize, cred, &newb); 199 if (error) 200 return (error); 201 bp = getblk(vp, lblktodoff(fs, lbn), nsize, 0, 0); 202 bp->b_bio2.bio_offset = fsbtodoff(fs, newb); 203 if (flags & B_CLRBUF) 204 vfs_bio_clrbuf(bp); 205 if (DOINGSOFTDEP(vp)) 206 softdep_setup_allocdirect(ip, lbn, newb, 0, 207 nsize, 0, bp); 208 } 209 ip->i_db[lbn] = dofftofsb(fs, bp->b_bio2.bio_offset); 210 ip->i_flag |= IN_CHANGE | IN_UPDATE; 211 *ap->a_bpp = bp; 212 return (0); 213 } 214 /* 215 * Determine the number of levels of indirection. 216 */ 217 pref = 0; 218 if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0) 219 return(error); 220 #ifdef DIAGNOSTIC 221 if (num < 1) 222 panic ("ffs_balloc: ufs_bmaparray returned indirect block"); 223 #endif 224 /* 225 * Get a handle on the data block buffer before working through 226 * indirect blocks to avoid a deadlock between the VM system holding 227 * a locked VM page and issuing a BMAP (which tries to lock the 228 * indirect blocks), and the filesystem holding a locked indirect 229 * block and then trying to read a data block (which tries to lock 230 * the underlying VM pages). 231 */ 232 dbp = getblk(vp, lblktodoff(fs, lbn), fs->fs_bsize, 0, 0); 233 234 /* 235 * Setup undo history 236 */ 237 allocib = NULL; 238 allocblk = allociblk; 239 lbns_remfree = lbns; 240 241 unwindidx = -1; 242 243 /* 244 * Fetch the first indirect block directly from the inode, allocating 245 * one if necessary. 246 */ 247 --num; 248 nb = ip->i_ib[indirs[0].in_off]; 249 if (nb == 0) { 250 pref = ffs_blkpref(ip, lbn, 0, NULL); 251 /* 252 * If the filesystem has run out of space we can skip the 253 * full fsync/undo of the main [fail] case since no undo 254 * history has been built yet. Hence the goto fail2. 255 */ 256 if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, 257 cred, &newb)) != 0) 258 goto fail2; 259 nb = newb; 260 *allocblk++ = nb; 261 *lbns_remfree++ = indirs[1].in_lbn; 262 bp = getblk(vp, lblktodoff(fs, indirs[1].in_lbn), 263 fs->fs_bsize, 0, 0); 264 bp->b_bio2.bio_offset = fsbtodoff(fs, nb); 265 vfs_bio_clrbuf(bp); 266 if (DOINGSOFTDEP(vp)) { 267 softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off, 268 newb, 0, fs->fs_bsize, 0, bp); 269 bdwrite(bp); 270 } else { 271 /* 272 * Write synchronously so that indirect blocks 273 * never point at garbage. 274 */ 275 if (DOINGASYNC(vp)) 276 bdwrite(bp); 277 else if ((error = bwrite(bp)) != 0) 278 goto fail; 279 } 280 allocib = &ip->i_ib[indirs[0].in_off]; 281 *allocib = nb; 282 ip->i_flag |= IN_CHANGE | IN_UPDATE; 283 } 284 285 /* 286 * Fetch through the indirect blocks, allocating as necessary. 287 */ 288 for (i = 1;;) { 289 error = bread(vp, lblktodoff(fs, indirs[i].in_lbn), (int)fs->fs_bsize, &bp); 290 if (error) { 291 brelse(bp); 292 goto fail; 293 } 294 bap = (ufs_daddr_t *)bp->b_data; 295 nb = bap[indirs[i].in_off]; 296 if (i == num) 297 break; 298 i += 1; 299 if (nb != 0) { 300 bqrelse(bp); 301 continue; 302 } 303 if (pref == 0) 304 pref = ffs_blkpref(ip, lbn, 0, NULL); 305 if ((error = 306 ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred, &newb)) != 0) { 307 brelse(bp); 308 goto fail; 309 } 310 nb = newb; 311 *allocblk++ = nb; 312 *lbns_remfree++ = indirs[i].in_lbn; 313 nbp = getblk(vp, lblktodoff(fs, indirs[i].in_lbn), 314 fs->fs_bsize, 0, 0); 315 nbp->b_bio2.bio_offset = fsbtodoff(fs, nb); 316 vfs_bio_clrbuf(nbp); 317 if (DOINGSOFTDEP(vp)) { 318 softdep_setup_allocindir_meta(nbp, ip, bp, 319 indirs[i - 1].in_off, nb); 320 bdwrite(nbp); 321 } else { 322 /* 323 * Write synchronously so that indirect blocks 324 * never point at garbage. 325 */ 326 if ((error = bwrite(nbp)) != 0) { 327 brelse(bp); 328 goto fail; 329 } 330 } 331 bap[indirs[i - 1].in_off] = nb; 332 if (allocib == NULL && unwindidx < 0) 333 unwindidx = i - 1; 334 /* 335 * If required, write synchronously, otherwise use 336 * delayed write. 337 */ 338 if (flags & B_SYNC) { 339 bwrite(bp); 340 } else { 341 if (bp->b_bufsize == fs->fs_bsize) 342 bp->b_flags |= B_CLUSTEROK; 343 bdwrite(bp); 344 } 345 } 346 347 /* 348 * Get the data block, allocating if necessary. We have already 349 * called getblk() on the data block buffer, dbp. If we have to 350 * allocate it and B_CLRBUF has been set the inference is an intention 351 * to zero out the related disk blocks, so we do not have to issue 352 * a read. Instead we simply call vfs_bio_clrbuf(). If B_CLRBUF is 353 * not set the caller intends to overwrite the entire contents of the 354 * buffer and we don't waste time trying to clean up the contents. 355 * 356 * bp references the current indirect block. When allocating, 357 * the block must be updated. 358 */ 359 if (nb == 0) { 360 pref = ffs_blkpref(ip, lbn, indirs[i].in_off, &bap[0]); 361 error = ffs_alloc(ip, 362 lbn, pref, (int)fs->fs_bsize, cred, &newb); 363 if (error) { 364 brelse(bp); 365 goto fail; 366 } 367 nb = newb; 368 *allocblk++ = nb; 369 *lbns_remfree++ = lbn; 370 dbp->b_bio2.bio_offset = fsbtodoff(fs, nb); 371 if (flags & B_CLRBUF) 372 vfs_bio_clrbuf(dbp); 373 if (DOINGSOFTDEP(vp)) 374 softdep_setup_allocindir_page(ip, lbn, bp, 375 indirs[i].in_off, nb, 0, dbp); 376 bap[indirs[i].in_off] = nb; 377 /* 378 * If required, write synchronously, otherwise use 379 * delayed write. 380 */ 381 if (flags & B_SYNC) { 382 bwrite(bp); 383 } else { 384 if (bp->b_bufsize == fs->fs_bsize) 385 bp->b_flags |= B_CLUSTEROK; 386 bdwrite(bp); 387 } 388 *ap->a_bpp = dbp; 389 return (0); 390 } 391 brelse(bp); 392 393 /* 394 * At this point all related indirect blocks have been allocated 395 * if necessary and released. bp is no longer valid. dbp holds 396 * our getblk()'d data block. 397 * 398 * XXX we previously performed a cluster_read operation here. 399 */ 400 if (flags & B_CLRBUF) { 401 /* 402 * If B_CLRBUF is set we must validate the invalid portions 403 * of the buffer. This typically requires a read-before- 404 * write. The strategy call will fill in bio_offset in that 405 * case. 406 * 407 * If we hit this case we do a cluster read if possible 408 * since nearby data blocks are likely to be accessed soon 409 * too. 410 */ 411 if ((dbp->b_flags & B_CACHE) == 0) { 412 bqrelse(dbp); 413 seqcount = (flags & B_SEQMASK) >> B_SEQSHIFT; 414 if (seqcount && 415 (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { 416 error = cluster_read(vp, (off_t)ip->i_size, 417 lblktodoff(fs, lbn), 418 (int)fs->fs_bsize, 419 fs->fs_bsize, 420 seqcount * BKVASIZE, 421 &dbp); 422 } else { 423 error = bread(vp, lblktodoff(fs, lbn), 424 (int)fs->fs_bsize, &dbp); 425 } 426 if (error) 427 goto fail; 428 } else { 429 dbp->b_bio2.bio_offset = fsbtodoff(fs, nb); 430 } 431 } else { 432 /* 433 * If B_CLRBUF is not set the caller intends to overwrite 434 * the entire contents of the buffer. We can simply set 435 * bio_offset and we are done. 436 */ 437 dbp->b_bio2.bio_offset = fsbtodoff(fs, nb); 438 } 439 *ap->a_bpp = dbp; 440 return (0); 441 fail: 442 /* 443 * If we have failed part way through block allocation, we 444 * have to deallocate any indirect blocks that we have allocated. 445 * We have to fsync the file before we start to get rid of all 446 * of its dependencies so that we do not leave them dangling. 447 * We have to sync it at the end so that the soft updates code 448 * does not find any untracked changes. Although this is really 449 * slow, running out of disk space is not expected to be a common 450 * occurence. The error return from fsync is ignored as we already 451 * have an error to return to the user. 452 */ 453 VOP_FSYNC(vp, MNT_WAIT, 0); 454 for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns; 455 blkp < allocblk; blkp++, lbns_remfree++) { 456 /* 457 * We shall not leave the freed blocks on the vnode 458 * buffer object lists. 459 */ 460 bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0); 461 bp->b_flags |= (B_INVAL | B_RELBUF); 462 brelse(bp); 463 deallocated += fs->fs_bsize; 464 } 465 466 if (allocib != NULL) { 467 *allocib = 0; 468 } else if (unwindidx >= 0) { 469 int r; 470 471 r = bread(vp, lblktodoff(fs, indirs[unwindidx].in_lbn), (int)fs->fs_bsize, &bp); 472 if (r) { 473 panic("Could not unwind indirect block, error %d", r); 474 brelse(bp); 475 } else { 476 bap = (ufs_daddr_t *)bp->b_data; 477 bap[indirs[unwindidx].in_off] = 0; 478 if (flags & B_SYNC) { 479 bwrite(bp); 480 } else { 481 if (bp->b_bufsize == fs->fs_bsize) 482 bp->b_flags |= B_CLUSTEROK; 483 bdwrite(bp); 484 } 485 } 486 } 487 if (deallocated) { 488 #ifdef QUOTA 489 /* 490 * Restore user's disk quota because allocation failed. 491 */ 492 (void) ufs_chkdq(ip, (long)-btodb(deallocated), cred, FORCE); 493 #endif 494 ip->i_blocks -= btodb(deallocated); 495 ip->i_flag |= IN_CHANGE | IN_UPDATE; 496 } 497 VOP_FSYNC(vp, MNT_WAIT, 0); 498 499 /* 500 * After the buffers are invalidated and on-disk pointers are 501 * cleared, free the blocks. 502 */ 503 for (blkp = allociblk; blkp < allocblk; blkp++) { 504 ffs_blkfree(ip, *blkp, fs->fs_bsize); 505 } 506 507 /* 508 * Cleanup the data block we getblk()'d before returning. 509 */ 510 fail2: 511 brelse(dbp); 512 return (error); 513 } 514 515