1 /* 2 * Copyright (c) 1982, 1986, 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. All advertising materials mentioning features or use of this software 14 * must display the following acknowledgement: 15 * This product includes software developed by the University of 16 * California, Berkeley and its contributors. 17 * 4. Neither the name of the University nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 * 33 * @(#)ffs_balloc.c 8.8 (Berkeley) 6/16/95 34 * $FreeBSD: src/sys/ufs/ffs/ffs_balloc.c,v 1.26.2.1 2002/10/10 19:48:20 dillon Exp $ 35 * $DragonFly: src/sys/vfs/ufs/ffs_balloc.c,v 1.19 2008/05/21 18:49:49 dillon Exp $ 36 */ 37 38 #include <sys/param.h> 39 #include <sys/systm.h> 40 #include <sys/proc.h> 41 #include <sys/buf.h> 42 #include <sys/lock.h> 43 #include <sys/mount.h> 44 #include <sys/vnode.h> 45 46 #include <sys/buf2.h> 47 48 #include "quota.h" 49 #include "inode.h" 50 #include "ufs_extern.h" 51 52 #include "fs.h" 53 #include "ffs_extern.h" 54 55 /* 56 * ffs_balloc(struct vnode *a_vp, ufs_daddr_t a_lbn, int a_size, 57 * struct ucred *a_cred, int a_flags, struct buf *a_bpp) 58 * 59 * Balloc defines the structure of filesystem storage by allocating 60 * the physical blocks on a device given the inode and the logical 61 * block number in a file. 62 * 63 * NOTE: B_CLRBUF - this flag tells balloc to clear invalid portions 64 * of the buffer. However, any dirty bits will override missing 65 * valid bits. This case occurs when writable mmaps are truncated 66 * and then extended. 67 */ 68 int 69 ffs_balloc(struct vop_balloc_args *ap) 70 { 71 struct inode *ip; 72 ufs_daddr_t lbn; 73 int size; 74 struct ucred *cred; 75 int flags; 76 struct fs *fs; 77 ufs_daddr_t nb; 78 struct buf *bp, *nbp, *dbp; 79 struct vnode *vp; 80 struct indir indirs[NIADDR + 2]; 81 ufs_daddr_t newb, *bap, pref; 82 int deallocated, osize, nsize, num, i, error; 83 ufs_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1]; 84 ufs_daddr_t *lbns_remfree, lbns[NIADDR + 1]; 85 int unwindidx; 86 int seqcount; 87 88 vp = ap->a_vp; 89 ip = VTOI(vp); 90 fs = ip->i_fs; 91 lbn = lblkno(fs, ap->a_startoffset); 92 size = blkoff(fs, ap->a_startoffset) + ap->a_size; 93 if (size > fs->fs_bsize) 94 panic("ffs_balloc: blk too big"); 95 *ap->a_bpp = NULL; 96 if (lbn < 0) 97 return (EFBIG); 98 cred = ap->a_cred; 99 flags = ap->a_flags; 100 101 /* 102 * The vnode must be locked for us to be able to safely mess 103 * around with the inode. 104 */ 105 if (vn_islocked(vp) != LK_EXCLUSIVE) { 106 panic("ffs_balloc: vnode %p not exclusively locked!", vp); 107 } 108 109 /* 110 * If the next write will extend the file into a new block, 111 * and the file is currently composed of a fragment 112 * this fragment has to be extended to be a full block. 113 */ 114 nb = lblkno(fs, ip->i_size); 115 if (nb < NDADDR && nb < lbn) { 116 /* 117 * The filesize prior to this write can fit in direct 118 * blocks (ex. fragmentation is possibly done) 119 * we are now extending the file write beyond 120 * the block which has end of the file prior to this write. 121 */ 122 osize = blksize(fs, ip, nb); 123 /* 124 * osize gives disk allocated size in the last block. It is 125 * either in fragments or a file system block size. 126 */ 127 if (osize < fs->fs_bsize && osize > 0) { 128 /* A few fragments are already allocated, since the 129 * current extends beyond this block allocated the 130 * complete block as fragments are on in last block. 131 */ 132 error = ffs_realloccg(ip, nb, 133 ffs_blkpref(ip, nb, (int)nb, &ip->i_db[0]), 134 osize, (int)fs->fs_bsize, cred, &bp); 135 if (error) 136 return (error); 137 if (DOINGSOFTDEP(vp)) 138 softdep_setup_allocdirect(ip, nb, 139 dofftofsb(fs, bp->b_bio2.bio_offset), 140 ip->i_db[nb], fs->fs_bsize, osize, bp); 141 /* adjust the inode size, we just grew */ 142 ip->i_size = smalllblktosize(fs, nb + 1); 143 ip->i_db[nb] = dofftofsb(fs, bp->b_bio2.bio_offset); 144 ip->i_flag |= IN_CHANGE | IN_UPDATE; 145 if (flags & B_SYNC) 146 bwrite(bp); 147 else 148 bawrite(bp); 149 /* bp is already released here */ 150 } 151 } 152 /* 153 * The first NDADDR blocks are direct blocks 154 */ 155 if (lbn < NDADDR) { 156 nb = ip->i_db[lbn]; 157 if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) { 158 error = bread(vp, lblktodoff(fs, lbn), fs->fs_bsize, &bp); 159 if (error) { 160 brelse(bp); 161 return (error); 162 } 163 bp->b_bio2.bio_offset = fsbtodoff(fs, nb); 164 *ap->a_bpp = bp; 165 return (0); 166 } 167 if (nb != 0) { 168 /* 169 * Consider need to reallocate a fragment. 170 */ 171 osize = fragroundup(fs, blkoff(fs, ip->i_size)); 172 nsize = fragroundup(fs, size); 173 if (nsize <= osize) { 174 error = bread(vp, lblktodoff(fs, lbn), 175 osize, &bp); 176 if (error) { 177 brelse(bp); 178 return (error); 179 } 180 bp->b_bio2.bio_offset = fsbtodoff(fs, nb); 181 } else { 182 /* 183 * NOTE: ffs_realloccg() issues a bread(). 184 */ 185 error = ffs_realloccg(ip, lbn, 186 ffs_blkpref(ip, lbn, (int)lbn, 187 &ip->i_db[0]), osize, nsize, cred, &bp); 188 if (error) 189 return (error); 190 if (DOINGSOFTDEP(vp)) 191 softdep_setup_allocdirect(ip, lbn, 192 dofftofsb(fs, bp->b_bio2.bio_offset), 193 nb, nsize, osize, bp); 194 } 195 } else { 196 if (ip->i_size < smalllblktosize(fs, lbn + 1)) 197 nsize = fragroundup(fs, size); 198 else 199 nsize = fs->fs_bsize; 200 error = ffs_alloc(ip, lbn, 201 ffs_blkpref(ip, lbn, (int)lbn, &ip->i_db[0]), 202 nsize, cred, &newb); 203 if (error) 204 return (error); 205 bp = getblk(vp, lblktodoff(fs, lbn), nsize, 0, 0); 206 bp->b_bio2.bio_offset = fsbtodoff(fs, newb); 207 if (flags & B_CLRBUF) 208 vfs_bio_clrbuf(bp); 209 if (DOINGSOFTDEP(vp)) 210 softdep_setup_allocdirect(ip, lbn, newb, 0, 211 nsize, 0, bp); 212 } 213 ip->i_db[lbn] = dofftofsb(fs, bp->b_bio2.bio_offset); 214 ip->i_flag |= IN_CHANGE | IN_UPDATE; 215 *ap->a_bpp = bp; 216 return (0); 217 } 218 /* 219 * Determine the number of levels of indirection. 220 */ 221 pref = 0; 222 if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0) 223 return(error); 224 #ifdef DIAGNOSTIC 225 if (num < 1) 226 panic ("ffs_balloc: ufs_bmaparray returned indirect block"); 227 #endif 228 /* 229 * Get a handle on the data block buffer before working through 230 * indirect blocks to avoid a deadlock between the VM system holding 231 * a locked VM page and issuing a BMAP (which tries to lock the 232 * indirect blocks), and the filesystem holding a locked indirect 233 * block and then trying to read a data block (which tries to lock 234 * the underlying VM pages). 235 */ 236 dbp = getblk(vp, lblktodoff(fs, lbn), fs->fs_bsize, 0, 0); 237 238 /* 239 * Setup undo history 240 */ 241 allocib = NULL; 242 allocblk = allociblk; 243 lbns_remfree = lbns; 244 245 unwindidx = -1; 246 247 /* 248 * Fetch the first indirect block directly from the inode, allocating 249 * one if necessary. 250 */ 251 --num; 252 nb = ip->i_ib[indirs[0].in_off]; 253 if (nb == 0) { 254 pref = ffs_blkpref(ip, lbn, 0, NULL); 255 /* 256 * If the filesystem has run out of space we can skip the 257 * full fsync/undo of the main [fail] case since no undo 258 * history has been built yet. Hence the goto fail2. 259 */ 260 if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, 261 cred, &newb)) != 0) 262 goto fail2; 263 nb = newb; 264 *allocblk++ = nb; 265 *lbns_remfree++ = indirs[1].in_lbn; 266 bp = getblk(vp, lblktodoff(fs, indirs[1].in_lbn), 267 fs->fs_bsize, 0, 0); 268 bp->b_bio2.bio_offset = fsbtodoff(fs, nb); 269 vfs_bio_clrbuf(bp); 270 if (DOINGSOFTDEP(vp)) { 271 softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off, 272 newb, 0, fs->fs_bsize, 0, bp); 273 bdwrite(bp); 274 } else { 275 /* 276 * Write synchronously so that indirect blocks 277 * never point at garbage. 278 */ 279 if (DOINGASYNC(vp)) 280 bdwrite(bp); 281 else if ((error = bwrite(bp)) != 0) 282 goto fail; 283 } 284 allocib = &ip->i_ib[indirs[0].in_off]; 285 *allocib = nb; 286 ip->i_flag |= IN_CHANGE | IN_UPDATE; 287 } 288 289 /* 290 * Fetch through the indirect blocks, allocating as necessary. 291 */ 292 for (i = 1;;) { 293 error = bread(vp, lblktodoff(fs, indirs[i].in_lbn), (int)fs->fs_bsize, &bp); 294 if (error) { 295 brelse(bp); 296 goto fail; 297 } 298 bap = (ufs_daddr_t *)bp->b_data; 299 nb = bap[indirs[i].in_off]; 300 if (i == num) 301 break; 302 i += 1; 303 if (nb != 0) { 304 bqrelse(bp); 305 continue; 306 } 307 if (pref == 0) 308 pref = ffs_blkpref(ip, lbn, 0, NULL); 309 if ((error = 310 ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred, &newb)) != 0) { 311 brelse(bp); 312 goto fail; 313 } 314 nb = newb; 315 *allocblk++ = nb; 316 *lbns_remfree++ = indirs[i].in_lbn; 317 nbp = getblk(vp, lblktodoff(fs, indirs[i].in_lbn), 318 fs->fs_bsize, 0, 0); 319 nbp->b_bio2.bio_offset = fsbtodoff(fs, nb); 320 vfs_bio_clrbuf(nbp); 321 if (DOINGSOFTDEP(vp)) { 322 softdep_setup_allocindir_meta(nbp, ip, bp, 323 indirs[i - 1].in_off, nb); 324 bdwrite(nbp); 325 } else { 326 /* 327 * Write synchronously so that indirect blocks 328 * never point at garbage. 329 */ 330 if ((error = bwrite(nbp)) != 0) { 331 brelse(bp); 332 goto fail; 333 } 334 } 335 bap[indirs[i - 1].in_off] = nb; 336 if (allocib == NULL && unwindidx < 0) 337 unwindidx = i - 1; 338 /* 339 * If required, write synchronously, otherwise use 340 * delayed write. 341 */ 342 if (flags & B_SYNC) { 343 bwrite(bp); 344 } else { 345 if (bp->b_bufsize == fs->fs_bsize) 346 bp->b_flags |= B_CLUSTEROK; 347 bdwrite(bp); 348 } 349 } 350 351 /* 352 * Get the data block, allocating if necessary. We have already 353 * called getblk() on the data block buffer, dbp. If we have to 354 * allocate it and B_CLRBUF has been set the inference is an intention 355 * to zero out the related disk blocks, so we do not have to issue 356 * a read. Instead we simply call vfs_bio_clrbuf(). If B_CLRBUF is 357 * not set the caller intends to overwrite the entire contents of the 358 * buffer and we don't waste time trying to clean up the contents. 359 * 360 * bp references the current indirect block. When allocating, 361 * the block must be updated. 362 */ 363 if (nb == 0) { 364 pref = ffs_blkpref(ip, lbn, indirs[i].in_off, &bap[0]); 365 error = ffs_alloc(ip, 366 lbn, pref, (int)fs->fs_bsize, cred, &newb); 367 if (error) { 368 brelse(bp); 369 goto fail; 370 } 371 nb = newb; 372 *allocblk++ = nb; 373 *lbns_remfree++ = lbn; 374 dbp->b_bio2.bio_offset = fsbtodoff(fs, nb); 375 if (flags & B_CLRBUF) 376 vfs_bio_clrbuf(dbp); 377 if (DOINGSOFTDEP(vp)) 378 softdep_setup_allocindir_page(ip, lbn, bp, 379 indirs[i].in_off, nb, 0, dbp); 380 bap[indirs[i].in_off] = nb; 381 /* 382 * If required, write synchronously, otherwise use 383 * delayed write. 384 */ 385 if (flags & B_SYNC) { 386 bwrite(bp); 387 } else { 388 if (bp->b_bufsize == fs->fs_bsize) 389 bp->b_flags |= B_CLUSTEROK; 390 bdwrite(bp); 391 } 392 *ap->a_bpp = dbp; 393 return (0); 394 } 395 brelse(bp); 396 397 /* 398 * At this point all related indirect blocks have been allocated 399 * if necessary and released. bp is no longer valid. dbp holds 400 * our getblk()'d data block. 401 * 402 * XXX we previously performed a cluster_read operation here. 403 */ 404 if (flags & B_CLRBUF) { 405 /* 406 * If B_CLRBUF is set we must validate the invalid portions 407 * of the buffer. This typically requires a read-before- 408 * write. The strategy call will fill in bio_offset in that 409 * case. 410 * 411 * If we hit this case we do a cluster read if possible 412 * since nearby data blocks are likely to be accessed soon 413 * too. 414 */ 415 if ((dbp->b_flags & B_CACHE) == 0) { 416 bqrelse(dbp); 417 seqcount = (flags & B_SEQMASK) >> B_SEQSHIFT; 418 if (seqcount && 419 (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { 420 error = cluster_read(vp, (off_t)ip->i_size, 421 lblktodoff(fs, lbn), 422 (int)fs->fs_bsize, 423 fs->fs_bsize, 424 seqcount * BKVASIZE, 425 &dbp); 426 } else { 427 error = bread(vp, lblktodoff(fs, lbn), 428 (int)fs->fs_bsize, &dbp); 429 } 430 if (error) 431 goto fail; 432 } else { 433 dbp->b_bio2.bio_offset = fsbtodoff(fs, nb); 434 } 435 } else { 436 /* 437 * If B_CLRBUF is not set the caller intends to overwrite 438 * the entire contents of the buffer. We can simply set 439 * bio_offset and we are done. 440 */ 441 dbp->b_bio2.bio_offset = fsbtodoff(fs, nb); 442 } 443 *ap->a_bpp = dbp; 444 return (0); 445 fail: 446 /* 447 * If we have failed part way through block allocation, we 448 * have to deallocate any indirect blocks that we have allocated. 449 * We have to fsync the file before we start to get rid of all 450 * of its dependencies so that we do not leave them dangling. 451 * We have to sync it at the end so that the soft updates code 452 * does not find any untracked changes. Although this is really 453 * slow, running out of disk space is not expected to be a common 454 * occurence. The error return from fsync is ignored as we already 455 * have an error to return to the user. 456 */ 457 VOP_FSYNC(vp, MNT_WAIT, 0); 458 for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns; 459 blkp < allocblk; blkp++, lbns_remfree++) { 460 /* 461 * We shall not leave the freed blocks on the vnode 462 * buffer object lists. 463 */ 464 bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0); 465 bp->b_flags |= (B_INVAL | B_RELBUF); 466 brelse(bp); 467 deallocated += fs->fs_bsize; 468 } 469 470 if (allocib != NULL) { 471 *allocib = 0; 472 } else if (unwindidx >= 0) { 473 int r; 474 475 r = bread(vp, lblktodoff(fs, indirs[unwindidx].in_lbn), (int)fs->fs_bsize, &bp); 476 if (r) { 477 panic("Could not unwind indirect block, error %d", r); 478 brelse(bp); 479 } else { 480 bap = (ufs_daddr_t *)bp->b_data; 481 bap[indirs[unwindidx].in_off] = 0; 482 if (flags & B_SYNC) { 483 bwrite(bp); 484 } else { 485 if (bp->b_bufsize == fs->fs_bsize) 486 bp->b_flags |= B_CLUSTEROK; 487 bdwrite(bp); 488 } 489 } 490 } 491 if (deallocated) { 492 #ifdef QUOTA 493 /* 494 * Restore user's disk quota because allocation failed. 495 */ 496 (void) ufs_chkdq(ip, (long)-btodb(deallocated), cred, FORCE); 497 #endif 498 ip->i_blocks -= btodb(deallocated); 499 ip->i_flag |= IN_CHANGE | IN_UPDATE; 500 } 501 VOP_FSYNC(vp, MNT_WAIT, 0); 502 503 /* 504 * After the buffers are invalidated and on-disk pointers are 505 * cleared, free the blocks. 506 */ 507 for (blkp = allociblk; blkp < allocblk; blkp++) { 508 ffs_blkfree(ip, *blkp, fs->fs_bsize); 509 } 510 511 /* 512 * Cleanup the data block we getblk()'d before returning. 513 */ 514 fail2: 515 brelse(dbp); 516 return (error); 517 } 518 519