1 // File system implementation. Five layers: 2 // + Blocks: allocator for raw disk blocks. 3 // + Log: crash recovery for multi-step updates. 4 // + Files: inode allocator, reading, writing, metadata. 5 // + Directories: inode with special contents (list of other inodes!) 6 // + Names: paths like /usr/rtm/xv6/fs.c for convenient naming. 7 // 8 // This file contains the low-level file system manipulation 9 // routines. The (higher-level) system call implementations 10 // are in sysfile.c. 11 12 #include "types.h" 13 #include "defs.h" 14 #include "param.h" 15 #include "stat.h" 16 #include "mmu.h" 17 #include "proc.h" 18 #include "spinlock.h" 19 #include "sleeplock.h" 20 #include "fs.h" 21 #include "buf.h" 22 #include "file.h" 23 24 #define min(a, b) ((a) < (b) ? (a) : (b)) 25 static void itrunc(struct inode*); 26 // there should be one superblock per disk device, but we run with 27 // only one device 28 struct superblock sb; 29 30 // Read the super block. 31 void 32 readsb(int dev, struct superblock *sb) 33 { 34 struct buf *bp; 35 36 bp = bread(dev, 1); 37 memmove(sb, bp->data, sizeof(*sb)); 38 brelse(bp); 39 } 40 41 // Zero a block. 42 static void 43 bzero(int dev, int bno) 44 { 45 struct buf *bp; 46 47 bp = bread(dev, bno); 48 memset(bp->data, 0, BSIZE); 49 log_write(bp); 50 brelse(bp); 51 } 52 53 // Blocks. 54 55 // Allocate a zeroed disk block. 56 static uint 57 balloc(uint dev) 58 { 59 int b, bi, m; 60 struct buf *bp; 61 62 bp = 0; 63 for(b = 0; b < sb.size; b += BPB){ 64 bp = bread(dev, BBLOCK(b, sb)); 65 for(bi = 0; bi < BPB && b + bi < sb.size; bi++){ 66 m = 1 << (bi % 8); 67 if((bp->data[bi/8] & m) == 0){ // Is block free? 68 bp->data[bi/8] |= m; // Mark block in use. 69 log_write(bp); 70 brelse(bp); 71 bzero(dev, b + bi); 72 return b + bi; 73 } 74 } 75 brelse(bp); 76 } 77 panic("balloc: out of blocks"); 78 } 79 80 // Free a disk block. 81 static void 82 bfree(int dev, uint b) 83 { 84 struct buf *bp; 85 int bi, m; 86 87 readsb(dev, &sb); 88 bp = bread(dev, BBLOCK(b, sb)); 89 bi = b % BPB; 90 m = 1 << (bi % 8); 91 if((bp->data[bi/8] & m) == 0) 92 panic("freeing free block"); 93 bp->data[bi/8] &= ~m; 94 log_write(bp); 95 brelse(bp); 96 } 97 98 // Inodes. 99 // 100 // An inode describes a single unnamed file. 101 // The inode disk structure holds metadata: the file's type, 102 // its size, the number of links referring to it, and the 103 // list of blocks holding the file's content. 104 // 105 // The inodes are laid out sequentially on disk at 106 // sb.startinode. Each inode has a number, indicating its 107 // position on the disk. 108 // 109 // The kernel keeps a cache of in-use inodes in memory 110 // to provide a place for synchronizing access 111 // to inodes used by multiple processes. The cached 112 // inodes include book-keeping information that is 113 // not stored on disk: ip->ref and ip->valid. 114 // 115 // An inode and its in-memory representation go through a 116 // sequence of states before they can be used by the 117 // rest of the file system code. 118 // 119 // * Allocation: an inode is allocated if its type (on disk) 120 // is non-zero. ialloc() allocates, and iput() frees if 121 // the reference and link counts have fallen to zero. 122 // 123 // * Referencing in cache: an entry in the inode cache 124 // is free if ip->ref is zero. Otherwise ip->ref tracks 125 // the number of in-memory pointers to the entry (open 126 // files and current directories). iget() finds or 127 // creates a cache entry and increments its ref; iput() 128 // decrements ref. 129 // 130 // * Valid: the information (type, size, &c) in an inode 131 // cache entry is only correct when ip->valid is 1. 132 // ilock() reads the inode from 133 // the disk and sets ip->valid, while iput() clears 134 // ip->valid if ip->ref has fallen to zero. 135 // 136 // * Locked: file system code may only examine and modify 137 // the information in an inode and its content if it 138 // has first locked the inode. 139 // 140 // Thus a typical sequence is: 141 // ip = iget(dev, inum) 142 // ilock(ip) 143 // ... examine and modify ip->xxx ... 144 // iunlock(ip) 145 // iput(ip) 146 // 147 // ilock() is separate from iget() so that system calls can 148 // get a long-term reference to an inode (as for an open file) 149 // and only lock it for short periods (e.g., in read()). 150 // The separation also helps avoid deadlock and races during 151 // pathname lookup. iget() increments ip->ref so that the inode 152 // stays cached and pointers to it remain valid. 153 // 154 // Many internal file system functions expect the caller to 155 // have locked the inodes involved; this lets callers create 156 // multi-step atomic operations. 157 // 158 // The icache.lock spin-lock defends the allocation of icache 159 // entries. Since ip->ref indicates whether an entry is free, 160 // and ip->dev and ip->inum indicate which i-node an entry 161 // holds, one must hold icache.lock while using any of those fields. 162 // 163 // An ip->lock sleep-lock defends all ip-> fields other than ref, 164 // dev, and inum. One must hold ip->lock in order to 165 // read or write that inode's ip->valid, ip->size, ip->type, &c. 166 167 struct { 168 struct spinlock lock; 169 struct inode inode[NINODE]; 170 } icache; 171 172 void 173 iinit(int dev) 174 { 175 int i = 0; 176 177 initlock(&icache.lock, "icache"); 178 for(i = 0; i < NINODE; i++) { 179 initsleeplock(&icache.inode[i].lock, "inode"); 180 } 181 182 readsb(dev, &sb); 183 cprintf("sb: size %d nblocks %d ninodes %d nlog %d logstart %d\ 184 inodestart %d bmap start %d\n", sb.size, sb.nblocks, 185 sb.ninodes, sb.nlog, sb.logstart, sb.inodestart, 186 sb.bmapstart); 187 } 188 189 static struct inode* iget(uint dev, uint inum); 190 191 //PAGEBREAK! 192 // Allocate an inode on device dev. 193 // Mark it as allocated by giving it type type. 194 // Returns an unlocked but allocated and referenced inode. 195 struct inode* 196 ialloc(uint dev, short type) 197 { 198 int inum; 199 struct buf *bp; 200 struct dinode *dip; 201 202 for(inum = 1; inum < sb.ninodes; inum++){ 203 bp = bread(dev, IBLOCK(inum, sb)); 204 dip = (struct dinode*)bp->data + inum%IPB; 205 if(dip->type == 0){ // a free inode 206 memset(dip, 0, sizeof(*dip)); 207 dip->type = type; 208 log_write(bp); // mark it allocated on the disk 209 brelse(bp); 210 return iget(dev, inum); 211 } 212 brelse(bp); 213 } 214 panic("ialloc: no inodes"); 215 } 216 217 // Copy a modified in-memory inode to disk. 218 // Must be called after every change to an ip->xxx field 219 // that lives on disk, since i-node cache is write-through. 220 // Caller must hold ip->lock. 221 void 222 iupdate(struct inode *ip) 223 { 224 struct buf *bp; 225 struct dinode *dip; 226 227 bp = bread(ip->dev, IBLOCK(ip->inum, sb)); 228 dip = (struct dinode*)bp->data + ip->inum%IPB; 229 dip->type = ip->type; 230 dip->major = ip->major; 231 dip->minor = ip->minor; 232 dip->nlink = ip->nlink; 233 dip->size = ip->size; 234 memmove(dip->addrs, ip->addrs, sizeof(ip->addrs)); 235 log_write(bp); 236 brelse(bp); 237 } 238 239 // Find the inode with number inum on device dev 240 // and return the in-memory copy. Does not lock 241 // the inode and does not read it from disk. 242 static struct inode* 243 iget(uint dev, uint inum) 244 { 245 struct inode *ip, *empty; 246 247 acquire(&icache.lock); 248 249 // Is the inode already cached? 250 empty = 0; 251 for(ip = &icache.inode[0]; ip < &icache.inode[NINODE]; ip++){ 252 if(ip->ref > 0 && ip->dev == dev && ip->inum == inum){ 253 ip->ref++; 254 release(&icache.lock); 255 return ip; 256 } 257 if(empty == 0 && ip->ref == 0) // Remember empty slot. 258 empty = ip; 259 } 260 261 // Recycle an inode cache entry. 262 if(empty == 0) 263 panic("iget: no inodes"); 264 265 ip = empty; 266 ip->dev = dev; 267 ip->inum = inum; 268 ip->ref = 1; 269 ip->valid = 0; 270 release(&icache.lock); 271 272 return ip; 273 } 274 275 // Increment reference count for ip. 276 // Returns ip to enable ip = idup(ip1) idiom. 277 struct inode* 278 idup(struct inode *ip) 279 { 280 acquire(&icache.lock); 281 ip->ref++; 282 release(&icache.lock); 283 return ip; 284 } 285 286 // Lock the given inode. 287 // Reads the inode from disk if necessary. 288 void 289 ilock(struct inode *ip) 290 { 291 struct buf *bp; 292 struct dinode *dip; 293 294 if(ip == 0 || ip->ref < 1) 295 panic("ilock"); 296 297 acquiresleep(&ip->lock); 298 299 if(ip->valid == 0){ 300 bp = bread(ip->dev, IBLOCK(ip->inum, sb)); 301 dip = (struct dinode*)bp->data + ip->inum%IPB; 302 ip->type = dip->type; 303 ip->major = dip->major; 304 ip->minor = dip->minor; 305 ip->nlink = dip->nlink; 306 ip->size = dip->size; 307 memmove(ip->addrs, dip->addrs, sizeof(ip->addrs)); 308 brelse(bp); 309 ip->valid = 1; 310 if(ip->type == 0) 311 panic("ilock: no type"); 312 } 313 } 314 315 // Unlock the given inode. 316 void 317 iunlock(struct inode *ip) 318 { 319 if(ip == 0 || !holdingsleep(&ip->lock) || ip->ref < 1) 320 panic("iunlock"); 321 322 releasesleep(&ip->lock); 323 } 324 325 // Drop a reference to an in-memory inode. 326 // If that was the last reference, the inode cache entry can 327 // be recycled. 328 // If that was the last reference and the inode has no links 329 // to it, free the inode (and its content) on disk. 330 // All calls to iput() must be inside a transaction in 331 // case it has to free the inode. 332 void 333 iput(struct inode *ip) 334 { 335 acquiresleep(&ip->lock); 336 if(ip->valid && ip->nlink == 0){ 337 acquire(&icache.lock); 338 int r = ip->ref; 339 release(&icache.lock); 340 if(r == 1){ 341 // inode has no links and no other references: truncate and free. 342 itrunc(ip); 343 ip->type = 0; 344 iupdate(ip); 345 ip->valid = 0; 346 } 347 } 348 releasesleep(&ip->lock); 349 350 acquire(&icache.lock); 351 ip->ref--; 352 release(&icache.lock); 353 } 354 355 // Common idiom: unlock, then put. 356 void 357 iunlockput(struct inode *ip) 358 { 359 iunlock(ip); 360 iput(ip); 361 } 362 363 //PAGEBREAK! 364 // Inode content 365 // 366 // The content (data) associated with each inode is stored 367 // in blocks on the disk. The first NDIRECT block numbers 368 // are listed in ip->addrs[]. The next NINDIRECT blocks are 369 // listed in block ip->addrs[NDIRECT]. 370 371 // Return the disk block address of the nth block in inode ip. 372 // If there is no such block, bmap allocates one. 373 static uint 374 bmap(struct inode *ip, uint bn) 375 { 376 uint addr, *a; 377 struct buf *bp; 378 379 if(bn < NDIRECT){ 380 if((addr = ip->addrs[bn]) == 0) 381 ip->addrs[bn] = addr = balloc(ip->dev); 382 return addr; 383 } 384 bn -= NDIRECT; 385 386 if(bn < NINDIRECT){ 387 // Load indirect block, allocating if necessary. 388 if((addr = ip->addrs[NDIRECT]) == 0) 389 ip->addrs[NDIRECT] = addr = balloc(ip->dev); 390 bp = bread(ip->dev, addr); 391 a = (uint*)bp->data; 392 if((addr = a[bn]) == 0){ 393 a[bn] = addr = balloc(ip->dev); 394 log_write(bp); 395 } 396 brelse(bp); 397 return addr; 398 } 399 400 panic("bmap: out of range"); 401 } 402 403 // Truncate inode (discard contents). 404 // Only called when the inode has no links 405 // to it (no directory entries referring to it) 406 // and has no in-memory reference to it (is 407 // not an open file or current directory). 408 static void 409 itrunc(struct inode *ip) 410 { 411 int i, j; 412 struct buf *bp; 413 uint *a; 414 415 for(i = 0; i < NDIRECT; i++){ 416 if(ip->addrs[i]){ 417 bfree(ip->dev, ip->addrs[i]); 418 ip->addrs[i] = 0; 419 } 420 } 421 422 if(ip->addrs[NDIRECT]){ 423 bp = bread(ip->dev, ip->addrs[NDIRECT]); 424 a = (uint*)bp->data; 425 for(j = 0; j < NINDIRECT; j++){ 426 if(a[j]) 427 bfree(ip->dev, a[j]); 428 } 429 brelse(bp); 430 bfree(ip->dev, ip->addrs[NDIRECT]); 431 ip->addrs[NDIRECT] = 0; 432 } 433 434 ip->size = 0; 435 iupdate(ip); 436 } 437 438 // Copy stat information from inode. 439 // Caller must hold ip->lock. 440 void 441 stati(struct inode *ip, struct stat *st) 442 { 443 st->dev = ip->dev; 444 st->ino = ip->inum; 445 st->type = ip->type; 446 st->nlink = ip->nlink; 447 st->size = ip->size; 448 } 449 450 //PAGEBREAK! 451 // Read data from inode. 452 // Caller must hold ip->lock. 453 int 454 readi(struct inode *ip, char *dst, uint off, uint n) 455 { 456 uint tot, m; 457 struct buf *bp; 458 459 if(ip->type == T_DEV){ 460 if(ip->major < 0 || ip->major >= NDEV || !devsw[ip->major].read) 461 return -1; 462 return devsw[ip->major].read(ip, dst, n); 463 } 464 465 if(off > ip->size || off + n < off) 466 return -1; 467 if(off + n > ip->size) 468 n = ip->size - off; 469 470 for(tot=0; tot<n; tot+=m, off+=m, dst+=m){ 471 bp = bread(ip->dev, bmap(ip, off/BSIZE)); 472 m = min(n - tot, BSIZE - off%BSIZE); 473 memmove(dst, bp->data + off%BSIZE, m); 474 brelse(bp); 475 } 476 return n; 477 } 478 479 // PAGEBREAK! 480 // Write data to inode. 481 // Caller must hold ip->lock. 482 int 483 writei(struct inode *ip, char *src, uint off, uint n) 484 { 485 uint tot, m; 486 struct buf *bp; 487 488 if(ip->type == T_DEV){ 489 if(ip->major < 0 || ip->major >= NDEV || !devsw[ip->major].write) 490 return -1; 491 return devsw[ip->major].write(ip, src, n); 492 } 493 494 if(off > ip->size || off + n < off) 495 return -1; 496 if(off + n > MAXFILE*BSIZE) 497 return -1; 498 499 for(tot=0; tot<n; tot+=m, off+=m, src+=m){ 500 bp = bread(ip->dev, bmap(ip, off/BSIZE)); 501 m = min(n - tot, BSIZE - off%BSIZE); 502 memmove(bp->data + off%BSIZE, src, m); 503 log_write(bp); 504 brelse(bp); 505 } 506 507 if(n > 0 && off > ip->size){ 508 ip->size = off; 509 iupdate(ip); 510 } 511 return n; 512 } 513 514 //PAGEBREAK! 515 // Directories 516 517 int 518 namecmp(const char *s, const char *t) 519 { 520 return strncmp(s, t, DIRSIZ); 521 } 522 523 // Look for a directory entry in a directory. 524 // If found, set *poff to byte offset of entry. 525 struct inode* 526 dirlookup(struct inode *dp, char *name, uint *poff) 527 { 528 uint off, inum; 529 struct dirent de; 530 531 if(dp->type != T_DIR) 532 panic("dirlookup not DIR"); 533 534 for(off = 0; off < dp->size; off += sizeof(de)){ 535 if(readi(dp, (char*)&de, off, sizeof(de)) != sizeof(de)) 536 panic("dirlookup read"); 537 if(de.inum == 0) 538 continue; 539 if(namecmp(name, de.name) == 0){ 540 // entry matches path element 541 if(poff) 542 *poff = off; 543 inum = de.inum; 544 return iget(dp->dev, inum); 545 } 546 } 547 548 return 0; 549 } 550 551 // Write a new directory entry (name, inum) into the directory dp. 552 int 553 dirlink(struct inode *dp, char *name, uint inum) 554 { 555 int off; 556 struct dirent de; 557 struct inode *ip; 558 559 // Check that name is not present. 560 if((ip = dirlookup(dp, name, 0)) != 0){ 561 iput(ip); 562 return -1; 563 } 564 565 // Look for an empty dirent. 566 for(off = 0; off < dp->size; off += sizeof(de)){ 567 if(readi(dp, (char*)&de, off, sizeof(de)) != sizeof(de)) 568 panic("dirlink read"); 569 if(de.inum == 0) 570 break; 571 } 572 573 strncpy(de.name, name, DIRSIZ); 574 de.inum = inum; 575 if(writei(dp, (char*)&de, off, sizeof(de)) != sizeof(de)) 576 panic("dirlink"); 577 578 return 0; 579 } 580 581 //PAGEBREAK! 582 // Paths 583 584 // Copy the next path element from path into name. 585 // Return a pointer to the element following the copied one. 586 // The returned path has no leading slashes, 587 // so the caller can check *path=='\0' to see if the name is the last one. 588 // If no name to remove, return 0. 589 // 590 // Examples: 591 // skipelem("a/bb/c", name) = "bb/c", setting name = "a" 592 // skipelem("///a//bb", name) = "bb", setting name = "a" 593 // skipelem("a", name) = "", setting name = "a" 594 // skipelem("", name) = skipelem("////", name) = 0 595 // 596 static char* 597 skipelem(char *path, char *name) 598 { 599 char *s; 600 int len; 601 602 while(*path == '/') 603 path++; 604 if(*path == 0) 605 return 0; 606 s = path; 607 while(*path != '/' && *path != 0) 608 path++; 609 len = path - s; 610 if(len >= DIRSIZ) 611 memmove(name, s, DIRSIZ); 612 else { 613 memmove(name, s, len); 614 name[len] = 0; 615 } 616 while(*path == '/') 617 path++; 618 return path; 619 } 620 621 // Look up and return the inode for a path name. 622 // If parent != 0, return the inode for the parent and copy the final 623 // path element into name, which must have room for DIRSIZ bytes. 624 // Must be called inside a transaction since it calls iput(). 625 static struct inode* 626 namex(char *path, int nameiparent, char *name) 627 { 628 struct inode *ip, *next; 629 630 if(*path == '/') 631 ip = iget(ROOTDEV, ROOTINO); 632 else 633 ip = idup(myproc()->cwd); 634 635 while((path = skipelem(path, name)) != 0){ 636 ilock(ip); 637 if(ip->type != T_DIR){ 638 iunlockput(ip); 639 return 0; 640 } 641 if(nameiparent && *path == '\0'){ 642 // Stop one level early. 643 iunlock(ip); 644 return ip; 645 } 646 if((next = dirlookup(ip, name, 0)) == 0){ 647 iunlockput(ip); 648 return 0; 649 } 650 iunlockput(ip); 651 ip = next; 652 } 653 if(nameiparent){ 654 iput(ip); 655 return 0; 656 } 657 return ip; 658 } 659 660 struct inode* 661 namei(char *path) 662 { 663 char name[DIRSIZ]; 664 return namex(path, 0, name); 665 } 666 667 struct inode* 668 nameiparent(char *path, char *name) 669 { 670 return namex(path, 1, name); 671 } 672