1 // File system implementation. Five layers: 2 // + Blocks: allocator for raw disk blocks. 3 // + Log: crash recovery for multi-step updates. 4 // + Files: inode allocator, reading, writing, metadata. 5 // + Directories: inode with special contents (list of other inodes!) 6 // + Names: paths like /usr/rtm/xv6/fs.c for convenient naming. 7 // 8 // This file contains the low-level file system manipulation 9 // routines. The (higher-level) system call implementations 10 // are in sysfile.c. 11 12 #include "types.h" 13 #include "defs.h" 14 #include "param.h" 15 #include "stat.h" 16 #include "mmu.h" 17 #include "proc.h" 18 #include "spinlock.h" 19 #include "fs.h" 20 #include "buf.h" 21 #include "file.h" 22 23 #define min(a, b) ((a) < (b) ? (a) : (b)) 24 static void itrunc(struct inode*); 25 struct superblock sb; // there should be one per dev, but we run with one dev 26 27 // Read the super block. 28 void 29 readsb(int dev, struct superblock *sb) 30 { 31 struct buf *bp; 32 33 bp = bread(dev, 1); 34 memmove(sb, bp->data, sizeof(*sb)); 35 brelse(bp); 36 } 37 38 // Zero a block. 39 static void 40 bzero(int dev, int bno) 41 { 42 struct buf *bp; 43 44 bp = bread(dev, bno); 45 memset(bp->data, 0, BSIZE); 46 log_write(bp); 47 brelse(bp); 48 } 49 50 // Blocks. 51 52 // Allocate a zeroed disk block. 53 static uint 54 balloc(uint dev) 55 { 56 int b, bi, m; 57 struct buf *bp; 58 59 bp = 0; 60 for(b = 0; b < sb.size; b += BPB){ 61 bp = bread(dev, BBLOCK(b, sb)); 62 for(bi = 0; bi < BPB && b + bi < sb.size; bi++){ 63 m = 1 << (bi % 8); 64 if((bp->data[bi/8] & m) == 0){ // Is block free? 65 bp->data[bi/8] |= m; // Mark block in use. 66 log_write(bp); 67 brelse(bp); 68 bzero(dev, b + bi); 69 return b + bi; 70 } 71 } 72 brelse(bp); 73 } 74 panic("balloc: out of blocks"); 75 } 76 77 // Free a disk block. 78 static void 79 bfree(int dev, uint b) 80 { 81 struct buf *bp; 82 int bi, m; 83 84 readsb(dev, &sb); 85 bp = bread(dev, BBLOCK(b, sb)); 86 bi = b % BPB; 87 m = 1 << (bi % 8); 88 if((bp->data[bi/8] & m) == 0) 89 panic("freeing free block"); 90 bp->data[bi/8] &= ~m; 91 log_write(bp); 92 brelse(bp); 93 } 94 95 // Inodes. 96 // 97 // An inode describes a single unnamed file. 98 // The inode disk structure holds metadata: the file's type, 99 // its size, the number of links referring to it, and the 100 // list of blocks holding the file's content. 101 // 102 // The inodes are laid out sequentially on disk at 103 // sb.startinode. Each inode has a number, indicating its 104 // position on the disk. 105 // 106 // The kernel keeps a cache of in-use inodes in memory 107 // to provide a place for synchronizing access 108 // to inodes used by multiple processes. The cached 109 // inodes include book-keeping information that is 110 // not stored on disk: ip->ref and ip->flags. 111 // 112 // An inode and its in-memory represtative go through a 113 // sequence of states before they can be used by the 114 // rest of the file system code. 115 // 116 // * Allocation: an inode is allocated if its type (on disk) 117 // is non-zero. ialloc() allocates, iput() frees if 118 // the link count has fallen to zero. 119 // 120 // * Referencing in cache: an entry in the inode cache 121 // is free if ip->ref is zero. Otherwise ip->ref tracks 122 // the number of in-memory pointers to the entry (open 123 // files and current directories). iget() to find or 124 // create a cache entry and increment its ref, iput() 125 // to decrement ref. 126 // 127 // * Valid: the information (type, size, &c) in an inode 128 // cache entry is only correct when the I_VALID bit 129 // is set in ip->flags. ilock() reads the inode from 130 // the disk and sets I_VALID, while iput() clears 131 // I_VALID if ip->ref has fallen to zero. 132 // 133 // * Locked: file system code may only examine and modify 134 // the information in an inode and its content if it 135 // has first locked the inode. The I_BUSY flag indicates 136 // that the inode is locked. ilock() sets I_BUSY, 137 // while iunlock clears it. 138 // 139 // Thus a typical sequence is: 140 // ip = iget(dev, inum) 141 // ilock(ip) 142 // ... examine and modify ip->xxx ... 143 // iunlock(ip) 144 // iput(ip) 145 // 146 // ilock() is separate from iget() so that system calls can 147 // get a long-term reference to an inode (as for an open file) 148 // and only lock it for short periods (e.g., in read()). 149 // The separation also helps avoid deadlock and races during 150 // pathname lookup. iget() increments ip->ref so that the inode 151 // stays cached and pointers to it remain valid. 152 // 153 // Many internal file system functions expect the caller to 154 // have locked the inodes involved; this lets callers create 155 // multi-step atomic operations. 156 157 struct { 158 struct spinlock lock; 159 struct inode inode[NINODE]; 160 } icache; 161 162 void 163 iinit(int dev) 164 { 165 initlock(&icache.lock, "icache"); 166 readsb(dev, &sb); 167 cprintf("sb: size %d nblocks %d ninodes %d nlog %d logstart %d inodestart %d bmap start %d\n", sb.size, 168 sb.nblocks, sb.ninodes, sb.nlog, sb.logstart, sb.inodestart, sb.bmapstart); 169 } 170 171 static struct inode* iget(uint dev, uint inum); 172 173 //PAGEBREAK! 174 // Allocate a new inode with the given type on device dev. 175 // A free inode has a type of zero. 176 struct inode* 177 ialloc(uint dev, short type) 178 { 179 int inum; 180 struct buf *bp; 181 struct dinode *dip; 182 183 for(inum = 1; inum < sb.ninodes; inum++){ 184 bp = bread(dev, IBLOCK(inum, sb)); 185 dip = (struct dinode*)bp->data + inum%IPB; 186 if(dip->type == 0){ // a free inode 187 memset(dip, 0, sizeof(*dip)); 188 dip->type = type; 189 log_write(bp); // mark it allocated on the disk 190 brelse(bp); 191 return iget(dev, inum); 192 } 193 brelse(bp); 194 } 195 panic("ialloc: no inodes"); 196 } 197 198 // Copy a modified in-memory inode to disk. 199 void 200 iupdate(struct inode *ip) 201 { 202 struct buf *bp; 203 struct dinode *dip; 204 205 bp = bread(ip->dev, IBLOCK(ip->inum, sb)); 206 dip = (struct dinode*)bp->data + ip->inum%IPB; 207 dip->type = ip->type; 208 dip->major = ip->major; 209 dip->minor = ip->minor; 210 dip->nlink = ip->nlink; 211 dip->size = ip->size; 212 memmove(dip->addrs, ip->addrs, sizeof(ip->addrs)); 213 log_write(bp); 214 brelse(bp); 215 } 216 217 // Find the inode with number inum on device dev 218 // and return the in-memory copy. Does not lock 219 // the inode and does not read it from disk. 220 static struct inode* 221 iget(uint dev, uint inum) 222 { 223 struct inode *ip, *empty; 224 225 acquire(&icache.lock); 226 227 // Is the inode already cached? 228 empty = 0; 229 for(ip = &icache.inode[0]; ip < &icache.inode[NINODE]; ip++){ 230 if(ip->ref > 0 && ip->dev == dev && ip->inum == inum){ 231 ip->ref++; 232 release(&icache.lock); 233 return ip; 234 } 235 if(empty == 0 && ip->ref == 0) // Remember empty slot. 236 empty = ip; 237 } 238 239 // Recycle an inode cache entry. 240 if(empty == 0) 241 panic("iget: no inodes"); 242 243 ip = empty; 244 ip->dev = dev; 245 ip->inum = inum; 246 ip->ref = 1; 247 ip->flags = 0; 248 release(&icache.lock); 249 250 return ip; 251 } 252 253 // Increment reference count for ip. 254 // Returns ip to enable ip = idup(ip1) idiom. 255 struct inode* 256 idup(struct inode *ip) 257 { 258 acquire(&icache.lock); 259 ip->ref++; 260 release(&icache.lock); 261 return ip; 262 } 263 264 // Lock the given inode. 265 // Reads the inode from disk if necessary. 266 void 267 ilock(struct inode *ip) 268 { 269 struct buf *bp; 270 struct dinode *dip; 271 272 if(ip == 0 || ip->ref < 1) 273 panic("ilock"); 274 275 acquire(&icache.lock); 276 while(ip->flags & I_BUSY) 277 sleep(ip, &icache.lock); 278 ip->flags |= I_BUSY; 279 release(&icache.lock); 280 281 if(!(ip->flags & I_VALID)){ 282 bp = bread(ip->dev, IBLOCK(ip->inum, sb)); 283 dip = (struct dinode*)bp->data + ip->inum%IPB; 284 ip->type = dip->type; 285 ip->major = dip->major; 286 ip->minor = dip->minor; 287 ip->nlink = dip->nlink; 288 ip->size = dip->size; 289 memmove(ip->addrs, dip->addrs, sizeof(ip->addrs)); 290 brelse(bp); 291 ip->flags |= I_VALID; 292 if(ip->type == 0) 293 panic("ilock: no type"); 294 } 295 } 296 297 // Unlock the given inode. 298 void 299 iunlock(struct inode *ip) 300 { 301 if(ip == 0 || !(ip->flags & I_BUSY) || ip->ref < 1) 302 panic("iunlock"); 303 304 acquire(&icache.lock); 305 ip->flags &= ~I_BUSY; 306 wakeup(ip); 307 release(&icache.lock); 308 } 309 310 // Drop a reference to an in-memory inode. 311 // If that was the last reference, the inode cache entry can 312 // be recycled. 313 // If that was the last reference and the inode has no links 314 // to it, free the inode (and its content) on disk. 315 // All calls to iput() must be inside a transaction in 316 // case it has to free the inode. 317 void 318 iput(struct inode *ip) 319 { 320 acquire(&icache.lock); 321 if(ip->ref == 1 && (ip->flags & I_VALID) && ip->nlink == 0){ 322 // inode has no links and no other references: truncate and free. 323 if(ip->flags & I_BUSY) 324 panic("iput busy"); 325 ip->flags |= I_BUSY; 326 release(&icache.lock); 327 itrunc(ip); 328 ip->type = 0; 329 iupdate(ip); 330 acquire(&icache.lock); 331 ip->flags = 0; 332 wakeup(ip); 333 } 334 ip->ref--; 335 release(&icache.lock); 336 } 337 338 // Common idiom: unlock, then put. 339 void 340 iunlockput(struct inode *ip) 341 { 342 iunlock(ip); 343 iput(ip); 344 } 345 346 //PAGEBREAK! 347 // Inode content 348 // 349 // The content (data) associated with each inode is stored 350 // in blocks on the disk. The first NDIRECT block numbers 351 // are listed in ip->addrs[]. The next NINDIRECT blocks are 352 // listed in block ip->addrs[NDIRECT]. 353 354 // Return the disk block address of the nth block in inode ip. 355 // If there is no such block, bmap allocates one. 356 static uint 357 bmap(struct inode *ip, uint bn) 358 { 359 uint addr, *a; 360 struct buf *bp; 361 362 if(bn < NDIRECT){ 363 if((addr = ip->addrs[bn]) == 0) 364 ip->addrs[bn] = addr = balloc(ip->dev); 365 return addr; 366 } 367 bn -= NDIRECT; 368 369 if(bn < NINDIRECT){ 370 // Load indirect block, allocating if necessary. 371 if((addr = ip->addrs[NDIRECT]) == 0) 372 ip->addrs[NDIRECT] = addr = balloc(ip->dev); 373 bp = bread(ip->dev, addr); 374 a = (uint*)bp->data; 375 if((addr = a[bn]) == 0){ 376 a[bn] = addr = balloc(ip->dev); 377 log_write(bp); 378 } 379 brelse(bp); 380 return addr; 381 } 382 383 panic("bmap: out of range"); 384 } 385 386 // Truncate inode (discard contents). 387 // Only called when the inode has no links 388 // to it (no directory entries referring to it) 389 // and has no in-memory reference to it (is 390 // not an open file or current directory). 391 static void 392 itrunc(struct inode *ip) 393 { 394 int i, j; 395 struct buf *bp; 396 uint *a; 397 398 for(i = 0; i < NDIRECT; i++){ 399 if(ip->addrs[i]){ 400 bfree(ip->dev, ip->addrs[i]); 401 ip->addrs[i] = 0; 402 } 403 } 404 405 if(ip->addrs[NDIRECT]){ 406 bp = bread(ip->dev, ip->addrs[NDIRECT]); 407 a = (uint*)bp->data; 408 for(j = 0; j < NINDIRECT; j++){ 409 if(a[j]) 410 bfree(ip->dev, a[j]); 411 } 412 brelse(bp); 413 bfree(ip->dev, ip->addrs[NDIRECT]); 414 ip->addrs[NDIRECT] = 0; 415 } 416 417 ip->size = 0; 418 iupdate(ip); 419 } 420 421 // Copy stat information from inode. 422 void 423 stati(struct inode *ip, struct stat *st) 424 { 425 st->dev = ip->dev; 426 st->ino = ip->inum; 427 st->type = ip->type; 428 st->nlink = ip->nlink; 429 st->size = ip->size; 430 } 431 432 //PAGEBREAK! 433 // Read data from inode. 434 int 435 readi(struct inode *ip, char *dst, uint off, uint n) 436 { 437 uint tot, m; 438 struct buf *bp; 439 440 if(ip->type == T_DEV){ 441 if(ip->major < 0 || ip->major >= NDEV || !devsw[ip->major].read) 442 return -1; 443 return devsw[ip->major].read(ip, dst, n); 444 } 445 446 if(off > ip->size || off + n < off) 447 return -1; 448 if(off + n > ip->size) 449 n = ip->size - off; 450 451 for(tot=0; tot<n; tot+=m, off+=m, dst+=m){ 452 bp = bread(ip->dev, bmap(ip, off/BSIZE)); 453 m = min(n - tot, BSIZE - off%BSIZE); 454 memmove(dst, bp->data + off%BSIZE, m); 455 brelse(bp); 456 } 457 return n; 458 } 459 460 // PAGEBREAK! 461 // Write data to inode. 462 int 463 writei(struct inode *ip, char *src, uint off, uint n) 464 { 465 uint tot, m; 466 struct buf *bp; 467 468 if(ip->type == T_DEV){ 469 if(ip->major < 0 || ip->major >= NDEV || !devsw[ip->major].write) 470 return -1; 471 return devsw[ip->major].write(ip, src, n); 472 } 473 474 if(off > ip->size || off + n < off) 475 return -1; 476 if(off + n > MAXFILE*BSIZE) 477 return -1; 478 479 for(tot=0; tot<n; tot+=m, off+=m, src+=m){ 480 bp = bread(ip->dev, bmap(ip, off/BSIZE)); 481 m = min(n - tot, BSIZE - off%BSIZE); 482 memmove(bp->data + off%BSIZE, src, m); 483 log_write(bp); 484 brelse(bp); 485 } 486 487 if(n > 0 && off > ip->size){ 488 ip->size = off; 489 iupdate(ip); 490 } 491 return n; 492 } 493 494 //PAGEBREAK! 495 // Directories 496 497 int 498 namecmp(const char *s, const char *t) 499 { 500 return strncmp(s, t, DIRSIZ); 501 } 502 503 // Look for a directory entry in a directory. 504 // If found, set *poff to byte offset of entry. 505 struct inode* 506 dirlookup(struct inode *dp, char *name, uint *poff) 507 { 508 uint off, inum; 509 struct dirent de; 510 511 if(dp->type != T_DIR) 512 panic("dirlookup not DIR"); 513 514 for(off = 0; off < dp->size; off += sizeof(de)){ 515 if(readi(dp, (char*)&de, off, sizeof(de)) != sizeof(de)) 516 panic("dirlink read"); 517 if(de.inum == 0) 518 continue; 519 if(namecmp(name, de.name) == 0){ 520 // entry matches path element 521 if(poff) 522 *poff = off; 523 inum = de.inum; 524 return iget(dp->dev, inum); 525 } 526 } 527 528 return 0; 529 } 530 531 // Write a new directory entry (name, inum) into the directory dp. 532 int 533 dirlink(struct inode *dp, char *name, uint inum) 534 { 535 int off; 536 struct dirent de; 537 struct inode *ip; 538 539 // Check that name is not present. 540 if((ip = dirlookup(dp, name, 0)) != 0){ 541 iput(ip); 542 return -1; 543 } 544 545 // Look for an empty dirent. 546 for(off = 0; off < dp->size; off += sizeof(de)){ 547 if(readi(dp, (char*)&de, off, sizeof(de)) != sizeof(de)) 548 panic("dirlink read"); 549 if(de.inum == 0) 550 break; 551 } 552 553 strncpy(de.name, name, DIRSIZ); 554 de.inum = inum; 555 if(writei(dp, (char*)&de, off, sizeof(de)) != sizeof(de)) 556 panic("dirlink"); 557 558 return 0; 559 } 560 561 //PAGEBREAK! 562 // Paths 563 564 // Copy the next path element from path into name. 565 // Return a pointer to the element following the copied one. 566 // The returned path has no leading slashes, 567 // so the caller can check *path=='\0' to see if the name is the last one. 568 // If no name to remove, return 0. 569 // 570 // Examples: 571 // skipelem("a/bb/c", name) = "bb/c", setting name = "a" 572 // skipelem("///a//bb", name) = "bb", setting name = "a" 573 // skipelem("a", name) = "", setting name = "a" 574 // skipelem("", name) = skipelem("////", name) = 0 575 // 576 static char* 577 skipelem(char *path, char *name) 578 { 579 char *s; 580 int len; 581 582 while(*path == '/') 583 path++; 584 if(*path == 0) 585 return 0; 586 s = path; 587 while(*path != '/' && *path != 0) 588 path++; 589 len = path - s; 590 if(len >= DIRSIZ) 591 memmove(name, s, DIRSIZ); 592 else { 593 memmove(name, s, len); 594 name[len] = 0; 595 } 596 while(*path == '/') 597 path++; 598 return path; 599 } 600 601 // Look up and return the inode for a path name. 602 // If parent != 0, return the inode for the parent and copy the final 603 // path element into name, which must have room for DIRSIZ bytes. 604 // Must be called inside a transaction since it calls iput(). 605 static struct inode* 606 namex(char *path, int nameiparent, char *name) 607 { 608 struct inode *ip, *next; 609 610 if(*path == '/') 611 ip = iget(ROOTDEV, ROOTINO); 612 else 613 ip = idup(proc->cwd); 614 615 while((path = skipelem(path, name)) != 0){ 616 ilock(ip); 617 if(ip->type != T_DIR){ 618 iunlockput(ip); 619 return 0; 620 } 621 if(nameiparent && *path == '\0'){ 622 // Stop one level early. 623 iunlock(ip); 624 return ip; 625 } 626 if((next = dirlookup(ip, name, 0)) == 0){ 627 iunlockput(ip); 628 return 0; 629 } 630 iunlockput(ip); 631 ip = next; 632 } 633 if(nameiparent){ 634 iput(ip); 635 return 0; 636 } 637 return ip; 638 } 639 640 struct inode* 641 namei(char *path) 642 { 643 char name[DIRSIZ]; 644 return namex(path, 0, name); 645 } 646 647 struct inode* 648 nameiparent(char *path, char *name) 649 { 650 return namex(path, 1, name); 651 } 652