1 // File system implementation. Five layers: 2 // + Blocks: allocator for raw disk blocks. 3 // + Log: crash recovery for multi-step updates. 4 // + Files: inode allocator, reading, writing, metadata. 5 // + Directories: inode with special contents (list of other inodes!) 6 // + Names: paths like /usr/rtm/xv6/fs.c for convenient naming. 7 // 8 // This file contains the low-level file system manipulation 9 // routines. The (higher-level) system call implementations 10 // are in sysfile.c. 11 12 #include "types.h" 13 #include "defs.h" 14 #include "param.h" 15 #include "stat.h" 16 #include "mmu.h" 17 #include "proc.h" 18 #include "spinlock.h" 19 #include "fs.h" 20 #include "buf.h" 21 #include "file.h" 22 23 #define min(a, b) ((a) < (b) ? (a) : (b)) 24 static void itrunc(struct inode*); 25 26 // Read the super block. 27 void 28 readsb(int dev, struct superblock *sb) 29 { 30 struct buf *bp; 31 32 bp = bread(dev, 1); 33 memmove(sb, bp->data, sizeof(*sb)); 34 brelse(bp); 35 } 36 37 // Zero a block. 38 static void 39 bzero(int dev, int bno) 40 { 41 struct buf *bp; 42 43 bp = bread(dev, bno); 44 memset(bp->data, 0, BSIZE); 45 log_write(bp); 46 brelse(bp); 47 } 48 49 // Blocks. 50 51 // Allocate a zeroed disk block. 52 static uint 53 balloc(uint dev) 54 { 55 int b, bi, m; 56 struct buf *bp; 57 struct superblock sb; 58 59 bp = 0; 60 readsb(dev, &sb); 61 for(b = 0; b < sb.size; b += BPB){ 62 bp = bread(dev, BBLOCK(b, sb.ninodes)); 63 for(bi = 0; bi < BPB && b + bi < sb.size; bi++){ 64 m = 1 << (bi % 8); 65 if((bp->data[bi/8] & m) == 0){ // Is block free? 66 bp->data[bi/8] |= m; // Mark block in use. 67 log_write(bp); 68 brelse(bp); 69 bzero(dev, b + bi); 70 return b + bi; 71 } 72 } 73 brelse(bp); 74 } 75 panic("balloc: out of blocks"); 76 } 77 78 // Free a disk block. 79 static void 80 bfree(int dev, uint b) 81 { 82 struct buf *bp; 83 struct superblock sb; 84 int bi, m; 85 86 readsb(dev, &sb); 87 bp = bread(dev, BBLOCK(b, sb.ninodes)); 88 bi = b % BPB; 89 m = 1 << (bi % 8); 90 if((bp->data[bi/8] & m) == 0) 91 panic("freeing free block"); 92 bp->data[bi/8] &= ~m; 93 log_write(bp); 94 brelse(bp); 95 } 96 97 // Inodes. 98 // 99 // An inode describes a single unnamed file. 100 // The inode disk structure holds metadata: the file's type, 101 // its size, the number of links referring to it, and the 102 // list of blocks holding the file's content. 103 // 104 // The inodes are laid out sequentially on disk immediately after 105 // the superblock. Each inode has a number, indicating its 106 // position on the disk. 107 // 108 // The kernel keeps a cache of in-use inodes in memory 109 // to provide a place for synchronizing access 110 // to inodes used by multiple processes. The cached 111 // inodes include book-keeping information that is 112 // not stored on disk: ip->ref and ip->flags. 113 // 114 // An inode and its in-memory represtative go through a 115 // sequence of states before they can be used by the 116 // rest of the file system code. 117 // 118 // * Allocation: an inode is allocated if its type (on disk) 119 // is non-zero. ialloc() allocates, iput() frees if 120 // the link count has fallen to zero. 121 // 122 // * Referencing in cache: an entry in the inode cache 123 // is free if ip->ref is zero. Otherwise ip->ref tracks 124 // the number of in-memory pointers to the entry (open 125 // files and current directories). iget() to find or 126 // create a cache entry and increment its ref, iput() 127 // to decrement ref. 128 // 129 // * Valid: the information (type, size, &c) in an inode 130 // cache entry is only correct when the I_VALID bit 131 // is set in ip->flags. ilock() reads the inode from 132 // the disk and sets I_VALID, while iput() clears 133 // I_VALID if ip->ref has fallen to zero. 134 // 135 // * Locked: file system code may only examine and modify 136 // the information in an inode and its content if it 137 // has first locked the inode. The I_BUSY flag indicates 138 // that the inode is locked. ilock() sets I_BUSY, 139 // while iunlock clears it. 140 // 141 // Thus a typical sequence is: 142 // ip = iget(dev, inum) 143 // ilock(ip) 144 // ... examine and modify ip->xxx ... 145 // iunlock(ip) 146 // iput(ip) 147 // 148 // ilock() is separate from iget() so that system calls can 149 // get a long-term reference to an inode (as for an open file) 150 // and only lock it for short periods (e.g., in read()). 151 // The separation also helps avoid deadlock and races during 152 // pathname lookup. iget() increments ip->ref so that the inode 153 // stays cached and pointers to it remain valid. 154 // 155 // Many internal file system functions expect the caller to 156 // have locked the inodes involved; this lets callers create 157 // multi-step atomic operations. 158 159 struct { 160 struct spinlock lock; 161 struct inode inode[NINODE]; 162 } icache; 163 164 void 165 iinit(void) 166 { 167 initlock(&icache.lock, "icache"); 168 } 169 170 static struct inode* iget(uint dev, uint inum); 171 172 //PAGEBREAK! 173 // Allocate a new inode with the given type on device dev. 174 // A free inode has a type of zero. 175 struct inode* 176 ialloc(uint dev, short type) 177 { 178 int inum; 179 struct buf *bp; 180 struct dinode *dip; 181 struct superblock sb; 182 183 readsb(dev, &sb); 184 185 for(inum = 1; inum < sb.ninodes; inum++){ 186 bp = bread(dev, IBLOCK(inum)); 187 dip = (struct dinode*)bp->data + inum%IPB; 188 if(dip->type == 0){ // a free inode 189 memset(dip, 0, sizeof(*dip)); 190 dip->type = type; 191 log_write(bp); // mark it allocated on the disk 192 brelse(bp); 193 return iget(dev, inum); 194 } 195 brelse(bp); 196 } 197 panic("ialloc: no inodes"); 198 } 199 200 // Copy a modified in-memory inode to disk. 201 void 202 iupdate(struct inode *ip) 203 { 204 struct buf *bp; 205 struct dinode *dip; 206 207 bp = bread(ip->dev, IBLOCK(ip->inum)); 208 dip = (struct dinode*)bp->data + ip->inum%IPB; 209 dip->type = ip->type; 210 dip->major = ip->major; 211 dip->minor = ip->minor; 212 dip->nlink = ip->nlink; 213 dip->size = ip->size; 214 memmove(dip->addrs, ip->addrs, sizeof(ip->addrs)); 215 log_write(bp); 216 brelse(bp); 217 } 218 219 // Find the inode with number inum on device dev 220 // and return the in-memory copy. Does not lock 221 // the inode and does not read it from disk. 222 static struct inode* 223 iget(uint dev, uint inum) 224 { 225 struct inode *ip, *empty; 226 227 acquire(&icache.lock); 228 229 // Is the inode already cached? 230 empty = 0; 231 for(ip = &icache.inode[0]; ip < &icache.inode[NINODE]; ip++){ 232 if(ip->ref > 0 && ip->dev == dev && ip->inum == inum){ 233 ip->ref++; 234 release(&icache.lock); 235 return ip; 236 } 237 if(empty == 0 && ip->ref == 0) // Remember empty slot. 238 empty = ip; 239 } 240 241 // Recycle an inode cache entry. 242 if(empty == 0) 243 panic("iget: no inodes"); 244 245 ip = empty; 246 ip->dev = dev; 247 ip->inum = inum; 248 ip->ref = 1; 249 ip->flags = 0; 250 release(&icache.lock); 251 252 return ip; 253 } 254 255 // Increment reference count for ip. 256 // Returns ip to enable ip = idup(ip1) idiom. 257 struct inode* 258 idup(struct inode *ip) 259 { 260 acquire(&icache.lock); 261 ip->ref++; 262 release(&icache.lock); 263 return ip; 264 } 265 266 // Lock the given inode. 267 // Reads the inode from disk if necessary. 268 void 269 ilock(struct inode *ip) 270 { 271 struct buf *bp; 272 struct dinode *dip; 273 274 if(ip == 0 || ip->ref < 1) 275 panic("ilock"); 276 277 acquire(&icache.lock); 278 while(ip->flags & I_BUSY) 279 sleep(ip, &icache.lock); 280 ip->flags |= I_BUSY; 281 release(&icache.lock); 282 283 if(!(ip->flags & I_VALID)){ 284 bp = bread(ip->dev, IBLOCK(ip->inum)); 285 dip = (struct dinode*)bp->data + ip->inum%IPB; 286 ip->type = dip->type; 287 ip->major = dip->major; 288 ip->minor = dip->minor; 289 ip->nlink = dip->nlink; 290 ip->size = dip->size; 291 memmove(ip->addrs, dip->addrs, sizeof(ip->addrs)); 292 brelse(bp); 293 ip->flags |= I_VALID; 294 if(ip->type == 0) 295 panic("ilock: no type"); 296 } 297 } 298 299 // Unlock the given inode. 300 void 301 iunlock(struct inode *ip) 302 { 303 if(ip == 0 || !(ip->flags & I_BUSY) || ip->ref < 1) 304 panic("iunlock"); 305 306 acquire(&icache.lock); 307 ip->flags &= ~I_BUSY; 308 wakeup(ip); 309 release(&icache.lock); 310 } 311 312 // Drop a reference to an in-memory inode. 313 // If that was the last reference, the inode cache entry can 314 // be recycled. 315 // If that was the last reference and the inode has no links 316 // to it, free the inode (and its content) on disk. 317 // All calls to iput() must be inside a transaction in 318 // case it has to free the inode. 319 void 320 iput(struct inode *ip) 321 { 322 acquire(&icache.lock); 323 if(ip->ref == 1 && (ip->flags & I_VALID) && ip->nlink == 0){ 324 // inode has no links and no other references: truncate and free. 325 if(ip->flags & I_BUSY) 326 panic("iput busy"); 327 ip->flags |= I_BUSY; 328 release(&icache.lock); 329 itrunc(ip); 330 ip->type = 0; 331 iupdate(ip); 332 acquire(&icache.lock); 333 ip->flags = 0; 334 wakeup(ip); 335 } 336 ip->ref--; 337 release(&icache.lock); 338 } 339 340 // Common idiom: unlock, then put. 341 void 342 iunlockput(struct inode *ip) 343 { 344 iunlock(ip); 345 iput(ip); 346 } 347 348 //PAGEBREAK! 349 // Inode content 350 // 351 // The content (data) associated with each inode is stored 352 // in blocks on the disk. The first NDIRECT block numbers 353 // are listed in ip->addrs[]. The next NINDIRECT blocks are 354 // listed in block ip->addrs[NDIRECT]. 355 356 // Return the disk block address of the nth block in inode ip. 357 // If there is no such block, bmap allocates one. 358 static uint 359 bmap(struct inode *ip, uint bn) 360 { 361 uint addr, *a; 362 struct buf *bp; 363 364 if(bn < NDIRECT){ 365 if((addr = ip->addrs[bn]) == 0) 366 ip->addrs[bn] = addr = balloc(ip->dev); 367 return addr; 368 } 369 bn -= NDIRECT; 370 371 if(bn < NINDIRECT){ 372 // Load indirect block, allocating if necessary. 373 if((addr = ip->addrs[NDIRECT]) == 0) 374 ip->addrs[NDIRECT] = addr = balloc(ip->dev); 375 bp = bread(ip->dev, addr); 376 a = (uint*)bp->data; 377 if((addr = a[bn]) == 0){ 378 a[bn] = addr = balloc(ip->dev); 379 log_write(bp); 380 } 381 brelse(bp); 382 return addr; 383 } 384 385 panic("bmap: out of range"); 386 } 387 388 // Truncate inode (discard contents). 389 // Only called when the inode has no links 390 // to it (no directory entries referring to it) 391 // and has no in-memory reference to it (is 392 // not an open file or current directory). 393 static void 394 itrunc(struct inode *ip) 395 { 396 int i, j; 397 struct buf *bp; 398 uint *a; 399 400 for(i = 0; i < NDIRECT; i++){ 401 if(ip->addrs[i]){ 402 bfree(ip->dev, ip->addrs[i]); 403 ip->addrs[i] = 0; 404 } 405 } 406 407 if(ip->addrs[NDIRECT]){ 408 bp = bread(ip->dev, ip->addrs[NDIRECT]); 409 a = (uint*)bp->data; 410 for(j = 0; j < NINDIRECT; j++){ 411 if(a[j]) 412 bfree(ip->dev, a[j]); 413 } 414 brelse(bp); 415 bfree(ip->dev, ip->addrs[NDIRECT]); 416 ip->addrs[NDIRECT] = 0; 417 } 418 419 ip->size = 0; 420 iupdate(ip); 421 } 422 423 // Copy stat information from inode. 424 void 425 stati(struct inode *ip, struct stat *st) 426 { 427 st->dev = ip->dev; 428 st->ino = ip->inum; 429 st->type = ip->type; 430 st->nlink = ip->nlink; 431 st->size = ip->size; 432 } 433 434 //PAGEBREAK! 435 // Read data from inode. 436 int 437 readi(struct inode *ip, char *dst, uint off, uint n) 438 { 439 uint tot, m; 440 struct buf *bp; 441 442 if(ip->type == T_DEV){ 443 if(ip->major < 0 || ip->major >= NDEV || !devsw[ip->major].read) 444 return -1; 445 return devsw[ip->major].read(ip, dst, n); 446 } 447 448 if(off > ip->size || off + n < off) 449 return -1; 450 if(off + n > ip->size) 451 n = ip->size - off; 452 453 for(tot=0; tot<n; tot+=m, off+=m, dst+=m){ 454 bp = bread(ip->dev, bmap(ip, off/BSIZE)); 455 m = min(n - tot, BSIZE - off%BSIZE); 456 memmove(dst, bp->data + off%BSIZE, m); 457 brelse(bp); 458 } 459 return n; 460 } 461 462 // PAGEBREAK! 463 // Write data to inode. 464 int 465 writei(struct inode *ip, char *src, uint off, uint n) 466 { 467 uint tot, m; 468 struct buf *bp; 469 470 if(ip->type == T_DEV){ 471 if(ip->major < 0 || ip->major >= NDEV || !devsw[ip->major].write) 472 return -1; 473 return devsw[ip->major].write(ip, src, n); 474 } 475 476 if(off > ip->size || off + n < off) 477 return -1; 478 if(off + n > MAXFILE*BSIZE) 479 return -1; 480 481 for(tot=0; tot<n; tot+=m, off+=m, src+=m){ 482 bp = bread(ip->dev, bmap(ip, off/BSIZE)); 483 m = min(n - tot, BSIZE - off%BSIZE); 484 memmove(bp->data + off%BSIZE, src, m); 485 log_write(bp); 486 brelse(bp); 487 } 488 489 if(n > 0 && off > ip->size){ 490 ip->size = off; 491 iupdate(ip); 492 } 493 return n; 494 } 495 496 //PAGEBREAK! 497 // Directories 498 499 int 500 namecmp(const char *s, const char *t) 501 { 502 return strncmp(s, t, DIRSIZ); 503 } 504 505 // Look for a directory entry in a directory. 506 // If found, set *poff to byte offset of entry. 507 struct inode* 508 dirlookup(struct inode *dp, char *name, uint *poff) 509 { 510 uint off, inum; 511 struct dirent de; 512 513 if(dp->type != T_DIR) 514 panic("dirlookup not DIR"); 515 516 for(off = 0; off < dp->size; off += sizeof(de)){ 517 if(readi(dp, (char*)&de, off, sizeof(de)) != sizeof(de)) 518 panic("dirlink read"); 519 if(de.inum == 0) 520 continue; 521 if(namecmp(name, de.name) == 0){ 522 // entry matches path element 523 if(poff) 524 *poff = off; 525 inum = de.inum; 526 return iget(dp->dev, inum); 527 } 528 } 529 530 return 0; 531 } 532 533 // Write a new directory entry (name, inum) into the directory dp. 534 int 535 dirlink(struct inode *dp, char *name, uint inum) 536 { 537 int off; 538 struct dirent de; 539 struct inode *ip; 540 541 // Check that name is not present. 542 if((ip = dirlookup(dp, name, 0)) != 0){ 543 iput(ip); 544 return -1; 545 } 546 547 // Look for an empty dirent. 548 for(off = 0; off < dp->size; off += sizeof(de)){ 549 if(readi(dp, (char*)&de, off, sizeof(de)) != sizeof(de)) 550 panic("dirlink read"); 551 if(de.inum == 0) 552 break; 553 } 554 555 strncpy(de.name, name, DIRSIZ); 556 de.inum = inum; 557 if(writei(dp, (char*)&de, off, sizeof(de)) != sizeof(de)) 558 panic("dirlink"); 559 560 return 0; 561 } 562 563 //PAGEBREAK! 564 // Paths 565 566 // Copy the next path element from path into name. 567 // Return a pointer to the element following the copied one. 568 // The returned path has no leading slashes, 569 // so the caller can check *path=='\0' to see if the name is the last one. 570 // If no name to remove, return 0. 571 // 572 // Examples: 573 // skipelem("a/bb/c", name) = "bb/c", setting name = "a" 574 // skipelem("///a//bb", name) = "bb", setting name = "a" 575 // skipelem("a", name) = "", setting name = "a" 576 // skipelem("", name) = skipelem("////", name) = 0 577 // 578 static char* 579 skipelem(char *path, char *name) 580 { 581 char *s; 582 int len; 583 584 while(*path == '/') 585 path++; 586 if(*path == 0) 587 return 0; 588 s = path; 589 while(*path != '/' && *path != 0) 590 path++; 591 len = path - s; 592 if(len >= DIRSIZ) 593 memmove(name, s, DIRSIZ); 594 else { 595 memmove(name, s, len); 596 name[len] = 0; 597 } 598 while(*path == '/') 599 path++; 600 return path; 601 } 602 603 // Look up and return the inode for a path name. 604 // If parent != 0, return the inode for the parent and copy the final 605 // path element into name, which must have room for DIRSIZ bytes. 606 // Must be called inside a transaction since it calls iput(). 607 static struct inode* 608 namex(char *path, int nameiparent, char *name) 609 { 610 struct inode *ip, *next; 611 612 if(*path == '/') 613 ip = iget(ROOTDEV, ROOTINO); 614 else 615 ip = idup(proc->cwd); 616 617 while((path = skipelem(path, name)) != 0){ 618 ilock(ip); 619 if(ip->type != T_DIR){ 620 iunlockput(ip); 621 return 0; 622 } 623 if(nameiparent && *path == '\0'){ 624 // Stop one level early. 625 iunlock(ip); 626 return ip; 627 } 628 if((next = dirlookup(ip, name, 0)) == 0){ 629 iunlockput(ip); 630 return 0; 631 } 632 iunlockput(ip); 633 ip = next; 634 } 635 if(nameiparent){ 636 iput(ip); 637 return 0; 638 } 639 return ip; 640 } 641 642 struct inode* 643 namei(char *path) 644 { 645 char name[DIRSIZ]; 646 return namex(path, 0, name); 647 } 648 649 struct inode* 650 nameiparent(char *path, char *name) 651 { 652 return namex(path, 1, name); 653 } 654