1 // File system implementation. Five layers: 2 // + Blocks: allocator for raw disk blocks. 3 // + Log: crash recovery for multi-step updates. 4 // + Files: inode allocator, reading, writing, metadata. 5 // + Directories: inode with special contents (list of other inodes!) 6 // + Names: paths like /usr/rtm/xv6/fs.c for convenient naming. 7 // 8 // This file contains the low-level file system manipulation 9 // routines. The (higher-level) system call implementations 10 // are in sysfile.c. 11 12 #include "types.h" 13 #include "defs.h" 14 #include "param.h" 15 #include "stat.h" 16 #include "mmu.h" 17 #include "proc.h" 18 #include "spinlock.h" 19 #include "sleeplock.h" 20 #include "fs.h" 21 #include "buf.h" 22 #include "file.h" 23 24 #define min(a, b) ((a) < (b) ? (a) : (b)) 25 static void itrunc(struct inode*); 26 // there should be one superblock per disk device, but we run with 27 // only one device 28 struct superblock sb; 29 30 // Read the super block. 31 void 32 readsb(int dev, struct superblock *sb) 33 { 34 struct buf *bp; 35 36 bp = bread(dev, 1); 37 memmove(sb, bp->data, sizeof(*sb)); 38 brelse(bp); 39 } 40 41 // Zero a block. 42 static void 43 bzero(int dev, int bno) 44 { 45 struct buf *bp; 46 47 bp = bread(dev, bno); 48 memset(bp->data, 0, BSIZE); 49 log_write(bp); 50 brelse(bp); 51 } 52 53 // Blocks. 54 55 // Allocate a zeroed disk block. 56 static uint 57 balloc(uint dev) 58 { 59 int b, bi, m; 60 struct buf *bp; 61 62 bp = 0; 63 for(b = 0; b < sb.size; b += BPB){ 64 bp = bread(dev, BBLOCK(b, sb)); 65 for(bi = 0; bi < BPB && b + bi < sb.size; bi++){ 66 m = 1 << (bi % 8); 67 if((bp->data[bi/8] & m) == 0){ // Is block free? 68 bp->data[bi/8] |= m; // Mark block in use. 69 log_write(bp); 70 brelse(bp); 71 bzero(dev, b + bi); 72 return b + bi; 73 } 74 } 75 brelse(bp); 76 } 77 panic("balloc: out of blocks"); 78 } 79 80 // Free a disk block. 81 static void 82 bfree(int dev, uint b) 83 { 84 struct buf *bp; 85 int bi, m; 86 87 readsb(dev, &sb); 88 bp = bread(dev, BBLOCK(b, sb)); 89 bi = b % BPB; 90 m = 1 << (bi % 8); 91 if((bp->data[bi/8] & m) == 0) 92 panic("freeing free block"); 93 bp->data[bi/8] &= ~m; 94 log_write(bp); 95 brelse(bp); 96 } 97 98 // Inodes. 99 // 100 // An inode describes a single unnamed file. 101 // The inode disk structure holds metadata: the file's type, 102 // its size, the number of links referring to it, and the 103 // list of blocks holding the file's content. 104 // 105 // The inodes are laid out sequentially on disk at 106 // sb.startinode. Each inode has a number, indicating its 107 // position on the disk. 108 // 109 // The kernel keeps a cache of in-use inodes in memory 110 // to provide a place for synchronizing access 111 // to inodes used by multiple processes. The cached 112 // inodes include book-keeping information that is 113 // not stored on disk: ip->ref and ip->flags. 114 // 115 // An inode and its in-memory represtative go through a 116 // sequence of states before they can be used by the 117 // rest of the file system code. 118 // 119 // * Allocation: an inode is allocated if its type (on disk) 120 // is non-zero. ialloc() allocates, iput() frees if 121 // the link count has fallen to zero. 122 // 123 // * Referencing in cache: an entry in the inode cache 124 // is free if ip->ref is zero. Otherwise ip->ref tracks 125 // the number of in-memory pointers to the entry (open 126 // files and current directories). iget() to find or 127 // create a cache entry and increment its ref, iput() 128 // to decrement ref. 129 // 130 // * Valid: the information (type, size, &c) in an inode 131 // cache entry is only correct when the I_VALID bit 132 // is set in ip->flags. ilock() reads the inode from 133 // the disk and sets I_VALID, while iput() clears 134 // I_VALID if ip->ref has fallen to zero. 135 // 136 // * Locked: file system code may only examine and modify 137 // the information in an inode and its content if it 138 // has first locked the inode. The I_BUSY flag indicates 139 // that the inode is locked. ilock() sets I_BUSY, 140 // while iunlock clears it. 141 // 142 // Thus a typical sequence is: 143 // ip = iget(dev, inum) 144 // ilock(ip) 145 // ... examine and modify ip->xxx ... 146 // iunlock(ip) 147 // iput(ip) 148 // 149 // ilock() is separate from iget() so that system calls can 150 // get a long-term reference to an inode (as for an open file) 151 // and only lock it for short periods (e.g., in read()). 152 // The separation also helps avoid deadlock and races during 153 // pathname lookup. iget() increments ip->ref so that the inode 154 // stays cached and pointers to it remain valid. 155 // 156 // Many internal file system functions expect the caller to 157 // have locked the inodes involved; this lets callers create 158 // multi-step atomic operations. 159 160 struct { 161 struct spinlock lock; 162 struct inode inode[NINODE]; 163 } icache; 164 165 void 166 iinit(int dev) 167 { 168 initlock(&icache.lock, "icache"); 169 readsb(dev, &sb); 170 cprintf("sb: size %d nblocks %d ninodes %d nlog %d logstart %d\ 171 inodestart %d bmap start %d\n", sb.size, sb.nblocks, 172 sb.ninodes, sb.nlog, sb.logstart, sb.inodestart, 173 sb.bmapstart); 174 } 175 176 static struct inode* iget(uint dev, uint inum); 177 178 //PAGEBREAK! 179 // Allocate a new inode with the given type on device dev. 180 // A free inode has a type of zero. 181 struct inode* 182 ialloc(uint dev, short type) 183 { 184 int inum; 185 struct buf *bp; 186 struct dinode *dip; 187 188 for(inum = 1; inum < sb.ninodes; inum++){ 189 bp = bread(dev, IBLOCK(inum, sb)); 190 dip = (struct dinode*)bp->data + inum%IPB; 191 if(dip->type == 0){ // a free inode 192 memset(dip, 0, sizeof(*dip)); 193 dip->type = type; 194 log_write(bp); // mark it allocated on the disk 195 brelse(bp); 196 return iget(dev, inum); 197 } 198 brelse(bp); 199 } 200 panic("ialloc: no inodes"); 201 } 202 203 // Copy a modified in-memory inode to disk. 204 void 205 iupdate(struct inode *ip) 206 { 207 struct buf *bp; 208 struct dinode *dip; 209 210 bp = bread(ip->dev, IBLOCK(ip->inum, sb)); 211 dip = (struct dinode*)bp->data + ip->inum%IPB; 212 dip->type = ip->type; 213 dip->major = ip->major; 214 dip->minor = ip->minor; 215 dip->nlink = ip->nlink; 216 dip->size = ip->size; 217 memmove(dip->addrs, ip->addrs, sizeof(ip->addrs)); 218 log_write(bp); 219 brelse(bp); 220 } 221 222 // Find the inode with number inum on device dev 223 // and return the in-memory copy. Does not lock 224 // the inode and does not read it from disk. 225 static struct inode* 226 iget(uint dev, uint inum) 227 { 228 struct inode *ip, *empty; 229 230 acquire(&icache.lock); 231 232 // Is the inode already cached? 233 empty = 0; 234 for(ip = &icache.inode[0]; ip < &icache.inode[NINODE]; ip++){ 235 if(ip->ref > 0 && ip->dev == dev && ip->inum == inum){ 236 ip->ref++; 237 release(&icache.lock); 238 return ip; 239 } 240 if(empty == 0 && ip->ref == 0) // Remember empty slot. 241 empty = ip; 242 } 243 244 // Recycle an inode cache entry. 245 if(empty == 0) 246 panic("iget: no inodes"); 247 248 ip = empty; 249 ip->dev = dev; 250 ip->inum = inum; 251 ip->ref = 1; 252 ip->flags = 0; 253 release(&icache.lock); 254 255 return ip; 256 } 257 258 // Increment reference count for ip. 259 // Returns ip to enable ip = idup(ip1) idiom. 260 struct inode* 261 idup(struct inode *ip) 262 { 263 acquire(&icache.lock); 264 ip->ref++; 265 release(&icache.lock); 266 return ip; 267 } 268 269 // Lock the given inode. 270 // Reads the inode from disk if necessary. 271 void 272 ilock(struct inode *ip) 273 { 274 struct buf *bp; 275 struct dinode *dip; 276 277 if(ip == 0 || ip->ref < 1) 278 panic("ilock"); 279 280 acquire(&icache.lock); 281 while(ip->flags & I_BUSY) 282 sleep(ip, &icache.lock); 283 ip->flags |= I_BUSY; 284 release(&icache.lock); 285 286 if(!(ip->flags & I_VALID)){ 287 bp = bread(ip->dev, IBLOCK(ip->inum, sb)); 288 dip = (struct dinode*)bp->data + ip->inum%IPB; 289 ip->type = dip->type; 290 ip->major = dip->major; 291 ip->minor = dip->minor; 292 ip->nlink = dip->nlink; 293 ip->size = dip->size; 294 memmove(ip->addrs, dip->addrs, sizeof(ip->addrs)); 295 brelse(bp); 296 ip->flags |= I_VALID; 297 if(ip->type == 0) 298 panic("ilock: no type"); 299 } 300 } 301 302 // Unlock the given inode. 303 void 304 iunlock(struct inode *ip) 305 { 306 if(ip == 0 || !(ip->flags & I_BUSY) || ip->ref < 1) 307 panic("iunlock"); 308 309 acquire(&icache.lock); 310 ip->flags &= ~I_BUSY; 311 wakeup(ip); 312 release(&icache.lock); 313 } 314 315 // Drop a reference to an in-memory inode. 316 // If that was the last reference, the inode cache entry can 317 // be recycled. 318 // If that was the last reference and the inode has no links 319 // to it, free the inode (and its content) on disk. 320 // All calls to iput() must be inside a transaction in 321 // case it has to free the inode. 322 void 323 iput(struct inode *ip) 324 { 325 acquire(&icache.lock); 326 if(ip->ref == 1 && (ip->flags & I_VALID) && ip->nlink == 0){ 327 // inode has no links and no other references: truncate and free. 328 if(ip->flags & I_BUSY) 329 panic("iput busy"); 330 ip->flags |= I_BUSY; 331 release(&icache.lock); 332 itrunc(ip); 333 ip->type = 0; 334 iupdate(ip); 335 acquire(&icache.lock); 336 ip->flags = 0; 337 wakeup(ip); 338 } 339 ip->ref--; 340 release(&icache.lock); 341 } 342 343 // Common idiom: unlock, then put. 344 void 345 iunlockput(struct inode *ip) 346 { 347 iunlock(ip); 348 iput(ip); 349 } 350 351 //PAGEBREAK! 352 // Inode content 353 // 354 // The content (data) associated with each inode is stored 355 // in blocks on the disk. The first NDIRECT block numbers 356 // are listed in ip->addrs[]. The next NINDIRECT blocks are 357 // listed in block ip->addrs[NDIRECT]. 358 359 // Return the disk block address of the nth block in inode ip. 360 // If there is no such block, bmap allocates one. 361 static uint 362 bmap(struct inode *ip, uint bn) 363 { 364 uint addr, *a; 365 struct buf *bp; 366 367 if(bn < NDIRECT){ 368 if((addr = ip->addrs[bn]) == 0) 369 ip->addrs[bn] = addr = balloc(ip->dev); 370 return addr; 371 } 372 bn -= NDIRECT; 373 374 if(bn < NINDIRECT){ 375 // Load indirect block, allocating if necessary. 376 if((addr = ip->addrs[NDIRECT]) == 0) 377 ip->addrs[NDIRECT] = addr = balloc(ip->dev); 378 bp = bread(ip->dev, addr); 379 a = (uint*)bp->data; 380 if((addr = a[bn]) == 0){ 381 a[bn] = addr = balloc(ip->dev); 382 log_write(bp); 383 } 384 brelse(bp); 385 return addr; 386 } 387 388 panic("bmap: out of range"); 389 } 390 391 // Truncate inode (discard contents). 392 // Only called when the inode has no links 393 // to it (no directory entries referring to it) 394 // and has no in-memory reference to it (is 395 // not an open file or current directory). 396 static void 397 itrunc(struct inode *ip) 398 { 399 int i, j; 400 struct buf *bp; 401 uint *a; 402 403 for(i = 0; i < NDIRECT; i++){ 404 if(ip->addrs[i]){ 405 bfree(ip->dev, ip->addrs[i]); 406 ip->addrs[i] = 0; 407 } 408 } 409 410 if(ip->addrs[NDIRECT]){ 411 bp = bread(ip->dev, ip->addrs[NDIRECT]); 412 a = (uint*)bp->data; 413 for(j = 0; j < NINDIRECT; j++){ 414 if(a[j]) 415 bfree(ip->dev, a[j]); 416 } 417 brelse(bp); 418 bfree(ip->dev, ip->addrs[NDIRECT]); 419 ip->addrs[NDIRECT] = 0; 420 } 421 422 ip->size = 0; 423 iupdate(ip); 424 } 425 426 // Copy stat information from inode. 427 void 428 stati(struct inode *ip, struct stat *st) 429 { 430 st->dev = ip->dev; 431 st->ino = ip->inum; 432 st->type = ip->type; 433 st->nlink = ip->nlink; 434 st->size = ip->size; 435 } 436 437 //PAGEBREAK! 438 // Read data from inode. 439 int 440 readi(struct inode *ip, char *dst, uint off, uint n) 441 { 442 uint tot, m; 443 struct buf *bp; 444 445 if(ip->type == T_DEV){ 446 if(ip->major < 0 || ip->major >= NDEV || !devsw[ip->major].read) 447 return -1; 448 return devsw[ip->major].read(ip, dst, n); 449 } 450 451 if(off > ip->size || off + n < off) 452 return -1; 453 if(off + n > ip->size) 454 n = ip->size - off; 455 456 for(tot=0; tot<n; tot+=m, off+=m, dst+=m){ 457 bp = bread(ip->dev, bmap(ip, off/BSIZE)); 458 m = min(n - tot, BSIZE - off%BSIZE); 459 /* 460 cprintf("data off %d:\n", off); 461 for (int j = 0; j < min(m, 10); j++) { 462 cprintf("%x ", bp->data[off%BSIZE+j]); 463 } 464 cprintf("\n"); 465 */ 466 memmove(dst, bp->data + off%BSIZE, m); 467 brelse(bp); 468 } 469 return n; 470 } 471 472 // PAGEBREAK! 473 // Write data to inode. 474 int 475 writei(struct inode *ip, char *src, uint off, uint n) 476 { 477 uint tot, m; 478 struct buf *bp; 479 480 if(ip->type == T_DEV){ 481 if(ip->major < 0 || ip->major >= NDEV || !devsw[ip->major].write) 482 return -1; 483 return devsw[ip->major].write(ip, src, n); 484 } 485 486 if(off > ip->size || off + n < off) 487 return -1; 488 if(off + n > MAXFILE*BSIZE) 489 return -1; 490 491 for(tot=0; tot<n; tot+=m, off+=m, src+=m){ 492 bp = bread(ip->dev, bmap(ip, off/BSIZE)); 493 m = min(n - tot, BSIZE - off%BSIZE); 494 memmove(bp->data + off%BSIZE, src, m); 495 log_write(bp); 496 brelse(bp); 497 } 498 499 if(n > 0 && off > ip->size){ 500 ip->size = off; 501 iupdate(ip); 502 } 503 return n; 504 } 505 506 //PAGEBREAK! 507 // Directories 508 509 int 510 namecmp(const char *s, const char *t) 511 { 512 return strncmp(s, t, DIRSIZ); 513 } 514 515 // Look for a directory entry in a directory. 516 // If found, set *poff to byte offset of entry. 517 struct inode* 518 dirlookup(struct inode *dp, char *name, uint *poff) 519 { 520 uint off, inum; 521 struct dirent de; 522 523 if(dp->type != T_DIR) 524 panic("dirlookup not DIR"); 525 526 for(off = 0; off < dp->size; off += sizeof(de)){ 527 if(readi(dp, (char*)&de, off, sizeof(de)) != sizeof(de)) 528 panic("dirlink read"); 529 if(de.inum == 0) 530 continue; 531 if(namecmp(name, de.name) == 0){ 532 // entry matches path element 533 if(poff) 534 *poff = off; 535 inum = de.inum; 536 return iget(dp->dev, inum); 537 } 538 } 539 540 return 0; 541 } 542 543 // Write a new directory entry (name, inum) into the directory dp. 544 int 545 dirlink(struct inode *dp, char *name, uint inum) 546 { 547 int off; 548 struct dirent de; 549 struct inode *ip; 550 551 // Check that name is not present. 552 if((ip = dirlookup(dp, name, 0)) != 0){ 553 iput(ip); 554 return -1; 555 } 556 557 // Look for an empty dirent. 558 for(off = 0; off < dp->size; off += sizeof(de)){ 559 if(readi(dp, (char*)&de, off, sizeof(de)) != sizeof(de)) 560 panic("dirlink read"); 561 if(de.inum == 0) 562 break; 563 } 564 565 strncpy(de.name, name, DIRSIZ); 566 de.inum = inum; 567 if(writei(dp, (char*)&de, off, sizeof(de)) != sizeof(de)) 568 panic("dirlink"); 569 570 return 0; 571 } 572 573 //PAGEBREAK! 574 // Paths 575 576 // Copy the next path element from path into name. 577 // Return a pointer to the element following the copied one. 578 // The returned path has no leading slashes, 579 // so the caller can check *path=='\0' to see if the name is the last one. 580 // If no name to remove, return 0. 581 // 582 // Examples: 583 // skipelem("a/bb/c", name) = "bb/c", setting name = "a" 584 // skipelem("///a//bb", name) = "bb", setting name = "a" 585 // skipelem("a", name) = "", setting name = "a" 586 // skipelem("", name) = skipelem("////", name) = 0 587 // 588 static char* 589 skipelem(char *path, char *name) 590 { 591 char *s; 592 int len; 593 594 while(*path == '/') 595 path++; 596 if(*path == 0) 597 return 0; 598 s = path; 599 while(*path != '/' && *path != 0) 600 path++; 601 len = path - s; 602 if(len >= DIRSIZ) 603 memmove(name, s, DIRSIZ); 604 else { 605 memmove(name, s, len); 606 name[len] = 0; 607 } 608 while(*path == '/') 609 path++; 610 return path; 611 } 612 613 // Look up and return the inode for a path name. 614 // If parent != 0, return the inode for the parent and copy the final 615 // path element into name, which must have room for DIRSIZ bytes. 616 // Must be called inside a transaction since it calls iput(). 617 static struct inode* 618 namex(char *path, int nameiparent, char *name) 619 { 620 struct inode *ip, *next; 621 622 if(*path == '/') 623 ip = iget(ROOTDEV, ROOTINO); 624 else 625 ip = idup(proc->cwd); 626 627 while((path = skipelem(path, name)) != 0){ 628 ilock(ip); 629 if(ip->type != T_DIR){ 630 iunlockput(ip); 631 return 0; 632 } 633 if(nameiparent && *path == '\0'){ 634 // Stop one level early. 635 iunlock(ip); 636 return ip; 637 } 638 if((next = dirlookup(ip, name, 0)) == 0){ 639 iunlockput(ip); 640 return 0; 641 } 642 iunlockput(ip); 643 ip = next; 644 } 645 if(nameiparent){ 646 iput(ip); 647 return 0; 648 } 649 return ip; 650 } 651 652 struct inode* 653 namei(char *path) 654 { 655 char name[DIRSIZ]; 656 return namex(path, 0, name); 657 } 658 659 struct inode* 660 nameiparent(char *path, char *name) 661 { 662 return namex(path, 1, name); 663 } 664