1 // File system implementation. Five layers: 2 // + Blocks: allocator for raw disk blocks. 3 // + Log: crash recovery for multi-step updates. 4 // + Files: inode allocator, reading, writing, metadata. 5 // + Directories: inode with special contents (list of other inodes!) 6 // + Names: paths like /usr/rtm/xv6/fs.c for convenient naming. 7 // 8 // This file contains the low-level file system manipulation 9 // routines. The (higher-level) system call implementations 10 // are in sysfile.c. 11 12 #include "types.h" 13 #include "defs.h" 14 #include "param.h" 15 #include "stat.h" 16 #include "mmu.h" 17 #include "proc.h" 18 #include "spinlock.h" 19 #include "sleeplock.h" 20 #include "fs.h" 21 #include "buf.h" 22 #include "file.h" 23 24 #define min(a, b) ((a) < (b) ? (a) : (b)) 25 static void itrunc(struct inode*); 26 // there should be one superblock per disk device, but we run with 27 // only one device 28 struct superblock sb; 29 30 // Read the super block. 31 void 32 readsb(int dev, struct superblock *sb) 33 { 34 struct buf *bp; 35 36 bp = bread(dev, 1); 37 memmove(sb, bp->data, sizeof(*sb)); 38 brelse(bp); 39 } 40 41 // Zero a block. 42 static void 43 bzero(int dev, int bno) 44 { 45 struct buf *bp; 46 47 bp = bread(dev, bno); 48 memset(bp->data, 0, BSIZE); 49 log_write(bp); 50 brelse(bp); 51 } 52 53 // Blocks. 54 55 // Allocate a zeroed disk block. 56 static uint 57 balloc(uint dev) 58 { 59 int b, bi, m; 60 struct buf *bp; 61 62 bp = 0; 63 for(b = 0; b < sb.size; b += BPB){ 64 bp = bread(dev, BBLOCK(b, sb)); 65 for(bi = 0; bi < BPB && b + bi < sb.size; bi++){ 66 m = 1 << (bi % 8); 67 if((bp->data[bi/8] & m) == 0){ // Is block free? 68 bp->data[bi/8] |= m; // Mark block in use. 69 log_write(bp); 70 brelse(bp); 71 bzero(dev, b + bi); 72 return b + bi; 73 } 74 } 75 brelse(bp); 76 } 77 panic("balloc: out of blocks"); 78 } 79 80 // Free a disk block. 81 static void 82 bfree(int dev, uint b) 83 { 84 struct buf *bp; 85 int bi, m; 86 87 readsb(dev, &sb); 88 bp = bread(dev, BBLOCK(b, sb)); 89 bi = b % BPB; 90 m = 1 << (bi % 8); 91 if((bp->data[bi/8] & m) == 0) 92 panic("freeing free block"); 93 bp->data[bi/8] &= ~m; 94 log_write(bp); 95 brelse(bp); 96 } 97 98 // Inodes. 99 // 100 // An inode describes a single unnamed file. 101 // The inode disk structure holds metadata: the file's type, 102 // its size, the number of links referring to it, and the 103 // list of blocks holding the file's content. 104 // 105 // The inodes are laid out sequentially on disk at 106 // sb.startinode. Each inode has a number, indicating its 107 // position on the disk. 108 // 109 // The kernel keeps a cache of in-use inodes in memory 110 // to provide a place for synchronizing access 111 // to inodes used by multiple processes. The cached 112 // inodes include book-keeping information that is 113 // not stored on disk: ip->ref and ip->flags. 114 // 115 // An inode and its in-memory represtative go through a 116 // sequence of states before they can be used by the 117 // rest of the file system code. 118 // 119 // * Allocation: an inode is allocated if its type (on disk) 120 // is non-zero. ialloc() allocates, iput() frees if 121 // the link count has fallen to zero. 122 // 123 // * Referencing in cache: an entry in the inode cache 124 // is free if ip->ref is zero. Otherwise ip->ref tracks 125 // the number of in-memory pointers to the entry (open 126 // files and current directories). iget() to find or 127 // create a cache entry and increment its ref, iput() 128 // to decrement ref. 129 // 130 // * Valid: the information (type, size, &c) in an inode 131 // cache entry is only correct when the I_VALID bit 132 // is set in ip->flags. ilock() reads the inode from 133 // the disk and sets I_VALID, while iput() clears 134 // I_VALID if ip->ref has fallen to zero. 135 // 136 // * Locked: file system code may only examine and modify 137 // the information in an inode and its content if it 138 // has first locked the inode. 139 // 140 // Thus a typical sequence is: 141 // ip = iget(dev, inum) 142 // ilock(ip) 143 // ... examine and modify ip->xxx ... 144 // iunlock(ip) 145 // iput(ip) 146 // 147 // ilock() is separate from iget() so that system calls can 148 // get a long-term reference to an inode (as for an open file) 149 // and only lock it for short periods (e.g., in read()). 150 // The separation also helps avoid deadlock and races during 151 // pathname lookup. iget() increments ip->ref so that the inode 152 // stays cached and pointers to it remain valid. 153 // 154 // Many internal file system functions expect the caller to 155 // have locked the inodes involved; this lets callers create 156 // multi-step atomic operations. 157 158 struct { 159 struct spinlock lock; 160 struct inode inode[NINODE]; 161 } icache; 162 163 void 164 iinit(int dev) 165 { 166 int i = 0; 167 168 initlock(&icache.lock, "icache"); 169 for(i = 0; i < NINODE; i++) { 170 initsleeplock(&icache.inode[i].lock, "inode"); 171 } 172 173 readsb(dev, &sb); 174 cprintf("sb: size %d nblocks %d ninodes %d nlog %d logstart %d\ 175 inodestart %d bmap start %d\n", sb.size, sb.nblocks, 176 sb.ninodes, sb.nlog, sb.logstart, sb.inodestart, 177 sb.bmapstart); 178 } 179 180 static struct inode* iget(uint dev, uint inum); 181 182 //PAGEBREAK! 183 // Allocate a new inode with the given type on device dev. 184 // A free inode has a type of zero. 185 struct inode* 186 ialloc(uint dev, short type) 187 { 188 int inum; 189 struct buf *bp; 190 struct dinode *dip; 191 192 for(inum = 1; inum < sb.ninodes; inum++){ 193 bp = bread(dev, IBLOCK(inum, sb)); 194 dip = (struct dinode*)bp->data + inum%IPB; 195 if(dip->type == 0){ // a free inode 196 memset(dip, 0, sizeof(*dip)); 197 dip->type = type; 198 log_write(bp); // mark it allocated on the disk 199 brelse(bp); 200 return iget(dev, inum); 201 } 202 brelse(bp); 203 } 204 panic("ialloc: no inodes"); 205 } 206 207 // Copy a modified in-memory inode to disk. 208 void 209 iupdate(struct inode *ip) 210 { 211 struct buf *bp; 212 struct dinode *dip; 213 214 bp = bread(ip->dev, IBLOCK(ip->inum, sb)); 215 dip = (struct dinode*)bp->data + ip->inum%IPB; 216 dip->type = ip->type; 217 dip->major = ip->major; 218 dip->minor = ip->minor; 219 dip->nlink = ip->nlink; 220 dip->size = ip->size; 221 memmove(dip->addrs, ip->addrs, sizeof(ip->addrs)); 222 log_write(bp); 223 brelse(bp); 224 } 225 226 // Find the inode with number inum on device dev 227 // and return the in-memory copy. Does not lock 228 // the inode and does not read it from disk. 229 static struct inode* 230 iget(uint dev, uint inum) 231 { 232 struct inode *ip, *empty; 233 234 acquire(&icache.lock); 235 236 // Is the inode already cached? 237 empty = 0; 238 for(ip = &icache.inode[0]; ip < &icache.inode[NINODE]; ip++){ 239 if(ip->ref > 0 && ip->dev == dev && ip->inum == inum){ 240 ip->ref++; 241 release(&icache.lock); 242 return ip; 243 } 244 if(empty == 0 && ip->ref == 0) // Remember empty slot. 245 empty = ip; 246 } 247 248 // Recycle an inode cache entry. 249 if(empty == 0) 250 panic("iget: no inodes"); 251 252 ip = empty; 253 ip->dev = dev; 254 ip->inum = inum; 255 ip->ref = 1; 256 ip->flags = 0; 257 release(&icache.lock); 258 259 return ip; 260 } 261 262 // Increment reference count for ip. 263 // Returns ip to enable ip = idup(ip1) idiom. 264 struct inode* 265 idup(struct inode *ip) 266 { 267 acquire(&icache.lock); 268 ip->ref++; 269 release(&icache.lock); 270 return ip; 271 } 272 273 // Lock the given inode. 274 // Reads the inode from disk if necessary. 275 void 276 ilock(struct inode *ip) 277 { 278 struct buf *bp; 279 struct dinode *dip; 280 281 if(ip == 0 || ip->ref < 1) 282 panic("ilock"); 283 284 acquiresleep(&ip->lock); 285 286 if(!(ip->flags & I_VALID)){ 287 bp = bread(ip->dev, IBLOCK(ip->inum, sb)); 288 dip = (struct dinode*)bp->data + ip->inum%IPB; 289 ip->type = dip->type; 290 ip->major = dip->major; 291 ip->minor = dip->minor; 292 ip->nlink = dip->nlink; 293 ip->size = dip->size; 294 memmove(ip->addrs, dip->addrs, sizeof(ip->addrs)); 295 brelse(bp); 296 ip->flags |= I_VALID; 297 if(ip->type == 0) 298 panic("ilock: no type"); 299 } 300 } 301 302 // Unlock the given inode. 303 void 304 iunlock(struct inode *ip) 305 { 306 if(ip == 0 || !holdingsleep(&ip->lock) || ip->ref < 1) 307 panic("iunlock"); 308 309 releasesleep(&ip->lock); 310 } 311 312 // Drop a reference to an in-memory inode. 313 // If that was the last reference, the inode cache entry can 314 // be recycled. 315 // If that was the last reference and the inode has no links 316 // to it, free the inode (and its content) on disk. 317 // All calls to iput() must be inside a transaction in 318 // case it has to free the inode. 319 void 320 iput(struct inode *ip) 321 { 322 acquire(&icache.lock); 323 if(ip->ref == 1 && (ip->flags & I_VALID) && ip->nlink == 0){ 324 // inode has no links and no other references: truncate and free. 325 release(&icache.lock); 326 itrunc(ip); 327 ip->type = 0; 328 iupdate(ip); 329 acquire(&icache.lock); 330 ip->flags = 0; 331 } 332 ip->ref--; 333 release(&icache.lock); 334 } 335 336 // Common idiom: unlock, then put. 337 void 338 iunlockput(struct inode *ip) 339 { 340 iunlock(ip); 341 iput(ip); 342 } 343 344 //PAGEBREAK! 345 // Inode content 346 // 347 // The content (data) associated with each inode is stored 348 // in blocks on the disk. The first NDIRECT block numbers 349 // are listed in ip->addrs[]. The next NINDIRECT blocks are 350 // listed in block ip->addrs[NDIRECT]. 351 352 // Return the disk block address of the nth block in inode ip. 353 // If there is no such block, bmap allocates one. 354 static uint 355 bmap(struct inode *ip, uint bn) 356 { 357 uint addr, *a; 358 struct buf *bp; 359 360 if(bn < NDIRECT){ 361 if((addr = ip->addrs[bn]) == 0) 362 ip->addrs[bn] = addr = balloc(ip->dev); 363 return addr; 364 } 365 bn -= NDIRECT; 366 367 if(bn < NINDIRECT){ 368 // Load indirect block, allocating if necessary. 369 if((addr = ip->addrs[NDIRECT]) == 0) 370 ip->addrs[NDIRECT] = addr = balloc(ip->dev); 371 bp = bread(ip->dev, addr); 372 a = (uint*)bp->data; 373 if((addr = a[bn]) == 0){ 374 a[bn] = addr = balloc(ip->dev); 375 log_write(bp); 376 } 377 brelse(bp); 378 return addr; 379 } 380 381 panic("bmap: out of range"); 382 } 383 384 // Truncate inode (discard contents). 385 // Only called when the inode has no links 386 // to it (no directory entries referring to it) 387 // and has no in-memory reference to it (is 388 // not an open file or current directory). 389 static void 390 itrunc(struct inode *ip) 391 { 392 int i, j; 393 struct buf *bp; 394 uint *a; 395 396 for(i = 0; i < NDIRECT; i++){ 397 if(ip->addrs[i]){ 398 bfree(ip->dev, ip->addrs[i]); 399 ip->addrs[i] = 0; 400 } 401 } 402 403 if(ip->addrs[NDIRECT]){ 404 bp = bread(ip->dev, ip->addrs[NDIRECT]); 405 a = (uint*)bp->data; 406 for(j = 0; j < NINDIRECT; j++){ 407 if(a[j]) 408 bfree(ip->dev, a[j]); 409 } 410 brelse(bp); 411 bfree(ip->dev, ip->addrs[NDIRECT]); 412 ip->addrs[NDIRECT] = 0; 413 } 414 415 ip->size = 0; 416 iupdate(ip); 417 } 418 419 // Copy stat information from inode. 420 void 421 stati(struct inode *ip, struct stat *st) 422 { 423 st->dev = ip->dev; 424 st->ino = ip->inum; 425 st->type = ip->type; 426 st->nlink = ip->nlink; 427 st->size = ip->size; 428 } 429 430 //PAGEBREAK! 431 // Read data from inode. 432 int 433 readi(struct inode *ip, char *dst, uint off, uint n) 434 { 435 uint tot, m; 436 struct buf *bp; 437 438 if(ip->type == T_DEV){ 439 if(ip->major < 0 || ip->major >= NDEV || !devsw[ip->major].read) 440 return -1; 441 return devsw[ip->major].read(ip, dst, n); 442 } 443 444 if(off > ip->size || off + n < off) 445 return -1; 446 if(off + n > ip->size) 447 n = ip->size - off; 448 449 for(tot=0; tot<n; tot+=m, off+=m, dst+=m){ 450 bp = bread(ip->dev, bmap(ip, off/BSIZE)); 451 m = min(n - tot, BSIZE - off%BSIZE); 452 /* 453 cprintf("data off %d:\n", off); 454 for (int j = 0; j < min(m, 10); j++) { 455 cprintf("%x ", bp->data[off%BSIZE+j]); 456 } 457 cprintf("\n"); 458 */ 459 memmove(dst, bp->data + off%BSIZE, m); 460 brelse(bp); 461 } 462 return n; 463 } 464 465 // PAGEBREAK! 466 // Write data to inode. 467 int 468 writei(struct inode *ip, char *src, uint off, uint n) 469 { 470 uint tot, m; 471 struct buf *bp; 472 473 if(ip->type == T_DEV){ 474 if(ip->major < 0 || ip->major >= NDEV || !devsw[ip->major].write) 475 return -1; 476 return devsw[ip->major].write(ip, src, n); 477 } 478 479 if(off > ip->size || off + n < off) 480 return -1; 481 if(off + n > MAXFILE*BSIZE) 482 return -1; 483 484 for(tot=0; tot<n; tot+=m, off+=m, src+=m){ 485 bp = bread(ip->dev, bmap(ip, off/BSIZE)); 486 m = min(n - tot, BSIZE - off%BSIZE); 487 memmove(bp->data + off%BSIZE, src, m); 488 log_write(bp); 489 brelse(bp); 490 } 491 492 if(n > 0 && off > ip->size){ 493 ip->size = off; 494 iupdate(ip); 495 } 496 return n; 497 } 498 499 //PAGEBREAK! 500 // Directories 501 502 int 503 namecmp(const char *s, const char *t) 504 { 505 return strncmp(s, t, DIRSIZ); 506 } 507 508 // Look for a directory entry in a directory. 509 // If found, set *poff to byte offset of entry. 510 struct inode* 511 dirlookup(struct inode *dp, char *name, uint *poff) 512 { 513 uint off, inum; 514 struct dirent de; 515 516 if(dp->type != T_DIR) 517 panic("dirlookup not DIR"); 518 519 for(off = 0; off < dp->size; off += sizeof(de)){ 520 if(readi(dp, (char*)&de, off, sizeof(de)) != sizeof(de)) 521 panic("dirlink read"); 522 if(de.inum == 0) 523 continue; 524 if(namecmp(name, de.name) == 0){ 525 // entry matches path element 526 if(poff) 527 *poff = off; 528 inum = de.inum; 529 return iget(dp->dev, inum); 530 } 531 } 532 533 return 0; 534 } 535 536 // Write a new directory entry (name, inum) into the directory dp. 537 int 538 dirlink(struct inode *dp, char *name, uint inum) 539 { 540 int off; 541 struct dirent de; 542 struct inode *ip; 543 544 // Check that name is not present. 545 if((ip = dirlookup(dp, name, 0)) != 0){ 546 iput(ip); 547 return -1; 548 } 549 550 // Look for an empty dirent. 551 for(off = 0; off < dp->size; off += sizeof(de)){ 552 if(readi(dp, (char*)&de, off, sizeof(de)) != sizeof(de)) 553 panic("dirlink read"); 554 if(de.inum == 0) 555 break; 556 } 557 558 strncpy(de.name, name, DIRSIZ); 559 de.inum = inum; 560 if(writei(dp, (char*)&de, off, sizeof(de)) != sizeof(de)) 561 panic("dirlink"); 562 563 return 0; 564 } 565 566 //PAGEBREAK! 567 // Paths 568 569 // Copy the next path element from path into name. 570 // Return a pointer to the element following the copied one. 571 // The returned path has no leading slashes, 572 // so the caller can check *path=='\0' to see if the name is the last one. 573 // If no name to remove, return 0. 574 // 575 // Examples: 576 // skipelem("a/bb/c", name) = "bb/c", setting name = "a" 577 // skipelem("///a//bb", name) = "bb", setting name = "a" 578 // skipelem("a", name) = "", setting name = "a" 579 // skipelem("", name) = skipelem("////", name) = 0 580 // 581 static char* 582 skipelem(char *path, char *name) 583 { 584 char *s; 585 int len; 586 587 while(*path == '/') 588 path++; 589 if(*path == 0) 590 return 0; 591 s = path; 592 while(*path != '/' && *path != 0) 593 path++; 594 len = path - s; 595 if(len >= DIRSIZ) 596 memmove(name, s, DIRSIZ); 597 else { 598 memmove(name, s, len); 599 name[len] = 0; 600 } 601 while(*path == '/') 602 path++; 603 return path; 604 } 605 606 // Look up and return the inode for a path name. 607 // If parent != 0, return the inode for the parent and copy the final 608 // path element into name, which must have room for DIRSIZ bytes. 609 // Must be called inside a transaction since it calls iput(). 610 static struct inode* 611 namex(char *path, int nameiparent, char *name) 612 { 613 struct inode *ip, *next; 614 615 if(*path == '/') 616 ip = iget(ROOTDEV, ROOTINO); 617 else 618 ip = idup(myproc()->cwd); 619 620 while((path = skipelem(path, name)) != 0){ 621 ilock(ip); 622 if(ip->type != T_DIR){ 623 iunlockput(ip); 624 return 0; 625 } 626 if(nameiparent && *path == '\0'){ 627 // Stop one level early. 628 iunlock(ip); 629 return ip; 630 } 631 if((next = dirlookup(ip, name, 0)) == 0){ 632 iunlockput(ip); 633 return 0; 634 } 635 iunlockput(ip); 636 ip = next; 637 } 638 if(nameiparent){ 639 iput(ip); 640 return 0; 641 } 642 return ip; 643 } 644 645 struct inode* 646 namei(char *path) 647 { 648 char name[DIRSIZ]; 649 return namex(path, 0, name); 650 } 651 652 struct inode* 653 nameiparent(char *path, char *name) 654 { 655 return namex(path, 1, name); 656 } 657