xref: /xv6-public/fs.c (revision 6670d3b5)
1 // File system implementation.  Five layers:
2 //   + Blocks: allocator for raw disk blocks.
3 //   + Log: crash recovery for multi-step updates.
4 //   + Files: inode allocator, reading, writing, metadata.
5 //   + Directories: inode with special contents (list of other inodes!)
6 //   + Names: paths like /usr/rtm/xv6/fs.c for convenient naming.
7 //
8 // This file contains the low-level file system manipulation
9 // routines.  The (higher-level) system call implementations
10 // are in sysfile.c.
11 
12 #include "types.h"
13 #include "defs.h"
14 #include "param.h"
15 #include "stat.h"
16 #include "mmu.h"
17 #include "proc.h"
18 #include "spinlock.h"
19 #include "sleeplock.h"
20 #include "fs.h"
21 #include "buf.h"
22 #include "file.h"
23 
24 #define min(a, b) ((a) < (b) ? (a) : (b))
25 static void itrunc(struct inode*);
26 // there should be one superblock per disk device, but we run with
27 // only one device
28 struct superblock sb;
29 
30 // Read the super block.
31 void
32 readsb(int dev, struct superblock *sb)
33 {
34   struct buf *bp;
35 
36   bp = bread(dev, 1);
37   memmove(sb, bp->data, sizeof(*sb));
38   brelse(bp);
39 }
40 
41 // Zero a block.
42 static void
43 bzero(int dev, int bno)
44 {
45   struct buf *bp;
46 
47   bp = bread(dev, bno);
48   memset(bp->data, 0, BSIZE);
49   log_write(bp);
50   brelse(bp);
51 }
52 
53 // Blocks.
54 
55 // Allocate a zeroed disk block.
56 static uint
57 balloc(uint dev)
58 {
59   int b, bi, m;
60   struct buf *bp;
61 
62   bp = 0;
63   for(b = 0; b < sb.size; b += BPB){
64     bp = bread(dev, BBLOCK(b, sb));
65     for(bi = 0; bi < BPB && b + bi < sb.size; bi++){
66       m = 1 << (bi % 8);
67       if((bp->data[bi/8] & m) == 0){  // Is block free?
68         bp->data[bi/8] |= m;  // Mark block in use.
69         log_write(bp);
70         brelse(bp);
71         bzero(dev, b + bi);
72         return b + bi;
73       }
74     }
75     brelse(bp);
76   }
77   panic("balloc: out of blocks");
78 }
79 
80 // Free a disk block.
81 static void
82 bfree(int dev, uint b)
83 {
84   struct buf *bp;
85   int bi, m;
86 
87   readsb(dev, &sb);
88   bp = bread(dev, BBLOCK(b, sb));
89   bi = b % BPB;
90   m = 1 << (bi % 8);
91   if((bp->data[bi/8] & m) == 0)
92     panic("freeing free block");
93   bp->data[bi/8] &= ~m;
94   log_write(bp);
95   brelse(bp);
96 }
97 
98 // Inodes.
99 //
100 // An inode describes a single unnamed file.
101 // The inode disk structure holds metadata: the file's type,
102 // its size, the number of links referring to it, and the
103 // list of blocks holding the file's content.
104 //
105 // The inodes are laid out sequentially on disk at
106 // sb.startinode. Each inode has a number, indicating its
107 // position on the disk.
108 //
109 // The kernel keeps a cache of in-use inodes in memory
110 // to provide a place for synchronizing access
111 // to inodes used by multiple processes. The cached
112 // inodes include book-keeping information that is
113 // not stored on disk: ip->ref and ip->flags.
114 //
115 // An inode and its in-memory represtative go through a
116 // sequence of states before they can be used by the
117 // rest of the file system code.
118 //
119 // * Allocation: an inode is allocated if its type (on disk)
120 //   is non-zero. ialloc() allocates, iput() frees if
121 //   the link count has fallen to zero.
122 //
123 // * Referencing in cache: an entry in the inode cache
124 //   is free if ip->ref is zero. Otherwise ip->ref tracks
125 //   the number of in-memory pointers to the entry (open
126 //   files and current directories). iget() to find or
127 //   create a cache entry and increment its ref, iput()
128 //   to decrement ref.
129 //
130 // * Valid: the information (type, size, &c) in an inode
131 //   cache entry is only correct when the I_VALID bit
132 //   is set in ip->flags. ilock() reads the inode from
133 //   the disk and sets I_VALID, while iput() clears
134 //   I_VALID if ip->ref has fallen to zero.
135 //
136 // * Locked: file system code may only examine and modify
137 //   the information in an inode and its content if it
138 //   has first locked the inode. The I_BUSY flag indicates
139 //   that the inode is locked. ilock() sets I_BUSY,
140 //   while iunlock clears it.
141 //
142 // Thus a typical sequence is:
143 //   ip = iget(dev, inum)
144 //   ilock(ip)
145 //   ... examine and modify ip->xxx ...
146 //   iunlock(ip)
147 //   iput(ip)
148 //
149 // ilock() is separate from iget() so that system calls can
150 // get a long-term reference to an inode (as for an open file)
151 // and only lock it for short periods (e.g., in read()).
152 // The separation also helps avoid deadlock and races during
153 // pathname lookup. iget() increments ip->ref so that the inode
154 // stays cached and pointers to it remain valid.
155 //
156 // Many internal file system functions expect the caller to
157 // have locked the inodes involved; this lets callers create
158 // multi-step atomic operations.
159 
160 struct {
161   struct spinlock lock;
162   struct inode inode[NINODE];
163 } icache;
164 
165 void
166 iinit(int dev)
167 {
168   initlock(&icache.lock, "icache");
169   readsb(dev, &sb);
170   cprintf("sb: size %d nblocks %d ninodes %d nlog %d logstart %d\
171  inodestart %d bmap start %d\n", sb.size, sb.nblocks,
172           sb.ninodes, sb.nlog, sb.logstart, sb.inodestart,
173           sb.bmapstart);
174 }
175 
176 static struct inode* iget(uint dev, uint inum);
177 
178 //PAGEBREAK!
179 // Allocate a new inode with the given type on device dev.
180 // A free inode has a type of zero.
181 struct inode*
182 ialloc(uint dev, short type)
183 {
184   int inum;
185   struct buf *bp;
186   struct dinode *dip;
187 
188   for(inum = 1; inum < sb.ninodes; inum++){
189     bp = bread(dev, IBLOCK(inum, sb));
190     dip = (struct dinode*)bp->data + inum%IPB;
191     if(dip->type == 0){  // a free inode
192       memset(dip, 0, sizeof(*dip));
193       dip->type = type;
194       log_write(bp);   // mark it allocated on the disk
195       brelse(bp);
196       return iget(dev, inum);
197     }
198     brelse(bp);
199   }
200   panic("ialloc: no inodes");
201 }
202 
203 // Copy a modified in-memory inode to disk.
204 void
205 iupdate(struct inode *ip)
206 {
207   struct buf *bp;
208   struct dinode *dip;
209 
210   bp = bread(ip->dev, IBLOCK(ip->inum, sb));
211   dip = (struct dinode*)bp->data + ip->inum%IPB;
212   dip->type = ip->type;
213   dip->major = ip->major;
214   dip->minor = ip->minor;
215   dip->nlink = ip->nlink;
216   dip->size = ip->size;
217   memmove(dip->addrs, ip->addrs, sizeof(ip->addrs));
218   log_write(bp);
219   brelse(bp);
220 }
221 
222 // Find the inode with number inum on device dev
223 // and return the in-memory copy. Does not lock
224 // the inode and does not read it from disk.
225 static struct inode*
226 iget(uint dev, uint inum)
227 {
228   struct inode *ip, *empty;
229 
230   acquire(&icache.lock);
231 
232   // Is the inode already cached?
233   empty = 0;
234   for(ip = &icache.inode[0]; ip < &icache.inode[NINODE]; ip++){
235     if(ip->ref > 0 && ip->dev == dev && ip->inum == inum){
236       ip->ref++;
237       release(&icache.lock);
238       return ip;
239     }
240     if(empty == 0 && ip->ref == 0)    // Remember empty slot.
241       empty = ip;
242   }
243 
244   // Recycle an inode cache entry.
245   if(empty == 0)
246     panic("iget: no inodes");
247 
248   ip = empty;
249   ip->dev = dev;
250   ip->inum = inum;
251   ip->ref = 1;
252   ip->flags = 0;
253   release(&icache.lock);
254 
255   return ip;
256 }
257 
258 // Increment reference count for ip.
259 // Returns ip to enable ip = idup(ip1) idiom.
260 struct inode*
261 idup(struct inode *ip)
262 {
263   acquire(&icache.lock);
264   ip->ref++;
265   release(&icache.lock);
266   return ip;
267 }
268 
269 // Lock the given inode.
270 // Reads the inode from disk if necessary.
271 void
272 ilock(struct inode *ip)
273 {
274   struct buf *bp;
275   struct dinode *dip;
276 
277   if(ip == 0 || ip->ref < 1)
278     panic("ilock");
279 
280   acquire(&icache.lock);
281   while(ip->flags & I_BUSY)
282     sleep(ip, &icache.lock);
283   ip->flags |= I_BUSY;
284   release(&icache.lock);
285 
286   if(!(ip->flags & I_VALID)){
287     bp = bread(ip->dev, IBLOCK(ip->inum, sb));
288     dip = (struct dinode*)bp->data + ip->inum%IPB;
289     ip->type = dip->type;
290     ip->major = dip->major;
291     ip->minor = dip->minor;
292     ip->nlink = dip->nlink;
293     ip->size = dip->size;
294     memmove(ip->addrs, dip->addrs, sizeof(ip->addrs));
295     brelse(bp);
296     ip->flags |= I_VALID;
297     if(ip->type == 0)
298       panic("ilock: no type");
299   }
300 }
301 
302 // Unlock the given inode.
303 void
304 iunlock(struct inode *ip)
305 {
306   if(ip == 0 || !(ip->flags & I_BUSY) || ip->ref < 1)
307     panic("iunlock");
308 
309   acquire(&icache.lock);
310   ip->flags &= ~I_BUSY;
311   wakeup(ip);
312   release(&icache.lock);
313 }
314 
315 // Drop a reference to an in-memory inode.
316 // If that was the last reference, the inode cache entry can
317 // be recycled.
318 // If that was the last reference and the inode has no links
319 // to it, free the inode (and its content) on disk.
320 // All calls to iput() must be inside a transaction in
321 // case it has to free the inode.
322 void
323 iput(struct inode *ip)
324 {
325   acquire(&icache.lock);
326   if(ip->ref == 1 && (ip->flags & I_VALID) && ip->nlink == 0){
327     // inode has no links and no other references: truncate and free.
328     if(ip->flags & I_BUSY)
329       panic("iput busy");
330     ip->flags |= I_BUSY;
331     release(&icache.lock);
332     itrunc(ip);
333     ip->type = 0;
334     iupdate(ip);
335     acquire(&icache.lock);
336     ip->flags = 0;
337     wakeup(ip);
338   }
339   ip->ref--;
340   release(&icache.lock);
341 }
342 
343 // Common idiom: unlock, then put.
344 void
345 iunlockput(struct inode *ip)
346 {
347   iunlock(ip);
348   iput(ip);
349 }
350 
351 //PAGEBREAK!
352 // Inode content
353 //
354 // The content (data) associated with each inode is stored
355 // in blocks on the disk. The first NDIRECT block numbers
356 // are listed in ip->addrs[].  The next NINDIRECT blocks are
357 // listed in block ip->addrs[NDIRECT].
358 
359 // Return the disk block address of the nth block in inode ip.
360 // If there is no such block, bmap allocates one.
361 static uint
362 bmap(struct inode *ip, uint bn)
363 {
364   uint addr, *a;
365   struct buf *bp;
366 
367   if(bn < NDIRECT){
368     if((addr = ip->addrs[bn]) == 0)
369       ip->addrs[bn] = addr = balloc(ip->dev);
370     return addr;
371   }
372   bn -= NDIRECT;
373 
374   if(bn < NINDIRECT){
375     // Load indirect block, allocating if necessary.
376     if((addr = ip->addrs[NDIRECT]) == 0)
377       ip->addrs[NDIRECT] = addr = balloc(ip->dev);
378     bp = bread(ip->dev, addr);
379     a = (uint*)bp->data;
380     if((addr = a[bn]) == 0){
381       a[bn] = addr = balloc(ip->dev);
382       log_write(bp);
383     }
384     brelse(bp);
385     return addr;
386   }
387 
388   panic("bmap: out of range");
389 }
390 
391 // Truncate inode (discard contents).
392 // Only called when the inode has no links
393 // to it (no directory entries referring to it)
394 // and has no in-memory reference to it (is
395 // not an open file or current directory).
396 static void
397 itrunc(struct inode *ip)
398 {
399   int i, j;
400   struct buf *bp;
401   uint *a;
402 
403   for(i = 0; i < NDIRECT; i++){
404     if(ip->addrs[i]){
405       bfree(ip->dev, ip->addrs[i]);
406       ip->addrs[i] = 0;
407     }
408   }
409 
410   if(ip->addrs[NDIRECT]){
411     bp = bread(ip->dev, ip->addrs[NDIRECT]);
412     a = (uint*)bp->data;
413     for(j = 0; j < NINDIRECT; j++){
414       if(a[j])
415         bfree(ip->dev, a[j]);
416     }
417     brelse(bp);
418     bfree(ip->dev, ip->addrs[NDIRECT]);
419     ip->addrs[NDIRECT] = 0;
420   }
421 
422   ip->size = 0;
423   iupdate(ip);
424 }
425 
426 // Copy stat information from inode.
427 void
428 stati(struct inode *ip, struct stat *st)
429 {
430   st->dev = ip->dev;
431   st->ino = ip->inum;
432   st->type = ip->type;
433   st->nlink = ip->nlink;
434   st->size = ip->size;
435 }
436 
437 //PAGEBREAK!
438 // Read data from inode.
439 int
440 readi(struct inode *ip, char *dst, uint off, uint n)
441 {
442   uint tot, m;
443   struct buf *bp;
444 
445   if(ip->type == T_DEV){
446     if(ip->major < 0 || ip->major >= NDEV || !devsw[ip->major].read)
447       return -1;
448     return devsw[ip->major].read(ip, dst, n);
449   }
450 
451   if(off > ip->size || off + n < off)
452     return -1;
453   if(off + n > ip->size)
454     n = ip->size - off;
455 
456   for(tot=0; tot<n; tot+=m, off+=m, dst+=m){
457     bp = bread(ip->dev, bmap(ip, off/BSIZE));
458     m = min(n - tot, BSIZE - off%BSIZE);
459     /*
460     cprintf("data off %d:\n", off);
461     for (int j = 0; j < min(m, 10); j++) {
462       cprintf("%x ", bp->data[off%BSIZE+j]);
463     }
464     cprintf("\n");
465     */
466     memmove(dst, bp->data + off%BSIZE, m);
467     brelse(bp);
468   }
469   return n;
470 }
471 
472 // PAGEBREAK!
473 // Write data to inode.
474 int
475 writei(struct inode *ip, char *src, uint off, uint n)
476 {
477   uint tot, m;
478   struct buf *bp;
479 
480   if(ip->type == T_DEV){
481     if(ip->major < 0 || ip->major >= NDEV || !devsw[ip->major].write)
482       return -1;
483     return devsw[ip->major].write(ip, src, n);
484   }
485 
486   if(off > ip->size || off + n < off)
487     return -1;
488   if(off + n > MAXFILE*BSIZE)
489     return -1;
490 
491   for(tot=0; tot<n; tot+=m, off+=m, src+=m){
492     bp = bread(ip->dev, bmap(ip, off/BSIZE));
493     m = min(n - tot, BSIZE - off%BSIZE);
494     memmove(bp->data + off%BSIZE, src, m);
495     log_write(bp);
496     brelse(bp);
497   }
498 
499   if(n > 0 && off > ip->size){
500     ip->size = off;
501     iupdate(ip);
502   }
503   return n;
504 }
505 
506 //PAGEBREAK!
507 // Directories
508 
509 int
510 namecmp(const char *s, const char *t)
511 {
512   return strncmp(s, t, DIRSIZ);
513 }
514 
515 // Look for a directory entry in a directory.
516 // If found, set *poff to byte offset of entry.
517 struct inode*
518 dirlookup(struct inode *dp, char *name, uint *poff)
519 {
520   uint off, inum;
521   struct dirent de;
522 
523   if(dp->type != T_DIR)
524     panic("dirlookup not DIR");
525 
526   for(off = 0; off < dp->size; off += sizeof(de)){
527     if(readi(dp, (char*)&de, off, sizeof(de)) != sizeof(de))
528       panic("dirlink read");
529     if(de.inum == 0)
530       continue;
531     if(namecmp(name, de.name) == 0){
532       // entry matches path element
533       if(poff)
534         *poff = off;
535       inum = de.inum;
536       return iget(dp->dev, inum);
537     }
538   }
539 
540   return 0;
541 }
542 
543 // Write a new directory entry (name, inum) into the directory dp.
544 int
545 dirlink(struct inode *dp, char *name, uint inum)
546 {
547   int off;
548   struct dirent de;
549   struct inode *ip;
550 
551   // Check that name is not present.
552   if((ip = dirlookup(dp, name, 0)) != 0){
553     iput(ip);
554     return -1;
555   }
556 
557   // Look for an empty dirent.
558   for(off = 0; off < dp->size; off += sizeof(de)){
559     if(readi(dp, (char*)&de, off, sizeof(de)) != sizeof(de))
560       panic("dirlink read");
561     if(de.inum == 0)
562       break;
563   }
564 
565   strncpy(de.name, name, DIRSIZ);
566   de.inum = inum;
567   if(writei(dp, (char*)&de, off, sizeof(de)) != sizeof(de))
568     panic("dirlink");
569 
570   return 0;
571 }
572 
573 //PAGEBREAK!
574 // Paths
575 
576 // Copy the next path element from path into name.
577 // Return a pointer to the element following the copied one.
578 // The returned path has no leading slashes,
579 // so the caller can check *path=='\0' to see if the name is the last one.
580 // If no name to remove, return 0.
581 //
582 // Examples:
583 //   skipelem("a/bb/c", name) = "bb/c", setting name = "a"
584 //   skipelem("///a//bb", name) = "bb", setting name = "a"
585 //   skipelem("a", name) = "", setting name = "a"
586 //   skipelem("", name) = skipelem("////", name) = 0
587 //
588 static char*
589 skipelem(char *path, char *name)
590 {
591   char *s;
592   int len;
593 
594   while(*path == '/')
595     path++;
596   if(*path == 0)
597     return 0;
598   s = path;
599   while(*path != '/' && *path != 0)
600     path++;
601   len = path - s;
602   if(len >= DIRSIZ)
603     memmove(name, s, DIRSIZ);
604   else {
605     memmove(name, s, len);
606     name[len] = 0;
607   }
608   while(*path == '/')
609     path++;
610   return path;
611 }
612 
613 // Look up and return the inode for a path name.
614 // If parent != 0, return the inode for the parent and copy the final
615 // path element into name, which must have room for DIRSIZ bytes.
616 // Must be called inside a transaction since it calls iput().
617 static struct inode*
618 namex(char *path, int nameiparent, char *name)
619 {
620   struct inode *ip, *next;
621 
622   if(*path == '/')
623     ip = iget(ROOTDEV, ROOTINO);
624   else
625     ip = idup(proc->cwd);
626 
627   while((path = skipelem(path, name)) != 0){
628     ilock(ip);
629     if(ip->type != T_DIR){
630       iunlockput(ip);
631       return 0;
632     }
633     if(nameiparent && *path == '\0'){
634       // Stop one level early.
635       iunlock(ip);
636       return ip;
637     }
638     if((next = dirlookup(ip, name, 0)) == 0){
639       iunlockput(ip);
640       return 0;
641     }
642     iunlockput(ip);
643     ip = next;
644   }
645   if(nameiparent){
646     iput(ip);
647     return 0;
648   }
649   return ip;
650 }
651 
652 struct inode*
653 namei(char *path)
654 {
655   char name[DIRSIZ];
656   return namex(path, 0, name);
657 }
658 
659 struct inode*
660 nameiparent(char *path, char *name)
661 {
662   return namex(path, 1, name);
663 }
664