xref: /xv6-public/fs.c (revision 8320d61b)
1 // File system implementation.  Five layers:
2 //   + Blocks: allocator for raw disk blocks.
3 //   + Log: crash recovery for multi-step updates.
4 //   + Files: inode allocator, reading, writing, metadata.
5 //   + Directories: inode with special contents (list of other inodes!)
6 //   + Names: paths like /usr/rtm/xv6/fs.c for convenient naming.
7 //
8 // This file contains the low-level file system manipulation
9 // routines.  The (higher-level) system call implementations
10 // are in sysfile.c.
11 
12 #include "types.h"
13 #include "defs.h"
14 #include "param.h"
15 #include "stat.h"
16 #include "mmu.h"
17 #include "proc.h"
18 #include "spinlock.h"
19 #include "fs.h"
20 #include "buf.h"
21 #include "file.h"
22 
23 #define min(a, b) ((a) < (b) ? (a) : (b))
24 static void itrunc(struct inode*);
25 struct superblock sb;   // there should be one per dev, but we run with one dev
26 
27 // Read the super block.
28 void
29 readsb(int dev, struct superblock *sb)
30 {
31   struct buf *bp;
32 
33   bp = bread(dev, 1);
34   memmove(sb, bp->data, sizeof(*sb));
35   brelse(bp);
36 }
37 
38 // Zero a block.
39 static void
40 bzero(int dev, int bno)
41 {
42   struct buf *bp;
43 
44   bp = bread(dev, bno);
45   memset(bp->data, 0, BSIZE);
46   log_write(bp);
47   brelse(bp);
48 }
49 
50 // Blocks.
51 
52 // Allocate a zeroed disk block.
53 static uint
54 balloc(uint dev)
55 {
56   int b, bi, m;
57   struct buf *bp;
58 
59   bp = 0;
60   for(b = 0; b < sb.size; b += BPB){
61     bp = bread(dev, BBLOCK(b, sb));
62     for(bi = 0; bi < BPB && b + bi < sb.size; bi++){
63       m = 1 << (bi % 8);
64       if((bp->data[bi/8] & m) == 0){  // Is block free?
65         bp->data[bi/8] |= m;  // Mark block in use.
66         log_write(bp);
67         brelse(bp);
68         bzero(dev, b + bi);
69         return b + bi;
70       }
71     }
72     brelse(bp);
73   }
74   panic("balloc: out of blocks");
75 }
76 
77 // Free a disk block.
78 static void
79 bfree(int dev, uint b)
80 {
81   struct buf *bp;
82   int bi, m;
83 
84   readsb(dev, &sb);
85   bp = bread(dev, BBLOCK(b, sb));
86   bi = b % BPB;
87   m = 1 << (bi % 8);
88   if((bp->data[bi/8] & m) == 0)
89     panic("freeing free block");
90   bp->data[bi/8] &= ~m;
91   log_write(bp);
92   brelse(bp);
93 }
94 
95 // Inodes.
96 //
97 // An inode describes a single unnamed file.
98 // The inode disk structure holds metadata: the file's type,
99 // its size, the number of links referring to it, and the
100 // list of blocks holding the file's content.
101 //
102 // The inodes are laid out sequentially on disk at
103 // sb.startinode. Each inode has a number, indicating its
104 // position on the disk.
105 //
106 // The kernel keeps a cache of in-use inodes in memory
107 // to provide a place for synchronizing access
108 // to inodes used by multiple processes. The cached
109 // inodes include book-keeping information that is
110 // not stored on disk: ip->ref and ip->flags.
111 //
112 // An inode and its in-memory represtative go through a
113 // sequence of states before they can be used by the
114 // rest of the file system code.
115 //
116 // * Allocation: an inode is allocated if its type (on disk)
117 //   is non-zero. ialloc() allocates, iput() frees if
118 //   the link count has fallen to zero.
119 //
120 // * Referencing in cache: an entry in the inode cache
121 //   is free if ip->ref is zero. Otherwise ip->ref tracks
122 //   the number of in-memory pointers to the entry (open
123 //   files and current directories). iget() to find or
124 //   create a cache entry and increment its ref, iput()
125 //   to decrement ref.
126 //
127 // * Valid: the information (type, size, &c) in an inode
128 //   cache entry is only correct when the I_VALID bit
129 //   is set in ip->flags. ilock() reads the inode from
130 //   the disk and sets I_VALID, while iput() clears
131 //   I_VALID if ip->ref has fallen to zero.
132 //
133 // * Locked: file system code may only examine and modify
134 //   the information in an inode and its content if it
135 //   has first locked the inode. The I_BUSY flag indicates
136 //   that the inode is locked. ilock() sets I_BUSY,
137 //   while iunlock clears it.
138 //
139 // Thus a typical sequence is:
140 //   ip = iget(dev, inum)
141 //   ilock(ip)
142 //   ... examine and modify ip->xxx ...
143 //   iunlock(ip)
144 //   iput(ip)
145 //
146 // ilock() is separate from iget() so that system calls can
147 // get a long-term reference to an inode (as for an open file)
148 // and only lock it for short periods (e.g., in read()).
149 // The separation also helps avoid deadlock and races during
150 // pathname lookup. iget() increments ip->ref so that the inode
151 // stays cached and pointers to it remain valid.
152 //
153 // Many internal file system functions expect the caller to
154 // have locked the inodes involved; this lets callers create
155 // multi-step atomic operations.
156 
157 struct {
158   struct spinlock lock;
159   struct inode inode[NINODE];
160 } icache;
161 
162 void
163 iinit(int dev)
164 {
165   initlock(&icache.lock, "icache");
166   readsb(dev, &sb);
167   cprintf("sb: size %d nblocks %d ninodes %d nlog %d logstart %d inodestart %d bmap start %d\n", sb.size,
168           sb.nblocks, sb.ninodes, sb.nlog, sb.logstart, sb.inodestart, sb.bmapstart);
169 }
170 
171 static struct inode* iget(uint dev, uint inum);
172 
173 //PAGEBREAK!
174 // Allocate a new inode with the given type on device dev.
175 // A free inode has a type of zero.
176 struct inode*
177 ialloc(uint dev, short type)
178 {
179   int inum;
180   struct buf *bp;
181   struct dinode *dip;
182 
183   for(inum = 1; inum < sb.ninodes; inum++){
184     bp = bread(dev, IBLOCK(inum, sb));
185     dip = (struct dinode*)bp->data + inum%IPB;
186     if(dip->type == 0){  // a free inode
187       memset(dip, 0, sizeof(*dip));
188       dip->type = type;
189       log_write(bp);   // mark it allocated on the disk
190       brelse(bp);
191       return iget(dev, inum);
192     }
193     brelse(bp);
194   }
195   panic("ialloc: no inodes");
196 }
197 
198 // Copy a modified in-memory inode to disk.
199 void
200 iupdate(struct inode *ip)
201 {
202   struct buf *bp;
203   struct dinode *dip;
204 
205   bp = bread(ip->dev, IBLOCK(ip->inum, sb));
206   dip = (struct dinode*)bp->data + ip->inum%IPB;
207   dip->type = ip->type;
208   dip->major = ip->major;
209   dip->minor = ip->minor;
210   dip->nlink = ip->nlink;
211   dip->size = ip->size;
212   memmove(dip->addrs, ip->addrs, sizeof(ip->addrs));
213   log_write(bp);
214   brelse(bp);
215 }
216 
217 // Find the inode with number inum on device dev
218 // and return the in-memory copy. Does not lock
219 // the inode and does not read it from disk.
220 static struct inode*
221 iget(uint dev, uint inum)
222 {
223   struct inode *ip, *empty;
224 
225   acquire(&icache.lock);
226 
227   // Is the inode already cached?
228   empty = 0;
229   for(ip = &icache.inode[0]; ip < &icache.inode[NINODE]; ip++){
230     if(ip->ref > 0 && ip->dev == dev && ip->inum == inum){
231       ip->ref++;
232       release(&icache.lock);
233       return ip;
234     }
235     if(empty == 0 && ip->ref == 0)    // Remember empty slot.
236       empty = ip;
237   }
238 
239   // Recycle an inode cache entry.
240   if(empty == 0)
241     panic("iget: no inodes");
242 
243   ip = empty;
244   ip->dev = dev;
245   ip->inum = inum;
246   ip->ref = 1;
247   ip->flags = 0;
248   release(&icache.lock);
249 
250   return ip;
251 }
252 
253 // Increment reference count for ip.
254 // Returns ip to enable ip = idup(ip1) idiom.
255 struct inode*
256 idup(struct inode *ip)
257 {
258   acquire(&icache.lock);
259   ip->ref++;
260   release(&icache.lock);
261   return ip;
262 }
263 
264 // Lock the given inode.
265 // Reads the inode from disk if necessary.
266 void
267 ilock(struct inode *ip)
268 {
269   struct buf *bp;
270   struct dinode *dip;
271 
272   if(ip == 0 || ip->ref < 1)
273     panic("ilock");
274 
275   acquire(&icache.lock);
276   while(ip->flags & I_BUSY)
277     sleep(ip, &icache.lock);
278   ip->flags |= I_BUSY;
279   release(&icache.lock);
280 
281   if(!(ip->flags & I_VALID)){
282     bp = bread(ip->dev, IBLOCK(ip->inum, sb));
283     dip = (struct dinode*)bp->data + ip->inum%IPB;
284     ip->type = dip->type;
285     ip->major = dip->major;
286     ip->minor = dip->minor;
287     ip->nlink = dip->nlink;
288     ip->size = dip->size;
289     memmove(ip->addrs, dip->addrs, sizeof(ip->addrs));
290     brelse(bp);
291     ip->flags |= I_VALID;
292     if(ip->type == 0)
293       panic("ilock: no type");
294   }
295 }
296 
297 // Unlock the given inode.
298 void
299 iunlock(struct inode *ip)
300 {
301   if(ip == 0 || !(ip->flags & I_BUSY) || ip->ref < 1)
302     panic("iunlock");
303 
304   acquire(&icache.lock);
305   ip->flags &= ~I_BUSY;
306   wakeup(ip);
307   release(&icache.lock);
308 }
309 
310 // Drop a reference to an in-memory inode.
311 // If that was the last reference, the inode cache entry can
312 // be recycled.
313 // If that was the last reference and the inode has no links
314 // to it, free the inode (and its content) on disk.
315 // All calls to iput() must be inside a transaction in
316 // case it has to free the inode.
317 void
318 iput(struct inode *ip)
319 {
320   acquire(&icache.lock);
321   if(ip->ref == 1 && (ip->flags & I_VALID) && ip->nlink == 0){
322     // inode has no links and no other references: truncate and free.
323     if(ip->flags & I_BUSY)
324       panic("iput busy");
325     ip->flags |= I_BUSY;
326     release(&icache.lock);
327     itrunc(ip);
328     ip->type = 0;
329     iupdate(ip);
330     acquire(&icache.lock);
331     ip->flags = 0;
332     wakeup(ip);
333   }
334   ip->ref--;
335   release(&icache.lock);
336 }
337 
338 // Common idiom: unlock, then put.
339 void
340 iunlockput(struct inode *ip)
341 {
342   iunlock(ip);
343   iput(ip);
344 }
345 
346 //PAGEBREAK!
347 // Inode content
348 //
349 // The content (data) associated with each inode is stored
350 // in blocks on the disk. The first NDIRECT block numbers
351 // are listed in ip->addrs[].  The next NINDIRECT blocks are
352 // listed in block ip->addrs[NDIRECT].
353 
354 // Return the disk block address of the nth block in inode ip.
355 // If there is no such block, bmap allocates one.
356 static uint
357 bmap(struct inode *ip, uint bn)
358 {
359   uint addr, *a;
360   struct buf *bp;
361 
362   if(bn < NDIRECT){
363     if((addr = ip->addrs[bn]) == 0)
364       ip->addrs[bn] = addr = balloc(ip->dev);
365     return addr;
366   }
367   bn -= NDIRECT;
368 
369   if(bn < NINDIRECT){
370     // Load indirect block, allocating if necessary.
371     if((addr = ip->addrs[NDIRECT]) == 0)
372       ip->addrs[NDIRECT] = addr = balloc(ip->dev);
373     bp = bread(ip->dev, addr);
374     a = (uint*)bp->data;
375     if((addr = a[bn]) == 0){
376       a[bn] = addr = balloc(ip->dev);
377       log_write(bp);
378     }
379     brelse(bp);
380     return addr;
381   }
382 
383   panic("bmap: out of range");
384 }
385 
386 // Truncate inode (discard contents).
387 // Only called when the inode has no links
388 // to it (no directory entries referring to it)
389 // and has no in-memory reference to it (is
390 // not an open file or current directory).
391 static void
392 itrunc(struct inode *ip)
393 {
394   int i, j;
395   struct buf *bp;
396   uint *a;
397 
398   for(i = 0; i < NDIRECT; i++){
399     if(ip->addrs[i]){
400       bfree(ip->dev, ip->addrs[i]);
401       ip->addrs[i] = 0;
402     }
403   }
404 
405   if(ip->addrs[NDIRECT]){
406     bp = bread(ip->dev, ip->addrs[NDIRECT]);
407     a = (uint*)bp->data;
408     for(j = 0; j < NINDIRECT; j++){
409       if(a[j])
410         bfree(ip->dev, a[j]);
411     }
412     brelse(bp);
413     bfree(ip->dev, ip->addrs[NDIRECT]);
414     ip->addrs[NDIRECT] = 0;
415   }
416 
417   ip->size = 0;
418   iupdate(ip);
419 }
420 
421 // Copy stat information from inode.
422 void
423 stati(struct inode *ip, struct stat *st)
424 {
425   st->dev = ip->dev;
426   st->ino = ip->inum;
427   st->type = ip->type;
428   st->nlink = ip->nlink;
429   st->size = ip->size;
430 }
431 
432 //PAGEBREAK!
433 // Read data from inode.
434 int
435 readi(struct inode *ip, char *dst, uint off, uint n)
436 {
437   uint tot, m;
438   struct buf *bp;
439 
440   if(ip->type == T_DEV){
441     if(ip->major < 0 || ip->major >= NDEV || !devsw[ip->major].read)
442       return -1;
443     return devsw[ip->major].read(ip, dst, n);
444   }
445 
446   if(off > ip->size || off + n < off)
447     return -1;
448   if(off + n > ip->size)
449     n = ip->size - off;
450 
451   for(tot=0; tot<n; tot+=m, off+=m, dst+=m){
452     bp = bread(ip->dev, bmap(ip, off/BSIZE));
453     m = min(n - tot, BSIZE - off%BSIZE);
454     memmove(dst, bp->data + off%BSIZE, m);
455     brelse(bp);
456   }
457   return n;
458 }
459 
460 // PAGEBREAK!
461 // Write data to inode.
462 int
463 writei(struct inode *ip, char *src, uint off, uint n)
464 {
465   uint tot, m;
466   struct buf *bp;
467 
468   if(ip->type == T_DEV){
469     if(ip->major < 0 || ip->major >= NDEV || !devsw[ip->major].write)
470       return -1;
471     return devsw[ip->major].write(ip, src, n);
472   }
473 
474   if(off > ip->size || off + n < off)
475     return -1;
476   if(off + n > MAXFILE*BSIZE)
477     return -1;
478 
479   for(tot=0; tot<n; tot+=m, off+=m, src+=m){
480     bp = bread(ip->dev, bmap(ip, off/BSIZE));
481     m = min(n - tot, BSIZE - off%BSIZE);
482     memmove(bp->data + off%BSIZE, src, m);
483     log_write(bp);
484     brelse(bp);
485   }
486 
487   if(n > 0 && off > ip->size){
488     ip->size = off;
489     iupdate(ip);
490   }
491   return n;
492 }
493 
494 //PAGEBREAK!
495 // Directories
496 
497 int
498 namecmp(const char *s, const char *t)
499 {
500   return strncmp(s, t, DIRSIZ);
501 }
502 
503 // Look for a directory entry in a directory.
504 // If found, set *poff to byte offset of entry.
505 struct inode*
506 dirlookup(struct inode *dp, char *name, uint *poff)
507 {
508   uint off, inum;
509   struct dirent de;
510 
511   if(dp->type != T_DIR)
512     panic("dirlookup not DIR");
513 
514   for(off = 0; off < dp->size; off += sizeof(de)){
515     if(readi(dp, (char*)&de, off, sizeof(de)) != sizeof(de))
516       panic("dirlink read");
517     if(de.inum == 0)
518       continue;
519     if(namecmp(name, de.name) == 0){
520       // entry matches path element
521       if(poff)
522         *poff = off;
523       inum = de.inum;
524       return iget(dp->dev, inum);
525     }
526   }
527 
528   return 0;
529 }
530 
531 // Write a new directory entry (name, inum) into the directory dp.
532 int
533 dirlink(struct inode *dp, char *name, uint inum)
534 {
535   int off;
536   struct dirent de;
537   struct inode *ip;
538 
539   // Check that name is not present.
540   if((ip = dirlookup(dp, name, 0)) != 0){
541     iput(ip);
542     return -1;
543   }
544 
545   // Look for an empty dirent.
546   for(off = 0; off < dp->size; off += sizeof(de)){
547     if(readi(dp, (char*)&de, off, sizeof(de)) != sizeof(de))
548       panic("dirlink read");
549     if(de.inum == 0)
550       break;
551   }
552 
553   strncpy(de.name, name, DIRSIZ);
554   de.inum = inum;
555   if(writei(dp, (char*)&de, off, sizeof(de)) != sizeof(de))
556     panic("dirlink");
557 
558   return 0;
559 }
560 
561 //PAGEBREAK!
562 // Paths
563 
564 // Copy the next path element from path into name.
565 // Return a pointer to the element following the copied one.
566 // The returned path has no leading slashes,
567 // so the caller can check *path=='\0' to see if the name is the last one.
568 // If no name to remove, return 0.
569 //
570 // Examples:
571 //   skipelem("a/bb/c", name) = "bb/c", setting name = "a"
572 //   skipelem("///a//bb", name) = "bb", setting name = "a"
573 //   skipelem("a", name) = "", setting name = "a"
574 //   skipelem("", name) = skipelem("////", name) = 0
575 //
576 static char*
577 skipelem(char *path, char *name)
578 {
579   char *s;
580   int len;
581 
582   while(*path == '/')
583     path++;
584   if(*path == 0)
585     return 0;
586   s = path;
587   while(*path != '/' && *path != 0)
588     path++;
589   len = path - s;
590   if(len >= DIRSIZ)
591     memmove(name, s, DIRSIZ);
592   else {
593     memmove(name, s, len);
594     name[len] = 0;
595   }
596   while(*path == '/')
597     path++;
598   return path;
599 }
600 
601 // Look up and return the inode for a path name.
602 // If parent != 0, return the inode for the parent and copy the final
603 // path element into name, which must have room for DIRSIZ bytes.
604 // Must be called inside a transaction since it calls iput().
605 static struct inode*
606 namex(char *path, int nameiparent, char *name)
607 {
608   struct inode *ip, *next;
609 
610   if(*path == '/')
611     ip = iget(ROOTDEV, ROOTINO);
612   else
613     ip = idup(proc->cwd);
614 
615   while((path = skipelem(path, name)) != 0){
616     ilock(ip);
617     if(ip->type != T_DIR){
618       iunlockput(ip);
619       return 0;
620     }
621     if(nameiparent && *path == '\0'){
622       // Stop one level early.
623       iunlock(ip);
624       return ip;
625     }
626     if((next = dirlookup(ip, name, 0)) == 0){
627       iunlockput(ip);
628       return 0;
629     }
630     iunlockput(ip);
631     ip = next;
632   }
633   if(nameiparent){
634     iput(ip);
635     return 0;
636   }
637   return ip;
638 }
639 
640 struct inode*
641 namei(char *path)
642 {
643   char name[DIRSIZ];
644   return namex(path, 0, name);
645 }
646 
647 struct inode*
648 nameiparent(char *path, char *name)
649 {
650   return namex(path, 1, name);
651 }
652