xref: /xv6-public/fs.c (revision b7fed77b)
1 // File system implementation.  Five layers:
2 //   + Blocks: allocator for raw disk blocks.
3 //   + Log: crash recovery for multi-step updates.
4 //   + Files: inode allocator, reading, writing, metadata.
5 //   + Directories: inode with special contents (list of other inodes!)
6 //   + Names: paths like /usr/rtm/xv6/fs.c for convenient naming.
7 //
8 // This file contains the low-level file system manipulation
9 // routines.  The (higher-level) system call implementations
10 // are in sysfile.c.
11 
12 #include "types.h"
13 #include "defs.h"
14 #include "param.h"
15 #include "stat.h"
16 #include "mmu.h"
17 #include "proc.h"
18 #include "spinlock.h"
19 #include "fs.h"
20 #include "buf.h"
21 #include "file.h"
22 
23 #define min(a, b) ((a) < (b) ? (a) : (b))
24 static void itrunc(struct inode*);
25 // there should be one superblock per disk device, but we run with
26 // only one device
27 struct superblock sb;
28 
29 // Read the super block.
30 void
31 readsb(int dev, struct superblock *sb)
32 {
33   struct buf *bp;
34 
35   bp = bread(dev, 1);
36   memmove(sb, bp->data, sizeof(*sb));
37   brelse(bp);
38 }
39 
40 // Zero a block.
41 static void
42 bzero(int dev, int bno)
43 {
44   struct buf *bp;
45 
46   bp = bread(dev, bno);
47   memset(bp->data, 0, BSIZE);
48   log_write(bp);
49   brelse(bp);
50 }
51 
52 // Blocks.
53 
54 // Allocate a zeroed disk block.
55 static uint
56 balloc(uint dev)
57 {
58   int b, bi, m;
59   struct buf *bp;
60 
61   bp = 0;
62   for(b = 0; b < sb.size; b += BPB){
63     bp = bread(dev, BBLOCK(b, sb));
64     for(bi = 0; bi < BPB && b + bi < sb.size; bi++){
65       m = 1 << (bi % 8);
66       if((bp->data[bi/8] & m) == 0){  // Is block free?
67         bp->data[bi/8] |= m;  // Mark block in use.
68         log_write(bp);
69         brelse(bp);
70         bzero(dev, b + bi);
71         return b + bi;
72       }
73     }
74     brelse(bp);
75   }
76   panic("balloc: out of blocks");
77 }
78 
79 // Free a disk block.
80 static void
81 bfree(int dev, uint b)
82 {
83   struct buf *bp;
84   int bi, m;
85 
86   readsb(dev, &sb);
87   bp = bread(dev, BBLOCK(b, sb));
88   bi = b % BPB;
89   m = 1 << (bi % 8);
90   if((bp->data[bi/8] & m) == 0)
91     panic("freeing free block");
92   bp->data[bi/8] &= ~m;
93   log_write(bp);
94   brelse(bp);
95 }
96 
97 // Inodes.
98 //
99 // An inode describes a single unnamed file.
100 // The inode disk structure holds metadata: the file's type,
101 // its size, the number of links referring to it, and the
102 // list of blocks holding the file's content.
103 //
104 // The inodes are laid out sequentially on disk at
105 // sb.startinode. Each inode has a number, indicating its
106 // position on the disk.
107 //
108 // The kernel keeps a cache of in-use inodes in memory
109 // to provide a place for synchronizing access
110 // to inodes used by multiple processes. The cached
111 // inodes include book-keeping information that is
112 // not stored on disk: ip->ref and ip->flags.
113 //
114 // An inode and its in-memory represtative go through a
115 // sequence of states before they can be used by the
116 // rest of the file system code.
117 //
118 // * Allocation: an inode is allocated if its type (on disk)
119 //   is non-zero. ialloc() allocates, iput() frees if
120 //   the link count has fallen to zero.
121 //
122 // * Referencing in cache: an entry in the inode cache
123 //   is free if ip->ref is zero. Otherwise ip->ref tracks
124 //   the number of in-memory pointers to the entry (open
125 //   files and current directories). iget() to find or
126 //   create a cache entry and increment its ref, iput()
127 //   to decrement ref.
128 //
129 // * Valid: the information (type, size, &c) in an inode
130 //   cache entry is only correct when the I_VALID bit
131 //   is set in ip->flags. ilock() reads the inode from
132 //   the disk and sets I_VALID, while iput() clears
133 //   I_VALID if ip->ref has fallen to zero.
134 //
135 // * Locked: file system code may only examine and modify
136 //   the information in an inode and its content if it
137 //   has first locked the inode. The I_BUSY flag indicates
138 //   that the inode is locked. ilock() sets I_BUSY,
139 //   while iunlock clears it.
140 //
141 // Thus a typical sequence is:
142 //   ip = iget(dev, inum)
143 //   ilock(ip)
144 //   ... examine and modify ip->xxx ...
145 //   iunlock(ip)
146 //   iput(ip)
147 //
148 // ilock() is separate from iget() so that system calls can
149 // get a long-term reference to an inode (as for an open file)
150 // and only lock it for short periods (e.g., in read()).
151 // The separation also helps avoid deadlock and races during
152 // pathname lookup. iget() increments ip->ref so that the inode
153 // stays cached and pointers to it remain valid.
154 //
155 // Many internal file system functions expect the caller to
156 // have locked the inodes involved; this lets callers create
157 // multi-step atomic operations.
158 
159 struct {
160   struct spinlock lock;
161   struct inode inode[NINODE];
162 } icache;
163 
164 void
165 iinit(int dev)
166 {
167   initlock(&icache.lock, "icache");
168   readsb(dev, &sb);
169   cprintf("sb: size %d nblocks %d ninodes %d nlog %d logstart %d\
170           inodestart %d bmap start %d\n", sb.size, sb.nblocks,
171           sb.ninodes, sb.nlog, sb.logstart, sb.inodestart,
172           sb.bmapstart);
173 }
174 
175 static struct inode* iget(uint dev, uint inum);
176 
177 //PAGEBREAK!
178 // Allocate a new inode with the given type on device dev.
179 // A free inode has a type of zero.
180 struct inode*
181 ialloc(uint dev, short type)
182 {
183   int inum;
184   struct buf *bp;
185   struct dinode *dip;
186 
187   for(inum = 1; inum < sb.ninodes; inum++){
188     bp = bread(dev, IBLOCK(inum, sb));
189     dip = (struct dinode*)bp->data + inum%IPB;
190     if(dip->type == 0){  // a free inode
191       memset(dip, 0, sizeof(*dip));
192       dip->type = type;
193       log_write(bp);   // mark it allocated on the disk
194       brelse(bp);
195       return iget(dev, inum);
196     }
197     brelse(bp);
198   }
199   panic("ialloc: no inodes");
200 }
201 
202 // Copy a modified in-memory inode to disk.
203 void
204 iupdate(struct inode *ip)
205 {
206   struct buf *bp;
207   struct dinode *dip;
208 
209   bp = bread(ip->dev, IBLOCK(ip->inum, sb));
210   dip = (struct dinode*)bp->data + ip->inum%IPB;
211   dip->type = ip->type;
212   dip->major = ip->major;
213   dip->minor = ip->minor;
214   dip->nlink = ip->nlink;
215   dip->size = ip->size;
216   memmove(dip->addrs, ip->addrs, sizeof(ip->addrs));
217   log_write(bp);
218   brelse(bp);
219 }
220 
221 // Find the inode with number inum on device dev
222 // and return the in-memory copy. Does not lock
223 // the inode and does not read it from disk.
224 static struct inode*
225 iget(uint dev, uint inum)
226 {
227   struct inode *ip, *empty;
228 
229   acquire(&icache.lock);
230 
231   // Is the inode already cached?
232   empty = 0;
233   for(ip = &icache.inode[0]; ip < &icache.inode[NINODE]; ip++){
234     if(ip->ref > 0 && ip->dev == dev && ip->inum == inum){
235       ip->ref++;
236       release(&icache.lock);
237       return ip;
238     }
239     if(empty == 0 && ip->ref == 0)    // Remember empty slot.
240       empty = ip;
241   }
242 
243   // Recycle an inode cache entry.
244   if(empty == 0)
245     panic("iget: no inodes");
246 
247   ip = empty;
248   ip->dev = dev;
249   ip->inum = inum;
250   ip->ref = 1;
251   ip->flags = 0;
252   release(&icache.lock);
253 
254   return ip;
255 }
256 
257 // Increment reference count for ip.
258 // Returns ip to enable ip = idup(ip1) idiom.
259 struct inode*
260 idup(struct inode *ip)
261 {
262   acquire(&icache.lock);
263   ip->ref++;
264   release(&icache.lock);
265   return ip;
266 }
267 
268 // Lock the given inode.
269 // Reads the inode from disk if necessary.
270 void
271 ilock(struct inode *ip)
272 {
273   struct buf *bp;
274   struct dinode *dip;
275 
276   if(ip == 0 || ip->ref < 1)
277     panic("ilock");
278 
279   acquire(&icache.lock);
280   while(ip->flags & I_BUSY)
281     sleep(ip, &icache.lock);
282   ip->flags |= I_BUSY;
283   release(&icache.lock);
284 
285   if(!(ip->flags & I_VALID)){
286     bp = bread(ip->dev, IBLOCK(ip->inum, sb));
287     dip = (struct dinode*)bp->data + ip->inum%IPB;
288     ip->type = dip->type;
289     ip->major = dip->major;
290     ip->minor = dip->minor;
291     ip->nlink = dip->nlink;
292     ip->size = dip->size;
293     memmove(ip->addrs, dip->addrs, sizeof(ip->addrs));
294     brelse(bp);
295     ip->flags |= I_VALID;
296     if(ip->type == 0)
297       panic("ilock: no type");
298   }
299 }
300 
301 // Unlock the given inode.
302 void
303 iunlock(struct inode *ip)
304 {
305   if(ip == 0 || !(ip->flags & I_BUSY) || ip->ref < 1)
306     panic("iunlock");
307 
308   acquire(&icache.lock);
309   ip->flags &= ~I_BUSY;
310   wakeup(ip);
311   release(&icache.lock);
312 }
313 
314 // Drop a reference to an in-memory inode.
315 // If that was the last reference, the inode cache entry can
316 // be recycled.
317 // If that was the last reference and the inode has no links
318 // to it, free the inode (and its content) on disk.
319 // All calls to iput() must be inside a transaction in
320 // case it has to free the inode.
321 void
322 iput(struct inode *ip)
323 {
324   acquire(&icache.lock);
325   if(ip->ref == 1 && (ip->flags & I_VALID) && ip->nlink == 0){
326     // inode has no links and no other references: truncate and free.
327     if(ip->flags & I_BUSY)
328       panic("iput busy");
329     ip->flags |= I_BUSY;
330     release(&icache.lock);
331     itrunc(ip);
332     ip->type = 0;
333     iupdate(ip);
334     acquire(&icache.lock);
335     ip->flags = 0;
336     wakeup(ip);
337   }
338   ip->ref--;
339   release(&icache.lock);
340 }
341 
342 // Common idiom: unlock, then put.
343 void
344 iunlockput(struct inode *ip)
345 {
346   iunlock(ip);
347   iput(ip);
348 }
349 
350 //PAGEBREAK!
351 // Inode content
352 //
353 // The content (data) associated with each inode is stored
354 // in blocks on the disk. The first NDIRECT block numbers
355 // are listed in ip->addrs[].  The next NINDIRECT blocks are
356 // listed in block ip->addrs[NDIRECT].
357 
358 // Return the disk block address of the nth block in inode ip.
359 // If there is no such block, bmap allocates one.
360 static uint
361 bmap(struct inode *ip, uint bn)
362 {
363   uint addr, *a;
364   struct buf *bp;
365 
366   if(bn < NDIRECT){
367     if((addr = ip->addrs[bn]) == 0)
368       ip->addrs[bn] = addr = balloc(ip->dev);
369     return addr;
370   }
371   bn -= NDIRECT;
372 
373   if(bn < NINDIRECT){
374     // Load indirect block, allocating if necessary.
375     if((addr = ip->addrs[NDIRECT]) == 0)
376       ip->addrs[NDIRECT] = addr = balloc(ip->dev);
377     bp = bread(ip->dev, addr);
378     a = (uint*)bp->data;
379     if((addr = a[bn]) == 0){
380       a[bn] = addr = balloc(ip->dev);
381       log_write(bp);
382     }
383     brelse(bp);
384     return addr;
385   }
386 
387   panic("bmap: out of range");
388 }
389 
390 // Truncate inode (discard contents).
391 // Only called when the inode has no links
392 // to it (no directory entries referring to it)
393 // and has no in-memory reference to it (is
394 // not an open file or current directory).
395 static void
396 itrunc(struct inode *ip)
397 {
398   int i, j;
399   struct buf *bp;
400   uint *a;
401 
402   for(i = 0; i < NDIRECT; i++){
403     if(ip->addrs[i]){
404       bfree(ip->dev, ip->addrs[i]);
405       ip->addrs[i] = 0;
406     }
407   }
408 
409   if(ip->addrs[NDIRECT]){
410     bp = bread(ip->dev, ip->addrs[NDIRECT]);
411     a = (uint*)bp->data;
412     for(j = 0; j < NINDIRECT; j++){
413       if(a[j])
414         bfree(ip->dev, a[j]);
415     }
416     brelse(bp);
417     bfree(ip->dev, ip->addrs[NDIRECT]);
418     ip->addrs[NDIRECT] = 0;
419   }
420 
421   ip->size = 0;
422   iupdate(ip);
423 }
424 
425 // Copy stat information from inode.
426 void
427 stati(struct inode *ip, struct stat *st)
428 {
429   st->dev = ip->dev;
430   st->ino = ip->inum;
431   st->type = ip->type;
432   st->nlink = ip->nlink;
433   st->size = ip->size;
434 }
435 
436 //PAGEBREAK!
437 // Read data from inode.
438 int
439 readi(struct inode *ip, char *dst, uint off, uint n)
440 {
441   uint tot, m;
442   struct buf *bp;
443 
444   if(ip->type == T_DEV){
445     if(ip->major < 0 || ip->major >= NDEV || !devsw[ip->major].read)
446       return -1;
447     return devsw[ip->major].read(ip, dst, n);
448   }
449 
450   if(off > ip->size || off + n < off)
451     return -1;
452   if(off + n > ip->size)
453     n = ip->size - off;
454 
455   for(tot=0; tot<n; tot+=m, off+=m, dst+=m){
456     bp = bread(ip->dev, bmap(ip, off/BSIZE));
457     m = min(n - tot, BSIZE - off%BSIZE);
458     memmove(dst, bp->data + off%BSIZE, m);
459     brelse(bp);
460   }
461   return n;
462 }
463 
464 // PAGEBREAK!
465 // Write data to inode.
466 int
467 writei(struct inode *ip, char *src, uint off, uint n)
468 {
469   uint tot, m;
470   struct buf *bp;
471 
472   if(ip->type == T_DEV){
473     if(ip->major < 0 || ip->major >= NDEV || !devsw[ip->major].write)
474       return -1;
475     return devsw[ip->major].write(ip, src, n);
476   }
477 
478   if(off > ip->size || off + n < off)
479     return -1;
480   if(off + n > MAXFILE*BSIZE)
481     return -1;
482 
483   for(tot=0; tot<n; tot+=m, off+=m, src+=m){
484     bp = bread(ip->dev, bmap(ip, off/BSIZE));
485     m = min(n - tot, BSIZE - off%BSIZE);
486     memmove(bp->data + off%BSIZE, src, m);
487     log_write(bp);
488     brelse(bp);
489   }
490 
491   if(n > 0 && off > ip->size){
492     ip->size = off;
493     iupdate(ip);
494   }
495   return n;
496 }
497 
498 //PAGEBREAK!
499 // Directories
500 
501 int
502 namecmp(const char *s, const char *t)
503 {
504   return strncmp(s, t, DIRSIZ);
505 }
506 
507 // Look for a directory entry in a directory.
508 // If found, set *poff to byte offset of entry.
509 struct inode*
510 dirlookup(struct inode *dp, char *name, uint *poff)
511 {
512   uint off, inum;
513   struct dirent de;
514 
515   if(dp->type != T_DIR)
516     panic("dirlookup not DIR");
517 
518   for(off = 0; off < dp->size; off += sizeof(de)){
519     if(readi(dp, (char*)&de, off, sizeof(de)) != sizeof(de))
520       panic("dirlink read");
521     if(de.inum == 0)
522       continue;
523     if(namecmp(name, de.name) == 0){
524       // entry matches path element
525       if(poff)
526         *poff = off;
527       inum = de.inum;
528       return iget(dp->dev, inum);
529     }
530   }
531 
532   return 0;
533 }
534 
535 // Write a new directory entry (name, inum) into the directory dp.
536 int
537 dirlink(struct inode *dp, char *name, uint inum)
538 {
539   int off;
540   struct dirent de;
541   struct inode *ip;
542 
543   // Check that name is not present.
544   if((ip = dirlookup(dp, name, 0)) != 0){
545     iput(ip);
546     return -1;
547   }
548 
549   // Look for an empty dirent.
550   for(off = 0; off < dp->size; off += sizeof(de)){
551     if(readi(dp, (char*)&de, off, sizeof(de)) != sizeof(de))
552       panic("dirlink read");
553     if(de.inum == 0)
554       break;
555   }
556 
557   strncpy(de.name, name, DIRSIZ);
558   de.inum = inum;
559   if(writei(dp, (char*)&de, off, sizeof(de)) != sizeof(de))
560     panic("dirlink");
561 
562   return 0;
563 }
564 
565 //PAGEBREAK!
566 // Paths
567 
568 // Copy the next path element from path into name.
569 // Return a pointer to the element following the copied one.
570 // The returned path has no leading slashes,
571 // so the caller can check *path=='\0' to see if the name is the last one.
572 // If no name to remove, return 0.
573 //
574 // Examples:
575 //   skipelem("a/bb/c", name) = "bb/c", setting name = "a"
576 //   skipelem("///a//bb", name) = "bb", setting name = "a"
577 //   skipelem("a", name) = "", setting name = "a"
578 //   skipelem("", name) = skipelem("////", name) = 0
579 //
580 static char*
581 skipelem(char *path, char *name)
582 {
583   char *s;
584   int len;
585 
586   while(*path == '/')
587     path++;
588   if(*path == 0)
589     return 0;
590   s = path;
591   while(*path != '/' && *path != 0)
592     path++;
593   len = path - s;
594   if(len >= DIRSIZ)
595     memmove(name, s, DIRSIZ);
596   else {
597     memmove(name, s, len);
598     name[len] = 0;
599   }
600   while(*path == '/')
601     path++;
602   return path;
603 }
604 
605 // Look up and return the inode for a path name.
606 // If parent != 0, return the inode for the parent and copy the final
607 // path element into name, which must have room for DIRSIZ bytes.
608 // Must be called inside a transaction since it calls iput().
609 static struct inode*
610 namex(char *path, int nameiparent, char *name)
611 {
612   struct inode *ip, *next;
613 
614   if(*path == '/')
615     ip = iget(ROOTDEV, ROOTINO);
616   else
617     ip = idup(proc->cwd);
618 
619   while((path = skipelem(path, name)) != 0){
620     ilock(ip);
621     if(ip->type != T_DIR){
622       iunlockput(ip);
623       return 0;
624     }
625     if(nameiparent && *path == '\0'){
626       // Stop one level early.
627       iunlock(ip);
628       return ip;
629     }
630     if((next = dirlookup(ip, name, 0)) == 0){
631       iunlockput(ip);
632       return 0;
633     }
634     iunlockput(ip);
635     ip = next;
636   }
637   if(nameiparent){
638     iput(ip);
639     return 0;
640   }
641   return ip;
642 }
643 
644 struct inode*
645 namei(char *path)
646 {
647   char name[DIRSIZ];
648   return namex(path, 0, name);
649 }
650 
651 struct inode*
652 nameiparent(char *path, char *name)
653 {
654   return namex(path, 1, name);
655 }
656