xref: /xv6-public/fs.c (revision 38eee5bc)
1 // File system implementation.  Five layers:
2 //   + Blocks: allocator for raw disk blocks.
3 //   + Log: crash recovery for multi-step updates.
4 //   + Files: inode allocator, reading, writing, metadata.
5 //   + Directories: inode with special contents (list of other inodes!)
6 //   + Names: paths like /usr/rtm/xv6/fs.c for convenient naming.
7 //
8 // This file contains the low-level file system manipulation
9 // routines.  The (higher-level) system call implementations
10 // are in sysfile.c.
11 
12 #include "types.h"
13 #include "defs.h"
14 #include "param.h"
15 #include "stat.h"
16 #include "mmu.h"
17 #include "proc.h"
18 #include "spinlock.h"
19 #include "buf.h"
20 #include "fs.h"
21 #include "file.h"
22 
23 #define min(a, b) ((a) < (b) ? (a) : (b))
24 static void itrunc(struct inode*);
25 
26 // Read the super block.
27 void
28 readsb(int dev, struct superblock *sb)
29 {
30   struct buf *bp;
31 
32   bp = bread(dev, 1);
33   memmove(sb, bp->data, sizeof(*sb));
34   brelse(bp);
35 }
36 
37 // Zero a block.
38 static void
39 bzero(int dev, int bno)
40 {
41   struct buf *bp;
42 
43   bp = bread(dev, bno);
44   memset(bp->data, 0, BSIZE);
45   log_write(bp);
46   brelse(bp);
47 }
48 
49 // Blocks.
50 
51 // Allocate a zeroed disk block.
52 static uint
53 balloc(uint dev)
54 {
55   int b, bi, m;
56   struct buf *bp;
57   struct superblock sb;
58 
59   bp = 0;
60   readsb(dev, &sb);
61   for(b = 0; b < sb.size; b += BPB){
62     bp = bread(dev, BBLOCK(b, sb.ninodes));
63     for(bi = 0; bi < BPB && b + bi < sb.size; bi++){
64       m = 1 << (bi % 8);
65       if((bp->data[bi/8] & m) == 0){  // Is block free?
66         bp->data[bi/8] |= m;  // Mark block in use.
67         log_write(bp);
68         brelse(bp);
69         bzero(dev, b + bi);
70         return b + bi;
71       }
72     }
73     brelse(bp);
74   }
75   panic("balloc: out of blocks");
76 }
77 
78 // Free a disk block.
79 static void
80 bfree(int dev, uint b)
81 {
82   struct buf *bp;
83   struct superblock sb;
84   int bi, m;
85 
86   readsb(dev, &sb);
87   bp = bread(dev, BBLOCK(b, sb.ninodes));
88   bi = b % BPB;
89   m = 1 << (bi % 8);
90   if((bp->data[bi/8] & m) == 0)
91     panic("freeing free block");
92   bp->data[bi/8] &= ~m;
93   log_write(bp);
94   brelse(bp);
95 }
96 
97 // Inodes.
98 //
99 // An inode describes a single unnamed file.
100 // The inode disk structure holds metadata: the file's type,
101 // its size, the number of links referring to it, and the
102 // list of blocks holding the file's content.
103 //
104 // The inodes are laid out sequentially on disk immediately after
105 // the superblock. Each inode has a number, indicating its
106 // position on the disk.
107 //
108 // The kernel keeps a cache of in-use inodes in memory
109 // to provide a place for synchronizing access
110 // to inodes used by multiple processes. The cached
111 // inodes include book-keeping information that is
112 // not stored on disk: ip->ref and ip->flags.
113 //
114 // An inode and its in-memory represtative go through a
115 // sequence of states before they can be used by the
116 // rest of the file system code.
117 //
118 // * Allocation: an inode is allocated if its type (on disk)
119 //   is non-zero. ialloc() allocates, iput() frees if
120 //   the link count has fallen to zero.
121 //
122 // * Referencing in cache: an entry in the inode cache
123 //   is free if ip->ref is zero. Otherwise ip->ref tracks
124 //   the number of in-memory pointers to the entry (open
125 //   files and current directories). iget() to find or
126 //   create a cache entry and increment its ref, iput()
127 //   to decrement ref.
128 //
129 // * Valid: the information (type, size, &c) in an inode
130 //   cache entry is only correct when the I_VALID bit
131 //   is set in ip->flags. ilock() reads the inode from
132 //   the disk and sets I_VALID, while iput() clears
133 //   I_VALID if ip->ref has fallen to zero.
134 //
135 // * Locked: file system code may only examine and modify
136 //   the information in an inode and its content if it
137 //   has first locked the inode. The I_BUSY flag indicates
138 //   that the inode is locked. ilock() sets I_BUSY,
139 //   while iunlock clears it.
140 //
141 // Thus a typical sequence is:
142 //   ip = iget(dev, inum)
143 //   ilock(ip)
144 //   ... examine and modify ip->xxx ...
145 //   iunlock(ip)
146 //   iput(ip)
147 //
148 // ilock() is separate from iget() so that system calls can
149 // get a long-term reference to an inode (as for an open file)
150 // and only lock it for short periods (e.g., in read()).
151 // The separation also helps avoid deadlock and races during
152 // pathname lookup. iget() increments ip->ref so that the inode
153 // stays cached and pointers to it remain valid.
154 //
155 // Many internal file system functions expect the caller to
156 // have locked the inodes involved; this lets callers create
157 // multi-step atomic operations.
158 
159 struct {
160   struct spinlock lock;
161   struct inode inode[NINODE];
162 } icache;
163 
164 void
165 iinit(void)
166 {
167   initlock(&icache.lock, "icache");
168 }
169 
170 static struct inode* iget(uint dev, uint inum);
171 
172 //PAGEBREAK!
173 // Allocate a new inode with the given type on device dev.
174 // A free inode has a type of zero.
175 struct inode*
176 ialloc(uint dev, short type)
177 {
178   int inum;
179   struct buf *bp;
180   struct dinode *dip;
181   struct superblock sb;
182 
183   readsb(dev, &sb);
184 
185   for(inum = 1; inum < sb.ninodes; inum++){
186     bp = bread(dev, IBLOCK(inum));
187     dip = (struct dinode*)bp->data + inum%IPB;
188     if(dip->type == 0){  // a free inode
189       memset(dip, 0, sizeof(*dip));
190       dip->type = type;
191       log_write(bp);   // mark it allocated on the disk
192       brelse(bp);
193       return iget(dev, inum);
194     }
195     brelse(bp);
196   }
197   panic("ialloc: no inodes");
198 }
199 
200 // Copy a modified in-memory inode to disk.
201 void
202 iupdate(struct inode *ip)
203 {
204   struct buf *bp;
205   struct dinode *dip;
206 
207   bp = bread(ip->dev, IBLOCK(ip->inum));
208   dip = (struct dinode*)bp->data + ip->inum%IPB;
209   dip->type = ip->type;
210   dip->major = ip->major;
211   dip->minor = ip->minor;
212   dip->nlink = ip->nlink;
213   dip->size = ip->size;
214   memmove(dip->addrs, ip->addrs, sizeof(ip->addrs));
215   log_write(bp);
216   brelse(bp);
217 }
218 
219 // Find the inode with number inum on device dev
220 // and return the in-memory copy. Does not lock
221 // the inode and does not read it from disk.
222 static struct inode*
223 iget(uint dev, uint inum)
224 {
225   struct inode *ip, *empty;
226 
227   acquire(&icache.lock);
228 
229   // Is the inode already cached?
230   empty = 0;
231   for(ip = &icache.inode[0]; ip < &icache.inode[NINODE]; ip++){
232     if(ip->ref > 0 && ip->dev == dev && ip->inum == inum){
233       ip->ref++;
234       release(&icache.lock);
235       return ip;
236     }
237     if(empty == 0 && ip->ref == 0)    // Remember empty slot.
238       empty = ip;
239   }
240 
241   // Recycle an inode cache entry.
242   if(empty == 0)
243     panic("iget: no inodes");
244 
245   ip = empty;
246   ip->dev = dev;
247   ip->inum = inum;
248   ip->ref = 1;
249   ip->flags = 0;
250   release(&icache.lock);
251 
252   return ip;
253 }
254 
255 // Increment reference count for ip.
256 // Returns ip to enable ip = idup(ip1) idiom.
257 struct inode*
258 idup(struct inode *ip)
259 {
260   acquire(&icache.lock);
261   ip->ref++;
262   release(&icache.lock);
263   return ip;
264 }
265 
266 // Lock the given inode.
267 // Reads the inode from disk if necessary.
268 void
269 ilock(struct inode *ip)
270 {
271   struct buf *bp;
272   struct dinode *dip;
273 
274   if(ip == 0 || ip->ref < 1)
275     panic("ilock");
276 
277   acquire(&icache.lock);
278   while(ip->flags & I_BUSY)
279     sleep(ip, &icache.lock);
280   ip->flags |= I_BUSY;
281   release(&icache.lock);
282 
283   if(!(ip->flags & I_VALID)){
284     bp = bread(ip->dev, IBLOCK(ip->inum));
285     dip = (struct dinode*)bp->data + ip->inum%IPB;
286     ip->type = dip->type;
287     ip->major = dip->major;
288     ip->minor = dip->minor;
289     ip->nlink = dip->nlink;
290     ip->size = dip->size;
291     memmove(ip->addrs, dip->addrs, sizeof(ip->addrs));
292     brelse(bp);
293     ip->flags |= I_VALID;
294     if(ip->type == 0)
295       panic("ilock: no type");
296   }
297 }
298 
299 // Unlock the given inode.
300 void
301 iunlock(struct inode *ip)
302 {
303   if(ip == 0 || !(ip->flags & I_BUSY) || ip->ref < 1)
304     panic("iunlock");
305 
306   acquire(&icache.lock);
307   ip->flags &= ~I_BUSY;
308   wakeup(ip);
309   release(&icache.lock);
310 }
311 
312 // Drop a reference to an in-memory inode.
313 // If that was the last reference, the inode cache entry can
314 // be recycled.
315 // If that was the last reference and the inode has no links
316 // to it, free the inode (and its content) on disk.
317 void
318 iput(struct inode *ip)
319 {
320   acquire(&icache.lock);
321   if(ip->ref == 1 && (ip->flags & I_VALID) && ip->nlink == 0){
322     // inode has no links: truncate and free inode.
323     if(ip->flags & I_BUSY)
324       panic("iput busy");
325     ip->flags |= I_BUSY;
326     release(&icache.lock);
327     itrunc(ip);
328     ip->type = 0;
329     iupdate(ip);
330     acquire(&icache.lock);
331     ip->flags = 0;
332     wakeup(ip);
333   }
334   ip->ref--;
335   release(&icache.lock);
336 }
337 
338 // Common idiom: unlock, then put.
339 void
340 iunlockput(struct inode *ip)
341 {
342   iunlock(ip);
343   iput(ip);
344 }
345 
346 //PAGEBREAK!
347 // Inode content
348 //
349 // The content (data) associated with each inode is stored
350 // in blocks on the disk. The first NDIRECT block numbers
351 // are listed in ip->addrs[].  The next NINDIRECT blocks are
352 // listed in block ip->addrs[NDIRECT].
353 
354 // Return the disk block address of the nth block in inode ip.
355 // If there is no such block, bmap allocates one.
356 static uint
357 bmap(struct inode *ip, uint bn)
358 {
359   uint addr, *a;
360   struct buf *bp;
361 
362   if(bn < NDIRECT){
363     if((addr = ip->addrs[bn]) == 0)
364       ip->addrs[bn] = addr = balloc(ip->dev);
365     return addr;
366   }
367   bn -= NDIRECT;
368 
369   if(bn < NINDIRECT){
370     // Load indirect block, allocating if necessary.
371     if((addr = ip->addrs[NDIRECT]) == 0)
372       ip->addrs[NDIRECT] = addr = balloc(ip->dev);
373     bp = bread(ip->dev, addr);
374     a = (uint*)bp->data;
375     if((addr = a[bn]) == 0){
376       a[bn] = addr = balloc(ip->dev);
377       log_write(bp);
378     }
379     brelse(bp);
380     return addr;
381   }
382 
383   panic("bmap: out of range");
384 }
385 
386 // Truncate inode (discard contents).
387 // Only called when the inode has no links
388 // to it (no directory entries referring to it)
389 // and has no in-memory reference to it (is
390 // not an open file or current directory).
391 static void
392 itrunc(struct inode *ip)
393 {
394   int i, j;
395   struct buf *bp;
396   uint *a;
397 
398   for(i = 0; i < NDIRECT; i++){
399     if(ip->addrs[i]){
400       bfree(ip->dev, ip->addrs[i]);
401       ip->addrs[i] = 0;
402     }
403   }
404 
405   if(ip->addrs[NDIRECT]){
406     bp = bread(ip->dev, ip->addrs[NDIRECT]);
407     a = (uint*)bp->data;
408     for(j = 0; j < NINDIRECT; j++){
409       if(a[j])
410         bfree(ip->dev, a[j]);
411     }
412     brelse(bp);
413     bfree(ip->dev, ip->addrs[NDIRECT]);
414     ip->addrs[NDIRECT] = 0;
415   }
416 
417   ip->size = 0;
418   iupdate(ip);
419 }
420 
421 // Copy stat information from inode.
422 void
423 stati(struct inode *ip, struct stat *st)
424 {
425   st->dev = ip->dev;
426   st->ino = ip->inum;
427   st->type = ip->type;
428   st->nlink = ip->nlink;
429   st->size = ip->size;
430 }
431 
432 //PAGEBREAK!
433 // Read data from inode.
434 int
435 readi(struct inode *ip, char *dst, uint off, uint n)
436 {
437   uint tot, m;
438   struct buf *bp;
439 
440   if(ip->type == T_DEV){
441     if(ip->major < 0 || ip->major >= NDEV || !devsw[ip->major].read)
442       return -1;
443     return devsw[ip->major].read(ip, dst, n);
444   }
445 
446   if(off > ip->size || off + n < off)
447     return -1;
448   if(off + n > ip->size)
449     n = ip->size - off;
450 
451   for(tot=0; tot<n; tot+=m, off+=m, dst+=m){
452     bp = bread(ip->dev, bmap(ip, off/BSIZE));
453     m = min(n - tot, BSIZE - off%BSIZE);
454     memmove(dst, bp->data + off%BSIZE, m);
455     brelse(bp);
456   }
457   return n;
458 }
459 
460 // PAGEBREAK!
461 // Write data to inode.
462 int
463 writei(struct inode *ip, char *src, uint off, uint n)
464 {
465   uint tot, m;
466   struct buf *bp;
467 
468   if(ip->type == T_DEV){
469     if(ip->major < 0 || ip->major >= NDEV || !devsw[ip->major].write)
470       return -1;
471     return devsw[ip->major].write(ip, src, n);
472   }
473 
474   if(off > ip->size || off + n < off)
475     return -1;
476   if(off + n > MAXFILE*BSIZE)
477     return -1;
478 
479   for(tot=0; tot<n; tot+=m, off+=m, src+=m){
480     bp = bread(ip->dev, bmap(ip, off/BSIZE));
481     m = min(n - tot, BSIZE - off%BSIZE);
482     memmove(bp->data + off%BSIZE, src, m);
483     log_write(bp);
484     brelse(bp);
485   }
486 
487   if(n > 0 && off > ip->size){
488     ip->size = off;
489     iupdate(ip);
490   }
491   return n;
492 }
493 
494 //PAGEBREAK!
495 // Directories
496 
497 int
498 namecmp(const char *s, const char *t)
499 {
500   return strncmp(s, t, DIRSIZ);
501 }
502 
503 // Look for a directory entry in a directory.
504 // If found, set *poff to byte offset of entry.
505 struct inode*
506 dirlookup(struct inode *dp, char *name, uint *poff)
507 {
508   uint off, inum;
509   struct dirent de;
510 
511   if(dp->type != T_DIR)
512     panic("dirlookup not DIR");
513 
514   for(off = 0; off < dp->size; off += sizeof(de)){
515     if(readi(dp, (char*)&de, off, sizeof(de)) != sizeof(de))
516       panic("dirlink read");
517     if(de.inum == 0)
518       continue;
519     if(namecmp(name, de.name) == 0){
520       // entry matches path element
521       if(poff)
522         *poff = off;
523       inum = de.inum;
524       return iget(dp->dev, inum);
525     }
526   }
527 
528   return 0;
529 }
530 
531 // Write a new directory entry (name, inum) into the directory dp.
532 int
533 dirlink(struct inode *dp, char *name, uint inum)
534 {
535   int off;
536   struct dirent de;
537   struct inode *ip;
538 
539   // Check that name is not present.
540   if((ip = dirlookup(dp, name, 0)) != 0){
541     iput(ip);
542     return -1;
543   }
544 
545   // Look for an empty dirent.
546   for(off = 0; off < dp->size; off += sizeof(de)){
547     if(readi(dp, (char*)&de, off, sizeof(de)) != sizeof(de))
548       panic("dirlink read");
549     if(de.inum == 0)
550       break;
551   }
552 
553   strncpy(de.name, name, DIRSIZ);
554   de.inum = inum;
555   if(writei(dp, (char*)&de, off, sizeof(de)) != sizeof(de))
556     panic("dirlink");
557 
558   return 0;
559 }
560 
561 //PAGEBREAK!
562 // Paths
563 
564 // Copy the next path element from path into name.
565 // Return a pointer to the element following the copied one.
566 // The returned path has no leading slashes,
567 // so the caller can check *path=='\0' to see if the name is the last one.
568 // If no name to remove, return 0.
569 //
570 // Examples:
571 //   skipelem("a/bb/c", name) = "bb/c", setting name = "a"
572 //   skipelem("///a//bb", name) = "bb", setting name = "a"
573 //   skipelem("a", name) = "", setting name = "a"
574 //   skipelem("", name) = skipelem("////", name) = 0
575 //
576 static char*
577 skipelem(char *path, char *name)
578 {
579   char *s;
580   int len;
581 
582   while(*path == '/')
583     path++;
584   if(*path == 0)
585     return 0;
586   s = path;
587   while(*path != '/' && *path != 0)
588     path++;
589   len = path - s;
590   if(len >= DIRSIZ)
591     memmove(name, s, DIRSIZ);
592   else {
593     memmove(name, s, len);
594     name[len] = 0;
595   }
596   while(*path == '/')
597     path++;
598   return path;
599 }
600 
601 // Look up and return the inode for a path name.
602 // If parent != 0, return the inode for the parent and copy the final
603 // path element into name, which must have room for DIRSIZ bytes.
604 static struct inode*
605 namex(char *path, int nameiparent, char *name)
606 {
607   struct inode *ip, *next;
608 
609   if(*path == '/')
610     ip = iget(ROOTDEV, ROOTINO);
611   else
612     ip = idup(proc->cwd);
613 
614   while((path = skipelem(path, name)) != 0){
615     ilock(ip);
616     if(ip->type != T_DIR){
617       iunlockput(ip);
618       return 0;
619     }
620     if(nameiparent && *path == '\0'){
621       // Stop one level early.
622       iunlock(ip);
623       return ip;
624     }
625     if((next = dirlookup(ip, name, 0)) == 0){
626       iunlockput(ip);
627       return 0;
628     }
629     iunlockput(ip);
630     ip = next;
631   }
632   if(nameiparent){
633     iput(ip);
634     return 0;
635   }
636   return ip;
637 }
638 
639 struct inode*
640 namei(char *path)
641 {
642   char name[DIRSIZ];
643   return namex(path, 0, name);
644 }
645 
646 struct inode*
647 nameiparent(char *path, char *name)
648 {
649   return namex(path, 1, name);
650 }
651