xref: /xv6-public/fs.c (revision 6389d9d4)
1 // File system implementation.  Five layers:
2 //   + Blocks: allocator for raw disk blocks.
3 //   + Log: crash recovery for multi-step updates.
4 //   + Files: inode allocator, reading, writing, metadata.
5 //   + Directories: inode with special contents (list of other inodes!)
6 //   + Names: paths like /usr/rtm/xv6/fs.c for convenient naming.
7 //
8 // This file contains the low-level file system manipulation
9 // routines.  The (higher-level) system call implementations
10 // are in sysfile.c.
11 
12 #include "types.h"
13 #include "defs.h"
14 #include "param.h"
15 #include "stat.h"
16 #include "mmu.h"
17 #include "proc.h"
18 #include "spinlock.h"
19 #include "sleeplock.h"
20 #include "fs.h"
21 #include "buf.h"
22 #include "file.h"
23 
24 #define min(a, b) ((a) < (b) ? (a) : (b))
25 static void itrunc(struct inode*);
26 // there should be one superblock per disk device, but we run with
27 // only one device
28 struct superblock sb;
29 
30 // Read the super block.
31 void
32 readsb(int dev, struct superblock *sb)
33 {
34   struct buf *bp;
35 
36   bp = bread(dev, 1);
37   memmove(sb, bp->data, sizeof(*sb));
38   brelse(bp);
39 }
40 
41 // Zero a block.
42 static void
43 bzero(int dev, int bno)
44 {
45   struct buf *bp;
46 
47   bp = bread(dev, bno);
48   memset(bp->data, 0, BSIZE);
49   log_write(bp);
50   brelse(bp);
51 }
52 
53 // Blocks.
54 
55 // Allocate a zeroed disk block.
56 static uint
57 balloc(uint dev)
58 {
59   int b, bi, m;
60   struct buf *bp;
61 
62   bp = 0;
63   for(b = 0; b < sb.size; b += BPB){
64     bp = bread(dev, BBLOCK(b, sb));
65     for(bi = 0; bi < BPB && b + bi < sb.size; bi++){
66       m = 1 << (bi % 8);
67       if((bp->data[bi/8] & m) == 0){  // Is block free?
68         bp->data[bi/8] |= m;  // Mark block in use.
69         log_write(bp);
70         brelse(bp);
71         bzero(dev, b + bi);
72         return b + bi;
73       }
74     }
75     brelse(bp);
76   }
77   panic("balloc: out of blocks");
78 }
79 
80 // Free a disk block.
81 static void
82 bfree(int dev, uint b)
83 {
84   struct buf *bp;
85   int bi, m;
86 
87   readsb(dev, &sb);
88   bp = bread(dev, BBLOCK(b, sb));
89   bi = b % BPB;
90   m = 1 << (bi % 8);
91   if((bp->data[bi/8] & m) == 0)
92     panic("freeing free block");
93   bp->data[bi/8] &= ~m;
94   log_write(bp);
95   brelse(bp);
96 }
97 
98 // Inodes.
99 //
100 // An inode describes a single unnamed file.
101 // The inode disk structure holds metadata: the file's type,
102 // its size, the number of links referring to it, and the
103 // list of blocks holding the file's content.
104 //
105 // The inodes are laid out sequentially on disk at
106 // sb.startinode. Each inode has a number, indicating its
107 // position on the disk.
108 //
109 // The kernel keeps a cache of in-use inodes in memory
110 // to provide a place for synchronizing access
111 // to inodes used by multiple processes. The cached
112 // inodes include book-keeping information that is
113 // not stored on disk: ip->ref and ip->valid.
114 //
115 // An inode and its in-memory representation go through a
116 // sequence of states before they can be used by the
117 // rest of the file system code.
118 //
119 // * Allocation: an inode is allocated if its type (on disk)
120 //   is non-zero. ialloc() allocates, and iput() frees if
121 //   the reference and link counts have fallen to zero.
122 //
123 // * Referencing in cache: an entry in the inode cache
124 //   is free if ip->ref is zero. Otherwise ip->ref tracks
125 //   the number of in-memory pointers to the entry (open
126 //   files and current directories). iget() finds or
127 //   creates a cache entry and increments its ref; iput()
128 //   decrements ref.
129 //
130 // * Valid: the information (type, size, &c) in an inode
131 //   cache entry is only correct when ip->valid is 1.
132 //   ilock() reads the inode from
133 //   the disk and sets ip->valid, while iput() clears
134 //   ip->valid if ip->ref has fallen to zero.
135 //
136 // * Locked: file system code may only examine and modify
137 //   the information in an inode and its content if it
138 //   has first locked the inode.
139 //
140 // Thus a typical sequence is:
141 //   ip = iget(dev, inum)
142 //   ilock(ip)
143 //   ... examine and modify ip->xxx ...
144 //   iunlock(ip)
145 //   iput(ip)
146 //
147 // ilock() is separate from iget() so that system calls can
148 // get a long-term reference to an inode (as for an open file)
149 // and only lock it for short periods (e.g., in read()).
150 // The separation also helps avoid deadlock and races during
151 // pathname lookup. iget() increments ip->ref so that the inode
152 // stays cached and pointers to it remain valid.
153 //
154 // Many internal file system functions expect the caller to
155 // have locked the inodes involved; this lets callers create
156 // multi-step atomic operations.
157 //
158 // The icache.lock spin-lock defends the allocation of icache
159 // entries. Since ip->ref indicates whether an entry is free,
160 // and ip->dev and ip->inum indicate which i-node an entry
161 // holds, one must hold icache.lock while using any of those fields.
162 //
163 // An ip->lock sleep-lock defends all ip-> fields other than ref,
164 // dev, and inum.  One must hold ip->lock in order to
165 // read or write that inode's ip->valid, ip->size, ip->type, &c.
166 
167 struct {
168   struct spinlock lock;
169   struct inode inode[NINODE];
170 } icache;
171 
172 void
173 iinit(int dev)
174 {
175   int i = 0;
176 
177   initlock(&icache.lock, "icache");
178   for(i = 0; i < NINODE; i++) {
179     initsleeplock(&icache.inode[i].lock, "inode");
180   }
181 
182   readsb(dev, &sb);
183   cprintf("sb: size %d nblocks %d ninodes %d nlog %d logstart %d\
184  inodestart %d bmap start %d\n", sb.size, sb.nblocks,
185           sb.ninodes, sb.nlog, sb.logstart, sb.inodestart,
186           sb.bmapstart);
187 }
188 
189 static struct inode* iget(uint dev, uint inum);
190 
191 //PAGEBREAK!
192 // Allocate an inode on device dev.
193 // Mark it as allocated by  giving it type type.
194 // Returns an unlocked but allocated and referenced inode.
195 struct inode*
196 ialloc(uint dev, short type)
197 {
198   int inum;
199   struct buf *bp;
200   struct dinode *dip;
201 
202   for(inum = 1; inum < sb.ninodes; inum++){
203     bp = bread(dev, IBLOCK(inum, sb));
204     dip = (struct dinode*)bp->data + inum%IPB;
205     if(dip->type == 0){  // a free inode
206       memset(dip, 0, sizeof(*dip));
207       dip->type = type;
208       log_write(bp);   // mark it allocated on the disk
209       brelse(bp);
210       return iget(dev, inum);
211     }
212     brelse(bp);
213   }
214   panic("ialloc: no inodes");
215 }
216 
217 // Copy a modified in-memory inode to disk.
218 // Must be called after every change to an ip->xxx field
219 // that lives on disk, since i-node cache is write-through.
220 // Caller must hold ip->lock.
221 void
222 iupdate(struct inode *ip)
223 {
224   struct buf *bp;
225   struct dinode *dip;
226 
227   bp = bread(ip->dev, IBLOCK(ip->inum, sb));
228   dip = (struct dinode*)bp->data + ip->inum%IPB;
229   dip->type = ip->type;
230   dip->major = ip->major;
231   dip->minor = ip->minor;
232   dip->nlink = ip->nlink;
233   dip->size = ip->size;
234   memmove(dip->addrs, ip->addrs, sizeof(ip->addrs));
235   log_write(bp);
236   brelse(bp);
237 }
238 
239 // Find the inode with number inum on device dev
240 // and return the in-memory copy. Does not lock
241 // the inode and does not read it from disk.
242 static struct inode*
243 iget(uint dev, uint inum)
244 {
245   struct inode *ip, *empty;
246 
247   acquire(&icache.lock);
248 
249   // Is the inode already cached?
250   empty = 0;
251   for(ip = &icache.inode[0]; ip < &icache.inode[NINODE]; ip++){
252     if(ip->ref > 0 && ip->dev == dev && ip->inum == inum){
253       ip->ref++;
254       release(&icache.lock);
255       return ip;
256     }
257     if(empty == 0 && ip->ref == 0)    // Remember empty slot.
258       empty = ip;
259   }
260 
261   // Recycle an inode cache entry.
262   if(empty == 0)
263     panic("iget: no inodes");
264 
265   ip = empty;
266   ip->dev = dev;
267   ip->inum = inum;
268   ip->ref = 1;
269   ip->valid = 0;
270   release(&icache.lock);
271 
272   return ip;
273 }
274 
275 // Increment reference count for ip.
276 // Returns ip to enable ip = idup(ip1) idiom.
277 struct inode*
278 idup(struct inode *ip)
279 {
280   acquire(&icache.lock);
281   ip->ref++;
282   release(&icache.lock);
283   return ip;
284 }
285 
286 // Lock the given inode.
287 // Reads the inode from disk if necessary.
288 void
289 ilock(struct inode *ip)
290 {
291   struct buf *bp;
292   struct dinode *dip;
293 
294   if(ip == 0 || ip->ref < 1)
295     panic("ilock");
296 
297   acquiresleep(&ip->lock);
298 
299   if(ip->valid == 0){
300     bp = bread(ip->dev, IBLOCK(ip->inum, sb));
301     dip = (struct dinode*)bp->data + ip->inum%IPB;
302     ip->type = dip->type;
303     ip->major = dip->major;
304     ip->minor = dip->minor;
305     ip->nlink = dip->nlink;
306     ip->size = dip->size;
307     memmove(ip->addrs, dip->addrs, sizeof(ip->addrs));
308     brelse(bp);
309     ip->valid = 1;
310     if(ip->type == 0)
311       panic("ilock: no type");
312   }
313 }
314 
315 // Unlock the given inode.
316 void
317 iunlock(struct inode *ip)
318 {
319   if(ip == 0 || !holdingsleep(&ip->lock) || ip->ref < 1)
320     panic("iunlock");
321 
322   releasesleep(&ip->lock);
323 }
324 
325 // Drop a reference to an in-memory inode.
326 // If that was the last reference, the inode cache entry can
327 // be recycled.
328 // If that was the last reference and the inode has no links
329 // to it, free the inode (and its content) on disk.
330 // All calls to iput() must be inside a transaction in
331 // case it has to free the inode.
332 void
333 iput(struct inode *ip)
334 {
335   acquiresleep(&ip->lock);
336   if(ip->valid && ip->nlink == 0){
337     acquire(&icache.lock);
338     int r = ip->ref;
339     release(&icache.lock);
340     if(r == 1){
341       // inode has no links and no other references: truncate and free.
342       itrunc(ip);
343       ip->type = 0;
344       iupdate(ip);
345       ip->valid = 0;
346     }
347   }
348   releasesleep(&ip->lock);
349 
350   acquire(&icache.lock);
351   ip->ref--;
352   release(&icache.lock);
353 }
354 
355 // Common idiom: unlock, then put.
356 void
357 iunlockput(struct inode *ip)
358 {
359   iunlock(ip);
360   iput(ip);
361 }
362 
363 //PAGEBREAK!
364 // Inode content
365 //
366 // The content (data) associated with each inode is stored
367 // in blocks on the disk. The first NDIRECT block numbers
368 // are listed in ip->addrs[].  The next NINDIRECT blocks are
369 // listed in block ip->addrs[NDIRECT].
370 
371 // Return the disk block address of the nth block in inode ip.
372 // If there is no such block, bmap allocates one.
373 static uint
374 bmap(struct inode *ip, uint bn)
375 {
376   uint addr, *a;
377   struct buf *bp;
378 
379   if(bn < NDIRECT){
380     if((addr = ip->addrs[bn]) == 0)
381       ip->addrs[bn] = addr = balloc(ip->dev);
382     return addr;
383   }
384   bn -= NDIRECT;
385 
386   if(bn < NINDIRECT){
387     // Load indirect block, allocating if necessary.
388     if((addr = ip->addrs[NDIRECT]) == 0)
389       ip->addrs[NDIRECT] = addr = balloc(ip->dev);
390     bp = bread(ip->dev, addr);
391     a = (uint*)bp->data;
392     if((addr = a[bn]) == 0){
393       a[bn] = addr = balloc(ip->dev);
394       log_write(bp);
395     }
396     brelse(bp);
397     return addr;
398   }
399 
400   panic("bmap: out of range");
401 }
402 
403 // Truncate inode (discard contents).
404 // Only called when the inode has no links
405 // to it (no directory entries referring to it)
406 // and has no in-memory reference to it (is
407 // not an open file or current directory).
408 static void
409 itrunc(struct inode *ip)
410 {
411   int i, j;
412   struct buf *bp;
413   uint *a;
414 
415   for(i = 0; i < NDIRECT; i++){
416     if(ip->addrs[i]){
417       bfree(ip->dev, ip->addrs[i]);
418       ip->addrs[i] = 0;
419     }
420   }
421 
422   if(ip->addrs[NDIRECT]){
423     bp = bread(ip->dev, ip->addrs[NDIRECT]);
424     a = (uint*)bp->data;
425     for(j = 0; j < NINDIRECT; j++){
426       if(a[j])
427         bfree(ip->dev, a[j]);
428     }
429     brelse(bp);
430     bfree(ip->dev, ip->addrs[NDIRECT]);
431     ip->addrs[NDIRECT] = 0;
432   }
433 
434   ip->size = 0;
435   iupdate(ip);
436 }
437 
438 // Copy stat information from inode.
439 // Caller must hold ip->lock.
440 void
441 stati(struct inode *ip, struct stat *st)
442 {
443   st->dev = ip->dev;
444   st->ino = ip->inum;
445   st->type = ip->type;
446   st->nlink = ip->nlink;
447   st->size = ip->size;
448 }
449 
450 //PAGEBREAK!
451 // Read data from inode.
452 // Caller must hold ip->lock.
453 int
454 readi(struct inode *ip, char *dst, uint off, uint n)
455 {
456   uint tot, m;
457   struct buf *bp;
458 
459   if(ip->type == T_DEV){
460     if(ip->major < 0 || ip->major >= NDEV || !devsw[ip->major].read)
461       return -1;
462     return devsw[ip->major].read(ip, dst, n);
463   }
464 
465   if(off > ip->size || off + n < off)
466     return -1;
467   if(off + n > ip->size)
468     n = ip->size - off;
469 
470   for(tot=0; tot<n; tot+=m, off+=m, dst+=m){
471     bp = bread(ip->dev, bmap(ip, off/BSIZE));
472     m = min(n - tot, BSIZE - off%BSIZE);
473     memmove(dst, bp->data + off%BSIZE, m);
474     brelse(bp);
475   }
476   return n;
477 }
478 
479 // PAGEBREAK!
480 // Write data to inode.
481 // Caller must hold ip->lock.
482 int
483 writei(struct inode *ip, char *src, uint off, uint n)
484 {
485   uint tot, m;
486   struct buf *bp;
487 
488   if(ip->type == T_DEV){
489     if(ip->major < 0 || ip->major >= NDEV || !devsw[ip->major].write)
490       return -1;
491     return devsw[ip->major].write(ip, src, n);
492   }
493 
494   if(off > ip->size || off + n < off)
495     return -1;
496   if(off + n > MAXFILE*BSIZE)
497     return -1;
498 
499   for(tot=0; tot<n; tot+=m, off+=m, src+=m){
500     bp = bread(ip->dev, bmap(ip, off/BSIZE));
501     m = min(n - tot, BSIZE - off%BSIZE);
502     memmove(bp->data + off%BSIZE, src, m);
503     log_write(bp);
504     brelse(bp);
505   }
506 
507   if(n > 0 && off > ip->size){
508     ip->size = off;
509     iupdate(ip);
510   }
511   return n;
512 }
513 
514 //PAGEBREAK!
515 // Directories
516 
517 int
518 namecmp(const char *s, const char *t)
519 {
520   return strncmp(s, t, DIRSIZ);
521 }
522 
523 // Look for a directory entry in a directory.
524 // If found, set *poff to byte offset of entry.
525 struct inode*
526 dirlookup(struct inode *dp, char *name, uint *poff)
527 {
528   uint off, inum;
529   struct dirent de;
530 
531   if(dp->type != T_DIR)
532     panic("dirlookup not DIR");
533 
534   for(off = 0; off < dp->size; off += sizeof(de)){
535     if(readi(dp, (char*)&de, off, sizeof(de)) != sizeof(de))
536       panic("dirlookup read");
537     if(de.inum == 0)
538       continue;
539     if(namecmp(name, de.name) == 0){
540       // entry matches path element
541       if(poff)
542         *poff = off;
543       inum = de.inum;
544       return iget(dp->dev, inum);
545     }
546   }
547 
548   return 0;
549 }
550 
551 // Write a new directory entry (name, inum) into the directory dp.
552 int
553 dirlink(struct inode *dp, char *name, uint inum)
554 {
555   int off;
556   struct dirent de;
557   struct inode *ip;
558 
559   // Check that name is not present.
560   if((ip = dirlookup(dp, name, 0)) != 0){
561     iput(ip);
562     return -1;
563   }
564 
565   // Look for an empty dirent.
566   for(off = 0; off < dp->size; off += sizeof(de)){
567     if(readi(dp, (char*)&de, off, sizeof(de)) != sizeof(de))
568       panic("dirlink read");
569     if(de.inum == 0)
570       break;
571   }
572 
573   strncpy(de.name, name, DIRSIZ);
574   de.inum = inum;
575   if(writei(dp, (char*)&de, off, sizeof(de)) != sizeof(de))
576     panic("dirlink");
577 
578   return 0;
579 }
580 
581 //PAGEBREAK!
582 // Paths
583 
584 // Copy the next path element from path into name.
585 // Return a pointer to the element following the copied one.
586 // The returned path has no leading slashes,
587 // so the caller can check *path=='\0' to see if the name is the last one.
588 // If no name to remove, return 0.
589 //
590 // Examples:
591 //   skipelem("a/bb/c", name) = "bb/c", setting name = "a"
592 //   skipelem("///a//bb", name) = "bb", setting name = "a"
593 //   skipelem("a", name) = "", setting name = "a"
594 //   skipelem("", name) = skipelem("////", name) = 0
595 //
596 static char*
597 skipelem(char *path, char *name)
598 {
599   char *s;
600   int len;
601 
602   while(*path == '/')
603     path++;
604   if(*path == 0)
605     return 0;
606   s = path;
607   while(*path != '/' && *path != 0)
608     path++;
609   len = path - s;
610   if(len >= DIRSIZ)
611     memmove(name, s, DIRSIZ);
612   else {
613     memmove(name, s, len);
614     name[len] = 0;
615   }
616   while(*path == '/')
617     path++;
618   return path;
619 }
620 
621 // Look up and return the inode for a path name.
622 // If parent != 0, return the inode for the parent and copy the final
623 // path element into name, which must have room for DIRSIZ bytes.
624 // Must be called inside a transaction since it calls iput().
625 static struct inode*
626 namex(char *path, int nameiparent, char *name)
627 {
628   struct inode *ip, *next;
629 
630   if(*path == '/')
631     ip = iget(ROOTDEV, ROOTINO);
632   else
633     ip = idup(myproc()->cwd);
634 
635   while((path = skipelem(path, name)) != 0){
636     ilock(ip);
637     if(ip->type != T_DIR){
638       iunlockput(ip);
639       return 0;
640     }
641     if(nameiparent && *path == '\0'){
642       // Stop one level early.
643       iunlock(ip);
644       return ip;
645     }
646     if((next = dirlookup(ip, name, 0)) == 0){
647       iunlockput(ip);
648       return 0;
649     }
650     iunlockput(ip);
651     ip = next;
652   }
653   if(nameiparent){
654     iput(ip);
655     return 0;
656   }
657   return ip;
658 }
659 
660 struct inode*
661 namei(char *path)
662 {
663   char name[DIRSIZ];
664   return namex(path, 0, name);
665 }
666 
667 struct inode*
668 nameiparent(char *path, char *name)
669 {
670   return namex(path, 1, name);
671 }
672