xref: /xv6-public/fs.c (revision 82638c01)
1 // File system implementation.  Five layers:
2 //   + Blocks: allocator for raw disk blocks.
3 //   + Log: crash recovery for multi-step updates.
4 //   + Files: inode allocator, reading, writing, metadata.
5 //   + Directories: inode with special contents (list of other inodes!)
6 //   + Names: paths like /usr/rtm/xv6/fs.c for convenient naming.
7 //
8 // This file contains the low-level file system manipulation
9 // routines.  The (higher-level) system call implementations
10 // are in sysfile.c.
11 
12 #include "types.h"
13 #include "defs.h"
14 #include "param.h"
15 #include "stat.h"
16 #include "mmu.h"
17 #include "proc.h"
18 #include "spinlock.h"
19 #include "sleeplock.h"
20 #include "fs.h"
21 #include "buf.h"
22 #include "file.h"
23 
24 #define min(a, b) ((a) < (b) ? (a) : (b))
25 static void itrunc(struct inode*);
26 // there should be one superblock per disk device, but we run with
27 // only one device
28 struct superblock sb;
29 
30 // Read the super block.
31 void
readsb(int dev,struct superblock * sb)32 readsb(int dev, struct superblock *sb)
33 {
34   struct buf *bp;
35 
36   bp = bread(dev, 1);
37   memmove(sb, bp->data, sizeof(*sb));
38   brelse(bp);
39 }
40 
41 // Zero a block.
42 static void
bzero(int dev,int bno)43 bzero(int dev, int bno)
44 {
45   struct buf *bp;
46 
47   bp = bread(dev, bno);
48   memset(bp->data, 0, BSIZE);
49   log_write(bp);
50   brelse(bp);
51 }
52 
53 // Blocks.
54 
55 // Allocate a zeroed disk block.
56 static uint
balloc(uint dev)57 balloc(uint dev)
58 {
59   int b, bi, m;
60   struct buf *bp;
61 
62   bp = 0;
63   for(b = 0; b < sb.size; b += BPB){
64     bp = bread(dev, BBLOCK(b, sb));
65     for(bi = 0; bi < BPB && b + bi < sb.size; bi++){
66       m = 1 << (bi % 8);
67       if((bp->data[bi/8] & m) == 0){  // Is block free?
68         bp->data[bi/8] |= m;  // Mark block in use.
69         log_write(bp);
70         brelse(bp);
71         bzero(dev, b + bi);
72         return b + bi;
73       }
74     }
75     brelse(bp);
76   }
77   panic("balloc: out of blocks");
78 }
79 
80 // Free a disk block.
81 static void
bfree(int dev,uint b)82 bfree(int dev, uint b)
83 {
84   struct buf *bp;
85   int bi, m;
86 
87   bp = bread(dev, BBLOCK(b, sb));
88   bi = b % BPB;
89   m = 1 << (bi % 8);
90   if((bp->data[bi/8] & m) == 0)
91     panic("freeing free block");
92   bp->data[bi/8] &= ~m;
93   log_write(bp);
94   brelse(bp);
95 }
96 
97 // Inodes.
98 //
99 // An inode describes a single unnamed file.
100 // The inode disk structure holds metadata: the file's type,
101 // its size, the number of links referring to it, and the
102 // list of blocks holding the file's content.
103 //
104 // The inodes are laid out sequentially on disk at
105 // sb.startinode. Each inode has a number, indicating its
106 // position on the disk.
107 //
108 // The kernel keeps a cache of in-use inodes in memory
109 // to provide a place for synchronizing access
110 // to inodes used by multiple processes. The cached
111 // inodes include book-keeping information that is
112 // not stored on disk: ip->ref and ip->valid.
113 //
114 // An inode and its in-memory representation go through a
115 // sequence of states before they can be used by the
116 // rest of the file system code.
117 //
118 // * Allocation: an inode is allocated if its type (on disk)
119 //   is non-zero. ialloc() allocates, and iput() frees if
120 //   the reference and link counts have fallen to zero.
121 //
122 // * Referencing in cache: an entry in the inode cache
123 //   is free if ip->ref is zero. Otherwise ip->ref tracks
124 //   the number of in-memory pointers to the entry (open
125 //   files and current directories). iget() finds or
126 //   creates a cache entry and increments its ref; iput()
127 //   decrements ref.
128 //
129 // * Valid: the information (type, size, &c) in an inode
130 //   cache entry is only correct when ip->valid is 1.
131 //   ilock() reads the inode from
132 //   the disk and sets ip->valid, while iput() clears
133 //   ip->valid if ip->ref has fallen to zero.
134 //
135 // * Locked: file system code may only examine and modify
136 //   the information in an inode and its content if it
137 //   has first locked the inode.
138 //
139 // Thus a typical sequence is:
140 //   ip = iget(dev, inum)
141 //   ilock(ip)
142 //   ... examine and modify ip->xxx ...
143 //   iunlock(ip)
144 //   iput(ip)
145 //
146 // ilock() is separate from iget() so that system calls can
147 // get a long-term reference to an inode (as for an open file)
148 // and only lock it for short periods (e.g., in read()).
149 // The separation also helps avoid deadlock and races during
150 // pathname lookup. iget() increments ip->ref so that the inode
151 // stays cached and pointers to it remain valid.
152 //
153 // Many internal file system functions expect the caller to
154 // have locked the inodes involved; this lets callers create
155 // multi-step atomic operations.
156 //
157 // The icache.lock spin-lock protects the allocation of icache
158 // entries. Since ip->ref indicates whether an entry is free,
159 // and ip->dev and ip->inum indicate which i-node an entry
160 // holds, one must hold icache.lock while using any of those fields.
161 //
162 // An ip->lock sleep-lock protects all ip-> fields other than ref,
163 // dev, and inum.  One must hold ip->lock in order to
164 // read or write that inode's ip->valid, ip->size, ip->type, &c.
165 
166 struct {
167   struct spinlock lock;
168   struct inode inode[NINODE];
169 } icache;
170 
171 void
iinit(int dev)172 iinit(int dev)
173 {
174   int i = 0;
175 
176   initlock(&icache.lock, "icache");
177   for(i = 0; i < NINODE; i++) {
178     initsleeplock(&icache.inode[i].lock, "inode");
179   }
180 
181   readsb(dev, &sb);
182   cprintf("sb: size %d nblocks %d ninodes %d nlog %d logstart %d\
183  inodestart %d bmap start %d\n", sb.size, sb.nblocks,
184           sb.ninodes, sb.nlog, sb.logstart, sb.inodestart,
185           sb.bmapstart);
186 }
187 
188 static struct inode* iget(uint dev, uint inum);
189 
190 //PAGEBREAK!
191 // Allocate an inode on device dev.
192 // Mark it as allocated by  giving it type type.
193 // Returns an unlocked but allocated and referenced inode.
194 struct inode*
ialloc(uint dev,short type)195 ialloc(uint dev, short type)
196 {
197   int inum;
198   struct buf *bp;
199   struct dinode *dip;
200 
201   for(inum = 1; inum < sb.ninodes; inum++){
202     bp = bread(dev, IBLOCK(inum, sb));
203     dip = (struct dinode*)bp->data + inum%IPB;
204     if(dip->type == 0){  // a free inode
205       memset(dip, 0, sizeof(*dip));
206       dip->type = type;
207       log_write(bp);   // mark it allocated on the disk
208       brelse(bp);
209       return iget(dev, inum);
210     }
211     brelse(bp);
212   }
213   panic("ialloc: no inodes");
214 }
215 
216 // Copy a modified in-memory inode to disk.
217 // Must be called after every change to an ip->xxx field
218 // that lives on disk, since i-node cache is write-through.
219 // Caller must hold ip->lock.
220 void
iupdate(struct inode * ip)221 iupdate(struct inode *ip)
222 {
223   struct buf *bp;
224   struct dinode *dip;
225 
226   bp = bread(ip->dev, IBLOCK(ip->inum, sb));
227   dip = (struct dinode*)bp->data + ip->inum%IPB;
228   dip->type = ip->type;
229   dip->major = ip->major;
230   dip->minor = ip->minor;
231   dip->nlink = ip->nlink;
232   dip->size = ip->size;
233   memmove(dip->addrs, ip->addrs, sizeof(ip->addrs));
234   log_write(bp);
235   brelse(bp);
236 }
237 
238 // Find the inode with number inum on device dev
239 // and return the in-memory copy. Does not lock
240 // the inode and does not read it from disk.
241 static struct inode*
iget(uint dev,uint inum)242 iget(uint dev, uint inum)
243 {
244   struct inode *ip, *empty;
245 
246   acquire(&icache.lock);
247 
248   // Is the inode already cached?
249   empty = 0;
250   for(ip = &icache.inode[0]; ip < &icache.inode[NINODE]; ip++){
251     if(ip->ref > 0 && ip->dev == dev && ip->inum == inum){
252       ip->ref++;
253       release(&icache.lock);
254       return ip;
255     }
256     if(empty == 0 && ip->ref == 0)    // Remember empty slot.
257       empty = ip;
258   }
259 
260   // Recycle an inode cache entry.
261   if(empty == 0)
262     panic("iget: no inodes");
263 
264   ip = empty;
265   ip->dev = dev;
266   ip->inum = inum;
267   ip->ref = 1;
268   ip->valid = 0;
269   release(&icache.lock);
270 
271   return ip;
272 }
273 
274 // Increment reference count for ip.
275 // Returns ip to enable ip = idup(ip1) idiom.
276 struct inode*
idup(struct inode * ip)277 idup(struct inode *ip)
278 {
279   acquire(&icache.lock);
280   ip->ref++;
281   release(&icache.lock);
282   return ip;
283 }
284 
285 // Lock the given inode.
286 // Reads the inode from disk if necessary.
287 void
ilock(struct inode * ip)288 ilock(struct inode *ip)
289 {
290   struct buf *bp;
291   struct dinode *dip;
292 
293   if(ip == 0 || ip->ref < 1)
294     panic("ilock");
295 
296   acquiresleep(&ip->lock);
297 
298   if(ip->valid == 0){
299     bp = bread(ip->dev, IBLOCK(ip->inum, sb));
300     dip = (struct dinode*)bp->data + ip->inum%IPB;
301     ip->type = dip->type;
302     ip->major = dip->major;
303     ip->minor = dip->minor;
304     ip->nlink = dip->nlink;
305     ip->size = dip->size;
306     memmove(ip->addrs, dip->addrs, sizeof(ip->addrs));
307     brelse(bp);
308     ip->valid = 1;
309     if(ip->type == 0)
310       panic("ilock: no type");
311   }
312 }
313 
314 // Unlock the given inode.
315 void
iunlock(struct inode * ip)316 iunlock(struct inode *ip)
317 {
318   if(ip == 0 || !holdingsleep(&ip->lock) || ip->ref < 1)
319     panic("iunlock");
320 
321   releasesleep(&ip->lock);
322 }
323 
324 // Drop a reference to an in-memory inode.
325 // If that was the last reference, the inode cache entry can
326 // be recycled.
327 // If that was the last reference and the inode has no links
328 // to it, free the inode (and its content) on disk.
329 // All calls to iput() must be inside a transaction in
330 // case it has to free the inode.
331 void
iput(struct inode * ip)332 iput(struct inode *ip)
333 {
334   acquiresleep(&ip->lock);
335   if(ip->valid && ip->nlink == 0){
336     acquire(&icache.lock);
337     int r = ip->ref;
338     release(&icache.lock);
339     if(r == 1){
340       // inode has no links and no other references: truncate and free.
341       itrunc(ip);
342       ip->type = 0;
343       iupdate(ip);
344       ip->valid = 0;
345     }
346   }
347   releasesleep(&ip->lock);
348 
349   acquire(&icache.lock);
350   ip->ref--;
351   release(&icache.lock);
352 }
353 
354 // Common idiom: unlock, then put.
355 void
iunlockput(struct inode * ip)356 iunlockput(struct inode *ip)
357 {
358   iunlock(ip);
359   iput(ip);
360 }
361 
362 //PAGEBREAK!
363 // Inode content
364 //
365 // The content (data) associated with each inode is stored
366 // in blocks on the disk. The first NDIRECT block numbers
367 // are listed in ip->addrs[].  The next NINDIRECT blocks are
368 // listed in block ip->addrs[NDIRECT].
369 
370 // Return the disk block address of the nth block in inode ip.
371 // If there is no such block, bmap allocates one.
372 static uint
bmap(struct inode * ip,uint bn)373 bmap(struct inode *ip, uint bn)
374 {
375   uint addr, *a;
376   struct buf *bp;
377 
378   if(bn < NDIRECT){
379     if((addr = ip->addrs[bn]) == 0)
380       ip->addrs[bn] = addr = balloc(ip->dev);
381     return addr;
382   }
383   bn -= NDIRECT;
384 
385   if(bn < NINDIRECT){
386     // Load indirect block, allocating if necessary.
387     if((addr = ip->addrs[NDIRECT]) == 0)
388       ip->addrs[NDIRECT] = addr = balloc(ip->dev);
389     bp = bread(ip->dev, addr);
390     a = (uint*)bp->data;
391     if((addr = a[bn]) == 0){
392       a[bn] = addr = balloc(ip->dev);
393       log_write(bp);
394     }
395     brelse(bp);
396     return addr;
397   }
398 
399   panic("bmap: out of range");
400 }
401 
402 // Truncate inode (discard contents).
403 // Only called when the inode has no links
404 // to it (no directory entries referring to it)
405 // and has no in-memory reference to it (is
406 // not an open file or current directory).
407 static void
itrunc(struct inode * ip)408 itrunc(struct inode *ip)
409 {
410   int i, j;
411   struct buf *bp;
412   uint *a;
413 
414   for(i = 0; i < NDIRECT; i++){
415     if(ip->addrs[i]){
416       bfree(ip->dev, ip->addrs[i]);
417       ip->addrs[i] = 0;
418     }
419   }
420 
421   if(ip->addrs[NDIRECT]){
422     bp = bread(ip->dev, ip->addrs[NDIRECT]);
423     a = (uint*)bp->data;
424     for(j = 0; j < NINDIRECT; j++){
425       if(a[j])
426         bfree(ip->dev, a[j]);
427     }
428     brelse(bp);
429     bfree(ip->dev, ip->addrs[NDIRECT]);
430     ip->addrs[NDIRECT] = 0;
431   }
432 
433   ip->size = 0;
434   iupdate(ip);
435 }
436 
437 // Copy stat information from inode.
438 // Caller must hold ip->lock.
439 void
stati(struct inode * ip,struct stat * st)440 stati(struct inode *ip, struct stat *st)
441 {
442   st->dev = ip->dev;
443   st->ino = ip->inum;
444   st->type = ip->type;
445   st->nlink = ip->nlink;
446   st->size = ip->size;
447 }
448 
449 //PAGEBREAK!
450 // Read data from inode.
451 // Caller must hold ip->lock.
452 int
readi(struct inode * ip,char * dst,uint off,uint n)453 readi(struct inode *ip, char *dst, uint off, uint n)
454 {
455   uint tot, m;
456   struct buf *bp;
457 
458   if(ip->type == T_DEV){
459     if(ip->major < 0 || ip->major >= NDEV || !devsw[ip->major].read)
460       return -1;
461     return devsw[ip->major].read(ip, dst, n);
462   }
463 
464   if(off > ip->size || off + n < off)
465     return -1;
466   if(off + n > ip->size)
467     n = ip->size - off;
468 
469   for(tot=0; tot<n; tot+=m, off+=m, dst+=m){
470     bp = bread(ip->dev, bmap(ip, off/BSIZE));
471     m = min(n - tot, BSIZE - off%BSIZE);
472     memmove(dst, bp->data + off%BSIZE, m);
473     brelse(bp);
474   }
475   return n;
476 }
477 
478 // PAGEBREAK!
479 // Write data to inode.
480 // Caller must hold ip->lock.
481 int
writei(struct inode * ip,char * src,uint off,uint n)482 writei(struct inode *ip, char *src, uint off, uint n)
483 {
484   uint tot, m;
485   struct buf *bp;
486 
487   if(ip->type == T_DEV){
488     if(ip->major < 0 || ip->major >= NDEV || !devsw[ip->major].write)
489       return -1;
490     return devsw[ip->major].write(ip, src, n);
491   }
492 
493   if(off > ip->size || off + n < off)
494     return -1;
495   if(off + n > MAXFILE*BSIZE)
496     return -1;
497 
498   for(tot=0; tot<n; tot+=m, off+=m, src+=m){
499     bp = bread(ip->dev, bmap(ip, off/BSIZE));
500     m = min(n - tot, BSIZE - off%BSIZE);
501     memmove(bp->data + off%BSIZE, src, m);
502     log_write(bp);
503     brelse(bp);
504   }
505 
506   if(n > 0 && off > ip->size){
507     ip->size = off;
508     iupdate(ip);
509   }
510   return n;
511 }
512 
513 //PAGEBREAK!
514 // Directories
515 
516 int
namecmp(const char * s,const char * t)517 namecmp(const char *s, const char *t)
518 {
519   return strncmp(s, t, DIRSIZ);
520 }
521 
522 // Look for a directory entry in a directory.
523 // If found, set *poff to byte offset of entry.
524 struct inode*
dirlookup(struct inode * dp,char * name,uint * poff)525 dirlookup(struct inode *dp, char *name, uint *poff)
526 {
527   uint off, inum;
528   struct dirent de;
529 
530   if(dp->type != T_DIR)
531     panic("dirlookup not DIR");
532 
533   for(off = 0; off < dp->size; off += sizeof(de)){
534     if(readi(dp, (char*)&de, off, sizeof(de)) != sizeof(de))
535       panic("dirlookup read");
536     if(de.inum == 0)
537       continue;
538     if(namecmp(name, de.name) == 0){
539       // entry matches path element
540       if(poff)
541         *poff = off;
542       inum = de.inum;
543       return iget(dp->dev, inum);
544     }
545   }
546 
547   return 0;
548 }
549 
550 // Write a new directory entry (name, inum) into the directory dp.
551 int
dirlink(struct inode * dp,char * name,uint inum)552 dirlink(struct inode *dp, char *name, uint inum)
553 {
554   int off;
555   struct dirent de;
556   struct inode *ip;
557 
558   // Check that name is not present.
559   if((ip = dirlookup(dp, name, 0)) != 0){
560     iput(ip);
561     return -1;
562   }
563 
564   // Look for an empty dirent.
565   for(off = 0; off < dp->size; off += sizeof(de)){
566     if(readi(dp, (char*)&de, off, sizeof(de)) != sizeof(de))
567       panic("dirlink read");
568     if(de.inum == 0)
569       break;
570   }
571 
572   strncpy(de.name, name, DIRSIZ);
573   de.inum = inum;
574   if(writei(dp, (char*)&de, off, sizeof(de)) != sizeof(de))
575     panic("dirlink");
576 
577   return 0;
578 }
579 
580 //PAGEBREAK!
581 // Paths
582 
583 // Copy the next path element from path into name.
584 // Return a pointer to the element following the copied one.
585 // The returned path has no leading slashes,
586 // so the caller can check *path=='\0' to see if the name is the last one.
587 // If no name to remove, return 0.
588 //
589 // Examples:
590 //   skipelem("a/bb/c", name) = "bb/c", setting name = "a"
591 //   skipelem("///a//bb", name) = "bb", setting name = "a"
592 //   skipelem("a", name) = "", setting name = "a"
593 //   skipelem("", name) = skipelem("////", name) = 0
594 //
595 static char*
skipelem(char * path,char * name)596 skipelem(char *path, char *name)
597 {
598   char *s;
599   int len;
600 
601   while(*path == '/')
602     path++;
603   if(*path == 0)
604     return 0;
605   s = path;
606   while(*path != '/' && *path != 0)
607     path++;
608   len = path - s;
609   if(len >= DIRSIZ)
610     memmove(name, s, DIRSIZ);
611   else {
612     memmove(name, s, len);
613     name[len] = 0;
614   }
615   while(*path == '/')
616     path++;
617   return path;
618 }
619 
620 // Look up and return the inode for a path name.
621 // If parent != 0, return the inode for the parent and copy the final
622 // path element into name, which must have room for DIRSIZ bytes.
623 // Must be called inside a transaction since it calls iput().
624 static struct inode*
namex(char * path,int nameiparent,char * name)625 namex(char *path, int nameiparent, char *name)
626 {
627   struct inode *ip, *next;
628 
629   if(*path == '/')
630     ip = iget(ROOTDEV, ROOTINO);
631   else
632     ip = idup(myproc()->cwd);
633 
634   while((path = skipelem(path, name)) != 0){
635     ilock(ip);
636     if(ip->type != T_DIR){
637       iunlockput(ip);
638       return 0;
639     }
640     if(nameiparent && *path == '\0'){
641       // Stop one level early.
642       iunlock(ip);
643       return ip;
644     }
645     if((next = dirlookup(ip, name, 0)) == 0){
646       iunlockput(ip);
647       return 0;
648     }
649     iunlockput(ip);
650     ip = next;
651   }
652   if(nameiparent){
653     iput(ip);
654     return 0;
655   }
656   return ip;
657 }
658 
659 struct inode*
namei(char * path)660 namei(char *path)
661 {
662   char name[DIRSIZ];
663   return namex(path, 0, name);
664 }
665 
666 struct inode*
nameiparent(char * path,char * name)667 nameiparent(char *path, char *name)
668 {
669   return namex(path, 1, name);
670 }
671