xref: /minix/minix/fs/mfs/read.c (revision 7f5f010b)
1 #include "fs.h"
2 #include <stddef.h>
3 #include <string.h>
4 #include <stdlib.h>
5 #include <minix/com.h>
6 #include <minix/u64.h>
7 #include "buf.h"
8 #include "inode.h"
9 #include "super.h"
10 #include <minix/vfsif.h>
11 #include <minix/minlib.h>
12 #include <sys/param.h>
13 #include <assert.h>
14 
15 
16 static struct buf *rahead(struct inode *rip, block_t baseblock, u64_t
17 	position, unsigned bytes_ahead);
18 static int rw_chunk(struct inode *rip, u64_t position, unsigned off,
19 	size_t chunk, unsigned left, int rw_flag, cp_grant_id_t gid, unsigned
20 	buf_off, unsigned int block_size, int *completed);
21 
22 
23 /*===========================================================================*
24  *				fs_readwrite				     *
25  *===========================================================================*/
int fs_readwrite(void)
{
/* Handle a REQ_READ, REQ_WRITE or REQ_PEEK request from VFS.  The request
 * parameters (inode number, grant, seek position, byte count) are taken from
 * the global fs_m_in message; the reply (resulting position, bytes actually
 * transferred) is stored in fs_m_out.  Returns OK or a POSIX error code.
 */
  int r, rw_flag, block_spec;
  int regular;
  cp_grant_id_t gid;		/* grant giving access to the caller's buffer */
  off_t position, f_size, bytes_left;
  unsigned int off, cum_io, block_size, chunk;
  mode_t mode_word;
  int completed;		/* set by rw_chunk(); not used here */
  struct inode *rip;
  size_t nrbytes;

  r = OK;

  /* Find the inode referred */
  if ((rip = find_inode(fs_dev, fs_m_in.m_vfs_fs_readwrite.inode)) == NULL)
	return(EINVAL);

  /* 'regular' also covers named pipes; both grow i_size on write below. */
  mode_word = rip->i_mode & I_TYPE;
  regular = (mode_word == I_REGULAR || mode_word == I_NAMED_PIPE);
  block_spec = (mode_word == I_BLOCK_SPECIAL ? 1 : 0);

  /* Determine blocksize.  For a block device the "file" has no real size,
   * so pretend it is the maximum so the EOF checks never trigger.
   */
  if (block_spec) {
	block_size = get_block_size( (dev_t) rip->i_zone[0]);
	f_size = MAX_FILE_POS;
  } else {
  	block_size = rip->i_sp->s_block_size;
  	f_size = rip->i_size;
  }

  /* Get the values from the request message */
  switch(fs_m_in.m_type) {
  	case REQ_READ: rw_flag = READING; break;
  	case REQ_WRITE: rw_flag = WRITING; break;
  	case REQ_PEEK: rw_flag = PEEKING; break;
	default: panic("odd request");
  }
  gid = fs_m_in.m_vfs_fs_readwrite.grant;
  position = fs_m_in.m_vfs_fs_readwrite.seek_pos;
  nrbytes = fs_m_in.m_vfs_fs_readwrite.nbytes;

  lmfs_reset_rdwt_err();

  /* If this is file i/o, check we can write */
  if (rw_flag == WRITING && !block_spec) {
  	  if(rip->i_sp->s_rd_only)
		  return EROFS;

	  /* Check in advance to see if file will grow too big. */
	  if (position > (off_t) (rip->i_sp->s_max_size - nrbytes))
		  return(EFBIG);

	  /* Clear the zone containing present EOF if hole about
	   * to be created.  This is necessary because all unwritten
	   * blocks prior to the EOF must read as zeros.
	   */
	  if(position > f_size) clear_zone(rip, f_size, 0);
  }

  /* If this is block i/o, check we can write */
  if(block_spec && rw_flag == WRITING &&
  	(dev_t) rip->i_zone[0] == superblock.s_dev && superblock.s_rd_only)
		return EROFS;

  cum_io = 0;
  /* Split the transfer into chunks that don't span two blocks. */
  while (nrbytes > 0) {
	  off = ((unsigned int) position) % block_size; /* offset in blk*/
	  chunk = min(nrbytes, block_size - off);

	  if (rw_flag == READING) {
		  /* Clamp the chunk so a read never crosses EOF. */
		  bytes_left = f_size - position;
		  if (position >= f_size) break;	/* we are beyond EOF */
		  if (chunk > (unsigned int) bytes_left) chunk = bytes_left;
	  }

	  /* Read or write 'chunk' bytes. */
	  r = rw_chunk(rip, ((u64_t)((unsigned long)position)), off, chunk,
	  	       nrbytes, rw_flag, gid, cum_io, block_size, &completed);

	  if (r != OK) break;	/* EOF reached */
	  if (lmfs_rdwt_err() < 0) break;

	  /* Update counters and pointers. */
	  nrbytes -= chunk;	/* bytes yet to be read */
	  cum_io += chunk;	/* bytes read so far */
	  position += (off_t) chunk;	/* position within the file */
  }

  fs_m_out.m_fs_vfs_readwrite.seek_pos = position; /* It might change later and
						    the VFS has to know this
						    value */

  /* On write, update file size and access time. */
  if (rw_flag == WRITING) {
	  if (regular || mode_word == I_DIRECTORY) {
		  if (position > f_size) rip->i_size = position;
	  }
  }

  rip->i_seek = NO_SEEK;

  if (lmfs_rdwt_err() != OK) r = lmfs_rdwt_err();	/* check for disk error */
  if (lmfs_rdwt_err() == END_OF_FILE) r = OK;

  /* even on a ROFS, writing to a device node on it is fine,
   * just don't update the inode stats for it. And dito for reading.
   */
  if (r == OK && !rip->i_sp->s_rd_only) {
	  if (rw_flag == READING) rip->i_update |= ATIME;
	  if (rw_flag == WRITING) rip->i_update |= CTIME | MTIME;
	  IN_MARKDIRTY(rip);		/* inode is thus now dirty */
  }

  fs_m_out.m_fs_vfs_readwrite.nbytes = cum_io;

  return(r);
}
145 
146 
147 /*===========================================================================*
148  *				fs_breadwrite				     *
149  *===========================================================================*/
150 int fs_breadwrite(void)
151 {
152   int r, rw_flag, completed;
153   cp_grant_id_t gid;
154   u64_t position;
155   unsigned int off, cum_io, chunk, block_size;
156   size_t nrbytes;
157   dev_t target_dev;
158 
159   /* Pseudo inode for rw_chunk */
160   struct inode rip;
161 
162   r = OK;
163 
164   target_dev = fs_m_in.m_vfs_fs_breadwrite.device;
165 
166   /* Get the values from the request message */
167   rw_flag = (fs_m_in.m_type == REQ_BREAD ? READING : WRITING);
168   gid = fs_m_in.m_vfs_fs_breadwrite.grant;
169   position = fs_m_in.m_vfs_fs_breadwrite.seek_pos;
170   nrbytes = fs_m_in.m_vfs_fs_breadwrite.nbytes;
171 
172   block_size = get_block_size(target_dev);
173 
174   /* Don't block-write to a RO-mounted filesystem. */
175   if(superblock.s_dev == target_dev && superblock.s_rd_only)
176   	return EROFS;
177 
178   rip.i_zone[0] = (zone_t) target_dev;
179   rip.i_mode = I_BLOCK_SPECIAL;
180   rip.i_size = 0;
181 
182   lmfs_reset_rdwt_err();
183 
184   cum_io = 0;
185   /* Split the transfer into chunks that don't span two blocks. */
186   while (nrbytes > 0) {
187 	  off = (unsigned int)(position % block_size);	/* offset in blk*/
188 	  chunk = min(nrbytes, block_size - off);
189 
190 	  /* Read or write 'chunk' bytes. */
191 	  r = rw_chunk(&rip, position, off, chunk, nrbytes, rw_flag, gid,
192 	  	       cum_io, block_size, &completed);
193 
194 	  if (r != OK) break;	/* EOF reached */
195 	  if (lmfs_rdwt_err() < 0) break;
196 
197 	  /* Update counters and pointers. */
198 	  nrbytes -= chunk;	/* bytes yet to be read */
199 	  cum_io += chunk;	/* bytes read so far */
200 	  position += chunk;	/* position within the file */
201   }
202 
203   fs_m_out.m_fs_vfs_breadwrite.seek_pos = position;
204 
205   if (lmfs_rdwt_err() != OK) r = lmfs_rdwt_err();	/* check for disk error */
206   if (lmfs_rdwt_err() == END_OF_FILE) r = OK;
207 
208   fs_m_out.m_fs_vfs_breadwrite.nbytes = cum_io;
209 
210   return(r);
211 }
212 
213 
214 /*===========================================================================*
215  *				rw_chunk				     *
216  *===========================================================================*/
static int rw_chunk(rip, position, off, chunk, left, rw_flag, gid,
 buf_off, block_size, completed)
register struct inode *rip;	/* pointer to inode for file to be rd/wr */
u64_t position;			/* position within file to read or write */
unsigned off;			/* off within the current block */
unsigned int chunk;		/* number of bytes to read or write */
unsigned left;			/* max number of bytes wanted after position */
int rw_flag;			/* READING, WRITING or PEEKING */
cp_grant_id_t gid;		/* grant */
unsigned buf_off;		/* offset in grant */
unsigned int block_size;	/* block size of FS operating on */
int *completed;			/* number of bytes copied */
{
/* Read or write (part of) a block.  The chunk never spans a block boundary
 * (the callers split transfers accordingly).  Returns OK or an error code.
 */

  register struct buf *bp = NULL;
  register int r = OK;
  int n, block_spec;
  block_t b;			/* block number backing 'position' */
  dev_t dev;
  ino_t ino = VMC_NO_INODE;
  u64_t ino_off = rounddown(position, block_size);	/* block-aligned offset,
							 * key for the inode-
							 * indexed cache */

  /* rw_flag:
   *   READING: read from FS, copy to user
   *   WRITING: copy from user, write to FS
   *   PEEKING: try to get all the blocks into the cache, no copying
   */

  *completed = 0;

  block_spec = (rip->i_mode & I_TYPE) == I_BLOCK_SPECIAL;

  if (block_spec) {
	/* Block device: block number follows directly from the position. */
	b = (unsigned long)(position / block_size);
	dev = (dev_t) rip->i_zone[0];
  } else {
	/* Regular file: map the position through the zone/indirect tree. */
	if (ex64hi(position) != 0)
		panic("rw_chunk: position too high");
	b = read_map(rip, (off_t) ex64lo(position), 0);
	dev = rip->i_dev;
	ino = rip->i_num;
	assert(ino != VMC_NO_INODE);
  }

  if (!block_spec && b == NO_BLOCK) {
	/* The position falls in a hole: no block is allocated. */
	if (rw_flag == READING) {
		/* Reading from a nonexistent block.  Must read as all zeros.*/
		r = sys_safememset(VFS_PROC_NR, gid, (vir_bytes) buf_off,
			   0, (size_t) chunk);
		if(r != OK) {
			printf("MFS: sys_safememset failed\n");
		}
		return r;
	} else {
		/* Writing to or peeking a nonexistent block.
		 * Create and enter in inode.
		 */
		if ((bp = new_block(rip, (off_t) ex64lo(position))) == NULL)
			return(err_code);
	}
  } else if (rw_flag == READING || rw_flag == PEEKING) {
	/* Read and read ahead if convenient. */
	bp = rahead(rip, b, position, left);
  } else {
	/* Normally an existing block to be partially overwritten is first read
	 * in.  However, a full block need not be read in.  If it is already in
	 * the cache, acquire it, otherwise just acquire a free buffer.
	 */
	n = (chunk == block_size ? NO_READ : NORMAL);
	if (!block_spec && off == 0 && (off_t) ex64lo(position) >= rip->i_size)
		n = NO_READ;
	if(block_spec) {
		assert(ino == VMC_NO_INODE);
		bp = get_block(dev, b, n);
	} else {
		assert(ino != VMC_NO_INODE);
		assert(!(ino_off % block_size));
		bp = lmfs_get_block_ino(dev, b, n, ino, ino_off);
	}
  }

  /* In all cases, bp now points to a valid buffer. */
  assert(bp != NULL);

  /* A partial write past EOF into a freshly acquired (unread) buffer must
   * not expose stale buffer contents; clear it first.
   */
  if (rw_flag == WRITING && chunk != block_size && !block_spec &&
      (off_t) ex64lo(position) >= rip->i_size && off == 0) {
	zero_block(bp);
  }

  if (rw_flag == READING) {
	/* Copy a chunk from the block buffer to user space. */
	r = sys_safecopyto(VFS_PROC_NR, gid, (vir_bytes) buf_off,
			   (vir_bytes) (b_data(bp)+off), (size_t) chunk);
  } else if(rw_flag == WRITING) {
	/* Copy a chunk from user space to the block buffer. */
	r = sys_safecopyfrom(VFS_PROC_NR, gid, (vir_bytes) buf_off,
			     (vir_bytes) (b_data(bp)+off), (size_t) chunk);
	MARKDIRTY(bp);
  }

  /* Release the buffer, hinting the cache whether the whole block is data. */
  n = (off + chunk == block_size ? FULL_DATA_BLOCK : PARTIAL_DATA_BLOCK);
  put_block(bp, n);

  return(r);
}
323 
324 
325 /*===========================================================================*
326  *				read_map				     *
327  *===========================================================================*/
328 block_t read_map(rip, position, opportunistic)
329 register struct inode *rip;	/* ptr to inode to map from */
330 off_t position;			/* position in file whose blk wanted */
331 int opportunistic;		/* if nonzero, only use cache for metadata */
332 {
333 /* Given an inode and a position within the corresponding file, locate the
334  * block (not zone) number in which that position is to be found and return it.
335  */
336 
337   struct buf *bp;
338   zone_t z;
339   int scale, boff, index, zind;
340   unsigned int dzones, nr_indirects;
341   block_t b;
342   unsigned long excess, zone, block_pos;
343   int iomode = NORMAL;
344 
345   if(opportunistic) iomode = PREFETCH;
346 
347   scale = rip->i_sp->s_log_zone_size;	/* for block-zone conversion */
348   block_pos = position/rip->i_sp->s_block_size;	/* relative blk # in file */
349   zone = block_pos >> scale;	/* position's zone */
350   boff = (int) (block_pos - (zone << scale) ); /* relative blk # within zone */
351   dzones = rip->i_ndzones;
352   nr_indirects = rip->i_nindirs;
353 
354   /* Is 'position' to be found in the inode itself? */
355   if (zone < dzones) {
356 	zind = (int) zone;	/* index should be an int */
357 	z = rip->i_zone[zind];
358 	if (z == NO_ZONE) return(NO_BLOCK);
359 	b = (block_t) ((z << scale) + boff);
360 	return(b);
361   }
362 
363   /* It is not in the inode, so it must be single or double indirect. */
364   excess = zone - dzones;	/* first Vx_NR_DZONES don't count */
365 
366   if (excess < nr_indirects) {
367 	/* 'position' can be located via the single indirect block. */
368 	z = rip->i_zone[dzones];
369   } else {
370 	/* 'position' can be located via the double indirect block. */
371 	if ( (z = rip->i_zone[dzones+1]) == NO_ZONE) return(NO_BLOCK);
372 	excess -= nr_indirects;			/* single indir doesn't count*/
373 	b = (block_t) z << scale;
374 	ASSERT(rip->i_dev != NO_DEV);
375 	index = (int) (excess/nr_indirects);
376 	if ((unsigned int) index > rip->i_nindirs)
377 		return(NO_BLOCK);	/* Can't go beyond double indirects */
378 	bp = get_block(rip->i_dev, b, iomode); /* get double indirect block */
379 	if(opportunistic && lmfs_dev(bp) == NO_DEV) {
380 		put_block(bp, INDIRECT_BLOCK);
381 		return NO_BLOCK;
382 	}
383 	ASSERT(lmfs_dev(bp) != NO_DEV);
384 	ASSERT(lmfs_dev(bp) == rip->i_dev);
385 	z = rd_indir(bp, index);		/* z= zone for single*/
386 	put_block(bp, INDIRECT_BLOCK);		/* release double ind block */
387 	excess = excess % nr_indirects;		/* index into single ind blk */
388   }
389 
390   /* 'z' is zone num for single indirect block; 'excess' is index into it. */
391   if (z == NO_ZONE) return(NO_BLOCK);
392   b = (block_t) z << scale;			/* b is blk # for single ind */
393   bp = get_block(rip->i_dev, b, iomode);	/* get single indirect block */
394   if(opportunistic && lmfs_dev(bp) == NO_DEV) {
395 	put_block(bp, INDIRECT_BLOCK);
396 	return NO_BLOCK;
397   }
398   z = rd_indir(bp, (int) excess);		/* get block pointed to */
399   put_block(bp, INDIRECT_BLOCK);		/* release single indir blk */
400   if (z == NO_ZONE) return(NO_BLOCK);
401   b = (block_t) ((z << scale) + boff);
402   return(b);
403 }
404 
405 struct buf *get_block_map(register struct inode *rip, u64_t position)
406 {
407 	block_t b = read_map(rip, position, 0);	/* get block number */
408 	int block_size = get_block_size(rip->i_dev);
409 	if(b == NO_BLOCK)
410 		return NULL;
411 	position = rounddown(position, block_size);
412 	assert(rip->i_num != VMC_NO_INODE);
413 	return lmfs_get_block_ino(rip->i_dev, b, NORMAL, rip->i_num, position);
414 }
415 
416 /*===========================================================================*
417  *				rd_indir				     *
418  *===========================================================================*/
419 zone_t rd_indir(bp, index)
420 struct buf *bp;			/* pointer to indirect block */
421 int index;			/* index into *bp */
422 {
423   struct super_block *sp;
424   zone_t zone;
425 
426   if(bp == NULL)
427 	panic("rd_indir() on NULL");
428 
429   sp = get_super(lmfs_dev(bp));	/* need super block to find file sys type */
430 
431   /* read a zone from an indirect block */
432   assert(sp->s_version == V3);
433   zone = (zone_t) conv4(sp->s_native, (long) b_v2_ind(bp)[index]);
434 
435   if (zone != NO_ZONE &&
436 		(zone < (zone_t) sp->s_firstdatazone || zone >= sp->s_zones)) {
437 	printf("Illegal zone number %ld in indirect block, index %d\n",
438 	       (long) zone, index);
439 	panic("check file system");
440   }
441 
442   return(zone);
443 }
444 
445 /*===========================================================================*
446  *				rahead					     *
447  *===========================================================================*/
static struct buf *rahead(rip, baseblock, position, bytes_ahead)
register struct inode *rip;	/* pointer to inode for file to be read */
block_t baseblock;		/* block at current position */
u64_t position;			/* position within file */
unsigned bytes_ahead;		/* bytes beyond position for immediate use */
{
/* Fetch a block from the cache or the device.  If a physical read is
 * required, prefetch as many more blocks as convenient into the cache.
 * This usually covers bytes_ahead and is at least BLOCKS_MINIMUM.
 * The device driver may decide it knows better and stop reading at a
 * cylinder boundary (or after an error).  Rw_scattered() puts an optional
 * flag on all reads to allow this.
 */
/* Minimum number of blocks to prefetch. */
  int nr_bufs = lmfs_nr_bufs();
# define BLOCKS_MINIMUM		(nr_bufs < 50 ? 18 : 32)
  int block_spec, scale, read_q_size;
  unsigned int blocks_ahead, fragment, block_size;
  block_t block, blocks_left;
  off_t ind1_pos;
  dev_t dev;
  struct buf *bp;
  static unsigned int readqsize = 0;	/* current capacity of read_q */
  static struct buf **read_q;		/* scratch array of prefetch buffers,
					 * kept across calls */
  u64_t position_running;
  int inuse_before = lmfs_bufs_in_use();

  /* (Re)allocate the scratch queue if the cache size changed. */
  if(readqsize != nr_bufs) {
	if(readqsize > 0) {
		assert(read_q != NULL);
		free(read_q);
	}
	if(!(read_q = malloc(sizeof(read_q[0])*nr_bufs)))
		panic("couldn't allocate read_q");
	readqsize = nr_bufs;
  }

  block_spec = (rip->i_mode & I_TYPE) == I_BLOCK_SPECIAL;
  if (block_spec)
	dev = (dev_t) rip->i_zone[0];
  else
	dev = rip->i_dev;

  assert(dev != NO_DEV);

  block_size = get_block_size(dev);

  block = baseblock;

  /* Round the position down to a block boundary and widen the request so
   * the whole first block is covered.
   */
  fragment = position % block_size;
  position -= fragment;
  position_running = position;
  bytes_ahead += fragment;
  blocks_ahead = (bytes_ahead + block_size - 1) / block_size;

  if(block_spec)
	  bp = get_block(dev, block, PREFETCH);
  else
	  bp = lmfs_get_block_ino(dev, block, PREFETCH, rip->i_num, position);

  assert(bp != NULL);
  assert(bp->lmfs_count > 0);
  /* Already in the cache (valid device)?  Then no I/O is needed at all. */
  if (lmfs_dev(bp) != NO_DEV) return(bp);

  /* The best guess for the number of blocks to prefetch:  A lot.
   * It is impossible to tell what the device looks like, so we don't even
   * try to guess the geometry, but leave it to the driver.
   *
   * The floppy driver can read a full track with no rotational delay, and it
   * avoids reading partial tracks if it can, so handing it enough buffers to
   * read two tracks is perfect.  (Two, because some diskette types have
   * an odd number of sectors per track, so a block may span tracks.)
   *
   * The disk drivers don't try to be smart.  With todays disks it is
   * impossible to tell what the real geometry looks like, so it is best to
   * read as much as you can.  With luck the caching on the drive allows
   * for a little time to start the next read.
   *
   * The current solution below is a bit of a hack, it just reads blocks from
   * the current file position hoping that more of the file can be found.  A
   * better solution must look at the already available zone pointers and
   * indirect blocks (but don't call read_map!).
   */

  if (block_spec && rip->i_size == 0) {
	blocks_left = (block_t) NR_IOREQS;
  } else {
	blocks_left = (block_t) (rip->i_size-ex64lo(position)+(block_size-1)) /
								block_size;

	/* Go for the first indirect block if we are in its neighborhood. */
	if (!block_spec) {
		scale = rip->i_sp->s_log_zone_size;
		ind1_pos = (off_t) rip->i_ndzones * (block_size << scale);
		if ((off_t) ex64lo(position) <= ind1_pos &&
		     rip->i_size > ind1_pos) {
			blocks_ahead++;
			blocks_left++;
		}
	}
  }

  /* No more than the maximum request. */
  if (blocks_ahead > NR_IOREQS) blocks_ahead = NR_IOREQS;

  /* Read at least the minimum number of blocks, but not after a seek. */
  if (blocks_ahead < BLOCKS_MINIMUM && rip->i_seek == NO_SEEK)
	blocks_ahead = BLOCKS_MINIMUM;

  /* Can't go past end of file. */
  if (blocks_ahead > blocks_left) blocks_ahead = blocks_left;

  read_q_size = 0;

  /* Acquire block buffers. */
  for (;;) {
  	block_t thisblock;
	assert(bp->lmfs_count > 0);
	read_q[read_q_size++] = bp;

	if (--blocks_ahead == 0) break;

	/* Don't trash the cache, leave 4 free. */
	if (lmfs_bufs_in_use() >= nr_bufs - 4) break;

	block++;
	position_running += block_size;

	/* Map the next file position if possible (cache-only lookup); fall
	 * back to the naive "next physical block" guess otherwise.
	 */
	if(!block_spec &&
	  (thisblock = read_map(rip, (off_t) ex64lo(position_running), 1)) != NO_BLOCK) {
	  	bp = lmfs_get_block_ino(dev, thisblock, PREFETCH, rip->i_num, position_running);
	} else {
		bp = get_block(dev, block, PREFETCH);
	}
	assert(bp);
	assert(bp->lmfs_count > 0);
	if (lmfs_dev(bp) != NO_DEV) {
		/* Oops, block already in the cache, get out. */
		put_block(bp, FULL_DATA_BLOCK);
		break;
	}
  }
  /* Hand the whole queue to the driver in one scattered read. */
  lmfs_rw_scattered(dev, read_q, read_q_size, READING);

  assert(inuse_before == lmfs_bufs_in_use());

  /* Re-acquire the block originally asked for, now hopefully filled in. */
  if(block_spec)
	  return get_block(dev, baseblock, NORMAL);
  return(lmfs_get_block_ino(dev, baseblock, NORMAL, rip->i_num, position));
}
598 
599 
600 /*===========================================================================*
601  *				fs_getdents				     *
602  *===========================================================================*/
int fs_getdents(void)
{
/* Handle a REQ_GETDENTS request: convert on-disk MFS directory entries,
 * starting at the requested seek position, into struct dirent records and
 * copy them into the caller's grant.  Entries are staged in a local buffer
 * and flushed to the grant in batches.  Returns OK or an error code; the
 * byte count and next seek position are reported through fs_m_out.
 */
#define GETDENTS_BUFSIZE	(sizeof(struct dirent) + MFS_NAME_MAX + 1)
#define GETDENTS_ENTRIES	8
  static char getdents_buf[GETDENTS_BUFSIZE * GETDENTS_ENTRIES];
  register struct inode *rip;
  int o, r, done;
  unsigned int block_size, len, reclen;
  ino_t ino;
  cp_grant_id_t gid;
  size_t size, tmpbuf_off, userbuf_off;
  off_t pos, off, block_pos, new_pos, ent_pos;
  struct buf *bp;
  struct direct *dp;		/* on-disk directory entry */
  struct dirent *dep;		/* dirent record being built for the caller */
  char *cp;

  ino = fs_m_in.m_vfs_fs_getdents.inode;
  gid = fs_m_in.m_vfs_fs_getdents.grant;
  size = fs_m_in.m_vfs_fs_getdents.mem_size;
  pos = fs_m_in.m_vfs_fs_getdents.seek_pos;

  /* Check whether the position is properly aligned */
  if( (unsigned int) pos % DIR_ENTRY_SIZE)
	  return(ENOENT);

  if( (rip = get_inode(fs_dev, ino)) == NULL)
	  return(EINVAL);

  block_size = rip->i_sp->s_block_size;
  off = (pos % block_size);		/* Offset in block */
  block_pos = pos - off;
  done = FALSE;		/* Stop processing directory blocks when done is set */

  tmpbuf_off = 0;	/* Offset in getdents_buf */
  memset(getdents_buf, '\0', sizeof(getdents_buf));	/* Avoid leaking any data */
  userbuf_off = 0;	/* Offset in the user's buffer */

  /* The default position for the next request is EOF. If the user's buffer
   * fills up before EOF, new_pos will be modified. */
  new_pos = rip->i_size;

  for(; block_pos < rip->i_size; block_pos += block_size) {
	/* Since directories don't have holes, 'bp' cannot be NULL. */
	bp = get_block_map(rip, block_pos);	/* get a dir block */
	assert(bp != NULL);

	  /* Search a directory block.  In the first block, start at the
	   * entry containing the seek position; later blocks are scanned
	   * from the top.
	   */
	  if (block_pos < pos)
		  dp = &b_dir(bp)[off / DIR_ENTRY_SIZE];
	  else
		  dp = &b_dir(bp)[0];
	  for (; dp < &b_dir(bp)[NR_DIR_ENTRIES(block_size)]; dp++) {
		  if (dp->mfs_d_ino == 0)
			  continue;	/* Entry is not in use */

		  /* Compute the length of the name */
		  cp = memchr(dp->mfs_d_name, '\0', sizeof(dp->mfs_d_name));
		  if (cp == NULL)
			  len = sizeof(dp->mfs_d_name);
		  else
			  len = cp - (dp->mfs_d_name);

		  /* Compute record length; also does alignment. */
		  reclen = _DIRENT_RECLEN(dep, len);

		  /* Need the position of this entry in the directory */
		  ent_pos = block_pos + ((char *) dp - (char *) bp->data);

		if (userbuf_off + tmpbuf_off + reclen >= size) {
			  /* The user has no space for one more record */
			  done = TRUE;

			  /* Record the position of this entry, it is the
			   * starting point of the next request (unless the
			   * postion is modified with lseek).
			   */
			  new_pos = ent_pos;
			  break;
		}

		/* Staging buffer full: flush it to the grant first. */
		if (tmpbuf_off + reclen >= GETDENTS_BUFSIZE*GETDENTS_ENTRIES) {
			  r = sys_safecopyto(VFS_PROC_NR, gid,
			  		     (vir_bytes) userbuf_off,
					     (vir_bytes) getdents_buf,
					     (size_t) tmpbuf_off);
			  if (r != OK) {
			  	put_inode(rip);
			  	return(r);
			  }

			  userbuf_off += tmpbuf_off;
			  tmpbuf_off = 0;
		}

		/* Build the dirent record in the staging buffer. */
		dep = (struct dirent *) &getdents_buf[tmpbuf_off];
		dep->d_fileno = (ino_t) dp->mfs_d_ino;
		dep->d_reclen = (unsigned short) reclen;
		dep->d_namlen = len;
		memcpy(dep->d_name, dp->mfs_d_name, len);
		{
			/* d_type requires the entry's own inode mode. */
			struct inode *entrip;
			if(!(entrip = get_inode(fs_dev, dep->d_fileno)))
				panic("unexpected get_inode failure");
			dep->d_type = fs_mode_to_type(entrip->i_mode);
			put_inode(entrip);
		}
		dep->d_name[len] = '\0';
		tmpbuf_off += reclen;
	}

	put_block(bp, DIRECTORY_BLOCK);
	if (done)
		break;
  }

  /* Flush whatever remains in the staging buffer. */
  if (tmpbuf_off != 0) {
	r = sys_safecopyto(VFS_PROC_NR, gid, (vir_bytes) userbuf_off,
	  		     (vir_bytes) getdents_buf, (size_t) tmpbuf_off);
	if (r != OK) {
		put_inode(rip);
		return(r);
	}

	userbuf_off += tmpbuf_off;
  }

  if (done && userbuf_off == 0)
	  r = EINVAL;		/* The user's buffer is too small */
  else {
	  fs_m_out.m_fs_vfs_getdents.nbytes = userbuf_off;
	  fs_m_out.m_fs_vfs_getdents.seek_pos = new_pos;
	  if(!rip->i_sp->s_rd_only) {
		  rip->i_update |= ATIME;
		  IN_MARKDIRTY(rip);
	  }
	  r = OK;
  }

  put_inode(rip);		/* release the inode */
  return(r);
}
745 
746