1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /* Portions Copyright 2007 Shivakumar GN */
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/cmn_err.h>
31 #include <sys/debug.h>
32 #include <sys/dirent.h>
33 #include <sys/kmem.h>
34 #include <sys/mman.h>
35 #include <sys/mutex.h>
36 #include <sys/sysmacros.h>
37 #include <sys/systm.h>
38 #include <sys/sunddi.h>
39 #include <sys/uio.h>
40 #include <sys/vmsystm.h>
41 #include <sys/vfs.h>
42 #include <sys/vnode.h>
43 
44 #include <vm/as.h>
45 #include <vm/seg_vn.h>
46 
47 #include <sys/gfs.h>
48 
49 /*
50  * Generic pseudo-filesystem routines.
51  *
52  * There are significant similarities between the implementation of certain file
53  * system entry points across different filesystems.  While one could attempt to
54  * "choke up on the bat" and incorporate common functionality into a VOP
55  * preamble or postamble, such an approach is limited in the benefit it can
56  * provide.  In this file we instead define a toolkit of routines which can be
57  * called from a filesystem (with in-kernel pseudo-filesystems being the focus
58  * of the exercise) in a more component-like fashion.
59  *
60  * There are three basic classes of routines:
61  *
62  * 1) Lowlevel support routines
63  *
64  *    These routines are designed to play a support role for existing
65  *    pseudo-filesystems (such as procfs).  They simplify common tasks,
66  *    without forcing the filesystem to hand over management to GFS.  The
67  *    routines covered are:
68  *
69  *	gfs_readdir_init()
70  *	gfs_readdir_emit()
71  *	gfs_readdir_emitn()
72  *	gfs_readdir_pred()
73  *	gfs_readdir_fini()
74  *	gfs_lookup_dot()
75  *
76  * 2) Complete GFS management
77  *
78  *    These routines take a more active role in management of the
79  *    pseudo-filesystem.  They handle the relationship between vnode private
80  *    data and VFS data, as well as the relationship between vnodes in the
81  *    directory hierarchy.
82  *
83  *    In order to use these interfaces, the first member of every private
84  *    v_data must be a gfs_file_t or a gfs_dir_t.  This hands over all control
85  *    to GFS.
86  *
87  * 	gfs_file_create()
88  * 	gfs_dir_create()
89  * 	gfs_root_create()
90  *
91  *	gfs_file_inactive()
92  *	gfs_dir_inactive()
93  *	gfs_dir_lookup()
94  *	gfs_dir_readdir()
95  *
96  * 	gfs_vop_inactive()
97  * 	gfs_vop_lookup()
98  * 	gfs_vop_readdir()
99  * 	gfs_vop_map()
100  *
101  * 3) Single File pseudo-filesystems
102  *
 *    This routine creates a rooted file to be overlaid on top of another
104  *    file in the physical filespace.
105  *
106  *    Note that the parent is NULL (actually the vfs), but there is nothing
107  *    technically keeping such a file from utilizing the "Complete GFS
108  *    management" set of routines.
109  *
110  * 	gfs_root_create_file()
111  */
112 
113 /*
114  * gfs_make_opsvec: take an array of vnode type definitions and create
115  * their vnodeops_t structures
116  *
117  * This routine takes an array of gfs_opsvec_t's.  It could
118  * alternatively take an array of gfs_opsvec_t*'s, which would allow
119  * vnode types to be completely defined in files external to the caller
120  * of gfs_make_opsvec().  As it stands, much more sharing takes place --
121  * both the caller and the vnode type provider need to access gfsv_ops
122  * and gfsv_template, and the caller also needs to know gfsv_name.
123  */
#ifdef PORT_SOLARIS
int
gfs_make_opsvec(gfs_opsvec_t *vec)
{
	int i, error = 0;

	/* Build a vnodeops_t for each entry until the NULL-name sentinel. */
	for (i = 0; vec[i].gfsv_name != NULL; i++) {
		error = vn_make_ops(vec[i].gfsv_name, vec[i].gfsv_template,
		    vec[i].gfsv_ops);
		if (error != 0)
			break;
	}

	/* All entries built successfully. */
	if (vec[i].gfsv_name == NULL)
		return (0);

	/* One template failed; warn and unwind everything built so far. */
	cmn_err(CE_WARN, "gfs_make_opsvec: bad vnode ops template for '%s'",
	    vec[i].gfsv_name);
	while (--i >= 0) {
		vn_freevnodeops(*vec[i].gfsv_ops);
		*vec[i].gfsv_ops = NULL;
	}
	return (error);
}
#endif
148 
149 /*
150  * Low level directory routines
151  *
152  * These routines provide some simple abstractions for reading directories.
153  * They are designed to be used by existing pseudo filesystems (namely procfs)
154  * that already have a complicated management infrastructure.
155  */
156 
157 /*
158  * gfs_get_parent_ino: used to obtain a parent inode number and the
159  * inode number of the given vnode in preparation for calling gfs_readdir_init.
160  */
161 int
gfs_get_parent_ino(vnode_t * dvp,cred_t * cr,caller_context_t * ct,ino64_t * pino,ino64_t * ino)162 gfs_get_parent_ino(vnode_t *dvp, cred_t *cr, caller_context_t *ct,
163     ino64_t *pino, ino64_t *ino)
164 {
165 	vnode_t *parent;
166 	gfs_dir_t *dp = dvp->v_data;
167 	int error;
168 
169 	*ino = dp->gfsd_file.gfs_ino;
170 	parent = dp->gfsd_file.gfs_parent;
171 
172 	if (parent == NULL) {
173 		*pino = *ino;		/* root of filesystem */
174 	} else if (dvp->v_flag & V_XATTRDIR) {
175 		vattr_t va;
176 
177 		va.va_mask = AT_NODEID;
178 		error = VOP_GETATTR(parent, &va, 0, cr, ct);
179 		if (error)
180 			return (error);
181 		*pino = va.va_nodeid;
182 	} else {
183 		*pino = ((gfs_file_t *)(parent->v_data))->gfs_ino;
184 	}
185 
186 	return (0);
187 }
188 
189 /*
190  * gfs_readdir_init: initiate a generic readdir
191  *   st		- a pointer to an uninitialized gfs_readdir_state_t structure
192  *   name_max	- the directory's maximum file name length
193  *   ureclen	- the exported file-space record length (1 for non-legacy FSs)
194  *   uiop	- the uiop passed to readdir
195  *   parent	- the parent directory's inode
196  *   self	- this directory's inode
197  *   flags	- flags from VOP_READDIR
198  *
199  * Returns 0 or a non-zero errno.
200  *
201  * Typical VOP_READDIR usage of gfs_readdir_*:
202  *
203  *	if ((error = gfs_readdir_init(...)) != 0)
204  *		return (error);
205  *	eof = 0;
206  *	while ((error = gfs_readdir_pred(..., &voffset)) != 0) {
207  *		if (!consumer_entry_at(voffset))
208  *			voffset = consumer_next_entry(voffset);
209  *		if (consumer_eof(voffset)) {
210  *			eof = 1
211  *			break;
212  *		}
213  *		if ((error = gfs_readdir_emit(..., voffset,
214  *		    consumer_ino(voffset), consumer_name(voffset))) != 0)
215  *			break;
216  *	}
217  *	return (gfs_readdir_fini(..., error, eofp, eof));
218  *
219  * As you can see, a zero result from gfs_readdir_pred() or
220  * gfs_readdir_emit() indicates that processing should continue,
221  * whereas a non-zero result indicates that the loop should terminate.
222  * Most consumers need do nothing more than let gfs_readdir_fini()
223  * determine what the cause of failure was and return the appropriate
224  * value.
225  */
226 int
gfs_readdir_init(gfs_readdir_state_t * st,int name_max,int ureclen,uio_t * uiop,ino64_t parent,ino64_t self,int flags)227 gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen,
228     uio_t *uiop, ino64_t parent, ino64_t self, int flags)
229 {
230 	size_t dirent_size;
231 
232 	if (uiop->uio_loffset < 0 || uiop->uio_resid <= 0 ||
233 	    (uiop->uio_loffset % ureclen) != 0)
234 		return (EINVAL);
235 
236 	st->grd_ureclen = ureclen;
237 	st->grd_oresid = uiop->uio_resid;
238 	st->grd_namlen = name_max;
239 	if (flags & V_RDDIR_ENTFLAGS)
240 		dirent_size = EDIRENT_RECLEN(st->grd_namlen);
241 	else
242 		dirent_size = DIRENT64_RECLEN(st->grd_namlen);
243 	st->grd_dirent = kmem_zalloc(dirent_size, KM_SLEEP);
244 	st->grd_parent = parent;
245 	st->grd_self = self;
246 	st->grd_flags = flags;
247 
248 	return (0);
249 }
250 
251 /*
252  * gfs_readdir_emit_int: internal routine to emit directory entry
253  *
254  *   st		- the current readdir state, which must have d_ino/ed_ino
255  *		  and d_name/ed_name set
256  *   uiop	- caller-supplied uio pointer
257  *   next	- the offset of the next entry
258  */
static int
gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next)
{
	int reclen, namelen;
	dirent64_t *dp;
	edirent_t *edp;

	/*
	 * Size the record from the name the caller already stored in
	 * grd_dirent (see gfs_readdir_emit()).
	 */
	if (st->grd_flags & V_RDDIR_ENTFLAGS) {
		edp = st->grd_dirent;
		namelen = strlen(edp->ed_name);
		reclen = EDIRENT_RECLEN(strlen(edp->ed_name));
	} else {
		dp = st->grd_dirent;
		namelen = strlen(dp->d_name);
		reclen = DIRENT64_RECLEN(strlen(dp->d_name));
	}

	if (reclen > uiop->uio_resid) {
		/*
		 * Error if no entries were returned yet
		 */
		if (uiop->uio_resid == st->grd_oresid)
			return (EINVAL);
		/* Buffer full after at least one entry: end the loop. */
		return (-1);
	}

	if (st->grd_flags & V_RDDIR_ENTFLAGS) {
		edp->ed_off = next;
		edp->ed_reclen = (ushort_t)reclen;
	} else {
		/*
		 * NOTE(review): d_type is unconditionally DT_DIR, even for
		 * entries that may not be directories -- confirm this is
		 * intended for all GFS consumers.
		 */
		dp->d_reclen = (ushort_t)reclen;
		dp->d_type = DT_DIR;
		dp->d_namlen = namelen;
	}

	/* Copy the record out to the caller's buffer. */
	if (uiomove((caddr_t)st->grd_dirent, reclen, UIO_READ, uiop))
		return (EFAULT);

	/* Advance the directory offset to the next entry. */
	uiop->uio_loffset = next;

	return (0);
}
301 
302 /*
303  * gfs_readdir_emit: emit a directory entry
304  *   voff       - the virtual offset (obtained from gfs_readdir_pred)
305  *   ino        - the entry's inode
306  *   name       - the entry's name
307  *   eflags	- value for ed_eflags (if processing edirent_t)
308  *
309  * Returns a 0 on success, a non-zero errno on failure, or -1 if the
310  * readdir loop should terminate.  A non-zero result (either errno or
311  * -1) from this function is typically passed directly to
312  * gfs_readdir_fini().
313  */
314 int
gfs_readdir_emit(gfs_readdir_state_t * st,uio_t * uiop,offset_t voff,ino64_t ino,const char * name,int eflags)315 gfs_readdir_emit(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff,
316     ino64_t ino, const char *name, int eflags)
317 {
318 	offset_t off = (voff + 2) * st->grd_ureclen;
319 
320 	if (st->grd_flags & V_RDDIR_ENTFLAGS) {
321 		edirent_t *edp = st->grd_dirent;
322 
323 		edp->ed_ino = ino;
324 		(void) strncpy(edp->ed_name, name, st->grd_namlen);
325 		edp->ed_eflags = eflags;
326 	} else {
327 		dirent64_t *dp = st->grd_dirent;
328 
329 		dp->d_ino = ino;
330 		(void) strncpy(dp->d_name, name, st->grd_namlen);
331 	}
332 
333 	/*
334 	 * Inter-entry offsets are invalid, so we assume a record size of
335 	 * grd_ureclen and explicitly set the offset appropriately.
336 	 */
337 	return (gfs_readdir_emit_int(st, uiop, off + st->grd_ureclen));
338 }
339 
340 /*
341  * gfs_readdir_emitn: like gfs_readdir_emit(), but takes an integer
342  * instead of a string for the entry's name.
343  */
344 int
gfs_readdir_emitn(gfs_readdir_state_t * st,uio_t * uiop,offset_t voff,ino64_t ino,unsigned long num)345 gfs_readdir_emitn(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff,
346     ino64_t ino, unsigned long num)
347 {
348 	char buf[40];
349 
350 	numtos(num, buf);
351 	return (gfs_readdir_emit(st, uiop, voff, ino, buf, 0));
352 }
353 
354 /*
355  * gfs_readdir_pred: readdir loop predicate
356  *   voffp - a pointer in which the next virtual offset should be stored
357  *
358  * Returns a 0 on success, a non-zero errno on failure, or -1 if the
359  * readdir loop should terminate.  A non-zero result (either errno or
360  * -1) from this function is typically passed directly to
361  * gfs_readdir_fini().
362  */
363 int
gfs_readdir_pred(gfs_readdir_state_t * st,uio_t * uiop,offset_t * voffp)364 gfs_readdir_pred(gfs_readdir_state_t *st, uio_t *uiop, offset_t *voffp)
365 {
366 	offset_t off, voff;
367 	int error;
368 
369 top:
370 	if (uiop->uio_resid <= 0)
371 		return (-1);
372 
373 	off = uiop->uio_loffset / st->grd_ureclen;
374 	voff = off - 2;
375 	if (off == 0) {
376 		if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_self,
377 		    ".", 0)) == 0)
378 			goto top;
379 	} else if (off == 1) {
380 		if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_parent,
381 		    "..", 0)) == 0)
382 			goto top;
383 	} else {
384 		*voffp = voff;
385 		return (0);
386 	}
387 
388 	return (error);
389 }
390 
391 /*
392  * gfs_readdir_fini: generic readdir cleanup
393  *   error	- if positive, an error to return
394  *   eofp	- the eofp passed to readdir
395  *   eof	- the eof value
396  *
397  * Returns a 0 on success, a non-zero errno on failure.  This result
398  * should be returned from readdir.
399  */
400 int
gfs_readdir_fini(gfs_readdir_state_t * st,int error,int * eofp,int eof)401 gfs_readdir_fini(gfs_readdir_state_t *st, int error, int *eofp, int eof)
402 {
403 	size_t dirent_size;
404 
405 	if (st->grd_flags & V_RDDIR_ENTFLAGS)
406 		dirent_size = EDIRENT_RECLEN(st->grd_namlen);
407 	else
408 		dirent_size = DIRENT64_RECLEN(st->grd_namlen);
409 	kmem_free(st->grd_dirent, dirent_size);
410 	if (error > 0)
411 		return (error);
412 	if (eofp)
413 		*eofp = eof;
414 	return (0);
415 }
416 
417 /*
418  * gfs_lookup_dot
419  *
420  * Performs a basic check for "." and ".." directory entries.
421  */
422 int
gfs_lookup_dot(vnode_t ** vpp,vnode_t * dvp,vnode_t * pvp,const char * nm)423 gfs_lookup_dot(vnode_t **vpp, vnode_t *dvp, vnode_t *pvp, const char *nm)
424 {
425 	if (*nm == '\0' || strcmp(nm, ".") == 0) {
426 		VN_HOLD(dvp);
427 		*vpp = dvp;
428 		return (0);
429 	} else if (strcmp(nm, "..") == 0) {
430 		if (pvp == NULL) {
431 			ASSERT(dvp->v_flag & VROOT);
432 			VN_HOLD(dvp);
433 			*vpp = dvp;
434 		} else {
435 			VN_HOLD(pvp);
436 			*vpp = pvp;
437 		}
438 		return (0);
439 	}
440 
441 	return (-1);
442 }
443 
444 /*
445  * gfs_file_create(): create a new GFS file
446  *
447  *   size	- size of private data structure (v_data)
448  *   pvp	- parent vnode (GFS directory)
449  *   ops	- vnode operations vector
450  *
451  * In order to use this interface, the parent vnode must have been created by
452  * gfs_dir_create(), and the private data stored in v_data must have a
453  * 'gfs_file_t' as its first field.
454  *
455  * Given these constraints, this routine will automatically:
456  *
457  * 	- Allocate v_data for the vnode
458  * 	- Initialize necessary fields in the vnode
459  * 	- Hold the parent
460  */
461 vnode_t *
gfs_file_create(size_t size,vnode_t * pvp,vnodeops_t * ops)462 gfs_file_create(size_t size, vnode_t *pvp, vnodeops_t *ops)
463 {
464 	gfs_file_t *fp;
465 	vnode_t *vp;
466 	int error;
467 
468 	/*
469 	 * Allocate vnode and internal data structure
470 	 */
471 	fp = kmem_zalloc(size, KM_SLEEP);
472 	/* XXX FreeBSD adds vfs_t * as parameter to gfs_file_create and
473 	   gfs_dir_create */
474 	error = getnewvnode(VT_ZFS, pvp->v_vfsp, ops, &vp);
475 	ASSERT(error == 0);
476 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
477 
478 	/*
479 	 * Set up various pointers
480 	 */
481 	fp->gfs_vnode = vp;
482 	fp->gfs_parent = pvp;
483 	vp->v_data = fp;
484 	fp->gfs_size = size;
485 	fp->gfs_type = GFS_FILE;
486 
487 	/*
488 	 * Initialize vnode and hold parent.
489 	 */
490 	vn_setops(vp, ops);
491 	if (pvp) {
492 		VN_SET_VFS_TYPE_DEV(vp, pvp->v_vfsp, VREG, 0);
493 		VN_HOLD(pvp);
494 	}
495 
496 	return (vp);
497 }
498 
499 /*
500  * gfs_dir_create: creates a new directory in the parent
501  *
502  *   size	- size of private data structure (v_data)
503  *   pvp	- parent vnode (GFS directory)
504  *   ops	- vnode operations vector
505  *   entries	- NULL-terminated list of static entries (if any)
506  *   maxlen	- maximum length of a directory entry
507  *   readdir_cb	- readdir callback (see gfs_dir_readdir)
508  *   inode_cb	- inode callback (see gfs_dir_readdir)
509  *   lookup_cb	- lookup callback (see gfs_dir_lookup)
510  *
511  * In order to use this function, the first member of the private vnode
512  * structure (v_data) must be a gfs_dir_t.  For each directory, there are
513  * static entries, defined when the structure is initialized, and dynamic
514  * entries, retrieved through callbacks.
515  *
516  * If a directory has static entries, then it must supply a inode callback,
517  * which will compute the inode number based on the parent and the index.
518  * For a directory with dynamic entries, the caller must supply a readdir
519  * callback and a lookup callback.  If a static lookup fails, we fall back to
520  * the supplied lookup callback, if any.
521  *
522  * This function also performs the same initialization as gfs_file_create().
523  */
524 vnode_t *
gfs_dir_create(size_t struct_size,vnode_t * pvp,vnodeops_t * ops,gfs_dirent_t * entries,gfs_inode_cb inode_cb,int maxlen,gfs_readdir_cb readdir_cb,gfs_lookup_cb lookup_cb)525 gfs_dir_create(size_t struct_size, vnode_t *pvp, vnodeops_t *ops,
526     gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen,
527     gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb)
528 {
529 	vnode_t *vp;
530 	gfs_dir_t *dp;
531 	gfs_dirent_t *de;
532 
533 	vp = gfs_file_create(struct_size, pvp, ops);
534 	vp->v_type = VDIR;
535 
536 	dp = vp->v_data;
537 	dp->gfsd_file.gfs_type = GFS_DIR;
538 	dp->gfsd_maxlen = maxlen;
539 
540 	if (entries != NULL) {
541 		for (de = entries; de->gfse_name != NULL; de++)
542 			dp->gfsd_nstatic++;
543 
544 		dp->gfsd_static = kmem_alloc(
545 		    dp->gfsd_nstatic * sizeof (gfs_dirent_t), KM_SLEEP);
546 		bcopy(entries, dp->gfsd_static,
547 		    dp->gfsd_nstatic * sizeof (gfs_dirent_t));
548 	}
549 
550 	dp->gfsd_readdir = readdir_cb;
551 	dp->gfsd_lookup = lookup_cb;
552 	dp->gfsd_inode = inode_cb;
553 
554 	mutex_init(&dp->gfsd_lock, NULL, MUTEX_DEFAULT, NULL);
555 
556 	return (vp);
557 }
558 
559 /*
560  * gfs_root_create(): create a root vnode for a GFS filesystem
561  *
562  * Similar to gfs_dir_create(), this creates a root vnode for a filesystem.  The
563  * only difference is that it takes a vfs_t instead of a vnode_t as its parent.
564  */
565 vnode_t *
gfs_root_create(size_t size,vfs_t * vfsp,vnodeops_t * ops,ino64_t ino,gfs_dirent_t * entries,gfs_inode_cb inode_cb,int maxlen,gfs_readdir_cb readdir_cb,gfs_lookup_cb lookup_cb)566 gfs_root_create(size_t size, vfs_t *vfsp, vnodeops_t *ops, ino64_t ino,
567     gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen,
568     gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb)
569 {
570 	vnode_t *vp = gfs_dir_create(size, NULL, ops, entries, inode_cb,
571 	    maxlen, readdir_cb, lookup_cb);
572 
573 	/* Manually set the inode */
574 	((gfs_file_t *)vp->v_data)->gfs_ino = ino;
575 
576 	VFS_HOLD(vfsp);
577 	VN_SET_VFS_TYPE_DEV(vp, vfsp, VDIR, 0);
578 	vp->v_flag |= VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT;
579 
580 	return (vp);
581 }
582 
583 /*
584  * gfs_root_create_file(): create a root vnode for a GFS file as a filesystem
585  *
586  * Similar to gfs_root_create(), this creates a root vnode for a file to
587  * be the pseudo-filesystem.
588  */
589 vnode_t *
gfs_root_create_file(size_t size,vfs_t * vfsp,vnodeops_t * ops,ino64_t ino)590 gfs_root_create_file(size_t size, vfs_t *vfsp, vnodeops_t *ops, ino64_t ino)
591 {
592 	vnode_t	*vp = gfs_file_create(size, NULL, ops);
593 
594 	((gfs_file_t *)vp->v_data)->gfs_ino = ino;
595 
596 	VFS_HOLD(vfsp);
597 	VN_SET_VFS_TYPE_DEV(vp, vfsp, VREG, 0);
598 	vp->v_flag |= VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT;
599 
600 	return (vp);
601 }
602 
603 /*
604  * gfs_file_inactive()
605  *
606  * Called from the VOP_INACTIVE() routine.  If necessary, this routine will
607  * remove the given vnode from the parent directory and clean up any references
608  * in the VFS layer.
609  *
610  * If the vnode was not removed (due to a race with vget), then NULL is
611  * returned.  Otherwise, a pointer to the private data is returned.
612  */
613 void *
gfs_file_inactive(vnode_t * vp)614 gfs_file_inactive(vnode_t *vp)
615 {
616 	int i;
617 	gfs_dirent_t *ge = NULL;
618 	gfs_file_t *fp = vp->v_data;
619 	gfs_dir_t *dp = NULL;
620 	void *data;
621 
622 	if (fp->gfs_parent == NULL || (vp->v_flag & V_XATTRDIR))
623 		goto found;
624 
625 	dp = fp->gfs_parent->v_data;
626 
627 	/*
628 	 * First, see if this vnode is cached in the parent.
629 	 */
630 	gfs_dir_lock(dp);
631 
632 	/*
633 	 * Find it in the set of static entries.
634 	 */
635 	for (i = 0; i < dp->gfsd_nstatic; i++)  {
636 		ge = &dp->gfsd_static[i];
637 
638 		if (ge->gfse_vnode == vp)
639 			goto found;
640 	}
641 
642 	/*
643 	 * If 'ge' is NULL, then it is a dynamic entry.
644 	 */
645 	ge = NULL;
646 
647 found:
648 	if (vp->v_flag & V_XATTRDIR) {
649 		mutex_enter(&fp->gfs_parent->v_lock);
650 	}
651 	mutex_enter(&vp->v_lock);
652 #ifdef PORT_SOLARIS
653 	if (vp->v_count == 1) {
654 		/*
655 		 * Really remove this vnode
656 		 */
657 		data = vp->v_data;
658 		if (ge != NULL) {
659 			/*
660 			 * If this was a statically cached entry, simply set the
661 			 * cached vnode to NULL.
662 			 */
663 			ge->gfse_vnode = NULL;
664 		}
665 		if (vp->v_flag & V_XATTRDIR) {
666 			fp->gfs_parent->v_xattrdir = NULL;
667 			mutex_exit(&fp->gfs_parent->v_lock);
668 		}
669 		mutex_exit(&vp->v_lock);
670 
671 		/*
672 		 * Free vnode and release parent
673 		 */
674 		if (fp->gfs_parent) {
675 			if (dp) {
676 				gfs_dir_unlock(dp);
677 			}
678 			VN_RELE(fp->gfs_parent);
679 		} else {
680 			ASSERT(vp->v_vfsp != NULL);
681 			VFS_RELE(vp->v_vfsp);
682 		}
683 		vn_free(vp);
684 	} else {
685 		vp->v_count--;
686 		data = NULL;
687 		mutex_exit(&vp->v_lock);
688 		if (vp->v_flag & V_XATTRDIR) {
689 			mutex_exit(&fp->gfs_parent->v_lock);
690 		}
691 		if (dp)
692 			gfs_dir_unlock(dp);
693 	}
694 #endif
695 
696 	return (data);
697 }
698 
699 /*
700  * gfs_dir_inactive()
701  *
702  * Same as above, but for directories.
703  */
704 void *
gfs_dir_inactive(vnode_t * vp)705 gfs_dir_inactive(vnode_t *vp)
706 {
707 	gfs_dir_t *dp;
708 
709 	ASSERT(vp->v_type == VDIR);
710 
711 	if ((dp = gfs_file_inactive(vp)) != NULL) {
712 		mutex_destroy(&dp->gfsd_lock);
713 		if (dp->gfsd_nstatic)
714 			kmem_free(dp->gfsd_static,
715 			    dp->gfsd_nstatic * sizeof (gfs_dirent_t));
716 	}
717 
718 	return (dp);
719 }
720 
721 /*
722  * gfs_dir_lookup_dynamic()
723  *
724  * This routine looks up the provided name amongst the dynamic entries
725  * in the gfs directory and returns the corresponding vnode, if found.
726  *
727  * The gfs directory is expected to be locked by the caller prior to
728  * calling this function.  The directory will be unlocked during the
729  * execution of this function, but will be locked upon return from the
730  * function.  This function returns 0 on success, non-zero on error.
731  *
732  * The dynamic lookups are performed by invoking the lookup
733  * callback, which is passed to this function as the first argument.
734  * The arguments to the callback are:
735  *
736  * int gfs_lookup_cb(vnode_t *pvp, const char *nm, vnode_t **vpp, cred_t *cr,
737  *     int flags, int *deflgs, pathname_t *rpnp);
738  *
739  *	pvp	- parent vnode
740  *	nm	- name of entry
741  *	vpp	- pointer to resulting vnode
742  *	cr	- pointer to cred
743  *	flags	- flags value from lookup request
744  *		ignored here; currently only used to request
745  *		insensitive lookups
746  *	direntflgs - output parameter, directory entry flags
747  *		ignored here; currently only used to indicate a lookup
748  *		has more than one possible match when case is not considered
749  *	realpnp	- output parameter, real pathname
750  *		ignored here; when lookup was performed case-insensitively,
751  *		this field contains the "real" name of the file.
752  *
753  * 	Returns 0 on success, non-zero on error.
754  */
static int
gfs_dir_lookup_dynamic(gfs_lookup_cb callback, gfs_dir_t *dp,
    const char *nm, vnode_t *dvp, vnode_t **vpp, cred_t *cr, int flags,
    int *direntflags, pathname_t *realpnp)
{
	gfs_file_t *fp;
	ino64_t ino;
	int ret;

	ASSERT(GFS_DIR_LOCKED(dp));

	/*
	 * Drop the directory lock, as the lookup routine
	 * will need to allocate memory, or otherwise deadlock on this
	 * directory.
	 */
	gfs_dir_unlock(dp);
	ret = callback(dvp, nm, vpp, &ino, cr, flags, direntflags, realpnp);
	gfs_dir_lock(dp);

	/*
	 * The callback for extended attributes returns a vnode
	 * with v_data from an underlying fs.
	 */
	if (ret == 0 && !IS_XATTRDIR(dvp)) {
		/*
		 * Dynamic entries have no static index; record the inode
		 * the callback reported in the new vnode's gfs_file_t.
		 */
		fp = (gfs_file_t *)((*vpp)->v_data);
		fp->gfs_index = -1;
		fp->gfs_ino = ino;
	}

	return (ret);
}
787 
788 /*
789  * gfs_dir_lookup_static()
790  *
791  * This routine looks up the provided name amongst the static entries
792  * in the gfs directory and returns the corresponding vnode, if found.
793  * The first argument to the function is a pointer to the comparison
794  * function this function should use to decide if names are a match.
795  *
796  * If a match is found, and GFS_CACHE_VNODE is set and the vnode
797  * exists, we simply return the existing vnode.  Otherwise, we call
798  * the static entry's callback routine, caching the result if
799  * necessary.  If the idx pointer argument is non-NULL, we use it to
800  * return the index of the matching static entry.
801  *
802  * The gfs directory is expected to be locked by the caller prior to calling
803  * this function.  The directory may be unlocked during the execution of
804  * this function, but will be locked upon return from the function.
805  *
806  * This function returns 0 if a match is found, ENOENT if not.
807  */
static int
gfs_dir_lookup_static(int (*compare)(const char *, const char *),
    gfs_dir_t *dp, const char *nm, vnode_t *dvp, int *idx,
    vnode_t **vpp, pathname_t *rpnp)
{
	gfs_dirent_t *ge;
	vnode_t *vp = NULL;
	int i;

	ASSERT(GFS_DIR_LOCKED(dp));

	/*
	 * Search static entries.
	 */
	for (i = 0; i < dp->gfsd_nstatic; i++) {
		ge = &dp->gfsd_static[i];

		if (compare(ge->gfse_name, nm) == 0) {
			/*
			 * Report the canonical (stored) spelling of the name
			 * for case-insensitive lookups.
			 */
			if (rpnp)
				(void) strlcpy(rpnp->pn_buf, ge->gfse_name,
				    rpnp->pn_bufsize);

			/* Cached vnode: take a hold and return it directly. */
			if (ge->gfse_vnode) {
				ASSERT(ge->gfse_flags & GFS_CACHE_VNODE);
				vp = ge->gfse_vnode;
				VN_HOLD(vp);
				break;
			}

			/*
			 * We drop the directory lock, as the constructor will
			 * need to do KM_SLEEP allocations.  If we return from
			 * the constructor only to find that a parallel
			 * operation has completed, and GFS_CACHE_VNODE is set
			 * for this entry, we discard the result in favor of
			 * the cached vnode.
			 */
			gfs_dir_unlock(dp);
			vp = ge->gfse_ctor(dvp);
			gfs_dir_lock(dp);

			((gfs_file_t *)vp->v_data)->gfs_index = i;

			/* Set the inode according to the callback. */
			((gfs_file_t *)vp->v_data)->gfs_ino =
			    dp->gfsd_inode(dvp, i);

			if (ge->gfse_flags & GFS_CACHE_VNODE) {
				if (ge->gfse_vnode == NULL) {
					ge->gfse_vnode = vp;
				} else {
					/*
					 * A parallel constructor beat us to it;
					 * return existing vnode.  We have to be
					 * careful because we can't release the
					 * current vnode while holding the
					 * directory lock; its inactive routine
					 * will try to lock this directory.
					 */
					vnode_t *oldvp = vp;
					vp = ge->gfse_vnode;
					VN_HOLD(vp);

					gfs_dir_unlock(dp);
					VN_RELE(oldvp);
					gfs_dir_lock(dp);
				}
			}
			break;
		}
	}

	/* No static entry matched. */
	if (vp == NULL)
		return (ENOENT);
	else if (idx)
		*idx = i;
	*vpp = vp;
	return (0);
}
887 
888 /*
889  * gfs_dir_lookup()
890  *
891  * Looks up the given name in the directory and returns the corresponding
892  * vnode, if found.
893  *
894  * First, we search statically defined entries, if any, with a call to
895  * gfs_dir_lookup_static().  If no static entry is found, and we have
896  * a callback function we try a dynamic lookup via gfs_dir_lookup_dynamic().
897  *
898  * This function returns 0 on success, non-zero on error.
899  */
int
gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, cred_t *cr,
    int flags, int *direntflags, pathname_t *realpnp)
{
	gfs_dir_t *dp = dvp->v_data;
	boolean_t casecheck;
	vnode_t *dynvp = NULL;
	vnode_t *vp = NULL;
	int (*compare)(const char *, const char *);
	int error, idx;

	ASSERT(dvp->v_type == VDIR);

	/* "." and ".." are resolved without touching the entry tables. */
	if (gfs_lookup_dot(vpp, dvp, dp->gfsd_file.gfs_parent, nm) == 0)
		return (0);

	/*
	 * Case-conflict detection is only needed for case-insensitive
	 * lookups where the caller asked for directory entry flags.
	 */
	casecheck = (flags & FIGNORECASE) != 0 && direntflags != NULL;
	if (vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) ||
	    (flags & FIGNORECASE))
		compare = strcasecmp;
	else
		compare = strcmp;

	gfs_dir_lock(dp);

	error = gfs_dir_lookup_static(compare, dp, nm, dvp, &idx, &vp, realpnp);

	/*
	 * If a static match was found, scan the remaining static entries
	 * for a second case-insensitive match and flag the conflict.
	 */
	if (vp && casecheck) {
		gfs_dirent_t *ge;
		int i;

		for (i = idx + 1; i < dp->gfsd_nstatic; i++) {
			ge = &dp->gfsd_static[i];

			if (strcasecmp(ge->gfse_name, nm) == 0) {
				*direntflags |= ED_CASE_CONFLICT;
				goto out;
			}
		}
	}

	/*
	 * Try the dynamic lookup when the static one missed, or (for
	 * casecheck) to detect a static/dynamic case conflict.
	 */
	if ((error || casecheck) && dp->gfsd_lookup)
		error = gfs_dir_lookup_dynamic(dp->gfsd_lookup, dp, nm, dvp,
		    &dynvp, cr, flags, direntflags, vp ? NULL : realpnp);

	/* Merge the static and dynamic results. */
	if (vp && dynvp) {
		/* static and dynamic entries are case-insensitive conflict */
		ASSERT(casecheck);
		*direntflags |= ED_CASE_CONFLICT;
		VN_RELE(dynvp);
	} else if (vp == NULL) {
		vp = dynvp;
	} else if (error == ENOENT) {
		/* Static hit, dynamic miss: the static result stands. */
		error = 0;
	} else if (error) {
		/* Dynamic lookup failed hard; drop the static vnode. */
		VN_RELE(vp);
		vp = NULL;
	}

out:
	gfs_dir_unlock(dp);

	*vpp = vp;
	return (error);
}
965 
966 /*
967  * gfs_dir_readdir: does a readdir() on the given directory
968  *
969  *    dvp	- directory vnode
970  *    uiop	- uio structure
971  *    eofp	- eof pointer
972  *    data	- arbitrary data passed to readdir callback
973  *
974  * This routine does all the readdir() dirty work.  Even so, the caller must
975  * supply two callbacks in order to get full compatibility.
976  *
977  * If the directory contains static entries, an inode callback must be
978  * specified.  This avoids having to create every vnode and call VOP_GETATTR()
979  * when reading the directory.  This function has the following arguments:
980  *
981  *	ino_t gfs_inode_cb(vnode_t *vp, int index);
982  *
983  * 	vp	- vnode for the directory
984  * 	index	- index in original gfs_dirent_t array
985  *
986  * 	Returns the inode number for the given entry.
987  *
988  * For directories with dynamic entries, a readdir callback must be provided.
989  * This is significantly more complex, thanks to the particulars of
990  * VOP_READDIR().
991  *
992  *	int gfs_readdir_cb(vnode_t *vp, void *dp, int *eofp,
993  *	    offset_t *off, offset_t *nextoff, void *data, int flags)
994  *
995  *	vp	- directory vnode
996  *	dp	- directory entry, sized according to maxlen given to
997  *		  gfs_dir_create().  callback must fill in d_name and
998  *		  d_ino (if a dirent64_t), or ed_name, ed_ino, and ed_eflags
999  *		  (if an edirent_t). edirent_t is used if V_RDDIR_ENTFLAGS
1000  *		  is set in 'flags'.
1001  *	eofp	- callback must set to 1 when EOF has been reached
1002  *	off	- on entry, the last offset read from the directory.  Callback
1003  *		  must set to the offset of the current entry, typically left
1004  *		  untouched.
1005  *	nextoff	- callback must set to offset of next entry.  Typically
1006  *		  (off + 1)
1007  *	data	- caller-supplied data
1008  *	flags	- VOP_READDIR flags
1009  *
1010  *	Return 0 on success, or error on failure.
1011  */
int
gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, void *data, cred_t *cr,
    caller_context_t *ct, int flags)
{
	gfs_readdir_state_t gstate;
	int error, eof = 0;
	ino64_t ino, pino;
	offset_t off, next;
	gfs_dir_t *dp = dvp->v_data;

	/*
	 * Obtain the inode numbers of this directory and its parent; they
	 * are handed to gfs_readdir_init() below, presumably to synthesize
	 * the "." and ".." entries — confirm against gfs_readdir_init().
	 */
	error = gfs_get_parent_ino(dvp, cr, ct, &pino, &ino);
	if (error)
		return (error);

	if ((error = gfs_readdir_init(&gstate, dp->gfsd_maxlen, 1, uiop,
	    pino, ino, flags)) != 0)
		return (error);

	/*
	 * gfs_readdir_pred() yields the next entry offset to produce;
	 * loop until it reports an error/short uio, or we hit EOF.
	 */
	while ((error = gfs_readdir_pred(&gstate, uiop, &off)) == 0 &&
	    !eof) {

		if (off >= 0 && off < dp->gfsd_nstatic) {
			/*
			 * Static entry: the gfsd_inode callback supplies the
			 * inode number so we need not construct a vnode
			 * (see the inode-callback contract in the block
			 * comment above).
			 */
			ino = dp->gfsd_inode(dvp, off);

			if ((error = gfs_readdir_emit(&gstate, uiop,
			    off, ino, dp->gfsd_static[off].gfse_name, 0))
			    != 0)
				break;

		} else if (dp->gfsd_readdir) {
			/*
			 * Dynamic entry: the callback works in its own
			 * offset space, so translate by the number of
			 * static entries before and after the call.
			 */
			off -= dp->gfsd_nstatic;

			if ((error = dp->gfsd_readdir(dvp,
			    gstate.grd_dirent, &eof, &off, &next,
			    data, flags)) != 0 || eof)
				break;

			/*
			 * Convert back to raw directory offsets; the extra
			 * +2 appears to account for the "." and ".."
			 * slots — verify against gfs_readdir_emit(),
			 * which biases static offsets itself.
			 */
			off += dp->gfsd_nstatic + 2;
			next += dp->gfsd_nstatic + 2;

			if ((error = gfs_readdir_emit_int(&gstate, uiop,
			    next)) != 0)
				break;
		} else {
			/*
			 * Offset is beyond the end of the static entries, and
			 * we have no dynamic entries.  Set EOF.
			 */
			eof = 1;
		}
	}

	/* Flush any pending entry and report EOF/error to the caller. */
	return (gfs_readdir_fini(&gstate, error, eofp, eof));
}
1066 
1067 
1068 /*
1069  * gfs_vop_lookup: VOP_LOOKUP() entry point
1070  *
1071  * For use directly in vnode ops table.  Given a GFS directory, calls
1072  * gfs_dir_lookup() as necessary.
1073  */
1074 /* ARGSUSED */
1075 int
gfs_vop_lookup(vnode_t * dvp,char * nm,vnode_t ** vpp,pathname_t * pnp,int flags,vnode_t * rdir,cred_t * cr,caller_context_t * ct,int * direntflags,pathname_t * realpnp)1076 gfs_vop_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
1077     int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
1078     int *direntflags, pathname_t *realpnp)
1079 {
1080 	return (gfs_dir_lookup(dvp, nm, vpp, cr, flags, direntflags, realpnp));
1081 }
1082 
1083 /*
1084  * gfs_vop_readdir: VOP_READDIR() entry point
1085  *
1086  * For use directly in vnode ops table.  Given a GFS directory, calls
1087  * gfs_dir_readdir() as necessary.
1088  */
1089 /* ARGSUSED */
1090 int
gfs_vop_readdir(vnode_t * vp,uio_t * uiop,cred_t * cr,int * eofp,caller_context_t * ct,int flags)1091 gfs_vop_readdir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp,
1092     caller_context_t *ct, int flags)
1093 {
1094 	return (gfs_dir_readdir(vp, uiop, eofp, NULL, cr, ct, flags));
1095 }
1096 
1097 
/*
 * gfs_vop_map: VOP_MAP() entry point
 *
 * Convenient routine for handling pseudo-files that wish to allow mmap() calls.
 * This function only works for readonly files, and uses the read function for
 * the vnode to fill in the data.  The mapped data is immediately faulted in and
 * filled with the necessary data during this call; there are no getpage() or
 * putpage() routines.
 */
/* ARGSUSED */
#ifdef PORT_SOLARIS	/* compiled only on the Solaris/illumos port */
int
gfs_vop_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cred,
    caller_context_t *ct)
{
	int rv;
	ssize_t resid = len;	/* vn_rdwr() reports bytes NOT transferred */

	/*
	 * Check for bad parameters
	 */
#ifdef _ILP32
	/* On 32-bit kernels the mapping length must fit in an off_t. */
	if (len > MAXOFF_T)
		return (ENOMEM);
#endif
	if (vp->v_flag & VNOMAP)
		return (ENOTSUP);
	if (off > MAXOFF_T)
		return (EFBIG);
	/* Reject negative offsets and offset+length wraparound. */
	if ((long)off < 0 || (long)(off + len) < 0)
		return (EINVAL);
	if (vp->v_type != VREG)
		return (ENODEV);
	/* Read-only files: writable or executable mappings are refused. */
	if ((prot & (PROT_EXEC | PROT_WRITE)) != 0)
		return (EACCES);

	/*
	 * Find appropriate address if needed, otherwise clear address range.
	 */
	as_rangelock(as);
	rv = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
	if (rv != 0) {
		as_rangeunlock(as);
		return (rv);
	}

	/*
	 * Create mapping
	 */
	/* Zero-fill-on-demand segment; populated below via read(). */
	rv = as_map(as, *addrp, len, segvn_create, zfod_argsp);
	as_rangeunlock(as);
	if (rv != 0)
		return (rv);

	/*
	 * Fill with data from read()
	 */
	rv = vn_rdwr(UIO_READ, vp, *addrp, len, off, UIO_USERSPACE,
	    0, (rlim64_t)0, cred, &resid);

	/* A short read means the file cannot back the whole mapping. */
	if (rv == 0 && resid != 0)
		rv = ENXIO;

	if (rv != 0) {
		/* Tear down the mapping on any failure. */
		as_rangelock(as);
		(void) as_unmap(as, *addrp, len);
		as_rangeunlock(as);
	}

	return (rv);
}
#endif
1171 /*
1172  * gfs_vop_inactive: VOP_INACTIVE() entry point
1173  *
1174  * Given a vnode that is a GFS file or directory, call gfs_file_inactive() or
1175  * gfs_dir_inactive() as necessary, and kmem_free()s associated private data.
1176  */
1177 /* ARGSUSED */
1178 void
gfs_vop_inactive(vnode_t * vp,cred_t * cr,caller_context_t * ct)1179 gfs_vop_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
1180 {
1181 	gfs_file_t *fp = vp->v_data;
1182 	void *data;
1183 
1184 	if (fp->gfs_type == GFS_DIR)
1185 		data = gfs_dir_inactive(vp);
1186 	else
1187 		data = gfs_file_inactive(vp);
1188 
1189 	if (data != NULL)
1190 		kmem_free(data, fp->gfs_size);
1191 }
1192