xref: /illumos-gate/usr/src/uts/common/fs/nfs/nfs4_vnops.c (revision 80ab886d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  *	Copyright 1983,1984,1985,1986,1987,1988,1989 AT&T.
29  *	All Rights Reserved
30  */
31 
32 #pragma ident	"%Z%%M%	%I%	%E% SMI"
33 
34 #include <sys/param.h>
35 #include <sys/types.h>
36 #include <sys/systm.h>
37 #include <sys/cred.h>
38 #include <sys/time.h>
39 #include <sys/vnode.h>
40 #include <sys/vfs.h>
41 #include <sys/file.h>
42 #include <sys/filio.h>
43 #include <sys/uio.h>
44 #include <sys/buf.h>
45 #include <sys/mman.h>
46 #include <sys/pathname.h>
47 #include <sys/dirent.h>
48 #include <sys/debug.h>
49 #include <sys/vmsystm.h>
50 #include <sys/fcntl.h>
51 #include <sys/flock.h>
52 #include <sys/swap.h>
53 #include <sys/errno.h>
54 #include <sys/strsubr.h>
55 #include <sys/sysmacros.h>
56 #include <sys/kmem.h>
57 #include <sys/cmn_err.h>
58 #include <sys/pathconf.h>
59 #include <sys/utsname.h>
60 #include <sys/dnlc.h>
61 #include <sys/acl.h>
62 #include <sys/systeminfo.h>
63 #include <sys/policy.h>
64 #include <sys/sdt.h>
65 #include <sys/list.h>
66 #include <sys/stat.h>
67 
68 #include <rpc/types.h>
69 #include <rpc/auth.h>
70 #include <rpc/clnt.h>
71 
72 #include <nfs/nfs.h>
73 #include <nfs/nfs_clnt.h>
74 #include <nfs/nfs_acl.h>
75 #include <nfs/lm.h>
76 #include <nfs/nfs4.h>
77 #include <nfs/nfs4_kprot.h>
78 #include <nfs/rnode4.h>
79 #include <nfs/nfs4_clnt.h>
80 
81 #include <vm/hat.h>
82 #include <vm/as.h>
83 #include <vm/page.h>
84 #include <vm/pvn.h>
85 #include <vm/seg.h>
86 #include <vm/seg_map.h>
87 #include <vm/seg_kpm.h>
88 #include <vm/seg_vn.h>
89 
90 #include <fs/fs_subr.h>
91 
92 #include <sys/ddi.h>
93 #include <sys/int_fmtio.h>
94 
/*
 * Bundle of directory attribute information; a pointer to one of these
 * is the last argument of nfs4_update_dircaches() (see its prototype
 * in this file).
 *
 * NOTE(review): the per-field descriptions are inferred from the field
 * names and types only -- confirm against the code that fills them in.
 */
typedef struct {
	nfs4_ga_res_t	*di_garp;	/* presumably GETATTR results */
	cred_t		*di_cred;	/* presumably cred used for the call */
	hrtime_t	di_time_call;	/* presumably time the call was made */
} dirattr_info_t;
100 
/*
 * Direction of an ACL operation; passed as the second argument of
 * nfs4_is_acl_mask_valid().  The enumerator values are spelled out so
 * the numbering is visible at a glance; they match what the compiler
 * would assign implicitly.
 */
typedef enum nfs4_acl_op {
	NFS4_ACL_GET = 0,
	NFS4_ACL_SET = 1
} nfs4_acl_op_t;
105 
106 static void	nfs4_update_dircaches(change_info4 *, vnode_t *, vnode_t *,
107 			char *, dirattr_info_t *);
108 
109 static void	nfs4close_otw(rnode4_t *, cred_t *, nfs4_open_owner_t *,
110 		    nfs4_open_stream_t *, int *, int *, nfs4_close_type_t,
111 		    nfs4_error_t *, int *);
112 static int	nfs4_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int,
113 			cred_t *);
114 static int	nfs4write(vnode_t *, caddr_t, u_offset_t, int, cred_t *,
115 			stable_how4 *);
116 static int	nfs4read(vnode_t *, caddr_t, offset_t, int, size_t *,
117 			cred_t *, bool_t, struct uio *);
118 static int	nfs4setattr(vnode_t *, struct vattr *, int, cred_t *,
119 			vsecattr_t *);
120 static int	nfs4openattr(vnode_t *, vnode_t **, int, cred_t *);
121 static int	nfs4lookup(vnode_t *, char *, vnode_t **, cred_t *, int);
122 static int	nfs4lookup_xattr(vnode_t *, char *, vnode_t **, int, cred_t *);
123 static int	nfs4lookupvalidate_otw(vnode_t *, char *, vnode_t **, cred_t *);
124 static int	nfs4lookupnew_otw(vnode_t *, char *, vnode_t **, cred_t *);
125 static int	nfs4mknod(vnode_t *, char *, struct vattr *, enum vcexcl,
126 			int, vnode_t **, cred_t *);
127 static int	nfs4open_otw(vnode_t *, char *, struct vattr *, vnode_t **,
128 			cred_t *, int, int, enum createmode4, int);
129 static int	nfs4rename(vnode_t *, char *, vnode_t *, char *, cred_t *);
130 static int	nfs4rename_persistent_fh(vnode_t *, char *, vnode_t *,
131 			vnode_t *, char *, cred_t *, nfsstat4 *);
132 static int	nfs4rename_volatile_fh(vnode_t *, char *, vnode_t *,
133 			vnode_t *, char *, cred_t *, nfsstat4 *);
134 static int	do_nfs4readdir(vnode_t *, rddir4_cache *, cred_t *);
135 static void	nfs4readdir(vnode_t *, rddir4_cache *, cred_t *);
136 static int	nfs4_bio(struct buf *, stable_how4 *, cred_t *, bool_t);
137 static int	nfs4_getapage(vnode_t *, u_offset_t, size_t, uint_t *,
138 			page_t *[], size_t, struct seg *, caddr_t,
139 			enum seg_rw, cred_t *);
140 static void	nfs4_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *,
141 			cred_t *);
142 static int	nfs4_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t,
143 			int, cred_t *);
144 static int	nfs4_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t,
145 			int, cred_t *);
146 static int	nfs4_commit(vnode_t *, offset4, count4, cred_t *);
147 static void	nfs4_set_mod(vnode_t *);
148 static void	nfs4_get_commit(vnode_t *);
149 static void	nfs4_get_commit_range(vnode_t *, u_offset_t, size_t);
150 static int	nfs4_putpage_commit(vnode_t *, offset_t, size_t, cred_t *);
151 static int	nfs4_commit_vp(vnode_t *, u_offset_t, size_t, cred_t *, int);
152 static int	nfs4_sync_commit(vnode_t *, page_t *, offset3, count3,
153 			cred_t *);
154 static void	do_nfs4_async_commit(vnode_t *, page_t *, offset3, count3,
155 			cred_t *);
156 static int	nfs4_update_attrcache(nfsstat4, nfs4_ga_res_t *,
157 			hrtime_t, vnode_t *, cred_t *);
158 static int	nfs4_open_non_reg_file(vnode_t **, int, cred_t *);
159 static int	nfs4_safelock(vnode_t *, const struct flock64 *, cred_t *);
160 static void	nfs4_register_lock_locally(vnode_t *, struct flock64 *, int,
161 			u_offset_t);
162 static int 	nfs4_lockrelease(vnode_t *, int, offset_t, cred_t *);
163 static int	nfs4_block_and_wait(clock_t *, rnode4_t *);
164 static cred_t  *state_to_cred(nfs4_open_stream_t *);
165 static int	vtoname(vnode_t *, char *, ssize_t);
166 static void	denied_to_flk(LOCK4denied *, flock64_t *, LOCKT4args *);
167 static pid_t	lo_to_pid(lock_owner4 *);
168 static void	nfs4_reinstitute_local_lock_state(vnode_t *, flock64_t *,
169 			cred_t *, nfs4_lock_owner_t *);
170 static void	push_reinstate(vnode_t *, int, flock64_t *, cred_t *,
171 			nfs4_lock_owner_t *);
172 static nfs4_open_stream_t *open_and_get_osp(vnode_t *, cred_t *, mntinfo4_t *);
173 static void	nfs4_delmap_callback(struct as *, void *, uint_t);
174 static void	nfs4_free_delmapcall(nfs4_delmapcall_t *);
175 static nfs4_delmapcall_t	*nfs4_init_delmapcall();
176 static int	nfs4_find_and_delete_delmapcall(rnode4_t *, int *);
177 static int	nfs4_is_acl_mask_valid(uint_t, nfs4_acl_op_t);
178 static int	nfs4_create_getsecattr_return(vsecattr_t *, vsecattr_t *,
179 			uid_t, gid_t, int);
180 
181 /*
182  * Routines that implement the setting of v4 args for the misc. ops
183  */
184 static void	nfs4args_lock_free(nfs_argop4 *);
185 static void	nfs4args_lockt_free(nfs_argop4 *);
186 static void	nfs4args_setattr(nfs_argop4 *, vattr_t *, vsecattr_t *,
187 			int, rnode4_t *, cred_t *, bitmap4, int *,
188 			nfs4_stateid_types_t *);
189 static void	nfs4args_setattr_free(nfs_argop4 *);
190 static int	nfs4args_verify(nfs_argop4 *, vattr_t *, enum nfs_opnum4,
191 			bitmap4);
192 static void	nfs4args_verify_free(nfs_argop4 *);
193 static void	nfs4args_write(nfs_argop4 *, stable_how4, rnode4_t *, cred_t *,
194 			WRITE4args **, nfs4_stateid_types_t *);
195 
196 /*
197  * These are the vnode ops functions that implement the vnode interface to
198  * the networked file system.  See more comments below at nfs4_vnodeops.
199  */
200 static int	nfs4_open(vnode_t **, int, cred_t *);
201 static int	nfs4_close(vnode_t *, int, int, offset_t, cred_t *);
202 static int	nfs4_read(vnode_t *, struct uio *, int, cred_t *,
203 			caller_context_t *);
204 static int	nfs4_write(vnode_t *, struct uio *, int, cred_t *,
205 			caller_context_t *);
206 static int	nfs4_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *);
207 static int	nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *);
208 static int	nfs4_setattr(vnode_t *, struct vattr *, int, cred_t *,
209 			caller_context_t *);
210 static int	nfs4_access(vnode_t *, int, int, cred_t *);
211 static int	nfs4_readlink(vnode_t *, struct uio *, cred_t *);
212 static int	nfs4_fsync(vnode_t *, int, cred_t *);
213 static void	nfs4_inactive(vnode_t *, cred_t *);
214 static int	nfs4_lookup(vnode_t *, char *, vnode_t **,
215 			struct pathname *, int, vnode_t *, cred_t *);
216 static int	nfs4_create(vnode_t *, char *, struct vattr *, enum vcexcl,
217 			int, vnode_t **, cred_t *, int);
218 static int	nfs4_remove(vnode_t *, char *, cred_t *);
219 static int	nfs4_link(vnode_t *, vnode_t *, char *, cred_t *);
220 static int	nfs4_rename(vnode_t *, char *, vnode_t *, char *, cred_t *);
221 static int	nfs4_mkdir(vnode_t *, char *, struct vattr *,
222 			vnode_t **, cred_t *);
223 static int	nfs4_rmdir(vnode_t *, char *, vnode_t *, cred_t *);
224 static int	nfs4_symlink(vnode_t *, char *, struct vattr *, char *,
225 			cred_t *);
226 static int	nfs4_readdir(vnode_t *, struct uio *, cred_t *, int *);
227 static int	nfs4_fid(vnode_t *, fid_t *);
228 static int	nfs4_rwlock(vnode_t *, int, caller_context_t *);
229 static void	nfs4_rwunlock(vnode_t *, int, caller_context_t *);
230 static int	nfs4_seek(vnode_t *, offset_t, offset_t *);
231 static int	nfs4_getpage(vnode_t *, offset_t, size_t, uint_t *,
232 			page_t *[], size_t, struct seg *, caddr_t,
233 			enum seg_rw, cred_t *);
234 static int	nfs4_putpage(vnode_t *, offset_t, size_t, int, cred_t *);
235 static int	nfs4_map(vnode_t *, offset_t, struct as *, caddr_t *,
236 			size_t, uchar_t, uchar_t, uint_t, cred_t *);
237 static int	nfs4_addmap(vnode_t *, offset_t, struct as *, caddr_t,
238 			size_t, uchar_t, uchar_t, uint_t, cred_t *);
239 static int	nfs4_cmp(vnode_t *, vnode_t *);
240 static int	nfs4_frlock(vnode_t *, int, struct flock64 *, int, offset_t,
241 			struct flk_callback *, cred_t *);
242 static int	nfs4_space(vnode_t *, int, struct flock64 *, int, offset_t,
243 			cred_t *, caller_context_t *);
244 static int	nfs4_realvp(vnode_t *, vnode_t **);
245 static int	nfs4_delmap(vnode_t *, offset_t, struct as *, caddr_t,
246 			size_t, uint_t, uint_t, uint_t, cred_t *);
247 static int	nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *);
248 static int	nfs4_pageio(vnode_t *, page_t *, u_offset_t, size_t, int,
249 			cred_t *);
250 static void	nfs4_dispose(vnode_t *, page_t *, int, int, cred_t *);
251 static int	nfs4_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *);
252 static int	nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *);
253 static int	nfs4_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *);
254 
/*
 * Passed to nfs4_commit_vp() to say whether it should wait on
 * pending writes.
 */
#define	NFS4_WRITE_NOWAIT	0
#define	NFS4_WRITE_WAIT		1

#define	NFS4_BASE_WAIT_TIME 1	/* 1 second */

/*
 * Error flags used to pass information about certain special errors
 * which need to be handled specially.  The negative values are
 * parenthesized so they stay a single operand wherever they expand.
 */
#define	NFS_EOF			(-98)
#define	NFS_VERF_MISMATCH	(-97)

/*
 * Identify which operation drove the potential CLOSE OTW
 * (see nfs4_close_otw_if_necessary).
 */
#define	NFS4_CLOSE_OP		0x1
#define	NFS4_DELMAP_OP		0x2
#define	NFS4_INACTIVE_OP	0x3
278 
/*
 * Non-zero when vnode type t is VBLK, VCHR or VFIFO.
 *
 * The argument is parenthesized at every use so that an expression
 * argument (for example a conditional expression) is compared as a
 * whole.  Note that t may still be evaluated up to three times, so it
 * must not have side effects.
 */
#define	ISVDEV(t) (((t) == VBLK) || ((t) == VCHR) || ((t) == VFIFO))
280 
/*
 * ALIGN64 advances ptr to the next 64-bit (sizeof (uint64_t)) boundary
 * and shrinks sz by the number of units skipped.
 *
 *	x	scratch lvalue; on exit holds the adjustment applied
 *		(0 if ptr was already aligned)
 *	ptr	pointer lvalue, updated in place
 *	sz	size lvalue, updated in place
 *
 * The adjustment is applied with "ptr += (x)", i.e. in units of the
 * pointed-to type, so the result is only byte-aligned as intended when
 * ptr points to a one-byte type.
 *
 * The macro expands to an assignment followed by an if statement and is
 * not wrapped in do { } while (0); it must therefore not be used as the
 * unbraced body of an if/else/loop.
 */
#define	ALIGN64(x, ptr, sz)						\
	x = ((uintptr_t)(ptr)) & (sizeof (uint64_t) - 1);		\
	if (x) {							\
		x = sizeof (uint64_t) - (x);				\
		sz -= (x);						\
		ptr += (x);						\
	}
289 
/*
 * Debug-only variables; none of these exist unless the file is built
 * with DEBUG defined.  Several of them (e.g. nfs4_client_state_debug,
 * nfs4_lost_rqst_debug) are passed as the first argument of NFS4_DEBUG()
 * in this file.  The non-static ones have external linkage.
 */
#ifdef DEBUG
int nfs4_client_attr_debug = 0;
int nfs4_client_state_debug = 0;
int nfs4_client_shadow_debug = 0;
int nfs4_client_lock_debug = 0;
int nfs4_seqid_sync = 0;
int nfs4_client_map_debug = 0;
static int nfs4_pageio_debug = 0;
int nfs4_client_inactive_debug = 0;
int nfs4_client_recov_debug = 0;
int nfs4_client_recov_stub_debug = 0;
int nfs4_client_failover_debug = 0;
int nfs4_client_call_debug = 0;
int nfs4_client_lookup_debug = 0;
int nfs4_client_zone_debug = 0;
int nfs4_lost_rqst_debug = 0;
int nfs4_rdattrerr_debug = 0;
int nfs4_open_stream_debug = 0;

/* No explicit initializer: starts out as 0 like the others. */
int nfs4read_error_inject;

static int nfs4_create_misses = 0;

static int nfs4_readdir_cache_shorts = 0;
static int nfs4_readdir_readahead = 0;

static int nfs4_bio_do_stop = 0;

static int nfs4_lostpage = 0;	/* number of times we lost original page */

int nfs4_mmap_debug = 0;

static int nfs4_pathconf_cache_hits = 0;
static int nfs4_pathconf_cache_misses = 0;

/* No explicit initializer: starts out as 0. */
int nfs4close_all_cnt;
int nfs4close_one_debug = 0;
int nfs4close_notw_debug = 0;

int denied_to_flk_debug = 0;
void *lockt_denied_debug;

#endif
333 
/*
 * How long to wait before trying again if OPEN_CONFIRM gets ETIMEDOUT
 * or NFS4ERR_RESOURCE.  Going by the name, the unit is seconds.
 */
static int confirm_retry_sec = 30;

/*
 * NOTE(review): presumably non-zero enables negative name-lookup
 * caching; not referenced in the visible part of the file -- confirm.
 */
static int nfs4_lookup_neg_cache = 1;

/*
 * number of pages to read ahead
 * optimized for 100 base-T.
 */
static int nfs4_nra = 4;

/*
 * NOTE(review): presumably non-zero enables caching of symlink
 * contents -- confirm against nfs4_readlink().
 */
static int nfs4_do_symlink_cache = 1;

/*
 * NOTE(review): presumably non-zero turns the pathconf cache off --
 * confirm against nfs4_pathconf().
 */
static int nfs4_pathconf_disable_cache = 0;
351 
352 /*
353  * These are the vnode ops routines which implement the vnode interface to
354  * the networked file system.  These routines just take their parameters,
355  * make them look networkish by putting the right info into interface structs,
356  * and then calling the appropriate remote routine(s) to do the work.
357  *
358  * Note on directory name lookup cacheing:  If we detect a stale fhandle,
359  * we purge the directory cache relative to that vnode.  This way, the
360  * user won't get burned by the cache repeatedly.  See <nfs/rnode4.h> for
361  * more details on rnode locking.
362  */
363 
364 struct vnodeops *nfs4_vnodeops;
365 
366 const fs_operation_def_t nfs4_vnodeops_template[] = {
367 	VOPNAME_OPEN, nfs4_open,
368 	VOPNAME_CLOSE, nfs4_close,
369 	VOPNAME_READ, nfs4_read,
370 	VOPNAME_WRITE, nfs4_write,
371 	VOPNAME_IOCTL, nfs4_ioctl,
372 	VOPNAME_GETATTR, nfs4_getattr,
373 	VOPNAME_SETATTR, nfs4_setattr,
374 	VOPNAME_ACCESS, nfs4_access,
375 	VOPNAME_LOOKUP, nfs4_lookup,
376 	VOPNAME_CREATE, nfs4_create,
377 	VOPNAME_REMOVE, nfs4_remove,
378 	VOPNAME_LINK, nfs4_link,
379 	VOPNAME_RENAME, nfs4_rename,
380 	VOPNAME_MKDIR, nfs4_mkdir,
381 	VOPNAME_RMDIR, nfs4_rmdir,
382 	VOPNAME_READDIR, nfs4_readdir,
383 	VOPNAME_SYMLINK, nfs4_symlink,
384 	VOPNAME_READLINK, nfs4_readlink,
385 	VOPNAME_FSYNC, nfs4_fsync,
386 	VOPNAME_INACTIVE, (fs_generic_func_p) nfs4_inactive,
387 	VOPNAME_FID, nfs4_fid,
388 	VOPNAME_RWLOCK, nfs4_rwlock,
389 	VOPNAME_RWUNLOCK, (fs_generic_func_p) nfs4_rwunlock,
390 	VOPNAME_SEEK, nfs4_seek,
391 	VOPNAME_FRLOCK, nfs4_frlock,
392 	VOPNAME_SPACE, nfs4_space,
393 	VOPNAME_REALVP, nfs4_realvp,
394 	VOPNAME_GETPAGE, nfs4_getpage,
395 	VOPNAME_PUTPAGE, nfs4_putpage,
396 	VOPNAME_MAP, (fs_generic_func_p) nfs4_map,
397 	VOPNAME_ADDMAP, (fs_generic_func_p) nfs4_addmap,
398 	VOPNAME_DELMAP, nfs4_delmap,
399 	VOPNAME_DUMP, nfs_dump,		/* there is no separate nfs4_dump */
400 	VOPNAME_PATHCONF, nfs4_pathconf,
401 	VOPNAME_PAGEIO, nfs4_pageio,
402 	VOPNAME_DISPOSE, (fs_generic_func_p) nfs4_dispose,
403 	VOPNAME_SETSECATTR, nfs4_setsecattr,
404 	VOPNAME_GETSECATTR, nfs4_getsecattr,
405 	VOPNAME_SHRLOCK, nfs4_shrlock,
406 	NULL, NULL
407 };
408 
409 /*
410  * The following are subroutines and definitions to set args or get res
411  * for the different nfsv4 ops
412  */
413 
414 void
415 nfs4args_lookup_free(nfs_argop4 *argop, int arglen)
416 {
417 	int i;
418 
419 	for (i = 0; i < arglen; i++) {
420 	    if (argop[i].argop == OP_LOOKUP)
421 		kmem_free(
422 			argop[i].nfs_argop4_u.oplookup.objname.utf8string_val,
423 			argop[i].nfs_argop4_u.oplookup.objname.utf8string_len);
424 	}
425 }
426 
427 static void
428 nfs4args_lock_free(nfs_argop4 *argop)
429 {
430 	locker4 *locker = &argop->nfs_argop4_u.oplock.locker;
431 
432 	if (locker->new_lock_owner == TRUE) {
433 		open_to_lock_owner4 *open_owner;
434 
435 		open_owner = &locker->locker4_u.open_owner;
436 		if (open_owner->lock_owner.owner_val != NULL) {
437 			kmem_free(open_owner->lock_owner.owner_val,
438 				open_owner->lock_owner.owner_len);
439 		}
440 	}
441 }
442 
443 static void
444 nfs4args_lockt_free(nfs_argop4 *argop)
445 {
446 	lock_owner4 *lowner = &argop->nfs_argop4_u.oplockt.owner;
447 
448 	if (lowner->owner_val != NULL) {
449 		kmem_free(lowner->owner_val, lowner->owner_len);
450 	}
451 }
452 
453 static void
454 nfs4args_setattr(nfs_argop4 *argop, vattr_t *vap, vsecattr_t *vsap, int flags,
455 		rnode4_t *rp, cred_t *cr, bitmap4 supp, int *error,
456 		nfs4_stateid_types_t *sid_types)
457 {
458 	fattr4		*attr = &argop->nfs_argop4_u.opsetattr.obj_attributes;
459 	mntinfo4_t	*mi;
460 
461 	argop->argop = OP_SETATTR;
462 	/*
463 	 * The stateid is set to 0 if client is not modifying the size
464 	 * and otherwise to whatever nfs4_get_stateid() returns.
465 	 *
466 	 * XXX Note: nfs4_get_stateid() returns 0 if no lockowner and/or no
467 	 * state struct could be found for the process/file pair.  We may
468 	 * want to change this in the future (by OPENing the file).  See
469 	 * bug # 4474852.
470 	 */
471 	if (vap->va_mask & AT_SIZE) {
472 
473 		ASSERT(rp != NULL);
474 		mi = VTOMI4(RTOV4(rp));
475 
476 		argop->nfs_argop4_u.opsetattr.stateid =
477 			nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi,
478 				OP_SETATTR, sid_types, FALSE);
479 	} else {
480 		bzero(&argop->nfs_argop4_u.opsetattr.stateid,
481 		    sizeof (stateid4));
482 	}
483 
484 	*error = vattr_to_fattr4(vap, vsap, attr, flags, OP_SETATTR, supp);
485 	if (*error)
486 		bzero(attr, sizeof (*attr));
487 }
488 
489 static void
490 nfs4args_setattr_free(nfs_argop4 *argop)
491 {
492 	nfs4_fattr4_free(&argop->nfs_argop4_u.opsetattr.obj_attributes);
493 }
494 
495 static int
496 nfs4args_verify(nfs_argop4 *argop, vattr_t *vap, enum nfs_opnum4 op,
497 		bitmap4 supp)
498 {
499 	fattr4 *attr;
500 	int error = 0;
501 
502 	argop->argop = op;
503 	switch (op) {
504 	case OP_VERIFY:
505 		attr = &argop->nfs_argop4_u.opverify.obj_attributes;
506 		break;
507 	case OP_NVERIFY:
508 		attr = &argop->nfs_argop4_u.opnverify.obj_attributes;
509 		break;
510 	default:
511 		return (EINVAL);
512 	}
513 	if (!error)
514 		error = vattr_to_fattr4(vap, NULL, attr, 0, op, supp);
515 	if (error)
516 		bzero(attr, sizeof (*attr));
517 	return (error);
518 }
519 
520 static void
521 nfs4args_verify_free(nfs_argop4 *argop)
522 {
523 	switch (argop->argop) {
524 	case OP_VERIFY:
525 		nfs4_fattr4_free(&argop->nfs_argop4_u.opverify.obj_attributes);
526 		break;
527 	case OP_NVERIFY:
528 		nfs4_fattr4_free(&argop->nfs_argop4_u.opnverify.obj_attributes);
529 		break;
530 	default:
531 		break;
532 	}
533 }
534 
535 static void
536 nfs4args_write(nfs_argop4 *argop, stable_how4 stable, rnode4_t *rp, cred_t *cr,
537 	WRITE4args **wargs_pp, nfs4_stateid_types_t *sid_tp)
538 {
539 	WRITE4args *wargs = &argop->nfs_argop4_u.opwrite;
540 	mntinfo4_t *mi = VTOMI4(RTOV4(rp));
541 
542 	argop->argop = OP_WRITE;
543 	wargs->stable = stable;
544 	wargs->stateid = nfs4_get_w_stateid(cr, rp, curproc->p_pidp->pid_id,
545 				mi, OP_WRITE, sid_tp);
546 	wargs->mblk = NULL;
547 	*wargs_pp = wargs;
548 }
549 
550 void
551 nfs4args_copen_free(OPEN4cargs *open_args)
552 {
553 	if (open_args->owner.owner_val) {
554 		kmem_free(open_args->owner.owner_val,
555 					open_args->owner.owner_len);
556 	}
557 	if ((open_args->opentype == OPEN4_CREATE) &&
558 	    (open_args->mode != EXCLUSIVE4)) {
559 		nfs4_fattr4_free(&open_args->createhow4_u.createattrs);
560 	}
561 }
562 
563 /*
564  * XXX:  This is referenced in modstubs.s
565  */
566 struct vnodeops *
567 nfs4_getvnodeops(void)
568 {
569 	return (nfs4_vnodeops);
570 }
571 
572 /*
573  * The OPEN operation opens a regular file.
574  *
575  * ARGSUSED
576  */
577 static int
578 nfs4_open(vnode_t **vpp, int flag, cred_t *cr)
579 {
580 	vnode_t *dvp = NULL;
581 	rnode4_t *rp;
582 	int error;
583 	int just_been_created;
584 	char fn[MAXNAMELEN];
585 
586 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4_open: "));
587 	if (nfs_zone() != VTOMI4(*vpp)->mi_zone)
588 		return (EIO);
589 	rp = VTOR4(*vpp);
590 
591 	/*
592 	 * Check to see if opening something besides a regular file;
593 	 * if so skip the OTW call
594 	 */
595 	if ((*vpp)->v_type != VREG) {
596 		error = nfs4_open_non_reg_file(vpp, flag, cr);
597 		return (error);
598 	}
599 
600 	/*
601 	 * XXX - would like a check right here to know if the file is
602 	 * executable or not, so as to skip OTW
603 	 */
604 
605 	if ((error = vtoname(*vpp, fn, MAXNAMELEN)) != 0)
606 		return (error);
607 
608 	if ((error = vtodv(*vpp, &dvp, cr, TRUE)) != 0)
609 		return (error);
610 
611 	/*
612 	 * See if this file has just been CREATEd.
613 	 * If so, clear the flag and update the dnlc, which was previously
614 	 * skipped in nfs4_create.
615 	 * XXX need better serilization on this.
616 	 * XXX move this into the nf4open_otw call, after we have
617 	 * XXX acquired the open owner seqid sync.
618 	 */
619 	mutex_enter(&rp->r_statev4_lock);
620 	if (rp->created_v4) {
621 		rp->created_v4 = 0;
622 		mutex_exit(&rp->r_statev4_lock);
623 
624 		dnlc_update(dvp, fn, *vpp);
625 		/* This is needed so we don't bump the open ref count */
626 		just_been_created = 1;
627 	} else {
628 		mutex_exit(&rp->r_statev4_lock);
629 		just_been_created = 0;
630 	}
631 
632 	/*
633 	 * If caller specified O_TRUNC/FTRUNC, then be sure to set
634 	 * FWRITE (to drive successful setattr(size=0) after open)
635 	 */
636 	if (flag & FTRUNC)
637 		flag |= FWRITE;
638 
639 	error = nfs4open_otw(dvp, fn, NULL, vpp, cr, 0, flag, 0,
640 			just_been_created);
641 
642 	if (!error && !((*vpp)->v_flag & VROOT))
643 		dnlc_update(dvp, fn, *vpp);
644 
645 	/* release the hold from vtodv */
646 	VN_RELE(dvp);
647 
648 	/* exchange the shadow for the master vnode, if needed */
649 
650 	if (error == 0 && IS_SHADOW(*vpp, rp))
651 		sv_exchange(vpp);
652 
653 	return (error);
654 }
655 
656 /*
657  * See if there's a "lost open" request to be saved and recovered.
658  */
659 static void
660 nfs4open_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
661 	nfs4_open_owner_t *oop, cred_t *cr, vnode_t *vp,
662 	vnode_t *dvp, OPEN4cargs *open_args)
663 {
664 	vfs_t *vfsp;
665 	char *srccfp;
666 
667 	vfsp = (dvp ? dvp->v_vfsp : vp->v_vfsp);
668 
669 	if (error != ETIMEDOUT && error != EINTR &&
670 			!NFS4_FRC_UNMT_ERR(error, vfsp)) {
671 		lost_rqstp->lr_op = 0;
672 		return;
673 	}
674 
675 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
676 		    "nfs4open_save_lost_rqst: error %d", error));
677 
678 	lost_rqstp->lr_op = OP_OPEN;
679 	/*
680 	 * The vp (if it is not NULL) and dvp are held and rele'd via
681 	 * the recovery code.  See nfs4_save_lost_rqst.
682 	 */
683 	lost_rqstp->lr_vp = vp;
684 	lost_rqstp->lr_dvp = dvp;
685 	lost_rqstp->lr_oop = oop;
686 	lost_rqstp->lr_osp = NULL;
687 	lost_rqstp->lr_lop = NULL;
688 	lost_rqstp->lr_cr = cr;
689 	lost_rqstp->lr_flk = NULL;
690 	lost_rqstp->lr_oacc = open_args->share_access;
691 	lost_rqstp->lr_odeny = open_args->share_deny;
692 	lost_rqstp->lr_oclaim = open_args->claim;
693 	if (open_args->claim == CLAIM_DELEGATE_CUR) {
694 		lost_rqstp->lr_ostateid =
695 		    open_args->open_claim4_u.delegate_cur_info.delegate_stateid;
696 		srccfp = open_args->open_claim4_u.delegate_cur_info.cfile;
697 	} else {
698 		srccfp = open_args->open_claim4_u.cfile;
699 	}
700 	lost_rqstp->lr_ofile.utf8string_len = 0;
701 	lost_rqstp->lr_ofile.utf8string_val = NULL;
702 	(void) str_to_utf8(srccfp, &lost_rqstp->lr_ofile);
703 	lost_rqstp->lr_putfirst = FALSE;
704 }
705 
/*
 * Verifier used by nfs4open_otw() for EXCLUSIVE4 creates.
 *
 * nfs4open_otw() fills in both fields and then reads the structure back
 * as a single uint64_t to set createverf, so the layout (two 32-bit
 * fields, in this order) must not change.
 */
struct nfs4_excl_time {
	uint32 seconds;
	uint32 nseconds;
};
710 
711 /*
712  * The OPEN operation creates and/or opens a regular file
713  *
714  * ARGSUSED
715  */
716 static int
717 nfs4open_otw(vnode_t *dvp, char *file_name, struct vattr *in_va,
718 	vnode_t **vpp, cred_t *cr, int create_flag, int open_flag,
719 	enum createmode4 createmode, int file_just_been_created)
720 {
721 	rnode4_t *rp;
722 	rnode4_t *drp = VTOR4(dvp);
723 	vnode_t *vp = NULL;
724 	vnode_t *vpi = *vpp;
725 	bool_t needrecov = FALSE;
726 
727 	int doqueue = 1;
728 
729 	COMPOUND4args_clnt args;
730 	COMPOUND4res_clnt res;
731 	nfs_argop4 *argop;
732 	nfs_resop4 *resop;
733 	int argoplist_size;
734 	int idx_open, idx_fattr;
735 
736 	GETFH4res *gf_res = NULL;
737 	OPEN4res *op_res = NULL;
738 	nfs4_ga_res_t *garp;
739 	fattr4 *attr = NULL;
740 	struct nfs4_excl_time verf;
741 	bool_t did_excl_setup = FALSE;
742 	int created_osp;
743 
744 	OPEN4cargs *open_args;
745 	nfs4_open_owner_t	*oop = NULL;
746 	nfs4_open_stream_t	*osp = NULL;
747 	seqid4 seqid = 0;
748 	bool_t retry_open = FALSE;
749 	nfs4_recov_state_t recov_state;
750 	nfs4_lost_rqst_t lost_rqst;
751 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
752 	hrtime_t t;
753 	int acc = 0;
754 	cred_t *cred_otw = NULL;	/* cred used to do the RPC call */
755 	cred_t *ncr = NULL;
756 
757 	nfs4_sharedfh_t *otw_sfh;
758 	nfs4_sharedfh_t *orig_sfh;
759 	int fh_differs = 0;
760 	int numops, setgid_flag;
761 	int num_bseqid_retry = NFS4_NUM_RETRY_BAD_SEQID + 1;
762 
763 	/*
764 	 * Make sure we properly deal with setting the right gid on
765 	 * a newly created file to reflect the parent's setgid bit
766 	 */
767 	setgid_flag = 0;
768 	if (create_flag && in_va) {
769 
770 		/*
771 		 * If the parent's directory has the setgid bit set
772 		 * _and_ the client was able to get a valid mapping
773 		 * for the parent dir's owner_group, we want to
774 		 * append NVERIFY(owner_group == dva.va_gid) and
775 		 * SETATTR to the CREATE compound.
776 		 */
777 		mutex_enter(&drp->r_statelock);
778 		if (drp->r_attr.va_mode & VSGID &&
779 		    drp->r_attr.va_gid != GID_NOBODY) {
780 			in_va->va_gid = drp->r_attr.va_gid;
781 			setgid_flag = 1;
782 		}
783 		mutex_exit(&drp->r_statelock);
784 	}
785 
786 	/*
787 	 * Normal/non-create compound:
788 	 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new)
789 	 *
790 	 * Open(create) compound no setgid:
791 	 * PUTFH(dfh) + SAVEFH + OPEN(create) + GETFH + GETATTR(new) +
792 	 * RESTOREFH + GETATTR
793 	 *
794 	 * Open(create) setgid:
795 	 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new) +
796 	 * SAVEFH + PUTFH(dfh) + GETATTR(dvp) + RESTOREFH +
797 	 * NVERIFY(grp) + SETATTR
798 	 */
799 	if (setgid_flag) {
800 		numops = 10;
801 		idx_open = 1;
802 		idx_fattr = 3;
803 	} else if (create_flag) {
804 		numops = 7;
805 		idx_open = 2;
806 		idx_fattr = 4;
807 	} else {
808 		numops = 4;
809 		idx_open = 1;
810 		idx_fattr = 3;
811 	}
812 
813 	args.array_len = numops;
814 	argoplist_size = numops * sizeof (nfs_argop4);
815 	argop = kmem_alloc(argoplist_size, KM_SLEEP);
816 
817 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw: "
818 		"open %s open flag 0x%x cred %p", file_name, open_flag,
819 		(void *)cr));
820 
821 	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
822 	if (create_flag) {
823 		/*
824 		 * We are to create a file.  Initialize the passed in vnode
825 		 * pointer.
826 		 */
827 		vpi = NULL;
828 	} else {
829 		/*
830 		 * Check to see if the client owns a read delegation and is
831 		 * trying to open for write.  If so, then return the delegation
832 		 * to avoid the server doing a cb_recall and returning DELAY.
833 		 * NB - we don't use the statev4_lock here because we'd have
834 		 * to drop the lock anyway and the result would be stale.
835 		 */
836 		if ((open_flag & FWRITE) &&
837 		    VTOR4(vpi)->r_deleg_type == OPEN_DELEGATE_READ)
838 			(void) nfs4delegreturn(VTOR4(vpi), NFS4_DR_REOPEN);
839 
840 		/*
841 		 * If the file has a delegation, then do an access check up
842 		 * front.  This avoids having to an access check later after
843 		 * we've already done start_op, which could deadlock.
844 		 */
845 		if (VTOR4(vpi)->r_deleg_type != OPEN_DELEGATE_NONE) {
846 			if (open_flag & FREAD &&
847 			    nfs4_access(vpi, VREAD, 0, cr) == 0)
848 				acc |= VREAD;
849 			if (open_flag & FWRITE &&
850 			    nfs4_access(vpi, VWRITE, 0, cr) == 0)
851 				acc |= VWRITE;
852 		}
853 	}
854 
855 	drp = VTOR4(dvp);
856 
857 	recov_state.rs_flags = 0;
858 	recov_state.rs_num_retry_despite_err = 0;
859 	cred_otw = cr;
860 
861 recov_retry:
862 	fh_differs = 0;
863 	nfs4_error_zinit(&e);
864 
865 	/* argop is empty here */
866 
867 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) {
868 		if (ncr != NULL)
869 			crfree(ncr);
870 		kmem_free(argop, argoplist_size);
871 		return (EINTR);
872 	}
873 
874 	e.error = nfs4_start_op(VTOMI4(dvp), dvp, vpi, &recov_state);
875 	if (e.error) {
876 		nfs_rw_exit(&drp->r_rwlock);
877 		if (ncr != NULL)
878 			crfree(ncr);
879 		kmem_free(argop, argoplist_size);
880 		return (e.error);
881 	}
882 
883 	args.ctag = TAG_OPEN;
884 	args.array_len = numops;
885 	args.array = argop;
886 
887 	/* putfh directory fh */
888 	argop[0].argop = OP_CPUTFH;
889 	argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
890 
891 	/* OPEN: either op 1 or op 2 depending upon create/setgid flags */
892 	argop[idx_open].argop = OP_COPEN;
893 	open_args = &argop[idx_open].nfs_argop4_u.opcopen;
894 	open_args->claim = CLAIM_NULL;
895 
896 	/* name of file */
897 	open_args->open_claim4_u.cfile = file_name;
898 	open_args->owner.owner_len = 0;
899 	open_args->owner.owner_val = NULL;
900 
901 	if (create_flag) {
902 		/* CREATE a file */
903 		open_args->opentype = OPEN4_CREATE;
904 		open_args->mode = createmode;
905 		if (createmode == EXCLUSIVE4) {
906 			if (did_excl_setup == FALSE) {
907 				verf.seconds = nfs_atoi(hw_serial);
908 				if (verf.seconds != 0)
909 					verf.nseconds = newnum();
910 				else {
911 					timestruc_t now;
912 
913 					gethrestime(&now);
914 					verf.seconds = now.tv_sec;
915 					verf.nseconds = now.tv_nsec;
916 				}
917 				/*
918 				 * Since the server will use this value for the
919 				 * mtime, make sure that it can't overflow. Zero
920 				 * out the MSB. The actual value does not matter
921 				 * here, only its uniqeness.
922 				 */
923 				verf.seconds &= INT32_MAX;
924 				did_excl_setup = TRUE;
925 			}
926 
927 			/* Now copy over verifier to OPEN4args. */
928 			open_args->createhow4_u.createverf = *(uint64_t *)&verf;
929 		} else {
930 			int v_error;
931 			bitmap4 supp_attrs;
932 			servinfo4_t *svp;
933 
934 			attr = &open_args->createhow4_u.createattrs;
935 
936 			svp = drp->r_server;
937 			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
938 			supp_attrs = svp->sv_supp_attrs;
939 			nfs_rw_exit(&svp->sv_lock);
940 
941 			/* GUARDED4 or UNCHECKED4 */
942 			v_error = vattr_to_fattr4(in_va, NULL, attr, 0, OP_OPEN,
943 					supp_attrs);
944 			if (v_error) {
945 				bzero(attr, sizeof (*attr));
946 				nfs4args_copen_free(open_args);
947 				nfs_rw_exit(&drp->r_rwlock);
948 				nfs4_end_op(VTOMI4(dvp), dvp, vpi,
949 					&recov_state, FALSE);
950 				if (ncr != NULL)
951 					crfree(ncr);
952 				kmem_free(argop, argoplist_size);
953 				return (v_error);
954 			}
955 		}
956 	} else {
957 		/* NO CREATE */
958 		open_args->opentype = OPEN4_NOCREATE;
959 	}
960 
961 	if (recov_state.rs_sp != NULL) {
962 		mutex_enter(&recov_state.rs_sp->s_lock);
963 		open_args->owner.clientid = recov_state.rs_sp->clientid;
964 		mutex_exit(&recov_state.rs_sp->s_lock);
965 	} else {
966 		/* XXX should we just fail here? */
967 		open_args->owner.clientid = 0;
968 	}
969 
970 	/*
971 	 * This increments oop's ref count or creates a temporary 'just_created'
972 	 * open owner that will become valid when this OPEN/OPEN_CONFIRM call
973 	 * completes.
974 	 */
975 	mutex_enter(&VTOMI4(dvp)->mi_lock);
976 
977 	/* See if a permanent or just created open owner exists */
978 	oop = find_open_owner_nolock(cr, NFS4_JUST_CREATED, VTOMI4(dvp));
979 	if (!oop) {
980 		/*
981 		 * This open owner does not exist so create a temporary
982 		 * just created one.
983 		 */
984 		oop = create_open_owner(cr, VTOMI4(dvp));
985 		ASSERT(oop != NULL);
986 	}
987 	mutex_exit(&VTOMI4(dvp)->mi_lock);
988 
989 	/* this length never changes, do alloc before seqid sync */
990 	open_args->owner.owner_len = sizeof (oop->oo_name);
991 	open_args->owner.owner_val =
992 	    kmem_alloc(open_args->owner.owner_len, KM_SLEEP);
993 
994 	e.error = nfs4_start_open_seqid_sync(oop, VTOMI4(dvp));
995 	if (e.error == EAGAIN) {
996 		open_owner_rele(oop);
997 		nfs4args_copen_free(open_args);
998 		nfs_rw_exit(&drp->r_rwlock);
999 		nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE);
1000 		if (ncr != NULL) {
1001 			crfree(ncr);
1002 			ncr = NULL;
1003 		}
1004 		goto recov_retry;
1005 	}
1006 
1007 	/* Check to see if we need to do the OTW call */
1008 	if (!create_flag) {
1009 		if (!nfs4_is_otw_open_necessary(oop, open_flag, vpi,
1010 			file_just_been_created, &e.error, acc, &recov_state)) {
1011 
1012 			/*
1013 			 * The OTW open is not necessary.  Either
1014 			 * the open can succeed without it (eg.
1015 			 * delegation, error == 0) or the open
1016 			 * must fail due to an access failure
1017 			 * (error != 0).  In either case, tidy
1018 			 * up and return.
1019 			 */
1020 
1021 			nfs4_end_open_seqid_sync(oop);
1022 			open_owner_rele(oop);
1023 			nfs4args_copen_free(open_args);
1024 			nfs_rw_exit(&drp->r_rwlock);
1025 			nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, FALSE);
1026 			if (ncr != NULL)
1027 				crfree(ncr);
1028 			kmem_free(argop, argoplist_size);
1029 			return (e.error);
1030 		}
1031 	}
1032 
1033 	bcopy(&oop->oo_name, open_args->owner.owner_val,
1034 	    open_args->owner.owner_len);
1035 
1036 	seqid = nfs4_get_open_seqid(oop) + 1;
1037 	open_args->seqid = seqid;
1038 	open_args->share_access = 0;
1039 	if (open_flag & FREAD)
1040 		open_args->share_access |= OPEN4_SHARE_ACCESS_READ;
1041 	if (open_flag & FWRITE)
1042 		open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE;
1043 	open_args->share_deny = OPEN4_SHARE_DENY_NONE;
1044 
1045 
1046 
1047 	/*
1048 	 * getfh w/sanity check for idx_open/idx_fattr
1049 	 */
1050 	ASSERT((idx_open + 1) == (idx_fattr - 1));
1051 	argop[idx_open + 1].argop = OP_GETFH;
1052 
1053 	/* getattr */
1054 	argop[idx_fattr].argop = OP_GETATTR;
1055 	argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1056 	argop[idx_fattr].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
1057 
1058 	if (setgid_flag) {
1059 		vattr_t	_v;
1060 		servinfo4_t *svp;
1061 		bitmap4	supp_attrs;
1062 
1063 		svp = drp->r_server;
1064 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1065 		supp_attrs = svp->sv_supp_attrs;
1066 		nfs_rw_exit(&svp->sv_lock);
1067 
1068 		/*
1069 		 * For setgid case, we need to:
1070 		 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new)
1071 		 */
1072 		argop[4].argop = OP_SAVEFH;
1073 
1074 		argop[5].argop = OP_CPUTFH;
1075 		argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
1076 
1077 		argop[6].argop = OP_GETATTR;
1078 		argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1079 		argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
1080 
1081 		argop[7].argop = OP_RESTOREFH;
1082 
1083 		/*
1084 		 * nverify
1085 		 */
1086 		_v.va_mask = AT_GID;
1087 		_v.va_gid = in_va->va_gid;
1088 		if (!(e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY,
1089 		    supp_attrs))) {
1090 
1091 			/*
1092 			 * setattr
1093 			 *
1094 			 * We _know_ we're not messing with AT_SIZE or
1095 			 * AT_XTIME, so no need for stateid or flags.
1096 			 * Also we specify NULL rp since we're only
1097 			 * interested in setting owner_group attributes.
1098 			 */
1099 			nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr,
1100 			    supp_attrs, &e.error, 0);
1101 			if (e.error)
1102 				nfs4args_verify_free(&argop[8]);
1103 		}
1104 
1105 		if (e.error) {
1106 			/*
1107 			 * XXX - Revisit the last argument to nfs4_end_op()
1108 			 *	 once 5020486 is fixed.
1109 			 */
1110 			nfs4_end_open_seqid_sync(oop);
1111 			open_owner_rele(oop);
1112 			nfs4args_copen_free(open_args);
1113 			nfs_rw_exit(&drp->r_rwlock);
1114 			nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE);
1115 			if (ncr != NULL)
1116 				crfree(ncr);
1117 			kmem_free(argop, argoplist_size);
1118 			return (e.error);
1119 		}
1120 	} else if (create_flag) {
1121 		/*
1122 		 * For the create case without setgid, we need to:
1123 		 * 1:savefh(dir) 5:restorefh(dir) 6:getattr(dir)
1124 		 */
1125 		argop[1].argop = OP_SAVEFH;
1126 
1127 		argop[5].argop = OP_RESTOREFH;
1128 
1129 		argop[6].argop = OP_GETATTR;
1130 		argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1131 		argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
1132 	}
1133 
1134 	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
1135 	    "nfs4open_otw: %s call, nm %s, rp %s",
1136 	    needrecov ? "recov" : "first", file_name,
1137 	    rnode4info(VTOR4(dvp))));
1138 
1139 	t = gethrtime();
1140 
1141 	rfs4call(VTOMI4(dvp), &args, &res, cred_otw, &doqueue, 0, &e);
1142 
1143 	if (!e.error && nfs4_need_to_bump_seqid(&res))
1144 		nfs4_set_open_seqid(seqid, oop, args.ctag);
1145 
1146 	needrecov = nfs4_needs_recovery(&e, TRUE, dvp->v_vfsp);
1147 
1148 	if (e.error || needrecov) {
1149 		bool_t abort = FALSE;
1150 
1151 		if (needrecov) {
1152 			nfs4_bseqid_entry_t *bsep = NULL;
1153 
1154 			nfs4open_save_lost_rqst(e.error, &lost_rqst, oop,
1155 			    cred_otw, vpi, dvp, open_args);
1156 
1157 			if (!e.error && res.status == NFS4ERR_BAD_SEQID) {
1158 				bsep = nfs4_create_bseqid_entry(oop, NULL,
1159 					vpi, 0, args.ctag, open_args->seqid);
1160 				num_bseqid_retry--;
1161 			}
1162 
1163 			abort = nfs4_start_recovery(&e, VTOMI4(dvp), dvp, vpi,
1164 				    NULL, lost_rqst.lr_op == OP_OPEN ?
1165 				    &lost_rqst : NULL, OP_OPEN, bsep);
1166 
1167 			if (bsep)
1168 				kmem_free(bsep, sizeof (*bsep));
1169 			/* give up if we keep getting BAD_SEQID */
1170 			if (num_bseqid_retry == 0)
1171 				abort = TRUE;
1172 			if (abort == TRUE && e.error == 0)
1173 				e.error = geterrno4(res.status);
1174 		}
1175 		nfs4_end_open_seqid_sync(oop);
1176 		open_owner_rele(oop);
1177 		nfs_rw_exit(&drp->r_rwlock);
1178 		nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1179 		nfs4args_copen_free(open_args);
1180 		if (setgid_flag) {
1181 			nfs4args_verify_free(&argop[8]);
1182 			nfs4args_setattr_free(&argop[9]);
1183 		}
1184 		if (!e.error)
1185 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1186 		if (ncr != NULL) {
1187 			crfree(ncr);
1188 			ncr = NULL;
1189 		}
1190 		if (!needrecov || abort == TRUE || e.error == EINTR ||
1191 		    NFS4_FRC_UNMT_ERR(e.error, dvp->v_vfsp)) {
1192 			kmem_free(argop, argoplist_size);
1193 			return (e.error);
1194 		}
1195 		goto recov_retry;
1196 	}
1197 
1198 	/*
1199 	 * Will check and update lease after checking the rflag for
1200 	 * OPEN_CONFIRM in the successful OPEN call.
1201 	 */
1202 	if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) {
1203 
1204 		/*
1205 		 * XXX what if we're crossing mount points from server1:/drp
1206 		 * to server2:/drp/rp.
1207 		 */
1208 
1209 		/* Signal our end of use of the open seqid */
1210 		nfs4_end_open_seqid_sync(oop);
1211 
1212 		/*
1213 		 * This will destroy the open owner if it was just created,
1214 		 * and no one else has put a reference on it.
1215 		 */
1216 		open_owner_rele(oop);
1217 		if (create_flag && (createmode != EXCLUSIVE4) &&
1218 		    res.status == NFS4ERR_BADOWNER)
1219 			nfs4_log_badowner(VTOMI4(dvp), OP_OPEN);
1220 
1221 		e.error = geterrno4(res.status);
1222 		nfs4args_copen_free(open_args);
1223 		if (setgid_flag) {
1224 			nfs4args_verify_free(&argop[8]);
1225 			nfs4args_setattr_free(&argop[9]);
1226 		}
1227 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1228 		nfs_rw_exit(&drp->r_rwlock);
1229 		nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1230 		/*
1231 		 * If the reply is NFS4ERR_ACCESS, it may be because
1232 		 * we are root (no root net access).  If the real uid
1233 		 * is not root, then retry with the real uid instead.
1234 		 */
1235 		if (ncr != NULL) {
1236 			crfree(ncr);
1237 			ncr = NULL;
1238 		}
1239 		if (res.status == NFS4ERR_ACCESS &&
1240 		    (ncr = crnetadjust(cred_otw)) != NULL) {
1241 			cred_otw = ncr;
1242 			goto recov_retry;
1243 		}
1244 		kmem_free(argop, argoplist_size);
1245 		return (e.error);
1246 	}
1247 
1248 	resop = &res.array[idx_open];  /* open res */
1249 	op_res = &resop->nfs_resop4_u.opopen;
1250 
1251 #ifdef DEBUG
1252 	/*
1253 	 * verify attrset bitmap
1254 	 */
1255 	if (create_flag &&
1256 	    (createmode == UNCHECKED4 || createmode == GUARDED4)) {
1257 		/* make sure attrset returned is what we asked for */
1258 		/* XXX Ignore this 'error' for now */
1259 		if (attr->attrmask != op_res->attrset)
1260 			/* EMPTY */;
1261 	}
1262 #endif
1263 
1264 	if (op_res->rflags & OPEN4_RESULT_LOCKTYPE_POSIX) {
1265 		mutex_enter(&VTOMI4(dvp)->mi_lock);
1266 		VTOMI4(dvp)->mi_flags |= MI4_POSIX_LOCK;
1267 		mutex_exit(&VTOMI4(dvp)->mi_lock);
1268 	}
1269 
1270 	resop = &res.array[idx_open + 1];  /* getfh res */
1271 	gf_res = &resop->nfs_resop4_u.opgetfh;
1272 
1273 	otw_sfh = sfh4_get(&gf_res->object, VTOMI4(dvp));
1274 
1275 	/*
1276 	 * The open stateid has been updated on the server but not
1277 	 * on the client yet.  There is a path: makenfs4node->nfs4_attr_cache->
1278 	 * flush_pages->VOP_PUTPAGE->...->nfs4write where we will issue an OTW
1279 	 * WRITE call.  That, however, will use the old stateid, so go ahead
1280 	 * and update the open stateid now, before any call to makenfs4node.
1281 	 */
1282 	if (vpi) {
1283 		nfs4_open_stream_t	*tmp_osp;
1284 		rnode4_t		*tmp_rp = VTOR4(vpi);
1285 
1286 		tmp_osp = find_open_stream(oop, tmp_rp);
1287 		if (tmp_osp) {
1288 			tmp_osp->open_stateid = op_res->stateid;
1289 			mutex_exit(&tmp_osp->os_sync_lock);
1290 			open_stream_rele(tmp_osp, tmp_rp);
1291 		}
1292 
1293 		/*
1294 		 * We must determine if the file handle given by the otw open
1295 		 * is the same as the file handle which was passed in with
1296 		 * *vpp.  This case can be reached if the file we are trying
1297 		 * to open has been removed and another file has been created
1298 		 * having the same file name.  The passed in vnode is released
1299 		 * later.
1300 		 */
1301 		orig_sfh = VTOR4(vpi)->r_fh;
1302 		fh_differs = nfs4cmpfh(&orig_sfh->sfh_fh, &otw_sfh->sfh_fh);
1303 	}
1304 
1305 	garp = &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res;
1306 
1307 	if (create_flag || fh_differs) {
1308 		int rnode_err = 0;
1309 
1310 		vp = makenfs4node(otw_sfh, garp, dvp->v_vfsp, t, cr,
1311 		    dvp, fn_get(VTOSV(dvp)->sv_name, file_name));
1312 
1313 		if (e.error)
1314 			PURGE_ATTRCACHE4(vp);
1315 		/*
1316 		 * For the newly created vp case, make sure the rnode
1317 		 * isn't bad before using it.
1318 		 */
1319 		mutex_enter(&(VTOR4(vp))->r_statelock);
1320 		if (VTOR4(vp)->r_flags & R4RECOVERR)
1321 			rnode_err = EIO;
1322 		mutex_exit(&(VTOR4(vp))->r_statelock);
1323 
1324 		if (rnode_err) {
1325 			nfs4_end_open_seqid_sync(oop);
1326 			nfs4args_copen_free(open_args);
1327 			if (setgid_flag) {
1328 				nfs4args_verify_free(&argop[8]);
1329 				nfs4args_setattr_free(&argop[9]);
1330 			}
1331 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1332 			nfs_rw_exit(&drp->r_rwlock);
1333 			nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state,
1334 				    needrecov);
1335 			open_owner_rele(oop);
1336 			VN_RELE(vp);
1337 			if (ncr != NULL)
1338 				crfree(ncr);
1339 			sfh4_rele(&otw_sfh);
1340 			kmem_free(argop, argoplist_size);
1341 			return (EIO);
1342 		}
1343 	} else {
1344 		vp = vpi;
1345 	}
1346 	sfh4_rele(&otw_sfh);
1347 
1348 	/*
1349 	 * It seems odd to get a full set of attrs and then not update
1350 	 * the object's attrcache in the non-create case.  Create case uses
1351 	 * the attrs since makenfs4node checks to see if the attrs need to
1352 	 * be updated (and then updates them).  The non-create case should
1353 	 * update attrs also.
1354 	 */
1355 	if (! create_flag && ! fh_differs && !e.error) {
1356 		nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
1357 	}
1358 
1359 	nfs4_error_zinit(&e);
1360 	if (op_res->rflags & OPEN4_RESULT_CONFIRM) {
1361 		/* This does not do recovery for vp explicitly. */
1362 		nfs4open_confirm(vp, &seqid, &op_res->stateid, cred_otw, FALSE,
1363 		    &retry_open, oop, FALSE, &e, &num_bseqid_retry);
1364 
1365 		if (e.error || e.stat) {
1366 			nfs4_end_open_seqid_sync(oop);
1367 			nfs4args_copen_free(open_args);
1368 			if (setgid_flag) {
1369 				nfs4args_verify_free(&argop[8]);
1370 				nfs4args_setattr_free(&argop[9]);
1371 			}
1372 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1373 			nfs_rw_exit(&drp->r_rwlock);
1374 			nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state,
1375 				needrecov);
1376 			open_owner_rele(oop);
1377 			if (create_flag || fh_differs) {
1378 				/* rele the makenfs4node */
1379 				VN_RELE(vp);
1380 			}
1381 			if (ncr != NULL) {
1382 				crfree(ncr);
1383 				ncr = NULL;
1384 			}
1385 			if (retry_open == TRUE) {
1386 				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1387 				    "nfs4open_otw: retry the open since OPEN "
1388 				    "CONFIRM failed with error %d stat %d",
1389 				    e.error, e.stat));
1390 				if (create_flag && createmode == GUARDED4) {
1391 					NFS4_DEBUG(nfs4_client_recov_debug,
1392 					    (CE_NOTE, "nfs4open_otw: switch "
1393 					    "createmode from GUARDED4 to "
1394 					    "UNCHECKED4"));
1395 					createmode = UNCHECKED4;
1396 				}
1397 				goto recov_retry;
1398 			}
1399 			if (!e.error) {
1400 				if (create_flag && (createmode != EXCLUSIVE4) &&
1401 				    e.stat == NFS4ERR_BADOWNER)
1402 					nfs4_log_badowner(VTOMI4(dvp), OP_OPEN);
1403 
1404 				e.error = geterrno4(e.stat);
1405 			}
1406 			kmem_free(argop, argoplist_size);
1407 			return (e.error);
1408 		}
1409 	}
1410 
1411 	rp = VTOR4(vp);
1412 
1413 	mutex_enter(&rp->r_statev4_lock);
1414 	if (create_flag)
1415 		rp->created_v4 = 1;
1416 	mutex_exit(&rp->r_statev4_lock);
1417 
1418 	mutex_enter(&oop->oo_lock);
1419 	/* Doesn't matter if 'oo_just_created' already was set as this */
1420 	oop->oo_just_created = NFS4_PERM_CREATED;
1421 	if (oop->oo_cred_otw)
1422 		crfree(oop->oo_cred_otw);
1423 	oop->oo_cred_otw = cred_otw;
1424 	crhold(oop->oo_cred_otw);
1425 	mutex_exit(&oop->oo_lock);
1426 
1427 	/* returns with 'os_sync_lock' held */
1428 	osp = find_or_create_open_stream(oop, rp, &created_osp);
1429 	if (!osp) {
1430 		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
1431 		    "nfs4open_otw: failed to create an open stream"));
1432 		NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE, "nfs4open_otw: "
1433 		    "signal our end of use of the open seqid"));
1434 
1435 		nfs4_end_open_seqid_sync(oop);
1436 		open_owner_rele(oop);
1437 		nfs4args_copen_free(open_args);
1438 		if (setgid_flag) {
1439 			nfs4args_verify_free(&argop[8]);
1440 			nfs4args_setattr_free(&argop[9]);
1441 		}
1442 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1443 		nfs_rw_exit(&drp->r_rwlock);
1444 		nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1445 		if (create_flag || fh_differs)
1446 			VN_RELE(vp);
1447 		if (ncr != NULL)
1448 			crfree(ncr);
1449 
1450 		kmem_free(argop, argoplist_size);
1451 		return (EINVAL);
1452 
1453 	}
1454 
1455 	osp->open_stateid = op_res->stateid;
1456 
1457 	if (open_flag & FREAD)
1458 		osp->os_share_acc_read++;
1459 	if (open_flag & FWRITE)
1460 		osp->os_share_acc_write++;
1461 	osp->os_share_deny_none++;
1462 
1463 	/*
1464 	 * Need to reset this bitfield for the possible case where we were
1465 	 * going to OTW CLOSE the file, got a non-recoverable error, and before
1466 	 * we could retry the CLOSE, OPENed the file again.
1467 	 */
1468 	ASSERT(osp->os_open_owner->oo_seqid_inuse);
1469 	osp->os_final_close = 0;
1470 	osp->os_force_close = 0;
1471 #ifdef DEBUG
1472 	if (osp->os_failed_reopen)
1473 		NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, "nfs4open_otw:"
1474 		    " clearing os_failed_reopen for osp %p, cr %p, rp %s",
1475 		    (void *)osp, (void *)cr, rnode4info(rp)));
1476 #endif
1477 	osp->os_failed_reopen = 0;
1478 
1479 	mutex_exit(&osp->os_sync_lock);
1480 
1481 	nfs4_end_open_seqid_sync(oop);
1482 
1483 	if (created_osp && recov_state.rs_sp != NULL) {
1484 		mutex_enter(&recov_state.rs_sp->s_lock);
1485 		nfs4_inc_state_ref_count_nolock(recov_state.rs_sp, VTOMI4(dvp));
1486 		mutex_exit(&recov_state.rs_sp->s_lock);
1487 	}
1488 
1489 	/* get rid of our reference to find oop */
1490 	open_owner_rele(oop);
1491 
1492 	open_stream_rele(osp, rp);
1493 
1494 	/* accept delegation, if any */
1495 	nfs4_delegation_accept(rp, CLAIM_NULL, op_res, garp, cred_otw);
1496 
1497 	nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1498 
1499 	if (createmode == EXCLUSIVE4 &&
1500 		(in_va->va_mask & ~(AT_GID | AT_SIZE))) {
1501 		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw:"
1502 			" EXCLUSIVE4: sending a SETATTR"));
1503 		/*
1504 		 * If doing an exclusive create, then generate
1505 		 * a SETATTR to set the initial attributes.
1506 		 * Try to set the mtime and the atime to the
1507 		 * server's current time.  It is somewhat
1508 		 * expected that these fields will be used to
1509 		 * store the exclusive create cookie.  If not,
1510 		 * server implementors will need to know that
1511 		 * a SETATTR will follow an exclusive create
1512 		 * and the cookie should be destroyed if
1513 		 * appropriate.
1514 		 *
1515 		 * The AT_GID and AT_SIZE bits are turned off
1516 		 * so that the SETATTR request will not attempt
1517 		 * to process these.  The gid will be set
1518 		 * separately if appropriate.  The size is turned
1519 		 * off because it is assumed that a new file will
1520 		 * be created empty and if the file wasn't empty,
1521 		 * then the exclusive create will have failed
1522 		 * because the file must have existed already.
1523 		 * Therefore, no truncate operation is needed.
1524 		 */
1525 		in_va->va_mask &= ~(AT_GID | AT_SIZE);
1526 		in_va->va_mask |= (AT_MTIME | AT_ATIME);
1527 
1528 		e.error = nfs4setattr(vp, in_va, 0, cr, NULL);
1529 		if (e.error) {
1530 			/*
1531 			 * Couldn't correct the attributes of
1532 			 * the newly created file and the
1533 			 * attributes are wrong.  Remove the
1534 			 * file and return an error to the
1535 			 * application.
1536 			 */
1537 			/* XXX will this take care of client state ? */
1538 			NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
1539 				"nfs4open_otw: EXCLUSIVE4: error %d on SETATTR:"
1540 				" remove file", e.error));
1541 			VN_RELE(vp);
1542 			(void) nfs4_remove(dvp, file_name, cr);
1543 			nfs_rw_exit(&drp->r_rwlock);
1544 			goto skip_rwlock_exit;
1545 		}
1546 	}
1547 
1548 	/*
1549 	 * If we created or found the correct vnode, due to create_flag or
1550 	 * fh_differs being set, then update directory cache attribute, readdir
1551 	 * and dnlc caches.
1552 	 */
1553 	if (create_flag || fh_differs) {
1554 		dirattr_info_t dinfo, *dinfop;
1555 
1556 		/*
1557 		 * Make sure getattr succeeded before using results.
1558 		 * note: op 7 is getattr(dir) for both flavors of
1559 		 * open(create).
1560 		 */
1561 		if (create_flag && res.status == NFS4_OK) {
1562 			dinfo.di_time_call = t;
1563 			dinfo.di_cred = cr;
1564 			dinfo.di_garp =
1565 				&res.array[6].nfs_resop4_u.opgetattr.ga_res;
1566 			dinfop = &dinfo;
1567 		} else {
1568 			dinfop = NULL;
1569 		}
1570 
1571 		nfs4_update_dircaches(&op_res->cinfo, dvp, vp, file_name,
1572 					dinfop);
1573 	}
1574 	nfs_rw_exit(&drp->r_rwlock);
1575 skip_rwlock_exit:
1576 
1577 	/*
1578 	 * If the page cache for this file was flushed from actions
1579 	 * above, it was done asynchronously and if that is true,
1580 	 * there is a need to wait here for it to complete.  This must
1581 	 * be done outside of start_fop/end_fop.
1582 	 */
1583 	(void) nfs4_waitfor_purge_complete(vp);
1584 
1585 	/*
1586 	 * It is implicit that we are in the open case (create_flag == 0) since
1587 	 * fh_differs can only be set to a non-zero value in the open case.
1588 	 */
1589 	if (fh_differs != 0 && vpi != NULL)
1590 		VN_RELE(vpi);
1591 
1592 	/*
1593 	 * Be sure to set *vpp to the correct value before returning.
1594 	 */
1595 	*vpp = vp;
1596 
1597 	nfs4args_copen_free(open_args);
1598 	if (setgid_flag) {
1599 		nfs4args_verify_free(&argop[8]);
1600 		nfs4args_setattr_free(&argop[9]);
1601 	}
1602 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1603 
1604 	if (ncr)
1605 		crfree(ncr);
1606 	kmem_free(argop, argoplist_size);
1607 	return (e.error);
1608 }
1609 
1610 /*
1611  * Reopen an open instance.  cf. nfs4open_otw().
1612  *
1613  * Errors are returned by the nfs4_error_t parameter.
1614  * - ep->error contains an errno value or zero.
1615  * - if it is zero, ep->stat is set to an NFS status code, if any.
1616  *   If the file could not be reopened, but the caller should continue, the
1617  *   file is marked dead and no error values are returned.  If the caller
1618  *   should stop recovering open files and start over, either the ep->error
1619  *   value or ep->stat will indicate an error (either something that requires
1620  *   recovery or EAGAIN).  Note that some recovery (e.g., expired volatile
1621  *   filehandles) may be handled silently by this routine.
1622  * - if it is EINTR, ETIMEDOUT, or NFS4_FRC_UNMT_ERR, recovery for lost state
1623  *   will be started, so the caller should not do it.
1624  *
1625  * Gotos:
1626  * - kill_file : reopen failed in such a fashion to constitute marking the
1627  *    file dead and setting the open stream's 'os_failed_reopen' as 1.  This
1628  *   is for cases where recovery is not possible.
1629  * - failed_reopen : same as above, except that the file has already been
1630  *   marked dead, so no need to do it again.
1631  * - bailout : reopen failed but we are able to recover and retry the reopen -
1632  *   either within this function immediately or via the calling function.
1633  */
1634 
1635 void
1636 nfs4_reopen(vnode_t *vp, nfs4_open_stream_t *osp, nfs4_error_t *ep,
1637 	    open_claim_type4 claim, bool_t frc_use_claim_previous,
1638 	    bool_t is_recov)
1639 {
1640 	COMPOUND4args_clnt args;
1641 	COMPOUND4res_clnt res;
1642 	nfs_argop4 argop[4];
1643 	nfs_resop4 *resop;
1644 	OPEN4res *op_res = NULL;
1645 	OPEN4cargs *open_args;
1646 	GETFH4res *gf_res;
1647 	rnode4_t *rp = VTOR4(vp);
1648 	int doqueue = 1;
1649 	cred_t *cr = NULL, *cred_otw = NULL;
1650 	nfs4_open_owner_t *oop = NULL;
1651 	seqid4 seqid;
1652 	nfs4_ga_res_t *garp;
1653 	char fn[MAXNAMELEN];
1654 	nfs4_recov_state_t recov = {NULL, 0};
1655 	nfs4_lost_rqst_t lost_rqst;
1656 	mntinfo4_t *mi = VTOMI4(vp);
1657 	bool_t abort;
1658 	char *failed_msg = "";
1659 	int fh_different;
1660 	hrtime_t t;
1661 	nfs4_bseqid_entry_t *bsep = NULL;
1662 
1663 	ASSERT(nfs4_consistent_type(vp));
1664 	ASSERT(nfs_zone() == mi->mi_zone);
1665 
1666 	nfs4_error_zinit(ep);
1667 
1668 	/* this is the cred used to find the open owner */
1669 	cr = state_to_cred(osp);
1670 	if (cr == NULL) {
1671 		failed_msg = "Couldn't reopen: no cred";
1672 		goto kill_file;
1673 	}
1674 	/* use this cred for OTW operations */
1675 	cred_otw = nfs4_get_otw_cred(cr, mi, osp->os_open_owner);
1676 
1677 top:
1678 	nfs4_error_zinit(ep);
1679 
1680 	if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) {
1681 		/* File system has been unmounted, quit */
1682 		ep->error = EIO;
1683 		failed_msg = "Couldn't reopen: file system has been unmounted";
1684 		goto kill_file;
1685 	}
1686 
1687 	oop = osp->os_open_owner;
1688 
1689 	ASSERT(oop != NULL);
1690 	if (oop == NULL) {	/* be defensive in non-DEBUG */
1691 		failed_msg = "can't reopen: no open owner";
1692 		goto kill_file;
1693 	}
1694 	open_owner_hold(oop);
1695 
1696 	ep->error = nfs4_start_open_seqid_sync(oop, mi);
1697 	if (ep->error) {
1698 		open_owner_rele(oop);
1699 		oop = NULL;
1700 		goto bailout;
1701 	}
1702 
1703 	/*
1704 	 * If the rnode has a delegation and the delegation has been
1705 	 * recovered and the server didn't request a recall and the caller
1706 	 * didn't specifically ask for CLAIM_PREVIOUS (nfs4frlock during
1707 	 * recovery) and the rnode hasn't been marked dead, then install
1708 	 * the delegation stateid in the open stream.  Otherwise, proceed
1709 	 * with a CLAIM_PREVIOUS or CLAIM_NULL OPEN.
1710 	 */
1711 	mutex_enter(&rp->r_statev4_lock);
1712 	if (rp->r_deleg_type != OPEN_DELEGATE_NONE &&
1713 	    !rp->r_deleg_return_pending &&
1714 	    (rp->r_deleg_needs_recovery == OPEN_DELEGATE_NONE) &&
1715 	    !rp->r_deleg_needs_recall &&
1716 	    claim != CLAIM_DELEGATE_CUR && !frc_use_claim_previous &&
1717 	    !(rp->r_flags & R4RECOVERR)) {
1718 		mutex_enter(&osp->os_sync_lock);
1719 		osp->os_delegation = 1;
1720 		osp->open_stateid = rp->r_deleg_stateid;
1721 		mutex_exit(&osp->os_sync_lock);
1722 		mutex_exit(&rp->r_statev4_lock);
1723 		goto bailout;
1724 	}
1725 	mutex_exit(&rp->r_statev4_lock);
1726 
1727 	/*
1728 	 * If the file failed recovery, just quit.  This failure need not
1729 	 * affect other reopens, so don't return an error.
1730 	 */
1731 	mutex_enter(&rp->r_statelock);
1732 	if (rp->r_flags & R4RECOVERR) {
1733 		mutex_exit(&rp->r_statelock);
1734 		ep->error = 0;
1735 		goto failed_reopen;
1736 	}
1737 	mutex_exit(&rp->r_statelock);
1738 
1739 	/*
1740 	 * argop is empty here
1741 	 *
1742 	 * PUTFH, OPEN, GETATTR
1743 	 */
1744 	args.ctag = TAG_REOPEN;
1745 	args.array_len = 4;
1746 	args.array = argop;
1747 
1748 	NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1749 	    "nfs4_reopen: file is type %d, id %s",
1750 	    vp->v_type, rnode4info(VTOR4(vp))));
1751 
1752 	argop[0].argop = OP_CPUTFH;
1753 
1754 	if (claim != CLAIM_PREVIOUS) {
1755 		/*
1756 		 * if this is a file mount then
1757 		 * use the mntinfo parentfh
1758 		 */
1759 		argop[0].nfs_argop4_u.opcputfh.sfh =
1760 			(vp->v_flag & VROOT) ? mi->mi_srvparentfh :
1761 						VTOSV(vp)->sv_dfh;
1762 	} else {
1763 		/* putfh fh to reopen */
1764 		argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
1765 	}
1766 
1767 	argop[1].argop = OP_COPEN;
1768 	open_args = &argop[1].nfs_argop4_u.opcopen;
1769 	open_args->claim = claim;
1770 
1771 	if (claim == CLAIM_NULL) {
1772 
1773 		if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) {
1774 			nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname "
1775 			    "failed for vp 0x%p for CLAIM_NULL with %m",
1776 			    (void *)vp);
1777 			failed_msg = "Couldn't reopen: vtoname failed for "
1778 			    "CLAIM_NULL";
1779 			/* nothing allocated yet */
1780 			goto kill_file;
1781 		}
1782 
1783 		open_args->open_claim4_u.cfile = fn;
1784 	} else if (claim == CLAIM_PREVIOUS) {
1785 
1786 		/*
1787 		 * We have two cases to deal with here:
1788 		 * 1) We're being called to reopen files in order to satisfy
1789 		 *    a lock operation request which requires us to explicitly
1790 		 *    reopen files which were opened under a delegation.  If
1791 		 *    we're in recovery, we *must* use CLAIM_PREVIOUS.  In
1792 		 *    that case, frc_use_claim_previous is TRUE and we must
1793 		 *    use the rnode's current delegation type (r_deleg_type).
1794 		 * 2) We're reopening files during some form of recovery.
1795 		 *    In this case, frc_use_claim_previous is FALSE and we
1796 		 *    use the delegation type appropriate for recovery
1797 		 *    (r_deleg_needs_recovery).
1798 		 */
1799 		mutex_enter(&rp->r_statev4_lock);
1800 		open_args->open_claim4_u.delegate_type =
1801 			frc_use_claim_previous ?
1802 				rp->r_deleg_type :
1803 				rp->r_deleg_needs_recovery;
1804 		mutex_exit(&rp->r_statev4_lock);
1805 
1806 	} else if (claim == CLAIM_DELEGATE_CUR) {
1807 
1808 		if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) {
1809 			nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname "
1810 			    "failed for vp 0x%p for CLAIM_DELEGATE_CUR "
1811 			    "with %m", (void *)vp);
1812 			failed_msg = "Couldn't reopen: vtoname failed for "
1813 			    "CLAIM_DELEGATE_CUR";
1814 			/* nothing allocated yet */
1815 			goto kill_file;
1816 		}
1817 
1818 		mutex_enter(&rp->r_statev4_lock);
1819 		open_args->open_claim4_u.delegate_cur_info.delegate_stateid =
1820 							rp->r_deleg_stateid;
1821 		mutex_exit(&rp->r_statev4_lock);
1822 
1823 		open_args->open_claim4_u.delegate_cur_info.cfile = fn;
1824 	}
1825 	open_args->opentype = OPEN4_NOCREATE;
1826 	open_args->owner.clientid = mi2clientid(mi);
1827 	open_args->owner.owner_len = sizeof (oop->oo_name);
1828 	open_args->owner.owner_val =
1829 			kmem_alloc(open_args->owner.owner_len, KM_SLEEP);
1830 	bcopy(&oop->oo_name, open_args->owner.owner_val,
1831 			open_args->owner.owner_len);
1832 	open_args->share_access = 0;
1833 	open_args->share_deny = 0;
1834 
1835 	mutex_enter(&osp->os_sync_lock);
1836 	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, "nfs4_reopen: osp %p rp "
1837 	    "%p: read acc %"PRIu64" write acc %"PRIu64": open ref count %d: "
1838 	    "mmap read %"PRIu64" mmap write %"PRIu64" claim %d ",
1839 	    (void *)osp, (void *)rp, osp->os_share_acc_read,
1840 	    osp->os_share_acc_write, osp->os_open_ref_count,
1841 	    osp->os_mmap_read, osp->os_mmap_write, claim));
1842 
1843 	if (osp->os_share_acc_read || osp->os_mmap_read)
1844 		open_args->share_access |= OPEN4_SHARE_ACCESS_READ;
1845 	if (osp->os_share_acc_write || osp->os_mmap_write)
1846 		open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE;
1847 	if (osp->os_share_deny_read)
1848 		open_args->share_deny |= OPEN4_SHARE_DENY_READ;
1849 	if (osp->os_share_deny_write)
1850 		open_args->share_deny |= OPEN4_SHARE_DENY_WRITE;
1851 	mutex_exit(&osp->os_sync_lock);
1852 
1853 	seqid = nfs4_get_open_seqid(oop) + 1;
1854 	open_args->seqid = seqid;
1855 
1856 	/* Construct the getfh part of the compound */
1857 	argop[2].argop = OP_GETFH;
1858 
1859 	/* Construct the getattr part of the compound */
1860 	argop[3].argop = OP_GETATTR;
1861 	argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1862 	argop[3].nfs_argop4_u.opgetattr.mi = mi;
1863 
1864 	t = gethrtime();
1865 
1866 	rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep);
1867 
1868 	if (ep->error) {
1869 		if (!is_recov && !frc_use_claim_previous &&
1870 		    (ep->error == EINTR || ep->error == ETIMEDOUT ||
1871 		    NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp))) {
1872 			nfs4open_save_lost_rqst(ep->error, &lost_rqst, oop,
1873 				cred_otw, vp, NULL, open_args);
1874 			abort = nfs4_start_recovery(ep,
1875 				    VTOMI4(vp), vp, NULL, NULL,
1876 				    lost_rqst.lr_op == OP_OPEN ?
1877 				    &lost_rqst : NULL, OP_OPEN, NULL);
1878 			nfs4args_copen_free(open_args);
1879 			goto bailout;
1880 		}
1881 
1882 		nfs4args_copen_free(open_args);
1883 
1884 		if (ep->error == EACCES && cred_otw != cr) {
1885 			crfree(cred_otw);
1886 			cred_otw = cr;
1887 			crhold(cred_otw);
1888 			nfs4_end_open_seqid_sync(oop);
1889 			open_owner_rele(oop);
1890 			oop = NULL;
1891 			goto top;
1892 		}
1893 		if (ep->error == ETIMEDOUT)
1894 			goto bailout;
1895 		failed_msg = "Couldn't reopen: rpc error";
1896 		goto kill_file;
1897 	}
1898 
1899 	if (nfs4_need_to_bump_seqid(&res))
1900 		nfs4_set_open_seqid(seqid, oop, args.ctag);
1901 
1902 	switch (res.status) {
1903 	case NFS4_OK:
1904 		if (recov.rs_flags & NFS4_RS_DELAY_MSG) {
1905 			mutex_enter(&rp->r_statelock);
1906 			rp->r_delay_interval = 0;
1907 			mutex_exit(&rp->r_statelock);
1908 		}
1909 		break;
1910 	case NFS4ERR_BAD_SEQID:
1911 		bsep = nfs4_create_bseqid_entry(oop, NULL, vp, 0,
1912 			args.ctag, open_args->seqid);
1913 
1914 		abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL,
1915 			    NULL, lost_rqst.lr_op == OP_OPEN ? &lost_rqst :
1916 			    NULL, OP_OPEN, bsep);
1917 
1918 		nfs4args_copen_free(open_args);
1919 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1920 		nfs4_end_open_seqid_sync(oop);
1921 		open_owner_rele(oop);
1922 		oop = NULL;
1923 		kmem_free(bsep, sizeof (*bsep));
1924 
1925 		goto kill_file;
1926 	case NFS4ERR_NO_GRACE:
1927 		nfs4args_copen_free(open_args);
1928 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1929 		nfs4_end_open_seqid_sync(oop);
1930 		open_owner_rele(oop);
1931 		oop = NULL;
1932 		if (claim == CLAIM_PREVIOUS) {
1933 			/*
1934 			 * Retry as a plain open. We don't need to worry about
1935 			 * checking the changeinfo: it is acceptable for a
1936 			 * client to re-open a file and continue processing
1937 			 * (in the absence of locks).
1938 			 */
1939 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1940 			    "nfs4_reopen: CLAIM_PREVIOUS: NFS4ERR_NO_GRACE; "
1941 			    "will retry as CLAIM_NULL"));
1942 			claim = CLAIM_NULL;
1943 			nfs4_mi_kstat_inc_no_grace(mi);
1944 			goto top;
1945 		}
1946 		failed_msg =
1947 		    "Couldn't reopen: tried reclaim outside grace period. ";
1948 		goto kill_file;
1949 	case NFS4ERR_GRACE:
1950 		nfs4_set_grace_wait(mi);
1951 		nfs4args_copen_free(open_args);
1952 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1953 		nfs4_end_open_seqid_sync(oop);
1954 		open_owner_rele(oop);
1955 		oop = NULL;
1956 		ep->error = nfs4_wait_for_grace(mi, &recov);
1957 		if (ep->error != 0)
1958 			goto bailout;
1959 		goto top;
1960 	case NFS4ERR_DELAY:
1961 		nfs4_set_delay_wait(vp);
1962 		nfs4args_copen_free(open_args);
1963 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1964 		nfs4_end_open_seqid_sync(oop);
1965 		open_owner_rele(oop);
1966 		oop = NULL;
1967 		ep->error = nfs4_wait_for_delay(vp, &recov);
1968 		nfs4_mi_kstat_inc_delay(mi);
1969 		if (ep->error != 0)
1970 			goto bailout;
1971 		goto top;
1972 	case NFS4ERR_FHEXPIRED:
1973 		/* recover filehandle and retry */
1974 		abort = nfs4_start_recovery(ep,
1975 				mi, vp, NULL, NULL, NULL, OP_OPEN, NULL);
1976 		nfs4args_copen_free(open_args);
1977 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1978 		nfs4_end_open_seqid_sync(oop);
1979 		open_owner_rele(oop);
1980 		oop = NULL;
1981 		if (abort == FALSE)
1982 			goto top;
1983 		failed_msg = "Couldn't reopen: recovery aborted";
1984 		goto kill_file;
1985 	case NFS4ERR_RESOURCE:
1986 	case NFS4ERR_STALE_CLIENTID:
1987 	case NFS4ERR_WRONGSEC:
1988 	case NFS4ERR_EXPIRED:
1989 		/*
1990 		 * Do not mark the file dead and let the calling
1991 		 * function initiate recovery.
1992 		 */
1993 		nfs4args_copen_free(open_args);
1994 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1995 		nfs4_end_open_seqid_sync(oop);
1996 		open_owner_rele(oop);
1997 		oop = NULL;
1998 		goto bailout;
1999 	case NFS4ERR_ACCESS:
2000 		if (cred_otw != cr) {
2001 			crfree(cred_otw);
2002 			cred_otw = cr;
2003 			crhold(cred_otw);
2004 			nfs4args_copen_free(open_args);
2005 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2006 			nfs4_end_open_seqid_sync(oop);
2007 			open_owner_rele(oop);
2008 			oop = NULL;
2009 			goto top;
2010 		}
2011 		/* fall through */
2012 	default:
2013 		NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
2014 		    "nfs4_reopen: r_server 0x%p, mi_curr_serv 0x%p, rnode %s",
2015 		    (void*)VTOR4(vp)->r_server, (void*)mi->mi_curr_serv,
2016 		    rnode4info(VTOR4(vp))));
2017 		failed_msg = "Couldn't reopen: NFSv4 error";
2018 		nfs4args_copen_free(open_args);
2019 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2020 		goto kill_file;
2021 	}
2022 
2023 	resop = &res.array[1];  /* open res */
2024 	op_res = &resop->nfs_resop4_u.opopen;
2025 
2026 	garp = &res.array[3].nfs_resop4_u.opgetattr.ga_res;
2027 
2028 	/*
2029 	 * Check if the path we reopened really is the same
2030 	 * file. We could end up in a situation where the file
2031 	 * was removed and a new file created with the same name.
2032 	 */
2033 	resop = &res.array[2];
2034 	gf_res = &resop->nfs_resop4_u.opgetfh;
2035 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
2036 	fh_different = (nfs4cmpfh(&rp->r_fh->sfh_fh, &gf_res->object) != 0);
2037 	if (fh_different) {
2038 		if (mi->mi_fh_expire_type == FH4_PERSISTENT ||
2039 		    mi->mi_fh_expire_type & FH4_NOEXPIRE_WITH_OPEN) {
2040 			/* Oops, we don't have the same file */
2041 			if (mi->mi_fh_expire_type == FH4_PERSISTENT)
2042 				failed_msg = "Couldn't reopen: Persistent "
2043 				    "file handle changed";
2044 			else
2045 				failed_msg = "Couldn't reopen: Volatile "
2046 				    "(no expire on open) file handle changed";
2047 
2048 			nfs4args_copen_free(open_args);
2049 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2050 			nfs_rw_exit(&mi->mi_fh_lock);
2051 			goto kill_file;
2052 
2053 		} else {
2054 			/*
2055 			 * We have volatile file handles that don't compare.
2056 			 * If the fids are the same then we assume that the
2057 			 * file handle expired but the rnode still refers to
2058 			 * the same file object.
2059 			 *
2060 			 * First check that we have fids or not.
2061 			 * If we don't we have a dumb server so we will
2062 			 * just assume every thing is ok for now.
2063 			 */
2064 			if (!ep->error && garp->n4g_va.va_mask & AT_NODEID &&
2065 			    rp->r_attr.va_mask & AT_NODEID &&
2066 			    rp->r_attr.va_nodeid != garp->n4g_va.va_nodeid) {
2067 				/*
2068 				 * We have fids, but they don't
2069 				 * compare. So kill the file.
2070 				 */
2071 				failed_msg =
2072 					"Couldn't reopen: file handle changed"
2073 				    " due to mismatched fids";
2074 				nfs4args_copen_free(open_args);
2075 				(void) xdr_free(xdr_COMPOUND4res_clnt,
2076 						(caddr_t)&res);
2077 				nfs_rw_exit(&mi->mi_fh_lock);
2078 				goto kill_file;
2079 			} else {
2080 				/*
2081 				 * We have volatile file handles that refers
2082 				 * to the same file (at least they have the
2083 				 * same fid) or we don't have fids so we
2084 				 * can't tell. :(. We'll be a kind and accepting
2085 				 * client so we'll update the rnode's file
2086 				 * handle with the otw handle.
2087 				 *
2088 				 * We need to drop mi->mi_fh_lock since
2089 				 * sh4_update acquires it. Since there is
2090 				 * only one recovery thread there is no
2091 				 * race.
2092 				 */
2093 				nfs_rw_exit(&mi->mi_fh_lock);
2094 				sfh4_update(rp->r_fh, &gf_res->object);
2095 			}
2096 		}
2097 	} else {
2098 		nfs_rw_exit(&mi->mi_fh_lock);
2099 	}
2100 
2101 	ASSERT(nfs4_consistent_type(vp));
2102 
2103 	/*
2104 	 * If the server wanted an OPEN_CONFIRM but that fails, just start
2105 	 * over.  Presumably if there is a persistent error it will show up
2106 	 * when we resend the OPEN.
2107 	 */
2108 	if (op_res->rflags & OPEN4_RESULT_CONFIRM) {
2109 		bool_t retry_open = FALSE;
2110 
2111 		nfs4open_confirm(vp, &seqid, &op_res->stateid,
2112 					cred_otw, is_recov, &retry_open,
2113 					oop, FALSE, ep, NULL);
2114 		if (ep->error || ep->stat) {
2115 			nfs4args_copen_free(open_args);
2116 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2117 			nfs4_end_open_seqid_sync(oop);
2118 			open_owner_rele(oop);
2119 			oop = NULL;
2120 			goto top;
2121 		}
2122 	}
2123 
2124 	mutex_enter(&osp->os_sync_lock);
2125 	osp->open_stateid = op_res->stateid;
2126 	osp->os_delegation = 0;
2127 	/*
2128 	 * Need to reset this bitfield for the possible case where we were
2129 	 * going to OTW CLOSE the file, got a non-recoverable error, and before
2130 	 * we could retry the CLOSE, OPENed the file again.
2131 	 */
2132 	ASSERT(osp->os_open_owner->oo_seqid_inuse);
2133 	osp->os_final_close = 0;
2134 	osp->os_force_close = 0;
2135 	if (claim == CLAIM_DELEGATE_CUR || claim == CLAIM_PREVIOUS)
2136 		osp->os_dc_openacc = open_args->share_access;
2137 	mutex_exit(&osp->os_sync_lock);
2138 
2139 	nfs4_end_open_seqid_sync(oop);
2140 
2141 	/* accept delegation, if any */
2142 	nfs4_delegation_accept(rp, claim, op_res, garp, cred_otw);
2143 
2144 	nfs4args_copen_free(open_args);
2145 
2146 	nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
2147 
2148 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2149 
2150 	ASSERT(nfs4_consistent_type(vp));
2151 
2152 	open_owner_rele(oop);
2153 	crfree(cr);
2154 	crfree(cred_otw);
2155 	return;
2156 
2157 kill_file:
2158 	nfs4_fail_recov(vp, failed_msg, ep->error, ep->stat);
2159 failed_reopen:
2160 	NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE,
2161 	    "nfs4_reopen: setting os_failed_reopen for osp %p, cr %p, rp %s",
2162 	    (void *)osp, (void *)cr, rnode4info(rp)));
2163 	mutex_enter(&osp->os_sync_lock);
2164 	osp->os_failed_reopen = 1;
2165 	mutex_exit(&osp->os_sync_lock);
2166 bailout:
2167 	if (oop != NULL) {
2168 		nfs4_end_open_seqid_sync(oop);
2169 		open_owner_rele(oop);
2170 	}
2171 	if (cr != NULL)
2172 		crfree(cr);
2173 	if (cred_otw != NULL)
2174 		crfree(cred_otw);
2175 }
2176 
2177 /* for . and .. OPENs */
2178 /* ARGSUSED */
2179 static int
2180 nfs4_open_non_reg_file(vnode_t **vpp, int flag, cred_t *cr)
2181 {
2182 	rnode4_t *rp;
2183 	nfs4_ga_res_t gar;
2184 
2185 	ASSERT(nfs_zone() == VTOMI4(*vpp)->mi_zone);
2186 
2187 	/*
2188 	 * If close-to-open consistency checking is turned off or
2189 	 * if there is no cached data, we can avoid
2190 	 * the over the wire getattr.  Otherwise, force a
2191 	 * call to the server to get fresh attributes and to
2192 	 * check caches. This is required for close-to-open
2193 	 * consistency.
2194 	 */
2195 	rp = VTOR4(*vpp);
2196 	if (VTOMI4(*vpp)->mi_flags & MI4_NOCTO ||
2197 			(rp->r_dir == NULL && !nfs4_has_pages(*vpp)))
2198 		return (0);
2199 
2200 	gar.n4g_va.va_mask = AT_ALL;
2201 	return (nfs4_getattr_otw(*vpp, &gar, cr, 0));
2202 }
2203 
2204 /*
2205  * CLOSE a file
2206  */
2207 static int
2208 nfs4_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
2209 {
2210 	rnode4_t	*rp;
2211 	int		 error = 0;
2212 	int		 r_error = 0;
2213 	int		 n4error = 0;
2214 	nfs4_error_t	 e = { 0, NFS4_OK, RPC_SUCCESS };
2215 
2216 	/*
2217 	 * Remove client state for this (lockowner, file) pair.
2218 	 * Issue otw v4 call to have the server do the same.
2219 	 */
2220 
2221 	rp = VTOR4(vp);
2222 
2223 	/*
2224 	 * zone_enter(2) prevents processes from changing zones with NFS files
2225 	 * open; if we happen to get here from the wrong zone we can't do
2226 	 * anything over the wire.
2227 	 */
2228 	if (VTOMI4(vp)->mi_zone != nfs_zone()) {
2229 		/*
2230 		 * We could attempt to clean up locks, except we're sure
2231 		 * that the current process didn't acquire any locks on
2232 		 * the file: any attempt to lock a file belong to another zone
2233 		 * will fail, and one can't lock an NFS file and then change
2234 		 * zones, as that fails too.
2235 		 *
2236 		 * Returning an error here is the sane thing to do.  A
2237 		 * subsequent call to VN_RELE() which translates to a
2238 		 * nfs4_inactive() will clean up state: if the zone of the
2239 		 * vnode's origin is still alive and kicking, the inactive
2240 		 * thread will handle the request (from the correct zone), and
2241 		 * everything (minus the OTW close call) should be OK.  If the
2242 		 * zone is going away nfs4_async_inactive() will throw away
2243 		 * delegations, open streams and cached pages inline.
2244 		 */
2245 		return (EIO);
2246 	}
2247 
2248 	/*
2249 	 * If we are using local locking for this filesystem, then
2250 	 * release all of the SYSV style record locks.  Otherwise,
2251 	 * we are doing network locking and we need to release all
2252 	 * of the network locks.  All of the locks held by this
2253 	 * process on this file are released no matter what the
2254 	 * incoming reference count is.
2255 	 */
2256 	if (VTOMI4(vp)->mi_flags & MI4_LLOCK) {
2257 		cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
2258 		cleanshares(vp, ttoproc(curthread)->p_pid);
2259 	} else
2260 		e.error = nfs4_lockrelease(vp, flag, offset, cr);
2261 
2262 	if (e.error)
2263 		return (e.error);
2264 
2265 	if (count > 1)
2266 		return (0);
2267 
2268 	/*
2269 	 * If the file has been `unlinked', then purge the
2270 	 * DNLC so that this vnode will get reycled quicker
2271 	 * and the .nfs* file on the server will get removed.
2272 	 */
2273 	if (rp->r_unldvp != NULL)
2274 		dnlc_purge_vp(vp);
2275 
2276 	/*
2277 	 * If the file was open for write and there are pages,
2278 	 * do a synchronous flush and commit of all of the
2279 	 * dirty and uncommitted pages.
2280 	 */
2281 	ASSERT(!e.error);
2282 	if ((flag & FWRITE) && nfs4_has_pages(vp))
2283 		error = nfs4_putpage_commit(vp, 0, 0, cr);
2284 
2285 	mutex_enter(&rp->r_statelock);
2286 	r_error = rp->r_error;
2287 	rp->r_error = 0;
2288 	mutex_exit(&rp->r_statelock);
2289 
2290 	/*
2291 	 * If this file type is one for which no explicit 'open' was
2292 	 * done, then bail now (ie. no need for protocol 'close'). If
2293 	 * there was an error w/the vm subsystem, return _that_ error,
2294 	 * otherwise, return any errors that may've been reported via
2295 	 * the rnode.
2296 	 */
2297 	if (vp->v_type != VREG)
2298 		return (error ? error : r_error);
2299 
2300 	/*
2301 	 * The sync putpage commit may have failed above, but since
2302 	 * we're working w/a regular file, we need to do the protocol
2303 	 * 'close' (nfs4close_one will figure out if an otw close is
2304 	 * needed or not). Report any errors _after_ doing the protocol
2305 	 * 'close'.
2306 	 */
2307 	nfs4close_one(vp, NULL, cr, flag, NULL, &e, CLOSE_NORM, 0, 0, 0);
2308 	n4error = e.error ? e.error : geterrno4(e.stat);
2309 
2310 	/*
2311 	 * Error reporting prio (Hi -> Lo)
2312 	 *
2313 	 *   i) nfs4_putpage_commit (error)
2314 	 *  ii) rnode's (r_error)
2315 	 * iii) nfs4close_one (n4error)
2316 	 */
2317 	return (error ? error : (r_error ? r_error : n4error));
2318 }
2319 
2320 /*
2321  * Initialize *lost_rqstp.
2322  */
2323 
2324 static void
2325 nfs4close_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
2326 	nfs4_open_owner_t *oop, nfs4_open_stream_t *osp, cred_t *cr,
2327 	vnode_t *vp)
2328 {
2329 	if (error != ETIMEDOUT && error != EINTR &&
2330 	    !NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
2331 		lost_rqstp->lr_op = 0;
2332 		return;
2333 	}
2334 
2335 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2336 			"nfs4close_save_lost_rqst: error %d", error));
2337 
2338 	lost_rqstp->lr_op = OP_CLOSE;
2339 	/*
2340 	 * The vp is held and rele'd via the recovery code.
2341 	 * See nfs4_save_lost_rqst.
2342 	 */
2343 	lost_rqstp->lr_vp = vp;
2344 	lost_rqstp->lr_dvp = NULL;
2345 	lost_rqstp->lr_oop = oop;
2346 	lost_rqstp->lr_osp = osp;
2347 	ASSERT(osp != NULL);
2348 	ASSERT(mutex_owned(&osp->os_sync_lock));
2349 	osp->os_pending_close = 1;
2350 	lost_rqstp->lr_lop = NULL;
2351 	lost_rqstp->lr_cr = cr;
2352 	lost_rqstp->lr_flk = NULL;
2353 	lost_rqstp->lr_putfirst = FALSE;
2354 }
2355 
2356 /*
      * Send the over-the-wire CLOSE for the open stream 'osp' of rnode 'rp'.
      * The compound issued is PUTFH, GETATTR, CLOSE.
      *
2357  * Assumes you already have the open seqid sync grabbed as well as the
2358  * 'os_sync_lock'.  Note: this will release the open seqid sync and
2359  * 'os_sync_lock' if client recovery starts.  Calling functions have to
2360  * be prepared to handle this.
      * When that happens, '*have_sync_lockp' and '*did_start_seqid_syncp'
      * are both set to 0 so the caller knows what it no longer holds.
2361  *
2362  * 'recov' is returned as 1 if the CLOSE operation detected client recovery
2363  * was needed and was started, and that the calling function should retry
2364  * this function; otherwise it is returned as 0.
2365  *
      * 'close_type' selects the compound tag (TAG_CLOSE_LOST for
      * CLOSE_RESEND, TAG_CLOSE_UNDO for CLOSE_AFTER_RESEND, TAG_CLOSE
      * otherwise).  A CLOSE_RESEND is never saved as a lost request, and
      * neither CLOSE_RESEND nor CLOSE_AFTER_RESEND ever asks for a retry.
      *
      * On success the open stream gets the stateid returned by the server,
      * is marked invalid (os_valid = 0) and loses the reference obtained at
      * OPEN; the rnode's created_v4 flag is cleared and the GETATTR results
      * are handed to nfs4_attr_cache().
      *
2366  * Errors are returned via the nfs4_error_t parameter.
2367  */
2368 static void
2369 nfs4close_otw(rnode4_t *rp, cred_t *cred_otw, nfs4_open_owner_t *oop,
2370 	nfs4_open_stream_t *osp, int *recov, int *did_start_seqid_syncp,
2371 	nfs4_close_type_t close_type, nfs4_error_t *ep, int *have_sync_lockp)
2372 {
2373 	COMPOUND4args_clnt args;
2374 	COMPOUND4res_clnt res;
2375 	CLOSE4args *close_args;
2376 	nfs_resop4 *resop;
2377 	nfs_argop4 argop[3];
2378 	int doqueue = 1;
2379 	mntinfo4_t *mi;
2380 	seqid4 seqid;
2381 	vnode_t *vp;
2382 	bool_t needrecov = FALSE;
2383 	nfs4_lost_rqst_t lost_rqst;
2384 	hrtime_t t;
2385 
2386 	ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone);
2387 
2388 	ASSERT(MUTEX_HELD(&osp->os_sync_lock));
2389 
2390 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw"));
2391 
2392 	/* Only set this to 1 if recovery is started */
2393 	*recov = 0;
2394 
2395 	/* do the OTW call to close the file */
2396 
2397 	if (close_type == CLOSE_RESEND)
2398 		args.ctag = TAG_CLOSE_LOST;
2399 	else if (close_type == CLOSE_AFTER_RESEND)
2400 		args.ctag = TAG_CLOSE_UNDO;
2401 	else
2402 		args.ctag = TAG_CLOSE;
2403 
2404 	args.array_len = 3;
2405 	args.array = argop;
2406 
2407 	vp = RTOV4(rp);
2408 
2409 	mi = VTOMI4(vp);
2410 
2411 	/* putfh target fh */
2412 	argop[0].argop = OP_CPUTFH;
2413 	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
2414 
     	/* getattr; the results are fed to nfs4_attr_cache() on success */
2415 	argop[1].argop = OP_GETATTR;
2416 	argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
2417 	argop[1].nfs_argop4_u.opgetattr.mi = mi;
2418 
     	/* close, using the open owner's next seqid and the current stateid */
2419 	argop[2].argop = OP_CLOSE;
2420 	close_args = &argop[2].nfs_argop4_u.opclose;
2421 
2422 	seqid = nfs4_get_open_seqid(oop) + 1;
2423 
2424 	close_args->seqid = seqid;
2425 	close_args->open_stateid = osp->open_stateid;
2426 
     	/*
     	 * NOTE(review): 'needrecov' is still its initial FALSE here, so
     	 * this message always reports a "first" call.
     	 */
2427 	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
2428 	    "nfs4close_otw: %s call, rp %s", needrecov ? "recov" : "first",
2429 	    rnode4info(rp)));
2430 
     	/* timestamp taken before the call; passed to nfs4_attr_cache() */
2431 	t = gethrtime();
2432 
2433 	rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep);
2434 
     	/* record the seqid we used only if the reply says it was consumed */
2435 	if (!ep->error && nfs4_need_to_bump_seqid(&res)) {
2436 		nfs4_set_open_seqid(seqid, oop, args.ctag);
2437 	}
2438 
2439 	needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp);
2440 	if (ep->error && !needrecov) {
2441 		/*
2442 		 * if there was an error and no recovery is to be done
2443 		 * then set up the file to flush its cache if
2444 		 * needed for the next caller.
2445 		 */
2446 		mutex_enter(&rp->r_statelock);
2447 		PURGE_ATTRCACHE4_LOCKED(rp);
2448 		rp->r_flags &= ~R4WRITEMODIFIED;
2449 		mutex_exit(&rp->r_statelock);
2450 		return;
2451 	}
2452 
2453 	if (needrecov) {
2454 		bool_t abort;
2455 		nfs4_bseqid_entry_t *bsep = NULL;
2456 
     		/*
     		 * 'lost_rqst' is only initialized when this is not a
     		 * resend; the nfs4_start_recovery() call below checks
     		 * close_type before looking at lost_rqst.lr_op.
     		 */
2457 		if (close_type != CLOSE_RESEND)
2458 			nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop,
2459 				osp, cred_otw, vp);
2460 
2461 		if (!ep->error && res.status == NFS4ERR_BAD_SEQID)
2462 			bsep = nfs4_create_bseqid_entry(oop, NULL, vp,
2463 				0, args.ctag, close_args->seqid);
2464 
2465 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2466 			"nfs4close_otw: initiating recovery. error %d "
2467 			"res.status %d", ep->error, res.status));
2468 
2469 		/*
2470 		 * Drop the 'os_sync_lock' here so we don't hit
2471 		 * a potential recursive mutex_enter via an
2472 		 * 'open_stream_hold()'.
2473 		 */
2474 		mutex_exit(&osp->os_sync_lock);
2475 		*have_sync_lockp = 0;
2476 		abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL,
2477 			    (close_type != CLOSE_RESEND &&
2478 			    lost_rqst.lr_op == OP_CLOSE) ? &lost_rqst : NULL,
2479 			    OP_CLOSE, bsep);
2480 
2481 		/* drop open seq sync, and let the calling function regrab it */
2482 		nfs4_end_open_seqid_sync(oop);
2483 		*did_start_seqid_syncp = 0;
2484 
2485 		if (bsep)
2486 			kmem_free(bsep, sizeof (*bsep));
2487 		/*
2488 		 * For signals, the caller wants to quit, so don't say to
2489 		 * retry.  For forced unmount, if it's a user thread, it
2490 		 * wants to quit.  If it's a recovery thread, the retry
2491 		 * will happen higher-up on the call stack.  Either way,
2492 		 * don't say to retry.
2493 		 */
2494 		if (abort == FALSE && ep->error != EINTR &&
2495 		    !NFS4_FRC_UNMT_ERR(ep->error, mi->mi_vfsp) &&
2496 		    close_type != CLOSE_RESEND &&
2497 		    close_type != CLOSE_AFTER_RESEND)
2498 			*recov = 1;
2499 		else
2500 			*recov = 0;
2501 
     		/* results are only freed when the call itself did not fail */
2502 		if (!ep->error)
2503 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2504 		return;
2505 	}
2506 
     	/* an NFS-level failure that does not need recovery: just give up */
2507 	if (res.status) {
2508 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2509 		return;
2510 	}
2511 
2512 	mutex_enter(&rp->r_statev4_lock);
2513 	rp->created_v4 = 0;
2514 	mutex_exit(&rp->r_statev4_lock);
2515 
     	/* res.array[2] is the CLOSE result; keep its stateid */
2516 	resop = &res.array[2];
2517 	osp->open_stateid = resop->nfs_resop4_u.opclose.open_stateid;
2518 	osp->os_valid = 0;
2519 
2520 	/*
2521 	 * This removes the reference obtained at OPEN; ie, when the
2522 	 * open stream structure was created.
2523 	 *
2524 	 * We don't have to worry about calling 'open_stream_rele'
2525 	 * since we are currently holding a reference to the open
2526 	 * stream which means the count cannot go to 0 with this
2527 	 * decrement.
2528 	 */
2529 	ASSERT(osp->os_ref_count >= 2);
2530 	osp->os_ref_count--;
2531 
     	/*
     	 * NOTE(review): both error paths above return, so ep->error
     	 * appears to always be 0 by the time we get here.
     	 */
2532 	if (!ep->error)
2533 		nfs4_attr_cache(vp,
2534 				&res.array[1].nfs_resop4_u.opgetattr.ga_res,
2535 				t, cred_otw, TRUE, NULL);
2536 
2537 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw:"
2538 		" returning %d", ep->error));
2539 
2540 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2541 }
2542 
2543 /* ARGSUSED */
2544 static int
2545 nfs4_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
2546 	caller_context_t *ct)
2547 {
2548 	rnode4_t *rp;
2549 	u_offset_t off;
2550 	offset_t diff;
2551 	uint_t on;
2552 	uint_t n;
2553 	caddr_t base;
2554 	uint_t flags;
2555 	int error;
2556 	mntinfo4_t *mi;
2557 
2558 	rp = VTOR4(vp);
2559 
2560 	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
2561 
2562 	if (IS_SHADOW(vp, rp))
2563 		vp = RTOV4(rp);
2564 
2565 	if (vp->v_type != VREG)
2566 		return (EISDIR);
2567 
2568 	mi = VTOMI4(vp);
2569 
2570 	if (nfs_zone() != mi->mi_zone)
2571 		return (EIO);
2572 
2573 	if (uiop->uio_resid == 0)
2574 		return (0);
2575 
2576 	if (uiop->uio_loffset < 0 || uiop->uio_loffset + uiop->uio_resid < 0)
2577 		return (EINVAL);
2578 
2579 	mutex_enter(&rp->r_statelock);
2580 	if (rp->r_flags & R4RECOVERRP)
2581 		error = (rp->r_error ? rp->r_error : EIO);
2582 	else
2583 		error = 0;
2584 	mutex_exit(&rp->r_statelock);
2585 	if (error)
2586 		return (error);
2587 
2588 	/*
2589 	 * Bypass VM if caching has been disabled (e.g., locking) or if
2590 	 * using client-side direct I/O and the file is not mmap'd and
2591 	 * there are no cached pages.
2592 	 */
2593 	if ((vp->v_flag & VNOCACHE) ||
2594 	    (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) &&
2595 	    rp->r_mapcnt == 0 && !nfs4_has_pages(vp))) {
2596 		size_t resid = 0;
2597 
2598 		return (nfs4read(vp, NULL, uiop->uio_loffset,
2599 				uiop->uio_resid, &resid, cr, FALSE, uiop));
2600 	}
2601 
2602 	error = 0;
2603 
2604 	do {
2605 		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
2606 		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
2607 		n = MIN(MAXBSIZE - on, uiop->uio_resid);
2608 
2609 		if (error = nfs4_validate_caches(vp, cr))
2610 			break;
2611 
2612 		mutex_enter(&rp->r_statelock);
2613 		diff = rp->r_size - uiop->uio_loffset;
2614 		mutex_exit(&rp->r_statelock);
2615 		if (diff <= 0)
2616 			break;
2617 		if (diff < n)
2618 			n = (uint_t)diff;
2619 
2620 		base = segmap_getmapflt(segkmap, vp, off + on, n, 1, S_READ);
2621 
2622 		error = uiomove(base + on, n, UIO_READ, uiop);
2623 
2624 		if (!error) {
2625 			/*
2626 			 * If read a whole block or read to eof,
2627 			 * won't need this buffer again soon.
2628 			 */
2629 			mutex_enter(&rp->r_statelock);
2630 			if (n + on == MAXBSIZE ||
2631 			    uiop->uio_loffset == rp->r_size)
2632 				flags = SM_DONTNEED;
2633 			else
2634 				flags = 0;
2635 			mutex_exit(&rp->r_statelock);
2636 			error = segmap_release(segkmap, base, flags);
2637 		} else
2638 			(void) segmap_release(segkmap, base, 0);
2639 	} while (!error && uiop->uio_resid > 0);
2640 
2641 	return (error);
2642 }
2643 
2644 /* ARGSUSED */
2645 static int
2646 nfs4_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
2647 		caller_context_t *ct)
2648 {
2649 	rlim64_t limit = uiop->uio_llimit;
2650 	rnode4_t *rp;
2651 	u_offset_t off;
2652 	caddr_t base;
2653 	uint_t flags;
2654 	int remainder;
2655 	size_t n;
2656 	int on;
2657 	int error;
2658 	int resid;
2659 	u_offset_t offset;
2660 	mntinfo4_t *mi;
2661 	uint_t bsize;
2662 
2663 	rp = VTOR4(vp);
2664 
2665 	if (IS_SHADOW(vp, rp))
2666 		vp = RTOV4(rp);
2667 
2668 	if (vp->v_type != VREG)
2669 		return (EISDIR);
2670 
2671 	mi = VTOMI4(vp);
2672 
2673 	if (nfs_zone() != mi->mi_zone)
2674 		return (EIO);
2675 
2676 	if (uiop->uio_resid == 0)
2677 		return (0);
2678 
2679 	mutex_enter(&rp->r_statelock);
2680 	if (rp->r_flags & R4RECOVERRP)
2681 		error = (rp->r_error ? rp->r_error : EIO);
2682 	else
2683 		error = 0;
2684 	mutex_exit(&rp->r_statelock);
2685 	if (error)
2686 		return (error);
2687 
2688 	if (ioflag & FAPPEND) {
2689 		struct vattr va;
2690 
2691 		/*
2692 		 * Must serialize if appending.
2693 		 */
2694 		if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) {
2695 			nfs_rw_exit(&rp->r_rwlock);
2696 			if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER,
2697 			    INTR(vp)))
2698 				return (EINTR);
2699 		}
2700 
2701 		va.va_mask = AT_SIZE;
2702 		error = nfs4getattr(vp, &va, cr);
2703 		if (error)
2704 			return (error);
2705 		uiop->uio_loffset = va.va_size;
2706 	}
2707 
2708 	offset = uiop->uio_loffset + uiop->uio_resid;
2709 
2710 	if (uiop->uio_loffset < (offset_t)0 || offset < 0)
2711 		return (EINVAL);
2712 
2713 	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
2714 		limit = MAXOFFSET_T;
2715 
2716 	/*
2717 	 * Check to make sure that the process will not exceed
2718 	 * its limit on file size.  It is okay to write up to
2719 	 * the limit, but not beyond.  Thus, the write which
2720 	 * reaches the limit will be short and the next write
2721 	 * will return an error.
2722 	 */
2723 	remainder = 0;
2724 	if (offset > uiop->uio_llimit) {
2725 		remainder = offset - uiop->uio_llimit;
2726 		uiop->uio_resid = uiop->uio_llimit - uiop->uio_loffset;
2727 		if (uiop->uio_resid <= 0) {
2728 			proc_t *p = ttoproc(curthread);
2729 
2730 			uiop->uio_resid += remainder;
2731 			mutex_enter(&p->p_lock);
2732 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
2733 			    p->p_rctls, p, RCA_UNSAFE_SIGINFO);
2734 			mutex_exit(&p->p_lock);
2735 			return (EFBIG);
2736 		}
2737 	}
2738 
2739 	/* update the change attribute, if we have a write delegation */
2740 
2741 	mutex_enter(&rp->r_statev4_lock);
2742 	if (rp->r_deleg_type == OPEN_DELEGATE_WRITE)
2743 		rp->r_deleg_change++;
2744 
2745 	mutex_exit(&rp->r_statev4_lock);
2746 
2747 	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp)))
2748 		return (EINTR);
2749 
2750 	/*
2751 	 * Bypass VM if caching has been disabled (e.g., locking) or if
2752 	 * using client-side direct I/O and the file is not mmap'd and
2753 	 * there are no cached pages.
2754 	 */
2755 	if ((vp->v_flag & VNOCACHE) ||
2756 	    (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) &&
2757 	    rp->r_mapcnt == 0 && !nfs4_has_pages(vp))) {
2758 		size_t bufsize;
2759 		int count;
2760 		u_offset_t org_offset;
2761 		stable_how4 stab_comm;
2762 nfs4_fwrite:
2763 		if (rp->r_flags & R4STALE) {
2764 			resid = uiop->uio_resid;
2765 			offset = uiop->uio_loffset;
2766 			error = rp->r_error;
2767 			goto bottom;
2768 		}
2769 
2770 		bufsize = MIN(uiop->uio_resid, mi->mi_stsize);
2771 		base = kmem_alloc(bufsize, KM_SLEEP);
2772 		do {
2773 			if (ioflag & FDSYNC)
2774 				stab_comm = DATA_SYNC4;
2775 			else
2776 				stab_comm = FILE_SYNC4;
2777 			resid = uiop->uio_resid;
2778 			offset = uiop->uio_loffset;
2779 			count = MIN(uiop->uio_resid, bufsize);
2780 			org_offset = uiop->uio_loffset;
2781 			error = uiomove(base, count, UIO_WRITE, uiop);
2782 			if (!error) {
2783 				error = nfs4write(vp, base, org_offset,
2784 						count, cr, &stab_comm);
2785 				if (!error) {
2786 					mutex_enter(&rp->r_statelock);
2787 					if (rp->r_size < uiop->uio_loffset)
2788 						rp->r_size = uiop->uio_loffset;
2789 					mutex_exit(&rp->r_statelock);
2790 				}
2791 			}
2792 		} while (!error && uiop->uio_resid > 0);
2793 		kmem_free(base, bufsize);
2794 		goto bottom;
2795 	}
2796 
2797 	bsize = vp->v_vfsp->vfs_bsize;
2798 
2799 	do {
2800 		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
2801 		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
2802 		n = MIN(MAXBSIZE - on, uiop->uio_resid);
2803 
2804 		resid = uiop->uio_resid;
2805 		offset = uiop->uio_loffset;
2806 
2807 		if (rp->r_flags & R4STALE) {
2808 			error = rp->r_error;
2809 			break;
2810 		}
2811 
2812 		/*
2813 		 * Don't create dirty pages faster than they
2814 		 * can be cleaned so that the system doesn't
2815 		 * get imbalanced.  If the async queue is
2816 		 * maxed out, then wait for it to drain before
2817 		 * creating more dirty pages.  Also, wait for
2818 		 * any threads doing pagewalks in the vop_getattr
2819 		 * entry points so that they don't block for
2820 		 * long periods.
2821 		 */
2822 		mutex_enter(&rp->r_statelock);
2823 		while ((mi->mi_max_threads != 0 &&
2824 		    rp->r_awcount > 2 * mi->mi_max_threads) ||
2825 		    rp->r_gcount > 0)
2826 			cv_wait(&rp->r_cv, &rp->r_statelock);
2827 		mutex_exit(&rp->r_statelock);
2828 
2829 		if (segmap_kpm) {
2830 			int pon = uiop->uio_loffset & PAGEOFFSET;
2831 			size_t pn = MIN(PAGESIZE - pon, uiop->uio_resid);
2832 			int pagecreate;
2833 
2834 			mutex_enter(&rp->r_statelock);
2835 			pagecreate = (pon == 0) && (pn == PAGESIZE ||
2836 				uiop->uio_loffset + pn >= rp->r_size);
2837 			mutex_exit(&rp->r_statelock);
2838 
2839 			base = segmap_getmapflt(segkmap, vp, off + on,
2840 						pn, !pagecreate, S_WRITE);
2841 
2842 			error = writerp4(rp, base + pon, n, uiop, pagecreate);
2843 
2844 		} else {
2845 			base = segmap_getmapflt(segkmap, vp, off + on,
2846 						n, 0, S_READ);
2847 			error = writerp4(rp, base + on, n, uiop, 0);
2848 		}
2849 
2850 		if (!error) {
2851 			if (mi->mi_flags & MI4_NOAC)
2852 				flags = SM_WRITE;
2853 			else if ((uiop->uio_loffset % bsize) == 0 ||
2854 			    IS_SWAPVP(vp)) {
2855 				/*
2856 				 * Have written a whole block.
2857 				 * Start an asynchronous write
2858 				 * and mark the buffer to
2859 				 * indicate that it won't be
2860 				 * needed again soon.
2861 				 */
2862 				flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
2863 			} else
2864 				flags = 0;
2865 			if ((ioflag & (FSYNC|FDSYNC)) ||
2866 			    (rp->r_flags & R4OUTOFSPACE)) {
2867 				flags &= ~SM_ASYNC;
2868 				flags |= SM_WRITE;
2869 			}
2870 			error = segmap_release(segkmap, base, flags);
2871 		} else {
2872 			(void) segmap_release(segkmap, base, 0);
2873 			/*
2874 			 * In the event that we got an access error while
2875 			 * faulting in a page for a write-only file just
2876 			 * force a write.
2877 			 */
2878 			if (error == EACCES)
2879 				goto nfs4_fwrite;
2880 		}
2881 	} while (!error && uiop->uio_resid > 0);
2882 
2883 bottom:
2884 	if (error) {
2885 		uiop->uio_resid = resid + remainder;
2886 		uiop->uio_loffset = offset;
2887 	} else {
2888 		uiop->uio_resid += remainder;
2889 
2890 		mutex_enter(&rp->r_statev4_lock);
2891 		if (rp->r_deleg_type == OPEN_DELEGATE_WRITE) {
2892 			gethrestime(&rp->r_attr.va_mtime);
2893 			rp->r_attr.va_ctime = rp->r_attr.va_mtime;
2894 		}
2895 		mutex_exit(&rp->r_statev4_lock);
2896 	}
2897 
2898 	nfs_rw_exit(&rp->r_lkserlock);
2899 
2900 	return (error);
2901 }
2902 
2903 /*
2904  * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
2905  */
2906 static int
2907 nfs4_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
2908 	int flags, cred_t *cr)
2909 {
2910 	struct buf *bp;
2911 	int error;
2912 	page_t *savepp;
2913 	uchar_t fsdata;
2914 	stable_how4 stab_comm;
2915 
2916 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
2917 	bp = pageio_setup(pp, len, vp, flags);
2918 	ASSERT(bp != NULL);
2919 
2920 	/*
2921 	 * pageio_setup should have set b_addr to 0.  This
2922 	 * is correct since we want to do I/O on a page
2923 	 * boundary.  bp_mapin will use this addr to calculate
2924 	 * an offset, and then set b_addr to the kernel virtual
2925 	 * address it allocated for us.
2926 	 */
2927 	ASSERT(bp->b_un.b_addr == 0);
2928 
2929 	bp->b_edev = 0;
2930 	bp->b_dev = 0;
2931 	bp->b_lblkno = lbtodb(off);
2932 	bp->b_file = vp;
2933 	bp->b_offset = (offset_t)off;
2934 	bp_mapin(bp);
2935 
2936 	if ((flags & (B_WRITE|B_ASYNC)) == (B_WRITE|B_ASYNC) &&
2937 	    freemem > desfree)
2938 		stab_comm = UNSTABLE4;
2939 	else
2940 		stab_comm = FILE_SYNC4;
2941 
2942 	error = nfs4_bio(bp, &stab_comm, cr, FALSE);
2943 
2944 	bp_mapout(bp);
2945 	pageio_done(bp);
2946 
2947 	if (stab_comm == UNSTABLE4)
2948 		fsdata = C_DELAYCOMMIT;
2949 	else
2950 		fsdata = C_NOCOMMIT;
2951 
2952 	savepp = pp;
2953 	do {
2954 		pp->p_fsdata = fsdata;
2955 	} while ((pp = pp->p_next) != savepp);
2956 
2957 	return (error);
2958 }
2959 
2960 /*
2961  */
2962 static int
2963 nfs4rdwr_check_osid(vnode_t *vp, nfs4_error_t *ep, cred_t *cr)
2964 {
2965 	nfs4_open_owner_t	*oop;
2966 	nfs4_open_stream_t	*osp;
2967 	rnode4_t		*rp = VTOR4(vp);
2968 	mntinfo4_t 		*mi = VTOMI4(vp);
2969 	int 			reopen_needed;
2970 
2971 	ASSERT(nfs_zone() == mi->mi_zone);
2972 
2973 
2974 	oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
2975 	if (!oop)
2976 		return (EIO);
2977 
2978 	/* returns with 'os_sync_lock' held */
2979 	osp = find_open_stream(oop, rp);
2980 	if (!osp) {
2981 		open_owner_rele(oop);
2982 		return (EIO);
2983 	}
2984 
2985 	if (osp->os_failed_reopen) {
2986 		mutex_exit(&osp->os_sync_lock);
2987 		open_stream_rele(osp, rp);
2988 		open_owner_rele(oop);
2989 		return (EIO);
2990 	}
2991 
2992 	/*
2993 	 * Determine whether a reopen is needed.  If this
2994 	 * is a delegation open stream, then the os_delegation bit
2995 	 * should be set.
2996 	 */
2997 
2998 	reopen_needed = osp->os_delegation;
2999 
3000 	mutex_exit(&osp->os_sync_lock);
3001 	open_owner_rele(oop);
3002 
3003 	if (reopen_needed) {
3004 		nfs4_error_zinit(ep);
3005 		nfs4_reopen(vp, osp, ep, CLAIM_NULL, FALSE, FALSE);
3006 		mutex_enter(&osp->os_sync_lock);
3007 		if (ep->error || ep->stat || osp->os_failed_reopen) {
3008 			mutex_exit(&osp->os_sync_lock);
3009 			open_stream_rele(osp, rp);
3010 			return (EIO);
3011 		}
3012 		mutex_exit(&osp->os_sync_lock);
3013 	}
3014 	open_stream_rele(osp, rp);
3015 
3016 	return (0);
3017 }
3018 
3019 /*
3020  * Write to file.  Writes to remote server in largest size
3021  * chunks that the server can handle.  Write is synchronous.
3022  */
3023 static int
3024 nfs4write(vnode_t *vp, caddr_t base, u_offset_t offset, int count, cred_t *cr,
3025 	stable_how4 *stab_comm)
3026 {
3027 	mntinfo4_t *mi;
3028 	COMPOUND4args_clnt args;
3029 	COMPOUND4res_clnt res;
3030 	WRITE4args *wargs;
3031 	WRITE4res *wres;
3032 	nfs_argop4 argop[2];
3033 	nfs_resop4 *resop;
3034 	int tsize;
3035 	stable_how4 stable;
3036 	rnode4_t *rp;
3037 	int doqueue = 1;
3038 	bool_t needrecov;
3039 	nfs4_recov_state_t recov_state;
3040 	nfs4_stateid_types_t sid_types;
3041 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3042 
3043 	rp = VTOR4(vp);
3044 	mi = VTOMI4(vp);
3045 
3046 	ASSERT(nfs_zone() == mi->mi_zone);
3047 
3048 	stable = *stab_comm;
3049 	*stab_comm = FILE_SYNC4;
3050 
3051 	needrecov = FALSE;
3052 	recov_state.rs_flags = 0;
3053 	recov_state.rs_num_retry_despite_err = 0;
3054 	nfs4_init_stateid_types(&sid_types);
3055 
3056 recov_retry:
3057 	args.ctag = TAG_WRITE;
3058 	args.array_len = 2;
3059 	args.array = argop;
3060 
3061 	e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3062 			    &recov_state, NULL);
3063 	if (e.error)
3064 		return (e.error);
3065 
3066 	/* 0. putfh target fh */
3067 	argop[0].argop = OP_CPUTFH;
3068 	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
3069 
3070 	/* 1. write */
3071 	nfs4args_write(&argop[1], stable, rp, cr, &wargs, &sid_types);
3072 
3073 	do {
3074 
3075 		wargs->offset = (offset4)offset;
3076 		wargs->data_val = base;
3077 
3078 		if (mi->mi_io_kstats) {
3079 			mutex_enter(&mi->mi_lock);
3080 			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
3081 			mutex_exit(&mi->mi_lock);
3082 		}
3083 
3084 		if ((vp->v_flag & VNOCACHE) ||
3085 		    (rp->r_flags & R4DIRECTIO) ||
3086 		    (mi->mi_flags & MI4_DIRECTIO))
3087 			tsize = MIN(mi->mi_stsize, count);
3088 		else
3089 			tsize = MIN(mi->mi_curwrite, count);
3090 		wargs->data_len = (uint_t)tsize;
3091 		rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
3092 
3093 		if (mi->mi_io_kstats) {
3094 			mutex_enter(&mi->mi_lock);
3095 			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
3096 			mutex_exit(&mi->mi_lock);
3097 		}
3098 
3099 		needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
3100 		if (e.error && !needrecov) {
3101 			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3102 				&recov_state, needrecov);
3103 			return (e.error);
3104 		}
3105 
3106 
3107 		/*
3108 		 * Do handling of OLD_STATEID outside
3109 		 * of the normal recovery framework.
3110 		 *
3111 		 * If write receives a BAD stateid error while using a
3112 		 * delegation stateid, retry using the open stateid (if it
3113 		 * exists).  If it doesn't have an open stateid, reopen the
3114 		 * file first, then retry.
3115 		 */
3116 		if (!e.error && res.status == NFS4ERR_OLD_STATEID &&
3117 		    sid_types.cur_sid_type != SPEC_SID) {
3118 			nfs4_save_stateid(&wargs->stateid, &sid_types);
3119 			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3120 				&recov_state, needrecov);
3121 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3122 			goto recov_retry;
3123 		} else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID &&
3124 			    sid_types.cur_sid_type == DEL_SID) {
3125 			nfs4_save_stateid(&wargs->stateid, &sid_types);
3126 			mutex_enter(&rp->r_statev4_lock);
3127 			rp->r_deleg_return_pending = TRUE;
3128 			mutex_exit(&rp->r_statev4_lock);
3129 			if (nfs4rdwr_check_osid(vp, &e, cr)) {
3130 				nfs4_end_fop(mi, vp, NULL, OH_WRITE,
3131 					&recov_state, needrecov);
3132 				(void) xdr_free(xdr_COMPOUND4res_clnt,
3133 								(caddr_t)&res);
3134 				return (EIO);
3135 			}
3136 			nfs4_end_fop(mi, vp, NULL, OH_WRITE,
3137 				&recov_state, needrecov);
3138 			/* hold needed for nfs4delegreturn_thread */
3139 			VN_HOLD(vp);
3140 			nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN|
3141 				NFS4_DR_DISCARD), FALSE);
3142 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3143 			goto recov_retry;
3144 		}
3145 
3146 		if (needrecov) {
3147 			bool_t abort;
3148 
3149 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3150 				"nfs4write: client got error %d, res.status %d"
3151 				", so start recovery", e.error, res.status));
3152 
3153 			abort = nfs4_start_recovery(&e,
3154 				    VTOMI4(vp), vp, NULL, &wargs->stateid,
3155 				    NULL, OP_WRITE, NULL);
3156 			if (!e.error) {
3157 				e.error = geterrno4(res.status);
3158 				(void) xdr_free(xdr_COMPOUND4res_clnt,
3159 								(caddr_t)&res);
3160 			}
3161 			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3162 				&recov_state, needrecov);
3163 			if (abort == FALSE)
3164 				goto recov_retry;
3165 			return (e.error);
3166 		}
3167 
3168 		if (res.status) {
3169 			e.error = geterrno4(res.status);
3170 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3171 			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3172 				&recov_state, needrecov);
3173 			return (e.error);
3174 		}
3175 
3176 		resop = &res.array[1];	/* write res */
3177 		wres = &resop->nfs_resop4_u.opwrite;
3178 
3179 		if ((int)wres->count > tsize) {
3180 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3181 
3182 			zcmn_err(getzoneid(), CE_WARN,
3183 			"nfs4write: server wrote %u, requested was %u",
3184 			    (int)wres->count, tsize);
3185 			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3186 				&recov_state, needrecov);
3187 			return (EIO);
3188 		}
3189 		if (wres->committed == UNSTABLE4) {
3190 			*stab_comm = UNSTABLE4;
3191 			if (wargs->stable == DATA_SYNC4 ||
3192 			    wargs->stable == FILE_SYNC4) {
3193 				(void) xdr_free(xdr_COMPOUND4res_clnt,
3194 								(caddr_t)&res);
3195 				zcmn_err(getzoneid(), CE_WARN,
3196 					"nfs4write: server %s did not commit "
3197 					"to stable storage",
3198 					rp->r_server->sv_hostname);
3199 				nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3200 						&recov_state, needrecov);
3201 				return (EIO);
3202 			}
3203 		}
3204 
3205 		tsize = (int)wres->count;
3206 		count -= tsize;
3207 		base += tsize;
3208 		offset += tsize;
3209 		if (mi->mi_io_kstats) {
3210 			mutex_enter(&mi->mi_lock);
3211 			KSTAT_IO_PTR(mi->mi_io_kstats)->writes++;
3212 			KSTAT_IO_PTR(mi->mi_io_kstats)->nwritten +=
3213 			    tsize;
3214 			mutex_exit(&mi->mi_lock);
3215 		}
3216 		lwp_stat_update(LWP_STAT_OUBLK, 1);
3217 		mutex_enter(&rp->r_statelock);
3218 		if (rp->r_flags & R4HAVEVERF) {
3219 			if (rp->r_writeverf != wres->writeverf) {
3220 				nfs4_set_mod(vp);
3221 				rp->r_writeverf = wres->writeverf;
3222 			}
3223 		} else {
3224 			rp->r_writeverf = wres->writeverf;
3225 			rp->r_flags |= R4HAVEVERF;
3226 		}
3227 		PURGE_ATTRCACHE4_LOCKED(rp);
3228 		rp->r_flags |= R4WRITEMODIFIED;
3229 		gethrestime(&rp->r_attr.va_mtime);
3230 		rp->r_attr.va_ctime = rp->r_attr.va_mtime;
3231 		mutex_exit(&rp->r_statelock);
3232 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3233 	} while (count);
3234 
3235 	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, &recov_state, needrecov);
3236 
3237 	return (e.error);
3238 }
3239 
3240 /*
3241  * Read from a file.  Reads data in largest chunks our interface can handle.
3242  */
3243 static int
3244 nfs4read(vnode_t *vp, caddr_t base, offset_t offset, int count,
3245 	size_t *residp, cred_t *cr, bool_t async, struct uio *uiop)
3246 {
3247 	mntinfo4_t *mi;
3248 	COMPOUND4args_clnt args;
3249 	COMPOUND4res_clnt res;
3250 	READ4args *rargs;
3251 	nfs_argop4 argop[2];
3252 	int tsize;
3253 	int doqueue;
3254 	rnode4_t *rp;
3255 	int data_len;
3256 	bool_t is_eof;
3257 	bool_t needrecov = FALSE;
3258 	nfs4_recov_state_t recov_state;
3259 	nfs4_stateid_types_t sid_types;
3260 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3261 
3262 	rp = VTOR4(vp);
3263 	mi = VTOMI4(vp);
3264 	doqueue = 1;
3265 
3266 	ASSERT(nfs_zone() == mi->mi_zone);
3267 
3268 	args.ctag = async ? TAG_READAHEAD : TAG_READ;
3269 
3270 	args.array_len = 2;
3271 	args.array = argop;
3272 
3273 	nfs4_init_stateid_types(&sid_types);
3274 
3275 	recov_state.rs_flags = 0;
3276 	recov_state.rs_num_retry_despite_err = 0;
3277 
3278 recov_retry:
3279 	e.error = nfs4_start_fop(mi, vp, NULL, OH_READ,
3280 			    &recov_state, NULL);
3281 	if (e.error)
3282 		return (e.error);
3283 
3284 	/* putfh target fh */
3285 	argop[0].argop = OP_CPUTFH;
3286 	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
3287 
3288 	/* read */
3289 	argop[1].argop = OP_READ;
3290 	rargs = &argop[1].nfs_argop4_u.opread;
3291 	rargs->stateid = nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi,
3292 				OP_READ, &sid_types, async);
3293 
3294 	do {
3295 		if (mi->mi_io_kstats) {
3296 			mutex_enter(&mi->mi_lock);
3297 			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
3298 			mutex_exit(&mi->mi_lock);
3299 		}
3300 
3301 		NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
3302 		    "nfs4read: %s call, rp %s",
3303 		    needrecov ? "recov" : "first",
3304 		    rnode4info(rp)));
3305 
3306 		if ((vp->v_flag & VNOCACHE) ||
3307 		    (rp->r_flags & R4DIRECTIO) ||
3308 		    (mi->mi_flags & MI4_DIRECTIO))
3309 			tsize = MIN(mi->mi_tsize, count);
3310 		else
3311 			tsize = MIN(mi->mi_curread, count);
3312 		rargs->offset = (offset4)offset;
3313 		rargs->count = (count4)tsize;
3314 		rargs->res_data_val_alt = NULL;
3315 		rargs->res_mblk = NULL;
3316 		rargs->res_uiop = NULL;
3317 		rargs->res_maxsize = 0;
3318 		if (uiop)
3319 			rargs->res_uiop = uiop;
3320 		else
3321 			rargs->res_data_val_alt = base;
3322 		rargs->res_maxsize = tsize;
3323 
3324 		rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
3325 #ifdef	DEBUG
3326 		if (nfs4read_error_inject) {
3327 			res.status = nfs4read_error_inject;
3328 			nfs4read_error_inject = 0;
3329 		}
3330 #endif
3331 
3332 		if (mi->mi_io_kstats) {
3333 			mutex_enter(&mi->mi_lock);
3334 			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
3335 			mutex_exit(&mi->mi_lock);
3336 		}
3337 
3338 		needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
3339 		if (e.error != 0 && !needrecov) {
3340 			nfs4_end_fop(mi, vp, NULL, OH_READ,
3341 				&recov_state, needrecov);
3342 			return (e.error);
3343 		}
3344 
3345 		/*
3346 		 * Do proper retry for OLD and BAD stateid errors outside
3347 		 * of the normal recovery framework.  There are two differences
3348 		 * between async and sync reads.  The first is that we allow
3349 		 * retry on BAD_STATEID for async reads, but not sync reads.
3350 		 * The second is that we mark the file dead for a failed
3351 		 * attempt with a special stateid for sync reads, but just
3352 		 * return EIO for async reads.
3353 		 *
3354 		 * If a sync read receives a BAD stateid error while using a
3355 		 * delegation stateid, retry using the open stateid (if it
3356 		 * exists).  If it doesn't have an open stateid, reopen the
3357 		 * file first, then retry.
3358 		 */
3359 		if (e.error == 0 && (res.status == NFS4ERR_OLD_STATEID ||
3360 		    res.status == NFS4ERR_BAD_STATEID) && async) {
3361 			nfs4_end_fop(mi, vp, NULL, OH_READ,
3362 				&recov_state, needrecov);
3363 			if (sid_types.cur_sid_type == SPEC_SID) {
3364 				(void) xdr_free(xdr_COMPOUND4res_clnt,
3365 						(caddr_t)&res);
3366 				return (EIO);
3367 			}
3368 			nfs4_save_stateid(&rargs->stateid, &sid_types);
3369 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3370 			goto recov_retry;
3371 		} else if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
3372 			    !async && sid_types.cur_sid_type != SPEC_SID) {
3373 			nfs4_save_stateid(&rargs->stateid, &sid_types);
3374 			nfs4_end_fop(mi, vp, NULL, OH_READ,
3375 				&recov_state, needrecov);
3376 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3377 			goto recov_retry;
3378 		} else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID &&
3379 			    sid_types.cur_sid_type == DEL_SID) {
3380 			nfs4_save_stateid(&rargs->stateid, &sid_types);
3381 			mutex_enter(&rp->r_statev4_lock);
3382 			rp->r_deleg_return_pending = TRUE;
3383 			mutex_exit(&rp->r_statev4_lock);
3384 			if (nfs4rdwr_check_osid(vp, &e, cr)) {
3385 				nfs4_end_fop(mi, vp, NULL, OH_READ,
3386 					&recov_state, needrecov);
3387 				(void) xdr_free(xdr_COMPOUND4res_clnt,
3388 				    (caddr_t)&res);
3389 				return (EIO);
3390 			}
3391 			nfs4_end_fop(mi, vp, NULL, OH_READ,
3392 				&recov_state, needrecov);
3393 			/* hold needed for nfs4delegreturn_thread */
3394 			VN_HOLD(vp);
3395 			nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN|
3396 				NFS4_DR_DISCARD), FALSE);
3397 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3398 			goto recov_retry;
3399 		}
3400 		if (needrecov) {
3401 			bool_t abort;
3402 
3403 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3404 			    "nfs4read: initiating recovery\n"));
3405 
3406 			abort = nfs4_start_recovery(&e,
3407 				    mi, vp, NULL, &rargs->stateid,
3408 				    NULL, OP_READ, NULL);
3409 			nfs4_end_fop(mi, vp, NULL, OH_READ,
3410 				&recov_state, needrecov);
3411 			/*
3412 			 * Do not retry if we got OLD_STATEID using a special
3413 			 * stateid.  This avoids looping with a broken server.
3414 			 */
3415 			if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
3416 			    sid_types.cur_sid_type == SPEC_SID)
3417 				abort = TRUE;
3418 
3419 			if (abort == FALSE) {
3420 				/*
3421 				 * Need to retry all possible stateids in
3422 				 * case the recovery error wasn't stateid
3423 				 * related or the stateids have become
3424 				 * stale (server reboot).
3425 				 */
3426 				nfs4_init_stateid_types(&sid_types);
3427 				(void) xdr_free(xdr_COMPOUND4res_clnt,
3428 						(caddr_t)&res);
3429 				goto recov_retry;
3430 			}
3431 
3432 			if (!e.error) {
3433 				e.error = geterrno4(res.status);
3434 				(void) xdr_free(xdr_COMPOUND4res_clnt,
3435 						(caddr_t)&res);
3436 			}
3437 			return (e.error);
3438 		}
3439 
3440 		if (res.status) {
3441 			e.error = geterrno4(res.status);
3442 			nfs4_end_fop(mi, vp, NULL, OH_READ,
3443 				&recov_state, needrecov);
3444 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3445 			return (e.error);
3446 		}
3447 
3448 		data_len = res.array[1].nfs_resop4_u.opread.data_len;
3449 		count -= data_len;
3450 		if (base)
3451 			base += data_len;
3452 		offset += data_len;
3453 		if (mi->mi_io_kstats) {
3454 			mutex_enter(&mi->mi_lock);
3455 			KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
3456 			KSTAT_IO_PTR(mi->mi_io_kstats)->nread += data_len;
3457 			mutex_exit(&mi->mi_lock);
3458 		}
3459 		lwp_stat_update(LWP_STAT_INBLK, 1);
3460 		is_eof = res.array[1].nfs_resop4_u.opread.eof;
3461 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3462 
3463 	} while (count && !is_eof);
3464 
3465 	*residp = count;
3466 
3467 	nfs4_end_fop(mi, vp, NULL, OH_READ, &recov_state, needrecov);
3468 
3469 	return (e.error);
3470 }
3471 
3472 /* ARGSUSED */
3473 static int
3474 nfs4_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
3475 {
3476 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
3477 		return (EIO);
3478 	switch (cmd) {
3479 		case _FIODIRECTIO:
3480 			return (nfs4_directio(vp, (int)arg, cr));
3481 		default:
3482 			return (ENOTTY);
3483 	}
3484 }
3485 
3486 static int
3487 nfs4_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr)
3488 {
3489 	int error;
3490 	rnode4_t *rp = VTOR4(vp);
3491 
3492 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
3493 		return (EIO);
3494 	/*
3495 	 * If it has been specified that the return value will
3496 	 * just be used as a hint, and we are only being asked
3497 	 * for size, fsid or rdevid, then return the client's
3498 	 * notion of these values without checking to make sure
3499 	 * that the attribute cache is up to date.
3500 	 * The whole point is to avoid an over the wire GETATTR
3501 	 * call.
3502 	 */
3503 	if (flags & ATTR_HINT) {
3504 		if (vap->va_mask ==
3505 		    (vap->va_mask & (AT_SIZE | AT_FSID | AT_RDEV))) {
3506 			mutex_enter(&rp->r_statelock);
3507 			if (vap->va_mask | AT_SIZE)
3508 				vap->va_size = rp->r_size;
3509 			if (vap->va_mask | AT_FSID)
3510 				vap->va_fsid = rp->r_attr.va_fsid;
3511 			if (vap->va_mask | AT_RDEV)
3512 				vap->va_rdev = rp->r_attr.va_rdev;
3513 			mutex_exit(&rp->r_statelock);
3514 			return (0);
3515 		}
3516 	}
3517 
3518 	/*
3519 	 * Only need to flush pages if asking for the mtime
3520 	 * and if there any dirty pages or any outstanding
3521 	 * asynchronous (write) requests for this file.
3522 	 */
3523 	if (vap->va_mask & AT_MTIME) {
3524 		rp = VTOR4(vp);
3525 		if (nfs4_has_pages(vp)) {
3526 			mutex_enter(&rp->r_statev4_lock);
3527 			if (rp->r_deleg_type != OPEN_DELEGATE_WRITE) {
3528 				mutex_exit(&rp->r_statev4_lock);
3529 				if (rp->r_flags & R4DIRTY ||
3530 				    rp->r_awcount > 0) {
3531 					mutex_enter(&rp->r_statelock);
3532 					rp->r_gcount++;
3533 					mutex_exit(&rp->r_statelock);
3534 					error =
3535 						nfs4_putpage(vp, (u_offset_t)0,
3536 								0, 0, cr);
3537 					mutex_enter(&rp->r_statelock);
3538 					if (error && (error == ENOSPC ||
3539 							error == EDQUOT)) {
3540 						if (!rp->r_error)
3541 							rp->r_error = error;
3542 					}
3543 					if (--rp->r_gcount == 0)
3544 						cv_broadcast(&rp->r_cv);
3545 					mutex_exit(&rp->r_statelock);
3546 				}
3547 			} else {
3548 				mutex_exit(&rp->r_statev4_lock);
3549 			}
3550 		}
3551 	}
3552 	return (nfs4getattr(vp, vap, cr));
3553 }
3554 
/*
 * Compare a mode reported by the server with the mode held on the client,
 * ignoring the set-uid and set-gid bits of the client's copy.
 *
 * Returns 0 (OK) when `from_server' equals `on_client' with S_ISUID and
 * S_ISGID masked off, i.e. when those are the only two bits that may have
 * been cleared on the server; returns 1 (BAD) otherwise.
 */
int
nfs4_compare_modes(mode_t from_server, mode_t on_client)
{
	mode_t expected = on_client & ~(S_ISUID|S_ISGID);

	return (expected != from_server);
}
3569 
3570 /*ARGSUSED4*/
3571 static int
3572 nfs4_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
3573 		caller_context_t *ct)
3574 {
3575 	if (vap->va_mask & AT_NOSET)
3576 		return (EINVAL);
3577 
3578 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
3579 		return (EIO);
3580 
3581 	/*
3582 	 * Don't call secpolicy_vnode_setattr, the client cannot
3583 	 * use its cached attributes to make security decisions
3584 	 * as the server may be faking mode bits or mapping uid/gid.
3585 	 * Always just let the server to the checking.
3586 	 * If we provide the ability to remove basic priviledges
3587 	 * to setattr (e.g. basic without chmod) then we will
3588 	 * need to add a check here before calling the server.
3589 	 */
3590 
3591 	return (nfs4setattr(vp, vap, flags, cr, NULL));
3592 }
3593 
3594 /*
3595  * To replace the "guarded" version 3 setattr, we use two types of compound
3596  * setattr requests:
3597  * 1. The "normal" setattr, used when the size of the file isn't being
3598  *    changed - { Putfh <fh>; Setattr; Getattr }/
3599  * 2. If the size is changed, precede Setattr with: Getattr; Verify
3600  *    with only ctime as the argument. If the server ctime differs from
3601  *    what is cached on the client, the verify will fail, but we would
3602  *    already have the ctime from the preceding getattr, so just set it
3603  *    and retry. Thus the compound here is - { Putfh <fh>; Getattr; Verify;
3604  *	Setattr; Getattr }.
3605  *
3606  * The vsecattr_t * input parameter will be non-NULL if ACLs are being set in
3607  * this setattr and NULL if they are not.
3608  */
3609 static int
3610 nfs4setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
3611 		vsecattr_t *vsap)
3612 {
3613 	COMPOUND4args_clnt args;
3614 	COMPOUND4res_clnt res, *resp = NULL;
3615 	nfs4_ga_res_t *garp = NULL;
3616 	int numops = 3;			/* { Putfh; Setattr; Getattr } */
3617 	nfs_argop4 argop[5];
3618 	int verify_argop = -1;
3619 	int setattr_argop = 1;
3620 	nfs_resop4 *resop;
3621 	vattr_t va;
3622 	rnode4_t *rp;
3623 	int doqueue = 1;
3624 	uint_t mask = vap->va_mask;
3625 	mode_t omode;
3626 	vsecattr_t *vsp;
3627 	timestruc_t ctime;
3628 	bool_t needrecov = FALSE;
3629 	nfs4_recov_state_t recov_state;
3630 	nfs4_stateid_types_t sid_types;
3631 	stateid4 stateid;
3632 	hrtime_t t;
3633 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3634 	servinfo4_t *svp;
3635 	bitmap4 supp_attrs;
3636 
3637 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
3638 	rp = VTOR4(vp);
3639 	nfs4_init_stateid_types(&sid_types);
3640 
3641 	/*
3642 	 * Only need to flush pages if there are any pages and
3643 	 * if the file is marked as dirty in some fashion.  The
3644 	 * file must be flushed so that we can accurately
3645 	 * determine the size of the file and the cached data
3646 	 * after the SETATTR returns.  A file is considered to
3647 	 * be dirty if it is either marked with R4DIRTY, has
3648 	 * outstanding i/o's active, or is mmap'd.  In this
3649 	 * last case, we can't tell whether there are dirty
3650 	 * pages, so we flush just to be sure.
3651 	 */
3652 	if (nfs4_has_pages(vp) &&
3653 	    ((rp->r_flags & R4DIRTY) ||
3654 	    rp->r_count > 0 ||
3655 	    rp->r_mapcnt > 0)) {
3656 		ASSERT(vp->v_type != VCHR);
3657 		e.error = nfs4_putpage(vp, (offset_t)0, 0, 0, cr);
3658 		if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) {
3659 			mutex_enter(&rp->r_statelock);
3660 			if (!rp->r_error)
3661 				rp->r_error = e.error;
3662 			mutex_exit(&rp->r_statelock);
3663 		}
3664 	}
3665 
3666 	if (mask & AT_SIZE) {
3667 		/*
3668 		 * Verification setattr compound for non-deleg AT_SIZE:
3669 		 *	{ Putfh; Getattr; Verify; Setattr; Getattr }
3670 		 * Set ctime local here (outside the do_again label)
3671 		 * so that subsequent retries (after failed VERIFY)
3672 		 * will use ctime from GETATTR results (from failed
3673 		 * verify compound) as VERIFY arg.
3674 		 * If file has delegation, then VERIFY(time_metadata)
3675 		 * is of little added value, so don't bother.
3676 		 */
3677 		mutex_enter(&rp->r_statev4_lock);
3678 		if (rp->r_deleg_type == OPEN_DELEGATE_NONE ||
3679 						rp->r_deleg_return_pending) {
3680 			numops = 5;
3681 			ctime = rp->r_attr.va_ctime;
3682 		}
3683 		mutex_exit(&rp->r_statev4_lock);
3684 	}
3685 
3686 	recov_state.rs_flags = 0;
3687 	recov_state.rs_num_retry_despite_err = 0;
3688 
3689 	args.ctag = TAG_SETATTR;
3690 do_again:
3691 recov_retry:
3692 	setattr_argop = numops - 2;
3693 
3694 	args.array = argop;
3695 	args.array_len = numops;
3696 
3697 	e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state);
3698 	if (e.error)
3699 		return (e.error);
3700 
3701 
3702 	/* putfh target fh */
3703 	argop[0].argop = OP_CPUTFH;
3704 	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
3705 
3706 	if (numops == 5) {
3707 		/*
3708 		 * We only care about the ctime, but need to get mtime
3709 		 * and size for proper cache update.
3710 		 */
3711 		/* getattr */
3712 		argop[1].argop = OP_GETATTR;
3713 		argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
3714 		argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
3715 
3716 		/* verify - set later in loop */
3717 		verify_argop = 2;
3718 	}
3719 
3720 	/* setattr */
3721 	svp = rp->r_server;
3722 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
3723 	supp_attrs = svp->sv_supp_attrs;
3724 	nfs_rw_exit(&svp->sv_lock);
3725 
3726 	nfs4args_setattr(&argop[setattr_argop], vap, vsap, flags, rp, cr,
3727 		supp_attrs, &e.error, &sid_types);
3728 	stateid = argop[setattr_argop].nfs_argop4_u.opsetattr.stateid;
3729 	if (e.error) {
3730 		/* req time field(s) overflow - return immediately */
3731 		nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);
3732 		nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
3733 						opsetattr.obj_attributes);
3734 		return (e.error);
3735 	}
3736 	omode = rp->r_attr.va_mode;
3737 
3738 	/* getattr */
3739 	argop[numops-1].argop = OP_GETATTR;
3740 	argop[numops-1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
3741 	/*
3742 	 * If we are setting the ACL (indicated only by vsap != NULL), request
3743 	 * the ACL in this getattr.  The ACL returned from this getattr will be
3744 	 * used in updating the ACL cache.
3745 	 */
3746 	if (vsap != NULL)
3747 		argop[numops-1].nfs_argop4_u.opgetattr.attr_request |=
3748 		    FATTR4_ACL_MASK;
3749 	argop[numops-1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
3750 
3751 	/*
3752 	 * setattr iterates if the object size is set and the cached ctime
3753 	 * does not match the file ctime. In that case, verify the ctime first.
3754 	 */
3755 
3756 	do {
3757 		if (verify_argop != -1) {
3758 			/*
3759 			 * Verify that the ctime match before doing setattr.
3760 			 */
3761 			va.va_mask = AT_CTIME;
3762 			va.va_ctime = ctime;
3763 			svp = rp->r_server;
3764 			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
3765 			supp_attrs = svp->sv_supp_attrs;
3766 			nfs_rw_exit(&svp->sv_lock);
3767 			e.error = nfs4args_verify(&argop[verify_argop], &va,
3768 					OP_VERIFY, supp_attrs);
3769 			if (e.error) {
3770 				/* req time field(s) overflow - return */
3771 				nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
3772 					needrecov);
3773 				break;
3774 			}
3775 		}
3776 
3777 		doqueue = 1;
3778 
3779 		t = gethrtime();
3780 
3781 		rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e);
3782 
3783 		/*
3784 		 * Purge the access cache and ACL cache if changing either the
3785 		 * owner of the file, the group owner, or the mode.  These may
3786 		 * change the access permissions of the file, so purge old
3787 		 * information and start over again.
3788 		 */
3789 		if (mask & (AT_UID | AT_GID | AT_MODE)) {
3790 			(void) nfs4_access_purge_rp(rp);
3791 			if (rp->r_secattr != NULL) {
3792 				mutex_enter(&rp->r_statelock);
3793 				vsp = rp->r_secattr;
3794 				rp->r_secattr = NULL;
3795 				mutex_exit(&rp->r_statelock);
3796 				if (vsp != NULL)
3797 					nfs4_acl_free_cache(vsp);
3798 			}
3799 		}
3800 
3801 		/*
3802 		 * If res.array_len == numops, then everything succeeded,
3803 		 * except for possibly the final getattr.  If only the
3804 		 * last getattr failed, give up, and don't try recovery.
3805 		 */
3806 		if (res.array_len == numops) {
3807 			nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
3808 			    needrecov);
3809 			if (! e.error)
3810 				resp = &res;
3811 			break;
3812 		}
3813 
3814 		/*
3815 		 * if either rpc call failed or completely succeeded - done
3816 		 */
3817 		needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
3818 		if (e.error) {
3819 			PURGE_ATTRCACHE4(vp);
3820 			if (!needrecov) {
3821 				nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
3822 				    needrecov);
3823 				break;
3824 			}
3825 		}
3826 
3827 		/*
3828 		 * Do proper retry for OLD_STATEID outside of the normal
3829 		 * recovery framework.
3830 		 */
3831 		if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
3832 		    sid_types.cur_sid_type != SPEC_SID &&
3833 		    sid_types.cur_sid_type != NO_SID) {
3834 			nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
3835 				    needrecov);
3836 			nfs4_save_stateid(&stateid, &sid_types);
3837 			nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
3838 						opsetattr.obj_attributes);
3839 			if (verify_argop != -1) {
3840 				nfs4args_verify_free(&argop[verify_argop]);
3841 				verify_argop = -1;
3842 			}
3843 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3844 			goto recov_retry;
3845 		}
3846 
3847 		if (needrecov) {
3848 			bool_t abort;
3849 
3850 			abort = nfs4_start_recovery(&e,
3851 				    VTOMI4(vp), vp, NULL, NULL, NULL,
3852 				    OP_SETATTR, NULL);
3853 			nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
3854 				    needrecov);
3855 			/*
3856 			 * Do not retry if we failed with OLD_STATEID using
3857 			 * a special stateid.  This is done to avoid looping
3858 			 * with a broken server.
3859 			 */
3860 			if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
3861 			    (sid_types.cur_sid_type == SPEC_SID ||
3862 			    sid_types.cur_sid_type == NO_SID))
3863 				abort = TRUE;
3864 			if (!e.error) {
3865 				if (res.status == NFS4ERR_BADOWNER)
3866 					nfs4_log_badowner(VTOMI4(vp),
3867 					    OP_SETATTR);
3868 
3869 				e.error = geterrno4(res.status);
3870 				(void) xdr_free(xdr_COMPOUND4res_clnt,
3871 								(caddr_t)&res);
3872 			}
3873 			nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
3874 						opsetattr.obj_attributes);
3875 			if (verify_argop != -1) {
3876 				nfs4args_verify_free(&argop[verify_argop]);
3877 				verify_argop = -1;
3878 			}
3879 			if (abort == FALSE) {
3880 				/*
3881 				 * Need to retry all possible stateids in
3882 				 * case the recovery error wasn't stateid
3883 				 * related or the stateids have become
3884 				 * stale (server reboot).
3885 				 */
3886 				nfs4_init_stateid_types(&sid_types);
3887 				goto recov_retry;
3888 			}
3889 			return (e.error);
3890 		}
3891 
3892 		/*
3893 		 * Need to call nfs4_end_op before nfs4getattr to
3894 		 * avoid potential nfs4_start_op deadlock. See RFE
3895 		 * 4777612.  Calls to nfs4_invalidate_pages() and
3896 		 * nfs4_purge_stale_fh() might also generate over the
3897 		 * wire calls which my cause nfs4_start_op() deadlock.
3898 		 */
3899 		nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);
3900 
3901 		/*
3902 		 * Check to update lease.
3903 		 */
3904 		resp = &res;
3905 		if (res.status == NFS4_OK) {
3906 			break;
3907 		}
3908 
3909 		/*
3910 		 * Check if verify failed to see if try again
3911 		 */
3912 		if ((verify_argop == -1) || (res.array_len != 3)) {
3913 			/*
3914 			 * can't continue...
3915 			 */
3916 			if (res.status == NFS4ERR_BADOWNER)
3917 				nfs4_log_badowner(VTOMI4(vp), OP_SETATTR);
3918 
3919 			e.error = geterrno4(res.status);
3920 		} else {
3921 			/*
3922 			 * When the verify request fails, the client ctime is
3923 			 * not in sync with the server. This is the same as
3924 			 * the version 3 "not synchronized" error, and we
3925 			 * handle it in a similar manner (XXX do we need to???).
3926 			 * Use the ctime returned in the first getattr for
3927 			 * the input to the next verify.
3928 			 * If we couldn't get the attributes, then we give up
3929 			 * because we can't complete the operation as required.
3930 			 */
3931 			garp = &res.array[1].nfs_resop4_u.opgetattr.ga_res;
3932 		}
3933 		if (e.error) {
3934 			PURGE_ATTRCACHE4(vp);
3935 			nfs4_purge_stale_fh(e.error, vp, cr);
3936 		} else {
3937 			/*
3938 			 * retry with a new verify value
3939 			 */
3940 			ctime = garp->n4g_va.va_ctime;
3941 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3942 			resp = NULL;
3943 		}
3944 		if (!e.error) {
3945 			nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
3946 						opsetattr.obj_attributes);
3947 			if (verify_argop != -1) {
3948 				nfs4args_verify_free(&argop[verify_argop]);
3949 				verify_argop = -1;
3950 			}
3951 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3952 			goto do_again;
3953 		}
3954 	} while (!e.error);
3955 
3956 	if (e.error) {
3957 		/*
3958 		 * If we are here, rfs4call has an irrecoverable error - return
3959 		 */
3960 		nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
3961 						opsetattr.obj_attributes);
3962 		if (verify_argop != -1) {
3963 			nfs4args_verify_free(&argop[verify_argop]);
3964 			verify_argop = -1;
3965 		}
3966 		if (resp)
3967 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
3968 		return (e.error);
3969 	}
3970 
3971 
3972 
3973 	/*
3974 	 * If changing the size of the file, invalidate
3975 	 * any local cached data which is no longer part
3976 	 * of the file.  We also possibly invalidate the
3977 	 * last page in the file.  We could use
3978 	 * pvn_vpzero(), but this would mark the page as
3979 	 * modified and require it to be written back to
3980 	 * the server for no particularly good reason.
3981 	 * This way, if we access it, then we bring it
3982 	 * back in.  A read should be cheaper than a
3983 	 * write.
3984 	 */
3985 	if (mask & AT_SIZE) {
3986 		nfs4_invalidate_pages(vp, (vap->va_size & PAGEMASK), cr);
3987 	}
3988 
3989 	/* either no error or one of the postop getattr failed */
3990 
3991 	/*
3992 	 * XXX Perform a simplified version of wcc checking. Instead of
3993 	 * have another getattr to get pre-op, just purge cache if
3994 	 * any of the ops prior to and including the getattr failed.
3995 	 * If the getattr succeeded then update the attrcache accordingly.
3996 	 */
3997 
3998 	garp = NULL;
3999 	if (res.status == NFS4_OK) {
4000 		/*
4001 		 * Last getattr
4002 		 */
4003 		resop = &res.array[numops - 1];
4004 		garp = &resop->nfs_resop4_u.opgetattr.ga_res;
4005 	}
4006 	/*
4007 	 * In certain cases, nfs4_update_attrcache() will purge the attrcache,
4008 	 * rather than filling it.  See the function itself for details.
4009 	 */
4010 	e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr);
4011 	if (garp != NULL) {
4012 		if (garp->n4g_resbmap & FATTR4_ACL_MASK) {
4013 			nfs4_acl_fill_cache(rp, &garp->n4g_vsa);
4014 			vs_ace4_destroy(&garp->n4g_vsa);
4015 		} else {
4016 			if (vsap != NULL) {
4017 				/*
4018 				 * The ACL was supposed to be set and to be
4019 				 * returned in the last getattr of this
4020 				 * compound, but for some reason the getattr
4021 				 * result doesn't contain the ACL.  In this
4022 				 * case, purge the ACL cache.
4023 				 */
4024 				if (rp->r_secattr != NULL) {
4025 					mutex_enter(&rp->r_statelock);
4026 					vsp = rp->r_secattr;
4027 					rp->r_secattr = NULL;
4028 					mutex_exit(&rp->r_statelock);
4029 					if (vsp != NULL)
4030 						nfs4_acl_free_cache(vsp);
4031 				}
4032 			}
4033 		}
4034 	}
4035 
4036 	if (res.status == NFS4_OK && (mask & AT_SIZE)) {
4037 		/*
4038 		 * Set the size, rather than relying on getting it updated
4039 		 * via a GETATTR.  With delegations the client tries to
4040 		 * suppress GETATTR calls.
4041 		 */
4042 		mutex_enter(&rp->r_statelock);
4043 		rp->r_size = vap->va_size;
4044 		mutex_exit(&rp->r_statelock);
4045 	}
4046 
4047 	/*
4048 	 * Can free up request args and res
4049 	 */
4050 	nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
4051 						opsetattr.obj_attributes);
4052 	if (verify_argop != -1) {
4053 		nfs4args_verify_free(&argop[verify_argop]);
4054 		verify_argop = -1;
4055 	}
4056 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4057 
4058 	/*
4059 	 * Some servers will change the mode to clear the setuid
4060 	 * and setgid bits when changing the uid or gid.  The
4061 	 * client needs to compensate appropriately.
4062 	 */
4063 	if (mask & (AT_UID | AT_GID)) {
4064 		int terror, do_setattr;
4065 
4066 		do_setattr = 0;
4067 		va.va_mask = AT_MODE;
4068 		terror = nfs4getattr(vp, &va, cr);
4069 		if (!terror &&
4070 		    (((mask & AT_MODE) && va.va_mode != vap->va_mode) ||
4071 		    (!(mask & AT_MODE) && va.va_mode != omode))) {
4072 			va.va_mask = AT_MODE;
4073 			if (mask & AT_MODE) {
4074 				/*
4075 				 * We asked the mode to be changed and what
4076 				 * we just got from the server in getattr is
4077 				 * not what we wanted it to be, so set it now.
4078 				 */
4079 				va.va_mode = vap->va_mode;
4080 				do_setattr = 1;
4081 			} else {
4082 				/*
4083 				 * We did not ask the mode to be changed,
4084 				 * Check to see that the server just cleared
4085 				 * I_SUID and I_GUID from it. If not then
4086 				 * set mode to omode with UID/GID cleared.
4087 				 */
4088 				if (nfs4_compare_modes(va.va_mode, omode)) {
4089 					omode &= ~(S_ISUID|S_ISGID);
4090 					va.va_mode = omode;
4091 					do_setattr = 1;
4092 				}
4093 			}
4094 
4095 			if (do_setattr)
4096 				(void) nfs4setattr(vp, &va, 0, cr, NULL);
4097 		}
4098 	}
4099 
4100 	return (e.error);
4101 }
4102 
4103 /* ARGSUSED */
4104 static int
4105 nfs4_access(vnode_t *vp, int mode, int flags, cred_t *cr)
4106 {
4107 	COMPOUND4args_clnt args;
4108 	COMPOUND4res_clnt res;
4109 	int doqueue;
4110 	uint32_t acc, resacc, argacc;
4111 	rnode4_t *rp;
4112 	cred_t *cred, *ncr, *ncrfree = NULL;
4113 	nfs4_access_type_t cacc;
4114 	int num_ops;
4115 	nfs_argop4 argop[3];
4116 	nfs_resop4 *resop;
4117 	bool_t needrecov = FALSE, do_getattr;
4118 	nfs4_recov_state_t recov_state;
4119 	int rpc_error;
4120 	hrtime_t t;
4121 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
4122 	mntinfo4_t *mi = VTOMI4(vp);
4123 
4124 	if (nfs_zone() != mi->mi_zone)
4125 		return (EIO);
4126 
4127 	acc = 0;
4128 	if (mode & VREAD)
4129 		acc |= ACCESS4_READ;
4130 	if (mode & VWRITE) {
4131 		if ((vp->v_vfsp->vfs_flag & VFS_RDONLY) && !ISVDEV(vp->v_type))
4132 			return (EROFS);
4133 		if (vp->v_type == VDIR)
4134 			acc |= ACCESS4_DELETE;
4135 		acc |= ACCESS4_MODIFY | ACCESS4_EXTEND;
4136 	}
4137 	if (mode & VEXEC) {
4138 		if (vp->v_type == VDIR)
4139 			acc |= ACCESS4_LOOKUP;
4140 		else
4141 			acc |= ACCESS4_EXECUTE;
4142 	}
4143 
4144 	if (VTOR4(vp)->r_acache != NULL) {
4145 		e.error = nfs4_validate_caches(vp, cr);
4146 		if (e.error)
4147 			return (e.error);
4148 	}
4149 
4150 	rp = VTOR4(vp);
4151 	if (vp->v_type == VDIR) {
4152 		argacc = ACCESS4_READ | ACCESS4_DELETE | ACCESS4_MODIFY |
4153 			ACCESS4_EXTEND | ACCESS4_LOOKUP;
4154 	} else {
4155 		argacc = ACCESS4_READ | ACCESS4_MODIFY | ACCESS4_EXTEND |
4156 			ACCESS4_EXECUTE;
4157 	}
4158 	recov_state.rs_flags = 0;
4159 	recov_state.rs_num_retry_despite_err = 0;
4160 
4161 	cred = cr;
4162 	/*
4163 	 * ncr and ncrfree both initially
4164 	 * point to the memory area returned
4165 	 * by crnetadjust();
4166 	 * ncrfree not NULL when exiting means
4167 	 * that we need to release it
4168 	 */
4169 	ncr = crnetadjust(cred);
4170 	ncrfree = ncr;
4171 
4172 tryagain:
4173 	cacc = nfs4_access_check(rp, acc, cred);
4174 	if (cacc == NFS4_ACCESS_ALLOWED) {
4175 		if (ncrfree != NULL)
4176 			crfree(ncrfree);
4177 		return (0);
4178 	}
4179 	if (cacc == NFS4_ACCESS_DENIED) {
4180 		/*
4181 		 * If the cred can be adjusted, try again
4182 		 * with the new cred.
4183 		 */
4184 		if (ncr != NULL) {
4185 			cred = ncr;
4186 			ncr = NULL;
4187 			goto tryagain;
4188 		}
4189 		if (ncrfree != NULL)
4190 			crfree(ncrfree);
4191 		return (EACCES);
4192 	}
4193 
4194 recov_retry:
4195 	/*
4196 	 * Don't take with r_statev4_lock here. r_deleg_type could
4197 	 * change as soon as lock is released.  Since it is an int,
4198 	 * there is no atomicity issue.
4199 	 */
4200 	do_getattr = (rp->r_deleg_type == OPEN_DELEGATE_NONE);
4201 	num_ops = do_getattr ? 3 : 2;
4202 
4203 	args.ctag = TAG_ACCESS;
4204 
4205 	args.array_len = num_ops;
4206 	args.array = argop;
4207 
4208 	if (e.error = nfs4_start_fop(mi, vp, NULL, OH_ACCESS,
4209 					&recov_state, NULL)) {
4210 		if (ncrfree != NULL)
4211 			crfree(ncrfree);
4212 		return (e.error);
4213 	}
4214 
4215 	/* putfh target fh */
4216 	argop[0].argop = OP_CPUTFH;
4217 	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
4218 
4219 	/* access */
4220 	argop[1].argop = OP_ACCESS;
4221 	argop[1].nfs_argop4_u.opaccess.access = argacc;
4222 
4223 	/* getattr */
4224 	if (do_getattr) {
4225 		argop[2].argop = OP_GETATTR;
4226 		argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
4227 		argop[2].nfs_argop4_u.opgetattr.mi = mi;
4228 	}
4229 
4230 	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
4231 	    "nfs4_access: %s call, rp %s", needrecov ? "recov" : "first",
4232 	    rnode4info(VTOR4(vp))));
4233 
4234 	doqueue = 1;
4235 	t = gethrtime();
4236 	rfs4call(VTOMI4(vp), &args, &res, cred, &doqueue, 0, &e);
4237 	rpc_error = e.error;
4238 
4239 	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
4240 	if (needrecov) {
4241 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
4242 		    "nfs4_access: initiating recovery\n"));
4243 
4244 		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
4245 		    NULL, OP_ACCESS, NULL) == FALSE) {
4246 			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_ACCESS,
4247 			    &recov_state, needrecov);
4248 			if (!e.error)
4249 				(void) xdr_free(xdr_COMPOUND4res_clnt,
4250 						(caddr_t)&res);
4251 			goto recov_retry;
4252 		}
4253 	}
4254 	nfs4_end_fop(mi, vp, NULL, OH_ACCESS, &recov_state, needrecov);
4255 
4256 	if (e.error)
4257 		goto out;
4258 
4259 	if (res.status) {
4260 		e.error = geterrno4(res.status);
4261 		/*
4262 		 * This might generate over the wire calls throught
4263 		 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op()
4264 		 * here to avoid a deadlock.
4265 		 */
4266 		nfs4_purge_stale_fh(e.error, vp, cr);
4267 		goto out;
4268 	}
4269 	resop = &res.array[1];	/* access res */
4270 
4271 	resacc = resop->nfs_resop4_u.opaccess.access;
4272 
4273 	if (do_getattr) {
4274 		resop++;	/* getattr res */
4275 		nfs4_attr_cache(vp, &resop->nfs_resop4_u.opgetattr.ga_res,
4276 				t, cr, FALSE, NULL);
4277 	}
4278 
4279 	if (!e.error) {
4280 		nfs4_access_cache(rp, argacc, resacc, cred);
4281 		/*
4282 		 * we just cached results with cred; if cred is the
4283 		 * adjusted credentials from crnetadjust, we do not want
4284 		 * to release them before exiting: hence setting ncrfree
4285 		 * to NULL
4286 		 */
4287 		if (cred != cr)
4288 			ncrfree = NULL;
4289 		/* XXX check the supported bits too? */
4290 		if ((acc & resacc) != acc) {
4291 			/*
4292 			 * The following code implements the semantic
4293 			 * that a setuid root program has *at least* the
4294 			 * permissions of the user that is running the
4295 			 * program.  See rfs3call() for more portions
4296 			 * of the implementation of this functionality.
4297 			 */
4298 			/* XXX-LP */
4299 			if (ncr != NULL) {
4300 				(void) xdr_free(xdr_COMPOUND4res_clnt,
4301 						(caddr_t)&res);
4302 				cred = ncr;
4303 				ncr = NULL;
4304 				goto tryagain;
4305 			}
4306 			e.error = EACCES;
4307 		}
4308 	}
4309 
4310 out:
4311 	if (!rpc_error)
4312 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4313 
4314 	if (ncrfree != NULL)
4315 		crfree(ncrfree);
4316 
4317 	return (e.error);
4318 }
4319 
4320 static int
4321 nfs4_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr)
4322 {
4323 	COMPOUND4args_clnt args;
4324 	COMPOUND4res_clnt res;
4325 	int doqueue;
4326 	rnode4_t *rp;
4327 	nfs_argop4 argop[3];
4328 	nfs_resop4 *resop;
4329 	READLINK4res *lr_res;
4330 	nfs4_ga_res_t *garp;
4331 	uint_t len;
4332 	char *linkdata;
4333 	bool_t needrecov = FALSE;
4334 	nfs4_recov_state_t recov_state;
4335 	hrtime_t t;
4336 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
4337 
4338 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
4339 		return (EIO);
4340 	/*
4341 	 * Can't readlink anything other than a symbolic link.
4342 	 */
4343 	if (vp->v_type != VLNK)
4344 		return (EINVAL);
4345 
4346 	rp = VTOR4(vp);
4347 	if (nfs4_do_symlink_cache && rp->r_symlink.contents != NULL) {
4348 		e.error = nfs4_validate_caches(vp, cr);
4349 		if (e.error)
4350 			return (e.error);
4351 		mutex_enter(&rp->r_statelock);
4352 		if (rp->r_symlink.contents != NULL) {
4353 			e.error = uiomove(rp->r_symlink.contents,
4354 			    rp->r_symlink.len, UIO_READ, uiop);
4355 			mutex_exit(&rp->r_statelock);
4356 			return (e.error);
4357 		}
4358 		mutex_exit(&rp->r_statelock);
4359 	}
4360 	recov_state.rs_flags = 0;
4361 	recov_state.rs_num_retry_despite_err = 0;
4362 
4363 recov_retry:
4364 	args.array_len = 3;
4365 	args.array = argop;
4366 	args.ctag = TAG_READLINK;
4367 
4368 	e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state);
4369 	if (e.error) {
4370 		return (e.error);
4371 	}
4372 
4373 	/* 0. putfh symlink fh */
4374 	argop[0].argop = OP_CPUTFH;
4375 	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
4376 
4377 	/* 1. readlink */
4378 	argop[1].argop = OP_READLINK;
4379 
4380 	/* 2. getattr */
4381 	argop[2].argop = OP_GETATTR;
4382 	argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
4383 	argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
4384 
4385 	doqueue = 1;
4386 
4387 	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
4388 	    "nfs4_readlink: %s call, rp %s", needrecov ? "recov" : "first",
4389 	    rnode4info(VTOR4(vp))));
4390 
4391 	t = gethrtime();
4392 
4393 	rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e);
4394 
4395 	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
4396 	if (needrecov) {
4397 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
4398 		    "nfs4_readlink: initiating recovery\n"));
4399 
4400 		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
4401 		    NULL, OP_READLINK, NULL) == FALSE) {
4402 			if (!e.error)
4403 				(void) xdr_free(xdr_COMPOUND4res_clnt,
4404 								(caddr_t)&res);
4405 
4406 			nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
4407 			    needrecov);
4408 			goto recov_retry;
4409 		}
4410 	}
4411 
4412 	nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);
4413 
4414 	if (e.error)
4415 		return (e.error);
4416 
4417 	/*
4418 	 * There is an path in the code below which calls
4419 	 * nfs4_purge_stale_fh(), which may generate otw calls through
4420 	 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op()
4421 	 * here to avoid nfs4_start_op() deadlock.
4422 	 */
4423 
4424 	if (res.status && (res.array_len < args.array_len)) {
4425 		/*
4426 		 * either Putfh or Link failed
4427 		 */
4428 		e.error = geterrno4(res.status);
4429 		nfs4_purge_stale_fh(e.error, vp, cr);
4430 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4431 		return (e.error);
4432 	}
4433 
4434 	resop = &res.array[1];	/* readlink res */
4435 	lr_res = &resop->nfs_resop4_u.opreadlink;
4436 
4437 	/*
4438 	 * treat symlink names as data
4439 	 */
4440 	linkdata = utf8_to_str(&lr_res->link, &len, NULL);
4441 	if (linkdata != NULL) {
4442 		int uio_len = len - 1;
4443 		/* len includes null byte, which we won't uiomove */
4444 		e.error = uiomove(linkdata, uio_len, UIO_READ, uiop);
4445 		if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) {
4446 			mutex_enter(&rp->r_statelock);
4447 			if (rp->r_symlink.contents == NULL) {
4448 				rp->r_symlink.contents = linkdata;
4449 				rp->r_symlink.len = uio_len;
4450 				rp->r_symlink.size = len;
4451 				mutex_exit(&rp->r_statelock);
4452 			} else {
4453 				mutex_exit(&rp->r_statelock);
4454 				kmem_free(linkdata, len);
4455 			}
4456 		} else {
4457 			kmem_free(linkdata, len);
4458 		}
4459 	}
4460 	if (res.status == NFS4_OK) {
4461 		resop++;	/* getattr res */
4462 		garp = &resop->nfs_resop4_u.opgetattr.ga_res;
4463 	}
4464 	e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr);
4465 
4466 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4467 
4468 	/*
4469 	 * The over the wire error for attempting to readlink something
4470 	 * other than a symbolic link is ENXIO.  However, we need to
4471 	 * return EINVAL instead of ENXIO, so we map it here.
4472 	 */
4473 	return (e.error == ENXIO ? EINVAL : e.error);
4474 }
4475 
4476 /*
4477  * Flush local dirty pages to stable storage on the server.
4478  *
4479  * If FNODSYNC is specified, then there is nothing to do because
4480  * metadata changes are not cached on the client before being
4481  * sent to the server.
4482  */
4483 static int
4484 nfs4_fsync(vnode_t *vp, int syncflag, cred_t *cr)
4485 {
4486 	int error;
4487 
4488 	if ((syncflag & FNODSYNC) || IS_SWAPVP(vp))
4489 		return (0);
4490 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
4491 		return (EIO);
4492 	error = nfs4_putpage_commit(vp, (offset_t)0, 0, cr);
4493 	if (!error)
4494 		error = VTOR4(vp)->r_error;
4495 	return (error);
4496 }
4497 
4498 /*
4499  * Weirdness: if the file was removed or the target of a rename
4500  * operation while it was open, it got renamed instead.  Here we
4501  * remove the renamed file.
4502  */
4503 static void
4504 nfs4_inactive(vnode_t *vp, cred_t *cr)
4505 {
4506 	rnode4_t *rp;
4507 
4508 	ASSERT(vp != DNLC_NO_VNODE);
4509 
4510 	rp = VTOR4(vp);
4511 
4512 	if (IS_SHADOW(vp, rp)) {
4513 		sv_inactive(vp);
4514 		return;
4515 	}
4516 
4517 	/*
4518 	 * If this is coming from the wrong zone, we let someone in the right
4519 	 * zone take care of it asynchronously.  We can get here due to
4520 	 * VN_RELE() being called from pageout() or fsflush().  This call may
4521 	 * potentially turn into an expensive no-op if, for instance, v_count
4522 	 * gets incremented in the meantime, but it's still correct.
4523 	 */
4524 	if (nfs_zone() != VTOMI4(vp)->mi_zone) {
4525 		nfs4_async_inactive(vp, cr);
4526 		return;
4527 	}
4528 
4529 	/*
4530 	 * Some of the cleanup steps might require over-the-wire
4531 	 * operations.  Since VOP_INACTIVE can get called as a result of
4532 	 * other over-the-wire operations (e.g., an attribute cache update
4533 	 * can lead to a DNLC purge), doing those steps now would lead to a
4534 	 * nested call to the recovery framework, which can deadlock.  So
4535 	 * do any over-the-wire cleanups asynchronously, in a separate
4536 	 * thread.
4537 	 */
4538 
4539 	mutex_enter(&rp->r_os_lock);
4540 	mutex_enter(&rp->r_statelock);
4541 	mutex_enter(&rp->r_statev4_lock);
4542 
4543 	if (vp->v_type == VREG && list_head(&rp->r_open_streams) != NULL) {
4544 		mutex_exit(&rp->r_statev4_lock);
4545 		mutex_exit(&rp->r_statelock);
4546 		mutex_exit(&rp->r_os_lock);
4547 		nfs4_async_inactive(vp, cr);
4548 		return;
4549 	}
4550 
4551 	if (rp->r_deleg_type == OPEN_DELEGATE_READ ||
4552 	    rp->r_deleg_type == OPEN_DELEGATE_WRITE) {
4553 		mutex_exit(&rp->r_statev4_lock);
4554 		mutex_exit(&rp->r_statelock);
4555 		mutex_exit(&rp->r_os_lock);
4556 		nfs4_async_inactive(vp, cr);
4557 		return;
4558 	}
4559 
4560 	if (rp->r_unldvp != NULL) {
4561 		mutex_exit(&rp->r_statev4_lock);
4562 		mutex_exit(&rp->r_statelock);
4563 		mutex_exit(&rp->r_os_lock);
4564 		nfs4_async_inactive(vp, cr);
4565 		return;
4566 	}
4567 	mutex_exit(&rp->r_statev4_lock);
4568 	mutex_exit(&rp->r_statelock);
4569 	mutex_exit(&rp->r_os_lock);
4570 
4571 	rp4_addfree(rp, cr);
4572 }
4573 
4574 /*
4575  * nfs4_inactive_otw - nfs4_inactive, plus over-the-wire calls to free up
4576  * various bits of state.  The caller must not refer to vp after this call.
4577  */
4578 
4579 void
4580 nfs4_inactive_otw(vnode_t *vp, cred_t *cr)
4581 {
4582 	rnode4_t *rp = VTOR4(vp);
4583 	nfs4_recov_state_t recov_state;
4584 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
4585 	vnode_t *unldvp;
4586 	char *unlname;
4587 	cred_t *unlcred;
4588 	COMPOUND4args_clnt args;
4589 	COMPOUND4res_clnt res, *resp;
4590 	nfs_argop4 argop[2];
4591 	int doqueue;
4592 #ifdef DEBUG
4593 	char *name;
4594 #endif
4595 
4596 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
4597 	ASSERT(!IS_SHADOW(vp, rp));
4598 
4599 #ifdef DEBUG
4600 	name = fn_name(VTOSV(vp)->sv_name);
4601 	NFS4_DEBUG(nfs4_client_inactive_debug, (CE_NOTE, "nfs4_inactive_otw: "
4602 		"release vnode %s", name));
4603 	kmem_free(name, MAXNAMELEN);
4604 #endif
4605 
4606 	if (vp->v_type == VREG) {
4607 		bool_t recov_failed = FALSE;
4608 
4609 		e.error = nfs4close_all(vp, cr);
4610 		if (e.error) {
4611 			/* Check to see if recovery failed */
4612 			mutex_enter(&(VTOMI4(vp)->mi_lock));
4613 			if (VTOMI4(vp)->mi_flags & MI4_RECOV_FAIL)
4614 				recov_failed = TRUE;
4615 			mutex_exit(&(VTOMI4(vp)->mi_lock));
4616 			if (!recov_failed) {
4617 				mutex_enter(&rp->r_statelock);
4618 				if (rp->r_flags & R4RECOVERR)
4619 					recov_failed = TRUE;
4620 				mutex_exit(&rp->r_statelock);
4621 			}
4622 			if (recov_failed) {
4623 				NFS4_DEBUG(nfs4_client_recov_debug,
4624 					    (CE_NOTE, "nfs4_inactive_otw: "
4625 					    "close failed (recovery failure)"));
4626 			}
4627 		}
4628 	}
4629 
4630 redo:
4631 	if (rp->r_unldvp == NULL) {
4632 		rp4_addfree(rp, cr);
4633 		return;
4634 	}
4635 
4636 	/*
4637 	 * Save the vnode pointer for the directory where the
4638 	 * unlinked-open file got renamed, then set it to NULL
4639 	 * to prevent another thread from getting here before
4640 	 * we're done with the remove.  While we have the
4641 	 * statelock, make local copies of the pertinent rnode
4642 	 * fields.  If we weren't to do this in an atomic way, the
4643 	 * the unl* fields could become inconsistent with respect
4644 	 * to each other due to a race condition between this
4645 	 * code and nfs_remove().  See bug report 1034328.
4646 	 */
4647 	mutex_enter(&rp->r_statelock);
4648 	if (rp->r_unldvp == NULL) {
4649 		mutex_exit(&rp->r_statelock);
4650 		rp4_addfree(rp, cr);
4651 		return;
4652 	}
4653 
4654 	unldvp = rp->r_unldvp;
4655 	rp->r_unldvp = NULL;
4656 	unlname = rp->r_unlname;
4657 	rp->r_unlname = NULL;
4658 	unlcred = rp->r_unlcred;
4659 	rp->r_unlcred = NULL;
4660 	mutex_exit(&rp->r_statelock);
4661 
4662 	/*
4663 	 * If there are any dirty pages left, then flush
4664 	 * them.  This is unfortunate because they just
4665 	 * may get thrown away during the remove operation,
4666 	 * but we have to do this for correctness.
4667 	 */
4668 	if (nfs4_has_pages(vp) &&
4669 			    ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) {
4670 		ASSERT(vp->v_type != VCHR);
4671 		e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr);
4672 		if (e.error) {
4673 			mutex_enter(&rp->r_statelock);
4674 			if (!rp->r_error)
4675 				rp->r_error = e.error;
4676 			mutex_exit(&rp->r_statelock);
4677 		}
4678 	}
4679 
4680 	recov_state.rs_flags = 0;
4681 	recov_state.rs_num_retry_despite_err = 0;
4682 recov_retry_remove:
4683 	/*
4684 	 * Do the remove operation on the renamed file
4685 	 */
4686 	args.ctag = TAG_INACTIVE;
4687 
4688 	/*
4689 	 * Remove ops: putfh dir; remove
4690 	 */
4691 	args.array_len = 2;
4692 	args.array = argop;
4693 
4694 	e.error = nfs4_start_op(VTOMI4(unldvp), unldvp, NULL, &recov_state);
4695 	if (e.error) {
4696 		kmem_free(unlname, MAXNAMELEN);
4697 		crfree(unlcred);
4698 		VN_RELE(unldvp);
4699 		/*
4700 		 * Try again; this time around r_unldvp will be NULL, so we'll
4701 		 * just call rp4_addfree() and return.
4702 		 */
4703 		goto redo;
4704 	}
4705 
4706 	/* putfh directory */
4707 	argop[0].argop = OP_CPUTFH;
4708 	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(unldvp)->r_fh;
4709 
4710 	/* remove */
4711 	argop[1].argop = OP_CREMOVE;
4712 	argop[1].nfs_argop4_u.opcremove.ctarget = unlname;
4713 
4714 	doqueue = 1;
4715 	resp = &res;
4716 
4717 #if 0 /* notyet */
4718 	/*
4719 	 * Can't do this yet.  We may be being called from
4720 	 * dnlc_purge_XXX while that routine is holding a
4721 	 * mutex lock to the nc_rele list.  The calls to
4722 	 * nfs3_cache_wcc_data may result in calls to
4723 	 * dnlc_purge_XXX.  This will result in a deadlock.
4724 	 */
4725 	rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e);
4726 	if (e.error) {
4727 		PURGE_ATTRCACHE4(unldvp);
4728 		resp = NULL;
4729 	} else if (res.status) {
4730 		e.error = geterrno4(res.status);
4731 		PURGE_ATTRCACHE4(unldvp);
4732 		/*
4733 		 * This code is inactive right now
4734 		 * but if made active there should
4735 		 * be a nfs4_end_op() call before
4736 		 * nfs4_purge_stale_fh to avoid start_op()
4737 		 * deadlock. See BugId: 4948726
4738 		 */
4739 		nfs4_purge_stale_fh(error, unldvp, cr);
4740 	} else {
4741 		nfs_resop4 *resop;
4742 		REMOVE4res *rm_res;
4743 
4744 		resop = &res.array[1];
4745 		rm_res = &resop->nfs_resop4_u.opremove;
4746 		/*
4747 		 * Update directory cache attribute,
4748 		 * readdir and dnlc caches.
4749 		 */
4750 		nfs4_update_dircaches(&rm_res->cinfo, unldvp, NULL, NULL, NULL);
4751 	}
4752 #else
4753 	rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e);
4754 
4755 	PURGE_ATTRCACHE4(unldvp);
4756 #endif
4757 
4758 	if (nfs4_needs_recovery(&e, FALSE, unldvp->v_vfsp)) {
4759 		if (nfs4_start_recovery(&e, VTOMI4(unldvp), unldvp, NULL,
4760 		    NULL, NULL, OP_REMOVE, NULL) == FALSE) {
4761 			if (!e.error)
4762 				(void) xdr_free(xdr_COMPOUND4res_clnt,
4763 								(caddr_t)&res);
4764 			nfs4_end_op(VTOMI4(unldvp), unldvp, NULL,
4765 							&recov_state, TRUE);
4766 			goto recov_retry_remove;
4767 		}
4768 	}
4769 	nfs4_end_op(VTOMI4(unldvp), unldvp, NULL, &recov_state, FALSE);
4770 
4771 	/*
4772 	 * Release stuff held for the remove
4773 	 */
4774 	VN_RELE(unldvp);
4775 	if (!e.error && resp)
4776 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
4777 
4778 	kmem_free(unlname, MAXNAMELEN);
4779 	crfree(unlcred);
4780 	goto redo;
4781 }
4782 
4783 /*
4784  * Remote file system operations having to do with directory manipulation.
4785  */
4786 /* ARGSUSED3 */
4787 static int
4788 nfs4_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
4789 	int flags, vnode_t *rdir, cred_t *cr)
4790 {
4791 	int error;
4792 	vnode_t *vp, *avp = NULL;
4793 	rnode4_t *drp;
4794 
4795 	*vpp = NULL;
4796 	if (nfs_zone() != VTOMI4(dvp)->mi_zone)
4797 		return (EPERM);
4798 	/*
4799 	 * if LOOKUP_XATTR, must replace dvp (object) with
4800 	 * object's attrdir before continuing with lookup
4801 	 */
4802 	if (flags & LOOKUP_XATTR) {
4803 		error = nfs4lookup_xattr(dvp, nm, &avp, flags, cr);
4804 		if (error)
4805 			return (error);
4806 
4807 		dvp = avp;
4808 
4809 		/*
4810 		 * If lookup is for "", just return dvp now.  The attrdir
4811 		 * has already been activated (from nfs4lookup_xattr), and
4812 		 * the caller will RELE the original dvp -- not
4813 		 * the attrdir.  So, set vpp and return.
4814 		 * Currently, when the LOOKUP_XATTR flag is
4815 		 * passed to VOP_LOOKUP, the name is always empty, and
4816 		 * shortcircuiting here avoids 3 unneeded lock/unlock
4817 		 * pairs.
4818 		 *
4819 		 * If a non-empty name was provided, then it is the
4820 		 * attribute name, and it will be looked up below.
4821 		 */
4822 		if (*nm == '\0') {
4823 			*vpp = dvp;
4824 			return (0);
4825 		}
4826 
4827 		/*
4828 		 * The vfs layer never sends a name when asking for the
4829 		 * attrdir, so we should never get here (unless of course
4830 		 * name is passed at some time in future -- at which time
4831 		 * we'll blow up here).
4832 		 */
4833 		ASSERT(0);
4834 	}
4835 
4836 	drp = VTOR4(dvp);
4837 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
4838 		return (EINTR);
4839 
4840 	error = nfs4lookup(dvp, nm, vpp, cr, 0);
4841 	nfs_rw_exit(&drp->r_rwlock);
4842 
4843 	/*
4844 	 * If vnode is a device, create special vnode.
4845 	 */
4846 	if (!error && ISVDEV((*vpp)->v_type)) {
4847 		vp = *vpp;
4848 		*vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
4849 		VN_RELE(vp);
4850 	}
4851 
4852 	return (error);
4853 }
4854 
4855 /* ARGSUSED */
4856 static int
4857 nfs4lookup_xattr(vnode_t *dvp, char *nm, vnode_t **vpp, int flags, cred_t *cr)
4858 {
4859 	int error;
4860 	rnode4_t *drp;
4861 	int cflag = ((flags & CREATE_XATTR_DIR) != 0);
4862 	mntinfo4_t *mi;
4863 
4864 	mi = VTOMI4(dvp);
4865 	if (!(mi->mi_vfsp->vfs_flag & VFS_XATTR))
4866 		return (EINVAL);
4867 
4868 	drp = VTOR4(dvp);
4869 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
4870 		return (EINTR);
4871 
4872 	mutex_enter(&drp->r_statelock);
4873 	/*
4874 	 * If the server doesn't support xattrs just return EINVAL
4875 	 */
4876 	if (drp->r_xattr_dir == NFS4_XATTR_DIR_NOTSUPP) {
4877 		mutex_exit(&drp->r_statelock);
4878 		nfs_rw_exit(&drp->r_rwlock);
4879 		return (EINVAL);
4880 	}
4881 
4882 	/*
4883 	 * If there is a cached xattr directory entry,
4884 	 * use it as long as the attributes are valid. If the
4885 	 * attributes are not valid, take the simple approach and
4886 	 * free the cached value and re-fetch a new value.
4887 	 *
4888 	 * We don't negative entry cache for now, if we did we
4889 	 * would need to check if the file has changed on every
4890 	 * lookup. But xattrs don't exist very often and failing
4891 	 * an openattr is not much more expensive than and NVERIFY or GETATTR
4892 	 * so do an openattr over the wire for now.
4893 	 */
4894 	if (drp->r_xattr_dir != NULL) {
4895 		if (ATTRCACHE4_VALID(dvp)) {
4896 			VN_HOLD(drp->r_xattr_dir);
4897 			*vpp = drp->r_xattr_dir;
4898 			mutex_exit(&drp->r_statelock);
4899 			nfs_rw_exit(&drp->r_rwlock);
4900 			return (0);
4901 		}
4902 		VN_RELE(drp->r_xattr_dir);
4903 		drp->r_xattr_dir = NULL;
4904 	}
4905 	mutex_exit(&drp->r_statelock);
4906 
4907 	error = nfs4openattr(dvp, vpp, cflag, cr);
4908 
4909 	nfs_rw_exit(&drp->r_rwlock);
4910 
4911 	return (error);
4912 }
4913 
4914 static int
4915 nfs4lookup(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr, int skipdnlc)
4916 {
4917 	int error;
4918 	rnode4_t *drp;
4919 
4920 	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
4921 
4922 	/*
4923 	 * If lookup is for "", just return dvp.  Don't need
4924 	 * to send it over the wire, look it up in the dnlc,
4925 	 * or perform any access checks.
4926 	 */
4927 	if (*nm == '\0') {
4928 		VN_HOLD(dvp);
4929 		*vpp = dvp;
4930 		return (0);
4931 	}
4932 
4933 	/*
4934 	 * Can't do lookups in non-directories.
4935 	 */
4936 	if (dvp->v_type != VDIR)
4937 		return (ENOTDIR);
4938 
4939 	/*
4940 	 * If lookup is for ".", just return dvp.  Don't need
4941 	 * to send it over the wire or look it up in the dnlc,
4942 	 * just need to check access.
4943 	 */
4944 	if (nm[0] == '.' && nm[1] == '\0') {
4945 		error = nfs4_access(dvp, VEXEC, 0, cr);
4946 		if (error)
4947 			return (error);
4948 		VN_HOLD(dvp);
4949 		*vpp = dvp;
4950 		return (0);
4951 	}
4952 
4953 	drp = VTOR4(dvp);
4954 	if (!(drp->r_flags & R4LOOKUP)) {
4955 		mutex_enter(&drp->r_statelock);
4956 		drp->r_flags |= R4LOOKUP;
4957 		mutex_exit(&drp->r_statelock);
4958 	}
4959 
4960 	*vpp = NULL;
4961 	/*
4962 	 * Lookup this name in the DNLC.  If there is no entry
4963 	 * lookup over the wire.
4964 	 */
4965 	if (!skipdnlc)
4966 		*vpp = dnlc_lookup(dvp, nm);
4967 	if (*vpp == NULL) {
4968 		/*
4969 		 * We need to go over the wire to lookup the name.
4970 		 */
4971 		return (nfs4lookupnew_otw(dvp, nm, vpp, cr));
4972 	}
4973 
4974 	/*
4975 	 * We hit on the dnlc
4976 	 */
4977 	if (*vpp != DNLC_NO_VNODE ||
4978 			    (dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
4979 		/*
4980 		 * But our attrs may not be valid.
4981 		 */
4982 		if (ATTRCACHE4_VALID(dvp)) {
4983 			error = nfs4_waitfor_purge_complete(dvp);
4984 			if (error) {
4985 				VN_RELE(*vpp);
4986 				*vpp = NULL;
4987 				return (error);
4988 			}
4989 
4990 			/*
4991 			 * If after the purge completes, check to make sure
4992 			 * our attrs are still valid.
4993 			 */
4994 			if (ATTRCACHE4_VALID(dvp)) {
4995 				/*
4996 				 * If we waited for a purge we may have
4997 				 * lost our vnode so look it up again.
4998 				 */
4999 				VN_RELE(*vpp);
5000 				*vpp = dnlc_lookup(dvp, nm);
5001 				if (*vpp == NULL)
5002 					return (nfs4lookupnew_otw(dvp,
5003 						nm, vpp, cr));
5004 
5005 				/*
5006 				 * The access cache should almost always hit
5007 				 */
5008 				error = nfs4_access(dvp, VEXEC, 0, cr);
5009 
5010 				if (error) {
5011 					VN_RELE(*vpp);
5012 					*vpp = NULL;
5013 					return (error);
5014 				}
5015 				if (*vpp == DNLC_NO_VNODE) {
5016 					VN_RELE(*vpp);
5017 					*vpp = NULL;
5018 					return (ENOENT);
5019 				}
5020 				return (0);
5021 			}
5022 		}
5023 	}
5024 
5025 	ASSERT(*vpp != NULL);
5026 
5027 	/*
5028 	 * We may have gotten here we have one of the following cases:
5029 	 *	1) vpp != DNLC_NO_VNODE, our attrs have timed out so we
5030 	 *		need to validate them.
5031 	 *	2) vpp == DNLC_NO_VNODE, a negative entry that we always
5032 	 *		must validate.
5033 	 *
5034 	 * Go to the server and check if the directory has changed, if
5035 	 * it hasn't we are done and can use the dnlc entry.
5036 	 */
5037 	return (nfs4lookupvalidate_otw(dvp, nm, vpp, cr));
5038 }
5039 
5040 /*
5041  * Go to the server and check if the directory has changed, if
5042  * it hasn't we are done and can use the dnlc entry.  If it
5043  * has changed we get a new copy of its attributes and check
5044  * the access for VEXEC, then relookup the filename and
5045  * get its filehandle and attributes.
5046  *
5047  * PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR
5048  *	if the NVERIFY failed we must
5049  *		purge the caches
5050  *		cache new attributes (will set r_time_attr_inval)
5051  *		cache new access
5052  *		recheck VEXEC access
5053  *		add name to dnlc, possibly negative
5054  *		if LOOKUP succeeded
5055  *			cache new attributes
5056  *	else
5057  *		set a new r_time_attr_inval for dvp
5058  *		check to make sure we have access
5059  *
5060  * The vpp returned is the vnode passed in if the directory is valid,
5061  * a new vnode if successful lookup, or NULL on error.
5062  */
5063 static int
5064 nfs4lookupvalidate_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
5065 {
5066 	COMPOUND4args_clnt args;
5067 	COMPOUND4res_clnt res;
5068 	fattr4 *ver_fattr;
5069 	fattr4_change dchange;
5070 	int32_t *ptr;
5071 	int argoplist_size  = 7 * sizeof (nfs_argop4);
5072 	nfs_argop4 *argop;
5073 	int doqueue;
5074 	mntinfo4_t *mi;
5075 	nfs4_recov_state_t recov_state;
5076 	hrtime_t t;
5077 	int isdotdot;
5078 	vnode_t *nvp;
5079 	nfs_fh4 *fhp;
5080 	nfs4_sharedfh_t *sfhp;
5081 	nfs4_access_type_t cacc;
5082 	rnode4_t *nrp;
5083 	rnode4_t *drp = VTOR4(dvp);
5084 	nfs4_ga_res_t *garp = NULL;
5085 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
5086 
5087 	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
5088 	ASSERT(nm != NULL);
5089 	ASSERT(nm[0] != '\0');
5090 	ASSERT(dvp->v_type == VDIR);
5091 	ASSERT(nm[0] != '.' || nm[1] != '\0');
5092 	ASSERT(*vpp != NULL);
5093 
5094 	if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') {
5095 		isdotdot = 1;
5096 		args.ctag = TAG_LOOKUP_VPARENT;
5097 	} else {
5098 		/*
5099 		 * Do not allow crossing of server mount points.  The
5100 		 * only visible entries in a SRVSTUB dir are . and ..
5101 		 * This code handles the non-.. case.  We can't even get
5102 		 * this far if looking up ".".
5103 		 */
5104 		if (VTOR4(dvp)->r_flags & R4SRVSTUB) {
5105 			VN_RELE(*vpp);
5106 			*vpp = NULL;
5107 			return (ENOENT);
5108 		}
5109 		isdotdot = 0;
5110 		args.ctag = TAG_LOOKUP_VALID;
5111 	}
5112 
5113 	mi = VTOMI4(dvp);
5114 	recov_state.rs_flags = 0;
5115 	recov_state.rs_num_retry_despite_err = 0;
5116 
5117 	nvp = NULL;
5118 
5119 	/* Save the original mount point security information */
5120 	(void) save_mnt_secinfo(mi->mi_curr_serv);
5121 
5122 recov_retry:
5123 	e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP,
5124 			    &recov_state, NULL);
5125 	if (e.error) {
5126 		(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5127 		VN_RELE(*vpp);
5128 		*vpp = NULL;
5129 		return (e.error);
5130 	}
5131 
5132 	argop = kmem_alloc(argoplist_size, KM_SLEEP);
5133 
5134 	/* PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR */
5135 	args.array_len = 7;
5136 	args.array = argop;
5137 
5138 	/* 0. putfh file */
5139 	argop[0].argop = OP_CPUTFH;
5140 	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh;
5141 
5142 	/* 1. nverify the change info */
5143 	argop[1].argop = OP_NVERIFY;
5144 	ver_fattr = &argop[1].nfs_argop4_u.opnverify.obj_attributes;
5145 	ver_fattr->attrmask = FATTR4_CHANGE_MASK;
5146 	ver_fattr->attrlist4 = (char *)&dchange;
5147 	ptr = (int32_t *)&dchange;
5148 	IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change);
5149 	ver_fattr->attrlist4_len = sizeof (fattr4_change);
5150 
5151 	/* 2. getattr directory */
5152 	argop[2].argop = OP_GETATTR;
5153 	argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
5154 	argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
5155 
5156 	/* 3. access directory */
5157 	argop[3].argop = OP_ACCESS;
5158 	argop[3].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE |
5159 			ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP;
5160 
5161 	/* 4. lookup name */
5162 	if (isdotdot) {
5163 		argop[4].argop = OP_LOOKUPP;
5164 	} else {
5165 		argop[4].argop = OP_CLOOKUP;
5166 		argop[4].nfs_argop4_u.opclookup.cname = nm;
5167 	}
5168 
5169 	/* 5. resulting file handle */
5170 	argop[5].argop = OP_GETFH;
5171 
5172 	/* 6. resulting file attributes */
5173 	argop[6].argop = OP_GETATTR;
5174 	argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
5175 	argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
5176 
5177 	doqueue = 1;
5178 	t = gethrtime();
5179 
5180 	rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e);
5181 
5182 	if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) {
5183 		/*
5184 		 * For WRONGSEC of a non-dotdot case, send secinfo directly
5185 		 * from this thread, do not go thru the recovery thread since
5186 		 * we need the nm information.
5187 		 *
5188 		 * Not doing dotdot case because there is no specification
5189 		 * for (PUTFH, SECINFO "..") yet.
5190 		 */
5191 		if (!isdotdot && res.status == NFS4ERR_WRONGSEC) {
5192 			if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr))) {
5193 				nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5194 					&recov_state, FALSE);
5195 			} else {
5196 				nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5197 					&recov_state, TRUE);
5198 			}
5199 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5200 			kmem_free(argop, argoplist_size);
5201 			if (!e.error)
5202 				goto recov_retry;
5203 			(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5204 			VN_RELE(*vpp);
5205 			*vpp = NULL;
5206 			return (e.error);
5207 		}
5208 
5209 		if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL,
5210 		    OP_LOOKUP, NULL) == FALSE) {
5211 			nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5212 				&recov_state, TRUE);
5213 
5214 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5215 			kmem_free(argop, argoplist_size);
5216 			goto recov_retry;
5217 		}
5218 	}
5219 
5220 	nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE);
5221 
5222 	if (e.error || res.array_len == 0) {
5223 		/*
5224 		 * If e.error isn't set, then reply has no ops (or we couldn't
5225 		 * be here).  The only legal way to reply without an op array
5226 		 * is via NFS4ERR_MINOR_VERS_MISMATCH.  An ops array should
5227 		 * be in the reply for all other status values.
5228 		 *
5229 		 * For valid replies without an ops array, return ENOTSUP
5230 		 * (geterrno4 xlation of VERS_MISMATCH).  For illegal replies,
5231 		 * return EIO -- don't trust status.
5232 		 */
5233 		if (e.error == 0)
5234 			e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ?
5235 				ENOTSUP : EIO;
5236 		VN_RELE(*vpp);
5237 		*vpp = NULL;
5238 		kmem_free(argop, argoplist_size);
5239 		(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5240 		return (e.error);
5241 	}
5242 
5243 	if (res.status != NFS4ERR_SAME) {
5244 		e.error = geterrno4(res.status);
5245 
5246 		/*
5247 		 * The NVERIFY "failed" so the directory has changed
5248 		 * First make sure PUTFH succeeded and NVERIFY "failed"
5249 		 * cleanly.
5250 		 */
5251 		if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) ||
5252 		    (res.array[1].nfs_resop4_u.opnverify.status != NFS4_OK)) {
5253 			nfs4_purge_stale_fh(e.error, dvp, cr);
5254 			VN_RELE(*vpp);
5255 			*vpp = NULL;
5256 			goto exit;
5257 		}
5258 
5259 		/*
5260 		 * We know the NVERIFY "failed" so we must:
5261 		 *	purge the caches (access and indirectly dnlc if needed)
5262 		 */
5263 		nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE);
5264 
5265 		if (res.array[2].nfs_resop4_u.opgetattr.status != NFS4_OK) {
5266 			nfs4_purge_stale_fh(e.error, dvp, cr);
5267 			VN_RELE(*vpp);
5268 			*vpp = NULL;
5269 			goto exit;
5270 		}
5271 
5272 		/*
5273 		 * Install new cached attributes for the directory
5274 		 */
5275 		nfs4_attr_cache(dvp,
5276 				&res.array[2].nfs_resop4_u.opgetattr.ga_res,
5277 				t, cr, FALSE, NULL);
5278 
5279 		if (res.array[3].nfs_resop4_u.opaccess.status != NFS4_OK) {
5280 			nfs4_purge_stale_fh(e.error, dvp, cr);
5281 			VN_RELE(*vpp);
5282 			*vpp = NULL;
5283 			e.error = geterrno4(res.status);
5284 			goto exit;
5285 		}
5286 
5287 		/*
5288 		 * Now we know the directory is valid,
5289 		 * cache new directory access
5290 		 */
5291 		nfs4_access_cache(drp,
5292 			args.array[3].nfs_argop4_u.opaccess.access,
5293 			res.array[3].nfs_resop4_u.opaccess.access, cr);
5294 
5295 		/*
5296 		 * recheck VEXEC access
5297 		 */
5298 		cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr);
5299 		if (cacc != NFS4_ACCESS_ALLOWED) {
5300 			/*
5301 			 * Directory permissions might have been revoked
5302 			 */
5303 			if (cacc == NFS4_ACCESS_DENIED) {
5304 				e.error = EACCES;
5305 				VN_RELE(*vpp);
5306 				*vpp = NULL;
5307 				goto exit;
5308 			}
5309 
5310 			/*
5311 			 * Somehow we must not have asked for enough
5312 			 * so try a singleton ACCESS, should never happen.
5313 			 */
5314 			e.error = nfs4_access(dvp, VEXEC, 0, cr);
5315 			if (e.error) {
5316 				VN_RELE(*vpp);
5317 				*vpp = NULL;
5318 				goto exit;
5319 			}
5320 		}
5321 
5322 		e.error = geterrno4(res.status);
5323 		if (res.array[4].nfs_resop4_u.oplookup.status != NFS4_OK) {
5324 			/*
5325 			 * The lookup failed, probably no entry
5326 			 */
5327 			if (e.error == ENOENT && nfs4_lookup_neg_cache) {
5328 				dnlc_update(dvp, nm, DNLC_NO_VNODE);
5329 			} else {
5330 				/*
5331 				 * Might be some other error, so remove
5332 				 * the dnlc entry to make sure we start all
5333 				 * over again, next time.
5334 				 */
5335 				dnlc_remove(dvp, nm);
5336 			}
5337 			VN_RELE(*vpp);
5338 			*vpp = NULL;
5339 			goto exit;
5340 		}
5341 
5342 		if (res.array[5].nfs_resop4_u.opgetfh.status != NFS4_OK) {
5343 			/*
5344 			 * The file exists but we can't get its fh for
5345 			 * some unknown reason.  Remove it from the dnlc
5346 			 * and error out to be safe.
5347 			 */
5348 			dnlc_remove(dvp, nm);
5349 			VN_RELE(*vpp);
5350 			*vpp = NULL;
5351 			goto exit;
5352 		}
5353 		fhp = &res.array[5].nfs_resop4_u.opgetfh.object;
5354 		if (fhp->nfs_fh4_len == 0) {
5355 			/*
5356 			 * The file exists but a bogus fh
5357 			 * some unknown reason.  Remove it from the dnlc
5358 			 * and error out to be safe.
5359 			 */
5360 			e.error = ENOENT;
5361 			dnlc_remove(dvp, nm);
5362 			VN_RELE(*vpp);
5363 			*vpp = NULL;
5364 			goto exit;
5365 		}
5366 		sfhp = sfh4_get(fhp, mi);
5367 
5368 		if (res.array[6].nfs_resop4_u.opgetattr.status == NFS4_OK)
5369 			garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res;
5370 
5371 		/*
5372 		 * Make the new rnode
5373 		 */
5374 		if (isdotdot) {
5375 			e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1);
5376 			if (e.error) {
5377 				sfh4_rele(&sfhp);
5378 				VN_RELE(*vpp);
5379 				*vpp = NULL;
5380 				goto exit;
5381 			}
5382 			/*
5383 			 * XXX if nfs4_make_dotdot uses an existing rnode
5384 			 * XXX it doesn't update the attributes.
5385 			 * XXX for now just save them again to save an OTW
5386 			 */
5387 			nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL);
5388 		} else {
5389 			nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr,
5390 				dvp, fn_get(VTOSV(dvp)->sv_name, nm));
5391 			/*
5392 			 * If v_type == VNON, then garp was NULL because
5393 			 * the last op in the compound failed and makenfs4node
5394 			 * could not find the vnode for sfhp. It created
5395 			 * a new vnode, so we have nothing to purge here.
5396 			 */
5397 			if (nvp->v_type == VNON) {
5398 				vattr_t vattr;
5399 
5400 				vattr.va_mask = AT_TYPE;
5401 				/*
5402 				 * N.B. We've already called nfs4_end_fop above.
5403 				 */
5404 				e.error = nfs4getattr(nvp, &vattr, cr);
5405 				if (e.error) {
5406 					sfh4_rele(&sfhp);
5407 					VN_RELE(*vpp);
5408 					*vpp = NULL;
5409 					VN_RELE(nvp);
5410 					goto exit;
5411 				}
5412 				nvp->v_type = vattr.va_type;
5413 			}
5414 		}
5415 		sfh4_rele(&sfhp);
5416 
5417 		nrp = VTOR4(nvp);
5418 		mutex_enter(&nrp->r_statev4_lock);
5419 		if (!nrp->created_v4) {
5420 			mutex_exit(&nrp->r_statev4_lock);
5421 			dnlc_update(dvp, nm, nvp);
5422 		} else
5423 			mutex_exit(&nrp->r_statev4_lock);
5424 
5425 		VN_RELE(*vpp);
5426 		*vpp = nvp;
5427 	} else {
5428 		hrtime_t now;
5429 		hrtime_t delta = 0;
5430 
5431 		e.error = 0;
5432 
5433 		/*
5434 		 * Because the NVERIFY "succeeded" we know that the
5435 		 * directory attributes are still valid
5436 		 * so update r_time_attr_inval
5437 		 */
5438 		now = gethrtime();
5439 		mutex_enter(&drp->r_statelock);
5440 		if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) {
5441 			delta = now - drp->r_time_attr_saved;
5442 			if (delta < mi->mi_acdirmin)
5443 				delta = mi->mi_acdirmin;
5444 			else if (delta > mi->mi_acdirmax)
5445 				delta = mi->mi_acdirmax;
5446 		}
5447 		drp->r_time_attr_inval = now + delta;
5448 		mutex_exit(&drp->r_statelock);
5449 		dnlc_update(dvp, nm, *vpp);
5450 
5451 		/*
5452 		 * Even though we have a valid directory attr cache
5453 		 * and dnlc entry, we may not have access.
5454 		 * This should almost always hit the cache.
5455 		 */
5456 		e.error = nfs4_access(dvp, VEXEC, 0, cr);
5457 		if (e.error) {
5458 			VN_RELE(*vpp);
5459 			*vpp = NULL;
5460 		}
5461 
5462 		if (*vpp == DNLC_NO_VNODE) {
5463 			VN_RELE(*vpp);
5464 			*vpp = NULL;
5465 			e.error = ENOENT;
5466 		}
5467 	}
5468 
5469 exit:
5470 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5471 	kmem_free(argop, argoplist_size);
5472 	(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5473 	return (e.error);
5474 }
5475 
5476 /*
5477  * We need to go over the wire to lookup the name, but
5478  * while we are there verify the directory has not
5479  * changed but if it has, get new attributes and check access
5480  *
5481  * PUTFH dfh SAVEFH LOOKUP nm GETFH GETATTR RESTOREFH
5482  *					NVERIFY GETATTR ACCESS
5483  *
5484  * With the results:
5485  *	if the NVERIFY failed we must purge the caches, add new attributes,
5486  *		and cache new access.
5487  *	set a new r_time_attr_inval
5488  *	add name to dnlc, possibly negative
5489  *	if LOOKUP succeeded
5490  *		cache new attributes
5491  */
5492 static int
5493 nfs4lookupnew_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
5494 {
5495 	COMPOUND4args_clnt args;
5496 	COMPOUND4res_clnt res;
5497 	fattr4 *ver_fattr;
5498 	fattr4_change dchange;
5499 	int32_t *ptr;
5500 	nfs4_ga_res_t *garp = NULL;
5501 	int argoplist_size  = 9 * sizeof (nfs_argop4);
5502 	nfs_argop4 *argop;
5503 	int doqueue;
5504 	mntinfo4_t *mi;
5505 	nfs4_recov_state_t recov_state;
5506 	hrtime_t t;
5507 	int isdotdot;
5508 	vnode_t *nvp;
5509 	nfs_fh4 *fhp;
5510 	nfs4_sharedfh_t *sfhp;
5511 	nfs4_access_type_t cacc;
5512 	rnode4_t *nrp;
5513 	rnode4_t *drp = VTOR4(dvp);
5514 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
5515 
5516 	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
5517 	ASSERT(nm != NULL);
5518 	ASSERT(nm[0] != '\0');
5519 	ASSERT(dvp->v_type == VDIR);
5520 	ASSERT(nm[0] != '.' || nm[1] != '\0');
5521 	ASSERT(*vpp == NULL);
5522 
5523 	if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') {
5524 		isdotdot = 1;
5525 		args.ctag = TAG_LOOKUP_PARENT;
5526 	} else {
5527 		/*
5528 		 * Do not allow crossing of server mount points.  The
5529 		 * only visible entries in a SRVSTUB dir are . and ..
5530 		 * This code handles the non-.. case.  We can't even get
5531 		 * this far if looking up ".".
5532 		 */
5533 		if (VTOR4(dvp)->r_flags & R4SRVSTUB)
5534 			return (ENOENT);
5535 
5536 		isdotdot = 0;
5537 		args.ctag = TAG_LOOKUP;
5538 	}
5539 
5540 	mi = VTOMI4(dvp);
5541 	recov_state.rs_flags = 0;
5542 	recov_state.rs_num_retry_despite_err = 0;
5543 
5544 	nvp = NULL;
5545 
5546 	/* Save the original mount point security information */
5547 	(void) save_mnt_secinfo(mi->mi_curr_serv);
5548 
5549 recov_retry:
5550 	e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP,
5551 			    &recov_state, NULL);
5552 	if (e.error) {
5553 		(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5554 		return (e.error);
5555 	}
5556 
5557 	argop = kmem_alloc(argoplist_size, KM_SLEEP);
5558 
5559 	/* PUTFH SAVEFH LOOKUP GETFH GETATTR RESTOREFH NVERIFY GETATTR ACCESS */
5560 	args.array_len = 9;
5561 	args.array = argop;
5562 
5563 	/* 0. putfh file */
5564 	argop[0].argop = OP_CPUTFH;
5565 	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh;
5566 
5567 	/* 1. savefh for the nverify */
5568 	argop[1].argop = OP_SAVEFH;
5569 
5570 	/* 2. lookup name */
5571 	if (isdotdot) {
5572 		argop[2].argop = OP_LOOKUPP;
5573 	} else {
5574 		argop[2].argop = OP_CLOOKUP;
5575 		argop[2].nfs_argop4_u.opclookup.cname = nm;
5576 	}
5577 
5578 	/* 3. resulting file handle */
5579 	argop[3].argop = OP_GETFH;
5580 
5581 	/* 4. resulting file attributes */
5582 	argop[4].argop = OP_GETATTR;
5583 	argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
5584 	argop[4].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
5585 
5586 	/* 5. restorefh back the directory for the nverify */
5587 	argop[5].argop = OP_RESTOREFH;
5588 
5589 	/* 6. nverify the change info */
5590 	argop[6].argop = OP_NVERIFY;
5591 	ver_fattr = &argop[6].nfs_argop4_u.opnverify.obj_attributes;
5592 	ver_fattr->attrmask = FATTR4_CHANGE_MASK;
5593 	ver_fattr->attrlist4 = (char *)&dchange;
5594 	ptr = (int32_t *)&dchange;
5595 	IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change);
5596 	ver_fattr->attrlist4_len = sizeof (fattr4_change);
5597 
5598 	/* 7. getattr directory */
5599 	argop[7].argop = OP_GETATTR;
5600 	argop[7].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
5601 	argop[7].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
5602 
5603 	/* 8. access directory */
5604 	argop[8].argop = OP_ACCESS;
5605 	argop[8].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE |
5606 			ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP;
5607 
5608 	doqueue = 1;
5609 	t = gethrtime();
5610 
5611 	rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e);
5612 
5613 	if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) {
5614 		/*
5615 		 * For WRONGSEC of a non-dotdot case, send secinfo directly
5616 		 * from this thread, do not go thru the recovery thread since
5617 		 * we need the nm information.
5618 		 *
5619 		 * Not doing dotdot case because there is no specification
5620 		 * for (PUTFH, SECINFO "..") yet.
5621 		 */
5622 		if (!isdotdot && res.status == NFS4ERR_WRONGSEC) {
5623 			if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr))) {
5624 				nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5625 					&recov_state, FALSE);
5626 			} else {
5627 				nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5628 					&recov_state, TRUE);
5629 			}
5630 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5631 			kmem_free(argop, argoplist_size);
5632 			if (!e.error)
5633 				goto recov_retry;
5634 			(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5635 			return (e.error);
5636 		}
5637 
5638 		if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL,
5639 		    OP_LOOKUP, NULL) == FALSE) {
5640 			nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5641 				&recov_state, TRUE);
5642 
5643 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5644 			kmem_free(argop, argoplist_size);
5645 			goto recov_retry;
5646 		}
5647 	}
5648 
5649 	nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE);
5650 
5651 	if (e.error || res.array_len == 0) {
5652 		/*
5653 		 * If e.error isn't set, then reply has no ops (or we couldn't
5654 		 * be here).  The only legal way to reply without an op array
5655 		 * is via NFS4ERR_MINOR_VERS_MISMATCH.  An ops array should
5656 		 * be in the reply for all other status values.
5657 		 *
5658 		 * For valid replies without an ops array, return ENOTSUP
5659 		 * (geterrno4 xlation of VERS_MISMATCH).  For illegal replies,
5660 		 * return EIO -- don't trust status.
5661 		 */
5662 		if (e.error == 0)
5663 			e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ?
5664 				ENOTSUP : EIO;
5665 
5666 		kmem_free(argop, argoplist_size);
5667 		(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5668 		return (e.error);
5669 	}
5670 
5671 	e.error = geterrno4(res.status);
5672 
5673 	/*
5674 	 * The PUTFH and SAVEFH may have failed.
5675 	 */
5676 	if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) ||
5677 		    (res.array[1].nfs_resop4_u.opsavefh.status != NFS4_OK)) {
5678 		nfs4_purge_stale_fh(e.error, dvp, cr);
5679 		goto exit;
5680 	}
5681 
5682 	/*
5683 	 * Check if the file exists, if it does delay entering
5684 	 * into the dnlc until after we update the directory
5685 	 * attributes so we don't cause it to get purged immediately.
5686 	 */
5687 	if (res.array[2].nfs_resop4_u.oplookup.status != NFS4_OK) {
5688 		/*
5689 		 * The lookup failed, probably no entry
5690 		 */
5691 		if (e.error == ENOENT && nfs4_lookup_neg_cache) {
5692 			dnlc_update(dvp, nm, DNLC_NO_VNODE);
5693 		}
5694 		goto exit;
5695 	}
5696 
5697 	if (res.array[3].nfs_resop4_u.opgetfh.status != NFS4_OK) {
5698 		/*
5699 		 * The file exists but we can't get its fh for
5700 		 * some unknown reason. Error out to be safe.
5701 		 */
5702 		goto exit;
5703 	}
5704 
5705 	fhp = &res.array[3].nfs_resop4_u.opgetfh.object;
5706 	if (fhp->nfs_fh4_len == 0) {
5707 		/*
5708 		 * The file exists but a bogus fh
5709 		 * some unknown reason.  Error out to be safe.
5710 		 */
5711 		e.error = EIO;
5712 		goto exit;
5713 	}
5714 	sfhp = sfh4_get(fhp, mi);
5715 
5716 	if (res.array[4].nfs_resop4_u.opgetattr.status != NFS4_OK) {
5717 		sfh4_rele(&sfhp);
5718 		e.error = EIO;
5719 		goto exit;
5720 	}
5721 	garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res;
5722 
5723 	/*
5724 	 * The RESTOREFH may have failed
5725 	 */
5726 	if (res.array[5].nfs_resop4_u.oprestorefh.status != NFS4_OK) {
5727 		sfh4_rele(&sfhp);
5728 		e.error = EIO;
5729 		goto exit;
5730 	}
5731 
5732 	if (res.array[6].nfs_resop4_u.opnverify.status != NFS4ERR_SAME) {
5733 		/*
5734 		 * First make sure the NVERIFY failed as we expected,
5735 		 * if it didn't then be conservative and error out
5736 		 * as we can't trust the directory.
5737 		 */
5738 		if (res.array[6].nfs_resop4_u.opnverify.status != NFS4_OK) {
5739 			sfh4_rele(&sfhp);
5740 			e.error = EIO;
5741 			goto exit;
5742 		}
5743 
5744 		/*
5745 		 * We know the NVERIFY "failed" so the directory has changed,
5746 		 * so we must:
5747 		 *	purge the caches (access and indirectly dnlc if needed)
5748 		 */
5749 		nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE);
5750 
5751 		if (res.array[7].nfs_resop4_u.opgetattr.status != NFS4_OK) {
5752 			sfh4_rele(&sfhp);
5753 			goto exit;
5754 		}
5755 		nfs4_attr_cache(dvp,
5756 				&res.array[7].nfs_resop4_u.opgetattr.ga_res,
5757 				t, cr, FALSE, NULL);
5758 
5759 		if (res.array[8].nfs_resop4_u.opaccess.status != NFS4_OK) {
5760 			nfs4_purge_stale_fh(e.error, dvp, cr);
5761 			sfh4_rele(&sfhp);
5762 			e.error = geterrno4(res.status);
5763 			goto exit;
5764 		}
5765 
5766 		/*
5767 		 * Now we know the directory is valid,
5768 		 * cache new directory access
5769 		 */
5770 		nfs4_access_cache(drp,
5771 			args.array[8].nfs_argop4_u.opaccess.access,
5772 			res.array[8].nfs_resop4_u.opaccess.access, cr);
5773 
5774 		/*
5775 		 * recheck VEXEC access
5776 		 */
5777 		cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr);
5778 		if (cacc != NFS4_ACCESS_ALLOWED) {
5779 			/*
5780 			 * Directory permissions might have been revoked
5781 			 */
5782 			if (cacc == NFS4_ACCESS_DENIED) {
5783 				sfh4_rele(&sfhp);
5784 				e.error = EACCES;
5785 				goto exit;
5786 			}
5787 
5788 			/*
5789 			 * Somehow we must not have asked for enough
5790 			 * so try a singleton ACCESS should never happen
5791 			 */
5792 			e.error = nfs4_access(dvp, VEXEC, 0, cr);
5793 			if (e.error) {
5794 				sfh4_rele(&sfhp);
5795 				goto exit;
5796 			}
5797 		}
5798 
5799 		e.error = geterrno4(res.status);
5800 	} else {
5801 		hrtime_t now;
5802 		hrtime_t delta = 0;
5803 
5804 		e.error = 0;
5805 
5806 		/*
5807 		 * Because the NVERIFY "succeeded" we know that the
5808 		 * directory attributes are still valid
5809 		 * so update r_time_attr_inval
5810 		 */
5811 		now = gethrtime();
5812 		mutex_enter(&drp->r_statelock);
5813 		if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) {
5814 			delta = now - drp->r_time_attr_saved;
5815 			if (delta < mi->mi_acdirmin)
5816 				delta = mi->mi_acdirmin;
5817 			else if (delta > mi->mi_acdirmax)
5818 				delta = mi->mi_acdirmax;
5819 		}
5820 		drp->r_time_attr_inval = now + delta;
5821 		mutex_exit(&drp->r_statelock);
5822 
5823 		/*
5824 		 * Even though we have a valid directory attr cache,
5825 		 * we may not have access.
5826 		 * This should almost always hit the cache.
5827 		 */
5828 		e.error = nfs4_access(dvp, VEXEC, 0, cr);
5829 		if (e.error) {
5830 			sfh4_rele(&sfhp);
5831 			goto exit;
5832 		}
5833 	}
5834 
5835 	/*
5836 	 * Now we have successfully completed the lookup, if the
5837 	 * directory has changed we now have the valid attributes.
5838 	 * We also know we have directory access.
5839 	 * Create the new rnode and insert it in the dnlc.
5840 	 */
5841 	if (isdotdot) {
5842 		e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1);
5843 		if (e.error) {
5844 			sfh4_rele(&sfhp);
5845 			goto exit;
5846 		}
5847 		/*
5848 		 * XXX if nfs4_make_dotdot uses an existing rnode
5849 		 * XXX it doesn't update the attributes.
5850 		 * XXX for now just save them again to save an OTW
5851 		 */
5852 		nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL);
5853 	} else {
5854 		nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr,
5855 			dvp, fn_get(VTOSV(dvp)->sv_name, nm));
5856 	}
5857 	sfh4_rele(&sfhp);
5858 
5859 	nrp = VTOR4(nvp);
5860 	mutex_enter(&nrp->r_statev4_lock);
5861 	if (!nrp->created_v4) {
5862 		mutex_exit(&nrp->r_statev4_lock);
5863 		dnlc_update(dvp, nm, nvp);
5864 	} else
5865 		mutex_exit(&nrp->r_statev4_lock);
5866 
5867 	*vpp = nvp;
5868 
5869 exit:
5870 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5871 	kmem_free(argop, argoplist_size);
5872 	(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5873 	return (e.error);
5874 }
5875 
#ifdef DEBUG
/*
 * Debugging aid: log the ops of a compound request, one CE_NOTE per op,
 * to the zone returned by getzoneid().
 *
 *	where	- caller-supplied tag printed in the heading line
 *	argbase	- first element of the argument op array
 *	argcnt	- number of ops in argbase to dump
 */
void
nfs4lookup_dump_compound(char *where, nfs_argop4 *argbase, int argcnt)
{
	/*
	 * The index is a plain int: it is compared against the int argcnt
	 * (so a negative count dumps nothing instead of being converted
	 * to a huge unsigned bound) and it is printed with %d.
	 */
	int i;
	uint_t len;
	zoneid_t zoneid = getzoneid();
	char *s;

	zcmn_err(zoneid, CE_NOTE, "%s: dumping cmpd", where);
	for (i = 0; i < argcnt; i++) {
		nfs_argop4 *op = &argbase[i];
		switch (op->argop) {
		case OP_CPUTFH:
		case OP_PUTFH:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, putfh", i);
			break;
		case OP_PUTROOTFH:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, putrootfh", i);
			break;
		case OP_CLOOKUP:
			/* the name is carried as a plain C string */
			s = op->nfs_argop4_u.opclookup.cname;
			zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s", i, s);
			break;
		case OP_LOOKUP:
			/* convert the utf8 name; the copy is freed below */
			s = utf8_to_str(&op->nfs_argop4_u.oplookup.objname,
			    &len, NULL);
			zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s", i, s);
			kmem_free(s, len);
			break;
		case OP_LOOKUPP:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, lookupp ..", i);
			break;
		case OP_GETFH:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, getfh", i);
			break;
		case OP_GETATTR:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, getattr", i);
			break;
		case OP_OPENATTR:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, openattr", i);
			break;
		default:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, opcode %d", i,
			    op->argop);
			break;
		}
	}
}
#endif
5925 
5926 /*
5927  * nfs4lookup_setup - constructs a multi-lookup compound request.
5928  *
5929  * Given the path "nm1/nm2/.../nmn", the following compound requests
5930  * may be created:
5931  *
5932  * Note: Getfh is not needed because filehandle attr is mandatory, but it
5933  * is faster, for now.
5934  *
5935  * l4_getattrs indicates the type of compound requested.
5936  *
5937  * LKP4_NO_ATTRIBUTES - no attributes (used by secinfo):
5938  *
5939  *	compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ...  Lookup {nmn} }
5940  *
5941  *   total number of ops is n + 1.
5942  *
5943  * LKP4_LAST_NAMED_ATTR - multi-component path for a named
5944  *      attribute: create lookups plus one OPENATTR/GETFH/GETATTR
5945  *      before the last component, and only get attributes
5946  *      for the last component.  Note that the second-to-last
5947  *	pathname component is XATTR_RPATH, which does NOT go
5948  *	over-the-wire as a lookup.
5949  *
5950  *      compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn-2};
5951  *		Openattr; Getfh; Getattr; Lookup {nmn}; Getfh; Getattr }
5952  *
5953  *   and total number of ops is n + 5.
5954  *
5955  * LKP4_LAST_ATTRDIR - multi-component path for the hidden named
5956  *      attribute directory: create lookups plus an OPENATTR
5957  *	replacing the last lookup.  Note that the last pathname
5958  *	component is XATTR_RPATH, which does NOT go over-the-wire
5959  *	as a lookup.
5960  *
5961  *      compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Getfh; Getattr;
5962  *		Openattr; Getfh; Getattr }
5963  *
5964  *   and total number of ops is n + 5.
5965  *
5966  * LKP4_ALL_ATTRIBUTES - create lookups and get attributes for intermediate
5967  *	nodes too.
5968  *
5969  *	compound { Put*fh; Lookup {nm1}; Getfh; Getattr;
5970  *		Lookup {nm2}; ...  Lookup {nmn}; Getfh; Getattr }
5971  *
5972  *   and total number of ops is 3*n + 1.
5973  *
5974  * All cases: returns the index in the arg array of the final LOOKUP op, or
5975  * -1 if no LOOKUPs were used.
5976  */
5977 int
5978 nfs4lookup_setup(char *nm, lookup4_param_t *lookupargp, int needgetfh)
5979 {
5980 	enum lkp4_attr_setup l4_getattrs = lookupargp->l4_getattrs;
5981 	nfs_argop4 *argbase, *argop;
5982 	int arglen, argcnt;
5983 	int n = 1;	/* number of components */
5984 	int nga = 1;	/* number of Getattr's in request */
5985 	char c = '\0', *s, *p;
5986 	int lookup_idx = -1;
5987 	int argoplist_size;
5988 
5989 	/* set lookuparg response result to 0 */
5990 	lookupargp->resp->status = NFS4_OK;
5991 
5992 	/* skip leading "/" or "." e.g. ".//./" if there is */
5993 	for (; ; nm++) {
5994 		if (*nm != '/' && *nm != '.')
5995 			break;
5996 
5997 		/* ".." is counted as 1 component */
5998 		if (*nm == '.' && *(nm + 1) == '.')
5999 			break;
6000 	}
6001 
6002 	/*
6003 	 * Find n = number of components - nm must be null terminated
6004 	 * Skip "." components.
6005 	 */
6006 	if (*nm != '\0') {
6007 		for (n = 1, s = nm; *s != '\0'; s++) {
6008 			if ((*s == '/') && (*(s + 1) != '/') &&
6009 				    (*(s + 1) != '\0') &&
6010 				    !(*(s + 1) == '.' && (*(s + 2) == '/' ||
6011 					*(s + 2) == '\0')))
6012 				n++;
6013 		}
6014 	} else
6015 		n = 0;
6016 
6017 	/*
6018 	 * nga is number of components that need Getfh+Getattr
6019 	 */
6020 	switch (l4_getattrs) {
6021 	case LKP4_NO_ATTRIBUTES:
6022 		nga = 0;
6023 		break;
6024 	case LKP4_ALL_ATTRIBUTES:
6025 		nga = n;
6026 		/*
6027 		 * Always have at least 1 getfh, getattr pair
6028 		 */
6029 		if (nga == 0)
6030 			nga++;
6031 		break;
6032 	case LKP4_LAST_ATTRDIR:
6033 	case LKP4_LAST_NAMED_ATTR:
6034 		nga = n+1;
6035 		break;
6036 	}
6037 
6038 	/*
6039 	 * If change to use the filehandle attr instead of getfh
6040 	 * the following line can be deleted.
6041 	 */
6042 	nga *= 2;
6043 
6044 	/*
6045 	 * calculate number of ops in request as
6046 	 * header + trailer + lookups + getattrs
6047 	 */
6048 	arglen = lookupargp->header_len + lookupargp->trailer_len + n + nga;
6049 
6050 	argoplist_size = arglen * sizeof (nfs_argop4);
6051 	argop = argbase = kmem_alloc(argoplist_size, KM_SLEEP);
6052 	lookupargp->argsp->array = argop;
6053 
6054 	argcnt = lookupargp->header_len;
6055 	argop += argcnt;
6056 
6057 	/*
6058 	 * loop and create a lookup op and possibly getattr/getfh for
6059 	 * each component. Skip "." components.
6060 	 */
6061 	for (s = nm; *s != '\0'; s = p) {
6062 		/*
6063 		 * Set up a pathname struct for each component if needed
6064 		 */
6065 		while (*s == '/')
6066 			s++;
6067 		if (*s == '\0')
6068 			break;
6069 		for (p = s; (*p != '/') && (*p != '\0'); p++);
6070 		c = *p;
6071 		*p = '\0';
6072 
6073 		if (s[0] == '.' && s[1] == '\0') {
6074 			*p = c;
6075 			continue;
6076 		}
6077 		if (l4_getattrs == LKP4_LAST_ATTRDIR &&
6078 		    strcmp(s, XATTR_RPATH) == 0) {
6079 			/* getfh XXX may not be needed in future */
6080 			argop->argop = OP_GETFH;
6081 			argop++;
6082 			argcnt++;
6083 
6084 			/* getattr */
6085 			argop->argop = OP_GETATTR;
6086 			argop->nfs_argop4_u.opgetattr.attr_request =
6087 							lookupargp->ga_bits;
6088 			argop->nfs_argop4_u.opgetattr.mi =
6089 				lookupargp->mi;
6090 			argop++;
6091 			argcnt++;
6092 
6093 			/* openattr */
6094 			argop->argop = OP_OPENATTR;
6095 		} else if (l4_getattrs == LKP4_LAST_NAMED_ATTR &&
6096 		    strcmp(s, XATTR_RPATH) == 0) {
6097 			/* openattr */
6098 			argop->argop = OP_OPENATTR;
6099 			argop++;
6100 			argcnt++;
6101 
6102 			/* getfh XXX may not be needed in future */
6103 			argop->argop = OP_GETFH;
6104 			argop++;
6105 			argcnt++;
6106 
6107 			/* getattr */
6108 			argop->argop = OP_GETATTR;
6109 			argop->nfs_argop4_u.opgetattr.attr_request =
6110 							lookupargp->ga_bits;
6111 			argop->nfs_argop4_u.opgetattr.mi =
6112 							lookupargp->mi;
6113 			argop++;
6114 			argcnt++;
6115 			*p = c;
6116 			continue;
6117 		} else if (s[0] == '.' && s[1] == '.' && s[2] == '\0') {
6118 			/* lookupp */
6119 			argop->argop = OP_LOOKUPP;
6120 		} else {
6121 			/* lookup */
6122 			argop->argop = OP_LOOKUP;
6123 			(void) str_to_utf8(s,
6124 				&argop->nfs_argop4_u.oplookup.objname);
6125 		}
6126 		lookup_idx = argcnt;
6127 		argop++;
6128 		argcnt++;
6129 
6130 		*p = c;
6131 
6132 		if (l4_getattrs == LKP4_ALL_ATTRIBUTES) {
6133 			/* getfh XXX may not be needed in future */
6134 			argop->argop = OP_GETFH;
6135 			argop++;
6136 			argcnt++;
6137 
6138 			/* getattr */
6139 			argop->argop = OP_GETATTR;
6140 			argop->nfs_argop4_u.opgetattr.attr_request =
6141 							lookupargp->ga_bits;
6142 			argop->nfs_argop4_u.opgetattr.mi =
6143 							lookupargp->mi;
6144 			argop++;
6145 			argcnt++;
6146 		}
6147 	}
6148 
6149 	if ((l4_getattrs != LKP4_NO_ATTRIBUTES) &&
6150 		((l4_getattrs != LKP4_ALL_ATTRIBUTES) || (lookup_idx < 0))) {
6151 		if (needgetfh) {
6152 			/* stick in a post-lookup getfh */
6153 			argop->argop = OP_GETFH;
6154 			argcnt++;
6155 			argop++;
6156 		}
6157 		/* post-lookup getattr */
6158 		argop->argop = OP_GETATTR;
6159 		argop->nfs_argop4_u.opgetattr.attr_request =
6160 						lookupargp->ga_bits;
6161 		argop->nfs_argop4_u.opgetattr.mi = lookupargp->mi;
6162 		argcnt++;
6163 	}
6164 	argcnt += lookupargp->trailer_len;	/* actual op count */
6165 	lookupargp->argsp->array_len = argcnt;
6166 	lookupargp->arglen = arglen;
6167 
6168 #ifdef DEBUG
6169 	if (nfs4_client_lookup_debug)
6170 		nfs4lookup_dump_compound("nfs4lookup_setup", argbase, argcnt);
6171 #endif
6172 
6173 	return (lookup_idx);
6174 }
6175 
6176 static int
6177 nfs4openattr(vnode_t *dvp, vnode_t **avp, int cflag, cred_t *cr)
6178 {
6179 	COMPOUND4args_clnt	args;
6180 	COMPOUND4res_clnt	res;
6181 	GETFH4res	*gf_res = NULL;
6182 	nfs_argop4	argop[4];
6183 	nfs_resop4	*resop = NULL;
6184 	nfs4_sharedfh_t *sfhp;
6185 	hrtime_t t;
6186 	nfs4_error_t	e;
6187 
6188 	rnode4_t	*drp;
6189 	int		doqueue = 1;
6190 	vnode_t		*vp;
6191 	int		needrecov = 0;
6192 	nfs4_recov_state_t recov_state;
6193 
6194 	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
6195 
6196 	*avp = NULL;
6197 	recov_state.rs_flags = 0;
6198 	recov_state.rs_num_retry_despite_err = 0;
6199 
6200 recov_retry:
6201 	/* COMPOUND: putfh, openattr, getfh, getattr */
6202 	args.array_len = 4;
6203 	args.array = argop;
6204 	args.ctag = TAG_OPENATTR;
6205 
6206 	e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state);
6207 	if (e.error)
6208 		return (e.error);
6209 
6210 	drp = VTOR4(dvp);
6211 
6212 	/* putfh */
6213 	argop[0].argop = OP_CPUTFH;
6214 	argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
6215 
6216 	/* openattr */
6217 	argop[1].argop = OP_OPENATTR;
6218 	argop[1].nfs_argop4_u.opopenattr.createdir = (cflag ? TRUE : FALSE);
6219 
6220 	/* getfh */
6221 	argop[2].argop = OP_GETFH;
6222 
6223 	/* getattr */
6224 	argop[3].argop = OP_GETATTR;
6225 	argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
6226 	argop[3].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
6227 
6228 	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
6229 	    "nfs4openattr: %s call, drp %s", needrecov ? "recov" : "first",
6230 	    rnode4info(drp)));
6231 
6232 	t = gethrtime();
6233 
6234 	rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e);
6235 
6236 	needrecov = nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp);
6237 	if (needrecov) {
6238 		bool_t abort;
6239 
6240 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
6241 		    "nfs4openattr: initiating recovery\n"));
6242 
6243 		abort = nfs4_start_recovery(&e,
6244 				VTOMI4(dvp), dvp, NULL, NULL, NULL,
6245 				OP_OPENATTR, NULL);
6246 		nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
6247 		if (!e.error) {
6248 			e.error = geterrno4(res.status);
6249 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6250 		}
6251 		if (abort == FALSE)
6252 			goto recov_retry;
6253 		return (e.error);
6254 	}
6255 
6256 	if (e.error) {
6257 		nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
6258 		return (e.error);
6259 	}
6260 
6261 	if (res.status) {
6262 		/*
6263 		 * If OTW errro is NOTSUPP, then it should be
6264 		 * translated to EINVAL.  All Solaris file system
6265 		 * implementations return EINVAL to the syscall layer
6266 		 * when the attrdir cannot be created due to an
6267 		 * implementation restriction or noxattr mount option.
6268 		 */
6269 		if (res.status == NFS4ERR_NOTSUPP) {
6270 			mutex_enter(&drp->r_statelock);
6271 			if (drp->r_xattr_dir)
6272 				VN_RELE(drp->r_xattr_dir);
6273 			VN_HOLD(NFS4_XATTR_DIR_NOTSUPP);
6274 			drp->r_xattr_dir = NFS4_XATTR_DIR_NOTSUPP;
6275 			mutex_exit(&drp->r_statelock);
6276 
6277 			e.error = EINVAL;
6278 		} else {
6279 			e.error = geterrno4(res.status);
6280 		}
6281 
6282 		if (e.error) {
6283 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6284 			nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state,
6285 				    needrecov);
6286 			return (e.error);
6287 		}
6288 	}
6289 
6290 	resop = &res.array[0];  /* putfh res */
6291 	ASSERT(resop->nfs_resop4_u.opgetfh.status == NFS4_OK);
6292 
6293 	resop = &res.array[1];  /* openattr res */
6294 	ASSERT(resop->nfs_resop4_u.opopenattr.status == NFS4_OK);
6295 
6296 	resop = &res.array[2];  /* getfh res */
6297 	gf_res = &resop->nfs_resop4_u.opgetfh;
6298 	if (gf_res->object.nfs_fh4_len == 0) {
6299 		*avp = NULL;
6300 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6301 		nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
6302 		return (ENOENT);
6303 	}
6304 
6305 	sfhp = sfh4_get(&gf_res->object, VTOMI4(dvp));
6306 	vp = makenfs4node(sfhp, &res.array[3].nfs_resop4_u.opgetattr.ga_res,
6307 				dvp->v_vfsp, t, cr, dvp,
6308 				fn_get(VTOSV(dvp)->sv_name, XATTR_RPATH));
6309 	sfh4_rele(&sfhp);
6310 
6311 	if (e.error)
6312 		PURGE_ATTRCACHE4(vp);
6313 
6314 	mutex_enter(&vp->v_lock);
6315 	vp->v_flag |= V_XATTRDIR;
6316 	mutex_exit(&vp->v_lock);
6317 
6318 	*avp = vp;
6319 
6320 	mutex_enter(&drp->r_statelock);
6321 	if (drp->r_xattr_dir)
6322 		VN_RELE(drp->r_xattr_dir);
6323 	VN_HOLD(vp);
6324 	drp->r_xattr_dir = vp;
6325 
6326 	/*
6327 	 * Invalidate pathconf4 cache because r_xattr_dir is no longer
6328 	 * NULL.  xattrs could be created at any time, and we have no
6329 	 * way to update pc4_xattr_exists in the base object if/when
6330 	 * it happens.
6331 	 */
6332 	drp->r_pathconf.pc4_xattr_valid = 0;
6333 
6334 	mutex_exit(&drp->r_statelock);
6335 
6336 	nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
6337 
6338 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6339 
6340 	return (0);
6341 }
6342 
6343 /* ARGSUSED */
6344 static int
6345 nfs4_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
6346 	int mode, vnode_t **vpp, cred_t *cr, int flags)
6347 {
6348 	int error;
6349 	vnode_t *vp = NULL;
6350 	rnode4_t *rp;
6351 	struct vattr vattr;
6352 	rnode4_t *drp;
6353 	vnode_t *tempvp;
6354 	enum createmode4 createmode;
6355 	bool_t must_trunc = FALSE;
6356 
6357 	if (nfs_zone() != VTOMI4(dvp)->mi_zone)
6358 		return (EPERM);
6359 	if (exclusive == EXCL && (dvp->v_flag & V_XATTRDIR)) {
6360 		return (EINVAL);
6361 	}
6362 
6363 	/* . and .. have special meaning in the protocol, reject them. */
6364 
6365 	if (nm[0] == '.' && (nm[1] == '\0' || (nm[1] == '.' && nm[2] == '\0')))
6366 		return (EISDIR);
6367 
6368 	drp = VTOR4(dvp);
6369 
6370 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp)))
6371 		return (EINTR);
6372 
6373 top:
6374 	/*
6375 	 * We make a copy of the attributes because the caller does not
6376 	 * expect us to change what va points to.
6377 	 */
6378 	vattr = *va;
6379 
6380 	/*
6381 	 * If the pathname is "", then dvp is the root vnode of
6382 	 * a remote file mounted over a local directory.
6383 	 * All that needs to be done is access
6384 	 * checking and truncation.  Note that we avoid doing
6385 	 * open w/ create because the parent directory might
6386 	 * be in pseudo-fs and the open would fail.
6387 	 */
6388 	if (*nm == '\0') {
6389 		error = 0;
6390 		VN_HOLD(dvp);
6391 		vp = dvp;
6392 		must_trunc = TRUE;
6393 	} else {
6394 		/*
6395 		 * We need to go over the wire, just to be sure whether the
6396 		 * file exists or not.  Using the DNLC can be dangerous in
6397 		 * this case when making a decision regarding existence.
6398 		 */
6399 		error = nfs4lookup(dvp, nm, &vp, cr, 1);
6400 	}
6401 
6402 	if (exclusive)
6403 		createmode = EXCLUSIVE4;
6404 	else
6405 		createmode = GUARDED4;
6406 
6407 	/*
6408 	 * error would be set if the file does not exist on the
6409 	 * server, so lets go create it.
6410 	 */
6411 	if (error) {
6412 		goto create_otw;
6413 	}
6414 
6415 	/*
6416 	 * File does exist on the server
6417 	 */
6418 	if (exclusive == EXCL)
6419 		error = EEXIST;
6420 	else if (vp->v_type == VDIR && (mode & VWRITE))
6421 		error = EISDIR;
6422 	else {
6423 		/*
6424 		 * If vnode is a device, create special vnode.
6425 		 */
6426 		if (ISVDEV(vp->v_type)) {
6427 			tempvp = vp;
6428 			vp = specvp(vp, vp->v_rdev, vp->v_type, cr);
6429 			VN_RELE(tempvp);
6430 		}
6431 		if (!(error = VOP_ACCESS(vp, mode, 0, cr))) {
6432 			if ((vattr.va_mask & AT_SIZE) &&
6433 			    vp->v_type == VREG) {
6434 				rp = VTOR4(vp);
6435 				/*
6436 				 * Check here for large file handled
6437 				 * by LF-unaware process (as
6438 				 * ufs_create() does)
6439 				 */
6440 				if (!(flags & FOFFMAX)) {
6441 					mutex_enter(&rp->r_statelock);
6442 					if (rp->r_size > MAXOFF32_T)
6443 						error = EOVERFLOW;
6444 					mutex_exit(&rp->r_statelock);
6445 				}
6446 
6447 				/* if error is set then we need to return */
6448 				if (error) {
6449 					nfs_rw_exit(&drp->r_rwlock);
6450 					VN_RELE(vp);
6451 					return (error);
6452 				}
6453 
6454 				if (must_trunc) {
6455 					vattr.va_mask = AT_SIZE;
6456 					error = nfs4setattr(vp, &vattr, 0, cr,
6457 						NULL);
6458 				} else {
6459 				/*
6460 				 * we know we have a regular file that already
6461 				 * exists and we may end up truncating the file
6462 				 * as a result of the open_otw, so flush out
6463 				 * any dirty pages for this file first.
6464 				 */
6465 					if (nfs4_has_pages(vp) &&
6466 					    ((rp->r_flags & R4DIRTY) ||
6467 					    rp->r_count > 0 ||
6468 					    rp->r_mapcnt > 0)) {
6469 						error = nfs4_putpage(vp,
6470 							(offset_t)0, 0, 0, cr);
6471 						if (error && (error == ENOSPC ||
6472 						    error == EDQUOT)) {
6473 							mutex_enter(
6474 							    &rp->r_statelock);
6475 							if (!rp->r_error)
6476 								rp->r_error =
6477 								    error;
6478 							mutex_exit(
6479 							    &rp->r_statelock);
6480 						}
6481 					}
6482 					vattr.va_mask = (AT_SIZE |
6483 							AT_TYPE | AT_MODE);
6484 					vattr.va_type = VREG;
6485 					createmode = UNCHECKED4;
6486 					goto create_otw;
6487 				}
6488 			}
6489 		}
6490 	}
6491 	nfs_rw_exit(&drp->r_rwlock);
6492 	if (error) {
6493 		VN_RELE(vp);
6494 	} else {
6495 		*vpp = vp;
6496 	}
6497 	return (error);
6498 
6499 create_otw:
6500 	dnlc_remove(dvp, nm);
6501 
6502 	ASSERT(vattr.va_mask & AT_TYPE);
6503 
6504 	/*
6505 	 * If not a regular file let nfs4mknod() handle it.
6506 	 */
6507 	if (vattr.va_type != VREG) {
6508 		error = nfs4mknod(dvp, nm, &vattr, exclusive, mode, vpp, cr);
6509 		nfs_rw_exit(&drp->r_rwlock);
6510 		return (error);
6511 	}
6512 
6513 	/*
6514 	 * It _is_ a regular file.
6515 	 */
6516 	ASSERT(vattr.va_mask & AT_MODE);
6517 	if (MANDMODE(vattr.va_mode)) {
6518 		nfs_rw_exit(&drp->r_rwlock);
6519 		return (EACCES);
6520 	}
6521 
6522 	/*
6523 	 * If this happens to be a mknod of a regular file, then flags will
6524 	 * have neither FREAD or FWRITE.  However, we must set at least one
6525 	 * for the call to nfs4open_otw.  If it's open(O_CREAT) driving
6526 	 * nfs4_create, then either FREAD, FWRITE, or FRDWR has already been
6527 	 * set (based on openmode specified by app).
6528 	 */
6529 	if ((flags & (FREAD|FWRITE)) == 0)
6530 		flags |= (FREAD|FWRITE);
6531 
6532 	error = nfs4open_otw(dvp, nm, &vattr, vpp, cr, 1, flags, createmode, 0);
6533 
6534 	if (vp != NULL) {
6535 		/* if create was successful, throw away the file's pages */
6536 		if (!error && (vattr.va_mask & AT_SIZE))
6537 			nfs4_invalidate_pages(vp, (vattr.va_size & PAGEMASK),
6538 				cr);
6539 		/* release the lookup hold */
6540 		VN_RELE(vp);
6541 		vp = NULL;
6542 	}
6543 
6544 	/*
6545 	 * validate that we opened a regular file. This handles a misbehaving
6546 	 * server that returns an incorrect FH.
6547 	 */
6548 	if ((error == 0) && *vpp && (*vpp)->v_type != VREG) {
6549 		error = EISDIR;
6550 		VN_RELE(*vpp);
6551 	}
6552 
6553 	/*
6554 	 * If this is not an exclusive create, then the CREATE
6555 	 * request will be made with the GUARDED mode set.  This
6556 	 * means that the server will return EEXIST if the file
6557 	 * exists.  The file could exist because of a retransmitted
6558 	 * request.  In this case, we recover by starting over and
6559 	 * checking to see whether the file exists.  This second
6560 	 * time through it should and a CREATE request will not be
6561 	 * sent.
6562 	 *
6563 	 * This handles the problem of a dangling CREATE request
6564 	 * which contains attributes which indicate that the file
6565 	 * should be truncated.  This retransmitted request could
6566 	 * possibly truncate valid data in the file if not caught
6567 	 * by the duplicate request mechanism on the server or if
6568 	 * not caught by other means.  The scenario is:
6569 	 *
6570 	 * Client transmits CREATE request with size = 0
6571 	 * Client times out, retransmits request.
6572 	 * Response to the first request arrives from the server
6573 	 *  and the client proceeds on.
6574 	 * Client writes data to the file.
6575 	 * The server now processes retransmitted CREATE request
6576 	 *  and truncates file.
6577 	 *
6578 	 * The use of the GUARDED CREATE request prevents this from
6579 	 * happening because the retransmitted CREATE would fail
6580 	 * with EEXIST and would not truncate the file.
6581 	 */
6582 	if (error == EEXIST && exclusive == NONEXCL) {
6583 #ifdef DEBUG
6584 		nfs4_create_misses++;
6585 #endif
6586 		goto top;
6587 	}
6588 	nfs_rw_exit(&drp->r_rwlock);
6589 	return (error);
6590 }
6591 
6592 /*
6593  * Create compound (for mkdir, mknod, symlink):
6594  * { Putfh <dfh>; Create; Getfh; Getattr }
6595  * It's okay if setattr failed to set gid - this is not considered
6596  * an error, but purge attrs in that case.
6597  */
6598 static int
6599 call_nfs4_create_req(vnode_t *dvp, char *nm, void *data, struct vattr *va,
6600 	vnode_t **vpp, cred_t *cr, nfs_ftype4 type)
6601 {
6602 	int need_end_op = FALSE;
6603 	COMPOUND4args_clnt args;
6604 	COMPOUND4res_clnt res, *resp = NULL;
6605 	nfs_argop4 *argop;
6606 	nfs_resop4 *resop;
6607 	int doqueue;
6608 	mntinfo4_t *mi;
6609 	rnode4_t *drp = VTOR4(dvp);
6610 	change_info4 *cinfo;
6611 	GETFH4res *gf_res;
6612 	struct vattr vattr;
6613 	vnode_t *vp;
6614 	fattr4 *crattr;
6615 	bool_t needrecov = FALSE;
6616 	nfs4_recov_state_t recov_state;
6617 	nfs4_sharedfh_t *sfhp = NULL;
6618 	hrtime_t t;
6619 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
6620 	int numops, argoplist_size, setgid_flag, idx_create, idx_fattr;
6621 	dirattr_info_t dinfo, *dinfop;
6622 	servinfo4_t *svp;
6623 	bitmap4 supp_attrs;
6624 
6625 	ASSERT(type == NF4DIR || type == NF4LNK || type == NF4BLK ||
6626 		type == NF4CHR || type == NF4SOCK || type == NF4FIFO);
6627 
6628 	mi = VTOMI4(dvp);
6629 
6630 	/*
6631 	 * Make sure we properly deal with setting the right gid
6632 	 * on a new directory to reflect the parent's setgid bit
6633 	 */
6634 	setgid_flag = 0;
6635 	if (type == NF4DIR) {
6636 		struct vattr dva;
6637 
6638 		va->va_mode &= ~VSGID;
6639 		dva.va_mask = AT_MODE | AT_GID;
6640 		if (VOP_GETATTR(dvp, &dva, 0, cr) == 0) {
6641 
6642 			/*
6643 			 * If the parent's directory has the setgid bit set
6644 			 * _and_ the client was able to get a valid mapping
6645 			 * for the parent dir's owner_group, we want to
6646 			 * append NVERIFY(owner_group == dva.va_gid) and
6647 			 * SETTATTR to the CREATE compound.
6648 			 */
6649 			if (mi->mi_flags & MI4_GRPID || dva.va_mode & VSGID) {
6650 				setgid_flag = 1;
6651 				va->va_mode |= VSGID;
6652 				if (dva.va_gid != GID_NOBODY) {
6653 					va->va_mask |= AT_GID;
6654 					va->va_gid = dva.va_gid;
6655 				}
6656 			}
6657 		}
6658 	}
6659 
6660 	/*
6661 	 * Create ops:
6662 	 *	0:putfh(dir) 1:savefh(dir) 2:create 3:getfh(new) 4:getattr(new)
6663 	 *	5:restorefh(dir) 6:getattr(dir)
6664 	 *
6665 	 * if (setgid)
6666 	 *	0:putfh(dir) 1:create 2:getfh(new) 3:getattr(new)
6667 	 *	4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new)
6668 	 *	8:nverify 9:setattr
6669 	 */
6670 	if (setgid_flag) {
6671 		numops = 10;
6672 		idx_create = 1;
6673 		idx_fattr = 3;
6674 	} else {
6675 		numops = 7;
6676 		idx_create = 2;
6677 		idx_fattr = 4;
6678 	}
6679 
6680 	ASSERT(nfs_zone() == mi->mi_zone);
6681 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) {
6682 		return (EINTR);
6683 	}
6684 	recov_state.rs_flags = 0;
6685 	recov_state.rs_num_retry_despite_err = 0;
6686 
6687 	argoplist_size = numops * sizeof (nfs_argop4);
6688 	argop = kmem_alloc(argoplist_size, KM_SLEEP);
6689 
6690 recov_retry:
6691 	if (type == NF4LNK)
6692 		args.ctag = TAG_SYMLINK;
6693 	else if (type == NF4DIR)
6694 		args.ctag = TAG_MKDIR;
6695 	else
6696 		args.ctag = TAG_MKNOD;
6697 
6698 	args.array_len = numops;
6699 	args.array = argop;
6700 
6701 	if (e.error = nfs4_start_op(mi, dvp, NULL, &recov_state)) {
6702 		nfs_rw_exit(&drp->r_rwlock);
6703 		kmem_free(argop, argoplist_size);
6704 		return (e.error);
6705 	}
6706 	need_end_op = TRUE;
6707 
6708 
6709 	/* 0: putfh directory */
6710 	argop[0].argop = OP_CPUTFH;
6711 	argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
6712 
6713 	/* 1/2: Create object */
6714 	argop[idx_create].argop = OP_CCREATE;
6715 	argop[idx_create].nfs_argop4_u.opccreate.cname = nm;
6716 	argop[idx_create].nfs_argop4_u.opccreate.type = type;
6717 	if (type == NF4LNK) {
6718 		/*
6719 		 * symlink, treat name as data
6720 		 */
6721 		ASSERT(data != NULL);
6722 		argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.clinkdata =
6723 							(char *)data;
6724 	}
6725 	if (type == NF4BLK || type == NF4CHR) {
6726 		ASSERT(data != NULL);
6727 		argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.devdata =
6728 							*((specdata4 *)data);
6729 	}
6730 
6731 	crattr = &argop[idx_create].nfs_argop4_u.opccreate.createattrs;
6732 
6733 	svp = drp->r_server;
6734 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
6735 	supp_attrs = svp->sv_supp_attrs;
6736 	nfs_rw_exit(&svp->sv_lock);
6737 
6738 	if (vattr_to_fattr4(va, NULL, crattr, 0, OP_CREATE, supp_attrs)) {
6739 		nfs_rw_exit(&drp->r_rwlock);
6740 		nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov);
6741 		e.error = EINVAL;
6742 		kmem_free(argop, argoplist_size);
6743 		return (e.error);
6744 	}
6745 
6746 	/* 2/3: getfh fh of created object */
6747 	ASSERT(idx_create + 1 == idx_fattr - 1);
6748 	argop[idx_create + 1].argop = OP_GETFH;
6749 
6750 	/* 3/4: getattr of new object */
6751 	argop[idx_fattr].argop = OP_GETATTR;
6752 	argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
6753 	argop[idx_fattr].nfs_argop4_u.opgetattr.mi = mi;
6754 
6755 	if (setgid_flag) {
6756 		vattr_t	_v;
6757 
6758 		argop[4].argop = OP_SAVEFH;
6759 
6760 		argop[5].argop = OP_CPUTFH;
6761 		argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
6762 
6763 		argop[6].argop = OP_GETATTR;
6764 		argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
6765 		argop[6].nfs_argop4_u.opgetattr.mi = mi;
6766 
6767 		argop[7].argop = OP_RESTOREFH;
6768 
6769 		/*
6770 		 * nverify
6771 		 *
6772 		 * XXX - Revisit the last argument to nfs4_end_op()
6773 		 *	 once 5020486 is fixed.
6774 		 */
6775 		_v.va_mask = AT_GID;
6776 		_v.va_gid = va->va_gid;
6777 		if (e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY,
6778 		    supp_attrs)) {
6779 			nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE);
6780 			nfs_rw_exit(&drp->r_rwlock);
6781 			nfs4_fattr4_free(crattr);
6782 			kmem_free(argop, argoplist_size);
6783 			return (e.error);
6784 		}
6785 
6786 		/*
6787 		 * setattr
6788 		 *
6789 		 * We _know_ we're not messing with AT_SIZE or AT_XTIME,
6790 		 * so no need for stateid or flags. Also we specify NULL
6791 		 * rp since we're only interested in setting owner_group
6792 		 * attributes.
6793 		 */
6794 		nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr, supp_attrs,
6795 		    &e.error, 0);
6796 
6797 		if (e.error) {
6798 			nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE);
6799 			nfs_rw_exit(&drp->r_rwlock);
6800 			nfs4_fattr4_free(crattr);
6801 			nfs4args_verify_free(&argop[8]);
6802 			kmem_free(argop, argoplist_size);
6803 			return (e.error);
6804 		}
6805 	} else {
6806 		argop[1].argop = OP_SAVEFH;
6807 
6808 		argop[5].argop = OP_RESTOREFH;
6809 
6810 		argop[6].argop = OP_GETATTR;
6811 		argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
6812 		argop[6].nfs_argop4_u.opgetattr.mi = mi;
6813 	}
6814 
6815 	dnlc_remove(dvp, nm);
6816 
6817 	doqueue = 1;
6818 	t = gethrtime();
6819 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
6820 
6821 	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
6822 	if (e.error) {
6823 		PURGE_ATTRCACHE4(dvp);
6824 		if (!needrecov)
6825 			goto out;
6826 	}
6827 
6828 	if (needrecov) {
6829 		if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL,
6830 		    OP_CREATE, NULL) == FALSE) {
6831 			nfs4_end_op(mi, dvp, NULL, &recov_state,
6832 				    needrecov);
6833 			need_end_op = FALSE;
6834 			nfs4_fattr4_free(crattr);
6835 			if (setgid_flag) {
6836 				nfs4args_verify_free(&argop[8]);
6837 				nfs4args_setattr_free(&argop[9]);
6838 			}
6839 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6840 			goto recov_retry;
6841 		}
6842 	}
6843 
6844 	resp = &res;
6845 
6846 	if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) {
6847 
6848 		if (res.status == NFS4ERR_BADOWNER)
6849 			nfs4_log_badowner(mi, OP_CREATE);
6850 
6851 		e.error = geterrno4(res.status);
6852 
6853 		/*
6854 		 * This check is left over from when create was implemented
6855 		 * using a setattr op (instead of createattrs).  If the
6856 		 * putfh/create/getfh failed, the error was returned.  If
6857 		 * setattr/getattr failed, we keep going.
6858 		 *
6859 		 * It might be better to get rid of the GETFH also, and just
6860 		 * do PUTFH/CREATE/GETATTR since the FH attr is mandatory.
6861 		 * Then if any of the operations failed, we could return the
6862 		 * error now, and remove much of the error code below.
6863 		 */
6864 		if (res.array_len <= idx_fattr) {
6865 			/*
6866 			 * Either Putfh, Create or Getfh failed.
6867 			 */
6868 			PURGE_ATTRCACHE4(dvp);
6869 			/*
6870 			 * nfs4_purge_stale_fh() may generate otw calls through
6871 			 * nfs4_invalidate_pages. Hence the need to call
6872 			 * nfs4_end_op() here to avoid nfs4_start_op() deadlock.
6873 			 */
6874 			nfs4_end_op(mi, dvp, NULL, &recov_state,
6875 			    needrecov);
6876 			need_end_op = FALSE;
6877 			nfs4_purge_stale_fh(e.error, dvp, cr);
6878 			goto out;
6879 		}
6880 	}
6881 
6882 	resop = &res.array[idx_create];	/* create res */
6883 	cinfo = &resop->nfs_resop4_u.opcreate.cinfo;
6884 
6885 	resop = &res.array[idx_create + 1]; /* getfh res */
6886 	gf_res = &resop->nfs_resop4_u.opgetfh;
6887 
6888 	sfhp = sfh4_get(&gf_res->object, mi);
6889 	if (e.error) {
6890 		*vpp = vp = makenfs4node(sfhp, NULL, dvp->v_vfsp, t, cr, dvp,
6891 		    fn_get(VTOSV(dvp)->sv_name, nm));
6892 		if (vp->v_type == VNON) {
6893 			vattr.va_mask = AT_TYPE;
6894 			/*
6895 			 * Need to call nfs4_end_op before nfs4getattr to avoid
6896 			 * potential nfs4_start_op deadlock. See RFE 4777612.
6897 			 */
6898 			nfs4_end_op(mi, dvp, NULL, &recov_state,
6899 				needrecov);
6900 			need_end_op = FALSE;
6901 			e.error = nfs4getattr(vp, &vattr, cr);
6902 			if (e.error) {
6903 				VN_RELE(vp);
6904 				*vpp = NULL;
6905 				goto out;
6906 			}
6907 			vp->v_type = vattr.va_type;
6908 		}
6909 		e.error = 0;
6910 	} else {
6911 		*vpp = vp = makenfs4node(sfhp,
6912 			&res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res,
6913 			dvp->v_vfsp, t, cr,
6914 			dvp, fn_get(VTOSV(dvp)->sv_name, nm));
6915 	}
6916 
6917 	/*
6918 	 * If compound succeeded, then update dir attrs
6919 	 */
6920 	if (res.status == NFS4_OK) {
6921 		dinfo.di_garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res;
6922 		dinfo.di_cred = cr;
6923 		dinfo.di_time_call = t;
6924 		dinfop = &dinfo;
6925 	} else
6926 		dinfop = NULL;
6927 
6928 	/* Update directory cache attribute, readdir and dnlc caches */
6929 	nfs4_update_dircaches(cinfo, dvp, vp, nm, dinfop);
6930 
6931 out:
6932 	if (sfhp != NULL)
6933 		sfh4_rele(&sfhp);
6934 	nfs_rw_exit(&drp->r_rwlock);
6935 	nfs4_fattr4_free(crattr);
6936 	if (setgid_flag) {
6937 		nfs4args_verify_free(&argop[8]);
6938 		nfs4args_setattr_free(&argop[9]);
6939 	}
6940 	if (resp)
6941 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
6942 	if (need_end_op)
6943 		nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov);
6944 
6945 	kmem_free(argop, argoplist_size);
6946 	return (e.error);
6947 }
6948 
6949 /* ARGSUSED */
6950 static int
6951 nfs4mknod(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
6952 	int mode, vnode_t **vpp, cred_t *cr)
6953 {
6954 	int error;
6955 	vnode_t *vp;
6956 	nfs_ftype4 type;
6957 	specdata4 spec, *specp = NULL;
6958 
6959 	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
6960 
6961 	switch (va->va_type) {
6962 	case VCHR:
6963 	case VBLK:
6964 		type = (va->va_type == VCHR) ? NF4CHR : NF4BLK;
6965 		spec.specdata1 = getmajor(va->va_rdev);
6966 		spec.specdata2 = getminor(va->va_rdev);
6967 		specp = &spec;
6968 		break;
6969 
6970 	case VFIFO:
6971 		type = NF4FIFO;
6972 		break;
6973 	case VSOCK:
6974 		type = NF4SOCK;
6975 		break;
6976 
6977 	default:
6978 		return (EINVAL);
6979 	}
6980 
6981 	error = call_nfs4_create_req(dvp, nm, specp, va, &vp, cr, type);
6982 	if (error) {
6983 		return (error);
6984 	}
6985 
6986 	/*
6987 	 * This might not be needed any more; special case to deal
6988 	 * with problematic v2/v3 servers.  Since create was unable
6989 	 * to set group correctly, not sure what hope setattr has.
6990 	 */
6991 	if (va->va_gid != VTOR4(vp)->r_attr.va_gid) {
6992 		va->va_mask = AT_GID;
6993 		(void) nfs4setattr(vp, va, 0, cr, NULL);
6994 	}
6995 
6996 	/*
6997 	 * If vnode is a device create special vnode
6998 	 */
6999 	if (ISVDEV(vp->v_type)) {
7000 		*vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
7001 		VN_RELE(vp);
7002 	} else {
7003 		*vpp = vp;
7004 	}
7005 	return (error);
7006 }
7007 
7008 /*
7009  * Remove requires that the current fh be the target directory.
7010  * After the operation, the current fh is unchanged.
7011  * The compound op structure is:
7012  *      PUTFH(targetdir), REMOVE, GETATTR(targetdir)
7013  *
7014  * Weirdness: if the vnode to be removed is open
7015  * we rename it instead of removing it and nfs_inactive
7016  * will remove the new name.
 *
 * dvp is the directory holding the entry, nm is the name of the entry
 * and cr is the caller's credential.  The directory's r_rwlock is held
 * as writer for the duration of the call and dropped before returning.
 *
 * Returns 0 on success, otherwise an errno value:
 *	EPERM	the caller's zone is not the zone of dvp's mount
 *	EINTR	nfs_rw_enter_sig() on the directory's r_rwlock failed
 *	EISDIR	nm names a directory
 *	other	error from nfs4lookup(), nfs4rename(), nfs4_start_op(),
 *		the over-the-wire call, or geterrno4() of the compound status
7017  */
7018 static int
7019 nfs4_remove(vnode_t *dvp, char *nm, cred_t *cr)
7020 {
7021 	COMPOUND4args_clnt args;
7022 	COMPOUND4res_clnt res, *resp = NULL;
7023 	REMOVE4res *rm_res;
7024 	nfs_argop4 argop[3];
7025 	nfs_resop4 *resop;
7026 	vnode_t *vp;
7027 	char *tmpname;
7028 	int doqueue;
7029 	mntinfo4_t *mi;
7030 	rnode4_t *rp;
7031 	rnode4_t *drp;
7032 	int needrecov = 0;
7033 	nfs4_recov_state_t recov_state;
7034 	int isopen;
7035 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
7036 	dirattr_info_t dinfo;
7037 
7038 	if (nfs_zone() != VTOMI4(dvp)->mi_zone)
7039 		return (EPERM);
7040 	drp = VTOR4(dvp);
7041 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp)))
7042 		return (EINTR);
7043 
	/* On success the lookup returns vp held; every exit below releases it */
7044 	e.error = nfs4lookup(dvp, nm, &vp, cr, 0);
7045 	if (e.error) {
7046 		nfs_rw_exit(&drp->r_rwlock);
7047 		return (e.error);
7048 	}
7049 
7050 	if (vp->v_type == VDIR) {
7051 		VN_RELE(vp);
7052 		nfs_rw_exit(&drp->r_rwlock);
7053 		return (EISDIR);
7054 	}
7055 
7056 	/*
7057 	 * First just remove the entry from the name cache, as it
7058 	 * is most likely the only entry for this vp.
7059 	 */
7060 	dnlc_remove(dvp, nm);
7061 
7062 	rp = VTOR4(vp);
7063 
7064 	/*
7065 	 * For regular file types, check to see if the file is open by looking
7066 	 * at the open streams.
7067 	 * For all other types, check the reference count on the vnode.  Since
7068 	 * they are not opened OTW they never have an open stream.
7069 	 *
7070 	 * If the file is open, rename it to .nfsXXXX.
7071 	 */
7072 	if (vp->v_type != VREG) {
7073 		/*
7074 		 * If the file has a v_count > 1 then there may be more than one
7075 		 * entry in the name cache due to multiple links or an open file,
7076 		 * but we don't have the real reference count so flush all
7077 		 * possible entries.
7078 		 */
7079 		if (vp->v_count > 1)
7080 			dnlc_purge_vp(vp);
7081 
7082 		/*
7083 		 * Now we have the real reference count.
7084 		 */
7085 		isopen = vp->v_count > 1;
7086 	} else {
7087 		mutex_enter(&rp->r_os_lock);
7088 		isopen = list_head(&rp->r_open_streams) != NULL;
7089 		mutex_exit(&rp->r_os_lock);
7090 	}
7091 
	/*
	 * Rename rather than remove only when the file is open and either
	 * no unlink-rename is recorded yet (r_unldvp == NULL) or the name
	 * being removed is the one already recorded in r_unlname.
	 */
7092 	mutex_enter(&rp->r_statelock);
7093 	if (isopen &&
7094 	    (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) {
7095 		mutex_exit(&rp->r_statelock);
7096 		tmpname = newname();
7097 		e.error = nfs4rename(dvp, nm, dvp, tmpname, cr);
7098 		if (e.error)
7099 			kmem_free(tmpname, MAXNAMELEN);
7100 		else {
			/*
			 * Record where the renamed file now lives.  tmpname
			 * is handed over to the rnode here; an older
			 * r_unlname is freed first.
			 */
7101 			mutex_enter(&rp->r_statelock);
7102 			if (rp->r_unldvp == NULL) {
7103 				VN_HOLD(dvp);
7104 				rp->r_unldvp = dvp;
7105 				if (rp->r_unlcred != NULL)
7106 					crfree(rp->r_unlcred);
7107 				crhold(cr);
7108 				rp->r_unlcred = cr;
7109 				rp->r_unlname = tmpname;
7110 			} else {
7111 				kmem_free(rp->r_unlname, MAXNAMELEN);
7112 				rp->r_unlname = tmpname;
7113 			}
7114 			mutex_exit(&rp->r_statelock);
7115 		}
7116 		VN_RELE(vp);
7117 		nfs_rw_exit(&drp->r_rwlock);
7118 		return (e.error);
7119 	}
7120 	/*
7121 	 * Actually remove the file (directories were rejected above)
7122 	 */
7123 	mutex_exit(&rp->r_statelock);
7124 
7125 	/*
7126 	 * We need to flush any dirty pages which happen to
7127 	 * be hanging around before removing the file.
7128 	 * This shouldn't happen very often since in NFSv4
7129 	 * we should be close to open consistent.
	 *
	 * Only ENOSPC and EDQUOT are remembered, in r_error if it is not
	 * already set; e.error itself is overwritten by nfs4_start_op()
	 * below.
7130 	 */
7131 	if (nfs4_has_pages(vp) &&
7132 	    ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) {
7133 		e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr);
7134 		if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) {
7135 			mutex_enter(&rp->r_statelock);
7136 			if (!rp->r_error)
7137 				rp->r_error = e.error;
7138 			mutex_exit(&rp->r_statelock);
7139 		}
7140 	}
7141 
7142 	mi = VTOMI4(dvp);
7143 
	/* The result of the delegation return is deliberately ignored */
7144 	(void) nfs4delegreturn(rp, NFS4_DR_REOPEN);
7145 	recov_state.rs_flags = 0;
7146 	recov_state.rs_num_retry_despite_err = 0;
7147 
7148 recov_retry:
7149 	/*
7150 	 * Remove ops: putfh dir; remove; getattr dir
7151 	 */
7152 	args.ctag = TAG_REMOVE;
7153 	args.array_len = 3;
7154 	args.array = argop;
7155 
7156 	e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state);
7157 	if (e.error) {
7158 		nfs_rw_exit(&drp->r_rwlock);
7159 		VN_RELE(vp);
7160 		return (e.error);
7161 	}
7162 
7163 	/* putfh directory */
7164 	argop[0].argop = OP_CPUTFH;
7165 	argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
7166 
7167 	/* remove */
7168 	argop[1].argop = OP_CREMOVE;
7169 	argop[1].nfs_argop4_u.opcremove.ctarget = nm;
7170 
7171 	/* getattr dir */
7172 	argop[2].argop = OP_GETATTR;
7173 	argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
7174 	argop[2].nfs_argop4_u.opgetattr.mi = mi;
7175 
7176 	doqueue = 1;
7177 	dinfo.di_time_call = gethrtime();
7178 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
7179 
	/* The removed object's cached attributes are purged unconditionally */
7180 	PURGE_ATTRCACHE4(vp);
7181 
7182 	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
7183 	if (e.error)
7184 		PURGE_ATTRCACHE4(dvp);
7185 
	/*
	 * When nfs4_start_recovery() returns FALSE the whole compound is
	 * retried; res is only freed when the call itself did not fail.
	 */
7186 	if (needrecov) {
7187 		if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp,
7188 		    NULL, NULL, NULL, OP_REMOVE, NULL) == FALSE) {
7189 			if (!e.error)
7190 				(void) xdr_free(xdr_COMPOUND4res_clnt,
7191 								(caddr_t)&res);
7192 			nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state,
7193 					needrecov);
7194 			goto recov_retry;
7195 		}
7196 	}
7197 
7198 	/*
7199 	 * Matching nfs4_end_op() for start_op() above.
7200 	 * There is a path in the code below which calls
7201 	 * nfs4_purge_stale_fh(), which may generate otw calls through
7202 	 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op()
7203 	 * here to avoid nfs4_start_op() deadlock.
7204 	 */
7205 	nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
7206 
7207 	if (!e.error) {
		/* res holds a decoded reply that must be freed on the way out */
7208 		resp = &res;
7209 
7210 		if (res.status) {
7211 			e.error = geterrno4(res.status);
7212 			PURGE_ATTRCACHE4(dvp);
7213 			nfs4_purge_stale_fh(e.error, dvp, cr);
7214 		} else {
7215 			resop = &res.array[1];	/* remove res */
7216 			rm_res = &resop->nfs_resop4_u.opremove;
7217 
7218 			dinfo.di_garp =
7219 				&res.array[2].nfs_resop4_u.opgetattr.ga_res;
7220 			dinfo.di_cred = cr;
7221 
7222 			/* Update directory attr, readdir and dnlc caches */
7223 			nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL,
7224 				&dinfo);
7225 		}
7226 	}
7227 	nfs_rw_exit(&drp->r_rwlock);
7228 	if (resp)
7229 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
7230 
7231 	VN_RELE(vp);
7232 	return (e.error);
7233 }
7234 
7235 /*
7236  * Link requires that the current fh be the target directory and the
7237  * saved fh be the source fh. After the operation, the current fh is unchanged.
7238  * Thus the compound op structure is:
7239  *	PUTFH(file), SAVEFH, PUTFH(targetdir), LINK, GETATTR(targetdir),
7240  *	RESTOREFH, GETATTR(file)
 *
 * tdvp is the directory in which the new name tnm is created, svp is the
 * existing object (replaced by the vnode VOP_REALVP() yields when that
 * call succeeds) and cr is the caller's credential.  The target
 * directory's r_rwlock is held as writer across the operation.
 *
 * Returns 0 on success, otherwise an errno value:
 *	EPERM		the caller's zone is not the zone of tdvp's mount,
 *			or the server returned EISDIR to a non-root caller
 *	EOPNOTSUPP	MI4_LINK is clear in the mount's mi_flags; the flag
 *			is cleared here once one of the first four ops
 *			fails with a status that maps to EOPNOTSUPP
 *	EINTR		nfs_rw_enter_sig() on the directory's r_rwlock failed
 *	other		error from nfs4_start_op(), the over-the-wire call,
 *			geterrno4() of the compound status, or
 *			nfs4_update_attrcache()
7241  */
7242 static int
7243 nfs4_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr)
7244 {
7245 	COMPOUND4args_clnt args;
7246 	COMPOUND4res_clnt res, *resp = NULL;
7247 	LINK4res *ln_res;
7248 	int argoplist_size  = 7 * sizeof (nfs_argop4);
7249 	nfs_argop4 *argop;
7250 	nfs_resop4 *resop;
7251 	vnode_t *realvp, *nvp;
7252 	int doqueue;
7253 	mntinfo4_t *mi;
7254 	rnode4_t *tdrp;
7255 	bool_t needrecov = FALSE;
7256 	nfs4_recov_state_t recov_state;
7257 	hrtime_t t;
7258 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
7259 	dirattr_info_t dinfo;
7260 
7261 	ASSERT(*tnm != '\0');
7262 	ASSERT(tdvp->v_type == VDIR);
7263 	ASSERT(nfs4_consistent_type(tdvp));
7264 	ASSERT(nfs4_consistent_type(svp));
7265 
7266 	if (nfs_zone() != VTOMI4(tdvp)->mi_zone)
7267 		return (EPERM);
7268 	if (VOP_REALVP(svp, &realvp) == 0) {
7269 		svp = realvp;
7270 		ASSERT(nfs4_consistent_type(svp));
7271 	}
7272 
7273 	tdrp = VTOR4(tdvp);
7274 	mi = VTOMI4(svp);
7275 
7276 	if (!(mi->mi_flags & MI4_LINK)) {
7277 		return (EOPNOTSUPP);
7278 	}
7279 	recov_state.rs_flags = 0;
7280 	recov_state.rs_num_retry_despite_err = 0;
7281 
7282 	if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR4(tdvp)))
7283 		return (EINTR);
7284 
	/* argop is allocated afresh on every attempt and freed before a retry */
7285 recov_retry:
7286 	argop = kmem_alloc(argoplist_size, KM_SLEEP);
7287 
7288 	args.ctag = TAG_LINK;
7289 
7290 	/*
7291 	 * Link ops: putfh fl; savefh; putfh tdir; link; getattr(dir);
7292 	 * restorefh; getattr(fl)
7293 	 */
7294 	args.array_len = 7;
7295 	args.array = argop;
7296 
7297 	e.error = nfs4_start_op(VTOMI4(svp), svp, tdvp, &recov_state);
7298 	if (e.error) {
7299 		kmem_free(argop, argoplist_size);
7300 		nfs_rw_exit(&tdrp->r_rwlock);
7301 		return (e.error);
7302 	}
7303 
7304 	/* 0. putfh file */
7305 	argop[0].argop = OP_CPUTFH;
7306 	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(svp)->r_fh;
7307 
7308 	/* 1. save current fh to free up the space for the dir */
7309 	argop[1].argop = OP_SAVEFH;
7310 
7311 	/* 2. putfh targetdir */
7312 	argop[2].argop = OP_CPUTFH;
7313 	argop[2].nfs_argop4_u.opcputfh.sfh = tdrp->r_fh;
7314 
7315 	/* 3. link: current_fh is targetdir, saved_fh is source */
7316 	argop[3].argop = OP_CLINK;
7317 	argop[3].nfs_argop4_u.opclink.cnewname = tnm;
7318 
7319 	/* 4. Get attributes of dir */
7320 	argop[4].argop = OP_GETATTR;
7321 	argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
7322 	argop[4].nfs_argop4_u.opgetattr.mi = mi;
7323 
7324 	/* 5. If link was successful, restore current vp to file */
7325 	argop[5].argop = OP_RESTOREFH;
7326 
7327 	/* 6. Get attributes of linked object */
7328 	argop[6].argop = OP_GETATTR;
7329 	argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
7330 	argop[6].nfs_argop4_u.opgetattr.mi = mi;
7331 
	/* Drop any cached name entry for tnm before going over the wire */
7332 	dnlc_remove(tdvp, tnm);
7333 
7334 	doqueue = 1;
	/* Time of the call; passed on with the returned attributes below */
7335 	t = gethrtime();
7336 
7337 	rfs4call(VTOMI4(svp), &args, &res, cr, &doqueue, 0, &e);
7338 
7339 	needrecov = nfs4_needs_recovery(&e, FALSE, svp->v_vfsp);
7340 	if (e.error != 0 && !needrecov) {
7341 		PURGE_ATTRCACHE4(tdvp);
7342 		PURGE_ATTRCACHE4(svp);
7343 		nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov);
7344 		goto out;
7345 	}
7346 
7347 	if (needrecov) {
7348 		bool_t abort;
7349 
7350 		abort = nfs4_start_recovery(&e, VTOMI4(svp), svp, tdvp,
7351 			    NULL, NULL, OP_LINK, NULL);
7352 		if (abort == FALSE) {
7353 			nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state,
7354 				    needrecov);
7355 			kmem_free(argop, argoplist_size);
7356 			if (!e.error)
7357 				(void) xdr_free(xdr_COMPOUND4res_clnt,
7358 								(caddr_t)&res);
7359 			goto recov_retry;
7360 		} else {
7361 			if (e.error != 0) {
7362 				PURGE_ATTRCACHE4(tdvp);
7363 				PURGE_ATTRCACHE4(svp);
7364 				nfs4_end_op(VTOMI4(svp), svp, tdvp,
7365 					    &recov_state, needrecov);
7366 				goto out;
7367 			}
7368 			/* fall through for res.status case */
7369 		}
7370 	}
7371 
7372 	nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov);
7373 
	/* From here on res holds a decoded reply that "out" must free */
7374 	resp = &res;
7375 	if (res.status) {
7376 		/*
		 * If link succeeded, then don't return error.
		 * NOTE(review): e.error is assigned before the array_len
		 * check and is only replaced below when res.status is
		 * NFS4_OK, so a failure in an op after LINK is still
		 * returned to the caller -- confirm which is intended.
		 */
7377 		e.error = geterrno4(res.status);
7378 		if (res.array_len <= 4) {
7379 			/*
7380 			 * Either Putfh, Savefh, Putfh dir, or Link failed
7381 			 */
7382 			PURGE_ATTRCACHE4(svp);
7383 			PURGE_ATTRCACHE4(tdvp);
7384 			if (e.error == EOPNOTSUPP) {
7385 				mutex_enter(&mi->mi_lock);
7386 				mi->mi_flags &= ~MI4_LINK;
7387 				mutex_exit(&mi->mi_lock);
7388 			}
7389 			/* Remap EISDIR to EPERM for non-root user for SVVS */
7390 			/* XXX-LP */
7391 			if (e.error == EISDIR && crgetuid(cr) != 0)
7392 				e.error = EPERM;
7393 			goto out;
7394 		}
7395 	}
7396 
7397 	/* either no error or one of the postop getattr failed */
7398 
7399 	/*
7400 	 * XXX - if LINK succeeded, but no attrs were returned for link
7401 	 * file, purge its cache.
7402 	 *
7403 	 * XXX Perform a simplified version of wcc checking. Instead of
7404 	 * have another getattr to get pre-op, just purge cache if
7405 	 * any of the ops prior to and including the getattr failed.
7406 	 * If the getattr succeeded then update the attrcache accordingly.
7407 	 */
7408 
7409 	/*
7410 	 * update cache with link file postattrs.
7411 	 * Note: at this point resop points to link res.
7412 	 */
7413 	resop = &res.array[3];	/* link res */
7414 	ln_res = &resop->nfs_resop4_u.oplink;
7415 	if (res.status == NFS4_OK) {
7416 		e.error = nfs4_update_attrcache(res.status,
7417 				&res.array[6].nfs_resop4_u.opgetattr.ga_res,
7418 				t, svp, cr);
7419 	}
7420 
7421 	/*
7422 	 * Call makenfs4node to create the new shadow vp for tnm.
7423 	 * We pass NULL attrs because we just cached attrs for
7424 	 * the src object.  All we're trying to accomplish is
7425 	 * to create the new shadow vnode.
7426 	 */
7427 	nvp = makenfs4node(VTOR4(svp)->r_fh, NULL, tdvp->v_vfsp, t, cr,
7428 			tdvp, fn_get(VTOSV(tdvp)->sv_name, tnm));
7429 
7430 	/* Update target cache attribute, readdir and dnlc caches */
7431 	dinfo.di_garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res;
7432 	dinfo.di_time_call = t;
7433 	dinfo.di_cred = cr;
7434 
7435 	nfs4_update_dircaches(&ln_res->cinfo, tdvp, nvp, tnm, &dinfo);
7436 	ASSERT(nfs4_consistent_type(tdvp));
7437 	ASSERT(nfs4_consistent_type(svp));
7438 	ASSERT(nfs4_consistent_type(nvp));
7439 	VN_RELE(nvp);
7440 
	/* Common exit: free the op array and any decoded reply, drop the lock */
7441 out:
7442 	kmem_free(argop, argoplist_size);
7443 	if (resp)
7444 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
7445 
7446 	nfs_rw_exit(&tdrp->r_rwlock);
7447 
7448 	return (e.error);
7449 }
7450 
/*
 * Rename entry point.  Fails with EPERM when the caller's zone is not the
 * zone of odvp's mount; otherwise substitutes the vnode VOP_REALVP()
 * yields for ndvp (when that call succeeds) and lets nfs4rename() do the
 * actual work, returning its result.
 */
7451 static int
7452 nfs4_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr)
7453 {
7454 	vnode_t *underlying;
7455 
7456 	if (VTOMI4(odvp)->mi_zone != nfs_zone())
7457 		return (EPERM);
7458 	if (VOP_REALVP(ndvp, &underlying) == 0)
7459 		ndvp = underlying;
7460 
7461 	return (nfs4rename(odvp, onm, ndvp, nnm, cr));
7462 }
7463 
7464 /*
7465  * nfs4rename does the real work of renaming in NFS Version 4.
7466  *
7467  * A file handle is considered volatile for renaming purposes if either
7468  * of the volatile bits are turned on. However, the compound may differ
7469  * based on the likelihood of the filehandle to change during rename.
 *
 * odvp/onm name the existing entry, ndvp/nnm name the entry to create and
 * cr is the caller's credential.  The r_rwlock of both directories is
 * taken as writer, lower rnode address first, and both are dropped before
 * every return.
 *
 * If the target name already exists, is not a directory and is active
 * (open streams for a regular file, v_count > 1 after purging the dnlc
 * otherwise), it is first preserved under a newname() name in ndvp and
 * that name is recorded in the target rnode's r_unldvp, r_unlcred and
 * r_unlname.
 *
 * Returns 0 on success, otherwise an errno value:
 *	EINVAL	onm or nnm is "." or "..", or the source is a directory
 *		that VN_CMP() reports equal to ndvp
 *	EINTR	nfs_rw_enter_sig() on either directory's r_rwlock failed
 *	EBUSY	the existing target is a mount point
 *	ENOTDIR	the source is a directory but the active target is not
 *	other	error from nfs4lookup(), nfs4_link(), nfs4_rename() or the
 *		over-the-wire rename helper
7470  */
7471 static int
7472 nfs4rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr)
7473 {
7474 	int error;
7475 	mntinfo4_t *mi;
7476 	vnode_t *nvp;
7477 	vnode_t *ovp = NULL;
7478 	char *tmpname = NULL;
7479 	rnode4_t *rp;
7480 	rnode4_t *odrp;
7481 	rnode4_t *ndrp;
7482 	int did_link = 0;
7483 	int do_link = 1;
7484 	nfsstat4 stat = NFS4_OK;
7485 
7486 	ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone);
7487 	ASSERT(nfs4_consistent_type(odvp));
7488 	ASSERT(nfs4_consistent_type(ndvp));
7489 
	/* Neither the old nor the new name may be "." or ".." */
7490 	if (onm[0] == '.' && (onm[1] == '\0' ||
7491 			(onm[1] == '.' && onm[2] == '\0')))
7492 		return (EINVAL);
7493 
7494 	if (nnm[0] == '.' && (nnm[1] == '\0' ||
7495 			(nnm[1] == '.' && nnm[2] == '\0')))
7496 		return (EINVAL);
7497 
	/*
	 * Take both directory locks in rnode address order.  If the second
	 * acquisition fails the first lock is released again.
	 */
7498 	odrp = VTOR4(odvp);
7499 	ndrp = VTOR4(ndvp);
7500 	if ((intptr_t)odrp < (intptr_t)ndrp) {
7501 		if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp)))
7502 			return (EINTR);
7503 		if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp))) {
7504 			nfs_rw_exit(&odrp->r_rwlock);
7505 			return (EINTR);
7506 		}
7507 	} else {
7508 		if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp)))
7509 			return (EINTR);
7510 		if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp))) {
7511 			nfs_rw_exit(&ndrp->r_rwlock);
7512 			return (EINTR);
7513 		}
7514 	}
7515 
7516 	/*
7517 	 * Lookup the target file.  If it exists, it needs to be
7518 	 * checked to see whether it is a mount point and whether
7519 	 * it is active (open).
	 *
	 * A failed lookup is not an error here: the whole block is
	 * skipped and the rename proceeds without a target to preserve.
7520 	 */
7521 	error = nfs4lookup(ndvp, nnm, &nvp, cr, 0);
7522 	if (!error) {
7523 		int	isactive;
7524 
7525 		ASSERT(nfs4_consistent_type(nvp));
7526 		/*
7527 		 * If this file has been mounted on, then just
7528 		 * return busy because renaming to it would remove
7529 		 * the mounted file system from the name space.
7530 		 */
7531 		if (vn_ismntpt(nvp)) {
7532 			VN_RELE(nvp);
7533 			nfs_rw_exit(&odrp->r_rwlock);
7534 			nfs_rw_exit(&ndrp->r_rwlock);
7535 			return (EBUSY);
7536 		}
7537 
7538 		/*
7539 		 * First just remove the entry from the name cache, as it
7540 		 * is most likely the only entry for this vp.
7541 		 */
7542 		dnlc_remove(ndvp, nnm);
7543 
7544 		rp = VTOR4(nvp);
7545 
7546 		if (nvp->v_type != VREG) {
7547 			/*
7548 			 * Purge the name cache of all references to this vnode
7549 			 * so that we can check the reference count to infer
7550 			 * whether it is active or not.
7551 			 */
7552 			if (nvp->v_count > 1)
7553 				dnlc_purge_vp(nvp);
7554 
7555 			isactive = nvp->v_count > 1;
7556 		} else {
7557 			mutex_enter(&rp->r_os_lock);
7558 			isactive = list_head(&rp->r_open_streams) != NULL;
7559 			mutex_exit(&rp->r_os_lock);
7560 		}
7561 
7562 		/*
7563 		 * If the vnode is active and is not a directory,
7564 		 * arrange to rename it to a
7565 		 * temporary file so that it will continue to be
7566 		 * accessible.  This implements the "unlink-open-file"
7567 		 * semantics for the target of a rename operation.
7568 		 * Before doing this though, make sure that the
7569 		 * source and target files are not already the same.
7570 		 */
7571 		if (isactive && nvp->v_type != VDIR) {
7572 			/*
7573 			 * Lookup the source name.
7574 			 */
7575 			error = nfs4lookup(odvp, onm, &ovp, cr, 0);
7576 
7577 			/*
7578 			 * The source name *should* already exist.
7579 			 */
7580 			if (error) {
7581 				VN_RELE(nvp);
7582 				nfs_rw_exit(&odrp->r_rwlock);
7583 				nfs_rw_exit(&ndrp->r_rwlock);
7584 				return (error);
7585 			}
7586 
7587 			ASSERT(nfs4_consistent_type(ovp));
7588 
7589 			/*
7590 			 * Compare the two vnodes.  If they are the same,
7591 			 * just release all held vnodes and return success.
7592 			 */
7593 			if (VN_CMP(ovp, nvp)) {
7594 				VN_RELE(ovp);
7595 				VN_RELE(nvp);
7596 				nfs_rw_exit(&odrp->r_rwlock);
7597 				nfs_rw_exit(&ndrp->r_rwlock);
7598 				return (0);
7599 			}
7600 
7601 			/*
7602 			 * Can't mix and match directories and non-
7603 			 * directories in rename operations.  We already
7604 			 * know that the target is not a directory.  If
7605 			 * the source is a directory, return an error.
7606 			 */
7607 			if (ovp->v_type == VDIR) {
7608 				VN_RELE(ovp);
7609 				VN_RELE(nvp);
7610 				nfs_rw_exit(&odrp->r_rwlock);
7611 				nfs_rw_exit(&ndrp->r_rwlock);
7612 				return (ENOTDIR);
7613 			}
7614 link_call:
7615 			/*
7616 			 * The target file exists, is not the same as
7617 			 * the source file, and is active.  We first
7618 			 * try to Link it to a temporary filename to
7619 			 * avoid having the server removing the file
7620 			 * completely (which could cause data loss to
7621 			 * the user's POV in the event the Rename fails
7622 			 * -- see bug 1165874).
7623 			 */
7624 			/*
7625 			 * The do_link and did_link booleans are
7626 			 * introduced in the event we get NFS4ERR_FILE_OPEN
7627 			 * returned for the Rename.  Some servers can
7628 			 * not Rename over an Open file, so they return
7629 			 * this error.  The client needs to Remove the
7630 			 * newly created Link and do two Renames, just
7631 			 * as if the server didn't support LINK.
			 *
			 * This label is also the target of the goto in the
			 * NFS4ERR_FILE_OPEN handling further down, which
			 * re-enters here with do_link cleared.
7632 			 */
7633 			tmpname = newname();
7634 			error = 0;
7635 
7636 			if (do_link) {
7637 				error = nfs4_link(ndvp, nvp, tmpname, cr);
7638 			}
7639 			if (error == EOPNOTSUPP || !do_link) {
7640 				error = nfs4_rename(ndvp, nnm, ndvp, tmpname,
7641 				    cr);
7642 				did_link = 0;
7643 			} else {
7644 				did_link = 1;
7645 			}
7646 			if (error) {
7647 				kmem_free(tmpname, MAXNAMELEN);
7648 				VN_RELE(ovp);
7649 				VN_RELE(nvp);
7650 				nfs_rw_exit(&odrp->r_rwlock);
7651 				nfs_rw_exit(&ndrp->r_rwlock);
7652 				return (error);
7653 			}
7654 
			/*
			 * Record the temporary name on the target's rnode.
			 * tmpname is handed over to the rnode here; an
			 * older r_unlname is freed first.
			 */
7655 			mutex_enter(&rp->r_statelock);
7656 			if (rp->r_unldvp == NULL) {
7657 				VN_HOLD(ndvp);
7658 				rp->r_unldvp = ndvp;
7659 				if (rp->r_unlcred != NULL)
7660 					crfree(rp->r_unlcred);
7661 				crhold(cr);
7662 				rp->r_unlcred = cr;
7663 				rp->r_unlname = tmpname;
7664 			} else {
7665 				if (rp->r_unlname)
7666 					kmem_free(rp->r_unlname, MAXNAMELEN);
7667 				rp->r_unlname = tmpname;
7668 			}
7669 			mutex_exit(&rp->r_statelock);
7670 		}
7671 
7672 		(void) nfs4delegreturn(VTOR4(nvp), NFS4_DR_PUSH|NFS4_DR_REOPEN);
7673 
7674 		ASSERT(nfs4_consistent_type(nvp));
7675 		VN_RELE(nvp);
7676 	}
7677 
7678 	if (ovp == NULL) {
7679 		/*
7680 		 * When renaming directories to be a subdirectory of a
7681 		 * different parent, the dnlc entry for ".." will no
7682 		 * longer be valid, so it must be removed.
7683 		 *
7684 		 * We do a lookup here to determine whether we are renaming
7685 		 * a directory and we need to check if we are renaming
7686 		 * an unlinked file.  This might have already been done
7687 		 * in previous code, so we check ovp == NULL to avoid
7688 		 * doing it twice.
7689 		 */
7690 		error = nfs4lookup(odvp, onm, &ovp, cr, 0);
7691 		/*
7692 		 * The source name *should* already exist.
7693 		 */
7694 		if (error) {
7695 			nfs_rw_exit(&odrp->r_rwlock);
7696 			nfs_rw_exit(&ndrp->r_rwlock);
7697 			return (error);
7698 		}
7699 		ASSERT(ovp != NULL);
7700 		ASSERT(nfs4_consistent_type(ovp));
7701 	}
7702 
7703 	/*
7704 	 * Is the object being renamed a dir, and if so, is
7705 	 * it being renamed to a child of itself?  The underlying
7706 	 * fs should ultimately return EINVAL for this case;
7707 	 * however, buggy beta non-Solaris NFSv4 servers at
7708 	 * interop testing events have allowed this behavior,
7709 	 * and it caused our client to panic due to a recursive
7710 	 * mutex_enter in fn_move.
7711 	 *
7712 	 * The tedious locking in fn_move could be changed to
7713 	 * deal with this case, and the client could avoid the
7714 	 * panic; however, the client would just confuse itself
7715 	 * later and misbehave.  A better way to handle the broken
7716 	 * server is to detect this condition and return EINVAL
7717 	 * without ever sending the bogus rename to the server.
7718 	 * We know the rename is invalid -- just fail it now.
7719 	 */
7720 	if (ovp->v_type == VDIR && VN_CMP(ndvp, ovp)) {
7721 		VN_RELE(ovp);
7722 		nfs_rw_exit(&odrp->r_rwlock);
7723 		nfs_rw_exit(&ndrp->r_rwlock);
7724 		return (EINVAL);
7725 	}
7726 
7727 	(void) nfs4delegreturn(VTOR4(ovp), NFS4_DR_PUSH|NFS4_DR_REOPEN);
7728 
7729 	/*
7730 	 * If FH4_VOL_RENAME or FH4_VOLATILE_ANY bits are set, it is
7731 	 * possible for the filehandle to change due to the rename.
7732 	 * If neither of these bits is set, but FH4_VOL_MIGRATION is set,
7733 	 * the fh will not change because of the rename, but we still need
7734 	 * to update its rnode entry with the new name for
7735 	 * an eventual fh change due to migration. The FH4_NOEXPIRE_ON_OPEN
7736 	 * has no effect on these for now, but for future improvements,
7737 	 * we might want to use it too to simplify handling of files
7738 	 * that are open with that flag on. (XXX)
7739 	 */
7740 	mi = VTOMI4(odvp);
7741 	if (NFS4_VOLATILE_FH(mi)) {
7742 		error = nfs4rename_volatile_fh(odvp, onm, ovp, ndvp, nnm, cr,
7743 				&stat);
7744 	} else {
7745 		error = nfs4rename_persistent_fh(odvp, onm, ovp, ndvp, nnm, cr,
7746 				&stat);
7747 	}
7748 	ASSERT(nfs4_consistent_type(odvp));
7749 	ASSERT(nfs4_consistent_type(ndvp));
7750 	ASSERT(nfs4_consistent_type(ovp));
7751 
	/*
	 * The server refused to rename over the open target we linked to
	 * tmpname: remove that link, undo the bookkeeping and go back to
	 * link_call with do_link cleared so a rename is used instead.
	 */
7752 	if (stat == NFS4ERR_FILE_OPEN && did_link) {
7753 		do_link = 0;
7754 		/*
7755 		 * Before the 'link_call' code, we did a nfs4_lookup
7756 		 * that puts a VN_HOLD on nvp.  After the nfs4_link
7757 		 * call we call VN_RELE to match that hold.  We need
7758 		 * to place an additional VN_HOLD here since we will
7759 		 * be hitting that VN_RELE again.
7760 		 */
7761 		VN_HOLD(nvp);
7762 
7763 		(void) nfs4_remove(ndvp, tmpname, cr);
7764 
7765 		/* Undo the unlinked file naming stuff we just did */
7766 		mutex_enter(&rp->r_statelock);
7767 		if (rp->r_unldvp) {
7768 			VN_RELE(ndvp);
7769 			rp->r_unldvp = NULL;
7770 			if (rp->r_unlcred != NULL)
7771 				crfree(rp->r_unlcred);
7772 			rp->r_unlcred = NULL;
7773 			/* rp->r_unlname points to tmpname */
7774 			if (rp->r_unlname)
7775 				kmem_free(rp->r_unlname, MAXNAMELEN);
7776 			rp->r_unlname = NULL;
7777 		}
7778 		mutex_exit(&rp->r_statelock);
7779 
7780 		goto link_call;
7781 	}
7782 
7783 	if (error) {
7784 		VN_RELE(ovp);
7785 		nfs_rw_exit(&odrp->r_rwlock);
7786 		nfs_rw_exit(&ndrp->r_rwlock);
7787 		return (error);
7788 	}
7789 
7790 	/*
7791 	 * when renaming directories to be a subdirectory of a
7792 	 * different parent, the dnlc entry for ".." will no
7793 	 * longer be valid, so it must be removed
	 *
	 * Note that rp now refers to the renamed object (ovp), no longer
	 * to the old target.
7794 	 */
7795 	rp = VTOR4(ovp);
7796 	if (ndvp != odvp) {
7797 		if (ovp->v_type == VDIR) {
7798 			dnlc_remove(ovp, "..");
7799 			if (rp->r_dir != NULL)
7800 				nfs4_purge_rddir_cache(ovp);
7801 		}
7802 	}
7803 
7804 	/*
7805 	 * If we are renaming the unlinked file, update the
7806 	 * r_unldvp and r_unlname as needed.
	 *
	 * The recorded name is overwritten in place with nnm, truncated
	 * to MAXNAMELEN - 1 characters and always NUL-terminated; the
	 * hold on the recorded directory moves to ndvp if it differs.
7807 	 */
7808 	mutex_enter(&rp->r_statelock);
7809 	if (rp->r_unldvp != NULL) {
7810 		if (strcmp(rp->r_unlname, onm) == 0) {
7811 			(void) strncpy(rp->r_unlname, nnm, MAXNAMELEN);
7812 			rp->r_unlname[MAXNAMELEN - 1] = '\0';
7813 			if (ndvp != rp->r_unldvp) {
7814 				VN_RELE(rp->r_unldvp);
7815 				rp->r_unldvp = ndvp;
7816 				VN_HOLD(ndvp);
7817 			}
7818 		}
7819 	}
7820 	mutex_exit(&rp->r_statelock);
7821 
7822 	VN_RELE(ovp);
7823 
7824 	nfs_rw_exit(&odrp->r_rwlock);
7825 	nfs_rw_exit(&ndrp->r_rwlock);
7826 
7827 	return (error);
7828 }
7829 
7830 /*
7831  * nfs4rename_persistent_fh does the otw portion of renaming in NFS Version 4,
7832  * when it is known that the filehandle is persistent through rename.
7833  *
7834  * Rename requires that the current fh be the target directory and the
7835  * saved fh be the source directory. After the operation, the current fh
7836  * is unchanged.
7837  * The compound op structure for persistent fh rename is:
7838  *      PUTFH(sourcedir), SAVEFH, PUTFH(targetdir), RENAME, GETATTR(targetdir)
7839  * and, when the source and target directories differ, additionally:
7840  *      PUTFH(sourcedir), GETATTR(sourcedir)
 *
 * odvp/onm name the existing entry, ndvp/nnm the new one, renvp is the
 * vnode of the object being renamed (its sv_name is moved with fn_move()
 * once the rename has happened) and cr is the caller's credential.
 * *statp is reset to NFS4_OK at the start of every attempt and receives
 * the compound status whenever the call itself did not fail.
 *
 * Returns 0 on success, otherwise an errno value from nfs4_start_op(),
 * the over-the-wire call, or geterrno4() of the compound status (with
 * ENOTEMPTY mapped to EEXIST).
7841  */
7842 static int
7843 nfs4rename_persistent_fh(vnode_t *odvp, char *onm, vnode_t *renvp,
7844 	vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp)
7845 {
7846 	COMPOUND4args_clnt args;
7847 	COMPOUND4res_clnt res, *resp = NULL;
7848 	nfs_argop4 *argop;
7849 	nfs_resop4 *resop;
7850 	int doqueue, argoplist_size;
7851 	mntinfo4_t *mi;
7852 	rnode4_t *odrp = VTOR4(odvp);
7853 	rnode4_t *ndrp = VTOR4(ndvp);
7854 	RENAME4res *rn_res;
7855 	bool_t needrecov;
7856 	nfs4_recov_state_t recov_state;
7857 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
7858 	dirattr_info_t dinfo, *dinfop;
7859 
7860 	ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone);
7861 
7862 	recov_state.rs_flags = 0;
7863 	recov_state.rs_num_retry_despite_err = 0;
7864 
7865 	/*
7866 	 * Rename ops: putfh sdir; savefh; putfh tdir; rename; getattr tdir
7867 	 *
7868 	 * If source/target are different dirs, then append putfh(src); getattr
	 *
	 * The op array is allocated once and reused across recovery retries.
7869 	 */
7870 	args.array_len = (odvp == ndvp) ? 5 : 7;
7871 	argoplist_size = args.array_len * sizeof (nfs_argop4);
7872 	args.array = argop = kmem_alloc(argoplist_size, KM_SLEEP);
7873 
7874 recov_retry:
7875 	*statp = NFS4_OK;
7876 
7877 	/* No need to Lookup the file, persistent fh */
7878 	args.ctag = TAG_RENAME;
7879 
7880 	mi = VTOMI4(odvp);
7881 	e.error = nfs4_start_op(mi, odvp, ndvp, &recov_state);
7882 	if (e.error) {
7883 		kmem_free(argop, argoplist_size);
7884 		return (e.error);
7885 	}
7886 
7887 	/* 0: putfh source directory */
7888 	argop[0].argop = OP_CPUTFH;
7889 	argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh;
7890 
7891 	/* 1: Save source fh to free up current for target */
7892 	argop[1].argop = OP_SAVEFH;
7893 
7894 	/* 2: putfh targetdir */
7895 	argop[2].argop = OP_CPUTFH;
7896 	argop[2].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh;
7897 
7898 	/* 3: current_fh is targetdir, saved_fh is sourcedir */
7899 	argop[3].argop = OP_CRENAME;
7900 	argop[3].nfs_argop4_u.opcrename.coldname = onm;
7901 	argop[3].nfs_argop4_u.opcrename.cnewname = nnm;
7902 
7903 	/* 4: getattr (targetdir) */
7904 	argop[4].argop = OP_GETATTR;
7905 	argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
7906 	argop[4].nfs_argop4_u.opgetattr.mi = mi;
7907 
7908 	if (ndvp != odvp) {
7909 
		/*
		 * The attributes fetched by op 6 are applied to odvp below,
		 * so the current fh must be the source directory here, not
		 * the target directory.
		 */
7910 		/* 5: putfh (sourcedir) */
7911 		argop[5].argop = OP_CPUTFH;
7912 		argop[5].nfs_argop4_u.opcputfh.sfh = odrp->r_fh;
7913 
7914 		/* 6: getattr (sourcedir) */
7915 		argop[6].argop = OP_GETATTR;
7916 		argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
7917 		argop[6].nfs_argop4_u.opgetattr.mi = mi;
7918 	}
7919 
	/* Drop cached name entries for both names before going over the wire */
7920 	dnlc_remove(odvp, onm);
7921 	dnlc_remove(ndvp, nnm);
7922 
7923 	doqueue = 1;
7924 	dinfo.di_time_call = gethrtime();
7925 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
7926 
7927 	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
7928 	if (e.error) {
7929 		PURGE_ATTRCACHE4(odvp);
7930 		PURGE_ATTRCACHE4(ndvp);
7931 	} else {
7932 		*statp = res.status;
7933 	}
7934 
	/*
	 * When nfs4_start_recovery() returns FALSE the compound is retried;
	 * res is only freed when the call itself did not fail.
	 */
7935 	if (needrecov) {
7936 		if (nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL,
7937 		    OP_RENAME, NULL) == FALSE) {
7938 			nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov);
7939 			if (!e.error)
7940 				(void) xdr_free(xdr_COMPOUND4res_clnt,
7941 								(caddr_t)&res);
7942 			goto recov_retry;
7943 		}
7944 	}
7945 
7946 	if (!e.error) {
7947 		resp = &res;
7948 		/*
7949 		 * An error status with at most four results means the
		 * compound failed at or before OP_RENAME (index 3), so the
		 * rename itself did not happen.
7950 		 */
7951 		if (res.status != NFS4_OK && res.array_len <= 4) {
7952 			e.error = geterrno4(res.status);
7953 			PURGE_ATTRCACHE4(odvp);
7954 			PURGE_ATTRCACHE4(ndvp);
7955 			/*
7956 			 * System V defines rename to return EEXIST, not
7957 			 * ENOTEMPTY if the target directory is not empty.
7958 			 * Over the wire, the error is NFSERR_ENOTEMPTY
7959 			 * which geterrno4 maps to ENOTEMPTY.
7960 			 */
7961 			if (e.error == ENOTEMPTY)
7962 				e.error = EEXIST;
7963 		} else {
7964 
			/*
			 * The rename was performed.  If a later op failed no
			 * post-op attributes are handed to the cache update
			 * (dinfop is NULL) and e.error stays 0.
			 */
7965 			resop = &res.array[3];	/* rename res */
7966 			rn_res = &resop->nfs_resop4_u.oprename;
7967 
7968 			if (res.status == NFS4_OK) {
7969 				/*
7970 				 * Update target attribute, readdir and dnlc
7971 				 * caches.
7972 				 */
7973 				dinfo.di_garp =
7974 				    &res.array[4].nfs_resop4_u.opgetattr.ga_res;
7975 				dinfo.di_cred = cr;
7976 				dinfop = &dinfo;
7977 			} else
7978 				dinfop = NULL;
7979 
7980 			nfs4_update_dircaches(&rn_res->target_cinfo,
7981 						ndvp, NULL, NULL, dinfop);
7982 
7983 			/*
7984 			 * Update source attribute, readdir and dnlc caches
7985 			 *
7986 			 */
7987 			if (ndvp != odvp) {
7988 				if (dinfop)
7989 					dinfo.di_garp =
7990 					    &(res.array[6].nfs_resop4_u.
7991 					    opgetattr.ga_res);
7992 
7993 				nfs4_update_dircaches(&rn_res->source_cinfo,
7994 						odvp, NULL, NULL, dinfop);
7995 			}
7996 
7997 			fn_move(VTOSV(renvp)->sv_name, VTOSV(ndvp)->sv_name,
7998 									nnm);
7999 		}
8000 	}
8001 
8002 	if (resp)
8003 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
8004 	nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov);
8005 	kmem_free(argop, argoplist_size);
8006 
8007 	return (e.error);
8008 }
8009 
8010 /*
8011  * nfs4rename_volatile_fh does the otw part of renaming in NFS Version 4, when
8012  * it is possible for the filehandle to change due to the rename.
8013  *
8014  * The compound req in this case includes a post-rename lookup and getattr
8015  * to ensure that we have the correct fh and attributes for the object.
8016  *
8017  * Rename requires that the current fh be the target directory and the
8018  * saved fh be the source directory. After the operation, the current fh
8019  * is unchanged.
8020  *
8021  * We need the new filehandle (hence a LOOKUP and GETFH) so that we can
8022  * update the filehandle for the renamed object.  We also get the old
8023  * filehandle for historical reasons; this should be taken out sometime.
8024  * This results in a rather cumbersome compound...
8025  *
8026  *    PUTFH(sourcdir), SAVEFH, LOOKUP(src), GETFH(old),
8027  *    PUTFH(targetdir), RENAME, LOOKUP(trgt), GETFH(new), GETATTR
8028  *
8029  */
8030 static int
8031 nfs4rename_volatile_fh(vnode_t *odvp, char *onm, vnode_t *ovp,
8032 	vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp)
8033 {
8034 	COMPOUND4args_clnt args;
8035 	COMPOUND4res_clnt res, *resp = NULL;
8036 	int argoplist_size;
8037 	nfs_argop4 *argop;
8038 	nfs_resop4 *resop;
8039 	int doqueue;
8040 	mntinfo4_t *mi;
8041 	rnode4_t *odrp = VTOR4(odvp);	/* old directory */
8042 	rnode4_t *ndrp = VTOR4(ndvp);	/* new directory */
8043 	rnode4_t *orp = VTOR4(ovp);	/* object being renamed */
8044 	RENAME4res *rn_res;
8045 	GETFH4res *ngf_res;
8046 	bool_t needrecov;
8047 	nfs4_recov_state_t recov_state;
8048 	hrtime_t t;
8049 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
8050 	dirattr_info_t dinfo, *dinfop = &dinfo;
8051 
8052 	ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone);
8053 
8054 	recov_state.rs_flags = 0;
8055 	recov_state.rs_num_retry_despite_err = 0;
8056 
8057 recov_retry:
8058 	*statp = NFS4_OK;
8059 
8060 	/*
8061 	 * There is a window between the RPC and updating the path and
8062 	 * filehandle stored in the rnode.  Lock out the FHEXPIRED recovery
8063 	 * code, so that it doesn't try to use the old path during that
8064 	 * window.
8065 	 */
8066 	mutex_enter(&orp->r_statelock);
8067 	while (orp->r_flags & R4RECEXPFH) {
8068 		klwp_t *lwp = ttolwp(curthread);
8069 
8070 		if (lwp != NULL)
8071 			lwp->lwp_nostop++;
8072 		if (cv_wait_sig(&orp->r_cv, &orp->r_statelock) == 0) {
8073 			mutex_exit(&orp->r_statelock);
8074 			if (lwp != NULL)
8075 				lwp->lwp_nostop--;
8076 			return (EINTR);
8077 		}
8078 		if (lwp != NULL)
8079 			lwp->lwp_nostop--;
8080 	}
8081 	orp->r_flags |= R4RECEXPFH;
8082 	mutex_exit(&orp->r_statelock);
8083 
8084 	mi = VTOMI4(odvp);
8085 
8086 	args.ctag = TAG_RENAME_VFH;
8087 	args.array_len = (odvp == ndvp) ? 10 : 12;
8088 	argoplist_size  = args.array_len * sizeof (nfs_argop4);
8089 	argop = kmem_alloc(argoplist_size, KM_SLEEP);
8090 
8091 	/*
8092 	 * Rename ops:
8093 	 *    PUTFH(sourcdir), SAVEFH, LOOKUP(src), GETFH(old),
8094 	 *    PUTFH(targetdir), RENAME, GETATTR(targetdir)
8095 	 *    LOOKUP(trgt), GETFH(new), GETATTR,
8096 	 *
8097 	 *    if (odvp != ndvp)
8098 	 *	add putfh(sourcedir), getattr(sourcedir) }
8099 	 */
8100 	args.array = argop;
8101 
8102 	e.error = nfs4_start_fop(mi, odvp, ndvp, OH_VFH_RENAME,
8103 			    &recov_state, NULL);
8104 	if (e.error) {
8105 		kmem_free(argop, argoplist_size);
8106 		mutex_enter(&orp->r_statelock);
8107 		orp->r_flags &= ~R4RECEXPFH;
8108 		cv_broadcast(&orp->r_cv);
8109 		mutex_exit(&orp->r_statelock);
8110 		return (e.error);
8111 	}
8112 
8113 	/* 0: putfh source directory */
8114 	argop[0].argop = OP_CPUTFH;
8115 	argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh;
8116 
8117 	/* 1: Save source fh to free up current for target */
8118 	argop[1].argop = OP_SAVEFH;
8119 
8120 	/* 2: Lookup pre-rename fh of renamed object */
8121 	argop[2].argop = OP_CLOOKUP;
8122 	argop[2].nfs_argop4_u.opclookup.cname = onm;
8123 
8124 	/* 3: getfh fh of renamed object (before rename) */
8125 	argop[3].argop = OP_GETFH;
8126 
8127 	/* 4: putfh targetdir */
8128 	argop[4].argop = OP_CPUTFH;
8129 	argop[4].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh;
8130 
8131 	/* 5: current_fh is targetdir, saved_fh is sourcedir */
8132 	argop[5].argop = OP_CRENAME;
8133 	argop[5].nfs_argop4_u.opcrename.coldname = onm;
8134 	argop[5].nfs_argop4_u.opcrename.cnewname = nnm;
8135 
8136 	/* 6: getattr of target dir (post op attrs) */
8137 	argop[6].argop = OP_GETATTR;
8138 	argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8139 	argop[6].nfs_argop4_u.opgetattr.mi = mi;
8140 
8141 	/* 7: Lookup post-rename fh of renamed object */
8142 	argop[7].argop = OP_CLOOKUP;
8143 	argop[7].nfs_argop4_u.opclookup.cname = nnm;
8144 
8145 	/* 8: getfh fh of renamed object (after rename) */
8146 	argop[8].argop = OP_GETFH;
8147 
8148 	/* 9: getattr of renamed object */
8149 	argop[9].argop = OP_GETATTR;
8150 	argop[9].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8151 	argop[9].nfs_argop4_u.opgetattr.mi = mi;
8152 
8153 	/*
8154 	 * If source/target dirs are different, then get new post-op
8155 	 * attrs for source dir also.
8156 	 */
8157 	if (ndvp != odvp) {
8158 		/* 10: putfh (sourcedir) */
8159 		argop[10].argop = OP_CPUTFH;
8160 		argop[10].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh;
8161 
8162 		/* 11: getattr (sourcedir) */
8163 		argop[11].argop = OP_GETATTR;
8164 		argop[11].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8165 		argop[11].nfs_argop4_u.opgetattr.mi = mi;
8166 	}
8167 
8168 	dnlc_remove(odvp, onm);
8169 	dnlc_remove(ndvp, nnm);
8170 
8171 	doqueue = 1;
8172 	t = gethrtime();
8173 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
8174 
8175 	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
8176 	if (e.error) {
8177 		PURGE_ATTRCACHE4(odvp);
8178 		PURGE_ATTRCACHE4(ndvp);
8179 		if (!needrecov) {
8180 			nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME,
8181 					&recov_state, needrecov);
8182 			goto out;
8183 		}
8184 	} else {
8185 		*statp = res.status;
8186 	}
8187 
8188 	if (needrecov) {
8189 		bool_t abort;
8190 
8191 		abort = nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL,
8192 			    OP_RENAME, NULL);
8193 		if (abort == FALSE) {
8194 			nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME,
8195 					&recov_state, needrecov);
8196 			kmem_free(argop, argoplist_size);
8197 			if (!e.error)
8198 				(void) xdr_free(xdr_COMPOUND4res_clnt,
8199 								(caddr_t)&res);
8200 			mutex_enter(&orp->r_statelock);
8201 			orp->r_flags &= ~R4RECEXPFH;
8202 			cv_broadcast(&orp->r_cv);
8203 			mutex_exit(&orp->r_statelock);
8204 			goto recov_retry;
8205 		} else {
8206 			if (e.error != 0) {
8207 				nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME,
8208 						&recov_state, needrecov);
8209 				goto out;
8210 			}
8211 			/* fall through for res.status case */
8212 		}
8213 	}
8214 
8215 	resp = &res;
8216 	/*
8217 	 * If OP_RENAME (or any prev op) failed, then return an error.
8218 	 * OP_RENAME is index 5, so if array len <= 6 we return an error.
8219 	 */
8220 	if ((res.status != NFS4_OK) && (res.array_len <= 6)) {
8221 		/*
8222 		 * Error in an op other than last Getattr
8223 		 */
8224 		e.error = geterrno4(res.status);
8225 		PURGE_ATTRCACHE4(odvp);
8226 		PURGE_ATTRCACHE4(ndvp);
8227 		/*
8228 		 * System V defines rename to return EEXIST, not
8229 		 * ENOTEMPTY if the target directory is not empty.
8230 		 * Over the wire, the error is NFSERR_ENOTEMPTY
8231 		 * which geterrno4 maps to ENOTEMPTY.
8232 		 */
8233 		if (e.error == ENOTEMPTY)
8234 			e.error = EEXIST;
8235 		nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state,
8236 				needrecov);
8237 		goto out;
8238 	}
8239 
8240 	/* rename results */
8241 	rn_res = &res.array[5].nfs_resop4_u.oprename;
8242 
8243 	if (res.status == NFS4_OK) {
8244 		/* Update target attribute, readdir and dnlc caches */
8245 		dinfo.di_garp =
8246 			&res.array[6].nfs_resop4_u.opgetattr.ga_res;
8247 		dinfo.di_cred = cr;
8248 		dinfo.di_time_call = t;
8249 	} else
8250 		dinfop = NULL;
8251 
8252 	/* Update source cache attribute, readdir and dnlc caches */
8253 	nfs4_update_dircaches(&rn_res->target_cinfo, ndvp, NULL, NULL, dinfop);
8254 
8255 	/* Update source cache attribute, readdir and dnlc caches */
8256 	if (ndvp != odvp) {
8257 
8258 		/*
8259 		 * If dinfop is non-NULL, then compound succeded, so
8260 		 * set di_garp to attrs for source dir.  dinfop is only
8261 		 * set to NULL when compound fails.
8262 		 */
8263 		if (dinfop)
8264 			dinfo.di_garp =
8265 				&res.array[11].nfs_resop4_u.opgetattr.ga_res;
8266 		nfs4_update_dircaches(&rn_res->source_cinfo, odvp, NULL, NULL,
8267 				dinfop);
8268 	}
8269 
8270 	/*
8271 	 * Update the rnode with the new component name and args,
8272 	 * and if the file handle changed, also update it with the new fh.
8273 	 * This is only necessary if the target object has an rnode
8274 	 * entry and there is no need to create one for it.
8275 	 */
8276 	resop = &res.array[8];	/* getfh new res */
8277 	ngf_res = &resop->nfs_resop4_u.opgetfh;
8278 
8279 	/*
8280 	 * Update the path and filehandle for the renamed object.
8281 	 */
8282 	nfs4rename_update(ovp, ndvp, &ngf_res->object, nnm);
8283 
8284 	nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state, needrecov);
8285 
8286 	if (res.status == NFS4_OK) {
8287 		resop++;	/* getattr res */
8288 		e.error = nfs4_update_attrcache(res.status,
8289 				&resop->nfs_resop4_u.opgetattr.ga_res,
8290 				t, ovp, cr);
8291 	}
8292 
8293 out:
8294 	kmem_free(argop, argoplist_size);
8295 	if (resp)
8296 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
8297 	mutex_enter(&orp->r_statelock);
8298 	orp->r_flags &= ~R4RECEXPFH;
8299 	cv_broadcast(&orp->r_cv);
8300 	mutex_exit(&orp->r_statelock);
8301 
8302 	return (e.error);
8303 }
8304 
8305 static int
8306 nfs4_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr)
8307 {
8308 	int error;
8309 	vnode_t *vp;
8310 
8311 	if (nfs_zone() != VTOMI4(dvp)->mi_zone)
8312 		return (EPERM);
8313 	/*
8314 	 * As ".." has special meaning and rather than send a mkdir
8315 	 * over the wire to just let the server freak out, we just
8316 	 * short circuit it here and return EEXIST
8317 	 */
8318 	if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0')
8319 		return (EEXIST);
8320 
8321 	/*
8322 	 * Decision to get the right gid and setgid bit of the
8323 	 * new directory is now made in call_nfs4_create_req.
8324 	 */
8325 	va->va_mask |= AT_MODE;
8326 	error = call_nfs4_create_req(dvp, nm, NULL, va, &vp, cr, NF4DIR);
8327 	if (error)
8328 		return (error);
8329 
8330 	*vpp = vp;
8331 	return (0);
8332 }
8333 
8334 
8335 /*
8336  * rmdir is using the same remove v4 op as does remove.
8337  * Remove requires that the current fh be the target directory.
8338  * After the operation, the current fh is unchanged.
8339  * The compound op structure is:
8340  *      PUTFH(targetdir), REMOVE
8341  */
8342 static int
8343 nfs4_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr)
8344 {
8345 	int need_end_op = FALSE;
8346 	COMPOUND4args_clnt args;
8347 	COMPOUND4res_clnt res, *resp = NULL;
8348 	REMOVE4res *rm_res;
8349 	nfs_argop4 argop[3];
8350 	nfs_resop4 *resop;
8351 	vnode_t *vp;
8352 	int doqueue;
8353 	mntinfo4_t *mi;
8354 	rnode4_t *drp;
8355 	bool_t needrecov = FALSE;
8356 	nfs4_recov_state_t recov_state;
8357 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
8358 	dirattr_info_t dinfo, *dinfop;
8359 
8360 	if (nfs_zone() != VTOMI4(dvp)->mi_zone)
8361 		return (EPERM);
8362 	/*
8363 	 * As ".." has special meaning and rather than send a rmdir
8364 	 * over the wire to just let the server freak out, we just
8365 	 * short circuit it here and return EEXIST
8366 	 */
8367 	if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0')
8368 		return (EEXIST);
8369 
8370 	drp = VTOR4(dvp);
8371 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp)))
8372 		return (EINTR);
8373 
8374 	/*
8375 	 * Attempt to prevent a rmdir(".") from succeeding.
8376 	 */
8377 	e.error = nfs4lookup(dvp, nm, &vp, cr, 0);
8378 	if (e.error) {
8379 		nfs_rw_exit(&drp->r_rwlock);
8380 		return (e.error);
8381 	}
8382 	if (vp == cdir) {
8383 		VN_RELE(vp);
8384 		nfs_rw_exit(&drp->r_rwlock);
8385 		return (EINVAL);
8386 	}
8387 
8388 	/*
8389 	 * Since nfsv4 remove op works on both files and directories,
8390 	 * check that the removed object is indeed a directory.
8391 	 */
8392 	if (vp->v_type != VDIR) {
8393 		VN_RELE(vp);
8394 		nfs_rw_exit(&drp->r_rwlock);
8395 		return (ENOTDIR);
8396 	}
8397 
8398 	/*
8399 	 * First just remove the entry from the name cache, as it
8400 	 * is most likely an entry for this vp.
8401 	 */
8402 	dnlc_remove(dvp, nm);
8403 
8404 	/*
8405 	 * If there vnode reference count is greater than one, then
8406 	 * there may be additional references in the DNLC which will
8407 	 * need to be purged.  First, trying removing the entry for
8408 	 * the parent directory and see if that removes the additional
8409 	 * reference(s).  If that doesn't do it, then use dnlc_purge_vp
8410 	 * to completely remove any references to the directory which
8411 	 * might still exist in the DNLC.
8412 	 */
8413 	if (vp->v_count > 1) {
8414 		dnlc_remove(vp, "..");
8415 		if (vp->v_count > 1)
8416 			dnlc_purge_vp(vp);
8417 	}
8418 
8419 	mi = VTOMI4(dvp);
8420 	recov_state.rs_flags = 0;
8421 	recov_state.rs_num_retry_despite_err = 0;
8422 
8423 recov_retry:
8424 	args.ctag = TAG_RMDIR;
8425 
8426 	/*
8427 	 * Rmdir ops: putfh dir; remove
8428 	 */
8429 	args.array_len = 3;
8430 	args.array = argop;
8431 
8432 	e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state);
8433 	if (e.error) {
8434 		nfs_rw_exit(&drp->r_rwlock);
8435 		return (e.error);
8436 	}
8437 	need_end_op = TRUE;
8438 
8439 	/* putfh directory */
8440 	argop[0].argop = OP_CPUTFH;
8441 	argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
8442 
8443 	/* remove */
8444 	argop[1].argop = OP_CREMOVE;
8445 	argop[1].nfs_argop4_u.opcremove.ctarget = nm;
8446 
8447 	/* getattr (postop attrs for dir that contained removed dir) */
8448 	argop[2].argop = OP_GETATTR;
8449 	argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8450 	argop[2].nfs_argop4_u.opgetattr.mi = mi;
8451 
8452 	dinfo.di_time_call = gethrtime();
8453 	doqueue = 1;
8454 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
8455 
8456 	PURGE_ATTRCACHE4(vp);
8457 
8458 	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
8459 	if (e.error) {
8460 		PURGE_ATTRCACHE4(dvp);
8461 	}
8462 
8463 	if (needrecov) {
8464 		if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp, NULL, NULL,
8465 		    NULL, OP_REMOVE, NULL) == FALSE) {
8466 			if (!e.error)
8467 				(void) xdr_free(xdr_COMPOUND4res_clnt,
8468 								(caddr_t)&res);
8469 
8470 			nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state,
8471 			    needrecov);
8472 			need_end_op = FALSE;
8473 			goto recov_retry;
8474 		}
8475 	}
8476 
8477 	if (!e.error) {
8478 		resp = &res;
8479 
8480 		/*
8481 		 * Only return error if first 2 ops (OP_REMOVE or earlier)
8482 		 * failed.
8483 		 */
8484 		if (res.status != NFS4_OK && res.array_len <= 2) {
8485 			e.error = geterrno4(res.status);
8486 			PURGE_ATTRCACHE4(dvp);
8487 			nfs4_end_op(VTOMI4(dvp), dvp, NULL,
8488 						&recov_state, needrecov);
8489 			need_end_op = FALSE;
8490 			nfs4_purge_stale_fh(e.error, dvp, cr);
8491 			/*
8492 			 * System V defines rmdir to return EEXIST, not
8493 			 * ENOTEMPTY if the directory is not empty.  Over
8494 			 * the wire, the error is NFSERR_ENOTEMPTY which
8495 			 * geterrno4 maps to ENOTEMPTY.
8496 			 */
8497 			if (e.error == ENOTEMPTY)
8498 				e.error = EEXIST;
8499 		} else {
8500 			resop = &res.array[1];	/* remove res */
8501 			rm_res = &resop->nfs_resop4_u.opremove;
8502 
8503 			if (res.status == NFS4_OK) {
8504 				resop = &res.array[2];	/* dir attrs */
8505 				dinfo.di_garp =
8506 					&resop->nfs_resop4_u.opgetattr.ga_res;
8507 				dinfo.di_cred = cr;
8508 				dinfop = &dinfo;
8509 			} else
8510 				dinfop = NULL;
8511 
8512 			/* Update dir attribute, readdir and dnlc caches */
8513 			nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL,
8514 				dinfop);
8515 
8516 			/* destroy rddir cache for dir that was removed */
8517 			if (VTOR4(vp)->r_dir != NULL)
8518 				nfs4_purge_rddir_cache(vp);
8519 		}
8520 	}
8521 
8522 	if (need_end_op)
8523 		nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
8524 
8525 	nfs_rw_exit(&drp->r_rwlock);
8526 
8527 	if (resp)
8528 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
8529 
8530 	VN_RELE(vp);
8531 
8532 	return (e.error);
8533 }
8534 
8535 static int
8536 nfs4_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr)
8537 {
8538 	int error;
8539 	vnode_t *vp;
8540 	rnode4_t *rp;
8541 	char *contents;
8542 	mntinfo4_t *mi = VTOMI4(dvp);
8543 
8544 	if (nfs_zone() != mi->mi_zone)
8545 		return (EPERM);
8546 	if (!(mi->mi_flags & MI4_SYMLINK))
8547 		return (EOPNOTSUPP);
8548 
8549 	error = call_nfs4_create_req(dvp, lnm, tnm, tva, &vp, cr, NF4LNK);
8550 	if (error) {
8551 		return (error);
8552 	}
8553 
8554 	ASSERT(nfs4_consistent_type(vp));
8555 	rp = VTOR4(vp);
8556 	if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) {
8557 
8558 		contents = kmem_alloc(MAXPATHLEN, KM_SLEEP);
8559 
8560 		if (contents != NULL) {
8561 			mutex_enter(&rp->r_statelock);
8562 			if (rp->r_symlink.contents == NULL) {
8563 				rp->r_symlink.len = strlen(tnm);
8564 				bcopy(tnm, contents, rp->r_symlink.len);
8565 				rp->r_symlink.contents = contents;
8566 				rp->r_symlink.size = MAXPATHLEN;
8567 				mutex_exit(&rp->r_statelock);
8568 			} else {
8569 				mutex_exit(&rp->r_statelock);
8570 				kmem_free((void *)contents, MAXPATHLEN);
8571 			}
8572 		}
8573 	}
8574 	VN_RELE(vp);
8575 
8576 	return (error);
8577 }
8578 
8579 
8580 /*
8581  * Read directory entries.
8582  * There are some weird things to look out for here.  The uio_loffset
8583  * field is either 0 or it is the offset returned from a previous
8584  * readdir.  It is an opaque value used by the server to find the
8585  * correct directory block to read. The count field is the number
8586  * of blocks to read on the server.  This is advisory only, the server
8587  * may return only one block's worth of entries.  Entries may be compressed
8588  * on the server.
8589  */
8590 static int
8591 nfs4_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp)
8592 {
8593 	int error;
8594 	uint_t count;
8595 	rnode4_t *rp;
8596 	rddir4_cache *rdc;
8597 	rddir4_cache *rrdc;
8598 
8599 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
8600 		return (EIO);
8601 	rp = VTOR4(vp);
8602 
8603 	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
8604 
8605 	/*
8606 	 * Make sure that the directory cache is valid.
8607 	 */
8608 	if (rp->r_dir != NULL) {
8609 		if (nfs_disable_rddir_cache != 0) {
8610 			/*
8611 			 * Setting nfs_disable_rddir_cache in /etc/system
8612 			 * allows interoperability with servers that do not
8613 			 * properly update the attributes of directories.
8614 			 * Any cached information gets purged before an
8615 			 * access is made to it.
8616 			 */
8617 			nfs4_purge_rddir_cache(vp);
8618 		}
8619 
8620 		error = nfs4_validate_caches(vp, cr);
8621 		if (error)
8622 			return (error);
8623 	}
8624 
8625 	count = MIN(uiop->uio_iov->iov_len, MAXBSIZE);
8626 
8627 	/*
8628 	 * Short circuit last readdir which always returns 0 bytes.
8629 	 * This can be done after the directory has been read through
8630 	 * completely at least once.  This will set r_direof which
8631 	 * can be used to find the value of the last cookie.
8632 	 */
8633 	mutex_enter(&rp->r_statelock);
8634 	if (rp->r_direof != NULL &&
8635 	    uiop->uio_loffset == rp->r_direof->nfs4_ncookie) {
8636 		mutex_exit(&rp->r_statelock);
8637 #ifdef DEBUG
8638 		nfs4_readdir_cache_shorts++;
8639 #endif
8640 		if (eofp)
8641 			*eofp = 1;
8642 		return (0);
8643 	}
8644 
8645 	/*
8646 	 * Look for a cache entry.  Cache entries are identified
8647 	 * by the NFS cookie value and the byte count requested.
8648 	 */
8649 	rdc = rddir4_cache_lookup(rp, uiop->uio_loffset, count);
8650 
8651 	/*
8652 	 * If rdc is NULL then the lookup resulted in an unrecoverable error.
8653 	 */
8654 	if (rdc == NULL) {
8655 		mutex_exit(&rp->r_statelock);
8656 		return (EINTR);
8657 	}
8658 
8659 	/*
8660 	 * Check to see if we need to fill this entry in.
8661 	 */
8662 	if (rdc->flags & RDDIRREQ) {
8663 		rdc->flags &= ~RDDIRREQ;
8664 		rdc->flags |= RDDIR;
8665 		mutex_exit(&rp->r_statelock);
8666 
8667 		/*
8668 		 * Do the readdir.
8669 		 */
8670 		nfs4readdir(vp, rdc, cr);
8671 
8672 		/*
8673 		 * Reaquire the lock, so that we can continue
8674 		 */
8675 		mutex_enter(&rp->r_statelock);
8676 		/*
8677 		 * The entry is now complete
8678 		 */
8679 		rdc->flags &= ~RDDIR;
8680 	}
8681 
8682 	ASSERT(!(rdc->flags & RDDIR));
8683 
8684 	/*
8685 	 * If an error occurred while attempting
8686 	 * to fill the cache entry, mark the entry invalid and
8687 	 * just return the error.
8688 	 */
8689 	if (rdc->error) {
8690 		error = rdc->error;
8691 		rdc->flags |= RDDIRREQ;
8692 		rddir4_cache_rele(rp, rdc);
8693 		mutex_exit(&rp->r_statelock);
8694 		return (error);
8695 	}
8696 
8697 	/*
8698 	 * The cache entry is complete and good,
8699 	 * copyout the dirent structs to the calling
8700 	 * thread.
8701 	 */
8702 	error = uiomove(rdc->entries, rdc->actlen, UIO_READ, uiop);
8703 
8704 	/*
8705 	 * If no error occurred during the copyout,
8706 	 * update the offset in the uio struct to
8707 	 * contain the value of the next NFS 4 cookie
8708 	 * and set the eof value appropriately.
8709 	 */
8710 	if (!error) {
8711 		uiop->uio_loffset = rdc->nfs4_ncookie;
8712 		if (eofp)
8713 			*eofp = rdc->eof;
8714 	}
8715 
8716 	/*
8717 	 * Decide whether to do readahead.  Don't if we
8718 	 * have already read to the end of directory.
8719 	 */
8720 	if (rdc->eof) {
8721 		/*
8722 		 * Make the entry the direof only if it is cached
8723 		 */
8724 		if (rdc->flags & RDDIRCACHED)
8725 			rp->r_direof = rdc;
8726 		rddir4_cache_rele(rp, rdc);
8727 		mutex_exit(&rp->r_statelock);
8728 		return (error);
8729 	}
8730 
8731 	/* Determine if a readdir readahead should be done */
8732 	if (!(rp->r_flags & R4LOOKUP)) {
8733 		rddir4_cache_rele(rp, rdc);
8734 		mutex_exit(&rp->r_statelock);
8735 		return (error);
8736 	}
8737 
8738 	/*
8739 	 * Now look for a readahead entry.
8740 	 *
8741 	 * Check to see whether we found an entry for the readahead.
8742 	 * If so, we don't need to do anything further, so free the new
8743 	 * entry if one was allocated.  Otherwise, allocate a new entry, add
8744 	 * it to the cache, and then initiate an asynchronous readdir
8745 	 * operation to fill it.
8746 	 */
8747 	rrdc = rddir4_cache_lookup(rp, rdc->nfs4_ncookie, count);
8748 
8749 	/*
8750 	 * A readdir cache entry could not be obtained for the readahead.  In
8751 	 * this case we skip the readahead and return.
8752 	 */
8753 	if (rrdc == NULL) {
8754 		rddir4_cache_rele(rp, rdc);
8755 		mutex_exit(&rp->r_statelock);
8756 		return (error);
8757 	}
8758 
8759 	/*
8760 	 * Check to see if we need to fill this entry in.
8761 	 */
8762 	if (rrdc->flags & RDDIRREQ) {
8763 		rrdc->flags &= ~RDDIRREQ;
8764 		rrdc->flags |= RDDIR;
8765 		rddir4_cache_rele(rp, rdc);
8766 		mutex_exit(&rp->r_statelock);
8767 #ifdef DEBUG
8768 		nfs4_readdir_readahead++;
8769 #endif
8770 		/*
8771 		 * Do the readdir.
8772 		 */
8773 		nfs4_async_readdir(vp, rrdc, cr, do_nfs4readdir);
8774 		return (error);
8775 	}
8776 
8777 	rddir4_cache_rele(rp, rrdc);
8778 	rddir4_cache_rele(rp, rdc);
8779 	mutex_exit(&rp->r_statelock);
8780 	return (error);
8781 }
8782 
8783 static int
8784 do_nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr)
8785 {
8786 	int error;
8787 	rnode4_t *rp;
8788 
8789 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
8790 
8791 	rp = VTOR4(vp);
8792 
8793 	/*
8794 	 * Obtain the readdir results for the caller.
8795 	 */
8796 	nfs4readdir(vp, rdc, cr);
8797 
8798 	mutex_enter(&rp->r_statelock);
8799 	/*
8800 	 * The entry is now complete
8801 	 */
8802 	rdc->flags &= ~RDDIR;
8803 
8804 	error = rdc->error;
8805 	if (error)
8806 		rdc->flags |= RDDIRREQ;
8807 	rddir4_cache_rele(rp, rdc);
8808 	mutex_exit(&rp->r_statelock);
8809 
8810 	return (error);
8811 }
8812 
8813 static void
8814 nfs4readdir_stub(vnode_t *vp, rddir4_cache *rdc, cred_t *cr)
8815 {
8816 	int stublength;
8817 	dirent64_t *dp;
8818 	u_longlong_t nodeid, pnodeid;
8819 	vnode_t *dotdotvp = NULL;
8820 	rnode4_t *rp = VTOR4(vp);
8821 	nfs_cookie4 cookie = (nfs_cookie4)rdc->nfs4_cookie;
8822 
8823 	rdc->error = 0;
8824 	rdc->entries = 0;
8825 	rdc->actlen = rdc->entlen = 0;
8826 	rdc->eof = TRUE;
8827 
8828 	/* Check for EOF case for readdir of stub */
8829 	if (cookie != 0 && cookie != 1)
8830 		return;
8831 
8832 	nodeid = rp->r_attr.va_nodeid;
8833 	if (vp->v_flag & VROOT) {
8834 		pnodeid = nodeid;	/* root of mount point */
8835 	} else {
8836 		if (rdc->error = nfs4_lookup(vp, "..", &dotdotvp, 0, 0, 0, cr))
8837 			return;
8838 		pnodeid = VTOR4(dotdotvp)->r_attr.va_nodeid;
8839 		VN_RELE(dotdotvp);
8840 	}
8841 
8842 	stublength = DIRENT64_RECLEN(1) + DIRENT64_RECLEN(2);
8843 	rdc->entries = kmem_alloc(stublength, KM_SLEEP);
8844 	rdc->entlen = rdc->buflen = stublength;
8845 	rdc->eof = TRUE;
8846 
8847 	dp = (dirent64_t *)rdc->entries;
8848 
8849 	if (rdc->nfs4_cookie == (nfs_cookie4)0) {
8850 		bcopy(nfs4_dot_entries, rdc->entries,
8851 			DIRENT64_RECLEN(1) + DIRENT64_RECLEN(2));
8852 		dp->d_ino = nodeid;
8853 		dp = (struct dirent64 *)(((char *)dp) + DIRENT64_RECLEN(1));
8854 		dp->d_ino = pnodeid;
8855 		rdc->actlen = DIRENT64_RECLEN(1) + DIRENT64_RECLEN(2);
8856 	} else	{	/* for ".." entry */
8857 		bcopy(nfs4_dot_dot_entry, rdc->entries, DIRENT64_RECLEN(2));
8858 		dp->d_ino = pnodeid;
8859 		rdc->actlen = DIRENT64_RECLEN(2);
8860 	}
8861 	rdc->nfs4_ncookie = rdc->actlen;
8862 }
8863 
8864 /*
8865  * Read directory entries.
8866  * There are some weird things to look out for here.  The uio_loffset
8867  * field is either 0 or it is the offset returned from a previous
8868  * readdir.  It is an opaque value used by the server to find the
8869  * correct directory block to read. The count field is the number
8870  * of blocks to read on the server.  This is advisory only, the server
8871  * may return only one block's worth of entries.  Entries may be compressed
8872  * on the server.
8873  *
8874  * Generates the following compound request:
8875  * 1. If readdir offset is zero and no dnlc entry for parent exists,
8876  *    must include a Lookupp as well. In this case, send:
8877  *    { Putfh <fh>; Readdir; Lookupp; Getfh; Getattr }
8878  * 2. Otherwise just do: { Putfh <fh>; Readdir }
8879  *
8880  * Get complete attributes and filehandles for entries if this is the
8881  * first read of the directory. Otherwise, just get fileid's.
8882  */
8883 static void
8884 nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr)
8885 {
8886 	COMPOUND4args_clnt args;
8887 	COMPOUND4res_clnt res;
8888 	READDIR4args *rargs;
8889 	READDIR4res_clnt *rd_res;
8890 	bitmap4 rd_bitsval;
8891 	nfs_argop4 argop[5];
8892 	nfs_resop4 *resop;
8893 	rnode4_t *rp = VTOR4(vp);
8894 	mntinfo4_t *mi = VTOMI4(vp);
8895 	int doqueue;
8896 	u_longlong_t nodeid, pnodeid;	/* id's of dir and its parents */
8897 	vnode_t *dvp;
8898 	nfs_cookie4 cookie = (nfs_cookie4)rdc->nfs4_cookie;
8899 	int num_ops, res_opcnt;
8900 	bool_t needrecov = FALSE;
8901 	nfs4_recov_state_t recov_state;
8902 	hrtime_t t;
8903 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
8904 
8905 	ASSERT(nfs_zone() == mi->mi_zone);
8906 	ASSERT(rdc->flags & RDDIR);
8907 	ASSERT(rdc->entries == NULL);
8908 
8909 	if (rp->r_flags & R4SRVSTUB) {
8910 		nfs4readdir_stub(vp, rdc, cr);
8911 		return;
8912 	}
8913 
8914 	num_ops = 2;
8915 	if (cookie == (nfs_cookie4)0 || cookie == (nfs_cookie4)1) {
8916 		/*
8917 		 * Since nfsv4 readdir may not return entries for "." and "..",
8918 		 * the client must recreate them:
8919 		 * To find the correct nodeid, do the following:
8920 		 * For current node, get nodeid from dnlc.
8921 		 * - if current node is rootvp, set pnodeid to nodeid.
8922 		 * - else if parent is in the dnlc, get its nodeid from there.
8923 		 * - else add LOOKUPP+GETATTR to compound.
8924 		 */
8925 		nodeid = rp->r_attr.va_nodeid;
8926 		if (vp->v_flag & VROOT) {
8927 			pnodeid = nodeid;	/* root of mount point */
8928 		} else {
8929 			dvp = dnlc_lookup(vp, "..");
8930 			if (dvp != NULL && dvp != DNLC_NO_VNODE) {
8931 				/* parent in dnlc cache - no need for otw */
8932 				pnodeid = VTOR4(dvp)->r_attr.va_nodeid;
8933 			} else {
8934 				/*
8935 				 * parent not in dnlc cache,
8936 				 * do lookupp to get its id
8937 				 */
8938 				num_ops = 5;
8939 				pnodeid = 0; /* set later by getattr parent */
8940 			}
8941 			if (dvp)
8942 				VN_RELE(dvp);
8943 		}
8944 	}
8945 	recov_state.rs_flags = 0;
8946 	recov_state.rs_num_retry_despite_err = 0;
8947 
8948 	/* Save the original mount point security flavor */
8949 	(void) save_mnt_secinfo(mi->mi_curr_serv);
8950 
8951 recov_retry:
8952 	args.ctag = TAG_READDIR;
8953 
8954 	args.array = argop;
8955 	args.array_len = num_ops;
8956 
8957 	if (e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
8958 					&recov_state, NULL)) {
8959 		/*
8960 		 * If readdir a node that is a stub for a crossed mount point,
8961 		 * keep the original secinfo flavor for the current file
8962 		 * system, not the crossed one.
8963 		 */
8964 		(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
8965 		rdc->error = e.error;
8966 		return;
8967 	}
8968 
8969 	/*
8970 	 * Determine which attrs to request for dirents.  This code
8971 	 * must be protected by nfs4_start/end_fop because of r_server
8972 	 * (which will change during failover recovery).
8973 	 *
8974 	 */
8975 	if (rp->r_flags & (R4LOOKUP | R4READDIRWATTR)) {
8976 		/*
8977 		 * Get all vattr attrs plus filehandle and rdattr_error
8978 		 */
8979 		rd_bitsval = NFS4_VATTR_MASK |
8980 			FATTR4_RDATTR_ERROR_MASK |
8981 			FATTR4_FILEHANDLE_MASK;
8982 
8983 		if (rp->r_flags & R4READDIRWATTR) {
8984 			mutex_enter(&rp->r_statelock);
8985 			rp->r_flags &= ~R4READDIRWATTR;
8986 			mutex_exit(&rp->r_statelock);
8987 		}
8988 	} else {
8989 		servinfo4_t *svp = rp->r_server;
8990 
8991 		/*
8992 		 * Already read directory. Use readdir with
8993 		 * no attrs (except for mounted_on_fileid) for updates.
8994 		 */
8995 		rd_bitsval = FATTR4_RDATTR_ERROR_MASK;
8996 
8997 		/*
8998 		 * request mounted on fileid if supported, else request
8999 		 * fileid.  maybe we should verify that fileid is supported
9000 		 * and request something else if not.
9001 		 */
9002 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
9003 		if (svp->sv_supp_attrs & FATTR4_MOUNTED_ON_FILEID_MASK)
9004 			rd_bitsval |= FATTR4_MOUNTED_ON_FILEID_MASK;
9005 		nfs_rw_exit(&svp->sv_lock);
9006 	}
9007 
9008 	/* putfh directory fh */
9009 	argop[0].argop = OP_CPUTFH;
9010 	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
9011 
9012 	argop[1].argop = OP_READDIR;
9013 	rargs = &argop[1].nfs_argop4_u.opreaddir;
9014 	/*
9015 	 * 1 and 2 are reserved for client "." and ".." entry offset.
9016 	 * cookie 0 should be used over-the-wire to start reading at
9017 	 * the beginning of the directory excluding "." and "..".
9018 	 */
9019 	if (rdc->nfs4_cookie == 0 ||
9020 	    rdc->nfs4_cookie == 1 ||
9021 	    rdc->nfs4_cookie == 2) {
9022 		rargs->cookie = (nfs_cookie4)0;
9023 		rargs->cookieverf = 0;
9024 	} else {
9025 		rargs->cookie = (nfs_cookie4)rdc->nfs4_cookie;
9026 		mutex_enter(&rp->r_statelock);
9027 		rargs->cookieverf = rp->r_cookieverf4;
9028 		mutex_exit(&rp->r_statelock);
9029 	}
9030 	rargs->dircount = MIN(rdc->buflen, mi->mi_tsize);
9031 	rargs->maxcount = mi->mi_tsize;
9032 	rargs->attr_request = rd_bitsval;
9033 	rargs->rdc = rdc;
9034 	rargs->dvp = vp;
9035 	rargs->mi = mi;
9036 	rargs->cr = cr;
9037 
9038 
9039 	/*
9040 	 * If count < than the minimum required, we return no entries
9041 	 * and fail with EINVAL
9042 	 */
9043 	if (rargs->dircount < (DIRENT64_RECLEN(1) + DIRENT64_RECLEN(2))) {
9044 		rdc->error = EINVAL;
9045 		goto out;
9046 	}
9047 
9048 	if (args.array_len == 5) {
9049 		/*
9050 		 * Add lookupp and getattr for parent nodeid.
9051 		 */
9052 		argop[2].argop = OP_LOOKUPP;
9053 
9054 		argop[3].argop = OP_GETFH;
9055 
9056 		/* getattr parent */
9057 		argop[4].argop = OP_GETATTR;
9058 		argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
9059 		argop[4].nfs_argop4_u.opgetattr.mi = mi;
9060 	}
9061 
9062 	doqueue = 1;
9063 
9064 	if (mi->mi_io_kstats) {
9065 		mutex_enter(&mi->mi_lock);
9066 		kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
9067 		mutex_exit(&mi->mi_lock);
9068 	}
9069 
9070 	/* capture the time of this call */
9071 	rargs->t = t = gethrtime();
9072 
9073 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
9074 
9075 	if (mi->mi_io_kstats) {
9076 		mutex_enter(&mi->mi_lock);
9077 		kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
9078 		mutex_exit(&mi->mi_lock);
9079 	}
9080 
9081 	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
9082 
9083 	/*
9084 	 * If RPC error occurred and it isn't an error that
9085 	 * triggers recovery, then go ahead and fail now.
9086 	 */
9087 	if (e.error != 0 && !needrecov) {
9088 		rdc->error = e.error;
9089 		goto out;
9090 	}
9091 
9092 	if (needrecov) {
9093 		bool_t abort;
9094 
9095 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
9096 		    "nfs4readdir: initiating recovery.\n"));
9097 
9098 		abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
9099 			    NULL, OP_READDIR, NULL);
9100 		if (abort == FALSE) {
9101 			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
9102 				    &recov_state, needrecov);
9103 			if (!e.error)
9104 				(void) xdr_free(xdr_COMPOUND4res_clnt,
9105 						(caddr_t)&res);
9106 			if (rdc->entries != NULL) {
9107 				kmem_free(rdc->entries, rdc->entlen);
9108 				rdc->entries = NULL;
9109 			}
9110 			goto recov_retry;
9111 		}
9112 
9113 		if (e.error != 0) {
9114 			rdc->error = e.error;
9115 			goto out;
9116 		}
9117 
9118 		/* fall through for res.status case */
9119 	}
9120 
9121 	res_opcnt = res.array_len;
9122 
9123 	/*
9124 	 * If compound failed first 2 ops (PUTFH+READDIR), then return
9125 	 * failure here.  Subsequent ops are for filling out dot-dot
9126 	 * dirent, and if they fail, we still want to give the caller
9127 	 * the dirents returned by (the successful) READDIR op, so we need
9128 	 * to silently ignore failure for subsequent ops (LOOKUPP+GETATTR).
9129 	 *
9130 	 * One example where PUTFH+READDIR ops would succeed but
9131 	 * LOOKUPP+GETATTR would fail would be a dir that has r perm
9132 	 * but lacks x.  In this case, a POSIX server's VOP_READDIR
9133 	 * would succeed; however, VOP_LOOKUP(..) would fail since no
9134 	 * x perm.  We need to come up with a non-vendor-specific way
9135 	 * for a POSIX server to return d_ino from dotdot's dirent if
9136 	 * client only requests mounted_on_fileid, and just say the
9137 	 * LOOKUPP succeeded and fill out the GETATTR.  However, if
9138 	 * client requested any mandatory attrs, server would be required
9139 	 * to fail the GETATTR op because it can't call VOP_LOOKUP+VOP_GETATTR
9140 	 * for dotdot.
9141 	 */
9142 
9143 	if (res.status) {
9144 		if (res_opcnt <= 2) {
9145 			e.error = geterrno4(res.status);
9146 			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
9147 			    &recov_state, needrecov);
9148 			nfs4_purge_stale_fh(e.error, vp, cr);
9149 			rdc->error = e.error;
9150 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
9151 			if (rdc->entries != NULL) {
9152 				kmem_free(rdc->entries, rdc->entlen);
9153 				rdc->entries = NULL;
9154 			}
9155 			/*
9156 			 * If readdir a node that is a stub for a
9157 			 * crossed mount point, keep the original
9158 			 * secinfo flavor for the current file system,
9159 			 * not the crossed one.
9160 			 */
9161 			(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
9162 			return;
9163 		}
9164 	}
9165 
9166 	resop = &res.array[1];	/* readdir res */
9167 	rd_res = &resop->nfs_resop4_u.opreaddirclnt;
9168 
9169 	mutex_enter(&rp->r_statelock);
9170 	rp->r_cookieverf4 = rd_res->cookieverf;
9171 	mutex_exit(&rp->r_statelock);
9172 
9173 	/*
9174 	 * For "." and ".." entries
9175 	 * e.g.
9176 	 *	seek(cookie=0) -> "." entry with d_off = 1
9177 	 *	seek(cookie=1) -> ".." entry with d_off = 2
9178 	 */
9179 	if (cookie == (nfs_cookie4) 0) {
9180 		if (rd_res->dotp)
9181 			rd_res->dotp->d_ino = nodeid;
9182 		if (rd_res->dotdotp)
9183 			rd_res->dotdotp->d_ino = pnodeid;
9184 	}
9185 	if (cookie == (nfs_cookie4) 1) {
9186 		if (rd_res->dotdotp)
9187 			rd_res->dotdotp->d_ino = pnodeid;
9188 	}
9189 
9190 
9191 	/* LOOKUPP+GETATTR attempted */
9192 	if (args.array_len == 5 && rd_res->dotdotp) {
9193 		if (res.status == NFS4_OK && res_opcnt == 5) {
9194 			nfs_fh4 *fhp;
9195 			nfs4_sharedfh_t *sfhp;
9196 			vnode_t *pvp;
9197 			nfs4_ga_res_t *garp;
9198 
9199 			resop++;	/* lookupp */
9200 			resop++;	/* getfh   */
9201 			fhp = &resop->nfs_resop4_u.opgetfh.object;
9202 
9203 			resop++;	/* getattr of parent */
9204 
9205 			/*
9206 			 * First, take care of finishing the
9207 			 * readdir results.
9208 			 */
9209 			garp = &resop->nfs_resop4_u.opgetattr.ga_res;
9210 			/*
9211 			 * The d_ino of .. must be the inode number
9212 			 * of the mounted filesystem.
9213 			 */
9214 			if (garp->n4g_va.va_mask & AT_NODEID)
9215 				rd_res->dotdotp->d_ino =
9216 					garp->n4g_va.va_nodeid;
9217 
9218 
9219 			/*
9220 			 * Next, create the ".." dnlc entry
9221 			 */
9222 			sfhp = sfh4_get(fhp, mi);
9223 			if (!nfs4_make_dotdot(sfhp, t, vp, cr, &pvp, 0)) {
9224 				dnlc_update(vp, "..", pvp);
9225 				VN_RELE(pvp);
9226 			}
9227 			sfh4_rele(&sfhp);
9228 		}
9229 	}
9230 
9231 	if (mi->mi_io_kstats) {
9232 		mutex_enter(&mi->mi_lock);
9233 		KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
9234 		KSTAT_IO_PTR(mi->mi_io_kstats)->nread += rdc->actlen;
9235 		mutex_exit(&mi->mi_lock);
9236 	}
9237 
9238 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
9239 
9240 out:
9241 	/*
9242 	 * If readdir a node that is a stub for a crossed mount point,
9243 	 * keep the original secinfo flavor for the current file system,
9244 	 * not the crossed one.
9245 	 */
9246 	(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
9247 
9248 	nfs4_end_fop(mi, vp, NULL, OH_READDIR, &recov_state, needrecov);
9249 }
9250 
9251 
9252 static int
9253 nfs4_bio(struct buf *bp, stable_how4 *stab_comm, cred_t *cr, bool_t readahead)
9254 {
9255 	rnode4_t *rp = VTOR4(bp->b_vp);
9256 	int count;
9257 	int error;
9258 	cred_t *cred_otw = NULL;
9259 	offset_t offset;
9260 	nfs4_open_stream_t *osp = NULL;
9261 	bool_t first_time = TRUE;	/* first time getting otw cred */
9262 	bool_t last_time = FALSE;	/* last time getting otw cred */
9263 
9264 	ASSERT(nfs_zone() == VTOMI4(bp->b_vp)->mi_zone);
9265 
9266 	DTRACE_IO1(start, struct buf *, bp);
9267 	offset = ldbtob(bp->b_lblkno);
9268 
9269 	if (bp->b_flags & B_READ) {
9270 	read_again:
9271 		/*
9272 		 * Releases the osp, if it is provided.
9273 		 * Puts a hold on the cred_otw and the new osp (if found).
9274 		 */
9275 		cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
9276 			&first_time, &last_time);
9277 		error = bp->b_error = nfs4read(bp->b_vp, bp->b_un.b_addr,
9278 						offset, bp->b_bcount,
9279 						&bp->b_resid, cred_otw,
9280 						readahead, NULL);
9281 		crfree(cred_otw);
9282 		if (!error) {
9283 			if (bp->b_resid) {
9284 				/*
9285 				 * Didn't get it all because we hit EOF,
9286 				 * zero all the memory beyond the EOF.
9287 				 */
9288 				/* bzero(rdaddr + */
9289 				bzero(bp->b_un.b_addr +
9290 				    bp->b_bcount - bp->b_resid, bp->b_resid);
9291 			}
9292 			mutex_enter(&rp->r_statelock);
9293 			if (bp->b_resid == bp->b_bcount &&
9294 			    offset >= rp->r_size) {
9295 				/*
9296 				 * We didn't read anything at all as we are
9297 				 * past EOF.  Return an error indicator back
9298 				 * but don't destroy the pages (yet).
9299 				 */
9300 				error = NFS_EOF;
9301 			}
9302 			mutex_exit(&rp->r_statelock);
9303 		} else if (error == EACCES && last_time == FALSE) {
9304 				goto read_again;
9305 		}
9306 	} else {
9307 		if (!(rp->r_flags & R4STALE)) {
9308 		write_again:
9309 			/*
9310 			 * Releases the osp, if it is provided.
9311 			 * Puts a hold on the cred_otw and the new
9312 			 * osp (if found).
9313 			 */
9314 			cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
9315 				&first_time, &last_time);
9316 			mutex_enter(&rp->r_statelock);
9317 			count = MIN(bp->b_bcount, rp->r_size - offset);
9318 			mutex_exit(&rp->r_statelock);
9319 			if (count < 0)
9320 				cmn_err(CE_PANIC, "nfs4_bio: write count < 0");
9321 #ifdef DEBUG
9322 			if (count == 0) {
9323 				zoneid_t zoneid = getzoneid();
9324 
9325 				zcmn_err(zoneid, CE_WARN,
9326 				    "nfs4_bio: zero length write at %lld",
9327 				    offset);
9328 				zcmn_err(zoneid, CE_CONT, "flags=0x%x, "
9329 				    "b_bcount=%ld, file size=%lld",
9330 				    rp->r_flags, (long)bp->b_bcount,
9331 				    rp->r_size);
9332 				sfh4_printfhandle(VTOR4(bp->b_vp)->r_fh);
9333 				if (nfs4_bio_do_stop)
9334 					debug_enter("nfs4_bio");
9335 			}
9336 #endif
9337 			error = nfs4write(bp->b_vp, bp->b_un.b_addr, offset,
9338 			    count, cred_otw, stab_comm);
9339 			if (error == EACCES && last_time == FALSE) {
9340 				crfree(cred_otw);
9341 				goto write_again;
9342 			}
9343 			bp->b_error = error;
9344 			if (error && error != EINTR &&
9345 			    !(bp->b_vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) {
9346 				/*
9347 				 * Don't print EDQUOT errors on the console.
9348 				 * Don't print asynchronous EACCES errors.
9349 				 * Don't print EFBIG errors.
9350 				 * Print all other write errors.
9351 				 */
9352 				if (error != EDQUOT && error != EFBIG &&
9353 				    (error != EACCES ||
9354 				    !(bp->b_flags & B_ASYNC)))
9355 					nfs4_write_error(bp->b_vp,
9356 					    error, cred_otw);
9357 				/*
9358 				 * Update r_error and r_flags as appropriate.
9359 				 * If the error was ESTALE, then mark the
9360 				 * rnode as not being writeable and save
9361 				 * the error status.  Otherwise, save any
9362 				 * errors which occur from asynchronous
9363 				 * page invalidations.  Any errors occurring
9364 				 * from other operations should be saved
9365 				 * by the caller.
9366 				 */
9367 				mutex_enter(&rp->r_statelock);
9368 				if (error == ESTALE) {
9369 					rp->r_flags |= R4STALE;
9370 					if (!rp->r_error)
9371 						rp->r_error = error;
9372 				} else if (!rp->r_error &&
9373 				    (bp->b_flags &
9374 				    (B_INVAL|B_FORCE|B_ASYNC)) ==
9375 				    (B_INVAL|B_FORCE|B_ASYNC)) {
9376 					rp->r_error = error;
9377 				}
9378 				mutex_exit(&rp->r_statelock);
9379 			}
9380 			crfree(cred_otw);
9381 		} else
9382 			error = rp->r_error;
9383 	}
9384 
9385 	if (error != 0 && error != NFS_EOF)
9386 		bp->b_flags |= B_ERROR;
9387 
9388 	if (osp)
9389 		open_stream_rele(osp, rp);
9390 
9391 	DTRACE_IO1(done, struct buf *, bp);
9392 
9393 	return (error);
9394 }
9395 
9396 /* ARGSUSED */
9397 static int
9398 nfs4_fid(vnode_t *vp, fid_t *fidp)
9399 {
9400 	return (EREMOTE);
9401 }
9402 
9403 /* ARGSUSED2 */
9404 static int
9405 nfs4_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
9406 {
9407 	rnode4_t *rp = VTOR4(vp);
9408 
9409 	if (!write_lock) {
9410 		(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
9411 		return (V_WRITELOCK_FALSE);
9412 	}
9413 
9414 	if ((rp->r_flags & R4DIRECTIO) ||
9415 	    (VTOMI4(vp)->mi_flags & MI4_DIRECTIO)) {
9416 		(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
9417 		if (rp->r_mapcnt == 0 && !nfs4_has_pages(vp))
9418 			return (V_WRITELOCK_FALSE);
9419 		nfs_rw_exit(&rp->r_rwlock);
9420 	}
9421 
9422 	(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE);
9423 	return (V_WRITELOCK_TRUE);
9424 }
9425 
9426 /* ARGSUSED */
9427 static void
9428 nfs4_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
9429 {
9430 	rnode4_t *rp = VTOR4(vp);
9431 
9432 	nfs_rw_exit(&rp->r_rwlock);
9433 }
9434 
9435 /* ARGSUSED */
9436 static int
9437 nfs4_seek(vnode_t *vp, offset_t ooff, offset_t *noffp)
9438 {
9439 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
9440 		return (EIO);
9441 
9442 	/*
9443 	 * Because we stuff the readdir cookie into the offset field
9444 	 * someone may attempt to do an lseek with the cookie which
9445 	 * we want to succeed.
9446 	 */
9447 	if (vp->v_type == VDIR)
9448 		return (0);
9449 	if (*noffp < 0)
9450 		return (EINVAL);
9451 	return (0);
9452 }
9453 
9454 
9455 /*
9456  * Return all the pages from [off..off+len) in file
9457  */
9458 static int
9459 nfs4_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
9460 	page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
9461 	enum seg_rw rw, cred_t *cr)
9462 {
9463 	rnode4_t *rp;
9464 	int error;
9465 	mntinfo4_t *mi;
9466 
9467 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
9468 		return (EIO);
9469 	rp = VTOR4(vp);
9470 	if (IS_SHADOW(vp, rp))
9471 		vp = RTOV4(rp);
9472 
9473 	if (vp->v_flag & VNOMAP)
9474 		return (ENOSYS);
9475 
9476 	if (protp != NULL)
9477 		*protp = PROT_ALL;
9478 
9479 	/*
9480 	 * Now validate that the caches are up to date.
9481 	 */
9482 	if (error = nfs4_validate_caches(vp, cr))
9483 		return (error);
9484 
9485 	mi = VTOMI4(vp);
9486 retry:
9487 	mutex_enter(&rp->r_statelock);
9488 
9489 	/*
9490 	 * Don't create dirty pages faster than they
9491 	 * can be cleaned so that the system doesn't
9492 	 * get imbalanced.  If the async queue is
9493 	 * maxed out, then wait for it to drain before
9494 	 * creating more dirty pages.  Also, wait for
9495 	 * any threads doing pagewalks in the vop_getattr
9496 	 * entry points so that they don't block for
9497 	 * long periods.
9498 	 */
9499 	if (rw == S_CREATE) {
9500 		while ((mi->mi_max_threads != 0 &&
9501 			rp->r_awcount > 2 * mi->mi_max_threads) ||
9502 			rp->r_gcount > 0)
9503 			cv_wait(&rp->r_cv, &rp->r_statelock);
9504 	}
9505 
9506 	/*
9507 	 * If we are getting called as a side effect of an nfs_write()
9508 	 * operation the local file size might not be extended yet.
9509 	 * In this case we want to be able to return pages of zeroes.
9510 	 */
9511 	if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) {
9512 		NFS4_DEBUG(nfs4_pageio_debug,
9513 		    (CE_NOTE, "getpage beyond EOF: off=%lld, "
9514 		    "len=%llu, size=%llu, attrsize =%llu", off,
9515 		    (u_longlong_t)len, rp->r_size, rp->r_attr.va_size));
9516 		mutex_exit(&rp->r_statelock);
9517 		return (EFAULT);		/* beyond EOF */
9518 	}
9519 
9520 	mutex_exit(&rp->r_statelock);
9521 
9522 	if (len <= PAGESIZE) {
9523 		error = nfs4_getapage(vp, off, len, protp, pl, plsz,
9524 		    seg, addr, rw, cr);
9525 		NFS4_DEBUG(nfs4_pageio_debug && error,
9526 			(CE_NOTE, "getpage error %d; off=%lld, "
9527 			"len=%lld", error, off, (u_longlong_t)len));
9528 	} else {
9529 		error = pvn_getpages(nfs4_getapage, vp, off, len, protp,
9530 		    pl, plsz, seg, addr, rw, cr);
9531 		NFS4_DEBUG(nfs4_pageio_debug && error,
9532 			(CE_NOTE, "getpages error %d; off=%lld, "
9533 			"len=%lld", error, off, (u_longlong_t)len));
9534 	}
9535 
9536 	switch (error) {
9537 	case NFS_EOF:
9538 		nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, FALSE);
9539 		goto retry;
9540 	case ESTALE:
9541 		nfs4_purge_stale_fh(error, vp, cr);
9542 	}
9543 
9544 	return (error);
9545 }
9546 
9547 /*
9548  * Called from pvn_getpages or nfs4_getpage to get a particular page.
 *
 * Summary of what the body does:
 *  - If the request is block aligned, is not S_CREATE and the vnode is not
 *    VNOCACHE, queue asynchronous readaheads first (count chosen from
 *    r_nextr and nfs4_nra).
 *  - If the page at "off" is already cached, lock it with page_lookup()
 *    and return it in pl[0].
 *  - Otherwise: with pl == NULL only an async readahead of the containing
 *    block is queued; with rw == S_CREATE a new page is created without
 *    any server read; in every other case pages are klustered with
 *    pvn_read_kluster() and filled by nfs4_bio() (or zeroed when the I/O
 *    starts at/after r_size and the request came through segkmap).
 *
 * Returns 0 on success, EIO from the wrong zone, EFAULT when the read hit
 * NFS_EOF and seg is not segkmap, or the error from nfs4_bio().  On error
 * any pages obtained here are released with pvn_read_done(pp, B_ERROR).
9549  */
9550 /* ARGSUSED */
9551 static int
9552 nfs4_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp,
9553 	page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
9554 	enum seg_rw rw, cred_t *cr)
9555 {
9556 	rnode4_t *rp;
9557 	uint_t bsize;
9558 	struct buf *bp;
9559 	page_t *pp;
9560 	u_offset_t lbn;
9561 	u_offset_t io_off;
9562 	u_offset_t blkoff;
9563 	u_offset_t rablkoff;
9564 	size_t io_len;
9565 	uint_t blksize;
9566 	int error;
9567 	int readahead;
9568 	int readahead_issued = 0;
9569 	int ra_window; /* readahead window */
9570 	page_t *pagefound;
9571 	page_t *savepp;
9572 
9573 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
9574 		return (EIO);
9575 
9576 	rp = VTOR4(vp);
9577 	ASSERT(!IS_SHADOW(vp, rp));
9578 	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
9579 
	/*
	 * Restart point used when a page that page_exists() saw could not
	 * be locked by page_lookup() further down; per-attempt state is
	 * reset here (readahead_issued is deliberately not).
	 */
9580 reread:
9581 	bp = NULL;
9582 	pp = NULL;
9583 	pagefound = NULL;
9584 
9585 	if (pl != NULL)
9586 		pl[0] = NULL;
9587 
9588 	error = 0;
	/* blkoff is "off" rounded down to a bsize boundary. */
9589 	lbn = off / bsize;
9590 	blkoff = lbn * bsize;
9591 
9592 	/*
9593 	 * Queueing up the readahead before doing the synchronous read
9594 	 * results in a significant increase in read throughput because
9595 	 * of the increased parallelism between the async threads and
9596 	 * the process context.
9597 	 */
9598 	if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 &&
9599 	    rw != S_CREATE &&
9600 	    !(vp->v_flag & VNOCACHE)) {
9601 		mutex_enter(&rp->r_statelock);
9602 
9603 		/*
9604 		 * Calculate the number of readaheads to do.
9605 		 * a) No readaheads at offset = 0.
9606 		 * b) Do maximum(nfs4_nra) readaheads when the readahead
9607 		 *    window is closed.
9608 		 * c) Do readaheads between 1 to (nfs4_nra - 1) depending
9609 		 *    upon how far the readahead window is open or close.
9610 		 * d) No readaheads if rp->r_nextr is not within the scope
9611 		 *    of the readahead window (random i/o).
9612 		 */
9613 
9614 		if (off == 0)
9615 			readahead = 0;
9616 		else if (blkoff == rp->r_nextr)
9617 			readahead = nfs4_nra;
9618 		else if (rp->r_nextr > blkoff &&
9619 			((ra_window = (rp->r_nextr - blkoff) / bsize)
9620 					<= (nfs4_nra - 1)))
9621 			readahead = nfs4_nra - ra_window;
9622 		else
9623 			readahead = 0;
9624 
		/*
		 * Queue readaheads one block at a time starting just past
		 * r_nextr.  r_statelock is dropped around each
		 * nfs4_async_readahead() call and retaken afterwards, so it
		 * is held again on both loop exits.
		 */
9625 		rablkoff = rp->r_nextr;
9626 		while (readahead > 0 && rablkoff + bsize < rp->r_size) {
9627 			mutex_exit(&rp->r_statelock);
9628 			if (nfs4_async_readahead(vp, rablkoff + bsize,
9629 			    addr + (rablkoff + bsize - off),
9630 			    seg, cr, nfs4_readahead) < 0) {
9631 				mutex_enter(&rp->r_statelock);
9632 				break;
9633 			}
9634 			readahead--;
9635 			rablkoff += bsize;
9636 			/*
9637 			 * Indicate that we did a readahead so
9638 			 * readahead offset is not updated
9639 			 * by the synchronous read below.
9640 			 */
9641 			readahead_issued = 1;
9642 			mutex_enter(&rp->r_statelock);
9643 			/*
9644 			 * set readahead offset to
9645 			 * offset of last async readahead
9646 			 * request.
9647 			 */
9648 			rp->r_nextr = rablkoff;
9649 		}
9650 		mutex_exit(&rp->r_statelock);
9651 	}
9652 
	/* Retry point when pvn_read_kluster() comes back empty-handed. */
9653 again:
9654 	if ((pagefound = page_exists(vp, off)) == NULL) {
9655 		if (pl == NULL) {
			/*
			 * No page list supplied: nothing can be returned,
			 * so just queue an async read of the containing
			 * block; its result is ignored.
			 */
9656 			(void) nfs4_async_readahead(vp, blkoff, addr, seg, cr,
9657 			    nfs4_readahead);
9658 		} else if (rw == S_CREATE) {
9659 			/*
9660 			 * Block for this page is not allocated, or the offset
9661 			 * is beyond the current allocation size, or we're
9662 			 * allocating a swap slot and the page was not found,
9663 			 * so allocate it and return a zero page.
9664 			 */
9665 			if ((pp = page_create_va(vp, off,
9666 			    PAGESIZE, PG_WAIT, seg, addr)) == NULL)
9667 				cmn_err(CE_PANIC, "nfs4_getapage: page_create");
9668 			io_len = PAGESIZE;
9669 			mutex_enter(&rp->r_statelock);
9670 			rp->r_nextr = off + PAGESIZE;
9671 			mutex_exit(&rp->r_statelock);
9672 		} else {
9673 			/*
9674 			 * Need to go to server to get a block
9675 			 */
9676 			mutex_enter(&rp->r_statelock);
9677 			if (blkoff < rp->r_size &&
9678 			    blkoff + bsize > rp->r_size) {
9679 				/*
9680 				 * If less than a block left in
9681 				 * file read less than a block.
9682 				 */
9683 				if (rp->r_size <= off) {
9684 					/*
9685 					 * Trying to access beyond EOF,
9686 					 * set up to get at least one page.
9687 					 */
9688 					blksize = off + PAGESIZE - blkoff;
9689 				} else
9690 					blksize = rp->r_size - blkoff;
9691 			} else if ((off == 0) ||
9692 				(off != rp->r_nextr && !readahead_issued)) {
				/*
				 * Start of file, or an access that does not
				 * continue from r_nextr with no readahead
				 * queued: fetch a single page only.
				 */
9693 				blksize = PAGESIZE;
9694 				blkoff = off; /* block = page here */
9695 			} else
9696 				blksize = bsize;
9697 			mutex_exit(&rp->r_statelock);
9698 
9699 			pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
9700 			    &io_len, blkoff, blksize, 0);
9701 
9702 			/*
9703 			 * Some other thread has entered the page,
9704 			 * so just use it.
9705 			 */
9706 			if (pp == NULL)
9707 				goto again;
9708 
9709 			/*
9710 			 * Now round the request size up to page boundaries.
9711 			 * This ensures that the entire page will be
9712 			 * initialized to zeroes if EOF is encountered.
9713 			 */
9714 			io_len = ptob(btopr(io_len));
9715 
9716 			bp = pageio_setup(pp, io_len, vp, B_READ);
9717 			ASSERT(bp != NULL);
9718 
9719 			/*
9720 			 * pageio_setup should have set b_addr to 0.  This
9721 			 * is correct since we want to do I/O on a page
9722 			 * boundary.  bp_mapin will use this addr to calculate
9723 			 * an offset, and then set b_addr to the kernel virtual
9724 			 * address it allocated for us.
9725 			 */
9726 			ASSERT(bp->b_un.b_addr == 0);
9727 
9728 			bp->b_edev = 0;
9729 			bp->b_dev = 0;
9730 			bp->b_lblkno = lbtodb(io_off);
9731 			bp->b_file = vp;
9732 			bp->b_offset = (offset_t)off;
9733 			bp_mapin(bp);
9734 
9735 			/*
9736 			 * If doing a write beyond what we believe is EOF,
9737 			 * don't bother trying to read the pages from the
9738 			 * server, we'll just zero the pages here.  We
9739 			 * don't check that the rw flag is S_WRITE here
9740 			 * because some implementations may attempt a
9741 			 * read access to the buffer before copying data.
9742 			 */
9743 			mutex_enter(&rp->r_statelock);
9744 			if (io_off >= rp->r_size && seg == segkmap) {
9745 				mutex_exit(&rp->r_statelock);
9746 				bzero(bp->b_un.b_addr, io_len);
9747 			} else {
9748 				mutex_exit(&rp->r_statelock);
9749 				error = nfs4_bio(bp, NULL, cr, FALSE);
9750 			}
9751 
9752 			/*
9753 			 * Unmap the buffer before freeing it.
9754 			 */
9755 			bp_mapout(bp);
9756 			pageio_done(bp);
9757 
			/*
			 * Walk the circular page list once, tagging every
			 * page C_NOCOMMIT; pp ends up back at the first page.
			 */
9758 			savepp = pp;
9759 			do {
9760 				pp->p_fsdata = C_NOCOMMIT;
9761 			} while ((pp = pp->p_next) != savepp);
9762 
9763 			if (error == NFS_EOF) {
9764 				/*
9765 				 * If doing a write system call just return
9766 				 * zeroed pages, else user tried to get pages
9767 				 * beyond EOF, return error.  We don't check
9768 				 * that the rw flag is S_WRITE here because
9769 				 * some implementations may attempt a read
9770 				 * access to the buffer before copying data.
9771 				 */
9772 				if (seg == segkmap)
9773 					error = 0;
9774 				else
9775 					error = EFAULT;
9776 			}
9777 
			/*
			 * No async readahead moved r_nextr above, so record
			 * where this synchronous read ended.
			 */
9778 			if (!readahead_issued && !error) {
9779 				mutex_enter(&rp->r_statelock);
9780 				rp->r_nextr = io_off + io_len;
9781 				mutex_exit(&rp->r_statelock);
9782 			}
9783 		}
9784 	}
9785 
	/* Note: no goto in this function targets the "out" label. */
9786 out:
9787 	if (pl == NULL)
9788 		return (error);
9789 
9790 	if (error) {
9791 		if (pp != NULL)
9792 			pvn_read_done(pp, B_ERROR);
9793 		return (error);
9794 	}
9795 
9796 	if (pagefound) {
		/* S_CREATE wants the page exclusively; others share it. */
9797 		se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED);
9798 
9799 		/*
9800 		 * Page exists in the cache, acquire the appropriate lock.
9801 		 * If this fails, start all over again.
9802 		 */
9803 		if ((pp = page_lookup(vp, off, se)) == NULL) {
9804 #ifdef DEBUG
9805 			nfs4_lostpage++;
9806 #endif
9807 			goto reread;
9808 		}
9809 		pl[0] = pp;
9810 		pl[1] = NULL;
9811 		return (0);
9812 	}
9813 
	/* Hand the newly created or newly read pages back through pl[]. */
9814 	if (pp != NULL)
9815 		pvn_plist_init(pp, pl, plsz, off, io_len, rw);
9816 
9817 	return (error);
9818 }
9819 
9820 static void
9821 nfs4_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg,
9822 	cred_t *cr)
9823 {
9824 	int error;
9825 	page_t *pp;
9826 	u_offset_t io_off;
9827 	size_t io_len;
9828 	struct buf *bp;
9829 	uint_t bsize, blksize;
9830 	rnode4_t *rp = VTOR4(vp);
9831 	page_t *savepp;
9832 
9833 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
9834 
9835 	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
9836 
9837 	mutex_enter(&rp->r_statelock);
9838 	if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) {
9839 		/*
9840 		 * If less than a block left in file read less
9841 		 * than a block.
9842 		 */
9843 		blksize = rp->r_size - blkoff;
9844 	} else
9845 		blksize = bsize;
9846 	mutex_exit(&rp->r_statelock);
9847 
9848 	pp = pvn_read_kluster(vp, blkoff, segkmap, addr,
9849 	    &io_off, &io_len, blkoff, blksize, 1);
9850 	/*
9851 	 * The isra flag passed to the kluster function is 1, we may have
9852 	 * gotten a return value of NULL for a variety of reasons (# of free
9853 	 * pages < minfree, someone entered the page on the vnode etc). In all
9854 	 * cases, we want to punt on the readahead.
9855 	 */
9856 	if (pp == NULL)
9857 		return;
9858 
9859 	/*
9860 	 * Now round the request size up to page boundaries.
9861 	 * This ensures that the entire page will be
9862 	 * initialized to zeroes if EOF is encountered.
9863 	 */
9864 	io_len = ptob(btopr(io_len));
9865 
9866 	bp = pageio_setup(pp, io_len, vp, B_READ);
9867 	ASSERT(bp != NULL);
9868 
9869 	/*
9870 	 * pageio_setup should have set b_addr to 0.  This is correct since
9871 	 * we want to do I/O on a page boundary. bp_mapin() will use this addr
9872 	 * to calculate an offset, and then set b_addr to the kernel virtual
9873 	 * address it allocated for us.
9874 	 */
9875 	ASSERT(bp->b_un.b_addr == 0);
9876 
9877 	bp->b_edev = 0;
9878 	bp->b_dev = 0;
9879 	bp->b_lblkno = lbtodb(io_off);
9880 	bp->b_file = vp;
9881 	bp->b_offset = (offset_t)blkoff;
9882 	bp_mapin(bp);
9883 
9884 	/*
9885 	 * If doing a write beyond what we believe is EOF, don't bother trying
9886 	 * to read the pages from the server, we'll just zero the pages here.
9887 	 * We don't check that the rw flag is S_WRITE here because some
9888 	 * implementations may attempt a read access to the buffer before
9889 	 * copying data.
9890 	 */
9891 	mutex_enter(&rp->r_statelock);
9892 	if (io_off >= rp->r_size && seg == segkmap) {
9893 		mutex_exit(&rp->r_statelock);
9894 		bzero(bp->b_un.b_addr, io_len);
9895 		error = 0;
9896 	} else {
9897 		mutex_exit(&rp->r_statelock);
9898 		error = nfs4_bio(bp, NULL, cr, TRUE);
9899 		if (error == NFS_EOF)
9900 			error = 0;
9901 	}
9902 
9903 	/*
9904 	 * Unmap the buffer before freeing it.
9905 	 */
9906 	bp_mapout(bp);
9907 	pageio_done(bp);
9908 
9909 	savepp = pp;
9910 	do {
9911 		pp->p_fsdata = C_NOCOMMIT;
9912 	} while ((pp = pp->p_next) != savepp);
9913 
9914 	pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ);
9915 
9916 	/*
9917 	 * In case of error set readahead offset
9918 	 * to the lowest offset.
9919 	 * pvn_read_done() calls VN_DISPOSE to destroy the pages
9920 	 */
9921 	if (error && rp->r_nextr > io_off) {
9922 		mutex_enter(&rp->r_statelock);
9923 		if (rp->r_nextr > io_off)
9924 			rp->r_nextr = io_off;
9925 		mutex_exit(&rp->r_statelock);
9926 	}
9927 }
9928 
9929 /*
9930  * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE}
9931  * If len == 0, do from off to EOF.
9932  *
9933  * The normal cases should be len == 0 && off == 0 (entire vp list) or
9934  * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
9935  * (from pageout).
9936  */
9937 static int
9938 nfs4_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr)
9939 {
9940 	int error;
9941 	rnode4_t *rp;
9942 
9943 	ASSERT(cr != NULL);
9944 
9945 	if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone)
9946 		return (EIO);
9947 
9948 	rp = VTOR4(vp);
9949 	if (IS_SHADOW(vp, rp))
9950 		vp = RTOV4(rp);
9951 
9952 	/*
9953 	 * XXX - Why should this check be made here?
9954 	 */
9955 	if (vp->v_flag & VNOMAP)
9956 		return (ENOSYS);
9957 
9958 	if (len == 0 && !(flags & B_INVAL) &&
9959 	    (vp->v_vfsp->vfs_flag & VFS_RDONLY))
9960 		return (0);
9961 
9962 	mutex_enter(&rp->r_statelock);
9963 	rp->r_count++;
9964 	mutex_exit(&rp->r_statelock);
9965 	error = nfs4_putpages(vp, off, len, flags, cr);
9966 	mutex_enter(&rp->r_statelock);
9967 	rp->r_count--;
9968 	cv_broadcast(&rp->r_cv);
9969 	mutex_exit(&rp->r_statelock);
9970 
9971 	return (error);
9972 }
9973 
9974 /*
9975  * Write out a single page, possibly klustering adjacent dirty pages.
9976  */
9977 int
9978 nfs4_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
9979 	int flags, cred_t *cr)
9980 {
9981 	u_offset_t io_off;
9982 	u_offset_t lbn_off;
9983 	u_offset_t lbn;
9984 	size_t io_len;
9985 	uint_t bsize;
9986 	int error;
9987 	rnode4_t *rp;
9988 
9989 	ASSERT(!(vp->v_vfsp->vfs_flag & VFS_RDONLY));
9990 	ASSERT(pp != NULL);
9991 	ASSERT(cr != NULL);
9992 	ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI4(vp)->mi_zone);
9993 
9994 	rp = VTOR4(vp);
9995 	ASSERT(rp->r_count > 0);
9996 	ASSERT(!IS_SHADOW(vp, rp));
9997 
9998 	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
9999 	lbn = pp->p_offset / bsize;
10000 	lbn_off = lbn * bsize;
10001 
10002 	/*
10003 	 * Find a kluster that fits in one block, or in
10004 	 * one page if pages are bigger than blocks.  If
10005 	 * there is less file space allocated than a whole
10006 	 * page, we'll shorten the i/o request below.
10007 	 */
10008 	pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off,
10009 	    roundup(bsize, PAGESIZE), flags);
10010 
10011 	/*
10012 	 * pvn_write_kluster shouldn't have returned a page with offset
10013 	 * behind the original page we were given.  Verify that.
10014 	 */
10015 	ASSERT((pp->p_offset / bsize) >= lbn);
10016 
10017 	/*
10018 	 * Now pp will have the list of kept dirty pages marked for
10019 	 * write back.  It will also handle invalidation and freeing
10020 	 * of pages that are not dirty.  Check for page length rounding
10021 	 * problems.
10022 	 */
10023 	if (io_off + io_len > lbn_off + bsize) {
10024 		ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE);
10025 		io_len = lbn_off + bsize - io_off;
10026 	}
10027 	/*
10028 	 * The R4MODINPROGRESS flag makes sure that nfs4_bio() sees a
10029 	 * consistent value of r_size. R4MODINPROGRESS is set in writerp4().
10030 	 * When R4MODINPROGRESS is set it indicates that a uiomove() is in
10031 	 * progress and the r_size has not been made consistent with the
10032 	 * new size of the file. When the uiomove() completes the r_size is
10033 	 * updated and the R4MODINPROGRESS flag is cleared.
10034 	 *
10035 	 * The R4MODINPROGRESS flag makes sure that nfs4_bio() sees a
10036 	 * consistent value of r_size. Without this handshaking, it is
10037 	 * possible that nfs4_bio() picks  up the old value of r_size
10038 	 * before the uiomove() in writerp4() completes. This will result
10039 	 * in the write through nfs4_bio() being dropped.
10040 	 *
10041 	 * More precisely, there is a window between the time the uiomove()
10042 	 * completes and the time the r_size is updated. If a VOP_PUTPAGE()
10043 	 * operation intervenes in this window, the page will be picked up,
10044 	 * because it is dirty (it will be unlocked, unless it was
10045 	 * pagecreate'd). When the page is picked up as dirty, the dirty
10046 	 * bit is reset (pvn_getdirty()). In nfs4write(), r_size is
10047 	 * checked. This will still be the old size. Therefore the page will
10048 	 * not be written out. When segmap_release() calls VOP_PUTPAGE(),
10049 	 * the page will be found to be clean and the write will be dropped.
10050 	 */
10051 	if (rp->r_flags & R4MODINPROGRESS) {
10052 		mutex_enter(&rp->r_statelock);
10053 		if ((rp->r_flags & R4MODINPROGRESS) &&
10054 		    rp->r_modaddr + MAXBSIZE > io_off &&
10055 		    rp->r_modaddr < io_off + io_len) {
10056 			page_t *plist;
10057 			/*
10058 			 * A write is in progress for this region of the file.
10059 			 * If we did not detect R4MODINPROGRESS here then this
10060 			 * path through nfs_putapage() would eventually go to
10061 			 * nfs4_bio() and may not write out all of the data
10062 			 * in the pages. We end up losing data. So we decide
10063 			 * to set the modified bit on each page in the page
10064 			 * list and mark the rnode with R4DIRTY. This write
10065 			 * will be restarted at some later time.
10066 			 */
10067 			plist = pp;
10068 			while (plist != NULL) {
10069 				pp = plist;
10070 				page_sub(&plist, pp);
10071 				hat_setmod(pp);
10072 				page_io_unlock(pp);
10073 				page_unlock(pp);
10074 			}
10075 			rp->r_flags |= R4DIRTY;
10076 			mutex_exit(&rp->r_statelock);
10077 			if (offp)
10078 				*offp = io_off;
10079 			if (lenp)
10080 				*lenp = io_len;
10081 			return (0);
10082 		}
10083 		mutex_exit(&rp->r_statelock);
10084 	}
10085 
10086 	if (flags & B_ASYNC) {
10087 		error = nfs4_async_putapage(vp, pp, io_off, io_len, flags, cr,
10088 		    nfs4_sync_putapage);
10089 	} else
10090 		error = nfs4_sync_putapage(vp, pp, io_off, io_len, flags, cr);
10091 
10092 	if (offp)
10093 		*offp = io_off;
10094 	if (lenp)
10095 		*lenp = io_len;
10096 	return (error);
10097 }
10098 
10099 static int
10100 nfs4_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
10101 	int flags, cred_t *cr)
10102 {
10103 	int error;
10104 	rnode4_t *rp;
10105 
10106 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
10107 
10108 	flags |= B_WRITE;
10109 
10110 	error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
10111 
10112 	rp = VTOR4(vp);
10113 
10114 	if ((error == ENOSPC || error == EDQUOT || error == EFBIG ||
10115 	    error == EACCES) &&
10116 	    (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) {
10117 		if (!(rp->r_flags & R4OUTOFSPACE)) {
10118 			mutex_enter(&rp->r_statelock);
10119 			rp->r_flags |= R4OUTOFSPACE;
10120 			mutex_exit(&rp->r_statelock);
10121 		}
10122 		flags |= B_ERROR;
10123 		pvn_write_done(pp, flags);
10124 		/*
10125 		 * If this was not an async thread, then try again to
10126 		 * write out the pages, but this time, also destroy
10127 		 * them whether or not the write is successful.  This
10128 		 * will prevent memory from filling up with these
10129 		 * pages and destroying them is the only alternative
10130 		 * if they can't be written out.
10131 		 *
10132 		 * Don't do this if this is an async thread because
10133 		 * when the pages are unlocked in pvn_write_done,
10134 		 * some other thread could have come along, locked
10135 		 * them, and queued for an async thread.  It would be
10136 		 * possible for all of the async threads to be tied
10137 		 * up waiting to lock the pages again and they would
10138 		 * all already be locked and waiting for an async
10139 		 * thread to handle them.  Deadlock.
10140 		 */
10141 		if (!(flags & B_ASYNC)) {
10142 			error = nfs4_putpage(vp, io_off, io_len,
10143 			    B_INVAL | B_FORCE, cr);
10144 		}
10145 	} else {
10146 		if (error)
10147 			flags |= B_ERROR;
10148 		else if (rp->r_flags & R4OUTOFSPACE) {
10149 			mutex_enter(&rp->r_statelock);
10150 			rp->r_flags &= ~R4OUTOFSPACE;
10151 			mutex_exit(&rp->r_statelock);
10152 		}
10153 		pvn_write_done(pp, flags);
10154 		if (freemem < desfree)
10155 			(void) nfs4_commit_vp(vp, (u_offset_t)0, 0, cr,
10156 					NFS4_WRITE_NOWAIT);
10157 	}
10158 
10159 	return (error);
10160 }
10161 
10162 #ifdef DEBUG
10163 int nfs4_force_open_before_mmap = 0;
10164 #endif
10165 
10166 static int
10167 nfs4_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
10168 	size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr)
10169 {
10170 	struct segvn_crargs vn_a;
10171 	int error = 0;
10172 	rnode4_t *rp = VTOR4(vp);
10173 	mntinfo4_t *mi = VTOMI4(vp);
10174 
10175 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
10176 		return (EIO);
10177 
10178 	if (vp->v_flag & VNOMAP)
10179 		return (ENOSYS);
10180 
10181 	if (off < 0 || (off + len) < 0)
10182 		return (ENXIO);
10183 
10184 	if (vp->v_type != VREG)
10185 		return (ENODEV);
10186 
10187 	/*
10188 	 * If the file is delegated to the client don't do anything.
10189 	 * If the file is not delegated, then validate the data cache.
10190 	 */
10191 	mutex_enter(&rp->r_statev4_lock);
10192 	if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
10193 		mutex_exit(&rp->r_statev4_lock);
10194 		error = nfs4_validate_caches(vp, cr);
10195 		if (error)
10196 			return (error);
10197 	} else {
10198 		mutex_exit(&rp->r_statev4_lock);
10199 	}
10200 
10201 	/*
10202 	 * Check to see if the vnode is currently marked as not cachable.
10203 	 * This means portions of the file are locked (through VOP_FRLOCK).
10204 	 * In this case the map request must be refused.  We use
10205 	 * rp->r_lkserlock to avoid a race with concurrent lock requests.
10206 	 */
10207 	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp)))
10208 		return (EINTR);
10209 
10210 	if (vp->v_flag & VNOCACHE) {
10211 		error = EAGAIN;
10212 		goto done;
10213 	}
10214 
10215 	/*
10216 	 * Don't allow concurrent locks and mapping if mandatory locking is
10217 	 * enabled.
10218 	 */
10219 	if (flk_has_remote_locks(vp)) {
10220 		struct vattr va;
10221 		va.va_mask = AT_MODE;
10222 		error = nfs4getattr(vp, &va, cr);
10223 		if (error != 0)
10224 			goto done;
10225 		if (MANDLOCK(vp, va.va_mode)) {
10226 			error = EAGAIN;
10227 			goto done;
10228 		}
10229 	}
10230 
10231 	/*
10232 	 * It is possible that the rnode has a lost lock request that we
10233 	 * are still trying to recover, and that the request conflicts with
10234 	 * this map request.
10235 	 *
10236 	 * An alternative approach would be for nfs4_safemap() to consider
10237 	 * queued lock requests when deciding whether to set or clear
10238 	 * VNOCACHE.  This would require the frlock code path to call
10239 	 * nfs4_safemap() after enqueing a lost request.
10240 	 */
10241 	if (nfs4_map_lost_lock_conflict(vp)) {
10242 		error = EAGAIN;
10243 		goto done;
10244 	}
10245 
10246 	as_rangelock(as);
10247 	if (!(flags & MAP_FIXED)) {
10248 		map_addr(addrp, len, off, 1, flags);
10249 		if (*addrp == NULL) {
10250 			as_rangeunlock(as);
10251 			error = ENOMEM;
10252 			goto done;
10253 		}
10254 	} else {
10255 		/*
10256 		 * User specified address - blow away any previous mappings
10257 		 */
10258 		(void) as_unmap(as, *addrp, len);
10259 	}
10260 
10261 	if (vp->v_type == VREG) {
10262 		/*
10263 		 * We need to retrieve the open stream
10264 		 */
10265 		nfs4_open_stream_t	*osp = NULL;
10266 		nfs4_open_owner_t	*oop = NULL;
10267 
10268 		oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
10269 		if (oop != NULL) {
10270 			/* returns with 'os_sync_lock' held */
10271 			osp = find_open_stream(oop, rp);
10272 			open_owner_rele(oop);
10273 		}
10274 		if (osp == NULL) {
10275 #ifdef DEBUG
10276 			if (nfs4_force_open_before_mmap) {
10277 				error = EIO;
10278 				goto done;
10279 			}
10280 #endif
10281 			/* returns with 'os_sync_lock' held */
10282 			osp = open_and_get_osp(vp, cr, mi);
10283 			if (osp == NULL) {
10284 				NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE,
10285 				    "nfs4_map: we tried to OPEN the file "
10286 				    "but again no osp, so fail with EIO"));
10287 				error = EIO;
10288 				goto done;
10289 			}
10290 		}
10291 
10292 		if (osp->os_failed_reopen) {
10293 			mutex_exit(&osp->os_sync_lock);
10294 			open_stream_rele(osp, rp);
10295 			NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE,
10296 			    "nfs4_map: os_failed_reopen set on "
10297 			    "osp %p, cr %p, rp %s", (void *)osp,
10298 			    (void *)cr, rnode4info(rp)));
10299 			error = EIO;
10300 			goto done;
10301 		}
10302 		mutex_exit(&osp->os_sync_lock);
10303 		open_stream_rele(osp, rp);
10304 	}
10305 
10306 	vn_a.vp = vp;
10307 	vn_a.offset = off;
10308 	vn_a.type = (flags & MAP_TYPE);
10309 	vn_a.prot = (uchar_t)prot;
10310 	vn_a.maxprot = (uchar_t)maxprot;
10311 	vn_a.flags = (flags & ~MAP_TYPE);
10312 	vn_a.cred = cr;
10313 	vn_a.amp = NULL;
10314 	vn_a.szc = 0;
10315 	vn_a.lgrp_mem_policy_flags = 0;
10316 
10317 	error = as_map(as, *addrp, len, segvn_create, &vn_a);
10318 	as_rangeunlock(as);
10319 
10320 done:
10321 	nfs_rw_exit(&rp->r_lkserlock);
10322 	return (error);
10323 }
10324 
10325 /*
10326  * We're most likely dealing with a kernel module that likes to READ
10327  * and mmap without OPENing the file (ie: lookup/read/mmap), so lets
10328  * officially OPEN the file to create the necessary client state
10329  * for bookkeeping of os_mmap_read/write counts.
10330  *
10331  * Since VOP_MAP only passes in a pointer to the vnode rather than
10332  * a double pointer, we can't handle the case where nfs4open_otw()
10333  * returns a different vnode than the one passed into VOP_MAP (since
10334  * VOP_DELMAP will not see the vnode nfs4open_otw used).  In this case,
10335  * we return NULL and let nfs4_map() fail.  Note: the only case where
10336  * this should happen is if the file got removed and replaced with the
10337  * same name on the server (in addition to the fact that we're trying
10338  * to VOP_MAP withouth VOP_OPENing the file in the first place).
10339  */
10340 static nfs4_open_stream_t *
10341 open_and_get_osp(vnode_t *map_vp, cred_t *cr, mntinfo4_t *mi)
10342 {
10343 	rnode4_t		*rp, *drp;
10344 	vnode_t			*dvp, *open_vp;
10345 	char			*file_name;
10346 	int			just_created;
10347 	nfs4_sharedfh_t		*sfh;
10348 	nfs4_open_stream_t	*osp;
10349 	nfs4_open_owner_t	*oop;
10350 
10351 	open_vp = map_vp;
10352 	sfh = (open_vp->v_flag & VROOT) ? mi->mi_srvparentfh :
10353 				VTOSV(open_vp)->sv_dfh;
10354 	drp = r4find_unlocked(sfh, open_vp->v_vfsp);
10355 	if (!drp)
10356 		return (NULL);
10357 
10358 	file_name = fn_name(VTOSV(open_vp)->sv_name);
10359 
10360 	rp = VTOR4(open_vp);
10361 	dvp = RTOV4(drp);
10362 	mutex_enter(&rp->r_statev4_lock);
10363 	if (rp->created_v4) {
10364 		rp->created_v4 = 0;
10365 		mutex_exit(&rp->r_statev4_lock);
10366 
10367 		dnlc_update(dvp, file_name, open_vp);
10368 		/* This is needed so we don't bump the open ref count */
10369 		just_created = 1;
10370 	} else {
10371 		mutex_exit(&rp->r_statev4_lock);
10372 		just_created = 0;
10373 	}
10374 
10375 	VN_HOLD(map_vp);
10376 
10377 	if (nfs4open_otw(dvp, file_name, NULL, &open_vp, cr, 0, FREAD, 0,
10378 	    just_created)) {
10379 		kmem_free(file_name, MAXNAMELEN);
10380 		VN_RELE(dvp);
10381 		VN_RELE(map_vp);
10382 		return (NULL);
10383 	}
10384 
10385 	kmem_free(file_name, MAXNAMELEN);
10386 	VN_RELE(dvp);
10387 
10388 	/*
10389 	 * If nfs4open_otw() returned a different vnode then "undo"
10390 	 * the open and return failure to the caller.
10391 	 */
10392 	if (!VN_CMP(open_vp, map_vp)) {
10393 		nfs4_error_t e;
10394 
10395 		NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: "
10396 		    "open returned a different vnode"));
10397 		/*
10398 		 * If there's an error, ignore it,
10399 		 * and let VOP_INACTIVE handle it.
10400 		 */
10401 		(void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e,
10402 				CLOSE_NORM, 0, 0, 0);
10403 		VN_RELE(map_vp);
10404 		return (NULL);
10405 	}
10406 
10407 	VN_RELE(map_vp);
10408 
10409 	oop = find_open_owner(cr, NFS4_PERM_CREATED, VTOMI4(open_vp));
10410 	if (!oop) {
10411 		nfs4_error_t e;
10412 
10413 		NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: "
10414 		    "no open owner"));
10415 		/*
10416 		 * If there's an error, ignore it,
10417 		 * and let VOP_INACTIVE handle it.
10418 		 */
10419 		(void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e,
10420 				CLOSE_NORM, 0, 0, 0);
10421 		return (NULL);
10422 	}
10423 	osp = find_open_stream(oop, rp);
10424 	open_owner_rele(oop);
10425 	return (osp);
10426 }
10427 
10428 /*
10429  * Please be aware that when this function is called, the address space write
10430  * a_lock is held.  Do not put over the wire calls in this function.
10431  */
10432 /* ARGSUSED */
10433 static int
10434 nfs4_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
10435 	size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr)
10436 {
10437 	rnode4_t		*rp;
10438 	int			error = 0;
10439 	mntinfo4_t		*mi;
10440 
10441 	mi = VTOMI4(vp);
10442 	rp = VTOR4(vp);
10443 
10444 	if (nfs_zone() != mi->mi_zone)
10445 		return (EIO);
10446 	if (vp->v_flag & VNOMAP)
10447 		return (ENOSYS);
10448 
10449 	/*
10450 	 * Need to hold rwlock while incrementing the mapcnt so that
10451 	 * mmap'ing can be serialized with writes so that the caching
10452 	 * can be handled correctly.
10453 	 *
10454 	 * Don't need to update the open stream first, since this
10455 	 * mmap can't add any additional share access that isn't
10456 	 * already contained in the open stream (for the case where we
10457 	 * open/mmap/only update rp->r_mapcnt/server reboots/reopen doesn't
10458 	 * take into account os_mmap_read[write] counts).
10459 	 */
10460 	if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp)))
10461 		return (EINTR);
10462 	atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len));
10463 	nfs_rw_exit(&rp->r_rwlock);
10464 
10465 	if (vp->v_type == VREG) {
10466 		/*
10467 		 * We need to retrieve the open stream and update the counts.
10468 		 * If there is no open stream here, something is wrong.
10469 		 */
10470 		nfs4_open_stream_t	*osp = NULL;
10471 		nfs4_open_owner_t	*oop = NULL;
10472 
10473 		oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
10474 		if (oop != NULL) {
10475 			/* returns with 'os_sync_lock' held */
10476 			osp = find_open_stream(oop, rp);
10477 			open_owner_rele(oop);
10478 		}
10479 		if (osp == NULL) {
10480 			NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE,
10481 			    "nfs4_addmap: we should have an osp"
10482 			    "but we don't, so fail with EIO"));
10483 			error = EIO;
10484 			goto out;
10485 		}
10486 
10487 		NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "nfs4_addmap: osp %p,"
10488 		    " pages %ld, prot 0x%x", (void *)osp, btopr(len), prot));
10489 
10490 		/*
10491 		 * Update the map count in the open stream.
10492 		 * This is necessary in the case where we
10493 		 * open/mmap/close/, then the server reboots, and we
10494 		 * attempt to reopen.  If the mmap doesn't add share
10495 		 * access then we send an invalid reopen with
10496 		 * access = NONE.
10497 		 *
10498 		 * We need to specifically check each PROT_* so a mmap
10499 		 * call of (PROT_WRITE | PROT_EXEC) will ensure us both
10500 		 * read and write access.  A simple comparison of prot
10501 		 * to ~PROT_WRITE to determine read access is insufficient
10502 		 * since prot can be |= with PROT_USER, etc.
10503 		 */
10504 
10505 		/*
10506 		 * Unless we're MAP_SHARED, no sense in adding os_mmap_write
10507 		 */
10508 		if ((flags & MAP_SHARED) && (maxprot & PROT_WRITE))
10509 			osp->os_mmap_write += btopr(len);
10510 		if (maxprot & PROT_READ)
10511 			osp->os_mmap_read += btopr(len);
10512 		if (maxprot & PROT_EXEC)
10513 			osp->os_mmap_read += btopr(len);
10514 		/*
10515 		 * Ensure that os_mmap_read gets incremented, even if
10516 		 * maxprot were to look like PROT_NONE.
10517 		 */
10518 		if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) &&
10519 		    !(maxprot & PROT_EXEC))
10520 			osp->os_mmap_read += btopr(len);
10521 		osp->os_mapcnt += btopr(len);
10522 		mutex_exit(&osp->os_sync_lock);
10523 		open_stream_rele(osp, rp);
10524 	}
10525 
10526 out:
10527 	/*
10528 	 * If we got an error, then undo our
10529 	 * incrementing of 'r_mapcnt'.
10530 	 */
10531 
10532 	if (error) {
10533 		atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(len));
10534 		ASSERT(rp->r_mapcnt >= 0);
10535 	}
10536 	return (error);
10537 }
10538 
10539 static int
10540 nfs4_cmp(vnode_t *vp1, vnode_t *vp2)
10541 {
10542 
10543 	return (VTOR4(vp1) == VTOR4(vp2));
10544 }
10545 
10546 static int
10547 nfs4_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
10548 	offset_t offset, struct flk_callback *flk_cbp, cred_t *cr)
10549 {
10550 	int rc;
10551 	u_offset_t start, end;
10552 	rnode4_t *rp;
10553 	int error = 0, intr = INTR4(vp);
10554 	nfs4_error_t e;
10555 
10556 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
10557 		return (EIO);
10558 
10559 	/* check for valid cmd parameter */
10560 	if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW)
10561 		return (EINVAL);
10562 
10563 	/* Verify l_type. */
10564 	switch (bfp->l_type) {
10565 	case F_RDLCK:
10566 		if (cmd != F_GETLK && !(flag & FREAD))
10567 			return (EBADF);
10568 		break;
10569 	case F_WRLCK:
10570 		if (cmd != F_GETLK && !(flag & FWRITE))
10571 			return (EBADF);
10572 		break;
10573 	case F_UNLCK:
10574 		intr = 0;
10575 		break;
10576 
10577 	default:
10578 		return (EINVAL);
10579 	}
10580 
10581 	/* check the validity of the lock range */
10582 	if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset))
10583 		return (rc);
10584 	if (rc = flk_check_lock_data(start, end, MAXEND))
10585 		return (rc);
10586 
10587 	/*
10588 	 * If the filesystem is mounted using local locking, pass the
10589 	 * request off to the local locking code.
10590 	 */
10591 	if (VTOMI4(vp)->mi_flags & MI4_LLOCK || vp->v_type != VREG) {
10592 		if (cmd == F_SETLK || cmd == F_SETLKW) {
10593 			/*
10594 			 * For complete safety, we should be holding
10595 			 * r_lkserlock.  However, we can't call
10596 			 * nfs4_safelock and then fs_frlock while
10597 			 * holding r_lkserlock, so just invoke
10598 			 * nfs4_safelock and expect that this will
10599 			 * catch enough of the cases.
10600 			 */
10601 			if (!nfs4_safelock(vp, bfp, cr))
10602 				return (EAGAIN);
10603 		}
10604 		return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr));
10605 	}
10606 
10607 	rp = VTOR4(vp);
10608 
10609 	/*
10610 	 * Check whether the given lock request can proceed, given the
10611 	 * current file mappings.
10612 	 */
10613 	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr))
10614 		return (EINTR);
10615 	if (cmd == F_SETLK || cmd == F_SETLKW) {
10616 		if (!nfs4_safelock(vp, bfp, cr)) {
10617 			rc = EAGAIN;
10618 			goto done;
10619 		}
10620 	}
10621 
10622 	/*
10623 	 * Flush the cache after waiting for async I/O to finish.  For new
10624 	 * locks, this is so that the process gets the latest bits from the
10625 	 * server.  For unlocks, this is so that other clients see the
10626 	 * latest bits once the file has been unlocked.  If currently dirty
10627 	 * pages can't be flushed, then don't allow a lock to be set.  But
10628 	 * allow unlocks to succeed, to avoid having orphan locks on the
10629 	 * server.
10630 	 */
10631 	if (cmd != F_GETLK) {
10632 		mutex_enter(&rp->r_statelock);
10633 		while (rp->r_count > 0) {
10634 		    if (intr) {
10635 			klwp_t *lwp = ttolwp(curthread);
10636 
10637 			if (lwp != NULL)
10638 				lwp->lwp_nostop++;
10639 			if (cv_wait_sig(&rp->r_cv, &rp->r_statelock) == 0) {
10640 				if (lwp != NULL)
10641 					lwp->lwp_nostop--;
10642 				rc = EINTR;
10643 				break;
10644 			}
10645 			if (lwp != NULL)
10646 				lwp->lwp_nostop--;
10647 		    } else
10648 			cv_wait(&rp->r_cv, &rp->r_statelock);
10649 		}
10650 		mutex_exit(&rp->r_statelock);
10651 		if (rc != 0)
10652 			goto done;
10653 		error = nfs4_putpage(vp, (offset_t)0, 0, B_INVAL, cr);
10654 		if (error) {
10655 			if (error == ENOSPC || error == EDQUOT) {
10656 				mutex_enter(&rp->r_statelock);
10657 				if (!rp->r_error)
10658 					rp->r_error = error;
10659 				mutex_exit(&rp->r_statelock);
10660 			}
10661 			if (bfp->l_type != F_UNLCK) {
10662 				rc = ENOLCK;
10663 				goto done;
10664 			}
10665 		}
10666 	}
10667 
10668 	/*
10669 	 * Call the lock manager to do the real work of contacting
10670 	 * the server and obtaining the lock.
10671 	 */
10672 
10673 	nfs4frlock(NFS4_LCK_CTYPE_NORM, vp, cmd, bfp, flag, offset,
10674 		cr, &e, NULL, NULL);
10675 	rc = e.error;
10676 
10677 	if (rc == 0)
10678 		nfs4_lockcompletion(vp, cmd);
10679 
10680 done:
10681 	nfs_rw_exit(&rp->r_lkserlock);
10682 
10683 	return (rc);
10684 }
10685 
10686 /*
10687  * Free storage space associated with the specified vnode.  The portion
10688  * to be freed is specified by bfp->l_start and bfp->l_len (already
10689  * normalized to a "whence" of 0).
10690  *
10691  * This is an experimental facility whose continued existence is not
10692  * guaranteed.  Currently, we only support the special case
10693  * of l_len == 0, meaning free to end of file.
10694  */
10695 /* ARGSUSED */
10696 static int
10697 nfs4_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
10698 	offset_t offset, cred_t *cr, caller_context_t *ct)
10699 {
10700 	int error;
10701 
10702 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
10703 		return (EIO);
10704 	ASSERT(vp->v_type == VREG);
10705 	if (cmd != F_FREESP)
10706 		return (EINVAL);
10707 
10708 	error = convoff(vp, bfp, 0, offset);
10709 	if (!error) {
10710 		ASSERT(bfp->l_start >= 0);
10711 		if (bfp->l_len == 0) {
10712 			struct vattr va;
10713 
10714 			va.va_mask = AT_SIZE;
10715 			va.va_size = bfp->l_start;
10716 			error = nfs4setattr(vp, &va, 0, cr, NULL);
10717 		} else
10718 			error = EINVAL;
10719 	}
10720 
10721 	return (error);
10722 }
10723 
10724 /* ARGSUSED */
10725 static int
10726 nfs4_realvp(vnode_t *vp, vnode_t **vpp)
10727 {
10728 	return (EINVAL);
10729 }
10730 
10731 /*
10732  * Setup and add an address space callback to do the work of the delmap call.
10733  * The callback will (and must be) deleted in the actual callback function.
10734  *
10735  * This is done in order to take care of the problem that we have with holding
10736  * the address space's a_lock for a long period of time (e.g. if the NFS server
10737  * is down).  Callbacks will be executed in the address space code while the
10738  * a_lock is not held.  Holding the address space's a_lock causes things such
10739  * as ps and fork to hang because they are trying to acquire this lock as well.
10740  */
10741 /* ARGSUSED */
10742 static int
10743 nfs4_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
10744 	size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr)
10745 {
10746 	int			caller_found;
10747 	int			error;
10748 	rnode4_t		*rp;
10749 	nfs4_delmap_args_t	*dmapp;
10750 	nfs4_delmapcall_t	*delmap_call;
10751 
10752 	if (vp->v_flag & VNOMAP)
10753 		return (ENOSYS);
10754 
10755 	/*
10756 	 * A process may not change zones if it has NFS pages mmap'ed
10757 	 * in, so we can't legitimately get here from the wrong zone.
10758 	 */
10759 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
10760 
10761 	rp = VTOR4(vp);
10762 
10763 	/*
10764 	 * The way that the address space of this process deletes its mapping
10765 	 * of this file is via the following call chains:
10766 	 * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap()
10767 	 * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap()
10768 	 *
10769 	 * With the use of address space callbacks we are allowed to drop the
10770 	 * address space lock, a_lock, while executing the NFS operations that
10771 	 * need to go over the wire.  Returning EAGAIN to the caller of this
10772 	 * function is what drives the execution of the callback that we add
10773 	 * below.  The callback will be executed by the address space code
10774 	 * after dropping the a_lock.  When the callback is finished, since
10775 	 * we dropped the a_lock, it must be re-acquired and segvn_unmap()
10776 	 * is called again on the same segment to finish the rest of the work
10777 	 * that needs to happen during unmapping.
10778 	 *
10779 	 * This action of calling back into the segment driver causes
10780 	 * nfs4_delmap() to get called again, but since the callback was
10781 	 * already executed at this point, it already did the work and there
10782 	 * is nothing left for us to do.
10783 	 *
10784 	 * To Summarize:
10785 	 * - The first time nfs4_delmap is called by the current thread is when
10786 	 * we add the caller associated with this delmap to the delmap caller
10787 	 * list, add the callback, and return EAGAIN.
10788 	 * - The second time in this call chain when nfs4_delmap is called we
10789 	 * will find this caller in the delmap caller list and realize there
10790 	 * is no more work to do thus removing this caller from the list and
10791 	 * returning the error that was set in the callback execution.
10792 	 */
10793 	caller_found = nfs4_find_and_delete_delmapcall(rp, &error);
10794 	if (caller_found) {
10795 		/*
10796 		 * 'error' is from the actual delmap operations.  To avoid
10797 		 * hangs, we need to handle the return of EAGAIN differently
10798 		 * since this is what drives the callback execution.
10799 		 * In this case, we don't want to return EAGAIN and do the
10800 		 * callback execution because there are none to execute.
10801 		 */
10802 		if (error == EAGAIN)
10803 			return (0);
10804 		else
10805 			return (error);
10806 	}
10807 
10808 	/* current caller was not in the list */
10809 	delmap_call = nfs4_init_delmapcall();
10810 
10811 	mutex_enter(&rp->r_statelock);
10812 	list_insert_tail(&rp->r_indelmap, delmap_call);
10813 	mutex_exit(&rp->r_statelock);
10814 
10815 	dmapp = kmem_alloc(sizeof (nfs4_delmap_args_t), KM_SLEEP);
10816 
10817 	dmapp->vp = vp;
10818 	dmapp->off = off;
10819 	dmapp->addr = addr;
10820 	dmapp->len = len;
10821 	dmapp->prot = prot;
10822 	dmapp->maxprot = maxprot;
10823 	dmapp->flags = flags;
10824 	dmapp->cr = cr;
10825 	dmapp->caller = delmap_call;
10826 
10827 	error = as_add_callback(as, nfs4_delmap_callback, dmapp,
10828 	    AS_UNMAP_EVENT, addr, len, KM_SLEEP);
10829 
10830 	return (error ? error : EAGAIN);
10831 }
10832 
10833 static nfs4_delmapcall_t *
10834 nfs4_init_delmapcall()
10835 {
10836 	nfs4_delmapcall_t	*delmap_call;
10837 
10838 	delmap_call = kmem_alloc(sizeof (nfs4_delmapcall_t), KM_SLEEP);
10839 	delmap_call->call_id = curthread;
10840 	delmap_call->error = 0;
10841 
10842 	return (delmap_call);
10843 }
10844 
10845 static void
10846 nfs4_free_delmapcall(nfs4_delmapcall_t *delmap_call)
10847 {
10848 	kmem_free(delmap_call, sizeof (nfs4_delmapcall_t));
10849 }
10850 
10851 /*
10852  * Searches for the current delmap caller (based on curthread) in the list of
10853  * callers.  If it is found, we remove it and free the delmap caller.
10854  * Returns:
10855  *      0 if the caller wasn't found
10856  *      1 if the caller was found, removed and freed.  *errp will be set
10857  *	to what the result of the delmap was.
10858  */
10859 static int
10860 nfs4_find_and_delete_delmapcall(rnode4_t *rp, int *errp)
10861 {
10862 	nfs4_delmapcall_t	*delmap_call;
10863 
10864 	/*
10865 	 * If the list doesn't exist yet, we create it and return
10866 	 * that the caller wasn't found.  No list = no callers.
10867 	 */
10868 	mutex_enter(&rp->r_statelock);
10869 	if (!(rp->r_flags & R4DELMAPLIST)) {
10870 		/* The list does not exist */
10871 		list_create(&rp->r_indelmap, sizeof (nfs4_delmapcall_t),
10872 		    offsetof(nfs4_delmapcall_t, call_node));
10873 		rp->r_flags |= R4DELMAPLIST;
10874 		mutex_exit(&rp->r_statelock);
10875 		return (0);
10876 	} else {
10877 		/* The list exists so search it */
10878 		for (delmap_call = list_head(&rp->r_indelmap);
10879 		    delmap_call != NULL;
10880 		    delmap_call = list_next(&rp->r_indelmap, delmap_call)) {
10881 			if (delmap_call->call_id == curthread) {
10882 				/* current caller is in the list */
10883 				*errp = delmap_call->error;
10884 				list_remove(&rp->r_indelmap, delmap_call);
10885 				mutex_exit(&rp->r_statelock);
10886 				nfs4_free_delmapcall(delmap_call);
10887 				return (1);
10888 			}
10889 		}
10890 	}
10891 	mutex_exit(&rp->r_statelock);
10892 	return (0);
10893 }
10894 
10895 /*
10896  * Remove some pages from an mmap'd vnode.  Just update the
10897  * count of pages.  If doing close-to-open, then flush and
10898  * commit all of the pages associated with this file.
10899  * Otherwise, start an asynchronous page flush to write out
10900  * any dirty pages.  This will also associate a credential
10901  * with the rnode which can be used to write the pages.
10902  */
10903 /* ARGSUSED */
10904 static void
10905 nfs4_delmap_callback(struct as *as, void *arg, uint_t event)
10906 {
10907 	nfs4_error_t		e = { 0, NFS4_OK, RPC_SUCCESS };
10908 	rnode4_t		*rp;
10909 	mntinfo4_t		*mi;
10910 	nfs4_delmap_args_t	*dmapp = (nfs4_delmap_args_t *)arg;
10911 
10912 	rp = VTOR4(dmapp->vp);
10913 	mi = VTOMI4(dmapp->vp);
10914 
10915 	atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len));
10916 	ASSERT(rp->r_mapcnt >= 0);
10917 
10918 	/*
10919 	 * Initiate a page flush and potential commit if there are
10920 	 * pages, the file system was not mounted readonly, the segment
10921 	 * was mapped shared, and the pages themselves were writeable.
10922 	 */
10923 	if (nfs4_has_pages(dmapp->vp) &&
10924 	    !(dmapp->vp->v_vfsp->vfs_flag & VFS_RDONLY) &&
10925 	    dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) {
10926 		mutex_enter(&rp->r_statelock);
10927 		rp->r_flags |= R4DIRTY;
10928 		mutex_exit(&rp->r_statelock);
10929 		e.error = nfs4_putpage_commit(dmapp->vp, dmapp->off,
10930 		    dmapp->len, dmapp->cr);
10931 		if (!e.error) {
10932 			mutex_enter(&rp->r_statelock);
10933 			e.error = rp->r_error;
10934 			rp->r_error = 0;
10935 			mutex_exit(&rp->r_statelock);
10936 		}
10937 	} else
10938 		e.error = 0;
10939 
10940 	if ((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO))
10941 		(void) nfs4_putpage(dmapp->vp, dmapp->off, dmapp->len,
10942 		    B_INVAL, dmapp->cr);
10943 
10944 	if (e.error) {
10945 		e.stat = puterrno4(e.error);
10946 		nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0,
10947 			OP_COMMIT, FALSE, NULL, 0, dmapp->vp);
10948 		dmapp->caller->error = e.error;
10949 	}
10950 
10951 	/* Check to see if we need to close the file */
10952 
10953 	if (dmapp->vp->v_type == VREG) {
10954 		nfs4close_one(dmapp->vp, NULL, dmapp->cr, 0, NULL, &e,
10955 		    CLOSE_DELMAP, dmapp->len, dmapp->maxprot, dmapp->flags);
10956 
10957 		if (e.error != 0 || e.stat != NFS4_OK) {
10958 			/*
10959 			 * Since it is possible that e.error == 0 and
10960 			 * e.stat != NFS4_OK (and vice versa),
10961 			 * we do the proper checking in order to get both
10962 			 * e.error and e.stat reporting the correct info.
10963 			 */
10964 			if (e.stat == NFS4_OK)
10965 				e.stat = puterrno4(e.error);
10966 			if (e.error == 0)
10967 				e.error = geterrno4(e.stat);
10968 
10969 			nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0,
10970 			    OP_CLOSE, FALSE, NULL, 0, dmapp->vp);
10971 			dmapp->caller->error = e.error;
10972 		}
10973 	}
10974 
10975 	(void) as_delete_callback(as, arg);
10976 	kmem_free(dmapp, sizeof (nfs4_delmap_args_t));
10977 }
10978 
10979 
/*
 * Convert a maximum file size into the number of bits needed to
 * represent it: 0 for 0, otherwise the position of the highest set bit
 * counted from 1 (e.g. 1 -> 1, 0xff -> 8, 0x100 -> 9).
 */
static uint_t
fattr4_maxfilesize_to_bits(uint64_t ll)
{
	/* Successively halved shift widths for a binary search. */
	static const uint_t widths[] = { 32, 16, 8, 4, 2, 1 };
	uint_t nbits;
	uint_t i;

	if (ll == 0)
		return (0);

	nbits = 1;
	for (i = 0; i < sizeof (widths) / sizeof (widths[0]); i++) {
		/* Anything set above this width?  Count it and drop down. */
		if ((ll >> widths[i]) != 0) {
			nbits += widths[i];
			ll >>= widths[i];
		}
	}

	return (nbits);
}
11009 
11010 static int
11011 nfs4_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr)
11012 {
11013 	int error;
11014 	hrtime_t t;
11015 	rnode4_t *rp;
11016 	nfs4_ga_res_t gar;
11017 	nfs4_ga_ext_res_t ger;
11018 
11019 	gar.n4g_ext_res = &ger;
11020 
11021 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
11022 		return (EIO);
11023 	if (cmd == _PC_PATH_MAX || cmd == _PC_SYMLINK_MAX) {
11024 		*valp = MAXPATHLEN;
11025 		return (0);
11026 	}
11027 	if (cmd == _PC_ACL_ENABLED) {
11028 		*valp = _ACL_ACE_ENABLED;
11029 		return (0);
11030 	}
11031 
11032 	rp = VTOR4(vp);
11033 	if (cmd == _PC_XATTR_EXISTS) {
11034 		/*
11035 		 * Eventually should attempt small client readdir before
11036 		 * going otw with GETATTR(FATTR4_NAMED_ATTR).  For now
11037 		 * just drive the OTW getattr.  This is required because
11038 		 * _PC_XATTR_EXISTS can only return true if attributes
11039 		 * exist -- simply checking for existance of the attrdir
11040 		 * is not sufficient.
11041 		 *
11042 		 * pc4_xattr_valid can be only be trusted when r_xattr_dir
11043 		 * is NULL.  Once the xadir vp exists, we can create xattrs,
11044 		 * and we don't have any way to update the "base" object's
11045 		 * pc4_xattr_exists from the xattr or xadir.  Maybe FEM
11046 		 * could help out.
11047 		 */
11048 		if (ATTRCACHE4_VALID(vp) && rp->r_pathconf.pc4_xattr_valid &&
11049 		    rp->r_xattr_dir == NULL) {
11050 			*valp = rp->r_pathconf.pc4_xattr_exists;
11051 			return (0);
11052 		}
11053 	} else {  /* OLD CODE */
11054 		if (ATTRCACHE4_VALID(vp)) {
11055 			mutex_enter(&rp->r_statelock);
11056 			if (rp->r_pathconf.pc4_cache_valid) {
11057 				error = 0;
11058 				switch (cmd) {
11059 				case _PC_FILESIZEBITS:
11060 					*valp =
11061 					rp->r_pathconf.pc4_filesizebits;
11062 					break;
11063 				case _PC_LINK_MAX:
11064 					*valp =
11065 					rp->r_pathconf.pc4_link_max;
11066 					break;
11067 				case _PC_NAME_MAX:
11068 					*valp =
11069 					rp->r_pathconf.pc4_name_max;
11070 					break;
11071 				case _PC_CHOWN_RESTRICTED:
11072 					*valp =
11073 					rp->r_pathconf.pc4_chown_restricted;
11074 					break;
11075 				case _PC_NO_TRUNC:
11076 					*valp =
11077 					rp->r_pathconf.pc4_no_trunc;
11078 					break;
11079 				default:
11080 					error = EINVAL;
11081 					break;
11082 				}
11083 				mutex_exit(&rp->r_statelock);
11084 #ifdef DEBUG
11085 				nfs4_pathconf_cache_hits++;
11086 #endif
11087 				return (error);
11088 			}
11089 			mutex_exit(&rp->r_statelock);
11090 		}
11091 	}
11092 #ifdef DEBUG
11093 	nfs4_pathconf_cache_misses++;
11094 #endif
11095 
11096 	t = gethrtime();
11097 
11098 	error = nfs4_attr_otw(vp, TAG_PATHCONF, &gar, NFS4_PATHCONF_MASK, cr);
11099 
11100 	if (error) {
11101 		mutex_enter(&rp->r_statelock);
11102 		rp->r_pathconf.pc4_cache_valid = FALSE;
11103 		rp->r_pathconf.pc4_xattr_valid = FALSE;
11104 		mutex_exit(&rp->r_statelock);
11105 		return (error);
11106 	}
11107 
11108 	/* interpret the max filesize */
11109 	gar.n4g_ext_res->n4g_pc4.pc4_filesizebits =
11110 		fattr4_maxfilesize_to_bits(gar.n4g_ext_res->n4g_maxfilesize);
11111 
11112 	/* Store the attributes we just received */
11113 	nfs4_attr_cache(vp, &gar, t, cr, TRUE, NULL);
11114 
11115 	switch (cmd) {
11116 	case _PC_FILESIZEBITS:
11117 		*valp = gar.n4g_ext_res->n4g_pc4.pc4_filesizebits;
11118 		break;
11119 	case _PC_LINK_MAX:
11120 		*valp = gar.n4g_ext_res->n4g_pc4.pc4_link_max;
11121 		break;
11122 	case _PC_NAME_MAX:
11123 		*valp = gar.n4g_ext_res->n4g_pc4.pc4_name_max;
11124 		break;
11125 	case _PC_CHOWN_RESTRICTED:
11126 		*valp = gar.n4g_ext_res->n4g_pc4.pc4_chown_restricted;
11127 		break;
11128 	case _PC_NO_TRUNC:
11129 		*valp = gar.n4g_ext_res->n4g_pc4.pc4_no_trunc;
11130 		break;
11131 	case _PC_XATTR_EXISTS:
11132 		*valp = gar.n4g_ext_res->n4g_pc4.pc4_xattr_exists;
11133 		break;
11134 	default:
11135 		return (EINVAL);
11136 	}
11137 
11138 	return (0);
11139 }
11140 
11141 /*
11142  * Called by async thread to do synchronous pageio. Do the i/o, wait
11143  * for it to complete, and cleanup the page list when done.
11144  */
11145 static int
11146 nfs4_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
11147 	int flags, cred_t *cr)
11148 {
11149 	int error;
11150 
11151 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
11152 
11153 	error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
11154 	if (flags & B_READ)
11155 		pvn_read_done(pp, (error ? B_ERROR : 0) | flags);
11156 	else
11157 		pvn_write_done(pp, (error ? B_ERROR : 0) | flags);
11158 	return (error);
11159 }
11160 
11161 static int
11162 nfs4_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
11163 	int flags, cred_t *cr)
11164 {
11165 	int error;
11166 	rnode4_t *rp;
11167 
11168 	if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone)
11169 		return (EIO);
11170 
11171 	if (pp == NULL)
11172 		return (EINVAL);
11173 
11174 	rp = VTOR4(vp);
11175 	mutex_enter(&rp->r_statelock);
11176 	rp->r_count++;
11177 	mutex_exit(&rp->r_statelock);
11178 
11179 	if (flags & B_ASYNC) {
11180 		error = nfs4_async_pageio(vp, pp, io_off, io_len, flags, cr,
11181 		    nfs4_sync_pageio);
11182 	} else
11183 		error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
11184 	mutex_enter(&rp->r_statelock);
11185 	rp->r_count--;
11186 	cv_broadcast(&rp->r_cv);
11187 	mutex_exit(&rp->r_statelock);
11188 	return (error);
11189 }
11190 
11191 static void
11192 nfs4_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr)
11193 {
11194 	int error;
11195 	rnode4_t *rp;
11196 	page_t *plist;
11197 	page_t *pptr;
11198 	offset3 offset;
11199 	count3 len;
11200 	k_sigset_t smask;
11201 
11202 	/*
11203 	 * We should get called with fl equal to either B_FREE or
11204 	 * B_INVAL.  Any other value is illegal.
11205 	 *
11206 	 * The page that we are either supposed to free or destroy
11207 	 * should be exclusive locked and its io lock should not
11208 	 * be held.
11209 	 */
11210 	ASSERT(fl == B_FREE || fl == B_INVAL);
11211 	ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr);
11212 
11213 	rp = VTOR4(vp);
11214 
11215 	/*
11216 	 * If the page doesn't need to be committed or we shouldn't
11217 	 * even bother attempting to commit it, then just make sure
11218 	 * that the p_fsdata byte is clear and then either free or
11219 	 * destroy the page as appropriate.
11220 	 */
11221 	if (pp->p_fsdata == C_NOCOMMIT || (rp->r_flags & R4STALE)) {
11222 		pp->p_fsdata = C_NOCOMMIT;
11223 		if (fl == B_FREE)
11224 			page_free(pp, dn);
11225 		else
11226 			page_destroy(pp, dn);
11227 		return;
11228 	}
11229 
11230 	/*
11231 	 * If there is a page invalidation operation going on, then
11232 	 * if this is one of the pages being destroyed, then just
11233 	 * clear the p_fsdata byte and then either free or destroy
11234 	 * the page as appropriate.
11235 	 */
11236 	mutex_enter(&rp->r_statelock);
11237 	if ((rp->r_flags & R4TRUNCATE) && pp->p_offset >= rp->r_truncaddr) {
11238 		mutex_exit(&rp->r_statelock);
11239 		pp->p_fsdata = C_NOCOMMIT;
11240 		if (fl == B_FREE)
11241 			page_free(pp, dn);
11242 		else
11243 			page_destroy(pp, dn);
11244 		return;
11245 	}
11246 
11247 	/*
11248 	 * If we are freeing this page and someone else is already
11249 	 * waiting to do a commit, then just unlock the page and
11250 	 * return.  That other thread will take care of commiting
11251 	 * this page.  The page can be freed sometime after the
11252 	 * commit has finished.  Otherwise, if the page is marked
11253 	 * as delay commit, then we may be getting called from
11254 	 * pvn_write_done, one page at a time.   This could result
11255 	 * in one commit per page, so we end up doing lots of small
11256 	 * commits instead of fewer larger commits.  This is bad,
11257 	 * we want do as few commits as possible.
11258 	 */
11259 	if (fl == B_FREE) {
11260 		if (rp->r_flags & R4COMMITWAIT) {
11261 			page_unlock(pp);
11262 			mutex_exit(&rp->r_statelock);
11263 			return;
11264 		}
11265 		if (pp->p_fsdata == C_DELAYCOMMIT) {
11266 			pp->p_fsdata = C_COMMIT;
11267 			page_unlock(pp);
11268 			mutex_exit(&rp->r_statelock);
11269 			return;
11270 		}
11271 	}
11272 
11273 	/*
11274 	 * Check to see if there is a signal which would prevent an
11275 	 * attempt to commit the pages from being successful.  If so,
11276 	 * then don't bother with all of the work to gather pages and
11277 	 * generate the unsuccessful RPC.  Just return from here and
11278 	 * let the page be committed at some later time.
11279 	 */
11280 	sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT);
11281 	if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) {
11282 		sigunintr(&smask);
11283 		page_unlock(pp);
11284 		mutex_exit(&rp->r_statelock);
11285 		return;
11286 	}
11287 	sigunintr(&smask);
11288 
11289 	/*
11290 	 * We are starting to need to commit pages, so let's try
11291 	 * to commit as many as possible at once to reduce the
11292 	 * overhead.
11293 	 *
11294 	 * Set the `commit inprogress' state bit.  We must
11295 	 * first wait until any current one finishes.  Then
11296 	 * we initialize the c_pages list with this page.
11297 	 */
11298 	while (rp->r_flags & R4COMMIT) {
11299 		rp->r_flags |= R4COMMITWAIT;
11300 		cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
11301 		rp->r_flags &= ~R4COMMITWAIT;
11302 	}
11303 	rp->r_flags |= R4COMMIT;
11304 	mutex_exit(&rp->r_statelock);
11305 	ASSERT(rp->r_commit.c_pages == NULL);
11306 	rp->r_commit.c_pages = pp;
11307 	rp->r_commit.c_commbase = (offset3)pp->p_offset;
11308 	rp->r_commit.c_commlen = PAGESIZE;
11309 
11310 	/*
11311 	 * Gather together all other pages which can be committed.
11312 	 * They will all be chained off r_commit.c_pages.
11313 	 */
11314 	nfs4_get_commit(vp);
11315 
11316 	/*
11317 	 * Clear the `commit inprogress' status and disconnect
11318 	 * the list of pages to be committed from the rnode.
11319 	 * At this same time, we also save the starting offset
11320 	 * and length of data to be committed on the server.
11321 	 */
11322 	plist = rp->r_commit.c_pages;
11323 	rp->r_commit.c_pages = NULL;
11324 	offset = rp->r_commit.c_commbase;
11325 	len = rp->r_commit.c_commlen;
11326 	mutex_enter(&rp->r_statelock);
11327 	rp->r_flags &= ~R4COMMIT;
11328 	cv_broadcast(&rp->r_commit.c_cv);
11329 	mutex_exit(&rp->r_statelock);
11330 
11331 	if (curproc == proc_pageout || curproc == proc_fsflush ||
11332 	    nfs_zone() != VTOMI4(vp)->mi_zone) {
11333 		nfs4_async_commit(vp, plist, offset, len,
11334 		    cr, do_nfs4_async_commit);
11335 		return;
11336 	}
11337 
11338 	/*
11339 	 * Actually generate the COMMIT op over the wire operation.
11340 	 */
11341 	error = nfs4_commit(vp, (offset4)offset, (count4)len, cr);
11342 
11343 	/*
11344 	 * If we got an error during the commit, just unlock all
11345 	 * of the pages.  The pages will get retransmitted to the
11346 	 * server during a putpage operation.
11347 	 */
11348 	if (error) {
11349 		while (plist != NULL) {
11350 			pptr = plist;
11351 			page_sub(&plist, pptr);
11352 			page_unlock(pptr);
11353 		}
11354 		return;
11355 	}
11356 
11357 	/*
11358 	 * We've tried as hard as we can to commit the data to stable
11359 	 * storage on the server.  We just unlock the rest of the pages
11360 	 * and clear the commit required state.  They will be put
11361 	 * onto the tail of the cachelist if they are nolonger
11362 	 * mapped.
11363 	 */
11364 	while (plist != pp) {
11365 		pptr = plist;
11366 		page_sub(&plist, pptr);
11367 		pptr->p_fsdata = C_NOCOMMIT;
11368 		page_unlock(pptr);
11369 	}
11370 
11371 	/*
11372 	 * It is possible that nfs4_commit didn't return error but
11373 	 * some other thread has modified the page we are going
11374 	 * to free/destroy.
11375 	 *    In this case we need to rewrite the page. Do an explicit check
11376 	 * before attempting to free/destroy the page. If modified, needs to
11377 	 * be rewritten so unlock the page and return.
11378 	 */
11379 	if (hat_ismod(pp)) {
11380 		pp->p_fsdata = C_NOCOMMIT;
11381 		page_unlock(pp);
11382 		return;
11383 	}
11384 
11385 	/*
11386 	 * Now, as appropriate, either free or destroy the page
11387 	 * that we were called with.
11388 	 */
11389 	pp->p_fsdata = C_NOCOMMIT;
11390 	if (fl == B_FREE)
11391 		page_free(pp, dn);
11392 	else
11393 		page_destroy(pp, dn);
11394 }
11395 
11396 /*
11397  * Commit requires that the current fh be the file written to.
11398  * The compound op structure is:
11399  *      PUTFH(file), COMMIT
11400  */
11401 static int
11402 nfs4_commit(vnode_t *vp, offset4 offset, count4 count, cred_t *cr)
11403 {
11404 	COMPOUND4args_clnt args;
11405 	COMPOUND4res_clnt res;
11406 	COMMIT4res *cm_res;
11407 	nfs_argop4 argop[2];
11408 	nfs_resop4 *resop;
11409 	int doqueue;
11410 	mntinfo4_t *mi;
11411 	rnode4_t *rp;
11412 	cred_t *cred_otw = NULL;
11413 	bool_t needrecov = FALSE;
11414 	nfs4_recov_state_t recov_state;
11415 	nfs4_open_stream_t *osp = NULL;
11416 	bool_t first_time = TRUE;	/* first time getting OTW cred */
11417 	bool_t last_time = FALSE;	/* last time getting OTW cred */
11418 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
11419 
11420 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
11421 
11422 	rp = VTOR4(vp);
11423 
11424 	mi = VTOMI4(vp);
11425 	recov_state.rs_flags = 0;
11426 	recov_state.rs_num_retry_despite_err = 0;
11427 get_commit_cred:
11428 	/*
11429 	 * Releases the osp, if a valid open stream is provided.
11430 	 * Puts a hold on the cred_otw and the new osp (if found).
11431 	 */
11432 	cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
11433 			&first_time, &last_time);
11434 	args.ctag = TAG_COMMIT;
11435 recov_retry:
11436 	/*
11437 	 * Commit ops: putfh file; commit
11438 	 */
11439 	args.array_len = 2;
11440 	args.array = argop;
11441 
11442 	e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
11443 			    &recov_state, NULL);
11444 	if (e.error) {
11445 		crfree(cred_otw);
11446 		if (osp != NULL)
11447 			open_stream_rele(osp, rp);
11448 		return (e.error);
11449 	}
11450 
11451 	/* putfh directory */
11452 	argop[0].argop = OP_CPUTFH;
11453 	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
11454 
11455 	/* commit */
11456 	argop[1].argop = OP_COMMIT;
11457 	argop[1].nfs_argop4_u.opcommit.offset = offset;
11458 	argop[1].nfs_argop4_u.opcommit.count = count;
11459 
11460 	doqueue = 1;
11461 	rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, &e);
11462 
11463 	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
11464 	if (!needrecov && e.error) {
11465 		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state,
11466 			needrecov);
11467 		crfree(cred_otw);
11468 		if (e.error == EACCES && last_time == FALSE)
11469 			goto get_commit_cred;
11470 		if (osp != NULL)
11471 			open_stream_rele(osp, rp);
11472 		return (e.error);
11473 	}
11474 
11475 	if (needrecov) {
11476 		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
11477 		    NULL, OP_COMMIT, NULL) == FALSE) {
11478 			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
11479 				&recov_state, needrecov);
11480 			if (!e.error)
11481 				(void) xdr_free(xdr_COMPOUND4res_clnt,
11482 								(caddr_t)&res);
11483 			goto recov_retry;
11484 		}
11485 		if (e.error) {
11486 			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
11487 				&recov_state, needrecov);
11488 			crfree(cred_otw);
11489 			if (osp != NULL)
11490 				open_stream_rele(osp, rp);
11491 			return (e.error);
11492 		}
11493 		/* fall through for res.status case */
11494 	}
11495 
11496 	if (res.status) {
11497 		e.error = geterrno4(res.status);
11498 		if (e.error == EACCES && last_time == FALSE) {
11499 			crfree(cred_otw);
11500 			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
11501 				&recov_state, needrecov);
11502 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
11503 			goto get_commit_cred;
11504 		}
11505 		/*
11506 		 * Can't do a nfs4_purge_stale_fh here because this
11507 		 * can cause a deadlock.  nfs4_commit can
11508 		 * be called from nfs4_dispose which can be called
11509 		 * indirectly via pvn_vplist_dirty.  nfs4_purge_stale_fh
11510 		 * can call back to pvn_vplist_dirty.
11511 		 */
11512 		if (e.error == ESTALE) {
11513 			mutex_enter(&rp->r_statelock);
11514 			rp->r_flags |= R4STALE;
11515 			if (!rp->r_error)
11516 				rp->r_error = e.error;
11517 			mutex_exit(&rp->r_statelock);
11518 			PURGE_ATTRCACHE4(vp);
11519 		} else {
11520 			mutex_enter(&rp->r_statelock);
11521 			if (!rp->r_error)
11522 				rp->r_error = e.error;
11523 			mutex_exit(&rp->r_statelock);
11524 		}
11525 	} else {
11526 		ASSERT(rp->r_flags & R4HAVEVERF);
11527 		resop = &res.array[1];	/* commit res */
11528 		cm_res = &resop->nfs_resop4_u.opcommit;
11529 		mutex_enter(&rp->r_statelock);
11530 		if (cm_res->writeverf == rp->r_writeverf) {
11531 			mutex_exit(&rp->r_statelock);
11532 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
11533 			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
11534 				&recov_state, needrecov);
11535 			crfree(cred_otw);
11536 			if (osp != NULL)
11537 				open_stream_rele(osp, rp);
11538 			return (0);
11539 		}
11540 		nfs4_set_mod(vp);
11541 		rp->r_writeverf = cm_res->writeverf;
11542 		mutex_exit(&rp->r_statelock);
11543 		e.error = NFS_VERF_MISMATCH;
11544 	}
11545 
11546 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
11547 	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state, needrecov);
11548 	crfree(cred_otw);
11549 	if (osp != NULL)
11550 		open_stream_rele(osp, rp);
11551 
11552 	return (e.error);
11553 }
11554 
11555 static void
11556 nfs4_set_mod(vnode_t *vp)
11557 {
11558 	page_t *pp;
11559 	kmutex_t *vphm;
11560 	rnode4_t *rp;
11561 
11562 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
11563 
11564 	/* make sure we're looking at the master vnode, not a shadow */
11565 
11566 	rp = VTOR4(vp);
11567 	if (IS_SHADOW(vp, rp))
11568 		vp = RTOV4(rp);
11569 
11570 	vphm = page_vnode_mutex(vp);
11571 	mutex_enter(vphm);
11572 	/*
11573 	 * If there are no pages associated with this vnode, then
11574 	 * just return.
11575 	 */
11576 	if ((pp = vp->v_pages) == NULL) {
11577 		mutex_exit(vphm);
11578 		return;
11579 	}
11580 
11581 	do {
11582 		if (pp->p_fsdata != C_NOCOMMIT) {
11583 			hat_setmod(pp);
11584 			pp->p_fsdata = C_NOCOMMIT;
11585 		}
11586 	} while ((pp = pp->p_vpnext) != vp->v_pages);
11587 	mutex_exit(vphm);
11588 }
11589 
11590 /*
11591  * This function is used to gather a page list of the pages which
11592  * can be committed on the server.
11593  *
11594  * The calling thread must have set R4COMMIT.  This bit is used to
11595  * serialize access to the commit structure in the rnode.  As long
11596  * as the thread has set R4COMMIT, then it can manipulate the commit
11597  * structure without requiring any other locks.
11598  *
11599  * When this function is called from nfs4_dispose() the page passed
11600  * into nfs4_dispose() will be SE_EXCL locked, and so this function
11601  * will skip it. This is not a problem since we initially add the
11602  * page to the r_commit page list.
11603  *
11604  */
11605 static void
11606 nfs4_get_commit(vnode_t *vp)
11607 {
11608 	rnode4_t *rp;
11609 	page_t *pp;
11610 	kmutex_t *vphm;
11611 
11612 	rp = VTOR4(vp);
11613 
11614 	ASSERT(rp->r_flags & R4COMMIT);
11615 
11616 	/* make sure we're looking at the master vnode, not a shadow */
11617 
11618 	if (IS_SHADOW(vp, rp))
11619 		vp = RTOV4(rp);
11620 
11621 	vphm = page_vnode_mutex(vp);
11622 	mutex_enter(vphm);
11623 
11624 	/*
11625 	 * If there are no pages associated with this vnode, then
11626 	 * just return.
11627 	 */
11628 	if ((pp = vp->v_pages) == NULL) {
11629 		mutex_exit(vphm);
11630 		return;
11631 	}
11632 
11633 	/*
11634 	 * Step through all of the pages associated with this vnode
11635 	 * looking for pages which need to be committed.
11636 	 */
11637 	do {
11638 		/*
11639 		 * First short-cut everything (without the page_lock)
11640 		 * and see if this page does not need to be committed
11641 		 * or is modified if so then we'll just skip it.
11642 		 */
11643 		if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp))
11644 			continue;
11645 
11646 		/*
11647 		 * Attempt to lock the page.  If we can't, then
11648 		 * someone else is messing with it or we have been
11649 		 * called from nfs4_dispose and this is the page that
11650 		 * nfs4_dispose was called with.. anyway just skip it.
11651 		 */
11652 		if (!page_trylock(pp, SE_EXCL))
11653 			continue;
11654 
11655 		/*
11656 		 * Lets check again now that we have the page lock.
11657 		 */
11658 		if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
11659 			page_unlock(pp);
11660 			continue;
11661 		}
11662 
11663 		/* this had better not be a free page */
11664 		ASSERT(PP_ISFREE(pp) == 0);
11665 
11666 		/*
11667 		 * The page needs to be committed and we locked it.
11668 		 * Update the base and length parameters and add it
11669 		 * to r_pages.
11670 		 */
11671 		if (rp->r_commit.c_pages == NULL) {
11672 			rp->r_commit.c_commbase = (offset3)pp->p_offset;
11673 			rp->r_commit.c_commlen = PAGESIZE;
11674 		} else if (pp->p_offset < rp->r_commit.c_commbase) {
11675 			rp->r_commit.c_commlen = rp->r_commit.c_commbase -
11676 			    (offset3)pp->p_offset + rp->r_commit.c_commlen;
11677 			rp->r_commit.c_commbase = (offset3)pp->p_offset;
11678 		} else if ((rp->r_commit.c_commbase + rp->r_commit.c_commlen)
11679 			    <= pp->p_offset) {
11680 			rp->r_commit.c_commlen = (offset3)pp->p_offset -
11681 			    rp->r_commit.c_commbase + PAGESIZE;
11682 		}
11683 		page_add(&rp->r_commit.c_pages, pp);
11684 	} while ((pp = pp->p_vpnext) != vp->v_pages);
11685 
11686 	mutex_exit(vphm);
11687 }
11688 
11689 /*
11690  * This routine is used to gather together a page list of the pages
11691  * which are to be committed on the server.  This routine must not
11692  * be called if the calling thread holds any locked pages.
11693  *
11694  * The calling thread must have set R4COMMIT.  This bit is used to
11695  * serialize access to the commit structure in the rnode.  As long
11696  * as the thread has set R4COMMIT, then it can manipulate the commit
11697  * structure without requiring any other locks.
11698  */
11699 static void
11700 nfs4_get_commit_range(vnode_t *vp, u_offset_t soff, size_t len)
11701 {
11702 
11703 	rnode4_t *rp;
11704 	page_t *pp;
11705 	u_offset_t end;
11706 	u_offset_t off;
11707 	ASSERT(len != 0);
11708 	rp = VTOR4(vp);
11709 	ASSERT(rp->r_flags & R4COMMIT);
11710 
11711 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
11712 
11713 	/* make sure we're looking at the master vnode, not a shadow */
11714 
11715 	if (IS_SHADOW(vp, rp))
11716 		vp = RTOV4(rp);
11717 
11718 	/*
11719 	 * If there are no pages associated with this vnode, then
11720 	 * just return.
11721 	 */
11722 	if ((pp = vp->v_pages) == NULL)
11723 		return;
11724 	/*
11725 	 * Calculate the ending offset.
11726 	 */
11727 	end = soff + len;
11728 	for (off = soff; off < end; off += PAGESIZE) {
11729 		/*
11730 		 * Lookup each page by vp, offset.
11731 		 */
11732 		if ((pp = page_lookup_nowait(vp, off, SE_EXCL)) == NULL)
11733 			continue;
11734 		/*
11735 		 * If this page does not need to be committed or is
11736 		 * modified, then just skip it.
11737 		 */
11738 		if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
11739 			page_unlock(pp);
11740 			continue;
11741 		}
11742 
11743 		ASSERT(PP_ISFREE(pp) == 0);
11744 		/*
11745 		 * The page needs to be committed and we locked it.
11746 		 * Update the base and length parameters and add it
11747 		 * to r_pages.
11748 		 */
11749 		if (rp->r_commit.c_pages == NULL) {
11750 			rp->r_commit.c_commbase = (offset3)pp->p_offset;
11751 			rp->r_commit.c_commlen = PAGESIZE;
11752 		} else {
11753 			rp->r_commit.c_commlen = (offset3)pp->p_offset -
11754 			rp->r_commit.c_commbase + PAGESIZE;
11755 		}
11756 		page_add(&rp->r_commit.c_pages, pp);
11757 	}
11758 }
11759 
11760 /*
11761  * Called from nfs4_close(), nfs4_fsync() and nfs4_delmap().
11762  * Flushes and commits data to the server.
11763  */
11764 static int
11765 nfs4_putpage_commit(vnode_t *vp, offset_t poff, size_t plen, cred_t *cr)
11766 {
11767 	int error;
11768 	verifier4 write_verf;
11769 	rnode4_t *rp = VTOR4(vp);
11770 
11771 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
11772 
11773 	/*
11774 	 * Flush the data portion of the file and then commit any
11775 	 * portions which need to be committed.  This may need to
11776 	 * be done twice if the server has changed state since
11777 	 * data was last written.  The data will need to be
11778 	 * rewritten to the server and then a new commit done.
11779 	 *
11780 	 * In fact, this may need to be done several times if the
11781 	 * server is having problems and crashing while we are
11782 	 * attempting to do this.
11783 	 */
11784 
11785 top:
11786 	/*
11787 	 * Do a flush based on the poff and plen arguments.  This
11788 	 * will synchronously write out any modified pages in the
11789 	 * range specified by (poff, plen). This starts all of the
11790 	 * i/o operations which will be waited for in the next
11791 	 * call to nfs4_putpage
11792 	 */
11793 
11794 	mutex_enter(&rp->r_statelock);
11795 	write_verf = rp->r_writeverf;
11796 	mutex_exit(&rp->r_statelock);
11797 
11798 	error = nfs4_putpage(vp, poff, plen, B_ASYNC, cr);
11799 	if (error == EAGAIN)
11800 		error = 0;
11801 
11802 	/*
11803 	 * Do a flush based on the poff and plen arguments.  This
11804 	 * will synchronously write out any modified pages in the
11805 	 * range specified by (poff, plen) and wait until all of
11806 	 * the asynchronous i/o's in that range are done as well.
11807 	 */
11808 	if (!error)
11809 		error = nfs4_putpage(vp, poff, plen, 0, cr);
11810 
11811 	if (error)
11812 		return (error);
11813 
11814 	mutex_enter(&rp->r_statelock);
11815 	if (rp->r_writeverf != write_verf) {
11816 		mutex_exit(&rp->r_statelock);
11817 		goto top;
11818 	}
11819 	mutex_exit(&rp->r_statelock);
11820 
11821 	/*
11822 	 * Now commit any pages which might need to be committed.
11823 	 * If the error, NFS_VERF_MISMATCH, is returned, then
11824 	 * start over with the flush operation.
11825 	 */
11826 	error = nfs4_commit_vp(vp, poff, plen, cr, NFS4_WRITE_WAIT);
11827 
11828 	if (error == NFS_VERF_MISMATCH)
11829 		goto top;
11830 
11831 	return (error);
11832 }
11833 
11834 /*
11835  * nfs4_commit_vp()  will wait for other pending commits and
11836  * will either commit the whole file or a range, plen dictates
11837  * if we commit whole file. a value of zero indicates the whole
11838  * file. Called from nfs4_putpage_commit() or nfs4_sync_putapage()
11839  */
11840 static int
11841 nfs4_commit_vp(vnode_t *vp, u_offset_t poff, size_t plen,
11842 		cred_t *cr, int wait_on_writes)
11843 {
11844 	rnode4_t *rp;
11845 	page_t *plist;
11846 	offset3 offset;
11847 	count3 len;
11848 
11849 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
11850 
11851 	rp = VTOR4(vp);
11852 
11853 	/*
11854 	 *  before we gather commitable pages make
11855 	 *  sure there are no outstanding async writes
11856 	 */
11857 	if (rp->r_count && wait_on_writes == NFS4_WRITE_WAIT) {
11858 		mutex_enter(&rp->r_statelock);
11859 		while (rp->r_count > 0) {
11860 			cv_wait(&rp->r_cv, &rp->r_statelock);
11861 		}
11862 		mutex_exit(&rp->r_statelock);
11863 	}
11864 
11865 	/*
11866 	 * Set the `commit inprogress' state bit.  We must
11867 	 * first wait until any current one finishes.
11868 	 */
11869 	mutex_enter(&rp->r_statelock);
11870 	while (rp->r_flags & R4COMMIT) {
11871 		rp->r_flags |= R4COMMITWAIT;
11872 		cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
11873 		rp->r_flags &= ~R4COMMITWAIT;
11874 	}
11875 	rp->r_flags |= R4COMMIT;
11876 	mutex_exit(&rp->r_statelock);
11877 
11878 	/*
11879 	 * Gather all of the pages which need to be
11880 	 * committed.
11881 	 */
11882 	if (plen == 0)
11883 		nfs4_get_commit(vp);
11884 	else
11885 		nfs4_get_commit_range(vp, poff, plen);
11886 
11887 	/*
11888 	 * Clear the `commit inprogress' bit and disconnect the
11889 	 * page list which was gathered by nfs4_get_commit.
11890 	 */
11891 	plist = rp->r_commit.c_pages;
11892 	rp->r_commit.c_pages = NULL;
11893 	offset = rp->r_commit.c_commbase;
11894 	len = rp->r_commit.c_commlen;
11895 	mutex_enter(&rp->r_statelock);
11896 	rp->r_flags &= ~R4COMMIT;
11897 	cv_broadcast(&rp->r_commit.c_cv);
11898 	mutex_exit(&rp->r_statelock);
11899 
11900 	/*
11901 	 * If any pages need to be committed, commit them and
11902 	 * then unlock them so that they can be freed some
11903 	 * time later.
11904 	 */
11905 	if (plist == NULL)
11906 		return (0);
11907 
11908 	/*
11909 	 * No error occurred during the flush portion
11910 	 * of this operation, so now attempt to commit
11911 	 * the data to stable storage on the server.
11912 	 *
11913 	 * This will unlock all of the pages on the list.
11914 	 */
11915 	return (nfs4_sync_commit(vp, plist, offset, len, cr));
11916 }
11917 
11918 static int
11919 nfs4_sync_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
11920 	cred_t *cr)
11921 {
11922 	int error;
11923 	page_t *pp;
11924 
11925 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
11926 
11927 	error = nfs4_commit(vp, (offset4)offset, (count3)count, cr);
11928 
11929 	/*
11930 	 * If we got an error, then just unlock all of the pages
11931 	 * on the list.
11932 	 */
11933 	if (error) {
11934 		while (plist != NULL) {
11935 			pp = plist;
11936 			page_sub(&plist, pp);
11937 			page_unlock(pp);
11938 		}
11939 		return (error);
11940 	}
11941 	/*
11942 	 * We've tried as hard as we can to commit the data to stable
11943 	 * storage on the server.  We just unlock the pages and clear
11944 	 * the commit required state.  They will get freed later.
11945 	 */
11946 	while (plist != NULL) {
11947 		pp = plist;
11948 		page_sub(&plist, pp);
11949 		pp->p_fsdata = C_NOCOMMIT;
11950 		page_unlock(pp);
11951 	}
11952 
11953 	return (error);
11954 }
11955 
11956 static void
11957 do_nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
11958 	cred_t *cr)
11959 {
11960 
11961 	(void) nfs4_sync_commit(vp, plist, offset, count, cr);
11962 }
11963 
11964 /*ARGSUSED*/
11965 static int
11966 nfs4_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr)
11967 {
11968 	int		error = 0;
11969 	mntinfo4_t	*mi;
11970 	vattr_t		va;
11971 	vsecattr_t	nfsace4_vsap;
11972 
11973 	mi = VTOMI4(vp);
11974 	if (nfs_zone() != mi->mi_zone)
11975 		return (EIO);
11976 	if (mi->mi_flags & MI4_ACL) {
11977 		/* if we have a delegation, return it */
11978 		if (VTOR4(vp)->r_deleg_type != OPEN_DELEGATE_NONE)
11979 			(void) nfs4delegreturn(VTOR4(vp),
11980 					NFS4_DR_REOPEN|NFS4_DR_PUSH);
11981 
11982 		error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask,
11983 			NFS4_ACL_SET);
11984 		if (error) /* EINVAL */
11985 			return (error);
11986 
11987 		if (vsecattr->vsa_mask & (VSA_ACL | VSA_DFACL)) {
11988 			/*
11989 			 * These are aclent_t type entries.
11990 			 */
11991 			error = vs_aent_to_ace4(vsecattr, &nfsace4_vsap,
11992 			    vp->v_type == VDIR, FALSE);
11993 			if (error)
11994 				return (error);
11995 		} else {
11996 			/*
11997 			 * These are ace_t type entries.
11998 			 */
11999 			error = vs_acet_to_ace4(vsecattr, &nfsace4_vsap,
12000 			    FALSE);
12001 			if (error)
12002 				return (error);
12003 		}
12004 		bzero(&va, sizeof (va));
12005 		error = nfs4setattr(vp, &va, flag, cr, &nfsace4_vsap);
12006 		vs_ace4_destroy(&nfsace4_vsap);
12007 		return (error);
12008 	}
12009 	return (ENOSYS);
12010 }
12011 
12012 static int
12013 nfs4_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr)
12014 {
12015 	int		error;
12016 	mntinfo4_t	*mi;
12017 	nfs4_ga_res_t	gar;
12018 	rnode4_t	*rp = VTOR4(vp);
12019 
12020 	mi = VTOMI4(vp);
12021 	if (nfs_zone() != mi->mi_zone)
12022 		return (EIO);
12023 
12024 	bzero(&gar, sizeof (gar));
12025 	gar.n4g_vsa.vsa_mask = vsecattr->vsa_mask;
12026 
12027 	/*
12028 	 * vsecattr->vsa_mask holds the original acl request mask.
12029 	 * This is needed when determining what to return.
12030 	 * (See: nfs4_create_getsecattr_return())
12031 	 */
12032 	error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask, NFS4_ACL_GET);
12033 	if (error) /* EINVAL */
12034 		return (error);
12035 
12036 	if (mi->mi_flags & MI4_ACL) {
12037 		/*
12038 		 * Check if the data is cached and the cache is valid.  If it
12039 		 * is we don't go over the wire.
12040 		 */
12041 		if (rp->r_secattr != NULL && ATTRCACHE4_VALID(vp)) {
12042 			mutex_enter(&rp->r_statelock);
12043 			if (rp->r_secattr != NULL) {
12044 				error = nfs4_create_getsecattr_return(
12045 				    rp->r_secattr, vsecattr, rp->r_attr.va_uid,
12046 				    rp->r_attr.va_gid,
12047 				    vp->v_type == VDIR);
12048 				if (!error) { /* error == 0 - Success! */
12049 					mutex_exit(&rp->r_statelock);
12050 					return (error);
12051 				}
12052 			}
12053 			mutex_exit(&rp->r_statelock);
12054 		}
12055 
12056 		/*
12057 		 * The getattr otw call will always get both the acl, in
12058 		 * the form of a list of nfsace4's, and the number of acl
12059 		 * entries; independent of the value of gar.n4g_vsa.vsa_mask.
12060 		 */
12061 		gar.n4g_va.va_mask = AT_ALL;
12062 		error =  nfs4_getattr_otw(vp, &gar, cr, 1);
12063 		if (error) {
12064 			vs_ace4_destroy(&gar.n4g_vsa);
12065 			if (error == ENOTSUP || error == EOPNOTSUPP)
12066 				error = fs_fab_acl(vp, vsecattr, flag, cr);
12067 			return (error);
12068 		}
12069 
12070 		if (!(gar.n4g_resbmap & FATTR4_ACL_MASK)) {
12071 			/*
12072 			 * No error was returned, but according to the response
12073 			 * bitmap, neither was an acl.
12074 			 */
12075 			vs_ace4_destroy(&gar.n4g_vsa);
12076 			error = fs_fab_acl(vp, vsecattr, flag, cr);
12077 			return (error);
12078 		}
12079 
12080 		/*
12081 		 * Update the cache with the ACL.
12082 		 */
12083 		nfs4_acl_fill_cache(rp, &gar.n4g_vsa);
12084 
12085 		error = nfs4_create_getsecattr_return(&gar.n4g_vsa,
12086 		    vsecattr, gar.n4g_va.va_uid, gar.n4g_va.va_gid,
12087 		    vp->v_type == VDIR);
12088 		vs_ace4_destroy(&gar.n4g_vsa);
12089 		if ((error) && (vsecattr->vsa_mask &
12090 		    (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) &&
12091 		    (error != EACCES)) {
12092 			error = fs_fab_acl(vp, vsecattr, flag, cr);
12093 		}
12094 		return (error);
12095 	}
12096 	error = fs_fab_acl(vp, vsecattr, flag, cr);
12097 	return (error);
12098 }
12099 
12100 /*
12101  * The function returns:
12102  * 	- 0 (zero) if the passed in "acl_mask" is a valid request.
12103  * 	- EINVAL if the passed in "acl_mask" is an invalid request.
12104  *
12105  * In the case of getting an acl (op == NFS4_ACL_GET) the mask is invalid if:
12106  * - We have a mixture of ACE and ACL requests (e.g. VSA_ACL | VSA_ACE)
12107  *
12108  * In the case of setting an acl (op == NFS4_ACL_SET) the mask is invalid if:
12109  * - We have a mixture of ACE and ACL requests (e.g. VSA_ACL | VSA_ACE)
12110  * - We have a count field set without the corresponding acl field set. (e.g. -
12111  * VSA_ACECNT is set, but VSA_ACE is not)
12112  */
12113 static int
12114 nfs4_is_acl_mask_valid(uint_t acl_mask, nfs4_acl_op_t op)
12115 {
12116 	/* Shortcut the masks that are always valid. */
12117 	if (acl_mask == (VSA_ACE | VSA_ACECNT))
12118 		return (0);
12119 	if (acl_mask == (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT))
12120 		return (0);
12121 
12122 	if (acl_mask & (VSA_ACE | VSA_ACECNT)) {
12123 		/*
12124 		 * We can't have any VSA_ACL type stuff in the mask now.
12125 		 */
12126 		if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL |
12127 		    VSA_DFACLCNT))
12128 			return (EINVAL);
12129 
12130 		if (op == NFS4_ACL_SET) {
12131 			if ((acl_mask & VSA_ACECNT) && !(acl_mask & VSA_ACE))
12132 				return (EINVAL);
12133 		}
12134 	}
12135 
12136 	if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) {
12137 		/*
12138 		 * We can't have any VSA_ACE type stuff in the mask now.
12139 		 */
12140 		if (acl_mask & (VSA_ACE | VSA_ACECNT))
12141 			return (EINVAL);
12142 
12143 		if (op == NFS4_ACL_SET) {
12144 			if ((acl_mask & VSA_ACLCNT) && !(acl_mask & VSA_ACL))
12145 				return (EINVAL);
12146 
12147 			if ((acl_mask & VSA_DFACLCNT) &&
12148 			    !(acl_mask & VSA_DFACL))
12149 				return (EINVAL);
12150 		}
12151 	}
12152 	return (0);
12153 }
12154 
12155 /*
12156  * The theory behind creating the correct getsecattr return is simply this:
12157  * "Don't return anything that the caller is not expecting to have to free."
12158  */
12159 static int
12160 nfs4_create_getsecattr_return(vsecattr_t *filled_vsap, vsecattr_t *vsap,
12161 	uid_t uid, gid_t gid, int isdir)
12162 {
12163 	int error = 0;
12164 	/* Save the mask since the translators modify it. */
12165 	uint_t	orig_mask = vsap->vsa_mask;
12166 
12167 	if (orig_mask & (VSA_ACE | VSA_ACECNT)) {
12168 		error = vs_ace4_to_acet(filled_vsap, vsap, uid, gid,
12169 		    FALSE, ((orig_mask & VSA_ACE) ? FALSE : TRUE));
12170 
12171 		if (error)
12172 			return (error);
12173 
12174 		/*
12175 		 * If the caller only asked for the ace count (VSA_ACECNT)
12176 		 * don't give them the full acl (VSA_ACE), free it.
12177 		 */
12178 		if (!orig_mask & VSA_ACE) {
12179 			if (vsap->vsa_aclentp != NULL) {
12180 				kmem_free(vsap->vsa_aclentp,
12181 				    vsap->vsa_aclcnt * sizeof (ace_t));
12182 				vsap->vsa_aclentp = NULL;
12183 			}
12184 		}
12185 		vsap->vsa_mask = orig_mask;
12186 
12187 	} else if (orig_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL |
12188 	    VSA_DFACLCNT)) {
12189 		error = vs_ace4_to_aent(filled_vsap, vsap, uid, gid,
12190 		    isdir, FALSE,
12191 		    ((orig_mask & (VSA_ACL | VSA_DFACL)) ? FALSE : TRUE));
12192 
12193 		if (error)
12194 			return (error);
12195 
12196 		/*
12197 		 * If the caller only asked for the acl count (VSA_ACLCNT)
12198 		 * and/or the default acl count (VSA_DFACLCNT) don't give them
12199 		 * the acl (VSA_ACL) or default acl (VSA_DFACL), free it.
12200 		 */
12201 		if (!orig_mask & VSA_ACL) {
12202 			if (vsap->vsa_aclentp != NULL) {
12203 				kmem_free(vsap->vsa_aclentp,
12204 				    vsap->vsa_aclcnt * sizeof (aclent_t));
12205 				vsap->vsa_aclentp = NULL;
12206 			}
12207 		}
12208 
12209 		if (!orig_mask & VSA_DFACL) {
12210 			if (vsap->vsa_dfaclentp != NULL) {
12211 				kmem_free(vsap->vsa_dfaclentp,
12212 				    vsap->vsa_dfaclcnt * sizeof (aclent_t));
12213 				vsap->vsa_dfaclentp = NULL;
12214 			}
12215 		}
12216 		vsap->vsa_mask = orig_mask;
12217 	}
12218 	return (0);
12219 }
12220 
12221 static int
12222 nfs4_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr)
12223 {
12224 	int error;
12225 
12226 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
12227 		return (EIO);
12228 	/*
12229 	 * check for valid cmd parameter
12230 	 */
12231 	if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS)
12232 		return (EINVAL);
12233 
12234 	/*
12235 	 * Check access permissions
12236 	 */
12237 	if ((cmd & F_SHARE) &&
12238 	    (((shr->s_access & F_RDACC) && (flag & FREAD) == 0) ||
12239 	    (shr->s_access == F_WRACC && (flag & FWRITE) == 0)))
12240 		return (EBADF);
12241 
12242 	/*
12243 	 * If the filesystem is mounted using local locking, pass the
12244 	 * request off to the local share code.
12245 	 */
12246 	if (VTOMI4(vp)->mi_flags & MI4_LLOCK)
12247 		return (fs_shrlock(vp, cmd, shr, flag, cr));
12248 
12249 	switch (cmd) {
12250 	case F_SHARE:
12251 	case F_UNSHARE:
12252 		/*
12253 		 * This will be properly implemented later,
12254 		 * see RFE: 4823948 .
12255 		 */
12256 		error = EAGAIN;
12257 		break;
12258 
12259 	case F_HASREMOTELOCKS:
12260 		/*
12261 		 * NFS client can't store remote locks itself
12262 		 */
12263 		shr->s_access = 0;
12264 		error = 0;
12265 		break;
12266 
12267 	default:
12268 		error = EINVAL;
12269 		break;
12270 	}
12271 
12272 	return (error);
12273 }
12274 
12275 /*
12276  * Common code called by directory ops to update the attrcache
12277  */
12278 static int
12279 nfs4_update_attrcache(nfsstat4 status, nfs4_ga_res_t *garp,
12280 	hrtime_t t, vnode_t *vp, cred_t *cr)
12281 {
12282 	int error = 0;
12283 
12284 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12285 
12286 	if (status != NFS4_OK) {
12287 		/* getattr not done or failed */
12288 		PURGE_ATTRCACHE4(vp);
12289 		return (error);
12290 	}
12291 
12292 	if (garp) {
12293 		nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL);
12294 	} else {
12295 		PURGE_ATTRCACHE4(vp);
12296 	}
12297 	return (error);
12298 }
12299 
12300 /*
12301  * Update directory caches for directory modification ops (link, rename, etc.)
12302  * When dinfo is NULL, manage dircaches in the old way.
12303  */
12304 static void
12305 nfs4_update_dircaches(change_info4 *cinfo, vnode_t *dvp, vnode_t *vp, char *nm,
12306 		dirattr_info_t *dinfo)
12307 {
12308 	rnode4_t	*drp = VTOR4(dvp);
12309 
12310 	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
12311 
12312 	/* Purge rddir cache for dir since it changed */
12313 	if (drp->r_dir != NULL)
12314 		nfs4_purge_rddir_cache(dvp);
12315 
12316 	/*
12317 	 * If caller provided dinfo, then use it to manage dir caches.
12318 	 */
12319 	if (dinfo != NULL) {
12320 		if (vp != NULL) {
12321 			mutex_enter(&VTOR4(vp)->r_statev4_lock);
12322 			if (!VTOR4(vp)->created_v4) {
12323 				mutex_exit(&VTOR4(vp)->r_statev4_lock);
12324 				dnlc_update(dvp, nm, vp);
12325 			} else {
12326 				/*
12327 				 * XXX don't update if the created_v4 flag is
12328 				 * set
12329 				 */
12330 				mutex_exit(&VTOR4(vp)->r_statev4_lock);
12331 				NFS4_DEBUG(nfs4_client_state_debug,
12332 					(CE_NOTE, "nfs4_update_dircaches: "
12333 					"don't update dnlc: created_v4 flag"));
12334 			}
12335 		}
12336 
12337 		nfs4_attr_cache(dvp, dinfo->di_garp, dinfo->di_time_call,
12338 				dinfo->di_cred, FALSE, cinfo);
12339 
12340 		return;
12341 	}
12342 
12343 	/*
12344 	 * Caller didn't provide dinfo, then check change_info4 to update DNLC.
12345 	 * Since caller modified dir but didn't receive post-dirmod-op dir
12346 	 * attrs, the dir's attrs must be purged.
12347 	 *
12348 	 * XXX this check and dnlc update/purge should really be atomic,
12349 	 * XXX but can't use rnode statelock because it'll deadlock in
12350 	 * XXX dnlc_purge_vp, however, the risk is minimal even if a race
12351 	 * XXX does occur.
12352 	 *
12353 	 * XXX We also may want to check that atomic is true in the
12354 	 * XXX change_info struct. If it is not, the change_info may
12355 	 * XXX reflect changes by more than one clients which means that
12356 	 * XXX our cache may not be valid.
12357 	 */
12358 	PURGE_ATTRCACHE4(dvp);
12359 	if (drp->r_change == cinfo->before) {
12360 		/* no changes took place in the directory prior to our link */
12361 		if (vp != NULL) {
12362 			mutex_enter(&VTOR4(vp)->r_statev4_lock);
12363 			if (!VTOR4(vp)->created_v4) {
12364 				mutex_exit(&VTOR4(vp)->r_statev4_lock);
12365 				dnlc_update(dvp, nm, vp);
12366 			} else {
12367 				/*
12368 				 * XXX dont' update if the created_v4 flag
12369 				 * is set
12370 				 */
12371 				mutex_exit(&VTOR4(vp)->r_statev4_lock);
12372 				NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
12373 					"nfs4_update_dircaches: don't"
12374 					" update dnlc: created_v4 flag"));
12375 			}
12376 		}
12377 	} else {
12378 		/* Another client modified directory - purge its dnlc cache */
12379 		dnlc_purge_vp(dvp);
12380 	}
12381 }
12382 
12383 /*
12384  * The OPEN_CONFIRM operation confirms the sequence number used in OPENing a
12385  * file.
12386  *
12387  * The 'reopening_file' boolean should be set to TRUE if we are reopening this
12388  * file (ie: client recovery) and otherwise set to FALSE.
12389  *
12390  * 'nfs4_start/end_op' should have been called by the proper (ie: not recovery
12391  * initiated) calling functions.
12392  *
12393  * 'resend' is set to TRUE if this is a OPEN_CONFIRM issued as a result
12394  * of resending a 'lost' open request.
12395  *
12396  * 'num_bseqid_retryp' makes sure we don't loop forever on a broken
12397  * server that hands out BAD_SEQID on open confirm.
12398  *
12399  * Errors are returned via the nfs4_error_t parameter.
12400  */
12401 void
12402 nfs4open_confirm(vnode_t *vp, seqid4 *seqid, stateid4 *stateid, cred_t *cr,
12403 	bool_t reopening_file, bool_t *retry_open, nfs4_open_owner_t *oop,
12404 	bool_t resend, nfs4_error_t *ep, int *num_bseqid_retryp)
12405 {
12406 	COMPOUND4args_clnt args;
12407 	COMPOUND4res_clnt res;
12408 	nfs_argop4 argop[2];
12409 	nfs_resop4 *resop;
12410 	int doqueue = 1;
12411 	mntinfo4_t *mi;
12412 	OPEN_CONFIRM4args *open_confirm_args;
12413 	int needrecov;
12414 
12415 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12416 #if DEBUG
12417 	mutex_enter(&oop->oo_lock);
12418 	ASSERT(oop->oo_seqid_inuse);
12419 	mutex_exit(&oop->oo_lock);
12420 #endif
12421 
12422 recov_retry_confirm:
12423 	nfs4_error_zinit(ep);
12424 	*retry_open = FALSE;
12425 
12426 	if (resend)
12427 		args.ctag = TAG_OPEN_CONFIRM_LOST;
12428 	else
12429 		args.ctag = TAG_OPEN_CONFIRM;
12430 
12431 	args.array_len = 2;
12432 	args.array = argop;
12433 
12434 	/* putfh target fh */
12435 	argop[0].argop = OP_CPUTFH;
12436 	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
12437 
12438 	argop[1].argop = OP_OPEN_CONFIRM;
12439 	open_confirm_args = &argop[1].nfs_argop4_u.opopen_confirm;
12440 
12441 	(*seqid) += 1;
12442 	open_confirm_args->seqid = *seqid;
12443 	open_confirm_args->open_stateid = *stateid;
12444 
12445 	mi = VTOMI4(vp);
12446 
12447 	rfs4call(mi, &args, &res, cr, &doqueue, 0, ep);
12448 
12449 	if (!ep->error && nfs4_need_to_bump_seqid(&res)) {
12450 		nfs4_set_open_seqid((*seqid), oop, args.ctag);
12451 	}
12452 
12453 	needrecov = nfs4_needs_recovery(ep, FALSE, mi->mi_vfsp);
12454 	if (!needrecov && ep->error)
12455 		return;
12456 
12457 	if (needrecov) {
12458 		bool_t abort = FALSE;
12459 
12460 		if (reopening_file == FALSE) {
12461 			nfs4_bseqid_entry_t *bsep = NULL;
12462 
12463 			if (!ep->error && res.status == NFS4ERR_BAD_SEQID)
12464 				bsep = nfs4_create_bseqid_entry(oop, NULL,
12465 					vp, 0, args.ctag,
12466 					open_confirm_args->seqid);
12467 
12468 			abort = nfs4_start_recovery(ep, VTOMI4(vp), vp,
12469 				    NULL, NULL, NULL, OP_OPEN_CONFIRM, bsep);
12470 			if (bsep) {
12471 				kmem_free(bsep, sizeof (*bsep));
12472 				if (num_bseqid_retryp &&
12473 				    --(*num_bseqid_retryp) == 0)
12474 					abort = TRUE;
12475 			}
12476 		}
12477 		if ((ep->error == ETIMEDOUT ||
12478 					res.status == NFS4ERR_RESOURCE) &&
12479 					abort == FALSE && resend == FALSE) {
12480 			if (!ep->error)
12481 				(void) xdr_free(xdr_COMPOUND4res_clnt,
12482 								(caddr_t)&res);
12483 
12484 			delay(SEC_TO_TICK(confirm_retry_sec));
12485 			goto recov_retry_confirm;
12486 		}
12487 		/* State may have changed so retry the entire OPEN op */
12488 		if (abort == FALSE)
12489 			*retry_open = TRUE;
12490 		else
12491 			*retry_open = FALSE;
12492 		if (!ep->error)
12493 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
12494 		return;
12495 	}
12496 
12497 	if (res.status) {
12498 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
12499 		return;
12500 	}
12501 
12502 	resop = &res.array[1];  /* open confirm res */
12503 	bcopy(&resop->nfs_resop4_u.opopen_confirm.open_stateid,
12504 				stateid, sizeof (*stateid));
12505 
12506 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
12507 }
12508 
12509 /*
12510  * Return the credentials associated with a client state object.  The
12511  * caller is responsible for freeing the credentials.
12512  */
12513 
12514 static cred_t *
12515 state_to_cred(nfs4_open_stream_t *osp)
12516 {
12517 	cred_t *cr;
12518 
12519 	/*
12520 	 * It's ok to not lock the open stream and open owner to get
12521 	 * the oo_cred since this is only written once (upon creation)
12522 	 * and will not change.
12523 	 */
12524 	cr = osp->os_open_owner->oo_cred;
12525 	crhold(cr);
12526 
12527 	return (cr);
12528 }
12529 
12530 /*
12531  * nfs4_find_sysid
12532  *
12533  * Find the sysid for the knetconfig associated with the given mi.
12534  */
12535 static struct lm_sysid *
12536 nfs4_find_sysid(mntinfo4_t *mi)
12537 {
12538 	ASSERT(nfs_zone() == mi->mi_zone);
12539 
12540 	/*
12541 	 * Switch from RDMA knconf to original mount knconf
12542 	 */
12543 	return (lm_get_sysid(ORIG_KNCONF(mi), &mi->mi_curr_serv->sv_addr,
12544 		    mi->mi_curr_serv->sv_hostname, NULL));
12545 }
12546 
#ifdef DEBUG
/*
 * Translate a lock call type into a printable name for debug output.
 * An unknown call type is reported through cmn_err(CE_PANIC, ...).
 */
static char *
nfs4frlock_get_call_type(nfs4_lock_call_type_t ctype)
{
	char *name;

	if (ctype == NFS4_LCK_CTYPE_NORM) {
		name = "NORMAL";
	} else if (ctype == NFS4_LCK_CTYPE_RECLAIM) {
		name = "RECLAIM";
	} else if (ctype == NFS4_LCK_CTYPE_RESEND) {
		name = "RESEND";
	} else if (ctype == NFS4_LCK_CTYPE_REINSTATE) {
		name = "REINSTATE";
	} else {
		cmn_err(CE_PANIC, "nfs4frlock_get_call_type: got illegal "
		    "type %d", ctype);
		name = "";
	}

	return (name);
}
#endif
12570 
12571 /*
12572  * Map the frlock cmd and lock type to the NFSv4 over-the-wire lock type
12573  * Unlock requests don't have an over-the-wire locktype, so we just return
12574  * something non-threatening.
12575  */
12576 
12577 static nfs_lock_type4
12578 flk_to_locktype(int cmd, int l_type)
12579 {
12580 	ASSERT(l_type == F_RDLCK || l_type == F_WRLCK || l_type == F_UNLCK);
12581 
12582 	switch (l_type) {
12583 	case F_UNLCK:
12584 		return (READ_LT);
12585 	case F_RDLCK:
12586 		if (cmd == F_SETLK)
12587 			return (READ_LT);
12588 		else
12589 			return (READW_LT);
12590 	case F_WRLCK:
12591 		if (cmd == F_SETLK)
12592 			return (WRITE_LT);
12593 		else
12594 			return (WRITEW_LT);
12595 	}
12596 	panic("flk_to_locktype");
12597 	/*NOTREACHED*/
12598 }
12599 
12600 /*
12601  * Do some preliminary checks for nfs4frlock.
12602  */
12603 static int
12604 nfs4frlock_validate_args(int cmd, flock64_t *flk, int flag, vnode_t *vp,
12605 	u_offset_t offset)
12606 {
12607 	int error = 0;
12608 
12609 	/*
12610 	 * If we are setting a lock, check that the file is opened
12611 	 * with the correct mode.
12612 	 */
12613 	if (cmd == F_SETLK || cmd == F_SETLKW) {
12614 		if ((flk->l_type == F_RDLCK && (flag & FREAD) == 0) ||
12615 		    (flk->l_type == F_WRLCK && (flag & FWRITE) == 0)) {
12616 			NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
12617 			    "nfs4frlock_validate_args: file was opened with "
12618 			    "incorrect mode"));
12619 			return (EBADF);
12620 		}
12621 	}
12622 
12623 	/* Convert the offset. It may need to be restored before returning. */
12624 	if (error = convoff(vp, flk, 0, offset)) {
12625 		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
12626 		    "nfs4frlock_validate_args: convoff  =>  error= %d\n",
12627 		    error));
12628 		return (error);
12629 	}
12630 
12631 	return (error);
12632 }
12633 
12634 /*
12635  * Set the flock64's lm_sysid for nfs4frlock.
12636  */
12637 static int
12638 nfs4frlock_get_sysid(struct lm_sysid **lspp, vnode_t *vp, flock64_t *flk)
12639 {
12640 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12641 
12642 	/* Find the lm_sysid */
12643 	*lspp = nfs4_find_sysid(VTOMI4(vp));
12644 
12645 	if (*lspp == NULL) {
12646 		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
12647 		    "nfs4frlock_get_sysid: no sysid, return ENOLCK"));
12648 		return (ENOLCK);
12649 	}
12650 
12651 	flk->l_sysid = lm_sysidt(*lspp);
12652 
12653 	return (0);
12654 }
12655 
12656 /*
12657  * Do the remaining preliminary setup for nfs4frlock.
12658  */
12659 static void
12660 nfs4frlock_pre_setup(clock_t *tick_delayp, nfs4_recov_state_t *recov_statep,
12661 	flock64_t *flk, short *whencep, vnode_t *vp, cred_t *search_cr,
12662 	cred_t **cred_otw)
12663 {
12664 	/*
12665 	 * set tick_delay to the base delay time.
12666 	 * (NFS4_BASE_WAIT_TIME is in secs)
12667 	 */
12668 
12669 	*tick_delayp = drv_usectohz(NFS4_BASE_WAIT_TIME * 1000 * 1000);
12670 
12671 	/*
12672 	 * If lock is relative to EOF, we need the newest length of the
12673 	 * file. Therefore invalidate the ATTR_CACHE.
12674 	 */
12675 
12676 	*whencep = flk->l_whence;
12677 
12678 	if (*whencep == 2)		/* SEEK_END */
12679 		PURGE_ATTRCACHE4(vp);
12680 
12681 	recov_statep->rs_flags = 0;
12682 	recov_statep->rs_num_retry_despite_err = 0;
12683 	*cred_otw = nfs4_get_otw_cred(search_cr, VTOMI4(vp), NULL);
12684 }
12685 
12686 /*
12687  * Initialize and allocate the data structures necessary for
12688  * the nfs4frlock call.
12689  * Allocates argsp's op array, frees up the saved_rqstpp if there is one.
12690  */
12691 static void
12692 nfs4frlock_call_init(COMPOUND4args_clnt *argsp, COMPOUND4args_clnt **argspp,
12693 	nfs_argop4 **argopp, nfs4_op_hint_t *op_hintp, flock64_t *flk, int cmd,
12694 	bool_t *retry, bool_t *did_start_fop, COMPOUND4res_clnt **respp,
12695 	bool_t *skip_get_err, nfs4_lost_rqst_t *lost_rqstp)
12696 {
12697 	int		argoplist_size;
12698 	int		num_ops = 2;
12699 
12700 	*retry = FALSE;
12701 	*did_start_fop = FALSE;
12702 	*skip_get_err = FALSE;
12703 	lost_rqstp->lr_op = 0;
12704 	argoplist_size  = num_ops * sizeof (nfs_argop4);
12705 	/* fill array with zero */
12706 	*argopp = kmem_zalloc(argoplist_size, KM_SLEEP);
12707 
12708 	*argspp = argsp;
12709 	*respp = NULL;
12710 
12711 	argsp->array_len = num_ops;
12712 	argsp->array = *argopp;
12713 
12714 	/* initialize in case of error; will get real value down below */
12715 	argsp->ctag = TAG_NONE;
12716 
12717 	if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK)
12718 		*op_hintp = OH_LOCKU;
12719 	else
12720 		*op_hintp = OH_OTHER;
12721 }
12722 
12723 /*
12724  * Call the nfs4_start_fop() for nfs4frlock, if necessary.  Assign
12725  * the proper nfs4_server_t for this instance of nfs4frlock.
12726  * Returns 0 (success) or an errno value.
12727  */
12728 static int
12729 nfs4frlock_start_call(nfs4_lock_call_type_t ctype, vnode_t *vp,
12730 	nfs4_op_hint_t op_hint, nfs4_recov_state_t *recov_statep,
12731 	bool_t *did_start_fop, bool_t *startrecovp)
12732 {
12733 	int error = 0;
12734 	rnode4_t *rp;
12735 
12736 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12737 
12738 	if (ctype == NFS4_LCK_CTYPE_NORM) {
12739 		error = nfs4_start_fop(VTOMI4(vp), vp, NULL, op_hint,
12740 				recov_statep, startrecovp);
12741 		if (error)
12742 			return (error);
12743 		*did_start_fop = TRUE;
12744 	} else {
12745 		*did_start_fop = FALSE;
12746 		*startrecovp = FALSE;
12747 	}
12748 
12749 	if (!error) {
12750 		rp = VTOR4(vp);
12751 
12752 		/* If the file failed recovery, just quit. */
12753 		mutex_enter(&rp->r_statelock);
12754 		if (rp->r_flags & R4RECOVERR) {
12755 			error = EIO;
12756 		}
12757 		mutex_exit(&rp->r_statelock);
12758 	}
12759 
12760 	return (error);
12761 }
12762 
12763 /*
12764  * Setup the LOCK4/LOCKU4 arguments for resending a lost lock request.  A
12765  * resend nfs4frlock call is initiated by the recovery framework.
12766  * Acquires the lop and oop seqid synchronization.
12767  */
12768 static void
12769 nfs4frlock_setup_resend_lock_args(nfs4_lost_rqst_t *resend_rqstp,
12770 	COMPOUND4args_clnt *argsp, nfs_argop4 *argop, nfs4_lock_owner_t **lopp,
12771 	nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp,
12772 	LOCK4args **lock_argsp, LOCKU4args **locku_argsp)
12773 {
12774 	mntinfo4_t *mi = VTOMI4(resend_rqstp->lr_vp);
12775 	int error;
12776 
12777 	NFS4_DEBUG((nfs4_lost_rqst_debug || nfs4_client_lock_debug),
12778 		(CE_NOTE,
12779 	    "nfs4frlock_setup_resend_lock_args: have lost lock to resend"));
12780 	ASSERT(resend_rqstp != NULL);
12781 	ASSERT(resend_rqstp->lr_op == OP_LOCK ||
12782 	    resend_rqstp->lr_op == OP_LOCKU);
12783 
12784 	*oopp = resend_rqstp->lr_oop;
12785 	if (resend_rqstp->lr_oop) {
12786 		open_owner_hold(resend_rqstp->lr_oop);
12787 		error = nfs4_start_open_seqid_sync(resend_rqstp->lr_oop, mi);
12788 		ASSERT(error == 0);	/* recov thread always succeeds */
12789 	}
12790 
12791 	/* Must resend this lost lock/locku request. */
12792 	ASSERT(resend_rqstp->lr_lop != NULL);
12793 	*lopp = resend_rqstp->lr_lop;
12794 	lock_owner_hold(resend_rqstp->lr_lop);
12795 	error = nfs4_start_lock_seqid_sync(resend_rqstp->lr_lop, mi);
12796 	ASSERT(error == 0);	/* recov thread always succeeds */
12797 
12798 	*ospp = resend_rqstp->lr_osp;
12799 	if (*ospp)
12800 		open_stream_hold(resend_rqstp->lr_osp);
12801 
12802 	if (resend_rqstp->lr_op == OP_LOCK) {
12803 		LOCK4args *lock_args;
12804 
12805 		argop->argop = OP_LOCK;
12806 		*lock_argsp = lock_args = &argop->nfs_argop4_u.oplock;
12807 		lock_args->locktype = resend_rqstp->lr_locktype;
12808 		lock_args->reclaim =
12809 			(resend_rqstp->lr_ctype == NFS4_LCK_CTYPE_RECLAIM);
12810 		lock_args->offset = resend_rqstp->lr_flk->l_start;
12811 		lock_args->length = resend_rqstp->lr_flk->l_len;
12812 		if (lock_args->length == 0)
12813 			lock_args->length = ~lock_args->length;
12814 		nfs4_setup_lock_args(*lopp, *oopp, *ospp,
12815 				mi2clientid(mi), &lock_args->locker);
12816 
12817 		switch (resend_rqstp->lr_ctype) {
12818 		case NFS4_LCK_CTYPE_RESEND:
12819 			argsp->ctag = TAG_LOCK_RESEND;
12820 			break;
12821 		case NFS4_LCK_CTYPE_REINSTATE:
12822 			argsp->ctag = TAG_LOCK_REINSTATE;
12823 			break;
12824 		case NFS4_LCK_CTYPE_RECLAIM:
12825 			argsp->ctag = TAG_LOCK_RECLAIM;
12826 			break;
12827 		default:
12828 			argsp->ctag = TAG_LOCK_UNKNOWN;
12829 			break;
12830 		}
12831 	} else {
12832 		LOCKU4args *locku_args;
12833 		nfs4_lock_owner_t *lop = resend_rqstp->lr_lop;
12834 
12835 		argop->argop = OP_LOCKU;
12836 		*locku_argsp = locku_args = &argop->nfs_argop4_u.oplocku;
12837 		locku_args->locktype = READ_LT;
12838 		locku_args->seqid = lop->lock_seqid + 1;
12839 		mutex_enter(&lop->lo_lock);
12840 		locku_args->lock_stateid = lop->lock_stateid;
12841 		mutex_exit(&lop->lo_lock);
12842 		locku_args->offset = resend_rqstp->lr_flk->l_start;
12843 		locku_args->length = resend_rqstp->lr_flk->l_len;
12844 		if (locku_args->length == 0)
12845 			locku_args->length = ~locku_args->length;
12846 
12847 		switch (resend_rqstp->lr_ctype) {
12848 		case NFS4_LCK_CTYPE_RESEND:
12849 			argsp->ctag = TAG_LOCKU_RESEND;
12850 			break;
12851 		case NFS4_LCK_CTYPE_REINSTATE:
12852 			argsp->ctag = TAG_LOCKU_REINSTATE;
12853 			break;
12854 		default:
12855 			argsp->ctag = TAG_LOCK_UNKNOWN;
12856 			break;
12857 		}
12858 	}
12859 }
12860 
12861 /*
12862  * Setup the LOCKT4 arguments.
12863  */
12864 static void
12865 nfs4frlock_setup_lockt_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop,
12866 	LOCKT4args **lockt_argsp, COMPOUND4args_clnt *argsp, flock64_t *flk,
12867 	rnode4_t *rp)
12868 {
12869 	LOCKT4args *lockt_args;
12870 
12871 	ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone);
12872 	ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
12873 	argop->argop = OP_LOCKT;
12874 	argsp->ctag = TAG_LOCKT;
12875 	lockt_args = &argop->nfs_argop4_u.oplockt;
12876 
12877 	/*
12878 	 * The locktype will be READ_LT unless it's
12879 	 * a write lock. We do this because the Solaris
12880 	 * system call allows the combination of
12881 	 * F_UNLCK and F_GETLK* and so in that case the
12882 	 * unlock is mapped to a read.
12883 	 */
12884 	if (flk->l_type == F_WRLCK)
12885 		lockt_args->locktype = WRITE_LT;
12886 	else
12887 		lockt_args->locktype = READ_LT;
12888 
12889 	lockt_args->owner.clientid = mi2clientid(VTOMI4(RTOV4(rp)));
12890 	/* set the lock owner4 args */
12891 	nfs4_setlockowner_args(&lockt_args->owner, rp,
12892 	    ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pidp->pid_id :
12893 	    flk->l_pid);
12894 	lockt_args->offset = flk->l_start;
12895 	lockt_args->length = flk->l_len;
12896 	if (flk->l_len == 0)
12897 		lockt_args->length = ~lockt_args->length;
12898 
12899 	*lockt_argsp = lockt_args;
12900 }
12901 
12902 /*
12903  * If the client is holding a delegation, and the open stream to be used
12904  * with this lock request is a delegation open stream, then re-open the stream.
12905  * Sets the nfs4_error_t to all zeros unless the open stream has already
12906  * failed a reopen or we couldn't find the open stream.  NFS4ERR_DELAY
12907  * means the caller should retry (like a recovery retry).
12908  */
12909 static void
12910 nfs4frlock_check_deleg(vnode_t *vp, nfs4_error_t *ep, cred_t *cr, int lt)
12911 {
12912 	open_delegation_type4	dt;
12913 	bool_t			reopen_needed, force;
12914 	nfs4_open_stream_t	*osp;
12915 	open_claim_type4 	oclaim;
12916 	rnode4_t		*rp = VTOR4(vp);
12917 	mntinfo4_t		*mi = VTOMI4(vp);
12918 
12919 	ASSERT(nfs_zone() == mi->mi_zone);
12920 
12921 	nfs4_error_zinit(ep);
12922 
12923 	mutex_enter(&rp->r_statev4_lock);
12924 	dt = rp->r_deleg_type;
12925 	mutex_exit(&rp->r_statev4_lock);
12926 
12927 	if (dt != OPEN_DELEGATE_NONE) {
12928 		nfs4_open_owner_t	*oop;
12929 
12930 		oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
12931 		if (!oop) {
12932 			ep->stat = NFS4ERR_IO;
12933 			return;
12934 		}
12935 		/* returns with 'os_sync_lock' held */
12936 		osp = find_open_stream(oop, rp);
12937 		if (!osp) {
12938 			open_owner_rele(oop);
12939 			ep->stat = NFS4ERR_IO;
12940 			return;
12941 		}
12942 
12943 		if (osp->os_failed_reopen) {
12944 			NFS4_DEBUG((nfs4_open_stream_debug ||
12945 				    nfs4_client_lock_debug), (CE_NOTE,
12946 			    "nfs4frlock_check_deleg: os_failed_reopen set "
12947 			    "for osp %p, cr %p, rp %s", (void *)osp,
12948 			    (void *)cr, rnode4info(rp)));
12949 			mutex_exit(&osp->os_sync_lock);
12950 			open_stream_rele(osp, rp);
12951 			open_owner_rele(oop);
12952 			ep->stat = NFS4ERR_IO;
12953 			return;
12954 		}
12955 
12956 		/*
12957 		 * Determine whether a reopen is needed.  If this
12958 		 * is a delegation open stream, then send the open
12959 		 * to the server to give visibility to the open owner.
12960 		 * Even if it isn't a delegation open stream, we need
12961 		 * to check if the previous open CLAIM_DELEGATE_CUR
12962 		 * was sufficient.
12963 		 */
12964 
12965 		reopen_needed = osp->os_delegation ||
12966 		    ((lt == F_RDLCK &&
12967 			!(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_READ)) ||
12968 		    (lt == F_WRLCK &&
12969 			!(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_WRITE)));
12970 
12971 		mutex_exit(&osp->os_sync_lock);
12972 		open_owner_rele(oop);
12973 
12974 		if (reopen_needed) {
12975 			/*
12976 			 * Always use CLAIM_PREVIOUS after server reboot.
12977 			 * The server will reject CLAIM_DELEGATE_CUR if
12978 			 * it is used during the grace period.
12979 			 */
12980 			mutex_enter(&mi->mi_lock);
12981 			if (mi->mi_recovflags & MI4R_SRV_REBOOT) {
12982 				oclaim = CLAIM_PREVIOUS;
12983 				force = TRUE;
12984 			} else {
12985 				oclaim = CLAIM_DELEGATE_CUR;
12986 				force = FALSE;
12987 			}
12988 			mutex_exit(&mi->mi_lock);
12989 
12990 			nfs4_reopen(vp, osp, ep, oclaim, force, FALSE);
12991 			if (ep->error == EAGAIN) {
12992 				nfs4_error_zinit(ep);
12993 				ep->stat = NFS4ERR_DELAY;
12994 			}
12995 		}
12996 		open_stream_rele(osp, rp);
12997 		osp = NULL;
12998 	}
12999 }
13000 
13001 /*
13002  * Setup the LOCKU4 arguments.
13003  * Returns errors via the nfs4_error_t.
13004  * NFS4_OK		no problems.  *go_otwp is TRUE if call should go
13005  *			over-the-wire.  The caller must release the
13006  *			reference on *lopp.
13007  * NFS4ERR_DELAY	caller should retry (like recovery retry)
13008  * (other)		unrecoverable error.
13009  */
13010 static void
13011 nfs4frlock_setup_locku_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop,
13012 	LOCKU4args **locku_argsp, flock64_t *flk,
13013 	nfs4_lock_owner_t **lopp, nfs4_error_t *ep, COMPOUND4args_clnt *argsp,
13014 	vnode_t *vp, int flag, u_offset_t offset, cred_t *cr,
13015 	bool_t *skip_get_err, bool_t *go_otwp)
13016 {
13017 	nfs4_lock_owner_t	*lop = NULL;
13018 	LOCKU4args		*locku_args;
13019 	pid_t			pid;
13020 	bool_t			is_spec = FALSE;
13021 	rnode4_t		*rp = VTOR4(vp);
13022 
13023 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13024 	ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
13025 
13026 	nfs4frlock_check_deleg(vp, ep, cr, F_UNLCK);
13027 	if (ep->error || ep->stat)
13028 		return;
13029 
13030 	argop->argop = OP_LOCKU;
13031 	if (ctype == NFS4_LCK_CTYPE_REINSTATE)
13032 		argsp->ctag = TAG_LOCKU_REINSTATE;
13033 	else
13034 		argsp->ctag = TAG_LOCKU;
13035 	locku_args = &argop->nfs_argop4_u.oplocku;
13036 	*locku_argsp = locku_args;
13037 
13038 	/*
13039 	 * XXX what should locku_args->locktype be?
13040 	 * setting to ALWAYS be READ_LT so at least
13041 	 * it is a valid locktype.
13042 	 */
13043 
13044 	locku_args->locktype = READ_LT;
13045 
13046 	pid = ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pidp->pid_id :
13047 		flk->l_pid;
13048 
13049 	/*
13050 	 * Get the lock owner stateid.  If no lock owner
13051 	 * exists, return success.
13052 	 */
13053 	lop = find_lock_owner(rp, pid, LOWN_ANY);
13054 	*lopp = lop;
13055 	if (lop && CLNT_ISSPECIAL(&lop->lock_stateid))
13056 		is_spec = TRUE;
13057 	if (!lop || is_spec) {
13058 		/*
13059 		 * No lock owner so no locks to unlock.
13060 		 * Return success.  If there was a failed
13061 		 * reclaim earlier, the lock might still be
13062 		 * registered with the local locking code,
13063 		 * so notify it of the unlock.
13064 		 *
13065 		 * If the lockowner is using a special stateid,
13066 		 * then the original lock request (that created
13067 		 * this lockowner) was never successful, so we
13068 		 * have no lock to undo OTW.
13069 		 */
13070 		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
13071 			"nfs4frlock_setup_locku_args: LOCKU: no lock owner "
13072 			"(%ld) so return success", (long)pid));
13073 
13074 		if (ctype == NFS4_LCK_CTYPE_NORM)
13075 			flk->l_pid = curproc->p_pid;
13076 		nfs4_register_lock_locally(vp, flk, flag, offset);
13077 		/*
13078 		 * Release our hold and NULL out so final_cleanup
13079 		 * doesn't try to end a lock seqid sync we
13080 		 * never started.
13081 		 */
13082 		if (is_spec) {
13083 			lock_owner_rele(lop);
13084 			*lopp = NULL;
13085 		}
13086 		*skip_get_err = TRUE;
13087 		*go_otwp = FALSE;
13088 		return;
13089 	}
13090 
13091 	ep->error = nfs4_start_lock_seqid_sync(lop, VTOMI4(vp));
13092 	if (ep->error == EAGAIN) {
13093 		lock_owner_rele(lop);
13094 		*lopp = NULL;
13095 		return;
13096 	}
13097 
13098 	mutex_enter(&lop->lo_lock);
13099 	locku_args->lock_stateid = lop->lock_stateid;
13100 	mutex_exit(&lop->lo_lock);
13101 	locku_args->seqid = lop->lock_seqid + 1;
13102 
13103 	/* leave the ref count on lop, rele after RPC call */
13104 
13105 	locku_args->offset = flk->l_start;
13106 	locku_args->length = flk->l_len;
13107 	if (flk->l_len == 0)
13108 		locku_args->length = ~locku_args->length;
13109 
13110 	*go_otwp = TRUE;
13111 }
13112 
13113 /*
13114  * Setup the LOCK4 arguments.
13115  *
13116  * Returns errors via the nfs4_error_t.
13117  * NFS4_OK		no problems
13118  * NFS4ERR_DELAY	caller should retry (like recovery retry)
13119  * (other)		unrecoverable error
13120  */
13121 static void
13122 nfs4frlock_setup_lock_args(nfs4_lock_call_type_t ctype, LOCK4args **lock_argsp,
13123 	nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp,
13124 	nfs4_lock_owner_t **lopp, nfs_argop4 *argop, COMPOUND4args_clnt *argsp,
13125 	flock64_t *flk, int cmd, vnode_t *vp, cred_t *cr, nfs4_error_t *ep)
13126 {
13127 	LOCK4args		*lock_args;
13128 	nfs4_open_owner_t	*oop = NULL;
13129 	nfs4_open_stream_t	*osp = NULL;
13130 	nfs4_lock_owner_t	*lop = NULL;
13131 	pid_t			pid;
13132 	rnode4_t		*rp = VTOR4(vp);
13133 
13134 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13135 
13136 	nfs4frlock_check_deleg(vp, ep, cr, flk->l_type);
13137 	if (ep->error || ep->stat != NFS4_OK)
13138 		return;
13139 
13140 	argop->argop = OP_LOCK;
13141 	if (ctype == NFS4_LCK_CTYPE_NORM)
13142 		argsp->ctag = TAG_LOCK;
13143 	else if (ctype == NFS4_LCK_CTYPE_RECLAIM)
13144 		argsp->ctag = TAG_RELOCK;
13145 	else
13146 		argsp->ctag = TAG_LOCK_REINSTATE;
13147 	lock_args = &argop->nfs_argop4_u.oplock;
13148 	lock_args->locktype = flk_to_locktype(cmd, flk->l_type);
13149 	lock_args->reclaim = ctype == NFS4_LCK_CTYPE_RECLAIM ? 1 : 0;
13150 	/*
13151 	 * Get the lock owner.  If no lock owner exists,
13152 	 * create a 'temporary' one and grab the open seqid
13153 	 * synchronization (which puts a hold on the open
13154 	 * owner and open stream).
13155 	 * This also grabs the lock seqid synchronization.
13156 	 */
13157 	pid = ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pid : flk->l_pid;
13158 	ep->stat =
13159 		nfs4_find_or_create_lock_owner(pid, rp, cr, &oop, &osp, &lop);
13160 
13161 	if (ep->stat != NFS4_OK)
13162 		goto out;
13163 
13164 	nfs4_setup_lock_args(lop, oop, osp, mi2clientid(VTOMI4(vp)),
13165 			&lock_args->locker);
13166 
13167 	lock_args->offset = flk->l_start;
13168 	lock_args->length = flk->l_len;
13169 	if (flk->l_len == 0)
13170 		lock_args->length = ~lock_args->length;
13171 	*lock_argsp = lock_args;
13172 out:
13173 	*oopp = oop;
13174 	*ospp = osp;
13175 	*lopp = lop;
13176 }
13177 
13178 /*
13179  * After we get the reply from the server, record the proper information
13180  * for possible resend lock requests.
13181  *
13182  * Allocates memory for the saved_rqstp if we have a lost lock to save.
13183  */
13184 static void
13185 nfs4frlock_save_lost_rqst(nfs4_lock_call_type_t ctype, int error,
13186 	nfs_lock_type4 locktype, nfs4_open_owner_t *oop,
13187 	nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk,
13188 	nfs4_lost_rqst_t *lost_rqstp, cred_t *cr, vnode_t *vp)
13189 {
13190 	bool_t unlock = (flk->l_type == F_UNLCK);
13191 
13192 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13193 	ASSERT(ctype == NFS4_LCK_CTYPE_NORM ||
13194 	    ctype == NFS4_LCK_CTYPE_REINSTATE);
13195 
13196 	if (error != 0 && !unlock) {
13197 		NFS4_DEBUG((nfs4_lost_rqst_debug ||
13198 			    nfs4_client_lock_debug), (CE_NOTE,
13199 		    "nfs4frlock_save_lost_rqst: set lo_pending_rqsts to 1 "
13200 		    " for lop %p", (void *)lop));
13201 		ASSERT(lop != NULL);
13202 		mutex_enter(&lop->lo_lock);
13203 		lop->lo_pending_rqsts = 1;
13204 		mutex_exit(&lop->lo_lock);
13205 	}
13206 
13207 	lost_rqstp->lr_putfirst = FALSE;
13208 	lost_rqstp->lr_op = 0;
13209 
13210 	/*
13211 	 * For lock/locku requests, we treat EINTR as ETIMEDOUT for
13212 	 * recovery purposes so that the lock request that was sent
13213 	 * can be saved and re-issued later.  Ditto for EIO from a forced
13214 	 * unmount.  This is done to have the client's local locking state
13215 	 * match the v4 server's state; that is, the request was
13216 	 * potentially received and accepted by the server but the client
13217 	 * thinks it was not.
13218 	 */
13219 	if (error == ETIMEDOUT || error == EINTR ||
13220 	    NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
13221 		NFS4_DEBUG((nfs4_lost_rqst_debug ||
13222 			    nfs4_client_lock_debug), (CE_NOTE,
13223 		    "nfs4frlock_save_lost_rqst: got a lost %s lock for "
13224 		    "lop %p oop %p osp %p", unlock ? "LOCKU" : "LOCK",
13225 		    (void *)lop, (void *)oop, (void *)osp));
13226 		if (unlock)
13227 			lost_rqstp->lr_op = OP_LOCKU;
13228 		else {
13229 			lost_rqstp->lr_op = OP_LOCK;
13230 			lost_rqstp->lr_locktype = locktype;
13231 		}
13232 		/*
13233 		 * Objects are held and rele'd via the recovery code.
13234 		 * See nfs4_save_lost_rqst.
13235 		 */
13236 		lost_rqstp->lr_vp = vp;
13237 		lost_rqstp->lr_dvp = NULL;
13238 		lost_rqstp->lr_oop = oop;
13239 		lost_rqstp->lr_osp = osp;
13240 		lost_rqstp->lr_lop = lop;
13241 		lost_rqstp->lr_cr = cr;
13242 		switch (ctype) {
13243 		case NFS4_LCK_CTYPE_NORM:
13244 			flk->l_pid = ttoproc(curthread)->p_pid;
13245 			lost_rqstp->lr_ctype = NFS4_LCK_CTYPE_RESEND;
13246 			break;
13247 		case NFS4_LCK_CTYPE_REINSTATE:
13248 			lost_rqstp->lr_putfirst = TRUE;
13249 			lost_rqstp->lr_ctype = ctype;
13250 			break;
13251 		default:
13252 			break;
13253 		}
13254 		lost_rqstp->lr_flk = flk;
13255 	}
13256 }
13257 
13258 /*
13259  * Update lop's seqid.  Also update the seqid stored in a resend request,
13260  * if any.  (Some recovery errors increment the seqid, and we may have to
13261  * send the resend request again.)
13262  */
13263 
13264 static void
13265 nfs4frlock_bump_seqid(LOCK4args *lock_args, LOCKU4args *locku_args,
13266     nfs4_open_owner_t *oop, nfs4_lock_owner_t *lop, nfs4_tag_type_t tag_type)
13267 {
13268 	if (lock_args) {
13269 		if (lock_args->locker.new_lock_owner == TRUE)
13270 			nfs4_get_and_set_next_open_seqid(oop, tag_type);
13271 		else {
13272 			ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);
13273 			nfs4_set_lock_seqid(lop->lock_seqid + 1, lop);
13274 		}
13275 	} else if (locku_args) {
13276 		ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);
13277 		nfs4_set_lock_seqid(lop->lock_seqid +1, lop);
13278 	}
13279 }
13280 
13281 /*
13282  * Calls nfs4_end_fop, drops the seqid syncs, and frees up the
13283  * COMPOUND4 args/res for calls that need to retry.
13284  * Switches the *cred_otwp to base_cr.
13285  */
13286 static void
13287 nfs4frlock_check_access(vnode_t *vp, nfs4_op_hint_t op_hint,
13288     nfs4_recov_state_t *recov_statep, int needrecov, bool_t *did_start_fop,
13289     COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, int error,
13290     nfs4_lock_owner_t **lopp, nfs4_open_owner_t **oopp,
13291     nfs4_open_stream_t **ospp, cred_t *base_cr, cred_t **cred_otwp)
13292 {
13293 	nfs4_open_owner_t	*oop = *oopp;
13294 	nfs4_open_stream_t	*osp = *ospp;
13295 	nfs4_lock_owner_t	*lop = *lopp;
13296 	nfs_argop4		*argop = (*argspp)->array;
13297 
13298 	if (*did_start_fop) {
13299 		nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep,
13300 			    needrecov);
13301 		*did_start_fop = FALSE;
13302 	}
13303 	ASSERT((*argspp)->array_len == 2);
13304 	if (argop[1].argop == OP_LOCK)
13305 		nfs4args_lock_free(&argop[1]);
13306 	else if (argop[1].argop == OP_LOCKT)
13307 		nfs4args_lockt_free(&argop[1]);
13308 	kmem_free(argop, 2 * sizeof (nfs_argop4));
13309 	if (!error)
13310 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp);
13311 	*argspp = NULL;
13312 	*respp = NULL;
13313 
13314 	if (lop) {
13315 		nfs4_end_lock_seqid_sync(lop);
13316 		lock_owner_rele(lop);
13317 		*lopp = NULL;
13318 	}
13319 
13320 	/* need to free up the reference on osp for lock args */
13321 	if (osp != NULL) {
13322 		open_stream_rele(osp, VTOR4(vp));
13323 		*ospp = NULL;
13324 	}
13325 
13326 	/* need to free up the reference on oop for lock args */
13327 	if (oop != NULL) {
13328 		nfs4_end_open_seqid_sync(oop);
13329 		open_owner_rele(oop);
13330 		*oopp = NULL;
13331 	}
13332 
13333 	crfree(*cred_otwp);
13334 	*cred_otwp = base_cr;
13335 	crhold(*cred_otwp);
13336 }
13337 
13338 /*
13339  * Function to process the client's recovery for nfs4frlock.
13340  * Returns TRUE if we should retry the lock request; FALSE otherwise.
13341  *
13342  * Calls nfs4_end_fop, drops the seqid syncs, and frees up the
13343  * COMPOUND4 args/res for calls that need to retry.
13344  *
13345  * Note: the rp's r_lkserlock is *not* dropped during this path.
13346  */
13347 static bool_t
13348 nfs4frlock_recovery(int needrecov, nfs4_error_t *ep,
13349 	COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp,
13350 	LOCK4args *lock_args, LOCKU4args *locku_args,
13351 	nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp,
13352 	nfs4_lock_owner_t **lopp, rnode4_t *rp, vnode_t *vp,
13353 	nfs4_recov_state_t *recov_statep, nfs4_op_hint_t op_hint,
13354 	bool_t *did_start_fop, nfs4_lost_rqst_t *lost_rqstp, flock64_t *flk)
13355 {
13356 	nfs4_open_owner_t	*oop = *oopp;
13357 	nfs4_open_stream_t	*osp = *ospp;
13358 	nfs4_lock_owner_t	*lop = *lopp;
13359 
13360 	bool_t abort, retry;
13361 
13362 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13363 	ASSERT((*argspp) != NULL);
13364 	ASSERT((*respp) != NULL);
13365 	if (lock_args || locku_args)
13366 		ASSERT(lop != NULL);
13367 
13368 	NFS4_DEBUG((nfs4_client_lock_debug || nfs4_client_recov_debug),
13369 	    (CE_NOTE, "nfs4frlock_recovery: initiating recovery\n"));
13370 
13371 	retry = TRUE;
13372 	abort = FALSE;
13373 	if (needrecov) {
13374 		nfs4_bseqid_entry_t *bsep = NULL;
13375 		nfs_opnum4 op;
13376 
13377 		op = lock_args ? OP_LOCK : locku_args ? OP_LOCKU : OP_LOCKT;
13378 
13379 		if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID) {
13380 			seqid4 seqid;
13381 
13382 			if (lock_args) {
13383 				if (lock_args->locker.new_lock_owner == TRUE)
13384 					seqid = lock_args->locker.locker4_u.
13385 						    open_owner.open_seqid;
13386 				else
13387 					seqid = lock_args->locker.locker4_u.
13388 						    lock_owner.lock_seqid;
13389 			} else if (locku_args) {
13390 				seqid = locku_args->seqid;
13391 			} else {
13392 				seqid = 0;
13393 			}
13394 
13395 			bsep = nfs4_create_bseqid_entry(oop, lop, vp,
13396 				flk->l_pid, (*argspp)->ctag, seqid);
13397 		}
13398 
13399 		abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL,
13400 			    (lost_rqstp && (lost_rqstp->lr_op == OP_LOCK ||
13401 			    lost_rqstp->lr_op == OP_LOCKU)) ? lost_rqstp :
13402 			    NULL, op, bsep);
13403 
13404 		if (bsep)
13405 			kmem_free(bsep, sizeof (*bsep));
13406 	}
13407 
13408 	/*
13409 	 * Return that we do not want to retry the request for 3 cases:
13410 	 * 1. If we received EINTR or are bailing out because of a forced
13411 	 *    unmount, we came into this code path just for the sake of
13412 	 *    initiating recovery, we now need to return the error.
13413 	 * 2. If we have aborted recovery.
13414 	 * 3. We received NFS4ERR_BAD_SEQID.
13415 	 */
13416 	if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp) ||
13417 	    abort == TRUE || (ep->error == 0 && ep->stat == NFS4ERR_BAD_SEQID))
13418 		retry = FALSE;
13419 
13420 	if (*did_start_fop == TRUE) {
13421 		nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep,
13422 		    needrecov);
13423 		*did_start_fop = FALSE;
13424 	}
13425 
13426 	if (retry == TRUE) {
13427 		nfs_argop4	*argop;
13428 
13429 		argop = (*argspp)->array;
13430 		ASSERT((*argspp)->array_len == 2);
13431 
13432 		if (argop[1].argop == OP_LOCK)
13433 			nfs4args_lock_free(&argop[1]);
13434 		else if (argop[1].argop == OP_LOCKT)
13435 			nfs4args_lockt_free(&argop[1]);
13436 		kmem_free(argop, 2 * sizeof (nfs_argop4));
13437 		if (!ep->error)
13438 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp);
13439 		*respp = NULL;
13440 		*argspp = NULL;
13441 	}
13442 
13443 	if (lop != NULL) {
13444 		nfs4_end_lock_seqid_sync(lop);
13445 		lock_owner_rele(lop);
13446 	}
13447 
13448 	*lopp = NULL;
13449 
13450 	/* need to free up the reference on osp for lock args */
13451 	if (osp != NULL) {
13452 		open_stream_rele(osp, rp);
13453 		*ospp = NULL;
13454 	}
13455 
13456 	/* need to free up the reference on oop for lock args */
13457 	if (oop != NULL) {
13458 		nfs4_end_open_seqid_sync(oop);
13459 		open_owner_rele(oop);
13460 		*oopp = NULL;
13461 	}
13462 
13463 	return (retry);
13464 }
13465 
13466 /*
13467  * Handles the succesful reply from the server for nfs4frlock.
13468  */
13469 static void
13470 nfs4frlock_results_ok(nfs4_lock_call_type_t ctype, int cmd, flock64_t *flk,
13471 	vnode_t *vp, int flag, u_offset_t offset,
13472 	nfs4_lost_rqst_t *resend_rqstp)
13473 {
13474 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13475 	if ((cmd == F_SETLK || cmd == F_SETLKW) &&
13476 	    (flk->l_type == F_RDLCK || flk->l_type == F_WRLCK)) {
13477 		if (ctype == NFS4_LCK_CTYPE_NORM) {
13478 			flk->l_pid = ttoproc(curthread)->p_pid;
13479 			/*
13480 			 * We do not register lost locks locally in
13481 			 * the 'resend' case since the user/application
13482 			 * doesn't think we have the lock.
13483 			 */
13484 			ASSERT(!resend_rqstp);
13485 			nfs4_register_lock_locally(vp, flk, flag, offset);
13486 		}
13487 	}
13488 }
13489 
13490 /*
13491  * Handle the DENIED reply from the server for nfs4frlock.
13492  * Returns TRUE if we should retry the request; FALSE otherwise.
13493  *
13494  * Calls nfs4_end_fop, drops the seqid syncs, and frees up the
13495  * COMPOUND4 args/res for calls that need to retry.  Can also
13496  * drop and regrab the r_lkserlock.
13497  */
13498 static bool_t
13499 nfs4frlock_results_denied(nfs4_lock_call_type_t ctype, LOCK4args *lock_args,
13500 	LOCKT4args *lockt_args, nfs4_open_owner_t **oopp,
13501 	nfs4_open_stream_t **ospp, nfs4_lock_owner_t **lopp, int cmd,
13502 	vnode_t *vp, flock64_t *flk, nfs4_op_hint_t op_hint,
13503 	nfs4_recov_state_t *recov_statep, int needrecov,
13504 	COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp,
13505 	clock_t *tick_delayp, short *whencep, int *errorp,
13506 	nfs_resop4 *resop, cred_t *cr, bool_t *did_start_fop,
13507 	bool_t *skip_get_err)
13508 {
13509 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13510 
13511 	if (lock_args) {
13512 		nfs4_open_owner_t	*oop = *oopp;
13513 		nfs4_open_stream_t	*osp = *ospp;
13514 		nfs4_lock_owner_t	*lop = *lopp;
13515 		int			intr;
13516 
13517 		/*
13518 		 * Blocking lock needs to sleep and retry from the request.
13519 		 *
13520 		 * Do not block and wait for 'resend' or 'reinstate'
13521 		 * lock requests, just return the error.
13522 		 *
13523 		 * Note: reclaim requests have cmd == F_SETLK, not F_SETLKW.
13524 		 */
13525 		if (cmd == F_SETLKW) {
13526 			rnode4_t *rp = VTOR4(vp);
13527 			nfs_argop4 *argop = (*argspp)->array;
13528 
13529 			ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
13530 
13531 			nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint,
13532 				recov_statep, needrecov);
13533 			*did_start_fop = FALSE;
13534 			ASSERT((*argspp)->array_len == 2);
13535 			if (argop[1].argop == OP_LOCK)
13536 				nfs4args_lock_free(&argop[1]);
13537 			else if (argop[1].argop == OP_LOCKT)
13538 				nfs4args_lockt_free(&argop[1]);
13539 			kmem_free(argop, 2 * sizeof (nfs_argop4));
13540 			if (*respp)
13541 				(void) xdr_free(xdr_COMPOUND4res_clnt,
13542 							(caddr_t)*respp);
13543 			*argspp = NULL;
13544 			*respp = NULL;
13545 			nfs4_end_lock_seqid_sync(lop);
13546 			lock_owner_rele(lop);
13547 			*lopp = NULL;
13548 			if (osp != NULL) {
13549 				open_stream_rele(osp, rp);
13550 				*ospp = NULL;
13551 			}
13552 			if (oop != NULL) {
13553 				nfs4_end_open_seqid_sync(oop);
13554 				open_owner_rele(oop);
13555 				*oopp = NULL;
13556 			}
13557 
13558 			nfs_rw_exit(&rp->r_lkserlock);
13559 
13560 			intr = nfs4_block_and_wait(tick_delayp, rp);
13561 
13562 			if (intr) {
13563 				(void) nfs_rw_enter_sig(&rp->r_lkserlock,
13564 						RW_WRITER, FALSE);
13565 				*errorp = EINTR;
13566 				return (FALSE);
13567 			}
13568 
13569 			(void) nfs_rw_enter_sig(&rp->r_lkserlock,
13570 					RW_WRITER, FALSE);
13571 
13572 			/*
13573 			 * Make sure we are still safe to lock with
13574 			 * regards to mmapping.
13575 			 */
13576 			if (!nfs4_safelock(vp, flk, cr)) {
13577 				*errorp = EAGAIN;
13578 				return (FALSE);
13579 			}
13580 
13581 			return (TRUE);
13582 		}
13583 		if (ctype == NFS4_LCK_CTYPE_NORM)
13584 			*errorp = EAGAIN;
13585 		*skip_get_err = TRUE;
13586 		flk->l_whence = 0;
13587 		*whencep = 0;
13588 		return (FALSE);
13589 	} else if (lockt_args) {
13590 		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
13591 		    "nfs4frlock_results_denied: OP_LOCKT DENIED"));
13592 
13593 		denied_to_flk(&resop->nfs_resop4_u.oplockt.denied,
13594 			flk, lockt_args);
13595 
13596 		/* according to NLM code */
13597 		*errorp = 0;
13598 		*whencep = 0;
13599 		*skip_get_err = TRUE;
13600 		return (FALSE);
13601 	}
13602 	return (FALSE);
13603 }
13604 
13605 /*
13606  * Handles all NFS4 errors besides NFS4_OK and NFS4ERR_DENIED for nfs4frlock.
13607  */
13608 static void
13609 nfs4frlock_results_default(COMPOUND4res_clnt *resp, int *errorp)
13610 {
13611 	switch (resp->status) {
13612 	case NFS4ERR_ACCESS:
13613 	case NFS4ERR_ADMIN_REVOKED:
13614 	case NFS4ERR_BADHANDLE:
13615 	case NFS4ERR_BAD_RANGE:
13616 	case NFS4ERR_BAD_SEQID:
13617 	case NFS4ERR_BAD_STATEID:
13618 	case NFS4ERR_BADXDR:
13619 	case NFS4ERR_DEADLOCK:
13620 	case NFS4ERR_DELAY:
13621 	case NFS4ERR_EXPIRED:
13622 	case NFS4ERR_FHEXPIRED:
13623 	case NFS4ERR_GRACE:
13624 	case NFS4ERR_INVAL:
13625 	case NFS4ERR_ISDIR:
13626 	case NFS4ERR_LEASE_MOVED:
13627 	case NFS4ERR_LOCK_NOTSUPP:
13628 	case NFS4ERR_LOCK_RANGE:
13629 	case NFS4ERR_MOVED:
13630 	case NFS4ERR_NOFILEHANDLE:
13631 	case NFS4ERR_NO_GRACE:
13632 	case NFS4ERR_OLD_STATEID:
13633 	case NFS4ERR_OPENMODE:
13634 	case NFS4ERR_RECLAIM_BAD:
13635 	case NFS4ERR_RECLAIM_CONFLICT:
13636 	case NFS4ERR_RESOURCE:
13637 	case NFS4ERR_SERVERFAULT:
13638 	case NFS4ERR_STALE:
13639 	case NFS4ERR_STALE_CLIENTID:
13640 	case NFS4ERR_STALE_STATEID:
13641 		return;
13642 	default:
13643 		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
13644 		    "nfs4frlock_results_default: got unrecognizable "
13645 		    "res.status %d", resp->status));
13646 		*errorp = NFS4ERR_INVAL;
13647 	}
13648 }
13649 
13650 /*
13651  * The lock request was successful, so update the client's state.
13652  */
13653 static void
13654 nfs4frlock_update_state(LOCK4args *lock_args, LOCKU4args *locku_args,
13655 	LOCKT4args *lockt_args, nfs_resop4 *resop, nfs4_lock_owner_t *lop,
13656 	vnode_t *vp, flock64_t *flk, cred_t *cr,
13657 	nfs4_lost_rqst_t *resend_rqstp)
13658 {
13659 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13660 
13661 	if (lock_args) {
13662 		LOCK4res *lock_res;
13663 
13664 		lock_res = &resop->nfs_resop4_u.oplock;
13665 		/* update the stateid with server's response */
13666 
13667 		if (lock_args->locker.new_lock_owner == TRUE) {
13668 			mutex_enter(&lop->lo_lock);
13669 			lop->lo_just_created = NFS4_PERM_CREATED;
13670 			mutex_exit(&lop->lo_lock);
13671 		}
13672 
13673 		nfs4_set_lock_stateid(lop, lock_res->LOCK4res_u.lock_stateid);
13674 
13675 		/*
13676 		 * If the lock was the result of a resending a lost
13677 		 * request, we've synched up the stateid and seqid
13678 		 * with the server, but now the server might be out of sync
13679 		 * with what the application thinks it has for locks.
13680 		 * Clean that up here.  It's unclear whether we should do
13681 		 * this even if the filesystem has been forcibly unmounted.
13682 		 * For most servers, it's probably wasted effort, but
13683 		 * RFC3530 lets servers require that unlocks exactly match
13684 		 * the locks that are held.
13685 		 */
13686 		if (resend_rqstp != NULL &&
13687 		    resend_rqstp->lr_ctype != NFS4_LCK_CTYPE_REINSTATE) {
13688 			nfs4_reinstitute_local_lock_state(vp, flk, cr, lop);
13689 		} else {
13690 			flk->l_whence = 0;
13691 		}
13692 	} else if (locku_args) {
13693 		LOCKU4res *locku_res;
13694 
13695 		locku_res = &resop->nfs_resop4_u.oplocku;
13696 
13697 		/* Update the stateid with the server's response */
13698 		nfs4_set_lock_stateid(lop, locku_res->lock_stateid);
13699 	} else if (lockt_args) {
13700 		/* Switch the lock type to express success, see fcntl */
13701 		flk->l_type = F_UNLCK;
13702 		flk->l_whence = 0;
13703 	}
13704 }
13705 
13706 /*
13707  * Do final cleanup before exiting nfs4frlock.
13708  * Calls nfs4_end_fop, drops the seqid syncs, and frees up the
13709  * COMPOUND4 args/res for calls that haven't already.
13710  */
13711 static void
13712 nfs4frlock_final_cleanup(nfs4_lock_call_type_t ctype, COMPOUND4args_clnt *argsp,
13713 	COMPOUND4res_clnt *resp, vnode_t *vp, nfs4_op_hint_t op_hint,
13714 	nfs4_recov_state_t *recov_statep, int needrecov, nfs4_open_owner_t *oop,
13715 	nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk,
13716 	short whence, u_offset_t offset, struct lm_sysid *ls,
13717 	int *errorp, LOCK4args *lock_args, LOCKU4args *locku_args,
13718 	bool_t did_start_fop, bool_t skip_get_err,
13719 	cred_t *cred_otw, cred_t *cred)
13720 {
13721 	mntinfo4_t	*mi = VTOMI4(vp);
13722 	rnode4_t	*rp = VTOR4(vp);
13723 	int		error = *errorp;
13724 	nfs_argop4	*argop;
13725 
13726 	ASSERT(nfs_zone() == mi->mi_zone);
13727 	/*
13728 	 * The client recovery code wants the raw status information,
13729 	 * so don't map the NFS status code to an errno value for
13730 	 * non-normal call types.
13731 	 */
13732 	if (ctype == NFS4_LCK_CTYPE_NORM) {
13733 		if (*errorp == 0 && resp != NULL && skip_get_err == FALSE)
13734 			*errorp = geterrno4(resp->status);
13735 		if (did_start_fop == TRUE)
13736 			nfs4_end_fop(mi, vp, NULL, op_hint, recov_statep,
13737 				needrecov);
13738 
13739 		if (!error && resp && resp->status == NFS4_OK) {
13740 		/*
13741 		 * We've established a new lock on the server, so invalidate
13742 		 * the pages associated with the vnode to get the most up to
13743 		 * date pages from the server after acquiring the lock. We
13744 		 * want to be sure that the read operation gets the newest data.
13745 		 * N.B.
13746 		 * We used to do this in nfs4frlock_results_ok but that doesn't
13747 		 * work since VOP_PUTPAGE can call nfs4_commit which calls
13748 		 * nfs4_start_fop. We flush the pages below after calling
13749 		 * nfs4_end_fop above
13750 		 */
13751 			int error;
13752 
13753 			error = VOP_PUTPAGE(vp, (u_offset_t)0,
13754 						0, B_INVAL, cred);
13755 
13756 			if (error && (error == ENOSPC || error == EDQUOT)) {
13757 				rnode4_t *rp = VTOR4(vp);
13758 
13759 				mutex_enter(&rp->r_statelock);
13760 				if (!rp->r_error)
13761 					rp->r_error = error;
13762 				mutex_exit(&rp->r_statelock);
13763 			}
13764 		}
13765 	}
13766 	if (argsp) {
13767 		ASSERT(argsp->array_len == 2);
13768 		argop = argsp->array;
13769 		if (argop[1].argop == OP_LOCK)
13770 			nfs4args_lock_free(&argop[1]);
13771 		else if (argop[1].argop == OP_LOCKT)
13772 			nfs4args_lockt_free(&argop[1]);
13773 		kmem_free(argop, 2 * sizeof (nfs_argop4));
13774 		if (resp)
13775 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
13776 	}
13777 
13778 	/* free the reference on the lock owner */
13779 	if (lop != NULL) {
13780 		nfs4_end_lock_seqid_sync(lop);
13781 		lock_owner_rele(lop);
13782 	}
13783 
13784 	/* need to free up the reference on osp for lock args */
13785 	if (osp != NULL)
13786 		open_stream_rele(osp, rp);
13787 
13788 	/* need to free up the reference on oop for lock args */
13789 	if (oop != NULL) {
13790 		nfs4_end_open_seqid_sync(oop);
13791 		open_owner_rele(oop);
13792 	}
13793 
13794 	(void) convoff(vp, flk, whence, offset);
13795 
13796 	lm_rel_sysid(ls);
13797 
13798 	/*
13799 	 * Record debug information in the event we get EINVAL.
13800 	 */
13801 	mutex_enter(&mi->mi_lock);
13802 	if (*errorp == EINVAL && (lock_args || locku_args) &&
13803 	    (!(mi->mi_flags & MI4_POSIX_LOCK))) {
13804 		if (!(mi->mi_flags & MI4_LOCK_DEBUG)) {
13805 			zcmn_err(getzoneid(), CE_NOTE,
13806 			    "%s operation failed with "
13807 			    "EINVAL probably since the server, %s,"
13808 			    " doesn't support POSIX style locking",
13809 			    lock_args ? "LOCK" : "LOCKU",
13810 			    mi->mi_curr_serv->sv_hostname);
13811 			mi->mi_flags |= MI4_LOCK_DEBUG;
13812 		}
13813 	}
13814 	mutex_exit(&mi->mi_lock);
13815 
13816 	if (cred_otw)
13817 		crfree(cred_otw);
13818 }
13819 
13820 /*
13821  * This calls the server and the local locking code.
13822  *
13823  * Client locks are registered locally by OR-ing the sysid with
13824  * LM_SYSID_CLIENT. The server registers locks locally using just the sysid.
13825  * We need to distinguish between the two to avoid collision in case one
13826  * machine is used as both client and server.
13827  *
13828  * Blocking lock requests will continually retry to acquire the lock
13829  * forever.
13830  *
13831  * The ctype is defined as follows:
13832  * NFS4_LCK_CTYPE_NORM: normal lock request.
13833  *
13834  * NFS4_LCK_CTYPE_RECLAIM:  bypass the usual calls for synchronizing with client
13835  * recovery, get the pid from flk instead of curproc, and don't reregister
13836  * the lock locally.
13837  *
13838  * NFS4_LCK_CTYPE_RESEND: same as NFS4_LCK_CTYPE_RECLAIM, with the addition
13839  * that we will use the information passed in via resend_rqstp to setup the
13840  * lock/locku request.  This resend is the exact same request as the 'lost
13841  * lock', and is initiated by the recovery framework. A successful resend
13842  * request can initiate one or more reinstate requests.
13843  *
13844  * NFS4_LCK_CTYPE_REINSTATE: same as NFS4_LCK_CTYPE_RESEND, except that it
13845  * does not trigger additional reinstate requests.  This lock call type is
13846  * set for setting the v4 server's locking state back to match what the
13847  * client's local locking state is in the event of a received 'lost lock'.
13848  *
13849  * Errors are returned via the nfs4_error_t parameter.
13850  */
13851 void
13852 nfs4frlock(nfs4_lock_call_type_t ctype, vnode_t *vp, int cmd, flock64_t *flk,
13853 		int flag, u_offset_t offset, cred_t *cr, nfs4_error_t *ep,
13854 		nfs4_lost_rqst_t *resend_rqstp, int *did_reclaimp)
13855 {
13856 	COMPOUND4args_clnt	args, *argsp = NULL;
13857 	COMPOUND4res_clnt	res, *resp = NULL;
13858 	nfs_argop4	*argop;
13859 	nfs_resop4	*resop;
13860 	rnode4_t	*rp;
13861 	int		doqueue = 1;
13862 	clock_t		tick_delay;  /* delay in clock ticks */
13863 	struct lm_sysid	*ls;
13864 	LOCK4args	*lock_args = NULL;
13865 	LOCKU4args	*locku_args = NULL;
13866 	LOCKT4args	*lockt_args = NULL;
13867 	nfs4_open_owner_t *oop = NULL;
13868 	nfs4_open_stream_t *osp = NULL;
13869 	nfs4_lock_owner_t *lop = NULL;
13870 	bool_t		needrecov = FALSE;
13871 	nfs4_recov_state_t recov_state;
13872 	short		whence;
13873 	nfs4_op_hint_t	op_hint;
13874 	nfs4_lost_rqst_t lost_rqst;
13875 	bool_t		retry = FALSE;
13876 	bool_t		did_start_fop = FALSE;
13877 	bool_t		skip_get_err = FALSE;
13878 	cred_t		*cred_otw = NULL;
13879 	bool_t		recovonly;	/* just queue request */
13880 	int		frc_no_reclaim = 0;
13881 #ifdef DEBUG
13882 	char *name;
13883 #endif
13884 
13885 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13886 
13887 #ifdef DEBUG
13888 	name = fn_name(VTOSV(vp)->sv_name);
13889 	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4frlock: "
13890 	    "%s: cmd %d, type %d, offset %llu, start %"PRIx64", "
13891 	    "length %"PRIu64", pid %d, sysid %d, call type %s, "
13892 	    "resend request %s", name, cmd, flk->l_type, offset, flk->l_start,
13893 	    flk->l_len, ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pid :
13894 	    flk->l_pid, flk->l_sysid, nfs4frlock_get_call_type(ctype),
13895 	    resend_rqstp ? "TRUE" : "FALSE"));
13896 	kmem_free(name, MAXNAMELEN);
13897 #endif
13898 
13899 	nfs4_error_zinit(ep);
13900 	ep->error = nfs4frlock_validate_args(cmd, flk, flag, vp, offset);
13901 	if (ep->error)
13902 		return;
13903 	ep->error = nfs4frlock_get_sysid(&ls, vp, flk);
13904 	if (ep->error)
13905 		return;
13906 	nfs4frlock_pre_setup(&tick_delay, &recov_state, flk, &whence,
13907 	    vp, cr, &cred_otw);
13908 
13909 recov_retry:
13910 	nfs4frlock_call_init(&args, &argsp, &argop, &op_hint, flk, cmd,
13911 		&retry, &did_start_fop, &resp, &skip_get_err, &lost_rqst);
13912 	rp = VTOR4(vp);
13913 
13914 	ep->error = nfs4frlock_start_call(ctype, vp, op_hint, &recov_state,
13915 			    &did_start_fop, &recovonly);
13916 
13917 	if (ep->error)
13918 		goto out;
13919 
13920 	if (recovonly) {
13921 		/*
13922 		 * Leave the request for the recovery system to deal with.
13923 		 */
13924 		ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
13925 		ASSERT(cmd != F_GETLK);
13926 		ASSERT(flk->l_type == F_UNLCK);
13927 
13928 		nfs4_error_init(ep, EINTR);
13929 		needrecov = TRUE;
13930 		lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY);
13931 		if (lop != NULL) {
13932 			nfs4frlock_save_lost_rqst(ctype, ep->error, READ_LT,
13933 				NULL, NULL, lop, flk, &lost_rqst, cr, vp);
13934 			(void) nfs4_start_recovery(ep,
13935 				VTOMI4(vp), vp, NULL, NULL,
13936 				(lost_rqst.lr_op == OP_LOCK ||
13937 				lost_rqst.lr_op == OP_LOCKU) ?
13938 				&lost_rqst : NULL, OP_LOCKU, NULL);
13939 			lock_owner_rele(lop);
13940 			lop = NULL;
13941 		}
13942 		flk->l_pid = curproc->p_pid;
13943 		nfs4_register_lock_locally(vp, flk, flag, offset);
13944 		goto out;
13945 	}
13946 
13947 	/* putfh directory fh */
13948 	argop[0].argop = OP_CPUTFH;
13949 	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
13950 
13951 	/*
13952 	 * Set up the over-the-wire arguments and get references to the
13953 	 * open owner, etc.
13954 	 */
13955 
13956 	if (ctype == NFS4_LCK_CTYPE_RESEND ||
13957 	    ctype == NFS4_LCK_CTYPE_REINSTATE) {
13958 		nfs4frlock_setup_resend_lock_args(resend_rqstp, argsp,
13959 			&argop[1], &lop, &oop, &osp, &lock_args, &locku_args);
13960 	} else {
13961 		bool_t go_otw = TRUE;
13962 
13963 		ASSERT(resend_rqstp == NULL);
13964 
13965 		switch (cmd) {
13966 		case F_GETLK:
13967 		case F_O_GETLK:
13968 			nfs4frlock_setup_lockt_args(ctype, &argop[1],
13969 					&lockt_args, argsp, flk, rp);
13970 			break;
13971 		case F_SETLKW:
13972 		case F_SETLK:
13973 			if (flk->l_type == F_UNLCK)
13974 				nfs4frlock_setup_locku_args(ctype,
13975 						&argop[1], &locku_args, flk,
13976 						&lop, ep, argsp,
13977 						vp, flag, offset, cr,
13978 						&skip_get_err, &go_otw);
13979 			else
13980 				nfs4frlock_setup_lock_args(ctype,
13981 					&lock_args, &oop, &osp, &lop, &argop[1],
13982 					argsp, flk, cmd, vp, cr, ep);
13983 
13984 			if (ep->error)
13985 				goto out;
13986 
13987 			switch (ep->stat) {
13988 			case NFS4_OK:
13989 				break;
13990 			case NFS4ERR_DELAY:
13991 				/* recov thread never gets this error */
13992 				ASSERT(resend_rqstp == NULL);
13993 				ASSERT(did_start_fop);
13994 
13995 				nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint,
13996 				    &recov_state, TRUE);
13997 				did_start_fop = FALSE;
13998 				if (argop[1].argop == OP_LOCK)
13999 					nfs4args_lock_free(&argop[1]);
14000 				else if (argop[1].argop == OP_LOCKT)
14001 					nfs4args_lockt_free(&argop[1]);
14002 				kmem_free(argop, 2 * sizeof (nfs_argop4));
14003 				argsp = NULL;
14004 				goto recov_retry;
14005 			default:
14006 				ep->error = EIO;
14007 				goto out;
14008 			}
14009 			break;
14010 		default:
14011 			NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
14012 				"nfs4_frlock: invalid cmd %d", cmd));
14013 			ep->error = EINVAL;
14014 			goto out;
14015 		}
14016 
14017 		if (!go_otw)
14018 			goto out;
14019 	}
14020 
14021 	/* XXX should we use the local reclock as a cache ? */
14022 	/*
14023 	 * Unregister the lock with the local locking code before
14024 	 * contacting the server.  This avoids a potential race where
14025 	 * another process gets notified that it has been granted a lock
14026 	 * before we can unregister ourselves locally.
14027 	 */
14028 	if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK) {
14029 		if (ctype == NFS4_LCK_CTYPE_NORM)
14030 			flk->l_pid = ttoproc(curthread)->p_pid;
14031 		nfs4_register_lock_locally(vp, flk, flag, offset);
14032 	}
14033 
14034 	/*
14035 	 * Send the server the lock request.  Continually loop with a delay
14036 	 * if get error NFS4ERR_DENIED (for blocking locks) or NFS4ERR_GRACE.
14037 	 */
14038 	resp = &res;
14039 
14040 	NFS4_DEBUG((nfs4_client_call_debug || nfs4_client_lock_debug),
14041 	    (CE_NOTE,
14042 	    "nfs4frlock: %s call, rp %s", needrecov ? "recov" : "first",
14043 	    rnode4info(rp)));
14044 
14045 	if (lock_args && frc_no_reclaim) {
14046 		ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM);
14047 		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
14048 		    "nfs4frlock: frc_no_reclaim: clearing reclaim"));
14049 		lock_args->reclaim = FALSE;
14050 		if (did_reclaimp)
14051 			*did_reclaimp = 0;
14052 	}
14053 
14054 	/*
14055 	 * Do the OTW call.
14056 	 */
14057 	rfs4call(VTOMI4(vp), argsp, resp, cred_otw, &doqueue, 0, ep);
14058 
14059 	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
14060 	    "nfs4frlock: error %d, status %d", ep->error, resp->status));
14061 
14062 	needrecov = nfs4_needs_recovery(ep, TRUE, vp->v_vfsp);
14063 	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
14064 	    "nfs4frlock: needrecov %d", needrecov));
14065 
14066 	if (ep->error != 0 && !needrecov && ep->error != EACCES)
14067 		goto out;
14068 
14069 	if (ep->error == 0 && nfs4_need_to_bump_seqid(resp))
14070 		nfs4frlock_bump_seqid(lock_args, locku_args, oop, lop,
14071 		    args.ctag);
14072 
14073 	if ((ep->error == EACCES ||
14074 	    (ep->error == 0 && resp->status == NFS4ERR_ACCESS)) &&
14075 	    cred_otw != cr) {
14076 		nfs4frlock_check_access(vp, op_hint, &recov_state, needrecov,
14077 		    &did_start_fop, &argsp, &resp, ep->error, &lop, &oop, &osp,
14078 		    cr, &cred_otw);
14079 		goto recov_retry;
14080 	}
14081 
14082 	if (needrecov) {
14083 		/*
14084 		 * LOCKT requests don't need to recover from lost
14085 		 * requests since they don't create/modify state.
14086 		 */
14087 		if ((ep->error == EINTR ||
14088 		    NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) &&
14089 		    lockt_args)
14090 			goto out;
14091 		/*
14092 		 * Do not attempt recovery for requests initiated by
14093 		 * the recovery framework.  Let the framework redrive them.
14094 		 */
14095 		if (ctype != NFS4_LCK_CTYPE_NORM)
14096 			goto out;
14097 		else {
14098 			ASSERT(resend_rqstp == NULL);
14099 		}
14100 
14101 		nfs4frlock_save_lost_rqst(ctype, ep->error,
14102 			flk_to_locktype(cmd, flk->l_type),
14103 			oop, osp, lop, flk, &lost_rqst, cred_otw, vp);
14104 
14105 		retry = nfs4frlock_recovery(needrecov, ep, &argsp,
14106 			    &resp, lock_args, locku_args, &oop, &osp, &lop,
14107 			    rp, vp, &recov_state, op_hint, &did_start_fop,
14108 			    cmd != F_GETLK ? &lost_rqst : NULL, flk);
14109 
14110 		if (retry) {
14111 			ASSERT(oop == NULL);
14112 			ASSERT(osp == NULL);
14113 			ASSERT(lop == NULL);
14114 			goto recov_retry;
14115 		}
14116 		goto out;
14117 	}
14118 
14119 	/*
14120 	 * Process the reply.
14121 	 */
14122 	switch (resp->status) {
14123 	case NFS4_OK:
14124 		resop = &resp->array[1];
14125 		nfs4frlock_results_ok(ctype, cmd, flk, vp, flag, offset,
14126 			resend_rqstp);
14127 		/*
14128 		 * Have a successful lock operation, now update state.
14129 		 */
14130 		nfs4frlock_update_state(lock_args, locku_args, lockt_args,
14131 			resop, lop, vp, flk, cr, resend_rqstp);
14132 		break;
14133 
14134 	case NFS4ERR_DENIED:
14135 		resop = &resp->array[1];
14136 		retry = nfs4frlock_results_denied(ctype, lock_args, lockt_args,
14137 				&oop, &osp, &lop, cmd, vp, flk, op_hint,
14138 				&recov_state, needrecov, &argsp, &resp,
14139 				&tick_delay, &whence, &ep->error, resop, cr,
14140 				&did_start_fop, &skip_get_err);
14141 
14142 		if (retry) {
14143 			ASSERT(oop == NULL);
14144 			ASSERT(osp == NULL);
14145 			ASSERT(lop == NULL);
14146 			goto recov_retry;
14147 		}
14148 		break;
14149 	/*
14150 	 * If the server won't let us reclaim, fall-back to trying to lock
14151 	 * the file from scratch. Code elsewhere will check the changeinfo
14152 	 * to ensure the file hasn't been changed.
14153 	 */
14154 	case NFS4ERR_NO_GRACE:
14155 		if (lock_args && lock_args->reclaim == TRUE) {
14156 			ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM);
14157 			NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
14158 			    "nfs4frlock: reclaim: NFS4ERR_NO_GRACE"));
14159 			frc_no_reclaim = 1;
14160 			/* clean up before retrying */
14161 			needrecov = 0;
14162 			(void) nfs4frlock_recovery(needrecov, ep, &argsp, &resp,
14163 			    lock_args, locku_args, &oop, &osp, &lop, rp, vp,
14164 			    &recov_state, op_hint, &did_start_fop, NULL, flk);
14165 			goto recov_retry;
14166 		}
14167 		/* FALLTHROUGH */
14168 
14169 	default:
14170 		nfs4frlock_results_default(resp, &ep->error);
14171 		break;
14172 	}
14173 out:
14174 	/*
14175 	 * Process and cleanup from error.  Make interrupted unlock
14176 	 * requests look successful, since they will be handled by the
14177 	 * client recovery code.
14178 	 */
14179 	nfs4frlock_final_cleanup(ctype, argsp, resp, vp, op_hint, &recov_state,
14180 		needrecov, oop, osp, lop, flk, whence, offset, ls, &ep->error,
14181 		lock_args, locku_args, did_start_fop,
14182 		skip_get_err, cred_otw, cr);
14183 
14184 	if (ep->error == EINTR && flk->l_type == F_UNLCK &&
14185 	    (cmd == F_SETLK || cmd == F_SETLKW))
14186 		ep->error = 0;
14187 }
14188 
14189 /*
14190  * nfs4_safelock:
14191  *
14192  * Return non-zero if the given lock request can be handled without
14193  * violating the constraints on concurrent mapping and locking.
14194  */
14195 
14196 static int
14197 nfs4_safelock(vnode_t *vp, const struct flock64 *bfp, cred_t *cr)
14198 {
14199 	rnode4_t *rp = VTOR4(vp);
14200 	struct vattr va;
14201 	int error;
14202 
14203 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
14204 	ASSERT(rp->r_mapcnt >= 0);
14205 	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock %s: "
14206 		"(%"PRIx64", %"PRIx64"); mapcnt = %ld", bfp->l_type == F_WRLCK ?
14207 		"write" : bfp->l_type == F_RDLCK ? "read" : "unlock",
14208 		bfp->l_start, bfp->l_len, rp->r_mapcnt));
14209 
14210 	if (rp->r_mapcnt == 0)
14211 		return (1);		/* always safe if not mapped */
14212 
14213 	/*
14214 	 * If the file is already mapped and there are locks, then they
14215 	 * should be all safe locks.  So adding or removing a lock is safe
14216 	 * as long as the new request is safe (i.e., whole-file, meaning
14217 	 * length and starting offset are both zero).
14218 	 */
14219 
14220 	if (bfp->l_start != 0 || bfp->l_len != 0) {
14221 		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
14222 			"cannot lock a memory mapped file unless locking the "
14223 			"entire file: start %"PRIx64", len %"PRIx64,
14224 			bfp->l_start, bfp->l_len));
14225 		return (0);
14226 	}
14227 
14228 	/* mandatory locking and mapping don't mix */
14229 	va.va_mask = AT_MODE;
14230 	error = VOP_GETATTR(vp, &va, 0, cr);
14231 	if (error != 0) {
14232 		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
14233 		"getattr error %d", error));
14234 		return (0);		/* treat errors conservatively */
14235 	}
14236 	if (MANDLOCK(vp, va.va_mode)) {
14237 		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
14238 			"cannot mandatory lock and mmap a file"));
14239 		return (0);
14240 	}
14241 
14242 	return (1);
14243 }
14244 
14245 
14246 /*
14247  * Register the lock locally within Solaris.
14248  * As the client, we "or" the sysid with LM_SYSID_CLIENT when
14249  * recording locks locally.
14250  *
14251  * This should handle conflicts/cooperation with NFS v2/v3 since all locks
14252  * are registered locally.
14253  */
14254 void
14255 nfs4_register_lock_locally(vnode_t *vp, struct flock64 *flk, int flag,
14256 	u_offset_t offset)
14257 {
14258 	int oldsysid;
14259 	int error;
14260 #ifdef DEBUG
14261 	char *name;
14262 #endif
14263 
14264 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
14265 
14266 #ifdef DEBUG
14267 	name = fn_name(VTOSV(vp)->sv_name);
14268 	NFS4_DEBUG(nfs4_client_lock_debug,
14269 	    (CE_NOTE, "nfs4_register_lock_locally: %s: type %d, "
14270 	    "start %"PRIx64", length %"PRIx64", pid %ld, sysid %d",
14271 	    name, flk->l_type, flk->l_start, flk->l_len, (long)flk->l_pid,
14272 	    flk->l_sysid));
14273 	kmem_free(name, MAXNAMELEN);
14274 #endif
14275 
14276 	/* register the lock with local locking */
14277 	oldsysid = flk->l_sysid;
14278 	flk->l_sysid |= LM_SYSID_CLIENT;
14279 	error = reclock(vp, flk, SETFLCK, flag, offset, NULL);
14280 #ifdef DEBUG
14281 	if (error != 0) {
14282 		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
14283 			"nfs4_register_lock_locally: could not register with"
14284 			" local locking"));
14285 		NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT,
14286 			"error %d, vp 0x%p, pid %d, sysid 0x%x",
14287 			error, (void *)vp, flk->l_pid, flk->l_sysid));
14288 		NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT,
14289 			"type %d off 0x%" PRIx64 " len 0x%" PRIx64,
14290 			flk->l_type, flk->l_start, flk->l_len));
14291 		(void) reclock(vp, flk, 0, flag, offset, NULL);
14292 		NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT,
14293 			"blocked by pid %d sysid 0x%x type %d "
14294 			"off 0x%" PRIx64 " len 0x%" PRIx64,
14295 			flk->l_pid, flk->l_sysid, flk->l_type, flk->l_start,
14296 			flk->l_len));
14297 	}
14298 #endif
14299 	flk->l_sysid = oldsysid;
14300 }
14301 
14302 /*
14303  * nfs4_lockrelease:
14304  *
14305  * Release any locks on the given vnode that are held by the current
14306  * process.  Also removes the lock owner (if one exists) from the rnode's
14307  * list.
14308  */
14309 static int
14310 nfs4_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr)
14311 {
14312 	flock64_t ld;
14313 	int ret, error;
14314 	rnode4_t *rp;
14315 	nfs4_lock_owner_t *lop;
14316 	nfs4_recov_state_t recov_state;
14317 	mntinfo4_t *mi;
14318 	bool_t possible_orphan = FALSE;
14319 	bool_t recovonly;
14320 
14321 	ASSERT((uintptr_t)vp > KERNELBASE);
14322 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
14323 
14324 	rp = VTOR4(vp);
14325 	mi = VTOMI4(vp);
14326 
14327 	/*
14328 	 * If we have not locked anything then we can
14329 	 * just return since we have no work to do.
14330 	 */
14331 	if (rp->r_lo_head.lo_next_rnode == &rp->r_lo_head) {
14332 		return (0);
14333 	}
14334 
14335 	/*
14336 	 * We need to comprehend that another thread may
14337 	 * kick off recovery and the lock_owner we have stashed
14338 	 * in lop might be invalid so we should NOT cache it
14339 	 * locally!
14340 	 */
14341 	recov_state.rs_flags = 0;
14342 	recov_state.rs_num_retry_despite_err = 0;
14343 	error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state,
14344 			    &recovonly);
14345 	if (error) {
14346 		mutex_enter(&rp->r_statelock);
14347 		rp->r_flags |= R4LODANGLERS;
14348 		mutex_exit(&rp->r_statelock);
14349 		return (error);
14350 	}
14351 
14352 	lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY);
14353 
14354 	/*
14355 	 * Check if the lock owner might have a lock (request was sent but
14356 	 * no response was received).  Also check if there are any remote
14357 	 * locks on the file.  (In theory we shouldn't have to make this
14358 	 * second check if there's no lock owner, but for now we'll be
14359 	 * conservative and do it anyway.)  If either condition is true,
14360 	 * send an unlock for the entire file to the server.
14361 	 *
14362 	 * Note that no explicit synchronization is needed here.  At worst,
14363 	 * flk_has_remote_locks() will return a false positive, in which case
14364 	 * the unlock call wastes time but doesn't harm correctness.
14365 	 */
14366 
14367 	if (lop) {
14368 		mutex_enter(&lop->lo_lock);
14369 		possible_orphan = lop->lo_pending_rqsts;
14370 		mutex_exit(&lop->lo_lock);
14371 		lock_owner_rele(lop);
14372 	}
14373 
14374 	nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0);
14375 
14376 	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
14377 	    "nfs4_lockrelease: possible orphan %d, remote locks %d, for "
14378 	    "lop %p.", possible_orphan, flk_has_remote_locks(vp),
14379 	    (void *)lop));
14380 
14381 	if (possible_orphan || flk_has_remote_locks(vp)) {
14382 		ld.l_type = F_UNLCK;    /* set to unlock entire file */
14383 		ld.l_whence = 0;	/* unlock from start of file */
14384 		ld.l_start = 0;
14385 		ld.l_len = 0;		/* do entire file */
14386 
14387 		ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL, cr);
14388 
14389 		if (ret != 0) {
14390 			/*
14391 			 * If VOP_FRLOCK fails, make sure we unregister
14392 			 * local locks before we continue.
14393 			 */
14394 			ld.l_pid = ttoproc(curthread)->p_pid;
14395 			nfs4_register_lock_locally(vp, &ld, flag, offset);
14396 			NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
14397 				"nfs4_lockrelease: lock release error on vp"
14398 				" %p: error %d.\n", (void *)vp, ret));
14399 		}
14400 	}
14401 
14402 	recov_state.rs_flags = 0;
14403 	recov_state.rs_num_retry_despite_err = 0;
14404 	error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state,
14405 			    &recovonly);
14406 	if (error) {
14407 		mutex_enter(&rp->r_statelock);
14408 		rp->r_flags |= R4LODANGLERS;
14409 		mutex_exit(&rp->r_statelock);
14410 		return (error);
14411 	}
14412 
14413 	/*
14414 	 * So, here we're going to need to retrieve the lock-owner
14415 	 * again (in case recovery has done a switch-a-roo) and
14416 	 * remove it because we can.
14417 	 */
14418 	lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY);
14419 
14420 	if (lop) {
14421 		nfs4_rnode_remove_lock_owner(rp, lop);
14422 		lock_owner_rele(lop);
14423 	}
14424 
14425 	nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0);
14426 	return (0);
14427 }
14428 
14429 /*
14430  * Wait for 'tick_delay' clock ticks.
14431  * Implement exponential backoff until hit the lease_time of this nfs4_server.
14432  * NOTE: lock_lease_time is in seconds.
14433  *
14434  * XXX For future improvements, should implement a waiting queue scheme.
14435  */
14436 static int
14437 nfs4_block_and_wait(clock_t *tick_delay, rnode4_t *rp)
14438 {
14439 	long milliseconds_delay;
14440 	time_t lock_lease_time;
14441 
14442 	/* wait tick_delay clock ticks or siginteruptus */
14443 	if (delay_sig(*tick_delay)) {
14444 		return (EINTR);
14445 	}
14446 	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_block_and_wait: "
14447 		"reissue the lock request: blocked for %ld clock ticks: %ld "
14448 		"milliseconds", *tick_delay, drv_hztousec(*tick_delay) / 1000));
14449 
14450 	/* get the lease time */
14451 	lock_lease_time = r2lease_time(rp);
14452 
14453 	/* drv_hztousec converts ticks to microseconds */
14454 	milliseconds_delay = drv_hztousec(*tick_delay) / 1000;
14455 	if (milliseconds_delay < lock_lease_time * 1000) {
14456 		*tick_delay = 2 * *tick_delay;
14457 		if (drv_hztousec(*tick_delay) > lock_lease_time * 1000 * 1000)
14458 			*tick_delay = drv_usectohz(lock_lease_time*1000*1000);
14459 	}
14460 	return (0);
14461 }
14462 
14463 
/*
 * Initialization entry point for this file.  There is currently nothing
 * to set up, so the body is intentionally empty.
 * NOTE(review): presumably invoked from the NFSv4 client's module setup;
 * the caller is not visible here.
 */
void
nfs4_vnops_init(void)
{
	/* nothing to initialize */
}
14468 
/*
 * Teardown counterpart of nfs4_vnops_init().  There is currently nothing
 * to release, so the body is intentionally empty.
 * NOTE(review): presumably invoked from the NFSv4 client's module
 * teardown; the caller is not visible here.
 */
void
nfs4_vnops_fini(void)
{
	/* nothing to tear down */
}
14473 
14474 /*
14475  * Return a reference to the directory (parent) vnode for a given vnode,
14476  * using the saved pathname information and the directory file handle.  The
14477  * caller is responsible for disposing of the reference.
14478  * Returns zero or an errno value.
14479  *
14480  * Caller should set need_start_op to FALSE if it is the recovery
14481  * thread, or if a start_fop has already been done.  Otherwise, TRUE.
14482  */
14483 int
14484 vtodv(vnode_t *vp, vnode_t **dvpp, cred_t *cr, bool_t need_start_op)
14485 {
14486 	svnode_t *svnp;
14487 	vnode_t *dvp = NULL;
14488 	servinfo4_t *svp;
14489 	nfs4_fname_t *mfname;
14490 	int error;
14491 
14492 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
14493 
14494 	if (vp->v_flag & VROOT) {
14495 		nfs4_sharedfh_t *sfh;
14496 		nfs_fh4 fh;
14497 		mntinfo4_t *mi;
14498 
14499 		ASSERT(vp->v_type == VREG);
14500 
14501 		mi = VTOMI4(vp);
14502 		svp = mi->mi_curr_serv;
14503 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
14504 		fh.nfs_fh4_len = svp->sv_pfhandle.fh_len;
14505 		fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf;
14506 		sfh = sfh4_get(&fh, VTOMI4(vp));
14507 		nfs_rw_exit(&svp->sv_lock);
14508 		mfname = mi->mi_fname;
14509 		fn_hold(mfname);
14510 		dvp = makenfs4node_by_fh(sfh, NULL, &mfname, NULL, mi, cr, 0);
14511 		sfh4_rele(&sfh);
14512 
14513 		if (dvp->v_type == VNON)
14514 			dvp->v_type = VDIR;
14515 		*dvpp = dvp;
14516 		return (0);
14517 	}
14518 
14519 	svnp = VTOSV(vp);
14520 
14521 	if (svnp == NULL) {
14522 		NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
14523 			"shadow node is NULL"));
14524 		return (EINVAL);
14525 	}
14526 
14527 	if (svnp->sv_name == NULL || svnp->sv_dfh == NULL) {
14528 		NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
14529 			"shadow node name or dfh val == NULL"));
14530 		return (EINVAL);
14531 	}
14532 
14533 	error = nfs4_make_dotdot(svnp->sv_dfh, 0, vp, cr, &dvp,
14534 							(int)need_start_op);
14535 	if (error != 0) {
14536 		NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
14537 			"nfs4_make_dotdot returned %d", error));
14538 		return (error);
14539 	}
14540 	if (!dvp) {
14541 		NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
14542 			"nfs4_make_dotdot returned a NULL dvp"));
14543 		return (EIO);
14544 	}
14545 	if (dvp->v_type == VNON)
14546 		dvp->v_type = VDIR;
14547 	ASSERT(dvp->v_type == VDIR);
14548 	if (VTOR4(vp)->r_flags & R4ISXATTR) {
14549 		mutex_enter(&dvp->v_lock);
14550 		dvp->v_flag |= V_XATTRDIR;
14551 		mutex_exit(&dvp->v_lock);
14552 	}
14553 	*dvpp = dvp;
14554 	return (0);
14555 }
14556 
14557 /*
14558  * Copy the (final) component name of vp to fnamep.  maxlen is the maximum
14559  * length that fnamep can accept, including the trailing null.
14560  * Returns 0 if okay, returns an errno value if there was a problem.
14561  */
14562 
14563 int
14564 vtoname(vnode_t *vp, char *fnamep, ssize_t maxlen)
14565 {
14566 	char *fn;
14567 	int err = 0;
14568 	servinfo4_t *svp;
14569 	svnode_t *shvp;
14570 
14571 	/*
14572 	 * If the file being opened has VROOT set, then this is
14573 	 * a "file" mount.  sv_name will not be interesting, so
14574 	 * go back to the servinfo4 to get the original mount
14575 	 * path and strip off all but the final edge.  Otherwise
14576 	 * just return the name from the shadow vnode.
14577 	 */
14578 
14579 	if (vp->v_flag & VROOT) {
14580 
14581 		svp = VTOMI4(vp)->mi_curr_serv;
14582 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
14583 
14584 		fn = strrchr(svp->sv_path, '/');
14585 		if (fn == NULL)
14586 			err = EINVAL;
14587 		else
14588 			fn++;
14589 	} else {
14590 		shvp = VTOSV(vp);
14591 		fn = fn_name(shvp->sv_name);
14592 	}
14593 
14594 	if (err == 0)
14595 		if (strlen(fn) < maxlen)
14596 			(void) strcpy(fnamep, fn);
14597 		else
14598 			err = ENAMETOOLONG;
14599 
14600 	if (vp->v_flag & VROOT)
14601 		nfs_rw_exit(&svp->sv_lock);
14602 	else
14603 		kmem_free(fn, MAXNAMELEN);
14604 
14605 	return (err);
14606 }
14607 
14608 /*
14609  * If the vnode has pages, run the list and check for
14610  * any that are still dangling. We call this function
14611  * before the OTW CLOSE occurs so we can B_INVAL the
14612  * danglers.
14613  */
14614 static int
14615 nfs4_dross_pages(vnode_t *vp)
14616 {
14617 	page_t *pp;
14618 	kmutex_t *vphm;
14619 	rnode4_t *rp;
14620 
14621 	/* make sure we're looking at the master vnode, not a shadow */
14622 	rp = VTOR4(vp);
14623 	if (IS_SHADOW(vp, rp))
14624 		vp = RTOV4(rp);
14625 
14626 	vphm = page_vnode_mutex(vp);
14627 	mutex_enter(vphm);
14628 	if ((pp = vp->v_pages) != NULL) {
14629 		do {
14630 			if (pp->p_fsdata != C_NOCOMMIT) {
14631 				mutex_exit(vphm);
14632 				return (1);
14633 			}
14634 		} while ((pp = pp->p_vpnext) != vp->v_pages);
14635 	}
14636 	mutex_exit(vphm);
14637 
14638 	return (0);
14639 }
14640 
14641 /*
14642  * Bookkeeping for a close that doesn't need to go over the wire.
14643  * *have_lockp is set to 0 if 'os_sync_lock' is released; otherwise
14644  * it is left at 1.
14645  */
14646 void
14647 nfs4close_notw(vnode_t *vp, nfs4_open_stream_t *osp, int *have_lockp)
14648 {
14649 	rnode4_t		*rp;
14650 	mntinfo4_t		*mi;
14651 
14652 	mi = VTOMI4(vp);
14653 	rp = VTOR4(vp);
14654 
14655 	NFS4_DEBUG(nfs4close_notw_debug, (CE_NOTE, "nfs4close_notw: "
14656 	    "rp=%p osp=%p", (void *)rp, (void *)osp));
14657 	ASSERT(nfs_zone() == mi->mi_zone);
14658 	ASSERT(mutex_owned(&osp->os_sync_lock));
14659 	ASSERT(*have_lockp);
14660 
14661 	if (!osp->os_valid ||
14662 	    osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) {
14663 		return;
14664 	}
14665 
14666 	/*
14667 	 * This removes the reference obtained at OPEN; ie,
14668 	 * when the open stream structure was created.
14669 	 *
14670 	 * We don't have to worry about calling 'open_stream_rele'
14671 	 * since we our currently holding a reference to this
14672 	 * open stream which means the count can not go to 0 with
14673 	 * this decrement.
14674 	 */
14675 	ASSERT(osp->os_ref_count >= 2);
14676 	osp->os_ref_count--;
14677 	osp->os_valid = 0;
14678 	mutex_exit(&osp->os_sync_lock);
14679 	*have_lockp = 0;
14680 
14681 	nfs4_dec_state_ref_count(mi);
14682 }
14683 
14684 /*
14685  * Close all remaining open streams on the rnode.  These open streams
14686  * could be here because:
14687  * - The close attempted at either close or delmap failed
14688  * - Some kernel entity did VOP_OPEN but never did VOP_CLOSE
14689  * - Someone did mknod on a regular file but never opened it
14690  */
14691 int
14692 nfs4close_all(vnode_t *vp, cred_t *cr)
14693 {
14694 	nfs4_open_stream_t *osp;
14695 	int error;
14696 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
14697 	rnode4_t *rp;
14698 
14699 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
14700 
14701 	error = 0;
14702 	rp = VTOR4(vp);
14703 
14704 	/*
14705 	 * At this point, all we know is that the last time
14706 	 * someone called vn_rele, the count was 1.  Since then,
14707 	 * the vnode could have been re-activated.  We want to
14708 	 * loop through the open streams and close each one, but
14709 	 * we have to be careful since once we release the rnode
14710 	 * hash bucket lock, someone else is free to come in and
14711 	 * re-activate the rnode and add new open streams.  The
14712 	 * strategy is take the rnode hash bucket lock, verify that
14713 	 * the count is still 1, grab the open stream off the
14714 	 * head of the list and mark it invalid, then release the
14715 	 * rnode hash bucket lock and proceed with that open stream.
14716 	 * This is ok because nfs4close_one() will acquire the proper
14717 	 * open/create to close/destroy synchronization for open
14718 	 * streams, and will ensure that if someone has reopened
14719 	 * the open stream after we've dropped the hash bucket lock
14720 	 * then we'll just simply return without destroying the
14721 	 * open stream.
14722 	 * Repeat until the list is empty.
14723 	 */
14724 
14725 	for (;;) {
14726 
14727 		/* make sure vnode hasn't been reactivated */
14728 		rw_enter(&rp->r_hashq->r_lock, RW_READER);
14729 		mutex_enter(&vp->v_lock);
14730 		if (vp->v_count > 1) {
14731 			mutex_exit(&vp->v_lock);
14732 			rw_exit(&rp->r_hashq->r_lock);
14733 			break;
14734 		}
14735 		/*
14736 		 * Grabbing r_os_lock before releasing v_lock prevents
14737 		 * a window where the rnode/open stream could get
14738 		 * reactivated (and os_force_close set to 0) before we
14739 		 * had a chance to set os_force_close to 1.
14740 		 */
14741 		mutex_enter(&rp->r_os_lock);
14742 		mutex_exit(&vp->v_lock);
14743 
14744 		osp = list_head(&rp->r_open_streams);
14745 		if (!osp) {
14746 			/* nothing left to CLOSE OTW, so return */
14747 			mutex_exit(&rp->r_os_lock);
14748 			rw_exit(&rp->r_hashq->r_lock);
14749 			break;
14750 		}
14751 
14752 		mutex_enter(&rp->r_statev4_lock);
14753 		/* the file can't still be mem mapped */
14754 		ASSERT(rp->r_mapcnt == 0);
14755 		if (rp->created_v4)
14756 			rp->created_v4 = 0;
14757 		mutex_exit(&rp->r_statev4_lock);
14758 
14759 		/*
14760 		 * Grab a ref on this open stream; nfs4close_one
14761 		 * will mark it as invalid
14762 		 */
14763 		mutex_enter(&osp->os_sync_lock);
14764 		osp->os_ref_count++;
14765 		osp->os_force_close = 1;
14766 		mutex_exit(&osp->os_sync_lock);
14767 		mutex_exit(&rp->r_os_lock);
14768 		rw_exit(&rp->r_hashq->r_lock);
14769 
14770 		nfs4close_one(vp, osp, cr, 0, NULL, &e, CLOSE_FORCE, 0, 0, 0);
14771 
14772 		/* Update error if it isn't already non-zero */
14773 		if (error == 0) {
14774 			if (e.error)
14775 				error = e.error;
14776 			else if (e.stat)
14777 				error = geterrno4(e.stat);
14778 		}
14779 
14780 #ifdef	DEBUG
14781 		nfs4close_all_cnt++;
14782 #endif
14783 		/* Release the ref on osp acquired above. */
14784 		open_stream_rele(osp, rp);
14785 
14786 		/* Proceed to the next open stream, if any */
14787 	}
14788 	return (error);
14789 }
14790 
14791 /*
14792  * nfs4close_one - close one open stream for a file if needed.
14793  *
14794  * "close_type" indicates which close path this is:
14795  * CLOSE_NORM: close initiated via VOP_CLOSE.
14796  * CLOSE_DELMAP: close initiated via VOP_DELMAP.
14797  * CLOSE_FORCE: close initiated via VOP_INACTIVE.  This path forces
14798  *	the close and release of client state for this open stream
14799  *	(unless someone else has the open stream open).
14800  * CLOSE_RESEND: indicates the request is a replay of an earlier request
14801  *	(e.g., due to abort because of a signal).
14802  * CLOSE_AFTER_RESEND: close initiated to "undo" a successful resent OPEN.
14803  *
14804  * CLOSE_RESEND and CLOSE_AFTER_RESEND will not attempt to retry after client
14805  * recovery.  Instead, the caller is expected to deal with retries.
14806  *
14807  * The caller can either pass in the osp ('provided_osp') or not.
14808  *
14809  * 'access_bits' represents the access we are closing/downgrading.
14810  *
14811  * 'len', 'maxprot', and 'mmap_flags' are used for CLOSE_DELMAP.  'len' is the
14812  * number of bytes we are unmapping, 'maxprot' is the mmap protection, and
14813  * 'mmap_flags' tells us the type of sharing (MAP_PRIVATE or MAP_SHARED).
14814  *
14815  * Errors are returned via the nfs4_error_t.
14816  */
14817 void
14818 nfs4close_one(vnode_t *vp, nfs4_open_stream_t *provided_osp, cred_t *cr,
14819 	int access_bits, nfs4_lost_rqst_t *lrp, nfs4_error_t *ep,
14820 	nfs4_close_type_t close_type, size_t len, uint_t maxprot,
14821 	uint_t mmap_flags)
14822 {
14823 	nfs4_open_owner_t *oop;
14824 	nfs4_open_stream_t *osp = NULL;
14825 	int retry = 0;
14826 	int num_retries = NFS4_NUM_RECOV_RETRIES;
14827 	rnode4_t *rp;
14828 	mntinfo4_t *mi;
14829 	nfs4_recov_state_t recov_state;
14830 	cred_t *cred_otw = NULL;
14831 	bool_t recovonly = FALSE;
14832 	int isrecov;
14833 	int force_close;
14834 	int close_failed = 0;
14835 	int did_dec_count = 0;
14836 	int did_start_op = 0;
14837 	int did_force_recovlock = 0;
14838 	int did_start_seqid_sync = 0;
14839 	int have_sync_lock = 0;
14840 
14841 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
14842 
14843 	NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE, "closing vp %p osp %p, "
14844 	    "lrp %p, close type %d len %ld prot %x mmap flags %x bits %x",
14845 	    (void *)vp, (void *)provided_osp, (void *)lrp, close_type,
14846 	    len, maxprot, mmap_flags, access_bits));
14847 
14848 	nfs4_error_zinit(ep);
14849 	rp = VTOR4(vp);
14850 	mi = VTOMI4(vp);
14851 	isrecov = (close_type == CLOSE_RESEND ||
14852 			close_type == CLOSE_AFTER_RESEND);
14853 
14854 	/*
14855 	 * First get the open owner.
14856 	 */
14857 	if (!provided_osp) {
14858 		oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
14859 	} else {
14860 		oop = provided_osp->os_open_owner;
14861 		ASSERT(oop != NULL);
14862 		open_owner_hold(oop);
14863 	}
14864 
14865 	if (!oop) {
14866 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
14867 		    "nfs4close_one: no oop, rp %p, mi %p, cr %p, osp %p, "
14868 		    "close type %d", (void *)rp, (void *)mi, (void *)cr,
14869 		    (void *)provided_osp, close_type));
14870 		ep->error = EIO;
14871 		goto out;
14872 	}
14873 
14874 	cred_otw = nfs4_get_otw_cred(cr, mi, oop);
14875 recov_retry:
14876 	osp = NULL;
14877 	close_failed = 0;
14878 	force_close = (close_type == CLOSE_FORCE);
14879 	retry = 0;
14880 	did_start_op = 0;
14881 	did_force_recovlock = 0;
14882 	did_start_seqid_sync = 0;
14883 	have_sync_lock = 0;
14884 	recovonly = FALSE;
14885 	recov_state.rs_flags = 0;
14886 	recov_state.rs_num_retry_despite_err = 0;
14887 
14888 	/*
14889 	 * Second synchronize with recovery.
14890 	 */
14891 	if (!isrecov) {
14892 		ep->error = nfs4_start_fop(mi, vp, NULL, OH_CLOSE,
14893 				&recov_state, &recovonly);
14894 		if (!ep->error) {
14895 			did_start_op = 1;
14896 		} else {
14897 			close_failed = 1;
14898 			/*
14899 			 * If we couldn't get start_fop, but have to
14900 			 * cleanup state, then at least acquire the
14901 			 * mi_recovlock so we can synchronize with
14902 			 * recovery.
14903 			 */
14904 			if (close_type == CLOSE_FORCE) {
14905 				(void) nfs_rw_enter_sig(&mi->mi_recovlock,
14906 					RW_READER, FALSE);
14907 				did_force_recovlock = 1;
14908 			} else
14909 				goto out;
14910 		}
14911 	}
14912 
14913 	/*
14914 	 * We cannot attempt to get the open seqid sync if nfs4_start_fop
14915 	 * set 'recovonly' to TRUE since most likely this is due to
14916 	 * recovery being active (MI4_RECOV_ACTIV).  If recovery is active,
14917 	 * nfs4_start_open_seqid_sync() will fail with EAGAIN asking us
14918 	 * to retry, causing us to loop until recovery finishes.  Plus we
14919 	 * don't need protection over the open seqid since we're not going
14920 	 * OTW, hence don't need to use the seqid.
14921 	 */
14922 	if (recovonly == FALSE) {
14923 		/* need to grab the open owner sync before 'os_sync_lock' */
14924 		ep->error = nfs4_start_open_seqid_sync(oop, mi);
14925 		if (ep->error == EAGAIN) {
14926 			ASSERT(!isrecov);
14927 			if (did_start_op)
14928 				nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
14929 					&recov_state, TRUE);
14930 			if (did_force_recovlock)
14931 				nfs_rw_exit(&mi->mi_recovlock);
14932 			goto recov_retry;
14933 		}
14934 		did_start_seqid_sync = 1;
14935 	}
14936 
14937 	/*
14938 	 * Third get an open stream and acquire 'os_sync_lock' to
14939 	 * synchronize the opening/creating of an open stream with the
14940 	 * closing/destroying of an open stream.
14941 	 */
14942 	if (!provided_osp) {
14943 		/* returns with 'os_sync_lock' held */
14944 		osp = find_open_stream(oop, rp);
14945 		if (!osp) {
14946 			ep->error = EIO;
14947 			goto out;
14948 		}
14949 	} else {
14950 		osp = provided_osp;
14951 		open_stream_hold(osp);
14952 		mutex_enter(&osp->os_sync_lock);
14953 	}
14954 	have_sync_lock = 1;
14955 
14956 	ASSERT(oop == osp->os_open_owner);
14957 
14958 	/*
14959 	 * Fourth, do any special pre-OTW CLOSE processing
14960 	 * based on the specific close type.
14961 	 */
14962 	if ((close_type == CLOSE_NORM || close_type == CLOSE_AFTER_RESEND) &&
14963 	    !did_dec_count) {
14964 		ASSERT(osp->os_open_ref_count > 0);
14965 		osp->os_open_ref_count--;
14966 		did_dec_count = 1;
14967 		if (osp->os_open_ref_count == 0)
14968 			osp->os_final_close = 1;
14969 	}
14970 
14971 	if (close_type == CLOSE_FORCE) {
14972 		/* see if somebody reopened the open stream. */
14973 		if (!osp->os_force_close) {
14974 			NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE,
14975 			    "nfs4close_one: skip CLOSE_FORCE as osp %p "
14976 			    "was reopened, vp %p", (void *)osp, (void *)vp));
14977 			ep->error = 0;
14978 			ep->stat = NFS4_OK;
14979 			goto out;
14980 		}
14981 
14982 		if (!osp->os_final_close && !did_dec_count) {
14983 			osp->os_open_ref_count--;
14984 			did_dec_count = 1;
14985 		}
14986 
14987 		/*
14988 		 * We can't depend on os_open_ref_count being 0 due to the
14989 		 * way executables are opened (VN_RELE to match a VOP_OPEN).
14990 		 */
14991 #ifdef	NOTYET
14992 		ASSERT(osp->os_open_ref_count == 0);
14993 #endif
14994 		if (osp->os_open_ref_count != 0) {
14995 			NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE,
14996 			    "nfs4close_one: should panic here on an "
14997 			    "ASSERT(osp->os_open_ref_count == 0). Ignoring "
14998 			    "since this is probably the exec problem."));
14999 
15000 			osp->os_open_ref_count = 0;
15001 		}
15002 
15003 		/*
15004 		 * There is the possibility that nfs4close_one()
15005 		 * for close_type == CLOSE_DELMAP couldn't find the
15006 		 * open stream, thus couldn't decrement its os_mapcnt;
15007 		 * therefore we can't use this ASSERT yet.
15008 		 */
15009 #ifdef	NOTYET
15010 		ASSERT(osp->os_mapcnt == 0);
15011 #endif
15012 		osp->os_mapcnt = 0;
15013 	}
15014 
15015 	if (close_type == CLOSE_DELMAP && !did_dec_count) {
15016 		ASSERT(osp->os_mapcnt >= btopr(len));
15017 
15018 		if ((mmap_flags & MAP_SHARED) && (maxprot & PROT_WRITE))
15019 			osp->os_mmap_write -= btopr(len);
15020 		if (maxprot & PROT_READ)
15021 			osp->os_mmap_read -= btopr(len);
15022 		if (maxprot & PROT_EXEC)
15023 			osp->os_mmap_read -= btopr(len);
15024 		/* mirror the PROT_NONE check in nfs4_addmap() */
15025 		if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) &&
15026 		    !(maxprot & PROT_EXEC))
15027 			osp->os_mmap_read -= btopr(len);
15028 		osp->os_mapcnt -= btopr(len);
15029 		did_dec_count = 1;
15030 	}
15031 
15032 	if (recovonly) {
15033 		nfs4_lost_rqst_t lost_rqst;
15034 
15035 		/* request should not already be in recovery queue */
15036 		ASSERT(lrp == NULL);
15037 		nfs4_error_init(ep, EINTR);
15038 		nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop,
15039 			osp, cred_otw, vp);
15040 		mutex_exit(&osp->os_sync_lock);
15041 		have_sync_lock = 0;
15042 		(void) nfs4_start_recovery(ep, mi, vp, NULL, NULL,
15043 				lost_rqst.lr_op == OP_CLOSE ?
15044 				&lost_rqst : NULL, OP_CLOSE, NULL);
15045 		close_failed = 1;
15046 		force_close = 0;
15047 		goto close_cleanup;
15048 	}
15049 
15050 	/*
15051 	 * If a previous OTW call got NFS4ERR_BAD_SEQID, then
15052 	 * we stopped operating on the open owner's <old oo_name, old seqid>
15053 	 * space, which means we stopped operating on the open stream
15054 	 * too.  So don't go OTW (as the seqid is likely bad, and the
15055 	 * stateid could be stale, potentially triggering a false
15056 	 * setclientid), and just clean up the client's internal state.
15057 	 */
15058 	if (osp->os_orig_oo_name != oop->oo_name) {
15059 		NFS4_DEBUG(nfs4close_one_debug || nfs4_client_recov_debug,
15060 		    (CE_NOTE, "nfs4close_one: skip OTW close for osp %p "
15061 		    "oop %p due to bad seqid (orig oo_name %" PRIx64 " current "
15062 		    "oo_name %" PRIx64")",
15063 		    (void *)osp, (void *)oop, osp->os_orig_oo_name,
15064 		    oop->oo_name));
15065 		close_failed = 1;
15066 	}
15067 
15068 	/* If the file failed recovery, just quit. */
15069 	mutex_enter(&rp->r_statelock);
15070 	if (rp->r_flags & R4RECOVERR) {
15071 		close_failed = 1;
15072 	}
15073 	mutex_exit(&rp->r_statelock);
15074 
15075 	/*
15076 	 * If the force close path failed to obtain start_fop
15077 	 * then skip the OTW close and just remove the state.
15078 	 */
15079 	if (close_failed)
15080 		goto close_cleanup;
15081 
15082 	/*
15083 	 * Fifth, check to see if there are still mapped pages or other
15084 	 * opens using this open stream.  If there are then we can't
15085 	 * close yet but we can see if an OPEN_DOWNGRADE is necessary.
15086 	 */
15087 	if (osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) {
15088 		nfs4_lost_rqst_t	new_lost_rqst;
15089 		bool_t			needrecov = FALSE;
15090 		cred_t			*odg_cred_otw = NULL;
15091 		seqid4			open_dg_seqid = 0;
15092 
15093 		if (osp->os_delegation) {
15094 			/*
15095 			 * If this open stream was never OPENed OTW then we
15096 			 * surely can't DOWNGRADE it (especially since the
15097 			 * osp->open_stateid is really a delegation stateid
15098 			 * when os_delegation is 1).
15099 			 */
15100 			if (access_bits & FREAD)
15101 				osp->os_share_acc_read--;
15102 			if (access_bits & FWRITE)
15103 				osp->os_share_acc_write--;
15104 			osp->os_share_deny_none--;
15105 			nfs4_error_zinit(ep);
15106 			goto out;
15107 		}
15108 		nfs4_open_downgrade(access_bits, 0, oop, osp, vp, cr,
15109 				lrp, ep, &odg_cred_otw, &open_dg_seqid);
15110 		needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp);
15111 		if (needrecov && !isrecov) {
15112 			bool_t abort;
15113 			nfs4_bseqid_entry_t *bsep = NULL;
15114 
15115 			if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID)
15116 				bsep = nfs4_create_bseqid_entry(oop, NULL,
15117 					vp, 0,
15118 					lrp ? TAG_OPEN_DG_LOST : TAG_OPEN_DG,
15119 					open_dg_seqid);
15120 
15121 			nfs4open_dg_save_lost_rqst(ep->error, &new_lost_rqst,
15122 			    oop, osp, odg_cred_otw, vp, access_bits, 0);
15123 			mutex_exit(&osp->os_sync_lock);
15124 			have_sync_lock = 0;
15125 			abort = nfs4_start_recovery(ep, mi, vp, NULL, NULL,
15126 				    new_lost_rqst.lr_op == OP_OPEN_DOWNGRADE ?
15127 				    &new_lost_rqst : NULL, OP_OPEN_DOWNGRADE,
15128 				    bsep);
15129 			if (odg_cred_otw)
15130 				crfree(odg_cred_otw);
15131 			if (bsep)
15132 				kmem_free(bsep, sizeof (*bsep));
15133 
15134 			if (abort == TRUE)
15135 				goto out;
15136 
15137 			if (did_start_seqid_sync) {
15138 				nfs4_end_open_seqid_sync(oop);
15139 				did_start_seqid_sync = 0;
15140 			}
15141 			open_stream_rele(osp, rp);
15142 
15143 			if (did_start_op)
15144 				nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
15145 					&recov_state, FALSE);
15146 			if (did_force_recovlock)
15147 				nfs_rw_exit(&mi->mi_recovlock);
15148 
15149 			goto recov_retry;
15150 		} else {
15151 			if (odg_cred_otw)
15152 				crfree(odg_cred_otw);
15153 		}
15154 		goto out;
15155 	}
15156 
15157 	/*
15158 	 * If this open stream was created as the results of an open
15159 	 * while holding a delegation, then just release it; no need
15160 	 * to do an OTW close.  Otherwise do a "normal" OTW close.
15161 	 */
15162 	if (osp->os_delegation) {
15163 		nfs4close_notw(vp, osp, &have_sync_lock);
15164 		nfs4_error_zinit(ep);
15165 		goto out;
15166 	}
15167 
15168 	/*
15169 	 * If this stream is not valid, we're done.
15170 	 */
15171 	if (!osp->os_valid) {
15172 		nfs4_error_zinit(ep);
15173 		goto out;
15174 	}
15175 
15176 	/*
15177 	 * Last open or mmap ref has vanished, need to do an OTW close.
15178 	 * First check to see if a close is still necessary.
15179 	 */
15180 	if (osp->os_failed_reopen) {
15181 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
15182 		    "don't close OTW osp %p since reopen failed.",
15183 		    (void *)osp));
15184 		/*
15185 		 * Reopen of the open stream failed, hence the
15186 		 * stateid of the open stream is invalid/stale, and
15187 		 * sending this OTW would incorrectly cause another
15188 		 * round of recovery.  In this case, we need to set
15189 		 * the 'os_valid' bit to 0 so another thread doesn't
15190 		 * come in and re-open this open stream before
15191 		 * this "closing" thread cleans up state (decrementing
15192 		 * the nfs4_server_t's state_ref_count and decrementing
15193 		 * the os_ref_count).
15194 		 */
15195 		osp->os_valid = 0;
15196 		/*
15197 		 * This removes the reference obtained at OPEN; ie,
15198 		 * when the open stream structure was created.
15199 		 *
15200 		 * We don't have to worry about calling 'open_stream_rele'
15201 		 * since we our currently holding a reference to this
15202 		 * open stream which means the count can not go to 0 with
15203 		 * this decrement.
15204 		 */
15205 		ASSERT(osp->os_ref_count >= 2);
15206 		osp->os_ref_count--;
15207 		nfs4_error_zinit(ep);
15208 		close_failed = 0;
15209 		goto close_cleanup;
15210 	}
15211 
15212 	ASSERT(osp->os_ref_count > 1);
15213 
15214 	if (!(vp->v_vfsp->vfs_flag & VFS_RDONLY) &&
15215 		nfs4_dross_pages(vp)) {
15216 		nfs4_invalidate_pages(vp, 0, cred_otw);
15217 	}
15218 
15219 	/*
15220 	 * Sixth, try the CLOSE OTW.
15221 	 */
15222 	nfs4close_otw(rp, cred_otw, oop, osp, &retry, &did_start_seqid_sync,
15223 	    close_type, ep, &have_sync_lock);
15224 
15225 	if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) {
15226 		/*
15227 		 * Let the recovery thread be responsible for
15228 		 * removing the state for CLOSE.
15229 		 */
15230 		close_failed = 1;
15231 		force_close = 0;
15232 		retry = 0;
15233 	}
15234 
15235 	/* See if we need to retry with a different cred */
15236 	if ((ep->error == EACCES ||
15237 	    (ep->error == 0 && ep->stat == NFS4ERR_ACCESS)) &&
15238 	    cred_otw != cr) {
15239 		crfree(cred_otw);
15240 		cred_otw = cr;
15241 		crhold(cred_otw);
15242 		retry = 1;
15243 	}
15244 
15245 	if (ep->error || ep->stat)
15246 		close_failed = 1;
15247 
15248 	if (retry && !isrecov && num_retries-- > 0) {
15249 		if (have_sync_lock) {
15250 			mutex_exit(&osp->os_sync_lock);
15251 			have_sync_lock = 0;
15252 		}
15253 		if (did_start_seqid_sync) {
15254 			nfs4_end_open_seqid_sync(oop);
15255 			did_start_seqid_sync = 0;
15256 		}
15257 		open_stream_rele(osp, rp);
15258 
15259 		if (did_start_op)
15260 			nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
15261 				&recov_state, FALSE);
15262 		if (did_force_recovlock)
15263 			nfs_rw_exit(&mi->mi_recovlock);
15264 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
15265 			"nfs4close_one: need to retry the close "
15266 			"operation"));
15267 		goto recov_retry;
15268 	}
15269 close_cleanup:
15270 	/*
15271 	 * Seventh and lastly, process our results.
15272 	 */
15273 	if (close_failed && force_close) {
15274 		/*
15275 		 * It's ok to drop and regrab the 'os_sync_lock' since
15276 		 * nfs4close_notw() will recheck to make sure the
15277 		 * "close"/removal of state should happen.
15278 		 */
15279 		if (!have_sync_lock) {
15280 			mutex_enter(&osp->os_sync_lock);
15281 			have_sync_lock = 1;
15282 		}
15283 		/*
15284 		 * This is last call, remove the ref on the open
15285 		 * stream created by open and clean everything up.
15286 		 */
15287 		osp->os_pending_close = 0;
15288 		nfs4close_notw(vp, osp, &have_sync_lock);
15289 		nfs4_error_zinit(ep);
15290 	}
15291 
15292 	if (!close_failed) {
15293 		if (have_sync_lock) {
15294 			osp->os_pending_close = 0;
15295 			mutex_exit(&osp->os_sync_lock);
15296 			have_sync_lock = 0;
15297 		} else {
15298 			mutex_enter(&osp->os_sync_lock);
15299 			osp->os_pending_close = 0;
15300 			mutex_exit(&osp->os_sync_lock);
15301 		}
15302 		if (did_start_op && recov_state.rs_sp != NULL) {
15303 			mutex_enter(&recov_state.rs_sp->s_lock);
15304 			nfs4_dec_state_ref_count_nolock(recov_state.rs_sp, mi);
15305 			mutex_exit(&recov_state.rs_sp->s_lock);
15306 		} else {
15307 			nfs4_dec_state_ref_count(mi);
15308 		}
15309 		nfs4_error_zinit(ep);
15310 	}
15311 
15312 out:
15313 	if (have_sync_lock)
15314 		mutex_exit(&osp->os_sync_lock);
15315 	if (did_start_op)
15316 		nfs4_end_fop(mi, vp, NULL, OH_CLOSE, &recov_state,
15317 		    recovonly ? TRUE : FALSE);
15318 	if (did_force_recovlock)
15319 		nfs_rw_exit(&mi->mi_recovlock);
15320 	if (cred_otw)
15321 		crfree(cred_otw);
15322 	if (osp)
15323 		open_stream_rele(osp, rp);
15324 	if (oop) {
15325 		if (did_start_seqid_sync)
15326 			nfs4_end_open_seqid_sync(oop);
15327 		open_owner_rele(oop);
15328 	}
15329 }
15330 
15331 /*
15332  * Convert information returned by the server in the LOCK4denied
15333  * structure to the form required by fcntl.
15334  */
15335 static void
15336 denied_to_flk(LOCK4denied *lockt_denied, flock64_t *flk, LOCKT4args *lockt_args)
15337 {
15338 	nfs4_lo_name_t *lo;
15339 
15340 #ifdef	DEBUG
15341 	if (denied_to_flk_debug) {
15342 		lockt_denied_debug = lockt_denied;
15343 		debug_enter("lockt_denied");
15344 	}
15345 #endif
15346 
15347 	flk->l_type = lockt_denied->locktype == READ_LT ? F_RDLCK : F_WRLCK;
15348 	flk->l_whence = 0;	/* aka SEEK_SET */
15349 	flk->l_start = lockt_denied->offset;
15350 	flk->l_len = lockt_denied->length;
15351 
15352 	/*
15353 	 * If the blocking clientid matches our client id, then we can
15354 	 * interpret the lockowner (since we built it).  If not, then
15355 	 * fabricate a sysid and pid.  Note that the l_sysid field
15356 	 * in *flk already has the local sysid.
15357 	 */
15358 
15359 	if (lockt_denied->owner.clientid == lockt_args->owner.clientid) {
15360 
15361 		if (lockt_denied->owner.owner_len == sizeof (*lo)) {
15362 			lo = (nfs4_lo_name_t *)
15363 				lockt_denied->owner.owner_val;
15364 
15365 			flk->l_pid = lo->ln_pid;
15366 		} else {
15367 			NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
15368 			"denied_to_flk: bad lock owner length\n"));
15369 
15370 			flk->l_pid = lo_to_pid(&lockt_denied->owner);
15371 		}
15372 	} else {
15373 		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
15374 		"denied_to_flk: foreign clientid\n"));
15375 
15376 		/*
15377 		 * Construct a new sysid which should be different from
15378 		 * sysids of other systems.
15379 		 */
15380 
15381 		flk->l_sysid++;
15382 		flk->l_pid = lo_to_pid(&lockt_denied->owner);
15383 	}
15384 }
15385 
15386 static pid_t
15387 lo_to_pid(lock_owner4 *lop)
15388 {
15389 	pid_t pid = 0;
15390 	uchar_t *cp;
15391 	int i;
15392 
15393 	cp = (uchar_t *)&lop->clientid;
15394 
15395 	for (i = 0; i < sizeof (lop->clientid); i++)
15396 		pid += (pid_t)*cp++;
15397 
15398 	cp = (uchar_t *)lop->owner_val;
15399 
15400 	for (i = 0; i < lop->owner_len; i++)
15401 		pid += (pid_t)*cp++;
15402 
15403 	return (pid);
15404 }
15405 
15406 /*
15407  * Given a lock pointer, returns the length of that lock.
15408  * "end" is the last locked offset the "l_len" covers from
15409  * the start of the lock.
15410  */
15411 static off64_t
15412 lock_to_end(flock64_t *lock)
15413 {
15414 	off64_t lock_end;
15415 
15416 	if (lock->l_len == 0)
15417 		lock_end = (off64_t)MAXEND;
15418 	else
15419 		lock_end = lock->l_start + lock->l_len - 1;
15420 
15421 	return (lock_end);
15422 }
15423 
15424 /*
15425  * Given the end of a lock, it will return you the length "l_len" for that lock.
15426  */
15427 static off64_t
15428 end_to_len(off64_t start, off64_t end)
15429 {
15430 	off64_t lock_len;
15431 
15432 	ASSERT(end >= start);
15433 	if (end == MAXEND)
15434 		lock_len = 0;
15435 	else
15436 		lock_len = end - start + 1;
15437 
15438 	return (lock_len);
15439 }
15440 
15441 /*
15442  * On given end for a lock it determines if it is the last locked offset
15443  * or not, if so keeps it as is, else adds one to return the length for
15444  * valid start.
15445  */
15446 static off64_t
15447 start_check(off64_t x)
15448 {
15449 	if (x == MAXEND)
15450 		return (x);
15451 	else
15452 		return (x + 1);
15453 }
15454 
15455 /*
15456  * See if these two locks overlap, and if so return 1;
15457  * otherwise, return 0.
15458  */
15459 static int
15460 locks_intersect(flock64_t *llfp, flock64_t *curfp)
15461 {
15462 	off64_t llfp_end, curfp_end;
15463 
15464 	llfp_end = lock_to_end(llfp);
15465 	curfp_end = lock_to_end(curfp);
15466 
15467 	if (((llfp_end >= curfp->l_start) &&
15468 		(llfp->l_start <= curfp->l_start)) ||
15469 	    ((curfp->l_start <= llfp->l_start) && (curfp_end >= llfp->l_start)))
15470 		return (1);
15471 	return (0);
15472 }
15473 
15474 /*
15475  * Determine what the interseting lock region is, and add that to the
15476  * 'nl_llpp' locklist in increasing order (by l_start).
15477  */
15478 static void
15479 nfs4_add_lock_range(flock64_t *lost_flp, flock64_t *local_flp,
15480 	locklist_t **nl_llpp, vnode_t *vp)
15481 {
15482 	locklist_t *intersect_llp, *tmp_fllp, *cur_fllp;
15483 	off64_t lost_flp_end, local_flp_end, len, start;
15484 
15485 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range:"));
15486 
15487 	if (!locks_intersect(lost_flp, local_flp))
15488 		return;
15489 
15490 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: "
15491 	    "locks intersect"));
15492 
15493 	lost_flp_end = lock_to_end(lost_flp);
15494 	local_flp_end = lock_to_end(local_flp);
15495 
15496 	/* Find the starting point of the intersecting region */
15497 	if (local_flp->l_start > lost_flp->l_start)
15498 		start = local_flp->l_start;
15499 	else
15500 		start = lost_flp->l_start;
15501 
15502 	/* Find the lenght of the intersecting region */
15503 	if (lost_flp_end < local_flp_end)
15504 		len = end_to_len(start, lost_flp_end);
15505 	else
15506 		len = end_to_len(start, local_flp_end);
15507 
15508 	/*
15509 	 * Prepare the flock structure for the intersection found and insert
15510 	 * it into the new list in increasing l_start order. This list contains
15511 	 * intersections of locks registered by the client with the local host
15512 	 * and the lost lock.
15513 	 * The lock type of this lock is the same as that of the local_flp.
15514 	 */
15515 	intersect_llp = (locklist_t *)kmem_alloc(sizeof (locklist_t), KM_SLEEP);
15516 	intersect_llp->ll_flock.l_start = start;
15517 	intersect_llp->ll_flock.l_len = len;
15518 	intersect_llp->ll_flock.l_type = local_flp->l_type;
15519 	intersect_llp->ll_flock.l_pid = local_flp->l_pid;
15520 	intersect_llp->ll_flock.l_sysid = local_flp->l_sysid;
15521 	intersect_llp->ll_flock.l_whence = 0;	/* aka SEEK_SET */
15522 	intersect_llp->ll_vp = vp;
15523 
15524 	tmp_fllp = *nl_llpp;
15525 	cur_fllp = NULL;
15526 	while (tmp_fllp != NULL && tmp_fllp->ll_flock.l_start <
15527 		intersect_llp->ll_flock.l_start) {
15528 			cur_fllp = tmp_fllp;
15529 			tmp_fllp = tmp_fllp->ll_next;
15530 	}
15531 	if (cur_fllp == NULL) {
15532 		/* first on the list */
15533 		intersect_llp->ll_next = *nl_llpp;
15534 		*nl_llpp = intersect_llp;
15535 	} else {
15536 		intersect_llp->ll_next = cur_fllp->ll_next;
15537 		cur_fllp->ll_next = intersect_llp;
15538 	}
15539 
15540 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: "
15541 	    "created lock region: start %"PRIx64" end %"PRIx64" : %s\n",
15542 	    intersect_llp->ll_flock.l_start,
15543 	    intersect_llp->ll_flock.l_start + intersect_llp->ll_flock.l_len,
15544 	    intersect_llp->ll_flock.l_type == F_RDLCK ? "READ" : "WRITE"));
15545 }
15546 
15547 /*
15548  * Our local locking current state is potentially different than
15549  * what the NFSv4 server thinks we have due to a lost lock that was
15550  * resent and then received.  We need to reset our "NFSv4" locking
15551  * state to match the current local locking state for this pid since
15552  * that is what the user/application sees as what the world is.
15553  *
15554  * We cannot afford to drop the open/lock seqid sync since then we can
15555  * get confused about what the current local locking state "is" versus
15556  * "was".
15557  *
15558  * If we are unable to fix up the locks, we send SIGLOST to the affected
15559  * process.  This is not done if the filesystem has been forcibly
15560  * unmounted, in case the process has already exited and a new process
15561  * exists with the same pid.
15562  */
15563 static void
15564 nfs4_reinstitute_local_lock_state(vnode_t *vp, flock64_t *lost_flp, cred_t *cr,
15565 		nfs4_lock_owner_t *lop)
15566 {
15567 	locklist_t *locks, *llp, *ri_llp, *tmp_llp;
15568 	mntinfo4_t *mi = VTOMI4(vp);
15569 	const int cmd = F_SETLK;
15570 	off64_t cur_start, llp_ll_flock_end, lost_flp_end;
15571 	flock64_t ul_fl;
15572 
15573 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
15574 		"nfs4_reinstitute_local_lock_state"));
15575 
15576 	/*
15577 	 * Find active locks for this vp from the local locking code.
15578 	 * Scan through this list and find out the locks that intersect with
15579 	 * the lost lock. Once we find the lock that intersects, add the
15580 	 * intersection area as a new lock to a new list "ri_llp". The lock
15581 	 * type of the intersection region lock added to ri_llp is the same
15582 	 * as that found in the active lock list, "list". The intersecting
15583 	 * region locks are added to ri_llp in increasing l_start order.
15584 	 */
15585 	ASSERT(nfs_zone() == mi->mi_zone);
15586 
15587 	locks = flk_active_locks_for_vp(vp);
15588 	ri_llp = NULL;
15589 
15590 	for (llp = locks; llp != NULL; llp = llp->ll_next) {
15591 		ASSERT(llp->ll_vp == vp);
15592 		/*
15593 		 * Pick locks that belong to this pid/lockowner
15594 		 */
15595 		if (llp->ll_flock.l_pid != lost_flp->l_pid)
15596 			continue;
15597 
15598 		nfs4_add_lock_range(lost_flp, &llp->ll_flock, &ri_llp, vp);
15599 	}
15600 
15601 	/*
15602 	 * Now we have the list of intersections with the lost lock. These are
15603 	 * the locks that were/are active before the server replied to the
15604 	 * last/lost lock. Issue these locks to the server here. Playing these
15605 	 * locks to the server will re-establish aur current local locking state
15606 	 * with the v4 server.
15607 	 * If we get an error, send SIGLOST to the application for that lock.
15608 	 */
15609 
15610 	for (llp = ri_llp; llp != NULL; llp = llp->ll_next) {
15611 		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
15612 		    "nfs4_reinstitute_local_lock_state: need to issue "
15613 		    "flock: [%"PRIx64" - %"PRIx64"] : %s",
15614 		    llp->ll_flock.l_start,
15615 		    llp->ll_flock.l_start + llp->ll_flock.l_len,
15616 		    llp->ll_flock.l_type == F_RDLCK ? "READ" :
15617 		    llp->ll_flock.l_type == F_WRLCK ? "WRITE" : "INVALID"));
15618 		/*
15619 		 * No need to relock what we already have
15620 		 */
15621 		if (llp->ll_flock.l_type == lost_flp->l_type)
15622 			continue;
15623 
15624 		push_reinstate(vp, cmd, &llp->ll_flock, cr, lop);
15625 	}
15626 
15627 	/*
15628 	 * Now keeping the start of the lost lock as our reference parse the
15629 	 * newly created ri_llp locklist to find the ranges that we have locked
15630 	 * with the v4 server but not in the current local locking. We need
15631 	 * to unlock these ranges.
15632 	 * These ranges can also be reffered to as those ranges, where the lost
15633 	 * lock does not overlap with the locks in the ri_llp but are locked
15634 	 * since the server replied to the lost lock.
15635 	 */
15636 	cur_start = lost_flp->l_start;
15637 	lost_flp_end = lock_to_end(lost_flp);
15638 
15639 	ul_fl.l_type = F_UNLCK;
15640 	ul_fl.l_whence = 0;	/* aka SEEK_SET */
15641 	ul_fl.l_sysid = lost_flp->l_sysid;
15642 	ul_fl.l_pid = lost_flp->l_pid;
15643 
15644 	for (llp = ri_llp; llp != NULL; llp = llp->ll_next) {
15645 		llp_ll_flock_end = lock_to_end(&llp->ll_flock);
15646 
15647 		if (llp->ll_flock.l_start <= cur_start) {
15648 			cur_start = start_check(llp_ll_flock_end);
15649 			continue;
15650 		}
15651 		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
15652 			"nfs4_reinstitute_local_lock_state: "
15653 			"UNLOCK [%"PRIx64" - %"PRIx64"]",
15654 			cur_start, llp->ll_flock.l_start));
15655 
15656 		ul_fl.l_start = cur_start;
15657 		ul_fl.l_len = end_to_len(cur_start,
15658 		    (llp->ll_flock.l_start - 1));
15659 
15660 		push_reinstate(vp, cmd, &ul_fl, cr, lop);
15661 		cur_start = start_check(llp_ll_flock_end);
15662 	}
15663 
15664 	/*
15665 	 * In the case where the lost lock ends after all intersecting locks,
15666 	 * unlock the last part of the lost lock range.
15667 	 */
15668 	if (cur_start != start_check(lost_flp_end)) {
15669 		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
15670 			"nfs4_reinstitute_local_lock_state: UNLOCK end of the "
15671 			"lost lock region [%"PRIx64" - %"PRIx64"]",
15672 			cur_start, lost_flp->l_start + lost_flp->l_len));
15673 
15674 		ul_fl.l_start = cur_start;
15675 		/*
15676 		 * Is it an to-EOF lock? if so unlock till the end
15677 		 */
15678 		if (lost_flp->l_len == 0)
15679 			ul_fl.l_len = 0;
15680 		else
15681 			ul_fl.l_len = start_check(lost_flp_end) - cur_start;
15682 
15683 		push_reinstate(vp, cmd, &ul_fl, cr, lop);
15684 	}
15685 
15686 	if (locks != NULL)
15687 		flk_free_locklist(locks);
15688 
15689 	/* Free up our newly created locklist */
15690 	for (llp = ri_llp; llp != NULL; ) {
15691 		tmp_llp = llp->ll_next;
15692 		kmem_free(llp, sizeof (locklist_t));
15693 		llp = tmp_llp;
15694 	}
15695 
15696 	/*
15697 	 * Now return back to the original calling nfs4frlock()
15698 	 * and let us naturally drop our seqid syncs.
15699 	 */
15700 }
15701 
15702 /*
15703  * Create a lost state record for the given lock reinstantiation request
15704  * and push it onto the lost state queue.
15705  */
15706 static void
15707 push_reinstate(vnode_t *vp, int cmd, flock64_t *flk, cred_t *cr,
15708 	nfs4_lock_owner_t *lop)
15709 {
15710 	nfs4_lost_rqst_t req;
15711 	nfs_lock_type4 locktype;
15712 	nfs4_error_t e = { EINTR, NFS4_OK, RPC_SUCCESS };
15713 
15714 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
15715 
15716 	locktype = flk_to_locktype(cmd, flk->l_type);
15717 	nfs4frlock_save_lost_rqst(NFS4_LCK_CTYPE_REINSTATE, EINTR, locktype,
15718 				NULL, NULL, lop, flk, &req, cr, vp);
15719 	(void) nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
15720 		    (req.lr_op == OP_LOCK || req.lr_op == OP_LOCKU) ?
15721 		    &req : NULL, flk->l_type == F_UNLCK ? OP_LOCKU : OP_LOCK,
15722 		    NULL);
15723 }
15724