1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
27 /* All Rights Reserved */
28 
29 #pragma ident	"%Z%%M%	%I%	%E% SMI"
30 
31 #include <sys/param.h>
32 #include <sys/types.h>
33 #include <sys/systm.h>
34 #include <sys/cred.h>
35 #include <sys/vfs.h>
36 #include <sys/vnode.h>
37 #include <sys/pathname.h>
38 #include <sys/sysmacros.h>
39 #include <sys/kmem.h>
40 #include <sys/kstat.h>
41 #include <sys/mkdev.h>
42 #include <sys/mount.h>
43 #include <sys/statvfs.h>
44 #include <sys/errno.h>
45 #include <sys/debug.h>
46 #include <sys/cmn_err.h>
47 #include <sys/utsname.h>
48 #include <sys/bootconf.h>
49 #include <sys/modctl.h>
50 #include <sys/acl.h>
51 #include <sys/flock.h>
52 #include <sys/kstr.h>
53 #include <sys/stropts.h>
54 #include <sys/strsubr.h>
55 #include <sys/atomic.h>
56 #include <sys/disp.h>
57 #include <sys/policy.h>
58 #include <sys/list.h>
59 #include <sys/zone.h>
60 
61 #include <rpc/types.h>
62 #include <rpc/auth.h>
63 #include <rpc/rpcsec_gss.h>
64 #include <rpc/clnt.h>
65 #include <rpc/xdr.h>
66 
67 #include <nfs/nfs.h>
68 #include <nfs/nfs_clnt.h>
69 #include <nfs/mount.h>
70 #include <nfs/nfs_acl.h>
71 
72 #include <fs/fs_subr.h>
73 
74 #include <nfs/nfs4.h>
75 #include <nfs/rnode4.h>
76 #include <nfs/nfs4_clnt.h>
77 #include <nfs/nfssys.h>
78 
#ifdef	DEBUG
/*
 * These are "special" state IDs and file handles that
 * match any delegation state ID or file handle.  This
 * is for testing purposes only.
 *
 * NOTE(review): in nfs4_deleg_fh each "\0377"-style group is the octal
 * escape \037 followed by an ordinary digit character, so the literal is
 * 8 bytes long rather than 4 -- confirm that this is what the test tools
 * expect.
 */

stateid4 nfs4_deleg_any = { 0x7FFFFFF0 };
char nfs4_deleg_fh[] = "\0377\0376\0375\0374";
nfs_fh4 nfs4_deleg_anyfh = { sizeof (nfs4_deleg_fh)-1, nfs4_deleg_fh };
/*
 * Error injection: when set to anything other than NFS4_OK, cb_getattr()
 * and cb_recall() respectively return that status immediately.
 */
nfsstat4 cb4_getattr_fail = NFS4_OK;
nfsstat4 cb4_recall_fail = NFS4_OK;

/*
 * Debug switches.  nfs4_callback_debug is the one passed to NFS4_DEBUG()
 * by the CB_NOTE/CB_WARN macros; the other two are not referenced in the
 * code shown alongside these declarations.
 */
int nfs4_callback_debug;
int nfs4_recall_debug;
int nfs4_drat_debug;

#endif
97 
/*
 * Shorthand wrappers around NFS4_DEBUG() keyed on nfs4_callback_debug:
 * CB_NOTE logs at CE_NOTE, CB_WARN at CE_WARN, and CB_WARN1 at CE_WARN
 * with one additional format argument.
 */
#define	CB_NOTE(x)	NFS4_DEBUG(nfs4_callback_debug, (CE_NOTE, x))
#define	CB_WARN(x)	NFS4_DEBUG(nfs4_callback_debug, (CE_WARN, x))
#define	CB_WARN1(x, y)	NFS4_DEBUG(nfs4_callback_debug, (CE_WARN, x, y))
101 
/* Delegation return policy; initialized to INACTIVE. */
enum nfs4_delegreturn_policy nfs4_delegreturn_policy = INACTIVE;

/*
 * Zone key passed to zone_getspecific() to look up the per-zone
 * struct nfs4_callback_globals.
 */
static zone_key_t nfs4_callback_zone_key;
105 
/*
 * NFS4_MAPSIZE is the number of bytes we are willing to consume
 * for the block allocation map when the server grants a NFS_LIMIT_BLOCK
 * style delegation.
 *
 * NFS4_MAPWORDS is that size expressed in uint_t words and NbPW is the
 * number of bits in one such word.  Every expansion is fully
 * parenthesized so the macros can be used inside larger expressions
 * (e.g. "x % NFS4_MAPWORDS") without operator-precedence surprises.
 */

#define	NFS4_MAPSIZE	8192
#define	NFS4_MAPWORDS	(NFS4_MAPSIZE / sizeof (uint_t))
#define	NbPW		(NBBY * sizeof (uint_t))
115 
/*
 * Number of callback program numbers available per zone; also the number
 * of slots in each zone's nfs4prog2server[] table.
 */
static int nfs4_num_prognums = 1024;
/* Callout table handed to svc_tli_kcreate() by nfs4_svc(). */
static SVC_CALLOUT_TABLE nfs4_cb_sct;
118 
/*
 * Element of the per-zone nfs4_dlist (see the list_create() call in
 * nfs4_callback_init_zone()): an rnode plus the flags to use for it.
 */
struct nfs4_dnode {
	list_node_t	linkage;	/* linkage into nfs4_dlist */
	rnode4_t	*rnodep;
	int		flags;		/* Flags for nfs4delegreturn_impl() */
};
124 
/*
 * Template for the named callback kstats.  nfs4_callback_init_zone()
 * copies it into each zone's nfs4_callback_globals so that every zone
 * gets its own set of counters.
 */
static const struct nfs4_callback_stats nfs4_callback_stats_tmpl = {
	{ "delegations",	KSTAT_DATA_UINT64 },
	{ "cb_getattr",		KSTAT_DATA_UINT64 },
	{ "cb_recall",		KSTAT_DATA_UINT64 },
	{ "cb_null",		KSTAT_DATA_UINT64 },
	{ "cb_dispatch",	KSTAT_DATA_UINT64 },
	{ "delegaccept_r",	KSTAT_DATA_UINT64 },
	{ "delegaccept_rw",	KSTAT_DATA_UINT64 },
	{ "delegreturn",	KSTAT_DATA_UINT64 },
	{ "callbacks",		KSTAT_DATA_UINT64 },
	{ "claim_cur",		KSTAT_DATA_UINT64 },
	{ "claim_cur_ok",	KSTAT_DATA_UINT64 },
	{ "recall_trunc",	KSTAT_DATA_UINT64 },
	{ "recall_failed",	KSTAT_DATA_UINT64 },
	{ "return_limit_write",	KSTAT_DATA_UINT64 },
	{ "return_limit_addmap", KSTAT_DATA_UINT64 },
	{ "deleg_recover",	KSTAT_DATA_UINT64 },
	{ "cb_illegal",		KSTAT_DATA_UINT64 }
};
144 
/*
 * One saved callback transport endpoint.  Entries are created or updated
 * by nfs4_setport() and looked up by protocol family and protocol in
 * nfs4_cb_args().  Each string field holds at most KNC_STRSIZE bytes.
 */
struct nfs4_cb_port {
	list_node_t		linkage; /* linkage into per-zone port list */
	char			netid[KNC_STRSIZE];
	char			uaddr[KNC_STRSIZE];
	char			protofmly[KNC_STRSIZE];
	char			proto[KNC_STRSIZE];
};
152 
/*
 * Size in bytes of the attribute buffer that cb_getattr() allocates and
 * cb_getattr_free() releases.  It is not assigned in the code shown here;
 * presumably it is set during module initialization (TODO confirm).
 */
static int cb_getattr_bytes;
154 
/*
 * Argument bundle taken by nfs4delegreturn_thread().
 */
struct cb_recall_pass {
	rnode4_t	*rp;
	int		flags;		/* Flags for nfs4delegreturn_impl() */
	bool_t		truncate;
};
160 
/* Forward declarations of file-local (static) helpers. */
static nfs4_open_stream_t *get_next_deleg_stream(rnode4_t *, int);
static void nfs4delegreturn_thread(struct cb_recall_pass *);
static int deleg_reopen(vnode_t *, bool_t *, struct nfs4_callback_globals *,
    int);
static void nfs4_dlistadd(rnode4_t *, struct nfs4_callback_globals *, int);
static void nfs4_dlistclean_impl(struct nfs4_callback_globals *, int);
static int nfs4delegreturn_impl(rnode4_t *, int,
    struct nfs4_callback_globals *);
static void nfs4delegreturn_cleanup_impl(rnode4_t *, nfs4_server_t *,
    struct nfs4_callback_globals *);
171 
172 static void
173 cb_getattr(nfs_cb_argop4 *argop, nfs_cb_resop4 *resop, struct svc_req *req,
174 	struct compound_state *cs, struct nfs4_callback_globals *ncg)
175 {
176 	CB_GETATTR4args *args = &argop->nfs_cb_argop4_u.opcbgetattr;
177 	CB_GETATTR4res *resp = &resop->nfs_cb_resop4_u.opcbgetattr;
178 	rnode4_t *rp;
179 	vnode_t *vp;
180 	bool_t found = FALSE;
181 	struct nfs4_server *sp;
182 	struct fattr4 *fap;
183 	rpc_inline_t *fdata;
184 	long mapcnt;
185 	fattr4_change change;
186 	fattr4_size size;
187 	uint_t rflag;
188 
189 	ncg->nfs4_callback_stats.cb_getattr.value.ui64++;
190 
191 #ifdef DEBUG
192 	/*
193 	 * error injection hook: set cb_getattr_fail global to
194 	 * NFS4 pcol error to be returned
195 	 */
196 	if (cb4_getattr_fail != NFS4_OK) {
197 		*cs->statusp = resp->status = cb4_getattr_fail;
198 		return;
199 	}
200 #endif
201 
202 	resp->obj_attributes.attrmask = 0;
203 
204 	mutex_enter(&ncg->nfs4_cb_lock);
205 	sp = ncg->nfs4prog2server[req->rq_prog - NFS4_CALLBACK];
206 	mutex_exit(&ncg->nfs4_cb_lock);
207 
208 	if (nfs4_server_vlock(sp, 0) == FALSE) {
209 
210 		CB_WARN("cb_getattr: cannot find server\n");
211 
212 		*cs->statusp = resp->status = NFS4ERR_BADHANDLE;
213 		return;
214 	}
215 
216 	/*
217 	 * In cb_compound, callback_ident was validated against rq_prog,
218 	 * but we couldn't verify that it was set to the value we provided
219 	 * at setclientid time (because we didn't have server struct yet).
220 	 * Now we have the server struct, but don't have callback_ident
221 	 * handy.  So, validate server struct program number against req
222 	 * RPC's prog number.  At this point, we know the RPC prog num
223 	 * is valid (else we wouldn't be here); however, we don't know
224 	 * that it was the prog number we supplied to this server at
225 	 * setclientid time.  If the prog numbers aren't equivalent, then
226 	 * log the problem and fail the request because either cbserv
227 	 * and/or cbclient are confused.  This will probably never happen.
228 	 */
229 	if (sp->s_program != req->rq_prog) {
230 #ifdef DEBUG
231 		zcmn_err(getzoneid(), CE_WARN,
232 		    "cb_getattr: wrong server program number srv=%d req=%d\n",
233 		    sp->s_program, req->rq_prog);
234 #else
235 		zcmn_err(getzoneid(), CE_WARN,
236 		    "cb_getattr: wrong server program number\n");
237 #endif
238 		mutex_exit(&sp->s_lock);
239 		nfs4_server_rele(sp);
240 		*cs->statusp = resp->status = NFS4ERR_BADHANDLE;
241 		return;
242 	}
243 
244 	/*
245 	 * Search the delegation list for a matching file handle;
246 	 * mutex on sp prevents the list from changing.
247 	 */
248 
249 	rp = list_head(&sp->s_deleg_list);
250 	for (; rp != NULL; rp = list_next(&sp->s_deleg_list, rp)) {
251 		nfs4_fhandle_t fhandle;
252 
253 		sfh4_copyval(rp->r_fh, &fhandle);
254 
255 		if ((fhandle.fh_len == args->fh.nfs_fh4_len &&
256 		    bcmp(fhandle.fh_buf, args->fh.nfs_fh4_val,
257 		    fhandle.fh_len) == 0)) {
258 
259 			found = TRUE;
260 			break;
261 		}
262 #ifdef	DEBUG
263 		if (nfs4_deleg_anyfh.nfs_fh4_len == args->fh.nfs_fh4_len &&
264 		    bcmp(nfs4_deleg_anyfh.nfs_fh4_val, args->fh.nfs_fh4_val,
265 		    args->fh.nfs_fh4_len) == 0) {
266 
267 			found = TRUE;
268 			break;
269 		}
270 #endif
271 	}
272 
273 	/*
274 	 * VN_HOLD the vnode before releasing s_lock to guarantee
275 	 * we have a valid vnode reference.
276 	 */
277 	if (found == TRUE) {
278 		vp = RTOV4(rp);
279 		VN_HOLD(vp);
280 	}
281 
282 	mutex_exit(&sp->s_lock);
283 	nfs4_server_rele(sp);
284 
285 	if (found == FALSE) {
286 
287 		CB_WARN("cb_getattr: bad fhandle\n");
288 
289 		*cs->statusp = resp->status = NFS4ERR_BADHANDLE;
290 		return;
291 	}
292 
293 	/*
294 	 * Figure out which attributes the server wants.  We only
295 	 * offer FATTR4_CHANGE & FATTR4_SIZE; ignore the rest.
296 	 */
297 	fdata = kmem_alloc(cb_getattr_bytes, KM_SLEEP);
298 
299 	/*
300 	 * Don't actually need to create XDR to encode these
301 	 * simple data structures.
302 	 * xdrmem_create(&xdr, fdata, cb_getattr_bytes, XDR_ENCODE);
303 	 */
304 	fap = &resp->obj_attributes;
305 
306 	fap->attrmask = 0;
307 	/* attrlist4_len starts at 0 and increases as attrs are processed */
308 	fap->attrlist4 = (char *)fdata;
309 	fap->attrlist4_len = 0;
310 
311 	/* don't supply attrs if request was zero */
312 	if (args->attr_request != 0) {
313 		if (args->attr_request & FATTR4_CHANGE_MASK) {
314 			/*
315 			 * If the file is mmapped, then increment the change
316 			 * attribute and return it.  This will guarantee that
317 			 * the server will perceive that the file has changed
318 			 * if there is any chance that the client application
319 			 * has changed it.  Otherwise, just return the change
320 			 * attribute as it has been updated by nfs4write_deleg.
321 			 */
322 
323 			mutex_enter(&rp->r_statelock);
324 			mapcnt = rp->r_mapcnt;
325 			rflag = rp->r_flags;
326 			mutex_exit(&rp->r_statelock);
327 
328 			mutex_enter(&rp->r_statev4_lock);
329 			/*
330 			 * If object mapped, then always return new change.
331 			 * Otherwise, return change if object has dirty
332 			 * pages.  If object doesn't have any dirty pages,
333 			 * then all changes have been pushed to server, so
334 			 * reset change to grant change.
335 			 */
336 			if (mapcnt)
337 				rp->r_deleg_change++;
338 			else if (! (rflag & R4DIRTY))
339 				rp->r_deleg_change = rp->r_deleg_change_grant;
340 			change = rp->r_deleg_change;
341 			mutex_exit(&rp->r_statev4_lock);
342 
343 			/*
344 			 * Use inline XDR code directly, we know that we
345 			 * going to a memory buffer and it has enough
346 			 * space so it cannot fail.
347 			 */
348 			IXDR_PUT_U_HYPER(fdata, change);
349 			fap->attrlist4_len += 2 * BYTES_PER_XDR_UNIT;
350 			fap->attrmask |= FATTR4_CHANGE_MASK;
351 		}
352 
353 		if (args->attr_request & FATTR4_SIZE_MASK) {
354 			/*
355 			 * Use an atomic add of 0 to fetch a consistent view
356 			 * of r_size; this avoids having to take rw_lock
357 			 * which could cause a deadlock.
358 			 */
359 			size = atomic_add_64_nv((uint64_t *)&rp->r_size, 0);
360 
361 			/*
362 			 * Use inline XDR code directly, we know that we
363 			 * going to a memory buffer and it has enough
364 			 * space so it cannot fail.
365 			 */
366 			IXDR_PUT_U_HYPER(fdata, size);
367 			fap->attrlist4_len += 2 * BYTES_PER_XDR_UNIT;
368 			fap->attrmask |= FATTR4_SIZE_MASK;
369 		}
370 	}
371 
372 	VN_RELE(vp);
373 
374 	*cs->statusp = resp->status = NFS4_OK;
375 }
376 
377 static void
378 cb_getattr_free(nfs_cb_resop4 *resop)
379 {
380 	if (resop->nfs_cb_resop4_u.opcbgetattr.obj_attributes.attrlist4)
381 		kmem_free(resop->nfs_cb_resop4_u.opcbgetattr.
382 		    obj_attributes.attrlist4, cb_getattr_bytes);
383 }
384 
385 static void
386 cb_recall(nfs_cb_argop4 *argop, nfs_cb_resop4 *resop, struct svc_req *req,
387 	struct compound_state *cs, struct nfs4_callback_globals *ncg)
388 {
389 	CB_RECALL4args * args = &argop->nfs_cb_argop4_u.opcbrecall;
390 	CB_RECALL4res *resp = &resop->nfs_cb_resop4_u.opcbrecall;
391 	rnode4_t *rp;
392 	vnode_t *vp;
393 	struct nfs4_server *sp;
394 	bool_t found = FALSE;
395 
396 	ncg->nfs4_callback_stats.cb_recall.value.ui64++;
397 
398 	ASSERT(req->rq_prog >= NFS4_CALLBACK);
399 	ASSERT(req->rq_prog < NFS4_CALLBACK+nfs4_num_prognums);
400 
401 #ifdef DEBUG
402 	/*
403 	 * error injection hook: set cb_recall_fail global to
404 	 * NFS4 pcol error to be returned
405 	 */
406 	if (cb4_recall_fail != NFS4_OK) {
407 		*cs->statusp = resp->status = cb4_recall_fail;
408 		return;
409 	}
410 #endif
411 
412 	mutex_enter(&ncg->nfs4_cb_lock);
413 	sp = ncg->nfs4prog2server[req->rq_prog - NFS4_CALLBACK];
414 	mutex_exit(&ncg->nfs4_cb_lock);
415 
416 	if (nfs4_server_vlock(sp, 0) == FALSE) {
417 
418 		CB_WARN("cb_recall: cannot find server\n");
419 
420 		*cs->statusp = resp->status = NFS4ERR_BADHANDLE;
421 		return;
422 	}
423 
424 	/*
425 	 * Search the delegation list for a matching file handle
426 	 * AND stateid; mutex on sp prevents the list from changing.
427 	 */
428 
429 	rp = list_head(&sp->s_deleg_list);
430 	for (; rp != NULL; rp = list_next(&sp->s_deleg_list, rp)) {
431 		mutex_enter(&rp->r_statev4_lock);
432 
433 		/* check both state id and file handle! */
434 
435 		if ((bcmp(&rp->r_deleg_stateid, &args->stateid,
436 		    sizeof (stateid4)) == 0)) {
437 			nfs4_fhandle_t fhandle;
438 
439 			sfh4_copyval(rp->r_fh, &fhandle);
440 			if ((fhandle.fh_len == args->fh.nfs_fh4_len &&
441 			    bcmp(fhandle.fh_buf, args->fh.nfs_fh4_val,
442 			    fhandle.fh_len) == 0)) {
443 
444 				found = TRUE;
445 				break;
446 			} else {
447 #ifdef	DEBUG
448 				CB_WARN("cb_recall: stateid OK, bad fh");
449 #endif
450 			}
451 		}
452 #ifdef	DEBUG
453 		if (bcmp(&args->stateid, &nfs4_deleg_any,
454 		    sizeof (stateid4)) == 0) {
455 
456 			found = TRUE;
457 			break;
458 		}
459 #endif
460 		mutex_exit(&rp->r_statev4_lock);
461 	}
462 
463 	/*
464 	 * VN_HOLD the vnode before releasing s_lock to guarantee
465 	 * we have a valid vnode reference.  The async thread will
466 	 * release the hold when it's done.
467 	 */
468 	if (found == TRUE) {
469 		mutex_exit(&rp->r_statev4_lock);
470 		vp = RTOV4(rp);
471 		VN_HOLD(vp);
472 	}
473 	mutex_exit(&sp->s_lock);
474 	nfs4_server_rele(sp);
475 
476 	if (found == FALSE) {
477 
478 		CB_WARN("cb_recall: bad stateid\n");
479 
480 		*cs->statusp = resp->status = NFS4ERR_BAD_STATEID;
481 		return;
482 	}
483 
484 	/* Fire up a thread to do the delegreturn */
485 	nfs4delegreturn_async(rp, NFS4_DR_RECALL|NFS4_DR_REOPEN,
486 	    args->truncate);
487 
488 	*cs->statusp = resp->status = 0;
489 }
490 
491 /* ARGSUSED */
492 static void
493 cb_recall_free(nfs_cb_resop4 *resop)
494 {
495 	/* nothing to do here, cb_recall doesn't kmem_alloc */
496 }
497 
498 /*
499  * This function handles the CB_NULL proc call from an NFSv4 Server.
500  *
501  * We take note that the server has sent a CB_NULL for later processing
502  * in the recovery logic. It is noted so we may pause slightly after the
503  * setclientid and before reopening files. The pause is to allow the
504  * NFSv4 Server time to receive the CB_NULL reply and adjust any of
505  * its internal structures such that it has the opportunity to grant
506  * delegations to reopened files.
507  *
508  */
509 
510 /* ARGSUSED */
511 static void
512 cb_null(CB_COMPOUND4args *args, CB_COMPOUND4res *resp, struct svc_req *req,
513     struct nfs4_callback_globals *ncg)
514 {
515 	struct nfs4_server *sp;
516 
517 	ncg->nfs4_callback_stats.cb_null.value.ui64++;
518 
519 	ASSERT(req->rq_prog >= NFS4_CALLBACK);
520 	ASSERT(req->rq_prog < NFS4_CALLBACK+nfs4_num_prognums);
521 
522 	mutex_enter(&ncg->nfs4_cb_lock);
523 	sp = ncg->nfs4prog2server[req->rq_prog - NFS4_CALLBACK];
524 	mutex_exit(&ncg->nfs4_cb_lock);
525 
526 	if (nfs4_server_vlock(sp, 0) != FALSE) {
527 		sp->s_flags |= N4S_CB_PINGED;
528 		cv_broadcast(&sp->wait_cb_null);
529 		mutex_exit(&sp->s_lock);
530 		nfs4_server_rele(sp);
531 	}
532 }
533 
534 /*
535  * cb_illegal	args: void
536  *		res : status (NFS4ERR_OP_CB_ILLEGAL)
537  */
538 /* ARGSUSED */
539 static void
540 cb_illegal(nfs_cb_argop4 *argop, nfs_cb_resop4 *resop, struct svc_req *req,
541 	struct compound_state *cs, struct nfs4_callback_globals *ncg)
542 {
543 	CB_ILLEGAL4res *resp = &resop->nfs_cb_resop4_u.opcbillegal;
544 
545 	ncg->nfs4_callback_stats.cb_illegal.value.ui64++;
546 	resop->resop = OP_CB_ILLEGAL;
547 	*cs->statusp = resp->status = NFS4ERR_OP_ILLEGAL;
548 }
549 
550 static void
551 cb_compound(CB_COMPOUND4args *args, CB_COMPOUND4res *resp, struct svc_req *req,
552 	struct nfs4_callback_globals *ncg)
553 {
554 	uint_t i;
555 	struct compound_state cs;
556 	nfs_cb_argop4 *argop;
557 	nfs_cb_resop4 *resop, *new_res;
558 	uint_t op;
559 
560 	bzero(&cs, sizeof (cs));
561 	cs.statusp = &resp->status;
562 	cs.cont = TRUE;
563 
564 	/*
565 	 * Form a reply tag by copying over the reqeuest tag.
566 	 */
567 	resp->tag.utf8string_len = args->tag.utf8string_len;
568 	resp->tag.utf8string_val = kmem_alloc(resp->tag.utf8string_len,
569 	    KM_SLEEP);
570 	bcopy(args->tag.utf8string_val, resp->tag.utf8string_val,
571 	    args->tag.utf8string_len);
572 
573 	/*
574 	 * XXX for now, minorversion should be zero
575 	 */
576 	if (args->minorversion != CB4_MINORVERSION) {
577 		resp->array_len = 0;
578 		resp->array = NULL;
579 		resp->status = NFS4ERR_MINOR_VERS_MISMATCH;
580 		return;
581 	}
582 
583 #ifdef DEBUG
584 	/*
585 	 * Verify callback_ident.  It doesn't really matter if it's wrong
586 	 * because we don't really use callback_ident -- we use prog number
587 	 * of the RPC request instead.  In this case, just print a DEBUG
588 	 * console message to reveal brokenness of cbclient (at bkoff/cthon).
589 	 */
590 	if (args->callback_ident != req->rq_prog)
591 		zcmn_err(getzoneid(), CE_WARN,
592 		    "cb_compound: cb_client using wrong "
593 		    "callback_ident(%d), should be %d",
594 		    args->callback_ident, req->rq_prog);
595 #endif
596 
597 	resp->array_len = args->array_len;
598 	resp->array = kmem_zalloc(args->array_len * sizeof (nfs_cb_resop4),
599 	    KM_SLEEP);
600 
601 	for (i = 0; i < args->array_len && cs.cont; i++) {
602 
603 		argop = &args->array[i];
604 		resop = &resp->array[i];
605 		resop->resop = argop->argop;
606 		op = (uint_t)resop->resop;
607 
608 		switch (op) {
609 
610 		case OP_CB_GETATTR:
611 
612 			cb_getattr(argop, resop, req, &cs, ncg);
613 			break;
614 
615 		case OP_CB_RECALL:
616 
617 			cb_recall(argop, resop, req, &cs, ncg);
618 			break;
619 
620 		case OP_CB_ILLEGAL:
621 
622 			/* fall through */
623 
624 		default:
625 			/*
626 			 * Handle OP_CB_ILLEGAL and any undefined opcode.
627 			 * Currently, the XDR code will return BADXDR
628 			 * if cb op doesn't decode to legal value, so
629 			 * it really only handles OP_CB_ILLEGAL.
630 			 */
631 			op = OP_CB_ILLEGAL;
632 			cb_illegal(argop, resop, req, &cs, ncg);
633 		}
634 
635 		if (*cs.statusp != NFS4_OK)
636 			cs.cont = FALSE;
637 
638 		/*
639 		 * If not at last op, and if we are to stop, then
640 		 * compact the results array.
641 		 */
642 		if ((i + 1) < args->array_len && !cs.cont) {
643 
644 			new_res = kmem_alloc(
645 			    (i+1) * sizeof (nfs_cb_resop4), KM_SLEEP);
646 			bcopy(resp->array,
647 			    new_res, (i+1) * sizeof (nfs_cb_resop4));
648 			kmem_free(resp->array,
649 			    args->array_len * sizeof (nfs_cb_resop4));
650 
651 			resp->array_len =  i + 1;
652 			resp->array = new_res;
653 		}
654 	}
655 
656 }
657 
658 static void
659 cb_compound_free(CB_COMPOUND4res *resp)
660 {
661 	uint_t i, op;
662 	nfs_cb_resop4 *resop;
663 
664 	if (resp->tag.utf8string_val) {
665 		UTF8STRING_FREE(resp->tag)
666 	}
667 
668 	for (i = 0; i < resp->array_len; i++) {
669 
670 		resop = &resp->array[i];
671 		op = (uint_t)resop->resop;
672 
673 		switch (op) {
674 
675 		case OP_CB_GETATTR:
676 
677 			cb_getattr_free(resop);
678 			break;
679 
680 		case OP_CB_RECALL:
681 
682 			cb_recall_free(resop);
683 			break;
684 
685 		default:
686 			break;
687 		}
688 	}
689 
690 	if (resp->array != NULL) {
691 		kmem_free(resp->array,
692 		    resp->array_len * sizeof (nfs_cb_resop4));
693 	}
694 }
695 
696 static void
697 cb_dispatch(struct svc_req *req, SVCXPRT *xprt)
698 {
699 	CB_COMPOUND4args args;
700 	CB_COMPOUND4res res;
701 	struct nfs4_callback_globals *ncg;
702 
703 	bool_t (*xdr_args)(), (*xdr_res)();
704 	void (*proc)(CB_COMPOUND4args *, CB_COMPOUND4res *, struct svc_req *,
705 	    struct nfs4_callback_globals *);
706 	void (*freeproc)(CB_COMPOUND4res *);
707 
708 	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
709 	ASSERT(ncg != NULL);
710 
711 	ncg->nfs4_callback_stats.cb_dispatch.value.ui64++;
712 
713 	switch (req->rq_proc) {
714 	case CB_NULL:
715 		xdr_args = xdr_void;
716 		xdr_res = xdr_void;
717 		proc = cb_null;
718 		freeproc = NULL;
719 		break;
720 
721 	case CB_COMPOUND:
722 		xdr_args = xdr_CB_COMPOUND4args_clnt;
723 		xdr_res = xdr_CB_COMPOUND4res;
724 		proc = cb_compound;
725 		freeproc = cb_compound_free;
726 		break;
727 
728 	default:
729 		CB_WARN("cb_dispatch: no proc\n");
730 		svcerr_noproc(xprt);
731 		return;
732 	}
733 
734 	args.tag.utf8string_val = NULL;
735 	args.array = NULL;
736 
737 	if (!SVC_GETARGS(xprt, xdr_args, (caddr_t)&args)) {
738 
739 		CB_WARN("cb_dispatch: cannot getargs\n");
740 		svcerr_decode(xprt);
741 		return;
742 	}
743 
744 	(*proc)(&args, &res, req, ncg);
745 
746 	if (svc_sendreply(xprt, xdr_res, (caddr_t)&res) == FALSE) {
747 
748 		CB_WARN("cb_dispatch: bad sendreply\n");
749 		svcerr_systemerr(xprt);
750 	}
751 
752 	if (freeproc)
753 		(*freeproc)(&res);
754 
755 	if (!SVC_FREEARGS(xprt, xdr_args, (caddr_t)&args)) {
756 
757 		CB_WARN("cb_dispatch: bad freeargs\n");
758 	}
759 }
760 
761 static rpcprog_t
762 nfs4_getnextprogram(struct nfs4_callback_globals *ncg)
763 {
764 	int i, j;
765 
766 	j = ncg->nfs4_program_hint;
767 	for (i = 0; i < nfs4_num_prognums; i++, j++) {
768 
769 		if (j >= nfs4_num_prognums)
770 			j = 0;
771 
772 		if (ncg->nfs4prog2server[j] == NULL) {
773 			ncg->nfs4_program_hint = j+1;
774 			return (j+NFS4_CALLBACK);
775 		}
776 	}
777 
778 	return (0);
779 }
780 
781 void
782 nfs4callback_destroy(nfs4_server_t *np)
783 {
784 	struct nfs4_callback_globals *ncg;
785 	int i;
786 
787 	if (np->s_program == 0)
788 		return;
789 
790 	ncg = np->zone_globals;
791 	i = np->s_program - NFS4_CALLBACK;
792 
793 	mutex_enter(&ncg->nfs4_cb_lock);
794 
795 	ASSERT(ncg->nfs4prog2server[i] == np);
796 
797 	ncg->nfs4prog2server[i] = NULL;
798 
799 	if (i < ncg->nfs4_program_hint)
800 		ncg->nfs4_program_hint = i;
801 
802 	mutex_exit(&ncg->nfs4_cb_lock);
803 }
804 
805 /*
806  * nfs4_setport - This function saves a netid and univeral address for
807  * the callback program.  These values will be used during setclientid.
808  */
809 static void
810 nfs4_setport(char *netid, char *uaddr, char *protofmly, char *proto,
811 	struct nfs4_callback_globals *ncg)
812 {
813 	struct nfs4_cb_port *p;
814 	bool_t found = FALSE;
815 
816 	ASSERT(MUTEX_HELD(&ncg->nfs4_cb_lock));
817 
818 	p = list_head(&ncg->nfs4_cb_ports);
819 	for (; p != NULL; p = list_next(&ncg->nfs4_cb_ports, p)) {
820 		if (strcmp(p->netid, netid) == 0) {
821 			found = TRUE;
822 			break;
823 		}
824 	}
825 	if (found == TRUE)
826 		(void) strcpy(p->uaddr, uaddr);
827 	else {
828 		p = kmem_alloc(sizeof (*p), KM_SLEEP);
829 
830 		(void) strcpy(p->uaddr, uaddr);
831 		(void) strcpy(p->netid, netid);
832 		(void) strcpy(p->protofmly, protofmly);
833 		(void) strcpy(p->proto, proto);
834 		list_insert_head(&ncg->nfs4_cb_ports, p);
835 	}
836 }
837 
838 /*
839  * nfs4_cb_args - This function is used to construct the callback
840  * portion of the arguments needed for setclientid.
841  */
842 
843 void
844 nfs4_cb_args(nfs4_server_t *np, struct knetconfig *knc, SETCLIENTID4args *args)
845 {
846 	struct nfs4_cb_port *p;
847 	bool_t found = FALSE;
848 	rpcprog_t pgm;
849 	struct nfs4_callback_globals *ncg = np->zone_globals;
850 
851 	/*
852 	 * This server structure may already have a program number
853 	 * assigned to it.  This happens when the client has to
854 	 * re-issue SETCLIENTID.  Just re-use the information.
855 	 */
856 	if (np->s_program >= NFS4_CALLBACK &&
857 	    np->s_program < NFS4_CALLBACK + nfs4_num_prognums)
858 		nfs4callback_destroy(np);
859 
860 	mutex_enter(&ncg->nfs4_cb_lock);
861 
862 	p = list_head(&ncg->nfs4_cb_ports);
863 	for (; p != NULL; p = list_next(&ncg->nfs4_cb_ports, p)) {
864 		if (strcmp(p->protofmly, knc->knc_protofmly) == 0 &&
865 		    strcmp(p->proto, knc->knc_proto) == 0) {
866 			found = TRUE;
867 			break;
868 		}
869 	}
870 
871 	if (found == FALSE) {
872 
873 		NFS4_DEBUG(nfs4_callback_debug,
874 		    (CE_WARN, "nfs4_cb_args: could not find netid for %s/%s\n",
875 		    knc->knc_protofmly, knc->knc_proto));
876 
877 		args->callback.cb_program = 0;
878 		args->callback.cb_location.r_netid = NULL;
879 		args->callback.cb_location.r_addr = NULL;
880 		args->callback_ident = 0;
881 		mutex_exit(&ncg->nfs4_cb_lock);
882 		return;
883 	}
884 
885 	if ((pgm = nfs4_getnextprogram(ncg)) == 0) {
886 		CB_WARN("nfs4_cb_args: out of program numbers\n");
887 
888 		args->callback.cb_program = 0;
889 		args->callback.cb_location.r_netid = NULL;
890 		args->callback.cb_location.r_addr = NULL;
891 		args->callback_ident = 0;
892 		mutex_exit(&ncg->nfs4_cb_lock);
893 		return;
894 	}
895 
896 	ncg->nfs4prog2server[pgm-NFS4_CALLBACK] = np;
897 	args->callback.cb_program = pgm;
898 	args->callback.cb_location.r_netid = p->netid;
899 	args->callback.cb_location.r_addr = p->uaddr;
900 	args->callback_ident = pgm;
901 
902 	np->s_program = pgm;
903 
904 	mutex_exit(&ncg->nfs4_cb_lock);
905 }
906 
907 static int
908 nfs4_dquery(struct nfs4_svc_args *arg, model_t model)
909 {
910 	file_t *fp;
911 	vnode_t *vp;
912 	rnode4_t *rp;
913 	int error;
914 	STRUCT_HANDLE(nfs4_svc_args, uap);
915 
916 	STRUCT_SET_HANDLE(uap, model, arg);
917 
918 	if ((fp = getf(STRUCT_FGET(uap, fd))) == NULL)
919 		return (EBADF);
920 
921 	vp = fp->f_vnode;
922 
923 	if (vp == NULL || vp->v_type != VREG ||
924 	    !vn_matchops(vp, nfs4_vnodeops)) {
925 		releasef(STRUCT_FGET(uap, fd));
926 		return (EBADF);
927 	}
928 
929 	rp = VTOR4(vp);
930 
931 	/*
932 	 * I can't convince myself that we need locking here.  The
933 	 * rnode cannot disappear and the value returned is instantly
934 	 * stale anway, so why bother?
935 	 */
936 
937 	error = suword32(STRUCT_FGETP(uap, netid), rp->r_deleg_type);
938 	releasef(STRUCT_FGET(uap, fd));
939 	return (error);
940 }
941 
942 
943 /*
944  * NFS4 client system call.  This service does the
945  * necessary initialization for the callback program.
946  * This is fashioned after the server side interaction
947  * between nfsd and the kernel.  On the client, the
948  * mount command forks and the child process does the
949  * necessary interaction with the kernel.
950  *
951  * uap->fd is the fd of an open transport provider
952  */
953 int
954 nfs4_svc(struct nfs4_svc_args *arg, model_t model)
955 {
956 	file_t *fp;
957 	int error;
958 	int readsize;
959 	char buf[KNC_STRSIZE], uaddr[KNC_STRSIZE];
960 	char protofmly[KNC_STRSIZE], proto[KNC_STRSIZE];
961 	size_t len;
962 	STRUCT_HANDLE(nfs4_svc_args, uap);
963 	struct netbuf addrmask;
964 	int cmd;
965 	SVCMASTERXPRT *cb_xprt;
966 	struct nfs4_callback_globals *ncg;
967 
968 #ifdef lint
969 	model = model;		/* STRUCT macros don't always refer to it */
970 #endif
971 
972 	STRUCT_SET_HANDLE(uap, model, arg);
973 
974 	if (STRUCT_FGET(uap, cmd) == NFS4_DQUERY)
975 		return (nfs4_dquery(arg, model));
976 
977 	if (secpolicy_nfs(CRED()) != 0)
978 		return (EPERM);
979 
980 	if ((fp = getf(STRUCT_FGET(uap, fd))) == NULL)
981 		return (EBADF);
982 
983 	/*
984 	 * Set read buffer size to rsize
985 	 * and add room for RPC headers.
986 	 */
987 	readsize = nfs3tsize() + (RPC_MAXDATASIZE - NFS_MAXDATA);
988 	if (readsize < RPC_MAXDATASIZE)
989 		readsize = RPC_MAXDATASIZE;
990 
991 	error = copyinstr((const char *)STRUCT_FGETP(uap, netid), buf,
992 	    KNC_STRSIZE, &len);
993 	if (error) {
994 		releasef(STRUCT_FGET(uap, fd));
995 		return (error);
996 	}
997 
998 	cmd = STRUCT_FGET(uap, cmd);
999 
1000 	if (cmd & NFS4_KRPC_START) {
1001 		addrmask.len = STRUCT_FGET(uap, addrmask.len);
1002 		addrmask.maxlen = STRUCT_FGET(uap, addrmask.maxlen);
1003 		addrmask.buf = kmem_alloc(addrmask.maxlen, KM_SLEEP);
1004 		error = copyin(STRUCT_FGETP(uap, addrmask.buf), addrmask.buf,
1005 		    addrmask.len);
1006 		if (error) {
1007 			releasef(STRUCT_FGET(uap, fd));
1008 			kmem_free(addrmask.buf, addrmask.maxlen);
1009 			return (error);
1010 		}
1011 	}
1012 	else
1013 		addrmask.buf = NULL;
1014 
1015 	error = copyinstr((const char *)STRUCT_FGETP(uap, addr), uaddr,
1016 	    sizeof (uaddr), &len);
1017 	if (error) {
1018 		releasef(STRUCT_FGET(uap, fd));
1019 		if (addrmask.buf)
1020 			kmem_free(addrmask.buf, addrmask.maxlen);
1021 		return (error);
1022 	}
1023 
1024 	error = copyinstr((const char *)STRUCT_FGETP(uap, protofmly), protofmly,
1025 	    sizeof (protofmly), &len);
1026 	if (error) {
1027 		releasef(STRUCT_FGET(uap, fd));
1028 		if (addrmask.buf)
1029 			kmem_free(addrmask.buf, addrmask.maxlen);
1030 		return (error);
1031 	}
1032 
1033 	error = copyinstr((const char *)STRUCT_FGETP(uap, proto), proto,
1034 	    sizeof (proto), &len);
1035 	if (error) {
1036 		releasef(STRUCT_FGET(uap, fd));
1037 		if (addrmask.buf)
1038 			kmem_free(addrmask.buf, addrmask.maxlen);
1039 		return (error);
1040 	}
1041 
1042 	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
1043 	ASSERT(ncg != NULL);
1044 
1045 	mutex_enter(&ncg->nfs4_cb_lock);
1046 	if (cmd & NFS4_SETPORT)
1047 		nfs4_setport(buf, uaddr, protofmly, proto, ncg);
1048 
1049 	if (cmd & NFS4_KRPC_START) {
1050 		error = svc_tli_kcreate(fp, readsize, buf, &addrmask, &cb_xprt,
1051 		    &nfs4_cb_sct, NULL, NFS_CB_SVCPOOL_ID, FALSE);
1052 		if (error) {
1053 			CB_WARN1("nfs4_svc: svc_tli_kcreate failed %d\n",
1054 			    error);
1055 			kmem_free(addrmask.buf, addrmask.maxlen);
1056 		}
1057 	}
1058 
1059 	mutex_exit(&ncg->nfs4_cb_lock);
1060 	releasef(STRUCT_FGET(uap, fd));
1061 	return (error);
1062 }
1063 
1064 struct nfs4_callback_globals *
1065 nfs4_get_callback_globals(void)
1066 {
1067 	return (zone_getspecific(nfs4_callback_zone_key, nfs_zone()));
1068 }
1069 
1070 static void *
1071 nfs4_callback_init_zone(zoneid_t zoneid)
1072 {
1073 	kstat_t *nfs4_callback_kstat;
1074 	struct nfs4_callback_globals *ncg;
1075 
1076 	ncg = kmem_zalloc(sizeof (*ncg), KM_SLEEP);
1077 
1078 	ncg->nfs4prog2server = kmem_zalloc(nfs4_num_prognums *
1079 	    sizeof (struct nfs4_server *), KM_SLEEP);
1080 
1081 	/* initialize the dlist */
1082 	mutex_init(&ncg->nfs4_dlist_lock, NULL, MUTEX_DEFAULT, NULL);
1083 	list_create(&ncg->nfs4_dlist, sizeof (struct nfs4_dnode),
1084 	    offsetof(struct nfs4_dnode, linkage));
1085 
1086 	/* initialize cb_port list */
1087 	mutex_init(&ncg->nfs4_cb_lock, NULL, MUTEX_DEFAULT, NULL);
1088 	list_create(&ncg->nfs4_cb_ports, sizeof (struct nfs4_cb_port),
1089 	    offsetof(struct nfs4_cb_port, linkage));
1090 
1091 	/* get our own copy of the kstats */
1092 	bcopy(&nfs4_callback_stats_tmpl, &ncg->nfs4_callback_stats,
1093 	    sizeof (nfs4_callback_stats_tmpl));
1094 	/* register "nfs:0:nfs4_callback_stats" for this zone */
1095 	if ((nfs4_callback_kstat =
1096 	    kstat_create_zone("nfs", 0, "nfs4_callback_stats", "misc",
1097 	    KSTAT_TYPE_NAMED,
1098 	    sizeof (ncg->nfs4_callback_stats) / sizeof (kstat_named_t),
1099 	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE,
1100 	    zoneid)) != NULL) {
1101 		nfs4_callback_kstat->ks_data = &ncg->nfs4_callback_stats;
1102 		kstat_install(nfs4_callback_kstat);
1103 	}
1104 	return (ncg);
1105 }
1106 
/*
 * Throw away, without going over the wire, every delegation on the
 * s_deleg_list of each server registered in ncg->nfs4prog2server[].
 *
 * Called from both the zone shutdown and zone fini callbacks.
 */
static void
nfs4_discard_delegations(struct nfs4_callback_globals *ncg)
{
	nfs4_server_t *sp;
	int i, num_removed;

	/*
	 * It's OK here to just run through the registered "programs", as
	 * servers without programs won't have any delegations to handle.
	 */
	for (i = 0; i < nfs4_num_prognums; i++) {
		rnode4_t *rp;

		/* nfs4_cb_lock only covers the table lookup itself */
		mutex_enter(&ncg->nfs4_cb_lock);
		sp = ncg->nfs4prog2server[i];
		mutex_exit(&ncg->nfs4_cb_lock);

		/*
		 * On success we evidently come back holding sp->s_lock and
		 * a reference on sp: both are dropped at the bottom of this
		 * iteration.  NOTE(review): empty slots (sp == NULL) rely on
		 * nfs4_server_vlock() returning FALSE -- confirm.
		 */
		if (nfs4_server_vlock(sp, 1) == FALSE)
			continue;
		num_removed = 0;
		while ((rp = list_head(&sp->s_deleg_list)) != NULL) {
			mutex_enter(&rp->r_statev4_lock);
			if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
				/*
				 * We need to take matters into our own hands,
				 * as nfs4delegreturn_cleanup_impl() won't
				 * remove this from the list.
				 */
				list_remove(&sp->s_deleg_list, rp);
				mutex_exit(&rp->r_statev4_lock);
				nfs4_dec_state_ref_count_nolock(sp,
				    VTOMI4(RTOV4(rp)));
				/*
				 * The matching nfs4_server_rele() is deferred
				 * until s_lock has been dropped (see below).
				 */
				num_removed++;
				continue;
			}
			mutex_exit(&rp->r_statev4_lock);
			/* keep the vnode around while s_lock is dropped */
			VN_HOLD(RTOV4(rp));
			/*
			 * nfs4delegreturn_cleanup_impl() takes s_lock itself
			 * when handed a non-NULL server, so release it first.
			 */
			mutex_exit(&sp->s_lock);
			/*
			 * The following will remove the node from the list.
			 */
			nfs4delegreturn_cleanup_impl(rp, sp, ncg);
			VN_RELE(RTOV4(rp));
			mutex_enter(&sp->s_lock);
		}
		mutex_exit(&sp->s_lock);
		/* each removed list node reles a reference */
		while (num_removed-- > 0)
			nfs4_server_rele(sp);
		/* remove our reference for nfs4_server_vlock */
		nfs4_server_rele(sp);
	}
}
1160 
1161 /* ARGSUSED */
1162 static void
1163 nfs4_callback_shutdown_zone(zoneid_t zoneid, void *data)
1164 {
1165 	struct nfs4_callback_globals *ncg = data;
1166 
1167 	/*
1168 	 * Clean pending delegation return list.
1169 	 */
1170 	nfs4_dlistclean_impl(ncg, NFS4_DR_DISCARD);
1171 
1172 	/*
1173 	 * Discard all delegations.
1174 	 */
1175 	nfs4_discard_delegations(ncg);
1176 }
1177 
1178 static void
1179 nfs4_callback_fini_zone(zoneid_t zoneid, void *data)
1180 {
1181 	struct nfs4_callback_globals *ncg = data;
1182 	struct nfs4_cb_port *p;
1183 	nfs4_server_t *sp, *next;
1184 	nfs4_server_t freelist;
1185 	int i;
1186 
1187 	kstat_delete_byname_zone("nfs", 0, "nfs4_callback_stats", zoneid);
1188 
1189 	/*
1190 	 * Discard all delegations that may have crept in since we did the
1191 	 * _shutdown.
1192 	 */
1193 	nfs4_discard_delegations(ncg);
1194 	/*
1195 	 * We're completely done with this zone and all associated
1196 	 * nfs4_server_t's.  Any remaining nfs4_server_ts should only have one
1197 	 * more reference outstanding -- the reference we didn't release in
1198 	 * nfs4_renew_lease_thread().
1199 	 *
1200 	 * Here we need to run through the global nfs4_server_lst as we need to
1201 	 * deal with nfs4_server_ts without programs, as they also have threads
1202 	 * created for them, and so have outstanding references that we need to
1203 	 * release.
1204 	 */
1205 	freelist.forw = &freelist;
1206 	freelist.back = &freelist;
1207 	mutex_enter(&nfs4_server_lst_lock);
1208 	sp = nfs4_server_lst.forw;
1209 	while (sp != &nfs4_server_lst) {
1210 		next = sp->forw;
1211 		if (sp->zoneid == zoneid) {
1212 			remque(sp);
1213 			insque(sp, &freelist);
1214 		}
1215 		sp = next;
1216 	}
1217 	mutex_exit(&nfs4_server_lst_lock);
1218 
1219 	sp = freelist.forw;
1220 	while (sp != &freelist) {
1221 		next = sp->forw;
1222 		nfs4_server_rele(sp);	/* free the list's reference */
1223 		sp = next;
1224 	}
1225 
1226 #ifdef DEBUG
1227 	for (i = 0; i < nfs4_num_prognums; i++) {
1228 		ASSERT(ncg->nfs4prog2server[i] == NULL);
1229 	}
1230 #endif
1231 	kmem_free(ncg->nfs4prog2server, nfs4_num_prognums *
1232 	    sizeof (struct nfs4_server *));
1233 
1234 	mutex_enter(&ncg->nfs4_cb_lock);
1235 	while ((p = list_head(&ncg->nfs4_cb_ports)) != NULL) {
1236 		list_remove(&ncg->nfs4_cb_ports, p);
1237 		kmem_free(p, sizeof (*p));
1238 	}
1239 	list_destroy(&ncg->nfs4_cb_ports);
1240 	mutex_destroy(&ncg->nfs4_cb_lock);
1241 	list_destroy(&ncg->nfs4_dlist);
1242 	mutex_destroy(&ncg->nfs4_dlist_lock);
1243 	kmem_free(ncg, sizeof (*ncg));
1244 }
1245 
1246 void
1247 nfs4_callback_init(void)
1248 {
1249 	int i;
1250 	SVC_CALLOUT *nfs4_cb_sc;
1251 
1252 	/* initialize the callback table */
1253 	nfs4_cb_sc = kmem_alloc(nfs4_num_prognums *
1254 	    sizeof (SVC_CALLOUT), KM_SLEEP);
1255 
1256 	for (i = 0; i < nfs4_num_prognums; i++) {
1257 		nfs4_cb_sc[i].sc_prog = NFS4_CALLBACK+i;
1258 		nfs4_cb_sc[i].sc_versmin = NFS_CB;
1259 		nfs4_cb_sc[i].sc_versmax = NFS_CB;
1260 		nfs4_cb_sc[i].sc_dispatch = cb_dispatch;
1261 	}
1262 
1263 	nfs4_cb_sct.sct_size = nfs4_num_prognums;
1264 	nfs4_cb_sct.sct_free = FALSE;
1265 	nfs4_cb_sct.sct_sc = nfs4_cb_sc;
1266 
1267 	/*
1268 	 * Compute max bytes required for dyamically allocated parts
1269 	 * of cb_getattr reply.  Only size and change are supported now.
1270 	 * If CB_GETATTR is changed to reply with additional attrs,
1271 	 * additional sizes must be added below.
1272 	 *
1273 	 * fattr4_change + fattr4_size == uint64_t + uint64_t
1274 	 */
1275 	cb_getattr_bytes = 2 * BYTES_PER_XDR_UNIT + 2 * BYTES_PER_XDR_UNIT;
1276 
1277 	zone_key_create(&nfs4_callback_zone_key, nfs4_callback_init_zone,
1278 	    nfs4_callback_shutdown_zone, nfs4_callback_fini_zone);
1279 }
1280 
/*
 * Counterpart of nfs4_callback_init(); currently does nothing.
 *
 * NOTE(review): the callout table allocated by nfs4_callback_init() and
 * the zone key it creates are not released here -- confirm whether that
 * is intentional (e.g. the module is never unloaded).
 */
void
nfs4_callback_fini(void)
{
}
1285 
1286 /*
1287  * NB: This function can be called from the *wrong* zone (ie, the zone that
1288  * 'rp' belongs to and the caller's zone may not be the same).  This can happen
1289  * if the zone is going away and we get called from nfs4_async_inactive().  In
1290  * this case the globals will be NULL and we won't update the counters, which
1291  * doesn't matter as the zone is going away anyhow.
1292  */
1293 static void
1294 nfs4delegreturn_cleanup_impl(rnode4_t *rp, nfs4_server_t *np,
1295 	struct nfs4_callback_globals *ncg)
1296 {
1297 	mntinfo4_t *mi = VTOMI4(RTOV4(rp));
1298 	boolean_t need_rele = B_FALSE;
1299 
1300 	/*
1301 	 * Caller must be holding mi_recovlock in read mode
1302 	 * to call here.  This is provided by start_op.
1303 	 * Delegation management requires to grab s_lock
1304 	 * first and then r_statev4_lock.
1305 	 */
1306 
1307 	if (np == NULL) {
1308 		np = find_nfs4_server_all(mi, 1);
1309 		ASSERT(np != NULL);
1310 		need_rele = B_TRUE;
1311 	} else {
1312 		mutex_enter(&np->s_lock);
1313 	}
1314 
1315 	mutex_enter(&rp->r_statev4_lock);
1316 
1317 	if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
1318 		mutex_exit(&rp->r_statev4_lock);
1319 		mutex_exit(&np->s_lock);
1320 		if (need_rele)
1321 			nfs4_server_rele(np);
1322 		return;
1323 	}
1324 
1325 	/*
1326 	 * Free the cred originally held when
1327 	 * the delegation was granted.  Caller must
1328 	 * hold this cred if it wants to use it after
1329 	 * this call.
1330 	 */
1331 	crfree(rp->r_deleg_cred);
1332 	rp->r_deleg_cred = NULL;
1333 	rp->r_deleg_type = OPEN_DELEGATE_NONE;
1334 	rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
1335 	rp->r_deleg_needs_recall = FALSE;
1336 	rp->r_deleg_return_pending = FALSE;
1337 
1338 	/*
1339 	 * Remove the rnode from the server's list and
1340 	 * update the ref counts.
1341 	 */
1342 	list_remove(&np->s_deleg_list, rp);
1343 	mutex_exit(&rp->r_statev4_lock);
1344 	nfs4_dec_state_ref_count_nolock(np, mi);
1345 	mutex_exit(&np->s_lock);
1346 	/* removed list node removes a reference */
1347 	nfs4_server_rele(np);
1348 	if (need_rele)
1349 		nfs4_server_rele(np);
1350 	if (ncg != NULL)
1351 		ncg->nfs4_callback_stats.delegations.value.ui64--;
1352 }
1353 
1354 void
1355 nfs4delegreturn_cleanup(rnode4_t *rp, nfs4_server_t *np)
1356 {
1357 	struct nfs4_callback_globals *ncg;
1358 
1359 	if (np != NULL) {
1360 		ncg = np->zone_globals;
1361 	} else if (nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone) {
1362 		ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
1363 		ASSERT(ncg != NULL);
1364 	} else {
1365 		/*
1366 		 * Request coming from the wrong zone.
1367 		 */
1368 		ASSERT(getzoneid() == GLOBAL_ZONEID);
1369 		ncg = NULL;
1370 	}
1371 
1372 	nfs4delegreturn_cleanup_impl(rp, np, ncg);
1373 }
1374 
1375 static void
1376 nfs4delegreturn_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
1377 	cred_t *cr, vnode_t *vp)
1378 {
1379 	if (error != ETIMEDOUT && error != EINTR &&
1380 	    !NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
1381 		lost_rqstp->lr_op = 0;
1382 		return;
1383 	}
1384 
1385 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
1386 	    "nfs4close_save_lost_rqst: error %d", error));
1387 
1388 	lost_rqstp->lr_op = OP_DELEGRETURN;
1389 	/*
1390 	 * The vp is held and rele'd via the recovery code.
1391 	 * See nfs4_save_lost_rqst.
1392 	 */
1393 	lost_rqstp->lr_vp = vp;
1394 	lost_rqstp->lr_dvp = NULL;
1395 	lost_rqstp->lr_oop = NULL;
1396 	lost_rqstp->lr_osp = NULL;
1397 	lost_rqstp->lr_lop = NULL;
1398 	lost_rqstp->lr_cr = cr;
1399 	lost_rqstp->lr_flk = NULL;
1400 	lost_rqstp->lr_putfirst = FALSE;
1401 }
1402 
1403 static void
1404 nfs4delegreturn_otw(rnode4_t *rp, cred_t *cr, nfs4_error_t *ep)
1405 {
1406 	COMPOUND4args_clnt args;
1407 	COMPOUND4res_clnt res;
1408 	nfs_argop4 argops[3];
1409 	nfs4_ga_res_t *garp = NULL;
1410 	hrtime_t t;
1411 	int numops;
1412 	int doqueue = 1;
1413 
1414 	args.ctag = TAG_DELEGRETURN;
1415 
1416 	numops = 3;		/* PUTFH, GETATTR, DELEGRETURN */
1417 
1418 	args.array = argops;
1419 	args.array_len = numops;
1420 
1421 	argops[0].argop = OP_CPUTFH;
1422 	argops[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
1423 
1424 	argops[1].argop = OP_GETATTR;
1425 	argops[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1426 	argops[1].nfs_argop4_u.opgetattr.mi = VTOMI4(RTOV4(rp));
1427 
1428 	argops[2].argop = OP_DELEGRETURN;
1429 	argops[2].nfs_argop4_u.opdelegreturn.deleg_stateid =
1430 	    rp->r_deleg_stateid;
1431 
1432 	t = gethrtime();
1433 	rfs4call(VTOMI4(RTOV4(rp)), &args, &res, cr, &doqueue, 0, ep);
1434 
1435 	if (ep->error)
1436 		return;
1437 
1438 	if (res.status == NFS4_OK) {
1439 		garp = &res.array[1].nfs_resop4_u.opgetattr.ga_res;
1440 		nfs4_attr_cache(RTOV4(rp), garp, t, cr, TRUE, NULL);
1441 
1442 	}
1443 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1444 }
1445 
1446 int
1447 nfs4_do_delegreturn(rnode4_t *rp, int flags, cred_t *cr,
1448 	struct nfs4_callback_globals *ncg)
1449 {
1450 	vnode_t *vp = RTOV4(rp);
1451 	mntinfo4_t *mi = VTOMI4(vp);
1452 	nfs4_lost_rqst_t lost_rqst;
1453 	nfs4_recov_state_t recov_state;
1454 	bool_t needrecov = FALSE, recovonly, done = FALSE;
1455 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
1456 
1457 	ncg->nfs4_callback_stats.delegreturn.value.ui64++;
1458 
1459 	while (!done) {
1460 		e.error = nfs4_start_fop(mi, vp, NULL, OH_DELEGRETURN,
1461 		    &recov_state, &recovonly);
1462 
1463 		if (e.error) {
1464 			if (flags & NFS4_DR_FORCE) {
1465 				(void) nfs_rw_enter_sig(&mi->mi_recovlock,
1466 				    RW_READER, 0);
1467 				nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
1468 				nfs_rw_exit(&mi->mi_recovlock);
1469 			}
1470 			break;
1471 		}
1472 
1473 		/*
1474 		 * Check to see if the delegation has already been
1475 		 * returned by the recovery thread.   The state of
1476 		 * the delegation cannot change at this point due
1477 		 * to start_fop and the r_deleg_recall_lock.
1478 		 */
1479 		if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
1480 			e.error = 0;
1481 			nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
1482 			break;
1483 		}
1484 
1485 		if (recovonly) {
1486 			/*
1487 			 * Delegation will be returned via the
1488 			 * recovery framework.  Build a lost request
1489 			 * structure, start recovery and get out.
1490 			 */
1491 			nfs4_error_init(&e, EINTR);
1492 			nfs4delegreturn_save_lost_rqst(e.error, &lost_rqst,
1493 			    cr, vp);
1494 			(void) nfs4_start_recovery(&e, mi, vp,
1495 			    NULL, &rp->r_deleg_stateid,
1496 			    lost_rqst.lr_op == OP_DELEGRETURN ?
1497 			    &lost_rqst : NULL, OP_DELEGRETURN, NULL);
1498 			nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
1499 			break;
1500 		}
1501 
1502 		nfs4delegreturn_otw(rp, cr, &e);
1503 
1504 		/*
1505 		 * Ignore some errors on delegreturn; no point in marking
1506 		 * the file dead on a state destroying operation.
1507 		 */
1508 		if (e.error == 0 && (nfs4_recov_marks_dead(e.stat) ||
1509 		    e.stat == NFS4ERR_BADHANDLE ||
1510 		    e.stat == NFS4ERR_STALE))
1511 			needrecov = FALSE;
1512 		else
1513 			needrecov = nfs4_needs_recovery(&e, TRUE, vp->v_vfsp);
1514 
1515 		if (needrecov) {
1516 			nfs4delegreturn_save_lost_rqst(e.error, &lost_rqst,
1517 			    cr, vp);
1518 			(void) nfs4_start_recovery(&e, mi, vp,
1519 			    NULL, &rp->r_deleg_stateid,
1520 			    lost_rqst.lr_op == OP_DELEGRETURN ?
1521 			    &lost_rqst : NULL, OP_DELEGRETURN, NULL);
1522 		} else {
1523 			nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
1524 			done = TRUE;
1525 		}
1526 
1527 		nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
1528 	}
1529 	return (e.error);
1530 }
1531 
1532 /*
1533  * nfs4_resend_delegreturn - used to drive the delegreturn
1534  * operation via the recovery thread.
1535  */
1536 void
1537 nfs4_resend_delegreturn(nfs4_lost_rqst_t *lorp, nfs4_error_t *ep,
1538 	nfs4_server_t *np)
1539 {
1540 	rnode4_t *rp = VTOR4(lorp->lr_vp);
1541 
1542 	/* If the file failed recovery, just quit. */
1543 	mutex_enter(&rp->r_statelock);
1544 	if (rp->r_flags & R4RECOVERR) {
1545 		ep->error = EIO;
1546 	}
1547 	mutex_exit(&rp->r_statelock);
1548 
1549 	if (!ep->error)
1550 		nfs4delegreturn_otw(rp, lorp->lr_cr, ep);
1551 
1552 	/*
1553 	 * If recovery is now needed, then return the error
1554 	 * and status and let the recovery thread handle it,
1555 	 * including re-driving another delegreturn.  Otherwise,
1556 	 * just give up and clean up the delegation.
1557 	 */
1558 	if (nfs4_needs_recovery(ep, TRUE, lorp->lr_vp->v_vfsp))
1559 		return;
1560 
1561 	if (rp->r_deleg_type != OPEN_DELEGATE_NONE)
1562 		nfs4delegreturn_cleanup(rp, np);
1563 
1564 	nfs4_error_zinit(ep);
1565 }
1566 
1567 /*
1568  * nfs4delegreturn - general function to return a delegation.
1569  *
1570  * NFS4_DR_FORCE - return the delegation even if start_op fails
1571  * NFS4_DR_PUSH - push modified data back to the server via VOP_PUTPAGE
1572  * NFS4_DR_DISCARD - discard the delegation w/o delegreturn
1573  * NFS4_DR_DID_OP - calling function already did nfs4_start_op
1574  * NFS4_DR_RECALL - delegreturned initiated via CB_RECALL
1575  * NFS4_DR_REOPEN - do file reopens, if applicable
1576  */
1577 static int
1578 nfs4delegreturn_impl(rnode4_t *rp, int flags, struct nfs4_callback_globals *ncg)
1579 {
1580 	int error = 0;
1581 	cred_t *cr = NULL;
1582 	vnode_t *vp;
1583 	bool_t needrecov = FALSE;
1584 	bool_t rw_entered = FALSE;
1585 	bool_t do_reopen;
1586 
1587 	vp = RTOV4(rp);
1588 
1589 	/*
1590 	 * If NFS4_DR_DISCARD is set by itself, take a short-cut and
1591 	 * discard without doing an otw DELEGRETURN.  This may only be used
1592 	 * by the recovery thread because it bypasses the synchronization
1593 	 * with r_deleg_recall_lock and mi->mi_recovlock.
1594 	 */
1595 	if (flags == NFS4_DR_DISCARD) {
1596 		nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
1597 		return (0);
1598 	}
1599 
1600 	if (flags & NFS4_DR_DID_OP) {
1601 		/*
1602 		 * Caller had already done start_op, which means the
1603 		 * r_deleg_recall_lock is already held in READ mode
1604 		 * so we cannot take it in write mode.  Return the
1605 		 * delegation asynchronously.
1606 		 *
1607 		 * Remove the NFS4_DR_DID_OP flag so we don't
1608 		 * get stuck looping through here.
1609 		 */
1610 		VN_HOLD(vp);
1611 		nfs4delegreturn_async(rp, (flags & ~NFS4_DR_DID_OP), FALSE);
1612 		return (0);
1613 	}
1614 
1615 	/*
1616 	 * Verify we still have a delegation and crhold the credential.
1617 	 */
1618 	mutex_enter(&rp->r_statev4_lock);
1619 	if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
1620 		mutex_exit(&rp->r_statev4_lock);
1621 		goto out;
1622 	}
1623 	cr = rp->r_deleg_cred;
1624 	ASSERT(cr != NULL);
1625 	crhold(cr);
1626 	mutex_exit(&rp->r_statev4_lock);
1627 
1628 	/*
1629 	 * Push the modified data back to the server synchronously
1630 	 * before doing DELEGRETURN.
1631 	 */
1632 	if (flags & NFS4_DR_PUSH)
1633 		(void) VOP_PUTPAGE(vp, 0, 0, 0, cr, NULL);
1634 
1635 	/*
1636 	 * Take r_deleg_recall_lock in WRITE mode, this will prevent
1637 	 * nfs4_is_otw_open_necessary from trying to use the delegation
1638 	 * while the DELEGRETURN is in progress.
1639 	 */
1640 	(void) nfs_rw_enter_sig(&rp->r_deleg_recall_lock, RW_WRITER, FALSE);
1641 
1642 	rw_entered = TRUE;
1643 
1644 	if (rp->r_deleg_type == OPEN_DELEGATE_NONE)
1645 		goto out;
1646 
1647 	if (flags & NFS4_DR_REOPEN) {
1648 		/*
1649 		 * If R4RECOVERRP is already set, then skip re-opening
1650 		 * the delegation open streams and go straight to doing
1651 		 * delegreturn.  (XXX if the file has failed recovery, then the
1652 		 * delegreturn attempt is likely to be futile.)
1653 		 */
1654 		mutex_enter(&rp->r_statelock);
1655 		do_reopen = !(rp->r_flags & R4RECOVERRP);
1656 		mutex_exit(&rp->r_statelock);
1657 
1658 		if (do_reopen) {
1659 			error = deleg_reopen(vp, &needrecov, ncg, flags);
1660 			if (error != 0) {
1661 				if ((flags & (NFS4_DR_FORCE | NFS4_DR_RECALL))
1662 				    == 0)
1663 					goto out;
1664 			} else if (needrecov) {
1665 				if ((flags & NFS4_DR_FORCE) == 0)
1666 					goto out;
1667 			}
1668 		}
1669 	}
1670 
1671 	if (flags & NFS4_DR_DISCARD) {
1672 		mntinfo4_t *mi = VTOMI4(RTOV4(rp));
1673 
1674 		mutex_enter(&rp->r_statelock);
1675 		/*
1676 		 * deleg_return_pending is cleared inside of delegation_accept
1677 		 * when a delegation is accepted.  if this flag has been
1678 		 * cleared, then a new delegation has overwritten the one we
1679 		 * were about to throw away.
1680 		 */
1681 		if (!rp->r_deleg_return_pending) {
1682 			mutex_exit(&rp->r_statelock);
1683 			goto out;
1684 		}
1685 		mutex_exit(&rp->r_statelock);
1686 		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, FALSE);
1687 		nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
1688 		nfs_rw_exit(&mi->mi_recovlock);
1689 	} else {
1690 		error = nfs4_do_delegreturn(rp, flags, cr, ncg);
1691 	}
1692 
1693 out:
1694 	if (cr)
1695 		crfree(cr);
1696 	if (rw_entered)
1697 		nfs_rw_exit(&rp->r_deleg_recall_lock);
1698 	return (error);
1699 }
1700 
1701 int
1702 nfs4delegreturn(rnode4_t *rp, int flags)
1703 {
1704 	struct nfs4_callback_globals *ncg;
1705 
1706 	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
1707 	ASSERT(ncg != NULL);
1708 
1709 	return (nfs4delegreturn_impl(rp, flags, ncg));
1710 }
1711 
1712 void
1713 nfs4delegreturn_async(rnode4_t *rp, int flags, bool_t trunc)
1714 {
1715 	struct cb_recall_pass *pp;
1716 
1717 	pp = kmem_alloc(sizeof (struct cb_recall_pass), KM_SLEEP);
1718 	pp->rp = rp;
1719 	pp->flags = flags;
1720 	pp->truncate = trunc;
1721 
1722 	/*
1723 	 * Fire up a thread to do the actual delegreturn
1724 	 * Caller must guarantee that the rnode doesn't
1725 	 * vanish (by calling VN_HOLD).
1726 	 */
1727 
1728 	(void) zthread_create(NULL, 0, nfs4delegreturn_thread, pp, 0,
1729 	    minclsyspri);
1730 }
1731 
1732 static void
1733 delegreturn_all_thread(rpcprog_t *pp)
1734 {
1735 	nfs4_server_t *np;
1736 	bool_t found = FALSE;
1737 	rpcprog_t prog;
1738 	rnode4_t *rp;
1739 	vnode_t *vp;
1740 	zoneid_t zoneid = getzoneid();
1741 	struct nfs4_callback_globals *ncg;
1742 
1743 	NFS4_DEBUG(nfs4_drat_debug,
1744 	    (CE_NOTE, "delereturn_all_thread: prog %d\n", *pp));
1745 
1746 	prog = *pp;
1747 	kmem_free(pp, sizeof (*pp));
1748 	pp = NULL;
1749 
1750 	mutex_enter(&nfs4_server_lst_lock);
1751 	for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
1752 		if (np->zoneid == zoneid && np->s_program == prog) {
1753 			mutex_enter(&np->s_lock);
1754 			found = TRUE;
1755 			break;
1756 		}
1757 	}
1758 	mutex_exit(&nfs4_server_lst_lock);
1759 
1760 	/*
1761 	 * It's possible that the nfs4_server which was using this
1762 	 * program number has vanished since this thread is async.
1763 	 * If so, just return.  Your work here is finished, my friend.
1764 	 */
1765 	if (!found)
1766 		goto out;
1767 
1768 	ncg = np->zone_globals;
1769 	while ((rp = list_head(&np->s_deleg_list)) != NULL) {
1770 		vp = RTOV4(rp);
1771 		VN_HOLD(vp);
1772 		mutex_exit(&np->s_lock);
1773 		(void) nfs4delegreturn_impl(rp, NFS4_DR_PUSH|NFS4_DR_REOPEN,
1774 		    ncg);
1775 		VN_RELE(vp);
1776 
1777 		/* retake the s_lock for next trip through the loop */
1778 		mutex_enter(&np->s_lock);
1779 	}
1780 	mutex_exit(&np->s_lock);
1781 out:
1782 	NFS4_DEBUG(nfs4_drat_debug,
1783 	    (CE_NOTE, "delereturn_all_thread: complete\n"));
1784 	zthread_exit();
1785 }
1786 
1787 void
1788 nfs4_delegreturn_all(nfs4_server_t *sp)
1789 {
1790 	rpcprog_t pro, *pp;
1791 
1792 	mutex_enter(&sp->s_lock);
1793 
1794 	/* Check to see if the delegation list is empty */
1795 
1796 	if (list_head(&sp->s_deleg_list) == NULL) {
1797 		mutex_exit(&sp->s_lock);
1798 		return;
1799 	}
1800 	/*
1801 	 * Grab the program number; the async thread will use this
1802 	 * to find the nfs4_server.
1803 	 */
1804 	pro = sp->s_program;
1805 	mutex_exit(&sp->s_lock);
1806 	pp = kmem_alloc(sizeof (rpcprog_t), KM_SLEEP);
1807 	*pp = pro;
1808 	(void) zthread_create(NULL, 0, delegreturn_all_thread, pp, 0,
1809 	    minclsyspri);
1810 }
1811 
1812 
1813 /*
1814  * Discard any delegations
1815  *
1816  * Iterate over the servers s_deleg_list and
1817  * for matching mount-point rnodes discard
1818  * the delegation.
1819  */
1820 void
1821 nfs4_deleg_discard(mntinfo4_t *mi, nfs4_server_t *sp)
1822 {
1823 	rnode4_t *rp, *next;
1824 	mntinfo4_t *r_mi;
1825 	struct nfs4_callback_globals *ncg;
1826 
1827 	ASSERT(mutex_owned(&sp->s_lock));
1828 	ncg = sp->zone_globals;
1829 
1830 	for (rp = list_head(&sp->s_deleg_list); rp != NULL; rp = next) {
1831 		r_mi = VTOMI4(RTOV4(rp));
1832 		next = list_next(&sp->s_deleg_list, rp);
1833 
1834 		if (r_mi != mi) {
1835 			/*
1836 			 * Skip if this rnode is in not on the
1837 			 * same mount-point
1838 			 */
1839 			continue;
1840 		}
1841 
1842 		ASSERT(rp->r_deleg_type == OPEN_DELEGATE_READ);
1843 
1844 #ifdef DEBUG
1845 		if (nfs4_client_recov_debug) {
1846 			zprintf(getzoneid(),
1847 			    "nfs4_deleg_discard: matched rnode %p "
1848 			"-- discarding delegation\n", (void *)rp);
1849 		}
1850 #endif
1851 		mutex_enter(&rp->r_statev4_lock);
1852 		/*
1853 		 * Free the cred originally held when the delegation
1854 		 * was granted. Also need to decrement the refcnt
1855 		 * on this server for each delegation we discard
1856 		 */
1857 		if (rp->r_deleg_cred)
1858 			crfree(rp->r_deleg_cred);
1859 		rp->r_deleg_cred = NULL;
1860 		rp->r_deleg_type = OPEN_DELEGATE_NONE;
1861 		rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
1862 		rp->r_deleg_needs_recall = FALSE;
1863 		ASSERT(sp->s_refcnt > 1);
1864 		sp->s_refcnt--;
1865 		list_remove(&sp->s_deleg_list, rp);
1866 		mutex_exit(&rp->r_statev4_lock);
1867 		nfs4_dec_state_ref_count_nolock(sp, mi);
1868 		ncg->nfs4_callback_stats.delegations.value.ui64--;
1869 	}
1870 }
1871 
1872 /*
1873  * Reopen any open streams that were covered by the given file's
1874  * delegation.
1875  * Returns zero or an errno value.  If there was no error, *recovp
1876  * indicates whether recovery was initiated.
1877  */
1878 
1879 static int
1880 deleg_reopen(vnode_t *vp, bool_t *recovp, struct nfs4_callback_globals *ncg,
1881 	int flags)
1882 {
1883 	nfs4_open_stream_t *osp;
1884 	nfs4_recov_state_t recov_state;
1885 	bool_t needrecov = FALSE;
1886 	mntinfo4_t *mi;
1887 	rnode4_t *rp;
1888 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
1889 	int claimnull;
1890 
1891 	mi = VTOMI4(vp);
1892 	rp = VTOR4(vp);
1893 
1894 	recov_state.rs_flags = 0;
1895 	recov_state.rs_num_retry_despite_err = 0;
1896 
1897 retry:
1898 	if ((e.error = nfs4_start_op(mi, vp, NULL, &recov_state)) != 0) {
1899 		return (e.error);
1900 	}
1901 
1902 	/*
1903 	 * if we mean to discard the delegation, it must be BAD, so don't
1904 	 * use it when doing the reopen or it will fail too.
1905 	 */
1906 	claimnull = (flags & NFS4_DR_DISCARD);
1907 	/*
1908 	 * Loop through the open streams for this rnode to find
1909 	 * all of the ones created using the delegation state ID.
1910 	 * Each of these needs to be re-opened.
1911 	 */
1912 
1913 	while ((osp = get_next_deleg_stream(rp, claimnull)) != NULL) {
1914 
1915 		if (claimnull) {
1916 			nfs4_reopen(vp, osp, &e, CLAIM_NULL, FALSE, FALSE);
1917 		} else {
1918 			ncg->nfs4_callback_stats.claim_cur.value.ui64++;
1919 
1920 			nfs4_reopen(vp, osp, &e, CLAIM_DELEGATE_CUR, FALSE,
1921 			    FALSE);
1922 			if (e.error == 0 && e.stat == NFS4_OK)
1923 				ncg->nfs4_callback_stats.
1924 				    claim_cur_ok.value.ui64++;
1925 		}
1926 
1927 		if (e.error == EAGAIN) {
1928 			nfs4_end_op(mi, vp, NULL, &recov_state, TRUE);
1929 			goto retry;
1930 		}
1931 
1932 		/*
1933 		 * if error is EINTR, ETIMEDOUT, or NFS4_FRC_UNMT_ERR, then
1934 		 * recovery has already been started inside of nfs4_reopen.
1935 		 */
1936 		if (e.error == EINTR || e.error == ETIMEDOUT ||
1937 		    NFS4_FRC_UNMT_ERR(e.error, vp->v_vfsp)) {
1938 			open_stream_rele(osp, rp);
1939 			break;
1940 		}
1941 
1942 		needrecov = nfs4_needs_recovery(&e, TRUE, vp->v_vfsp);
1943 
1944 		if (e.error != 0 && !needrecov) {
1945 			/*
1946 			 * Recovery is not possible, but don't give up yet;
1947 			 * we'd still like to do delegreturn after
1948 			 * reopening as many streams as possible.
1949 			 * Continue processing the open streams.
1950 			 */
1951 
1952 			ncg->nfs4_callback_stats.recall_failed.value.ui64++;
1953 
1954 		} else if (needrecov) {
1955 			/*
1956 			 * Start recovery and bail out.  The recovery
1957 			 * thread will take it from here.
1958 			 */
1959 			(void) nfs4_start_recovery(&e, mi, vp, NULL, NULL,
1960 			    NULL, OP_OPEN, NULL);
1961 			open_stream_rele(osp, rp);
1962 			*recovp = TRUE;
1963 			break;
1964 		}
1965 
1966 		open_stream_rele(osp, rp);
1967 	}
1968 
1969 	nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
1970 
1971 	return (e.error);
1972 }
1973 
1974 /*
1975  * get_next_deleg_stream - returns the next open stream which
1976  * represents a delegation for this rnode.  In order to assure
1977  * forward progress, the caller must guarantee that each open
1978  * stream returned is changed so that a future call won't return
1979  * it again.
1980  *
1981  * There are several ways for the open stream to change.  If the open
1982  * stream is !os_delegation, then we aren't interested in it.  Also, if
1983  * either os_failed_reopen or !os_valid, then don't return the osp.
1984  *
1985  * If claimnull is false (doing reopen CLAIM_DELEGATE_CUR) then return
1986  * the osp if it is an os_delegation open stream.  Also, if the rnode still
1987  * has r_deleg_return_pending, then return the os_delegation osp.  Lastly,
1988  * if the rnode's r_deleg_stateid is different from the osp's open_stateid,
1989  * then return the osp.
1990  *
1991  * We have already taken the 'r_deleg_recall_lock' as WRITER, which
1992  * prevents new OPENs from going OTW (as start_fop takes this
1993  * lock in READ mode); thus, no new open streams can be created
1994  * (which inherently means no new delegation open streams are
1995  * being created).
1996  */
1997 
1998 static nfs4_open_stream_t *
1999 get_next_deleg_stream(rnode4_t *rp, int claimnull)
2000 {
2001 	nfs4_open_stream_t	*osp;
2002 
2003 	ASSERT(nfs_rw_lock_held(&rp->r_deleg_recall_lock, RW_WRITER));
2004 
2005 	/*
2006 	 * Search through the list of open streams looking for
2007 	 * one that was created while holding the delegation.
2008 	 */
2009 	mutex_enter(&rp->r_os_lock);
2010 	for (osp = list_head(&rp->r_open_streams); osp != NULL;
2011 	    osp = list_next(&rp->r_open_streams, osp)) {
2012 		mutex_enter(&osp->os_sync_lock);
2013 		if (!osp->os_delegation || osp->os_failed_reopen ||
2014 		    !osp->os_valid) {
2015 			mutex_exit(&osp->os_sync_lock);
2016 			continue;
2017 		}
2018 		if (!claimnull || rp->r_deleg_return_pending ||
2019 		    !stateid4_cmp(&osp->open_stateid, &rp->r_deleg_stateid)) {
2020 			osp->os_ref_count++;
2021 			mutex_exit(&osp->os_sync_lock);
2022 			mutex_exit(&rp->r_os_lock);
2023 			return (osp);
2024 		}
2025 		mutex_exit(&osp->os_sync_lock);
2026 	}
2027 	mutex_exit(&rp->r_os_lock);
2028 
2029 	return (NULL);
2030 }
2031 
/*
 * Thread body started by nfs4delegreturn_async().
 *
 * 'args' carries the rnode, the NFS4_DR_* flags and the truncate
 * indication.  The thread flushes or invalidates cached pages as
 * appropriate, calls nfs4delegreturn_impl(), and then frees 'args' and
 * releases one hold on the vnode (the one the starter of the thread is
 * required to have taken) before exiting.
 */
static void
nfs4delegreturn_thread(struct cb_recall_pass *args)
{
	rnode4_t *rp;
	vnode_t *vp;
	cred_t *cr;
	int dtype, error, flags;
	bool_t rdirty, rip;
	kmutex_t cpr_lock;
	callb_cpr_t cpr_info;
	struct nfs4_callback_globals *ncg;

	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
	ASSERT(ncg != NULL);

	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);

	/* register with the CPR callback framework under cpr_lock */
	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr,
	    "nfsv4delegRtn");

	rp = args->rp;
	vp = RTOV4(rp);

	/* cheap early exit if the delegation is already gone */
	mutex_enter(&rp->r_statev4_lock);
	if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
		mutex_exit(&rp->r_statev4_lock);
		goto out;
	}
	mutex_exit(&rp->r_statev4_lock);

	/*
	 * Take the read-write lock in read mode to prevent other
	 * threads from modifying the data during the recall.  This
	 * doesn't affect mmappers.
	 */
	(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);

	/* Proceed with delegreturn */

	/* check again: things may have changed while waiting for r_rwlock */
	mutex_enter(&rp->r_statev4_lock);
	if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
		mutex_exit(&rp->r_statev4_lock);
		nfs_rw_exit(&rp->r_rwlock);
		goto out;
	}
	dtype = rp->r_deleg_type;
	cr = rp->r_deleg_cred;
	ASSERT(cr != NULL);
	/* private hold on the cred; released just before 'out' */
	crhold(cr);
	mutex_exit(&rp->r_statev4_lock);

	flags = args->flags;

	/*
	 * If the file is being truncated at the server, then throw
	 * away all of the pages, it doesn't matter what flavor of
	 * delegation we have.
	 */

	if (args->truncate) {
		ncg->nfs4_callback_stats.recall_trunc.value.ui64++;
		nfs4_invalidate_pages(vp, 0, cr);
	} else if (dtype == OPEN_DELEGATE_WRITE) {

		mutex_enter(&rp->r_statelock);
		rdirty = rp->r_flags & R4DIRTY;
		mutex_exit(&rp->r_statelock);

		if (rdirty) {
			error = VOP_PUTPAGE(vp, 0, 0, 0, cr, NULL);

			/* a failed flush is only reported, not acted upon */
			if (error)
				CB_WARN1("nfs4delegreturn_thread:"
				" VOP_PUTPAGE: %d\n", error);
		}
		/* turn off NFS4_DR_PUSH because we just did that above. */
		flags &= ~NFS4_DR_PUSH;
	}

	mutex_enter(&rp->r_statelock);
	rip =  rp->r_flags & R4RECOVERRP;
	mutex_exit(&rp->r_statelock);

	/* If a failed recovery is indicated, discard the pages */

	if (rip) {

		error = VOP_PUTPAGE(vp, 0, 0, B_INVAL, cr, NULL);

		if (error)
			CB_WARN1("nfs4delegreturn_thread: VOP_PUTPAGE: %d\n",
			    error);
	}

	/*
	 * Pass the flags to nfs4delegreturn_impl, but be sure not to pass
	 * NFS4_DR_DID_OP, which just calls nfs4delegreturn_async again.
	 */
	flags &= ~NFS4_DR_DID_OP;

	(void) nfs4delegreturn_impl(rp, flags, ncg);

	nfs_rw_exit(&rp->r_rwlock);
	crfree(cr);
out:
	/* every path ends here: free the request, drop the vnode hold */
	kmem_free(args, sizeof (struct cb_recall_pass));
	VN_RELE(vp);
	/*
	 * cpr_lock is entered for CALLB_CPR_EXIT and destroyed right after;
	 * presumably the macro drops the lock itself -- confirm against its
	 * definition.
	 */
	mutex_enter(&cpr_lock);
	CALLB_CPR_EXIT(&cpr_info);
	mutex_destroy(&cpr_lock);
	zthread_exit();
}
2144 
2145 /*
2146  * This function has one assumption that the caller of this function is
2147  * either doing recovery (therefore cannot call nfs4_start_op) or has
2148  * already called nfs4_start_op().
2149  */
2150 void
2151 nfs4_delegation_accept(rnode4_t *rp, open_claim_type4 claim, OPEN4res *res,
2152 	nfs4_ga_res_t *garp, cred_t *cr)
2153 {
2154 	open_read_delegation4 *orp;
2155 	open_write_delegation4 *owp;
2156 	nfs4_server_t *np;
2157 	bool_t already = FALSE;
2158 	bool_t recall = FALSE;
2159 	bool_t valid_garp = TRUE;
2160 	bool_t delegation_granted = FALSE;
2161 	bool_t dr_needed = FALSE;
2162 	bool_t recov;
2163 	int dr_flags = 0;
2164 	long mapcnt;
2165 	uint_t rflag;
2166 	mntinfo4_t *mi;
2167 	struct nfs4_callback_globals *ncg;
2168 	open_delegation_type4 odt;
2169 
2170 	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
2171 	ASSERT(ncg != NULL);
2172 
2173 	mi = VTOMI4(RTOV4(rp));
2174 
2175 	/*
2176 	 * Accept a delegation granted to the client via an OPEN.
2177 	 * Set the delegation fields in the rnode and insert the
2178 	 * rnode onto the list anchored in the nfs4_server_t.  The
2179 	 * proper locking order requires the nfs4_server_t first,
2180 	 * even though it may not be needed in all cases.
2181 	 *
2182 	 * NB: find_nfs4_server returns with s_lock held.
2183 	 */
2184 
2185 	if ((np = find_nfs4_server(mi)) == NULL)
2186 		return;
2187 
2188 	/* grab the statelock too, for examining r_mapcnt */
2189 	mutex_enter(&rp->r_statelock);
2190 	mutex_enter(&rp->r_statev4_lock);
2191 
2192 	if (rp->r_deleg_type == OPEN_DELEGATE_READ ||
2193 	    rp->r_deleg_type == OPEN_DELEGATE_WRITE)
2194 		already = TRUE;
2195 
2196 	odt = res->delegation.delegation_type;
2197 
2198 	if (odt == OPEN_DELEGATE_READ) {
2199 
2200 		rp->r_deleg_type = res->delegation.delegation_type;
2201 		orp = &res->delegation.open_delegation4_u.read;
2202 		rp->r_deleg_stateid = orp->stateid;
2203 		rp->r_deleg_perms = orp->permissions;
2204 		if (claim == CLAIM_PREVIOUS)
2205 			if ((recall = orp->recall) != 0)
2206 				dr_needed = TRUE;
2207 
2208 		delegation_granted = TRUE;
2209 
2210 		ncg->nfs4_callback_stats.delegations.value.ui64++;
2211 		ncg->nfs4_callback_stats.delegaccept_r.value.ui64++;
2212 
2213 	} else if (odt == OPEN_DELEGATE_WRITE) {
2214 
2215 		rp->r_deleg_type = res->delegation.delegation_type;
2216 		owp = &res->delegation.open_delegation4_u.write;
2217 		rp->r_deleg_stateid = owp->stateid;
2218 		rp->r_deleg_perms = owp->permissions;
2219 		rp->r_deleg_limit = owp->space_limit;
2220 		if (claim == CLAIM_PREVIOUS)
2221 			if ((recall = owp->recall) != 0)
2222 				dr_needed = TRUE;
2223 
2224 		delegation_granted = TRUE;
2225 
2226 		if (garp == NULL || !garp->n4g_change_valid) {
2227 			valid_garp = FALSE;
2228 			rp->r_deleg_change = 0;
2229 			rp->r_deleg_change_grant = 0;
2230 		} else {
2231 			rp->r_deleg_change = garp->n4g_change;
2232 			rp->r_deleg_change_grant = garp->n4g_change;
2233 		}
2234 		mapcnt = rp->r_mapcnt;
2235 		rflag = rp->r_flags;
2236 
2237 		/*
2238 		 * Update the delegation change attribute if
2239 		 * there are mappers for the file is dirty.  This
2240 		 * might be the case during recovery after server
2241 		 * reboot.
2242 		 */
2243 		if (mapcnt > 0 || rflag & R4DIRTY)
2244 			rp->r_deleg_change++;
2245 
2246 		NFS4_DEBUG(nfs4_callback_debug, (CE_NOTE,
2247 		    "nfs4_delegation_accept: r_deleg_change: 0x%x\n",
2248 		    (int)(rp->r_deleg_change >> 32)));
2249 		NFS4_DEBUG(nfs4_callback_debug, (CE_NOTE,
2250 		    "nfs4_delegation_accept: r_delg_change_grant: 0x%x\n",
2251 		    (int)(rp->r_deleg_change_grant >> 32)));
2252 
2253 
2254 		ncg->nfs4_callback_stats.delegations.value.ui64++;
2255 		ncg->nfs4_callback_stats.delegaccept_rw.value.ui64++;
2256 	} else if (already) {
2257 		/*
2258 		 * No delegation granted.  If the rnode currently has
2259 		 * has one, then consider it tainted and return it.
2260 		 */
2261 		dr_needed = TRUE;
2262 	}
2263 
2264 	if (delegation_granted) {
2265 		/* Add the rnode to the list. */
2266 		if (!already) {
2267 			crhold(cr);
2268 			rp->r_deleg_cred = cr;
2269 
2270 			ASSERT(mutex_owned(&np->s_lock));
2271 			list_insert_head(&np->s_deleg_list, rp);
2272 			/* added list node gets a reference */
2273 			np->s_refcnt++;
2274 			nfs4_inc_state_ref_count_nolock(np, mi);
2275 		}
2276 		rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
2277 	}
2278 
2279 	/*
2280 	 * We've now safely accepted the delegation, if any.  Drop the
2281 	 * locks and figure out what post-processing is needed.  We'd
2282 	 * like to retain r_statev4_lock, but nfs4_server_rele takes
2283 	 * s_lock which would be a lock ordering violation.
2284 	 */
2285 	mutex_exit(&rp->r_statev4_lock);
2286 	mutex_exit(&rp->r_statelock);
2287 	mutex_exit(&np->s_lock);
2288 	nfs4_server_rele(np);
2289 
2290 	/*
2291 	 * Check to see if we are in recovery.  Remember that
2292 	 * this function is protected by start_op, so a recovery
2293 	 * cannot begin until we are out of here.
2294 	 */
2295 	mutex_enter(&mi->mi_lock);
2296 	recov = mi->mi_recovflags & MI4_RECOV_ACTIV;
2297 	mutex_exit(&mi->mi_lock);
2298 
2299 	mutex_enter(&rp->r_statev4_lock);
2300 
2301 	if (nfs4_delegreturn_policy == IMMEDIATE || !valid_garp)
2302 		dr_needed = TRUE;
2303 
2304 	if (dr_needed && rp->r_deleg_return_pending == FALSE) {
2305 		if (recov) {
2306 			/*
2307 			 * We cannot call delegreturn from inside
2308 			 * of recovery or VOP_PUTPAGE will hang
2309 			 * due to nfs4_start_fop call in
2310 			 * nfs4write.  Use dlistadd to add the
2311 			 * rnode to the list of rnodes needing
2312 			 * cleaning.  We do not need to do reopen
2313 			 * here because recov_openfiles will do it.
2314 			 * In the non-recall case, just discard the
2315 			 * delegation as it is no longer valid.
2316 			 */
2317 			if (recall)
2318 				dr_flags = NFS4_DR_PUSH;
2319 			else
2320 				dr_flags = NFS4_DR_PUSH|NFS4_DR_DISCARD;
2321 
2322 			nfs4_dlistadd(rp, ncg, dr_flags);
2323 			dr_flags = 0;
2324 		} else {
2325 			/*
2326 			 * Push the modified data back to the server,
2327 			 * reopen any delegation open streams, and return
2328 			 * the delegation.  Drop the statev4_lock first!
2329 			 */
2330 			dr_flags =  NFS4_DR_PUSH|NFS4_DR_DID_OP|NFS4_DR_REOPEN;
2331 		}
2332 	}
2333 	mutex_exit(&rp->r_statev4_lock);
2334 	if (dr_flags)
2335 		(void) nfs4delegreturn_impl(rp, dr_flags, ncg);
2336 }
2337 
2338 /*
2339  * nfs4delegabandon - Abandon the delegation on an rnode4.  This code
2340  * is called when the client receives EXPIRED, BAD_STATEID, OLD_STATEID
2341  * or BADSEQID and the recovery code is unable to recover.  Push any
2342  * dirty data back to the server and return the delegation (if any).
2343  */
2344 
2345 void
2346 nfs4delegabandon(rnode4_t *rp)
2347 {
2348 	vnode_t *vp;
2349 	struct cb_recall_pass *pp;
2350 	open_delegation_type4 dt;
2351 
2352 	mutex_enter(&rp->r_statev4_lock);
2353 	dt = rp->r_deleg_type;
2354 	mutex_exit(&rp->r_statev4_lock);
2355 
2356 	if (dt == OPEN_DELEGATE_NONE)
2357 		return;
2358 
2359 	vp = RTOV4(rp);
2360 	VN_HOLD(vp);
2361 
2362 	pp = kmem_alloc(sizeof (struct cb_recall_pass), KM_SLEEP);
2363 	pp->rp = rp;
2364 	/*
2365 	 * Recovery on the file has failed and we want to return
2366 	 * the delegation.  We don't want to reopen files and
2367 	 * nfs4delegreturn_thread() figures out what to do about
2368 	 * the data.  The only thing to do is attempt to return
2369 	 * the delegation.
2370 	 */
2371 	pp->flags = 0;
2372 	pp->truncate = FALSE;
2373 
2374 	/*
2375 	 * Fire up a thread to do the delegreturn; this is
2376 	 * necessary because we could be inside a GETPAGE or
2377 	 * PUTPAGE and we cannot do another one.
2378 	 */
2379 
2380 	(void) zthread_create(NULL, 0, nfs4delegreturn_thread, pp, 0,
2381 	    minclsyspri);
2382 }
2383 
2384 static int
2385 wait_for_recall1(vnode_t *vp, nfs4_op_hint_t op, nfs4_recov_state_t *rsp,
2386 	int flg)
2387 {
2388 	rnode4_t *rp;
2389 	int error = 0;
2390 
2391 #ifdef lint
2392 	op = op;
2393 #endif
2394 
2395 	if (vp && vp->v_type == VREG) {
2396 		rp = VTOR4(vp);
2397 
2398 		/*
2399 		 * Take r_deleg_recall_lock in read mode to synchronize
2400 		 * with delegreturn.
2401 		 */
2402 		error = nfs_rw_enter_sig(&rp->r_deleg_recall_lock,
2403 		    RW_READER, INTR4(vp));
2404 
2405 		if (error == 0)
2406 			rsp->rs_flags |= flg;
2407 
2408 	}
2409 	return (error);
2410 }
2411 
2412 void
2413 nfs4_end_op_recall(vnode_t *vp1, vnode_t *vp2, nfs4_recov_state_t *rsp)
2414 {
2415 	NFS4_DEBUG(nfs4_recall_debug,
2416 	    (CE_NOTE, "nfs4_end_op_recall: 0x%p, 0x%p\n",
2417 	    (void *)vp1, (void *)vp2));
2418 
2419 	if (vp2 && rsp->rs_flags & NFS4_RS_RECALL_HELD2)
2420 		nfs_rw_exit(&VTOR4(vp2)->r_deleg_recall_lock);
2421 	if (vp1 && rsp->rs_flags & NFS4_RS_RECALL_HELD1)
2422 		nfs_rw_exit(&VTOR4(vp1)->r_deleg_recall_lock);
2423 }
2424 
2425 int
2426 wait_for_recall(vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
2427 	nfs4_recov_state_t *rsp)
2428 {
2429 	int error;
2430 
2431 	NFS4_DEBUG(nfs4_recall_debug,
2432 	    (CE_NOTE, "wait_for_recall:    0x%p, 0x%p\n",
2433 	    (void *)vp1, (void *) vp2));
2434 
2435 	rsp->rs_flags &= ~(NFS4_RS_RECALL_HELD1|NFS4_RS_RECALL_HELD2);
2436 
2437 	if ((error = wait_for_recall1(vp1, op, rsp, NFS4_RS_RECALL_HELD1)) != 0)
2438 		return (error);
2439 
2440 	if ((error = wait_for_recall1(vp2, op, rsp, NFS4_RS_RECALL_HELD2))
2441 	    != 0) {
2442 		if (rsp->rs_flags & NFS4_RS_RECALL_HELD1) {
2443 			nfs_rw_exit(&VTOR4(vp1)->r_deleg_recall_lock);
2444 			rsp->rs_flags &= ~NFS4_RS_RECALL_HELD1;
2445 		}
2446 
2447 		return (error);
2448 	}
2449 
2450 	return (0);
2451 }
2452 
2453 /*
2454  * nfs4_dlistadd - Add this rnode to a list of rnodes to be
2455  * DELEGRETURN'd at the end of recovery.
2456  */
2457 
2458 static void
2459 nfs4_dlistadd(rnode4_t *rp, struct nfs4_callback_globals *ncg, int flags)
2460 {
2461 	struct nfs4_dnode *dp;
2462 
2463 	ASSERT(mutex_owned(&rp->r_statev4_lock));
2464 	/*
2465 	 * Mark the delegation as having a return pending.
2466 	 * This will prevent the use of the delegation stateID
2467 	 * by read, write, setattr and open.
2468 	 */
2469 	rp->r_deleg_return_pending = TRUE;
2470 	dp = kmem_alloc(sizeof (*dp), KM_SLEEP);
2471 	VN_HOLD(RTOV4(rp));
2472 	dp->rnodep = rp;
2473 	dp->flags = flags;
2474 	mutex_enter(&ncg->nfs4_dlist_lock);
2475 	list_insert_head(&ncg->nfs4_dlist, dp);
2476 #ifdef	DEBUG
2477 	ncg->nfs4_dlistadd_c++;
2478 #endif
2479 	mutex_exit(&ncg->nfs4_dlist_lock);
2480 }
2481 
2482 /*
2483  * nfs4_dlistclean_impl - Do DELEGRETURN for each rnode on the list.
2484  * of files awaiting cleaning.  If the override_flags are non-zero
2485  * then use them rather than the flags that were set when the rnode
2486  * was added to the dlist.
2487  */
2488 static void
2489 nfs4_dlistclean_impl(struct nfs4_callback_globals *ncg, int override_flags)
2490 {
2491 	rnode4_t *rp;
2492 	struct nfs4_dnode *dp;
2493 	int flags;
2494 
2495 	ASSERT(override_flags == 0 || override_flags == NFS4_DR_DISCARD);
2496 
2497 	mutex_enter(&ncg->nfs4_dlist_lock);
2498 	while ((dp = list_head(&ncg->nfs4_dlist)) != NULL) {
2499 #ifdef	DEBUG
2500 		ncg->nfs4_dlistclean_c++;
2501 #endif
2502 		list_remove(&ncg->nfs4_dlist, dp);
2503 		mutex_exit(&ncg->nfs4_dlist_lock);
2504 		rp = dp->rnodep;
2505 		flags = (override_flags != 0) ? override_flags : dp->flags;
2506 		kmem_free(dp, sizeof (*dp));
2507 		(void) nfs4delegreturn_impl(rp, flags, ncg);
2508 		VN_RELE(RTOV4(rp));
2509 		mutex_enter(&ncg->nfs4_dlist_lock);
2510 	}
2511 	mutex_exit(&ncg->nfs4_dlist_lock);
2512 }
2513 
2514 void
2515 nfs4_dlistclean(void)
2516 {
2517 	struct nfs4_callback_globals *ncg;
2518 
2519 	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
2520 	ASSERT(ncg != NULL);
2521 
2522 	nfs4_dlistclean_impl(ncg, 0);
2523 }
2524