1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
28 /* All Rights Reserved */
29 
30 #pragma ident	"%Z%%M%	%I%	%E% SMI"
31 
32 #include <nfs/nfs4_clnt.h>
33 #include <nfs/rnode4.h>
34 #include <sys/systm.h>
35 #include <sys/cmn_err.h>
36 #include <sys/atomic.h>
37 
38 static void	nfs4_free_open_owner(nfs4_open_owner_t *, mntinfo4_t *);
39 static nfs4_open_owner_t *find_freed_open_owner(cred_t *,
40 				nfs4_oo_hash_bucket_t *, mntinfo4_t *);
41 static open_delegation_type4 get_dtype(rnode4_t *);
42 
#ifdef DEBUG
/* Debug-only switches; both start out cleared. */
int nfs4_client_foo_debug = 0;
int nfs4_client_open_dg = 0;

/*
 * Fault-injection switch.  While this is non-zero, the lockowner and
 * openowner seqid sync primitives intermittently return errors.
 */
static int seqid_sync_faults = 0;
#endif
52 
53 stateid4 clnt_special0 = {
54 	0,
55 	{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
56 };
57 
58 stateid4 clnt_special1 = {
59 	0xffffffff,
60 	{
61 		(char)0xff, (char)0xff, (char)0xff, (char)0xff,
62 		(char)0xff, (char)0xff, (char)0xff, (char)0xff,
63 		(char)0xff, (char)0xff, (char)0xff, (char)0xff
64 	}
65 };
66 
67 /* finds hash bucket and locks it */
68 static nfs4_oo_hash_bucket_t *
69 lock_bucket(cred_t *cr, mntinfo4_t *mi)
70 {
71 	nfs4_oo_hash_bucket_t *bucketp;
72 	uint32_t hash_key;
73 
74 	hash_key = (uint32_t)(crgetuid(cr) + crgetruid(cr))
75 			% NFS4_NUM_OO_BUCKETS;
76 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "lock_bucket: "
77 		"hash_key %d for cred %p", hash_key, (void*)cr));
78 
79 	ASSERT(hash_key >= 0 && hash_key < NFS4_NUM_OO_BUCKETS);
80 	ASSERT(mi != NULL);
81 	ASSERT(mutex_owned(&mi->mi_lock));
82 
83 	bucketp = &(mi->mi_oo_list[hash_key]);
84 	mutex_enter(&bucketp->b_lock);
85 	return (bucketp);
86 }
87 
88 /* unlocks hash bucket pointed by bucket_ptr */
89 static void
90 unlock_bucket(nfs4_oo_hash_bucket_t *bucketp)
91 {
92 	mutex_exit(&bucketp->b_lock);
93 }
94 
95 /*
96  * Removes the lock owner from the rnode's lock_owners list and frees the
97  * corresponding reference.
98  */
99 void
100 nfs4_rnode_remove_lock_owner(rnode4_t *rp, nfs4_lock_owner_t *lop)
101 {
102 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
103 		"nfs4_rnode_remove_lock_owner"));
104 
105 	mutex_enter(&rp->r_statev4_lock);
106 
107 	if (lop->lo_next_rnode == NULL) {
108 		/* already removed from list */
109 		mutex_exit(&rp->r_statev4_lock);
110 		return;
111 	}
112 
113 	ASSERT(lop->lo_prev_rnode != NULL);
114 
115 	lop->lo_prev_rnode->lo_next_rnode = lop->lo_next_rnode;
116 	lop->lo_next_rnode->lo_prev_rnode = lop->lo_prev_rnode;
117 
118 	lop->lo_next_rnode = lop->lo_prev_rnode = NULL;
119 
120 	mutex_exit(&rp->r_statev4_lock);
121 
122 	/*
123 	 * This would be an appropriate place for
124 	 * RELEASE_LOCKOWNER.  For now, this is overkill
125 	 * because in the common case, close is going to
126 	 * release any lockowners anyway.
127 	 */
128 	lock_owner_rele(lop);
129 }
130 
131 /*
132  * Remove all lock owners from the rnode's lock_owners list.  Frees up
133  * their references from the list.
134  */
135 
136 void
137 nfs4_flush_lock_owners(rnode4_t *rp)
138 {
139 	nfs4_lock_owner_t *lop;
140 
141 	mutex_enter(&rp->r_statev4_lock);
142 	while (rp->r_lo_head.lo_next_rnode != &rp->r_lo_head) {
143 		lop = rp->r_lo_head.lo_next_rnode;
144 		lop->lo_prev_rnode->lo_next_rnode = lop->lo_next_rnode;
145 		lop->lo_next_rnode->lo_prev_rnode = lop->lo_prev_rnode;
146 		lop->lo_next_rnode = lop->lo_prev_rnode = NULL;
147 		lock_owner_rele(lop);
148 	}
149 	mutex_exit(&rp->r_statev4_lock);
150 }
151 
152 void
153 nfs4_clear_open_streams(rnode4_t *rp)
154 {
155 	nfs4_open_stream_t *osp;
156 
157 	mutex_enter(&rp->r_os_lock);
158 	while ((osp = list_head(&rp->r_open_streams)) != NULL) {
159 		open_owner_rele(osp->os_open_owner);
160 		list_remove(&rp->r_open_streams, osp);
161 		mutex_destroy(&osp->os_sync_lock);
162 		osp->os_open_owner = NULL;
163 		kmem_free(osp, sizeof (*osp));
164 	}
165 	mutex_exit(&rp->r_os_lock);
166 }
167 
168 void
169 open_owner_hold(nfs4_open_owner_t *oop)
170 {
171 	mutex_enter(&oop->oo_lock);
172 	oop->oo_ref_count++;
173 	mutex_exit(&oop->oo_lock);
174 }
175 
176 /*
177  * Frees the open owner if the ref count hits zero.
178  */
179 void
180 open_owner_rele(nfs4_open_owner_t *oop)
181 {
182 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
183 		"open_owner_rele"));
184 
185 	mutex_enter(&oop->oo_lock);
186 	oop->oo_ref_count--;
187 	if (oop->oo_ref_count == 0) {
188 		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
189 			"open_owner_rele: freeing open owner"));
190 		oop->oo_valid = 0;
191 		mutex_exit(&oop->oo_lock);
192 		/*
193 		 * Ok, we don't destroy the open owner, nor do we put it on
194 		 * the mntinfo4's free list just yet.  We are lazy about it
195 		 * and let callers to find_open_owner() do that to keep locking
196 		 * simple.
197 		 */
198 	} else {
199 		mutex_exit(&oop->oo_lock);
200 	}
201 }
202 
203 void
204 open_stream_hold(nfs4_open_stream_t *osp)
205 {
206 	mutex_enter(&osp->os_sync_lock);
207 	osp->os_ref_count++;
208 	mutex_exit(&osp->os_sync_lock);
209 }
210 
211 /*
212  * Frees the open stream and removes it from the rnode4's open streams list if
213  * the ref count drops to zero.
214  */
215 void
216 open_stream_rele(nfs4_open_stream_t *osp, rnode4_t *rp)
217 {
218 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
219 		"open_stream_rele"));
220 
221 	ASSERT(!mutex_owned(&rp->r_os_lock));
222 
223 	mutex_enter(&osp->os_sync_lock);
224 	ASSERT(osp->os_ref_count > 0);
225 	osp->os_ref_count--;
226 	if (osp->os_ref_count == 0) {
227 		nfs4_open_owner_t *tmp_oop;
228 
229 		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
230 			"open_stream_rele: freeing open stream"));
231 		osp->os_valid = 0;
232 		tmp_oop = osp->os_open_owner;
233 		mutex_exit(&osp->os_sync_lock);
234 
235 		/* now see if we need to destroy the open owner */
236 		open_owner_rele(tmp_oop);
237 
238 		mutex_enter(&rp->r_os_lock);
239 		list_remove(&rp->r_open_streams, osp);
240 		mutex_exit(&rp->r_os_lock);
241 
242 		/* free up osp */
243 		mutex_destroy(&osp->os_sync_lock);
244 		osp->os_open_owner = NULL;
245 		kmem_free(osp, sizeof (*osp));
246 	} else {
247 		mutex_exit(&osp->os_sync_lock);
248 	}
249 }
250 
251 void
252 lock_owner_hold(nfs4_lock_owner_t *lop)
253 {
254 	mutex_enter(&lop->lo_lock);
255 	lop->lo_ref_count++;
256 	mutex_exit(&lop->lo_lock);
257 }
258 
259 /*
260  * Frees the lock owner if the ref count hits zero and
261  * the structure no longer has no locks.
262  */
263 void
264 lock_owner_rele(nfs4_lock_owner_t *lop)
265 {
266 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
267 		"lock_owner_rele"));
268 
269 	mutex_enter(&lop->lo_lock);
270 	lop->lo_ref_count--;
271 	if (lop->lo_ref_count == 0) {
272 		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
273 			"lock_owner_rele: freeing lock owner: "
274 			"%x", lop->lo_pid));
275 		lop->lo_valid = 0;
276 		/*
277 		 * If there are no references, the lock_owner should
278 		 * already be off the rnode's list.
279 		 */
280 		ASSERT(lop->lo_next_rnode == NULL);
281 		ASSERT(lop->lo_prev_rnode == NULL);
282 		ASSERT(!(lop->lo_flags & NFS4_LOCK_SEQID_INUSE));
283 		ASSERT(lop->lo_seqid_holder == NULL);
284 		mutex_exit(&lop->lo_lock);
285 
286 		/* free up lop */
287 		cv_destroy(&lop->lo_cv_seqid_sync);
288 		mutex_destroy(&lop->lo_lock);
289 		kmem_free(lop, sizeof (*lop));
290 	} else {
291 		mutex_exit(&lop->lo_lock);
292 	}
293 }
294 
295 /*
296  * This increments the open owner ref count if found.
297  * The argument 'just_created' determines whether we are looking for open
298  * owners with the 'oo_just_created' flag set or not.
299  */
300 nfs4_open_owner_t *
301 find_open_owner_nolock(cred_t *cr, int just_created, mntinfo4_t *mi)
302 {
303 	nfs4_open_owner_t	*oop = NULL, *next_oop;
304 	nfs4_oo_hash_bucket_t	*bucketp;
305 
306 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
307 	    "find_open_owner: cred %p, just_created %d",
308 	    (void*)cr, just_created));
309 
310 	ASSERT(mi != NULL);
311 	ASSERT(mutex_owned(&mi->mi_lock));
312 
313 	bucketp = lock_bucket(cr, mi);
314 
315 	/* got hash bucket, search through open owners */
316 	for (oop = list_head(&bucketp->b_oo_hash_list); oop != NULL; ) {
317 		mutex_enter(&oop->oo_lock);
318 		if (!crcmp(oop->oo_cred, cr) &&
319 		    (oop->oo_just_created == just_created ||
320 		    just_created == NFS4_JUST_CREATED)) {
321 			/* match */
322 			if (oop->oo_valid == 0) {
323 				/* reactivate the open owner */
324 				oop->oo_valid = 1;
325 				ASSERT(oop->oo_ref_count == 0);
326 			}
327 			oop->oo_ref_count++;
328 			mutex_exit(&oop->oo_lock);
329 			unlock_bucket(bucketp);
330 			return (oop);
331 		}
332 		next_oop = list_next(&bucketp->b_oo_hash_list, oop);
333 		if (oop->oo_valid == 0) {
334 			list_remove(&bucketp->b_oo_hash_list, oop);
335 
336 			/*
337 			 * Now we go ahead and put this open owner
338 			 * on the freed list.  This is our lazy method.
339 			 */
340 			nfs4_free_open_owner(oop, mi);
341 		}
342 
343 		mutex_exit(&oop->oo_lock);
344 		oop = next_oop;
345 	}
346 
347 	/* search through recently freed open owners */
348 	oop = find_freed_open_owner(cr, bucketp, mi);
349 
350 	unlock_bucket(bucketp);
351 
352 	return (oop);
353 }
354 
355 nfs4_open_owner_t *
356 find_open_owner(cred_t *cr, int just_created, mntinfo4_t *mi)
357 {
358 	nfs4_open_owner_t *oop;
359 
360 	mutex_enter(&mi->mi_lock);
361 	oop = find_open_owner_nolock(cr, just_created, mi);
362 	mutex_exit(&mi->mi_lock);
363 
364 	return (oop);
365 }
366 
367 /*
368  * This increments osp's ref count if found.
369  * Returns with 'os_sync_lock' held.
370  */
371 nfs4_open_stream_t *
372 find_open_stream(nfs4_open_owner_t *oop, rnode4_t *rp)
373 {
374 	nfs4_open_stream_t	*osp;
375 
376 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
377 		"find_open_stream"));
378 
379 	mutex_enter(&rp->r_os_lock);
380 	/* Now, no one can add or delete to rp's open streams list */
381 	for (osp = list_head(&rp->r_open_streams); osp != NULL;
382 	    osp = list_next(&rp->r_open_streams, osp)) {
383 		mutex_enter(&osp->os_sync_lock);
384 		if (osp->os_open_owner == oop && osp->os_valid != 0) {
385 			/* match */
386 			NFS4_DEBUG(nfs4_client_state_debug,
387 				(CE_NOTE, "find_open_stream "
388 				"got a match"));
389 
390 			osp->os_ref_count++;
391 			mutex_exit(&rp->r_os_lock);
392 			return (osp);
393 		}
394 		mutex_exit(&osp->os_sync_lock);
395 	}
396 
397 	mutex_exit(&rp->r_os_lock);
398 	return (NULL);
399 }
400 
401 /*
402  * Find the lock owner for the given file and process ID.  If "which" is
403  * LOWN_VALID_STATEID, require that the lock owner contain a valid stateid
404  * from the server.
405  *
406  * This increments the lock owner's ref count if found.  Returns NULL if
407  * there was no match.
408  */
409 nfs4_lock_owner_t *
410 find_lock_owner(rnode4_t *rp, pid_t pid, lown_which_t which)
411 {
412 	nfs4_lock_owner_t	*lop, *next_lop;
413 
414 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
415 		"find_lock_owner: pid %x, which %d", pid, which));
416 
417 	ASSERT(which == LOWN_ANY || which == LOWN_VALID_STATEID);
418 
419 	/* search by pid */
420 	mutex_enter(&rp->r_statev4_lock);
421 
422 	lop = rp->r_lo_head.lo_next_rnode;
423 	while (lop != &rp->r_lo_head) {
424 		mutex_enter(&lop->lo_lock);
425 		if (lop->lo_pid == pid && lop->lo_valid != 0 &&
426 		    !(lop->lo_flags & NFS4_BAD_SEQID_LOCK)) {
427 			if (which == LOWN_ANY ||
428 			    lop->lo_just_created != NFS4_JUST_CREATED) {
429 				/* Found a matching lock owner */
430 				NFS4_DEBUG(nfs4_client_state_debug,
431 					(CE_NOTE, "find_lock_owner: "
432 					"got a match"));
433 
434 				lop->lo_ref_count++;
435 				mutex_exit(&lop->lo_lock);
436 				mutex_exit(&rp->r_statev4_lock);
437 				return (lop);
438 			}
439 		}
440 		next_lop = lop->lo_next_rnode;
441 		mutex_exit(&lop->lo_lock);
442 		lop = next_lop;
443 	}
444 
445 	mutex_exit(&rp->r_statev4_lock);
446 	return (NULL);
447 }
448 
449 /*
450  * This returns the delegation stateid as 'sid'. Returns 1 if a successful
451  * delegation stateid was found, otherwise returns 0.
452  */
453 
454 static int
455 nfs4_get_deleg_stateid(rnode4_t *rp, nfs_opnum4 op, stateid4 *sid)
456 {
457 	ASSERT(!mutex_owned(&rp->r_statev4_lock));
458 
459 	mutex_enter(&rp->r_statev4_lock);
460 	if (((rp->r_deleg_type == OPEN_DELEGATE_WRITE && op == OP_WRITE) ||
461 	    (rp->r_deleg_type != OPEN_DELEGATE_NONE && op != OP_WRITE)) &&
462 	    !rp->r_deleg_return_pending) {
463 
464 		*sid = rp->r_deleg_stateid;
465 		mutex_exit(&rp->r_statev4_lock);
466 		return (1);
467 	}
468 	mutex_exit(&rp->r_statev4_lock);
469 	return (0);
470 }
471 
472 /*
473  * This returns the lock stateid as 'sid'. Returns 1 if a successful lock
474  * stateid was found, otherwise returns 0.
475  */
476 static int
477 nfs4_get_lock_stateid(rnode4_t *rp, pid_t pid, stateid4 *sid)
478 {
479 	nfs4_lock_owner_t *lop;
480 
481 	lop = find_lock_owner(rp, pid, LOWN_VALID_STATEID);
482 
483 	if (lop) {
484 		/*
485 		 * Found a matching lock owner, so use a lock
486 		 * stateid rather than an open stateid.
487 		 */
488 		mutex_enter(&lop->lo_lock);
489 		*sid = lop->lock_stateid;
490 		mutex_exit(&lop->lo_lock);
491 		lock_owner_rele(lop);
492 		return (1);
493 	}
494 
495 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
496 	    "nfs4_get_lock_stateid: no lop"));
497 	return (0);
498 }
499 
500 /*
501  * This returns the open stateid as 'sid'. Returns 1 if a successful open
502  * stateid was found, otherwise returns 0.
503  *
504  * Once the stateid is returned to the caller, it is no longer protected;
505  * so the caller must be prepared to handle OLD/BAD_STATEID where
506  * appropiate.
507  */
508 static int
509 nfs4_get_open_stateid(rnode4_t *rp, cred_t *cr, mntinfo4_t *mi, stateid4 *sid)
510 {
511 	nfs4_open_owner_t *oop;
512 	nfs4_open_stream_t *osp;
513 
514 	ASSERT(mi != NULL);
515 
516 	oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
517 	if (!oop) {
518 		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
519 		    "nfs4_get_open_stateid: no oop"));
520 		return (0);
521 	}
522 
523 	osp = find_open_stream(oop, rp);
524 	open_owner_rele(oop);
525 	if (!osp) {
526 		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
527 		    "nfs4_get_open_stateid: no osp"));
528 		return (0);
529 	}
530 
531 	if (osp->os_failed_reopen) {
532 		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
533 		    "nfs4_get_open_stateid: osp %p failed reopen",
534 		    (void *)osp));
535 		mutex_exit(&osp->os_sync_lock);
536 		open_stream_rele(osp, rp);
537 		return (0);
538 	}
539 	*sid = osp->open_stateid;
540 	mutex_exit(&osp->os_sync_lock);
541 	open_stream_rele(osp, rp);
542 	return (1);
543 }
544 
545 /*
546  * Returns the delegation stateid if this 'op' is OP_WRITE and the
547  * delegation we hold is a write delegation, OR this 'op' is not
548  * OP_WRITE and we have a delegation held (read or write), otherwise
549  * returns the lock stateid if there is a lock owner, otherwise
550  * returns the open stateid if there is a open stream, otherwise
551  * returns special stateid <seqid = 0, other = 0>.
552  *
553  * Used for WRITE operations.
554  */
555 stateid4
556 nfs4_get_w_stateid(cred_t *cr, rnode4_t *rp, pid_t pid, mntinfo4_t *mi,
557 	nfs_opnum4 op, nfs4_stateid_types_t *sid_tp)
558 {
559 	stateid4 sid;
560 
561 	if (nfs4_get_deleg_stateid(rp, op, &sid)) {
562 		if (!stateid4_cmp(&sid, &sid_tp->d_sid)) {
563 			sid_tp->cur_sid_type = DEL_SID;
564 			return (sid);
565 		}
566 	}
567 	if (nfs4_get_lock_stateid(rp, pid, &sid)) {
568 		if (!stateid4_cmp(&sid, &sid_tp->l_sid)) {
569 			sid_tp->cur_sid_type = LOCK_SID;
570 			return (sid);
571 		}
572 	}
573 	if (nfs4_get_open_stateid(rp, cr, mi, &sid)) {
574 		if (!stateid4_cmp(&sid, &sid_tp->o_sid)) {
575 			sid_tp->cur_sid_type = OPEN_SID;
576 			return (sid);
577 		}
578 	}
579 	bzero(&sid, sizeof (stateid4));
580 	sid_tp->cur_sid_type = SPEC_SID;
581 	return (sid);
582 }
583 
584 /*
585  * Returns the delegation stateid if this 'op' is OP_WRITE and the
586  * delegation we hold is a write delegation, OR this 'op' is not
587  * OP_WRITE and we have a delegation held (read or write), otherwise
588  * returns the lock stateid if there is a lock owner, otherwise
589  * returns the open stateid if there is a open stream, otherwise
590  * returns special stateid <seqid = 0, other = 0>.
591  *
592  * This also updates which stateid we are using in 'sid_tp', skips
593  * previously attempted stateids, and skips checking higher priority
594  * stateids than the current level as dictated by 'sid_tp->cur_sid_type'
595  * for async reads.
596  *
597  * Used for READ and SETATTR operations.
598  */
599 stateid4
600 nfs4_get_stateid(cred_t *cr, rnode4_t *rp, pid_t pid, mntinfo4_t *mi,
601 	nfs_opnum4 op, nfs4_stateid_types_t *sid_tp, bool_t async_read)
602 {
603 	stateid4 sid;
604 
605 	/*
606 	 * For asynchronous READs, do not attempt to retry from the start of
607 	 * the stateid priority list, just continue from where you last left
608 	 * off.
609 	 */
610 	if (async_read) {
611 		switch (sid_tp->cur_sid_type) {
612 		case NO_SID:
613 			break;
614 		case DEL_SID:
615 			goto lock_stateid;
616 		case LOCK_SID:
617 			goto open_stateid;
618 		case OPEN_SID:
619 			goto special_stateid;
620 		case SPEC_SID:
621 		default:
622 			cmn_err(CE_PANIC, "nfs4_get_stateid: illegal current "
623 			    "stateid type %d", sid_tp->cur_sid_type);
624 		}
625 	}
626 
627 	if (nfs4_get_deleg_stateid(rp, op, &sid)) {
628 		if (!stateid4_cmp(&sid, &sid_tp->d_sid)) {
629 			sid_tp->cur_sid_type = DEL_SID;
630 			return (sid);
631 		}
632 	}
633 lock_stateid:
634 	if (nfs4_get_lock_stateid(rp, pid, &sid)) {
635 		if (!stateid4_cmp(&sid, &sid_tp->l_sid)) {
636 			sid_tp->cur_sid_type = LOCK_SID;
637 			return (sid);
638 		}
639 	}
640 open_stateid:
641 	if (nfs4_get_open_stateid(rp, cr, mi, &sid)) {
642 		if (!stateid4_cmp(&sid, &sid_tp->o_sid)) {
643 			sid_tp->cur_sid_type = OPEN_SID;
644 			return (sid);
645 		}
646 	}
647 special_stateid:
648 	bzero(&sid, sizeof (stateid4));
649 	sid_tp->cur_sid_type = SPEC_SID;
650 	return	(sid);
651 }
652 
653 void
654 nfs4_set_lock_stateid(nfs4_lock_owner_t *lop, stateid4 stateid)
655 {
656 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
657 		"nfs4_set_lock_stateid"));
658 
659 	ASSERT(lop);
660 	ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);
661 
662 	mutex_enter(&lop->lo_lock);
663 	lop->lock_stateid = stateid;
664 	mutex_exit(&lop->lo_lock);
665 }
666 
667 /*
668  * Sequence number used when a new open owner is needed.
669  * This is used so as to not confuse the server.  Since a open owner
670  * is based off of cred, a cred could be re-used quickly, and the server
671  * may not release all state for a cred.
672  */
673 static uint64_t open_owner_seq_num = 0;
674 
675 uint64_t
676 nfs4_get_new_oo_name(void)
677 {
678 	return (atomic_add_64_nv(&open_owner_seq_num, 1));
679 }
680 
681 /*
682  * Create a new open owner and add it to the open owner hash table.
683  */
684 nfs4_open_owner_t *
685 create_open_owner(cred_t *cr, mntinfo4_t *mi)
686 {
687 	nfs4_open_owner_t	*oop;
688 	nfs4_oo_hash_bucket_t	*bucketp;
689 
690 	oop = kmem_alloc(sizeof (nfs4_open_owner_t), KM_SLEEP);
691 	/*
692 	 * Make sure the cred doesn't go away when we put this open owner
693 	 * on the free list, as well as make crcmp() a valid check.
694 	 */
695 	crhold(cr);
696 	oop->oo_cred = cr;
697 	mutex_init(&oop->oo_lock, NULL, MUTEX_DEFAULT, NULL);
698 	oop->oo_ref_count = 1;
699 	oop->oo_valid = 1;
700 	oop->oo_just_created = NFS4_JUST_CREATED;
701 	oop->oo_seqid = 0;
702 	oop->oo_seqid_inuse = 0;
703 	oop->oo_last_good_seqid = 0;
704 	oop->oo_last_good_op = TAG_NONE;
705 	oop->oo_cred_otw = NULL;
706 	cv_init(&oop->oo_cv_seqid_sync, NULL, CV_DEFAULT, NULL);
707 
708 	/*
709 	 * A Solaris open_owner is <oo_seq_num>
710 	 */
711 	oop->oo_name = nfs4_get_new_oo_name();
712 
713 	/* now add the struct into the cred hash table */
714 	ASSERT(mutex_owned(&mi->mi_lock));
715 	bucketp = lock_bucket(cr, mi);
716 	list_insert_head(&bucketp->b_oo_hash_list, oop);
717 	unlock_bucket(bucketp);
718 
719 	return (oop);
720 }
721 
722 /*
723  * Create a new open stream and it to the rnode's list.
724  * Increments the ref count on oop.
725  * Returns with 'os_sync_lock' held.
726  */
727 nfs4_open_stream_t *
728 create_open_stream(nfs4_open_owner_t *oop, rnode4_t *rp)
729 {
730 	nfs4_open_stream_t	*osp;
731 
732 #ifdef DEBUG
733 	mutex_enter(&oop->oo_lock);
734 	ASSERT(oop->oo_seqid_inuse);
735 	mutex_exit(&oop->oo_lock);
736 #endif
737 
738 	osp = kmem_alloc(sizeof (nfs4_open_stream_t), KM_SLEEP);
739 	osp->os_open_ref_count = 1;
740 	osp->os_mapcnt = 0;
741 	osp->os_ref_count = 2;
742 	osp->os_valid = 1;
743 	osp->os_open_owner = oop;
744 	osp->os_orig_oo_name = oop->oo_name;
745 	bzero(&osp->open_stateid, sizeof (stateid4));
746 	osp->os_share_acc_read = 0;
747 	osp->os_share_acc_write = 0;
748 	osp->os_mmap_read = 0;
749 	osp->os_mmap_write = 0;
750 	osp->os_share_deny_none = 0;
751 	osp->os_share_deny_read = 0;
752 	osp->os_share_deny_write = 0;
753 	osp->os_delegation = 0;
754 	osp->os_dc_openacc = 0;
755 	osp->os_final_close = 0;
756 	osp->os_pending_close = 0;
757 	osp->os_failed_reopen = 0;
758 	osp->os_force_close = 0;
759 	mutex_init(&osp->os_sync_lock, NULL, MUTEX_DEFAULT, NULL);
760 
761 	/* open owner gets a reference */
762 	open_owner_hold(oop);
763 
764 	/* now add the open stream to rp */
765 	mutex_enter(&rp->r_os_lock);
766 	mutex_enter(&osp->os_sync_lock);
767 	list_insert_head(&rp->r_open_streams, osp);
768 	mutex_exit(&rp->r_os_lock);
769 
770 	return (osp);
771 }
772 
773 /*
774  * Returns an open stream with 'os_sync_lock' held.
775  * If the open stream is found (rather than created), its
776  * 'os_open_ref_count' is bumped.
777  *
778  * There is no race with two threads entering this function
779  * and creating two open streams for the same <oop, rp> pair.
780  * This is because the open seqid sync must be acquired, thus
781  * only allowing one thread in at a time.
782  */
783 nfs4_open_stream_t *
784 find_or_create_open_stream(nfs4_open_owner_t *oop, rnode4_t *rp,
785 	int *created_osp)
786 {
787 	nfs4_open_stream_t *osp;
788 
789 #ifdef DEBUG
790 	mutex_enter(&oop->oo_lock);
791 	ASSERT(oop->oo_seqid_inuse);
792 	mutex_exit(&oop->oo_lock);
793 #endif
794 
795 	osp = find_open_stream(oop, rp);
796 	if (!osp) {
797 		osp = create_open_stream(oop, rp);
798 		if (osp)
799 			*created_osp = 1;
800 	} else {
801 		*created_osp = 0;
802 		osp->os_open_ref_count++;
803 	}
804 
805 	return (osp);
806 }
807 
808 static uint64_t lock_owner_seq_num = 0;
809 
810 /*
811  * Create a new lock owner and add it to the rnode's list.
812  * Assumes the rnode's r_statev4_lock is held.
813  * The created lock owner has a reference count of 2: one for the list and
814  * one for the caller to use.  Returns the lock owner locked down.
815  */
816 nfs4_lock_owner_t *
817 create_lock_owner(rnode4_t *rp, pid_t pid)
818 {
819 	nfs4_lock_owner_t	*lop;
820 
821 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
822 		"create_lock_owner: pid %x", pid));
823 
824 	ASSERT(mutex_owned(&rp->r_statev4_lock));
825 
826 	lop = kmem_alloc(sizeof (nfs4_lock_owner_t), KM_SLEEP);
827 	lop->lo_ref_count = 2;
828 	lop->lo_valid = 1;
829 	bzero(&lop->lock_stateid, sizeof (stateid4));
830 	lop->lo_pid = pid;
831 	lop->lock_seqid = 0;
832 	lop->lo_pending_rqsts = 0;
833 	lop->lo_just_created = NFS4_JUST_CREATED;
834 	lop->lo_flags = 0;
835 	lop->lo_seqid_holder = NULL;
836 
837 	/*
838 	 * A Solaris lock_owner is <seq_num><pid>
839 	 */
840 	lop->lock_owner_name.ln_seq_num =
841 		atomic_add_64_nv(&lock_owner_seq_num, 1);
842 	lop->lock_owner_name.ln_pid = pid;
843 
844 	cv_init(&lop->lo_cv_seqid_sync, NULL, CV_DEFAULT, NULL);
845 	mutex_init(&lop->lo_lock, NULL, MUTEX_DEFAULT, NULL);
846 
847 	mutex_enter(&lop->lo_lock);
848 
849 	/* now add the lock owner to rp */
850 	lop->lo_prev_rnode = &rp->r_lo_head;
851 	lop->lo_next_rnode = rp->r_lo_head.lo_next_rnode;
852 	rp->r_lo_head.lo_next_rnode->lo_prev_rnode = lop;
853 	rp->r_lo_head.lo_next_rnode = lop;
854 
855 	return (lop);
856 
857 }
858 
859 /*
860  * This sets the lock seqid of a lock owner.
861  */
862 void
863 nfs4_set_lock_seqid(seqid4 seqid, nfs4_lock_owner_t *lop)
864 {
865 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
866 		"nfs4_set_lock_seqid"));
867 
868 	ASSERT(lop != NULL);
869 	ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);
870 
871 	lop->lock_seqid = seqid;
872 }
873 
874 static void
875 nfs4_set_new_lock_owner_args(lock_owner4 *owner, pid_t pid)
876 {
877 	nfs4_lo_name_t *cast_namep;
878 
879 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
880 		"nfs4_set_new_lock_owner_args"));
881 
882 	owner->owner_len = sizeof (*cast_namep);
883 	owner->owner_val = kmem_alloc(owner->owner_len, KM_SLEEP);
884 	/*
885 	 * A Solaris lock_owner is <seq_num><pid>
886 	 */
887 	cast_namep = (nfs4_lo_name_t *)owner->owner_val;
888 	cast_namep->ln_seq_num = atomic_add_64_nv(&lock_owner_seq_num, 1);
889 	cast_namep->ln_pid = pid;
890 }
891 
892 /*
893  * Fill in the lock owner args.
894  */
895 void
896 nfs4_setlockowner_args(lock_owner4 *owner, rnode4_t *rp, pid_t pid)
897 {
898 	nfs4_lock_owner_t *lop;
899 
900 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
901 		"nfs4_setlockowner_args"));
902 
903 	/* This increments lop's ref count */
904 	lop = find_lock_owner(rp, pid, LOWN_VALID_STATEID);
905 
906 	if (!lop)
907 		goto make_up_args;
908 
909 	mutex_enter(&lop->lo_lock);
910 	owner->owner_len = sizeof (lop->lock_owner_name);
911 	owner->owner_val = kmem_alloc(owner->owner_len, KM_SLEEP);
912 	bcopy(&lop->lock_owner_name, owner->owner_val,
913 		owner->owner_len);
914 	mutex_exit(&lop->lo_lock);
915 	lock_owner_rele(lop);
916 	return;
917 
918 make_up_args:
919 	nfs4_set_new_lock_owner_args(owner, pid);
920 }
921 
922 /*
923  * This ends our use of the open owner's open seqid by setting
924  * the appropiate flags and issuing a cv_signal to wake up another
925  * thread waiting to use the open seqid.
926  */
927 
928 void
929 nfs4_end_open_seqid_sync(nfs4_open_owner_t *oop)
930 {
931 	mutex_enter(&oop->oo_lock);
932 	ASSERT(oop->oo_seqid_inuse);
933 	oop->oo_seqid_inuse = 0;
934 	cv_broadcast(&oop->oo_cv_seqid_sync);
935 	mutex_exit(&oop->oo_lock);
936 }
937 
938 /*
939  * This starts our use of the open owner's open seqid by setting
940  * the oo_seqid_inuse to true.  We will wait (forever) with a
941  * cv_wait() until we are woken up.
942  *
943  * Return values:
944  * 0		no problems
945  * EAGAIN	caller should retry (like a recovery retry)
946  */
947 int
948 nfs4_start_open_seqid_sync(nfs4_open_owner_t *oop, mntinfo4_t *mi)
949 {
950 	int error = 0;
951 #ifdef DEBUG
952 	static int ops = 0;		/* fault injection */
953 #endif
954 
955 #ifdef DEBUG
956 	if (seqid_sync_faults && curthread != mi->mi_recovthread &&
957 	    ++ops % 5 == 0)
958 		return (EAGAIN);
959 #endif
960 
961 	mutex_enter(&mi->mi_lock);
962 	if ((mi->mi_flags & MI4_RECOV_ACTIV) &&
963 	    curthread != mi->mi_recovthread)
964 		error = EAGAIN;
965 	mutex_exit(&mi->mi_lock);
966 	if (error != 0)
967 		goto done;
968 
969 	mutex_enter(&oop->oo_lock);
970 
971 	while (oop->oo_seqid_inuse) {
972 		NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE,
973 			"nfs4_start_open_seqid_sync waiting on cv"));
974 
975 		cv_wait(&oop->oo_cv_seqid_sync, &oop->oo_lock);
976 	}
977 
978 	oop->oo_seqid_inuse = 1;
979 
980 	mutex_exit(&oop->oo_lock);
981 
982 	mutex_enter(&mi->mi_lock);
983 	if ((mi->mi_flags & MI4_RECOV_ACTIV) &&
984 	    curthread != mi->mi_recovthread)
985 		error = EAGAIN;
986 	mutex_exit(&mi->mi_lock);
987 
988 	if (error == EAGAIN)
989 		nfs4_end_open_seqid_sync(oop);
990 
991 	NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE,
992 		"nfs4_start_open_seqid_sync: error=%d", error));
993 
994 done:
995 	return (error);
996 }
997 
998 #ifdef	DEBUG
999 int bypass_otw[2];
1000 #endif
1001 
1002 /*
1003  * Checks to see if the OPEN OTW is necessary that is, if it's already
1004  * been opened with the same access and deny bits we are now asking for.
1005  * Note, this assumes that *vpp is a rnode.
1006  */
1007 int
1008 nfs4_is_otw_open_necessary(nfs4_open_owner_t *oop, int flag, vnode_t *vp,
1009 	int just_been_created, int *errorp, int acc, nfs4_recov_state_t *rsp)
1010 {
1011 	rnode4_t *rp;
1012 	nfs4_open_stream_t *osp;
1013 	open_delegation_type4 dt;
1014 
1015 	rp = VTOR4(vp);
1016 
1017 	/*
1018 	 * Grab the delegation type.  This function is protected against
1019 	 * the delegation being returned by virtue of start_op (called
1020 	 * by nfs4open_otw) taking the r_deleg_recall_lock in read mode,
1021 	 * delegreturn requires this lock in write mode to proceed.
1022 	 */
1023 	ASSERT(nfs_rw_lock_held(&rp->r_deleg_recall_lock, RW_READER));
1024 	dt = get_dtype(rp);
1025 
1026 	/* returns with 'os_sync_lock' held */
1027 	osp = find_open_stream(oop, rp);
1028 
1029 	if (osp) {
1030 		uint32_t	do_otw = 0;
1031 
1032 		if (osp->os_failed_reopen) {
1033 			NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE,
1034 			    "nfs4_is_otw_open_necessary: os_failed_reopen "
1035 			    "set on osp %p, cr %p, rp %s", (void *)osp,
1036 			    (void *)osp->os_open_owner->oo_cred,
1037 			    rnode4info(rp)));
1038 			do_otw = 1;
1039 		}
1040 
1041 		/*
1042 		 * check access/deny bits
1043 		 */
1044 		if (!do_otw && (flag & FREAD))
1045 			if (osp->os_share_acc_read == 0 &&
1046 			    dt == OPEN_DELEGATE_NONE)
1047 				do_otw = 1;
1048 
1049 		if (!do_otw && (flag & FWRITE))
1050 			if (osp->os_share_acc_write == 0 &&
1051 			    dt != OPEN_DELEGATE_WRITE)
1052 				do_otw = 1;
1053 
1054 		if (!do_otw) {
1055 			NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
1056 				"nfs4_is_otw_open_necessary: can skip this "
1057 				"open OTW"));
1058 			if (!just_been_created) {
1059 				osp->os_open_ref_count++;
1060 				if (flag & FREAD)
1061 					osp->os_share_acc_read++;
1062 				if (flag & FWRITE)
1063 					osp->os_share_acc_write++;
1064 				osp->os_share_deny_none++;
1065 			}
1066 
1067 			/*
1068 			 * Need to reset this bitfield for the possible case
1069 			 * where we were going to OTW CLOSE the file, got a
1070 			 * non-recoverable error, and before we could retry
1071 			 * the CLOSE, OPENed the file again.
1072 			 */
1073 			ASSERT(osp->os_open_owner->oo_seqid_inuse);
1074 			osp->os_final_close = 0;
1075 			osp->os_force_close = 0;
1076 
1077 			mutex_exit(&osp->os_sync_lock);
1078 			open_stream_rele(osp, rp);
1079 
1080 #ifdef	DEBUG
1081 			bypass_otw[0]++;
1082 #endif
1083 
1084 			*errorp = 0;
1085 			return (0);
1086 		}
1087 		mutex_exit(&osp->os_sync_lock);
1088 		open_stream_rele(osp, rp);
1089 
1090 	} else if (dt != OPEN_DELEGATE_NONE) {
1091 		/*
1092 		 * Even if there isn't an open_stream yet, we may still be
1093 		 * able to bypass the otw open if the client owns a delegation.
1094 		 *
1095 		 * If you are asking for for WRITE, but I only have
1096 		 * a read delegation, then you still have to go otw.
1097 		 */
1098 
1099 		if (flag & FWRITE && dt == OPEN_DELEGATE_READ)
1100 			return (1);
1101 
1102 		/*
1103 		 * TODO - evaluate the nfsace4
1104 		 */
1105 
1106 		/*
1107 		 * Check the access flags to make sure the caller
1108 		 * had permission.
1109 		 */
1110 		if (flag & FREAD && !(acc & VREAD))
1111 			return (1);
1112 
1113 		if (flag & FWRITE && !(acc & VWRITE))
1114 			return (1);
1115 
1116 		/*
1117 		 * create_open_stream will add a reference to oop,
1118 		 * this will prevent the open_owner_rele done in
1119 		 * nfs4open_otw from destroying the open_owner.
1120 		 */
1121 
1122 		/* returns with 'os_sync_lock' held */
1123 		osp = create_open_stream(oop, rp);
1124 		if (osp == NULL)
1125 			return (1);
1126 
1127 		osp->open_stateid = rp->r_deleg_stateid;
1128 		osp->os_delegation = 1;
1129 
1130 		if (flag & FREAD)
1131 			osp->os_share_acc_read++;
1132 		if (flag & FWRITE)
1133 			osp->os_share_acc_write++;
1134 
1135 		osp->os_share_deny_none++;
1136 		mutex_exit(&osp->os_sync_lock);
1137 
1138 		open_stream_rele(osp, rp);
1139 
1140 		mutex_enter(&oop->oo_lock);
1141 		oop->oo_just_created = NFS4_PERM_CREATED;
1142 		mutex_exit(&oop->oo_lock);
1143 
1144 		ASSERT(rsp != NULL);
1145 		if (rsp->rs_sp != NULL) {
1146 			mutex_enter(&rsp->rs_sp->s_lock);
1147 			nfs4_inc_state_ref_count_nolock(rsp->rs_sp,
1148 							VTOMI4(vp));
1149 			mutex_exit(&rsp->rs_sp->s_lock);
1150 		}
1151 #ifdef	DEBUG
1152 		bypass_otw[1]++;
1153 #endif
1154 
1155 		*errorp = 0;
1156 		return (0);
1157 	}
1158 
1159 	return (1);
1160 }
1161 
1162 static open_delegation_type4
1163 get_dtype(rnode4_t *rp)
1164 {
1165 	open_delegation_type4 dt;
1166 
1167 	mutex_enter(&rp->r_statev4_lock);
1168 	ASSERT(!rp->r_deleg_return_inprog);
1169 	if (rp->r_deleg_return_pending)
1170 		dt = OPEN_DELEGATE_NONE;
1171 	else
1172 		dt = rp->r_deleg_type;
1173 	mutex_exit(&rp->r_statev4_lock);
1174 
1175 	return (dt);
1176 }
1177 
1178 /*
1179  * Fill in *locker with the lock state arguments for a LOCK call.  If
1180  * lop->lo_just_created == NFS4_JUST_CREATED, oop and osp must be non-NULL.
1181  * Caller must already hold the necessary seqid sync lock(s).
1182  */
1183 
1184 void
1185 nfs4_setup_lock_args(nfs4_lock_owner_t *lop, nfs4_open_owner_t *oop,
1186 	nfs4_open_stream_t *osp, clientid4 clientid, locker4 *locker)
1187 {
1188 	ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);
1189 	if (lop->lo_just_created == NFS4_JUST_CREATED) {
1190 		/* this is a new lock request */
1191 		open_to_lock_owner4 *nown;
1192 
1193 		ASSERT(oop != NULL);
1194 		ASSERT(osp != NULL);
1195 
1196 		locker->new_lock_owner = TRUE;
1197 		nown = &locker->locker4_u.open_owner;
1198 		nown->open_seqid = nfs4_get_open_seqid(oop) + 1;
1199 		mutex_enter(&osp->os_sync_lock);
1200 		nown->open_stateid = osp->open_stateid;
1201 		mutex_exit(&osp->os_sync_lock);
1202 		nown->lock_seqid = lop->lock_seqid; /* initial, so no +1 */
1203 
1204 		nown->lock_owner.clientid = clientid;
1205 		nown->lock_owner.owner_len = sizeof (lop->lock_owner_name);
1206 		nown->lock_owner.owner_val =
1207 			kmem_alloc(nown->lock_owner.owner_len, KM_SLEEP);
1208 		bcopy(&lop->lock_owner_name, nown->lock_owner.owner_val,
1209 			nown->lock_owner.owner_len);
1210 	} else {
1211 		exist_lock_owner4 *eown;
1212 		/* have an existing lock owner */
1213 
1214 		locker->new_lock_owner = FALSE;
1215 		eown = &locker->locker4_u.lock_owner;
1216 		mutex_enter(&lop->lo_lock);
1217 		eown->lock_stateid = lop->lock_stateid;
1218 		mutex_exit(&lop->lo_lock);
1219 		eown->lock_seqid = lop->lock_seqid + 1;
1220 	}
1221 }
1222 
1223 /*
1224  * This starts our use of the lock owner's lock seqid by setting
1225  * the lo_flags to NFS4_LOCK_SEQID_INUSE.  We will wait (forever)
1226  * with a cv_wait() until we are woken up.
1227  *
1228  * Return values:
1229  * 0		no problems
1230  * EAGAIN	caller should retry (like a recovery retry)
1231  */
1232 int
1233 nfs4_start_lock_seqid_sync(nfs4_lock_owner_t *lop, mntinfo4_t *mi)
1234 {
1235 	int error = 0;
1236 #ifdef DEBUG
1237 	static int ops = 0;		/* fault injection */
1238 #endif
1239 
1240 #ifdef DEBUG
1241 	if (seqid_sync_faults && curthread != mi->mi_recovthread &&
1242 	    ++ops % 7 == 0)
1243 		return (EAGAIN);
1244 #endif
1245 
1246 	mutex_enter(&mi->mi_lock);
1247 	if ((mi->mi_flags & MI4_RECOV_ACTIV) &&
1248 	    curthread != mi->mi_recovthread)
1249 		error = EAGAIN;
1250 	mutex_exit(&mi->mi_lock);
1251 	if (error != 0)
1252 		goto done;
1253 
1254 	mutex_enter(&lop->lo_lock);
1255 
1256 	ASSERT(lop->lo_seqid_holder != curthread);
1257 	while (lop->lo_flags & NFS4_LOCK_SEQID_INUSE) {
1258 		NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE,
1259 			"nfs4_start_lock_seqid_sync: waiting on cv"));
1260 
1261 		cv_wait(&lop->lo_cv_seqid_sync, &lop->lo_lock);
1262 	}
1263 	NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE, "nfs4_start_lock_seqid_sync: "
1264 		"NFS4_LOCK_SEQID_INUSE"));
1265 
1266 	lop->lo_flags |= NFS4_LOCK_SEQID_INUSE;
1267 	lop->lo_seqid_holder = curthread;
1268 	mutex_exit(&lop->lo_lock);
1269 
1270 	mutex_enter(&mi->mi_lock);
1271 	if ((mi->mi_flags & MI4_RECOV_ACTIV) &&
1272 	    curthread != mi->mi_recovthread)
1273 		error = EAGAIN;
1274 	mutex_exit(&mi->mi_lock);
1275 
1276 	if (error == EAGAIN)
1277 		nfs4_end_lock_seqid_sync(lop);
1278 
1279 	NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE,
1280 		"nfs4_start_lock_seqid_sync: error=%d", error));
1281 
1282 done:
1283 	return (error);
1284 }
1285 
1286 /*
1287  * This ends our use of the lock owner's lock seqid by setting
1288  * the appropiate flags and issuing a cv_signal to wake up another
1289  * thread waiting to use the lock seqid.
1290  */
1291 void
1292 nfs4_end_lock_seqid_sync(nfs4_lock_owner_t *lop)
1293 {
1294 	mutex_enter(&lop->lo_lock);
1295 	ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);
1296 	ASSERT(lop->lo_seqid_holder == curthread);
1297 	lop->lo_flags &= ~NFS4_LOCK_SEQID_INUSE;
1298 	lop->lo_seqid_holder = NULL;
1299 	cv_broadcast(&lop->lo_cv_seqid_sync);
1300 	mutex_exit(&lop->lo_lock);
1301 }
1302 
1303 /*
1304  * Returns a reference to a lock owner via lopp, which has its lock seqid
1305  * synchronization started.
1306  * If the lock owner is in the 'just_created' state, then we return its open
1307  * owner and open stream and start the open seqid synchronization.
1308  *
1309  * Return value:
1310  * NFS4_OK		no problems
1311  * NFS4ERR_DELAY	there is lost state to recover; caller should retry
1312  * NFS4ERR_IO		no open stream
1313  */
1314 nfsstat4
1315 nfs4_find_or_create_lock_owner(pid_t pid, rnode4_t *rp, cred_t *cr,
1316 	nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp,
1317 	nfs4_lock_owner_t **lopp)
1318 {
1319 	nfs4_lock_owner_t *lop, *next_lop;
1320 	mntinfo4_t *mi;
1321 	int error = 0;
1322 	nfsstat4 stat;
1323 
1324 	mi = VTOMI4(RTOV4(rp));
1325 
1326 	mutex_enter(&rp->r_statev4_lock);
1327 
1328 	lop = rp->r_lo_head.lo_next_rnode;
1329 	while (lop != &rp->r_lo_head) {
1330 		mutex_enter(&lop->lo_lock);
1331 		if (lop->lo_pid == pid && lop->lo_valid != 0) {
1332 			/* Found a matching lock owner */
1333 			NFS4_DEBUG(nfs4_client_state_debug,
1334 				(CE_NOTE, "nfs4_find_or_create_lock_owner: "
1335 				"got a match"));
1336 			lop->lo_ref_count++;
1337 			break;
1338 		}
1339 		next_lop = lop->lo_next_rnode;
1340 		mutex_exit(&lop->lo_lock);
1341 		lop = next_lop;
1342 	}
1343 
1344 	if (lop == &rp->r_lo_head) {
1345 		/* create temporary lock owner */
1346 		lop = create_lock_owner(rp, pid);
1347 	}
1348 	mutex_exit(&rp->r_statev4_lock);
1349 
1350 	/* Have a locked down lock owner struct now */
1351 	if (lop->lo_just_created != NFS4_JUST_CREATED) {
1352 		/* This is an existing lock owner */
1353 		*oopp = NULL;
1354 		*ospp = NULL;
1355 	} else {
1356 		/* Lock owner doesn't exist yet */
1357 
1358 		/* First grab open owner seqid synchronization */
1359 		mutex_exit(&lop->lo_lock);
1360 		*oopp = find_open_owner(cr, NFS4_PERM_CREATED, mi);
1361 		if (*oopp == NULL)
1362 			goto kill_new_lop;
1363 		error = nfs4_start_open_seqid_sync(*oopp, mi);
1364 		if (error == EAGAIN) {
1365 			stat = NFS4ERR_DELAY;
1366 			goto failed;
1367 		}
1368 		*ospp = find_open_stream(*oopp, rp);
1369 		if (*ospp == NULL) {
1370 			nfs4_end_open_seqid_sync(*oopp);
1371 			goto kill_new_lop;
1372 		}
1373 		if ((*ospp)->os_failed_reopen) {
1374 			mutex_exit(&(*ospp)->os_sync_lock);
1375 			NFS4_DEBUG((nfs4_open_stream_debug ||
1376 				    nfs4_client_lock_debug), (CE_NOTE,
1377 			    "nfs4_find_or_create_lock_owner: os_failed_reopen;"
1378 			    "osp %p, cr %p, rp %s", (void *)(*ospp),
1379 			    (void *)cr, rnode4info(rp)));
1380 			nfs4_end_open_seqid_sync(*oopp);
1381 			stat = NFS4ERR_IO;
1382 			goto failed;
1383 		}
1384 		mutex_exit(&(*ospp)->os_sync_lock);
1385 
1386 		/*
1387 		 * Now see if the lock owner has become permanent while we
1388 		 * had released our lock.
1389 		 */
1390 		mutex_enter(&lop->lo_lock);
1391 		if (lop->lo_just_created != NFS4_JUST_CREATED) {
1392 			nfs4_end_open_seqid_sync(*oopp);
1393 			open_stream_rele(*ospp, rp);
1394 			open_owner_rele(*oopp);
1395 			*oopp = NULL;
1396 			*ospp = NULL;
1397 		}
1398 	}
1399 	mutex_exit(&lop->lo_lock);
1400 
1401 	error = nfs4_start_lock_seqid_sync(lop, mi);
1402 	if (error == EAGAIN) {
1403 		if (*oopp != NULL)
1404 			nfs4_end_open_seqid_sync(*oopp);
1405 		stat = NFS4ERR_DELAY;
1406 		goto failed;
1407 	}
1408 	ASSERT(error == 0);
1409 
1410 	*lopp = lop;
1411 	return (NFS4_OK);
1412 
1413 kill_new_lop:
1414 	/*
1415 	 * A previous CLOSE was attempted but got EINTR, but the application
1416 	 * continued to use the unspecified state file descriptor.  But now the
1417 	 * open stream is gone (which could also destroy the open owner), hence
1418 	 * we can no longer continue.  The calling function should return EIO
1419 	 * to the application.
1420 	 */
1421 	NFS4_DEBUG(nfs4_lost_rqst_debug || nfs4_client_lock_debug,
1422 	    (CE_NOTE, "nfs4_find_or_create_lock_owner: destroy newly created "
1423 	    "lop %p, oop %p, osp %p", (void *)lop, (void *)(*oopp),
1424 	    (void *)(*ospp)));
1425 
1426 	nfs4_rnode_remove_lock_owner(rp, lop);
1427 	stat = NFS4ERR_IO;
1428 
1429 failed:
1430 	lock_owner_rele(lop);
1431 	if (*oopp) {
1432 		open_owner_rele(*oopp);
1433 		*oopp = NULL;
1434 	}
1435 	if (*ospp) {
1436 		open_stream_rele(*ospp, rp);
1437 		*ospp = NULL;
1438 	}
1439 	return (stat);
1440 }
1441 
1442 /*
1443  * This function grabs a recently freed open owner off of the freed open
1444  * owner list if there is a match on the cred 'cr'.  It returns NULL if no
1445  * such match is found.  It will set the 'oo_ref_count' and 'oo_valid' back
1446  * to both 1 (sane values) in the case a match is found.
1447  */
1448 static nfs4_open_owner_t *
1449 find_freed_open_owner(cred_t *cr, nfs4_oo_hash_bucket_t *bucketp,
1450 	mntinfo4_t *mi)
1451 {
1452 	nfs4_open_owner_t		*foop;
1453 
1454 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
1455 		"find_freed_open_owner: cred %p", (void*)cr));
1456 
1457 	ASSERT(mutex_owned(&mi->mi_lock));
1458 	ASSERT(mutex_owned(&bucketp->b_lock));
1459 
1460 	/* got hash bucket, search through freed open owners */
1461 	for (foop = list_head(&mi->mi_foo_list); foop != NULL;
1462 	    foop = list_next(&mi->mi_foo_list, foop)) {
1463 		if (!crcmp(foop->oo_cred, cr)) {
1464 			NFS4_DEBUG(nfs4_client_foo_debug, (CE_NOTE,
1465 				"find_freed_open_owner: got a match open owner "
1466 				"%p", (void *)foop));
1467 			foop->oo_ref_count = 1;
1468 			foop->oo_valid = 1;
1469 			list_remove(&mi->mi_foo_list, foop);
1470 			mi->mi_foo_num--;
1471 
1472 			/* now add the struct into the cred hash table */
1473 			list_insert_head(&bucketp->b_oo_hash_list, foop);
1474 			return (foop);
1475 		}
1476 	}
1477 
1478 	return (NULL);
1479 }
1480 
1481 /*
1482  * Insert the newly freed 'oop' into the mi's freed oop list,
1483  * always at the head of the list.  If we've already reached
1484  * our maximum allowed number of freed open owners (mi_foo_max),
1485  * then remove the LRU open owner on the list (namely the tail).
1486  */
1487 static void
1488 nfs4_free_open_owner(nfs4_open_owner_t *oop, mntinfo4_t *mi)
1489 {
1490 	nfs4_open_owner_t *lru_foop;
1491 
1492 	if (mi->mi_foo_num < mi->mi_foo_max) {
1493 		NFS4_DEBUG(nfs4_client_foo_debug, (CE_NOTE,
1494 			"nfs4_free_open_owner: num free %d, max free %d, "
1495 			"insert open owner %p for mntinfo4 %p",
1496 			mi->mi_foo_num, mi->mi_foo_max, (void *)oop,
1497 			(void *)mi));
1498 		list_insert_head(&mi->mi_foo_list, oop);
1499 		mi->mi_foo_num++;
1500 		return;
1501 	}
1502 
1503 	/* need to replace a freed open owner */
1504 
1505 	lru_foop = list_tail(&mi->mi_foo_list);
1506 
1507 	NFS4_DEBUG(nfs4_client_foo_debug, (CE_NOTE,
1508 	    "nfs4_free_open_owner: destroy %p, insert %p",
1509 	    (void *)lru_foop, (void *)oop));
1510 
1511 	list_remove(&mi->mi_foo_list, lru_foop);
1512 	nfs4_destroy_open_owner(lru_foop);
1513 
1514 	/* head always has latest freed oop */
1515 	list_insert_head(&mi->mi_foo_list, oop);
1516 }
1517 
1518 void
1519 nfs4_destroy_open_owner(nfs4_open_owner_t *oop)
1520 {
1521 	ASSERT(oop != NULL);
1522 
1523 	crfree(oop->oo_cred);
1524 	if (oop->oo_cred_otw)
1525 		crfree(oop->oo_cred_otw);
1526 	mutex_destroy(&oop->oo_lock);
1527 	cv_destroy(&oop->oo_cv_seqid_sync);
1528 	kmem_free(oop, sizeof (*oop));
1529 }
1530 
1531 seqid4
1532 nfs4_get_open_seqid(nfs4_open_owner_t *oop)
1533 {
1534 	ASSERT(oop->oo_seqid_inuse);
1535 	return (oop->oo_seqid);
1536 }
1537 
1538 /*
1539  * This set's the open seqid for a <open owner/ mntinfo4> pair.
1540  */
1541 void
1542 nfs4_set_open_seqid(seqid4 seqid, nfs4_open_owner_t *oop,
1543 	nfs4_tag_type_t tag_type)
1544 {
1545 	ASSERT(oop->oo_seqid_inuse);
1546 	oop->oo_seqid = seqid;
1547 	oop->oo_last_good_seqid = seqid;
1548 	oop->oo_last_good_op = tag_type;
1549 }
1550 
1551 /*
1552  * This bumps the current open seqid for the open owner 'oop'.
1553  */
1554 void
1555 nfs4_get_and_set_next_open_seqid(nfs4_open_owner_t *oop,
1556     nfs4_tag_type_t tag_type)
1557 {
1558 	ASSERT(oop->oo_seqid_inuse);
1559 	oop->oo_seqid++;
1560 	oop->oo_last_good_seqid = oop->oo_seqid;
1561 	oop->oo_last_good_op = tag_type;
1562 }
1563 
1564 /*
1565  * If no open owner was provided, this function takes the cred to find an
1566  * open owner within the given mntinfo4_t.  Either way we return the
1567  * open owner's OTW credential if it exists; otherwise returns the
1568  * supplied 'cr'.
1569  *
1570  * A hold is put on the returned credential, and it is up to the caller
1571  * to free the cred.
1572  */
1573 cred_t *
1574 nfs4_get_otw_cred(cred_t *cr, mntinfo4_t *mi, nfs4_open_owner_t *provided_oop)
1575 {
1576 	cred_t *ret_cr;
1577 	nfs4_open_owner_t *oop = provided_oop;
1578 
1579 	if (oop == NULL)
1580 		oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
1581 	if (oop != NULL) {
1582 		mutex_enter(&oop->oo_lock);
1583 		if (oop->oo_cred_otw)
1584 			ret_cr = oop->oo_cred_otw;
1585 		else
1586 			ret_cr = cr;
1587 		crhold(ret_cr);
1588 		mutex_exit(&oop->oo_lock);
1589 		if (provided_oop == NULL)
1590 			open_owner_rele(oop);
1591 	} else {
1592 		ret_cr = cr;
1593 		crhold(ret_cr);
1594 	}
1595 	return (ret_cr);
1596 }
1597 
1598 /*
1599  * Retrieves the next open stream in the rnode's list if an open stream
1600  * is provided; otherwise gets the first open stream in the list.
1601  * The open owner for that open stream is then retrieved, and if its
1602  * oo_cred_otw exists then it is returned; otherwise the provided 'cr'
1603  * is returned.  *osp is set to the 'found' open stream.
1604  *
1605  * Note: we don't set *osp to the open stream retrieved via the
1606  * optimized check since that won't necessarily be at the beginning
1607  * of the rnode list, and if that osp doesn't work we'd like to
1608  * check _all_ open streams (starting from the beginning of the
1609  * rnode list).
1610  */
1611 cred_t *
1612 nfs4_get_otw_cred_by_osp(rnode4_t *rp, cred_t *cr,
1613 	nfs4_open_stream_t **osp, bool_t *first_time, bool_t *last_time)
1614 {
1615 	nfs4_open_stream_t *next_osp = NULL;
1616 	cred_t *ret_cr;
1617 
1618 	ASSERT(cr != NULL);
1619 	/*
1620 	 * As an optimization, try to find the open owner
1621 	 * for the cred provided since that's most likely
1622 	 * to work.
1623 	 */
1624 	if (*first_time) {
1625 		nfs4_open_owner_t *oop;
1626 
1627 		oop = find_open_owner(cr, NFS4_PERM_CREATED, VTOMI4(RTOV4(rp)));
1628 		if (oop) {
1629 			next_osp = find_open_stream(oop, rp);
1630 			if (next_osp)
1631 				mutex_exit(&next_osp->os_sync_lock);
1632 			open_owner_rele(oop);
1633 		} else {
1634 			next_osp = NULL;
1635 		}
1636 	} else  {
1637 		int delay_rele = 0;
1638 
1639 		/* return the next open stream for this rnode */
1640 		mutex_enter(&rp->r_os_lock);
1641 		/* Now, no one can add or delete to rp's open streams list */
1642 
1643 		if (*osp) {
1644 			next_osp = list_next(&rp->r_open_streams, *osp);
1645 			/*
1646 			 * Delay the rele of *osp until after we drop
1647 			 * r_os_lock to not deadlock with oo_lock
1648 			 * via an open_stream_rele()->open_owner_rele().
1649 			 */
1650 			delay_rele = 1;
1651 		} else {
1652 			next_osp = list_head(&rp->r_open_streams);
1653 		}
1654 		if (next_osp) {
1655 			nfs4_open_stream_t *tmp_osp;
1656 
1657 			/* find the next valid open stream */
1658 			mutex_enter(&next_osp->os_sync_lock);
1659 			while (next_osp && !next_osp->os_valid) {
1660 				tmp_osp =
1661 				    list_next(&rp->r_open_streams, next_osp);
1662 				mutex_exit(&next_osp->os_sync_lock);
1663 				next_osp = tmp_osp;
1664 				if (next_osp)
1665 					mutex_enter(&next_osp->os_sync_lock);
1666 			}
1667 			if (next_osp) {
1668 				next_osp->os_ref_count++;
1669 				mutex_exit(&next_osp->os_sync_lock);
1670 			}
1671 		}
1672 		mutex_exit(&rp->r_os_lock);
1673 		if (delay_rele)
1674 			open_stream_rele(*osp, rp);
1675 	}
1676 
1677 	if (next_osp) {
1678 		nfs4_open_owner_t *oop;
1679 
1680 		oop = next_osp->os_open_owner;
1681 		mutex_enter(&oop->oo_lock);
1682 		if (oop->oo_cred_otw)
1683 			ret_cr = oop->oo_cred_otw;
1684 		else
1685 			ret_cr = cr;
1686 		crhold(ret_cr);
1687 		mutex_exit(&oop->oo_lock);
1688 		if (*first_time) {
1689 			open_stream_rele(next_osp, rp);
1690 			*osp = NULL;
1691 		} else
1692 			*osp = next_osp;
1693 	} else {
1694 		/* just return the cred provided to us */
1695 		if (*first_time != TRUE)
1696 			*last_time = TRUE;
1697 		*osp = NULL;
1698 		ret_cr = cr;
1699 		crhold(ret_cr);
1700 	}
1701 
1702 	if (*first_time)
1703 		*first_time = FALSE;
1704 	return (ret_cr);
1705 }
1706 
1707 void
1708 nfs4_init_stateid_types(nfs4_stateid_types_t *sid_tp)
1709 {
1710 	bzero(&sid_tp->d_sid, sizeof (stateid4));
1711 	bzero(&sid_tp->l_sid, sizeof (stateid4));
1712 	bzero(&sid_tp->o_sid, sizeof (stateid4));
1713 	sid_tp->cur_sid_type = NO_SID;
1714 }
1715 
1716 void
1717 nfs4_save_stateid(stateid4 *s1, nfs4_stateid_types_t *sid_tp)
1718 {
1719 	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1720 	    "nfs4_save_stateid: saved %s stateid",
1721 	    sid_tp->cur_sid_type == DEL_SID ? "delegation" :
1722 	    sid_tp->cur_sid_type == LOCK_SID ? "lock" :
1723 	    sid_tp->cur_sid_type == OPEN_SID ? "open" : "special"));
1724 
1725 	switch (sid_tp->cur_sid_type) {
1726 	case DEL_SID:
1727 		sid_tp->d_sid = *s1;
1728 		break;
1729 	case LOCK_SID:
1730 		sid_tp->l_sid = *s1;
1731 		break;
1732 	case OPEN_SID:
1733 		sid_tp->o_sid = *s1;
1734 		break;
1735 	case SPEC_SID:
1736 	default:
1737 		cmn_err(CE_PANIC, "nfs4_save_stateid: illegal "
1738 		    "stateid type %d", sid_tp->cur_sid_type);
1739 	}
1740 }
1741 
1742 /*
1743  * We got NFS4ERR_BAD_SEQID.  Setup some arguments to pass to recovery.
1744  * Caller is responsible for freeing.
1745  */
1746 nfs4_bseqid_entry_t *
1747 nfs4_create_bseqid_entry(nfs4_open_owner_t *oop, nfs4_lock_owner_t *lop,
1748     vnode_t *vp, pid_t pid, nfs4_tag_type_t tag, seqid4 seqid)
1749 {
1750 	nfs4_bseqid_entry_t	*bsep;
1751 
1752 	bsep = kmem_alloc(sizeof (*bsep), KM_SLEEP);
1753 	bsep->bs_oop = oop;
1754 	bsep->bs_lop = lop;
1755 	bsep->bs_vp = vp;
1756 	bsep->bs_pid = pid;
1757 	bsep->bs_tag = tag;
1758 	bsep->bs_seqid = seqid;
1759 
1760 	return (bsep);
1761 }
1762 
1763 void
1764 nfs4open_dg_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
1765 	nfs4_open_owner_t *oop, nfs4_open_stream_t *osp, cred_t *cr,
1766 	vnode_t *vp, int access_close, int deny_close)
1767 {
1768 	lost_rqstp->lr_putfirst = FALSE;
1769 
1770 	ASSERT(vp != NULL);
1771 	if (error == ETIMEDOUT || error == EINTR ||
1772 	    NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
1773 		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
1774 			"nfs4open_dg_save_lost_rqst: error %d", error));
1775 
1776 		lost_rqstp->lr_op = OP_OPEN_DOWNGRADE;
1777 		/*
1778 		 * The vp is held and rele'd via the recovery code.
1779 		 * See nfs4_save_lost_rqst.
1780 		 */
1781 		lost_rqstp->lr_vp = vp;
1782 		lost_rqstp->lr_dvp = NULL;
1783 		lost_rqstp->lr_oop = oop;
1784 		lost_rqstp->lr_osp = osp;
1785 		lost_rqstp->lr_lop = NULL;
1786 		lost_rqstp->lr_cr = cr;
1787 		lost_rqstp->lr_flk = NULL;
1788 		lost_rqstp->lr_dg_acc = access_close;
1789 		lost_rqstp->lr_dg_deny = deny_close;
1790 		lost_rqstp->lr_putfirst = FALSE;
1791 	} else {
1792 		lost_rqstp->lr_op = 0;
1793 	}
1794 }
1795 
1796 /*
1797  * Change the access and deny bits of an OPEN.
1798  * If recovery is needed, *recov_credpp is set to the cred used OTW,
1799  * a hold is placed on it, and *recov_seqidp is set to the seqid used OTW.
1800  */
1801 void
1802 nfs4_open_downgrade(int access_close, int deny_close, nfs4_open_owner_t *oop,
1803 	nfs4_open_stream_t *osp, vnode_t *vp, cred_t *cr, nfs4_lost_rqst_t *lrp,
1804 	nfs4_error_t *ep, cred_t **recov_credpp, seqid4 *recov_seqidp)
1805 {
1806 	mntinfo4_t		*mi;
1807 	int			downgrade_acc, downgrade_deny;
1808 	int			new_acc, new_deny;
1809 	COMPOUND4args_clnt	args;
1810 	COMPOUND4res_clnt	res;
1811 	OPEN_DOWNGRADE4res	*odg_res;
1812 	nfs_argop4		argop[3];
1813 	nfs_resop4		*resop;
1814 	rnode4_t		*rp;
1815 	bool_t			needrecov = FALSE;
1816 	int			doqueue = 1;
1817 	seqid4			seqid = 0;
1818 	cred_t			*cred_otw;
1819 	hrtime_t		t;
1820 
1821 	ASSERT(mutex_owned(&osp->os_sync_lock));
1822 #if DEBUG
1823 	mutex_enter(&oop->oo_lock);
1824 	ASSERT(oop->oo_seqid_inuse);
1825 	mutex_exit(&oop->oo_lock);
1826 #endif
1827 
1828 
1829 	if (access_close == 0 && deny_close == 0) {
1830 		nfs4_error_zinit(ep);
1831 		return;
1832 	}
1833 
1834 	cred_otw = nfs4_get_otw_cred(cr, VTOMI4(vp), oop);
1835 
1836 cred_retry:
1837 	nfs4_error_zinit(ep);
1838 	downgrade_acc = 0;
1839 	downgrade_deny = 0;
1840 	mi = VTOMI4(vp);
1841 	rp = VTOR4(vp);
1842 
1843 	/*
1844 	 * Check to see if the open stream got closed before we go OTW,
1845 	 * now that we have acquired the 'os_sync_lock'.
1846 	 */
1847 	if (!osp->os_valid) {
1848 		NFS4_DEBUG(nfs4_client_open_dg, (CE_NOTE, "nfs4_open_downgrade:"
1849 		    " open stream has already been closed, return success"));
1850 		/* error has already been set */
1851 		goto no_args_out;
1852 	}
1853 
1854 	/* If the file failed recovery, just quit. */
1855 	mutex_enter(&rp->r_statelock);
1856 	if (rp->r_flags & R4RECOVERR) {
1857 		mutex_exit(&rp->r_statelock);
1858 		ep->error = EIO;
1859 		goto no_args_out;
1860 	}
1861 	mutex_exit(&rp->r_statelock);
1862 
1863 	seqid = nfs4_get_open_seqid(oop) + 1;
1864 
1865 	NFS4_DEBUG(nfs4_client_open_dg, (CE_NOTE, "nfs4_open_downgrade:"
1866 	    "access_close %d, acc_read %"PRIu64" acc_write %"PRIu64"",
1867 	    access_close, osp->os_share_acc_read, osp->os_share_acc_write));
1868 
1869 	/* If we're closing the last READ, need to downgrade */
1870 	if ((access_close & FREAD) && (osp->os_share_acc_read == 1))
1871 		downgrade_acc |= OPEN4_SHARE_ACCESS_READ;
1872 
1873 	/* if we're closing the last WRITE, need to downgrade */
1874 	if ((access_close & FWRITE) && (osp->os_share_acc_write == 1))
1875 		downgrade_acc |= OPEN4_SHARE_ACCESS_WRITE;
1876 
1877 	downgrade_deny = OPEN4_SHARE_DENY_NONE;
1878 
1879 	new_acc = 0;
1880 	new_deny = 0;
1881 
1882 	/* set our new access and deny share bits */
1883 	if ((osp->os_share_acc_read > 0) &&
1884 	    !(downgrade_acc & OPEN4_SHARE_ACCESS_READ))
1885 		new_acc |= OPEN4_SHARE_ACCESS_READ;
1886 	if ((osp->os_share_acc_write > 0) &&
1887 	    !(downgrade_acc & OPEN4_SHARE_ACCESS_WRITE))
1888 		new_acc |= OPEN4_SHARE_ACCESS_WRITE;
1889 
1890 	new_deny = OPEN4_SHARE_DENY_NONE;
1891 
1892 	NFS4_DEBUG(nfs4_client_open_dg, (CE_NOTE, "nfs4_open_downgrade:"
1893 	    "downgrade acc 0x%x deny 0x%x", downgrade_acc, downgrade_deny));
1894 	NFS4_DEBUG(nfs4_client_open_dg, (CE_NOTE, "nfs4_open_downgrade:"
1895 	    "new acc 0x%x deny 0x%x", new_acc, new_deny));
1896 
1897 	/*
1898 	 * Check to see if we aren't actually doing any downgrade or
1899 	 * if this is the last 'close' but the file is still mmapped.
1900 	 * Skip this if this a lost request resend so we don't decrement
1901 	 * the osp's share counts more than once.
1902 	 */
1903 	if (!lrp &&
1904 	    ((downgrade_acc == 0 && downgrade_deny == 0) ||
1905 	    (new_acc == 0 && new_deny == 0))) {
1906 		/*
1907 		 * No downgrade to do, but still need to
1908 		 * update osp's os_share_* counts.
1909 		 */
1910 		NFS4_DEBUG(nfs4_client_open_dg, (CE_NOTE,
1911 		    "nfs4_open_downgrade: just lower the osp's count by %s",
1912 		    (access_close & FREAD) && (access_close & FWRITE) ?
1913 		    "read and write" : (access_close & FREAD) ? "read" :
1914 		    (access_close & FWRITE) ? "write" : "bogus"));
1915 		if (access_close & FREAD)
1916 			osp->os_share_acc_read--;
1917 		if (access_close & FWRITE)
1918 			osp->os_share_acc_write--;
1919 		osp->os_share_deny_none--;
1920 		nfs4_error_zinit(ep);
1921 
1922 		goto no_args_out;
1923 	}
1924 
1925 	if (osp->os_orig_oo_name != oop->oo_name) {
1926 		ep->error = EIO;
1927 		goto no_args_out;
1928 	}
1929 
1930 	/* setup the COMPOUND args */
1931 	if (lrp)
1932 		args.ctag = TAG_OPEN_DG_LOST;
1933 	else
1934 		args.ctag = TAG_OPEN_DG;
1935 
1936 	args.array_len = 3;
1937 	args.array = argop;
1938 
1939 	/* putfh */
1940 	argop[0].argop = OP_CPUTFH;
1941 	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
1942 
1943 	argop[1].argop = OP_GETATTR;
1944 	argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1945 	argop[1].nfs_argop4_u.opgetattr.mi = mi;
1946 
1947 	ASSERT(mutex_owned(&osp->os_sync_lock));
1948 	ASSERT(osp->os_delegation == FALSE);
1949 
1950 	/* open downgrade */
1951 	argop[2].argop = OP_OPEN_DOWNGRADE;
1952 	argop[2].nfs_argop4_u.opopen_downgrade.open_stateid = osp->open_stateid;
1953 	argop[2].nfs_argop4_u.opopen_downgrade.share_access = new_acc;
1954 	argop[2].nfs_argop4_u.opopen_downgrade.share_deny = new_deny;
1955 	argop[2].nfs_argop4_u.opopen_downgrade.seqid = seqid;
1956 
1957 	t = gethrtime();
1958 
1959 	rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep);
1960 
1961 	if (ep->error == 0 && nfs4_need_to_bump_seqid(&res))
1962 		nfs4_set_open_seqid(seqid, oop, args.ctag);
1963 
1964 	if ((ep->error == EACCES ||
1965 	    (ep->error == 0 && res.status == NFS4ERR_ACCESS)) &&
1966 	    cred_otw != cr) {
1967 		crfree(cred_otw);
1968 		cred_otw = cr;
1969 		crhold(cred_otw);
1970 		if (!ep->error)
1971 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1972 		goto cred_retry;
1973 	}
1974 
1975 	needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp);
1976 
1977 	if (needrecov && recov_credpp) {
1978 		*recov_credpp = cred_otw;
1979 		crhold(*recov_credpp);
1980 		if (recov_seqidp)
1981 			*recov_seqidp = seqid;
1982 	}
1983 
1984 	if (!ep->error && !res.status) {
1985 		/* get the open downgrade results */
1986 		resop = &res.array[2];
1987 		odg_res = &resop->nfs_resop4_u.opopen_downgrade;
1988 
1989 		osp->open_stateid = odg_res->open_stateid;
1990 
1991 		/* set the open streams new access/deny bits */
1992 		if (access_close & FREAD)
1993 			osp->os_share_acc_read--;
1994 		if (access_close & FWRITE)
1995 			osp->os_share_acc_write--;
1996 		osp->os_share_deny_none--;
1997 		osp->os_dc_openacc = new_acc;
1998 
1999 		nfs4_attr_cache(vp,
2000 				&res.array[1].nfs_resop4_u.opgetattr.ga_res,
2001 				t, cred_otw, TRUE, NULL);
2002 	}
2003 
2004 	if (!ep->error)
2005 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2006 
2007 no_args_out:
2008 	crfree(cred_otw);
2009 }
2010 
2011 /*
2012  * If an OPEN request gets ETIMEDOUT or EINTR (that includes bailing out
2013  * because the filesystem was forcibly unmounted) then we don't know if we
2014  * potentially left state dangling on the server, therefore the recovery
2015  * framework makes this call to resend the OPEN request and then undo it.
2016  */
2017 void
2018 nfs4_resend_open_otw(vnode_t **vpp, nfs4_lost_rqst_t *resend_rqstp,
2019 	nfs4_error_t *ep)
2020 {
2021 	COMPOUND4args_clnt	args;
2022 	COMPOUND4res_clnt	res;
2023 	nfs_argop4		argop[4];
2024 	GETFH4res		*gf_res = NULL;
2025 	OPEN4cargs		*open_args;
2026 	OPEN4res		*op_res;
2027 	char			*destcfp;
2028 	int			destclen;
2029 	nfs4_ga_res_t		*garp;
2030 	vnode_t			*dvp = NULL, *vp = NULL;
2031 	rnode4_t		*rp = NULL, *drp = NULL;
2032 	cred_t			*cr = NULL;
2033 	seqid4			seqid;
2034 	nfs4_open_owner_t	*oop = NULL;
2035 	nfs4_open_stream_t	*osp = NULL;
2036 	component4		*srcfp;
2037 	open_claim_type4	claim;
2038 	mntinfo4_t		*mi;
2039 	int			doqueue = 1;
2040 	bool_t			retry_open = FALSE;
2041 	int			created_osp = 0;
2042 	hrtime_t		t;
2043 	char 			*failed_msg = "";
2044 	int			fh_different;
2045 
2046 	nfs4_error_zinit(ep);
2047 
2048 	cr = resend_rqstp->lr_cr;
2049 	dvp = resend_rqstp->lr_dvp;
2050 
2051 	vp = *vpp;
2052 	if (vp) {
2053 		ASSERT(nfs4_consistent_type(vp));
2054 		rp = VTOR4(vp);
2055 	}
2056 
2057 	if (rp) {
2058 		/* If the file failed recovery, just quit. */
2059 		mutex_enter(&rp->r_statelock);
2060 		if (rp->r_flags & R4RECOVERR) {
2061 			mutex_exit(&rp->r_statelock);
2062 			ep->error = EIO;
2063 			return;
2064 		}
2065 		mutex_exit(&rp->r_statelock);
2066 	}
2067 
2068 	if (dvp) {
2069 		drp = VTOR4(dvp);
2070 		/* If the parent directory failed recovery, just quit. */
2071 		mutex_enter(&drp->r_statelock);
2072 		if (drp->r_flags & R4RECOVERR) {
2073 			mutex_exit(&drp->r_statelock);
2074 			ep->error = EIO;
2075 			return;
2076 		}
2077 		mutex_exit(&drp->r_statelock);
2078 	}
2079 
2080 	claim = resend_rqstp->lr_oclaim;
2081 	ASSERT(claim == CLAIM_NULL || claim == CLAIM_DELEGATE_CUR);
2082 
2083 	args.ctag = TAG_OPEN_LOST;
2084 	args.array_len = 4;
2085 	args.array = argop;
2086 
2087 	argop[0].argop = OP_CPUTFH;
2088 	if (claim == CLAIM_DELEGATE_CUR) {
2089 		ASSERT(vp != NULL);
2090 
2091 		mi = VTOMI4(vp);
2092 		/*
2093 		 * if this is a file mount then
2094 		 * use the mntinfo parentfh
2095 		 */
2096 		argop[0].nfs_argop4_u.opcputfh.sfh =
2097 			(vp->v_flag & VROOT) ? mi->mi_srvparentfh :
2098 						VTOSV(vp)->sv_dfh;
2099 		args.ctag = TAG_REOPEN_LOST;
2100 	} else {
2101 		argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh;
2102 		mi = VTOMI4(dvp);
2103 	}
2104 
2105 	argop[1].argop = OP_COPEN;
2106 	open_args = &argop[1].nfs_argop4_u.opcopen;
2107 	open_args->claim = claim;
2108 
2109 	/*
2110 	 * If we sent over a OPEN with CREATE then the only
2111 	 * thing we care about is to not leave dangling state
2112 	 * on the server, not whether the file we potentially
2113 	 * created remains on the server.  So even though the
2114 	 * lost open request specified a CREATE, we only wish
2115 	 * to do a non-CREATE OPEN.
2116 	 */
2117 	open_args->opentype = OPEN4_NOCREATE;
2118 
2119 	srcfp = &resend_rqstp->lr_ofile;
2120 	destclen = srcfp->utf8string_len;
2121 	destcfp = kmem_alloc(destclen + 1, KM_SLEEP);
2122 	bcopy(srcfp->utf8string_val, destcfp, destclen);
2123 	destcfp[destclen] = '\0';
2124 	if (claim == CLAIM_DELEGATE_CUR) {
2125 		open_args->open_claim4_u.delegate_cur_info.delegate_stateid =
2126 				resend_rqstp->lr_ostateid;
2127 		open_args->open_claim4_u.delegate_cur_info.cfile = destcfp;
2128 	} else {
2129 		open_args->open_claim4_u.cfile = destcfp;
2130 	}
2131 
2132 	open_args->share_access = resend_rqstp->lr_oacc;
2133 	open_args->share_deny = resend_rqstp->lr_odeny;
2134 	oop = resend_rqstp->lr_oop;
2135 	ASSERT(oop != NULL);
2136 
2137 	open_args->owner.clientid = mi2clientid(mi);
2138 	/* this length never changes */
2139 	open_args->owner.owner_len = sizeof (oop->oo_name);
2140 	open_args->owner.owner_val =
2141 	    kmem_alloc(open_args->owner.owner_len, KM_SLEEP);
2142 
2143 	ep->error = nfs4_start_open_seqid_sync(oop, mi);
2144 	ASSERT(ep->error == 0);		/* recov thread always succeeds */
2145 	/*
2146 	 * We can get away with not saving the seqid upon detection
2147 	 * of a lost request, and now just use the open owner's current
2148 	 * seqid since we only allow one op OTW per seqid and lost
2149 	 * requests are saved FIFO.
2150 	 */
2151 	seqid = nfs4_get_open_seqid(oop) + 1;
2152 	open_args->seqid = seqid;
2153 
2154 	bcopy(&oop->oo_name, open_args->owner.owner_val,
2155 	    open_args->owner.owner_len);
2156 
2157 	/* getfh */
2158 	argop[2].argop = OP_GETFH;
2159 
2160 	/* Construct the getattr part of the compound */
2161 	argop[3].argop = OP_GETATTR;
2162 	argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
2163 	argop[3].nfs_argop4_u.opgetattr.mi = mi;
2164 
2165 	res.array = NULL;
2166 
2167 	t = gethrtime();
2168 
2169 	rfs4call(mi, &args, &res, cr, &doqueue, 0, ep);
2170 
2171 	if (ep->error == 0 && nfs4_need_to_bump_seqid(&res))
2172 		nfs4_set_open_seqid(seqid, oop, args.ctag);
2173 
2174 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2175 	    "nfs4_resend_open_otw: error %d stat %d", ep->error, res.status));
2176 
2177 	if (ep->error || res.status)
2178 		goto err_out;
2179 
2180 	op_res = &res.array[1].nfs_resop4_u.opopen;
2181 	gf_res = &res.array[2].nfs_resop4_u.opgetfh;
2182 	garp = &res.array[3].nfs_resop4_u.opgetattr.ga_res;
2183 
2184 	if (!vp) {
2185 		int rnode_err = 0;
2186 		nfs4_sharedfh_t *sfh;
2187 
2188 		/*
2189 		 * If we can't decode all the attributes they are not usable,
2190 		 * just make the vnode.
2191 		 */
2192 
2193 		sfh = sfh4_get(&gf_res->object, VTOMI4(dvp));
2194 		*vpp = makenfs4node(sfh, garp, dvp->v_vfsp, t, cr, dvp,
2195 			fn_get(VTOSV(dvp)->sv_name,
2196 			open_args->open_claim4_u.cfile));
2197 		sfh4_rele(&sfh);
2198 		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2199 		    "nfs4_resend_open_otw: made vp %p for file %s",
2200 		    (void *)(*vpp), open_args->open_claim4_u.cfile));
2201 
2202 		if (ep->error)
2203 			PURGE_ATTRCACHE4(*vpp);
2204 
2205 		/*
2206 		 * For the newly created *vpp case, make sure the rnode
2207 		 * isn't bad before using it.
2208 		 */
2209 		mutex_enter(&(VTOR4(*vpp))->r_statelock);
2210 		if (VTOR4(*vpp)->r_flags & R4RECOVERR)
2211 			rnode_err = EIO;
2212 		mutex_exit(&(VTOR4(*vpp))->r_statelock);
2213 
2214 		if (rnode_err) {
2215 			NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2216 			    "nfs4_resend_open_otw: rp %p is bad",
2217 			    (void *)VTOR4(*vpp)));
2218 			ep->error = rnode_err;
2219 			goto err_out;
2220 		}
2221 
2222 		vp = *vpp;
2223 		rp = VTOR4(vp);
2224 	}
2225 
2226 	if (claim == CLAIM_DELEGATE_CUR) {
2227 		/*
2228 		 * Check if the path we reopened really is the same
2229 		 * file. We could end up in a situation where the file
2230 		 * was removed and a new file created with the same name.
2231 		 */
2232 		(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
2233 		fh_different =
2234 			(nfs4cmpfh(&rp->r_fh->sfh_fh, &gf_res->object) != 0);
2235 		if (fh_different) {
2236 			if (mi->mi_fh_expire_type == FH4_PERSISTENT ||
2237 			    mi->mi_fh_expire_type & FH4_NOEXPIRE_WITH_OPEN) {
2238 				/* Oops, we don't have the same file */
2239 				if (mi->mi_fh_expire_type == FH4_PERSISTENT)
2240 					failed_msg =
2241 					    "Couldn't reopen: Persistant "
2242 					    "file handle changed";
2243 				else
2244 					failed_msg =
2245 					    "Couldn't reopen: Volatile "
2246 					    "(no expire on open) file handle "
2247 					    "changed";
2248 
2249 				nfs4_end_open_seqid_sync(oop);
2250 				kmem_free(destcfp, destclen + 1);
2251 				nfs4args_copen_free(open_args);
2252 				(void) xdr_free(xdr_COMPOUND4res_clnt,
2253 						(caddr_t)&res);
2254 				nfs_rw_exit(&mi->mi_fh_lock);
2255 				nfs4_fail_recov(vp, failed_msg, ep->error,
2256 						ep->stat);
2257 				return;
2258 			} else {
2259 				/*
2260 				 * We have volatile file handles that don't
2261 				 * compare.  If the fids are the same then we
2262 				 * assume that the file handle expired but the
2263 				 * rnode still refers to the same file object.
2264 				 *
2265 				 * First check that we have fids or not.
2266 				 * If we don't we have a dumb server so we will
2267 				 * just assume everything is ok for now.
2268 				 */
2269 				if (!ep->error &&
2270 				    garp->n4g_va.va_mask & AT_NODEID &&
2271 				    rp->r_attr.va_mask & AT_NODEID &&
2272 				    rp->r_attr.va_nodeid !=
2273 					garp->n4g_va.va_nodeid) {
2274 					/*
2275 					 * We have fids, but they don't
2276 					 * compare. So kill the file.
2277 					 */
2278 					failed_msg =
2279 					    "Couldn't reopen: file handle "
2280 					    "changed due to mismatched fids";
2281 					nfs4_end_open_seqid_sync(oop);
2282 					kmem_free(destcfp, destclen + 1);
2283 					nfs4args_copen_free(open_args);
2284 					(void) xdr_free(xdr_COMPOUND4res_clnt,
2285 							(caddr_t)&res);
2286 					nfs_rw_exit(&mi->mi_fh_lock);
2287 					nfs4_fail_recov(vp, failed_msg,
2288 							ep->error, ep->stat);
2289 					return;
2290 				} else {
2291 					/*
2292 					 * We have volatile file handles that
2293 					 * refer to the same file (at least
2294 					 * they have the same fid) or we don't
2295 					 * have fids so we can't tell. :(. We'll
2296 					 * be a kind and accepting client so
2297 					 * we'll update the rnode's file
2298 					 * handle with the otw handle.
2299 					 *
2300 					 * We need to drop mi->mi_fh_lock since
2301 					 * sfh4_update acquires it. Since there
2302 					 * is only one recovery thread there is
2303 					 * no race.
2304 					 */
2305 					nfs_rw_exit(&mi->mi_fh_lock);
2306 					sfh4_update(rp->r_fh, &gf_res->object);
2307 				}
2308 			}
2309 		} else {
2310 			nfs_rw_exit(&mi->mi_fh_lock);
2311 		}
2312 	}
2313 
2314 	ASSERT(nfs4_consistent_type(vp));
2315 
2316 	if (op_res->rflags & OPEN4_RESULT_CONFIRM)
2317 		nfs4open_confirm(vp, &seqid, &op_res->stateid, cr, TRUE,
2318 		    &retry_open, oop, TRUE, ep, NULL);
2319 	if (ep->error || ep->stat) {
2320 		nfs4_end_open_seqid_sync(oop);
2321 		kmem_free(destcfp, destclen + 1);
2322 		nfs4args_copen_free(open_args);
2323 		if (!ep->error)
2324 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2325 		return;
2326 	}
2327 
2328 	if (claim == CLAIM_DELEGATE_CUR) {
2329 		/*
2330 		 * Doing a reopen here so the osp should already exist.
2331 		 * If not, something changed or went very wrong.
2332 		 *
2333 		 * returns with 'os_sync_lock' held
2334 		 */
2335 		osp = find_open_stream(oop, rp);
2336 		if (!osp) {
2337 			NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2338 			    "nfs4_resend_open_otw: couldn't find osp"));
2339 			ep->error = EINVAL;
2340 			goto err_out;
2341 		}
2342 		osp->os_open_ref_count++;
2343 	} else {
2344 		mutex_enter(&oop->oo_lock);
2345 		oop->oo_just_created = NFS4_PERM_CREATED;
2346 		mutex_exit(&oop->oo_lock);
2347 
2348 		/* returns with 'os_sync_lock' held */
2349 		osp = find_or_create_open_stream(oop, rp, &created_osp);
2350 		if (!osp) {
2351 			NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2352 			    "nfs4_resend_open_otw: couldn't create osp"));
2353 			ep->error = EINVAL;
2354 			goto err_out;
2355 		}
2356 	}
2357 
2358 	osp->open_stateid = op_res->stateid;
2359 	osp->os_delegation = FALSE;
2360 	/*
2361 	 * Need to reset this bitfield for the possible case where we were
2362 	 * going to OTW CLOSE the file, got a non-recoverable error, and before
2363 	 * we could retry the CLOSE, OPENed the file again.
2364 	 */
2365 	ASSERT(osp->os_open_owner->oo_seqid_inuse);
2366 	osp->os_final_close = 0;
2367 	osp->os_force_close = 0;
2368 
2369 	if (claim != CLAIM_DELEGATE_CUR) {
2370 		if (open_args->share_access & OPEN4_SHARE_ACCESS_READ)
2371 			osp->os_share_acc_read++;
2372 		if (open_args->share_access & OPEN4_SHARE_ACCESS_WRITE)
2373 			osp->os_share_acc_write++;
2374 		osp->os_share_deny_none++;
2375 	}
2376 
2377 	mutex_exit(&osp->os_sync_lock);
2378 	if (created_osp)
2379 		nfs4_inc_state_ref_count(mi);
2380 	open_stream_rele(osp, rp);
2381 
2382 	nfs4_end_open_seqid_sync(oop);
2383 
2384 	/* accept delegation, if any */
2385 	nfs4_delegation_accept(rp, claim, op_res, garp, cr);
2386 
2387 	kmem_free(destcfp, destclen + 1);
2388 	nfs4args_copen_free(open_args);
2389 
2390 	if (claim == CLAIM_DELEGATE_CUR)
2391 		nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
2392 	else
2393 		PURGE_ATTRCACHE4(vp);
2394 
2395 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2396 
2397 	ASSERT(nfs4_consistent_type(vp));
2398 
2399 	return;
2400 
2401 err_out:
2402 	nfs4_end_open_seqid_sync(oop);
2403 	kmem_free(destcfp, destclen + 1);
2404 	nfs4args_copen_free(open_args);
2405 	if (!ep->error)
2406 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2407 }
2408