1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
27 /* All Rights Reserved */
28 
29 #pragma ident	"%Z%%M%	%I%	%E% SMI"
30 
31 #include <nfs/nfs4_clnt.h>
32 #include <nfs/rnode4.h>
33 #include <sys/systm.h>
34 #include <sys/cmn_err.h>
35 #include <sys/atomic.h>
36 
/*
 * Prototypes for file-local (static) helpers that are called in this file
 * ahead of their definitions.
 */
static void	nfs4_free_open_owner(nfs4_open_owner_t *, mntinfo4_t *);
static nfs4_open_owner_t *find_freed_open_owner(cred_t *,
				nfs4_oo_hash_bucket_t *, mntinfo4_t *);
static open_delegation_type4 get_dtype(rnode4_t *);
41 
/* Debug-only knobs; these exist only in kernels built with DEBUG defined. */
#ifdef DEBUG
int nfs4_client_foo_debug = 0x0;
int nfs4_client_open_dg = 0x0;
/*
 * If this is non-zero, the lockowner and openowner seqid sync primitives
 * will intermittently return errors.
 */
static int seqid_sync_faults = 0;
#endif
51 
/*
 * Stateid whose first member is 0 and whose twelve trailing bytes are all
 * zero.  NOTE(review): presumably the NFSv4 "all zeros" special stateid --
 * confirm against the protocol specification.
 */
stateid4 clnt_special0 = {
	0,
	{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
};
56 
/*
 * Stateid whose first member is 0xffffffff and whose twelve trailing bytes
 * are all 0xff.  NOTE(review): presumably the NFSv4 "all ones" special
 * stateid -- confirm against the protocol specification.
 */
stateid4 clnt_special1 = {
	0xffffffff,
	{
		(char)0xff, (char)0xff, (char)0xff, (char)0xff,
		(char)0xff, (char)0xff, (char)0xff, (char)0xff,
		(char)0xff, (char)0xff, (char)0xff, (char)0xff
	}
};
65 
66 /* finds hash bucket and locks it */
67 static nfs4_oo_hash_bucket_t *
68 lock_bucket(cred_t *cr, mntinfo4_t *mi)
69 {
70 	nfs4_oo_hash_bucket_t *bucketp;
71 	uint32_t hash_key;
72 
73 	hash_key = (uint32_t)(crgetuid(cr) + crgetruid(cr))
74 			% NFS4_NUM_OO_BUCKETS;
75 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "lock_bucket: "
76 		"hash_key %d for cred %p", hash_key, (void*)cr));
77 
78 	ASSERT(hash_key >= 0 && hash_key < NFS4_NUM_OO_BUCKETS);
79 	ASSERT(mi != NULL);
80 	ASSERT(mutex_owned(&mi->mi_lock));
81 
82 	bucketp = &(mi->mi_oo_list[hash_key]);
83 	mutex_enter(&bucketp->b_lock);
84 	return (bucketp);
85 }
86 
87 /* unlocks hash bucket pointed by bucket_ptr */
88 static void
89 unlock_bucket(nfs4_oo_hash_bucket_t *bucketp)
90 {
91 	mutex_exit(&bucketp->b_lock);
92 }
93 
94 /*
95  * Removes the lock owner from the rnode's lock_owners list and frees the
96  * corresponding reference.
97  */
98 void
99 nfs4_rnode_remove_lock_owner(rnode4_t *rp, nfs4_lock_owner_t *lop)
100 {
101 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
102 		"nfs4_rnode_remove_lock_owner"));
103 
104 	mutex_enter(&rp->r_statev4_lock);
105 
106 	if (lop->lo_next_rnode == NULL) {
107 		/* already removed from list */
108 		mutex_exit(&rp->r_statev4_lock);
109 		return;
110 	}
111 
112 	ASSERT(lop->lo_prev_rnode != NULL);
113 
114 	lop->lo_prev_rnode->lo_next_rnode = lop->lo_next_rnode;
115 	lop->lo_next_rnode->lo_prev_rnode = lop->lo_prev_rnode;
116 
117 	lop->lo_next_rnode = lop->lo_prev_rnode = NULL;
118 
119 	mutex_exit(&rp->r_statev4_lock);
120 
121 	/*
122 	 * This would be an appropriate place for
123 	 * RELEASE_LOCKOWNER.  For now, this is overkill
124 	 * because in the common case, close is going to
125 	 * release any lockowners anyway.
126 	 */
127 	lock_owner_rele(lop);
128 }
129 
130 /*
131  * Remove all lock owners from the rnode's lock_owners list.  Frees up
132  * their references from the list.
133  */
134 
135 void
136 nfs4_flush_lock_owners(rnode4_t *rp)
137 {
138 	nfs4_lock_owner_t *lop;
139 
140 	mutex_enter(&rp->r_statev4_lock);
141 	while (rp->r_lo_head.lo_next_rnode != &rp->r_lo_head) {
142 		lop = rp->r_lo_head.lo_next_rnode;
143 		lop->lo_prev_rnode->lo_next_rnode = lop->lo_next_rnode;
144 		lop->lo_next_rnode->lo_prev_rnode = lop->lo_prev_rnode;
145 		lop->lo_next_rnode = lop->lo_prev_rnode = NULL;
146 		lock_owner_rele(lop);
147 	}
148 	mutex_exit(&rp->r_statev4_lock);
149 }
150 
151 void
152 nfs4_clear_open_streams(rnode4_t *rp)
153 {
154 	nfs4_open_stream_t *osp;
155 
156 	mutex_enter(&rp->r_os_lock);
157 	while ((osp = list_head(&rp->r_open_streams)) != NULL) {
158 		open_owner_rele(osp->os_open_owner);
159 		list_remove(&rp->r_open_streams, osp);
160 		mutex_destroy(&osp->os_sync_lock);
161 		osp->os_open_owner = NULL;
162 		kmem_free(osp, sizeof (*osp));
163 	}
164 	mutex_exit(&rp->r_os_lock);
165 }
166 
167 void
168 open_owner_hold(nfs4_open_owner_t *oop)
169 {
170 	mutex_enter(&oop->oo_lock);
171 	oop->oo_ref_count++;
172 	mutex_exit(&oop->oo_lock);
173 }
174 
175 /*
176  * Frees the open owner if the ref count hits zero.
177  */
178 void
179 open_owner_rele(nfs4_open_owner_t *oop)
180 {
181 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
182 		"open_owner_rele"));
183 
184 	mutex_enter(&oop->oo_lock);
185 	oop->oo_ref_count--;
186 	if (oop->oo_ref_count == 0) {
187 		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
188 			"open_owner_rele: freeing open owner"));
189 		oop->oo_valid = 0;
190 		mutex_exit(&oop->oo_lock);
191 		/*
192 		 * Ok, we don't destroy the open owner, nor do we put it on
193 		 * the mntinfo4's free list just yet.  We are lazy about it
194 		 * and let callers to find_open_owner() do that to keep locking
195 		 * simple.
196 		 */
197 	} else {
198 		mutex_exit(&oop->oo_lock);
199 	}
200 }
201 
202 void
203 open_stream_hold(nfs4_open_stream_t *osp)
204 {
205 	mutex_enter(&osp->os_sync_lock);
206 	osp->os_ref_count++;
207 	mutex_exit(&osp->os_sync_lock);
208 }
209 
210 /*
211  * Frees the open stream and removes it from the rnode4's open streams list if
212  * the ref count drops to zero.
213  */
214 void
215 open_stream_rele(nfs4_open_stream_t *osp, rnode4_t *rp)
216 {
217 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
218 		"open_stream_rele"));
219 
220 	ASSERT(!mutex_owned(&rp->r_os_lock));
221 
222 	mutex_enter(&osp->os_sync_lock);
223 	ASSERT(osp->os_ref_count > 0);
224 	osp->os_ref_count--;
225 	if (osp->os_ref_count == 0) {
226 		nfs4_open_owner_t *tmp_oop;
227 
228 		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
229 			"open_stream_rele: freeing open stream"));
230 		osp->os_valid = 0;
231 		tmp_oop = osp->os_open_owner;
232 		mutex_exit(&osp->os_sync_lock);
233 
234 		/* now see if we need to destroy the open owner */
235 		open_owner_rele(tmp_oop);
236 
237 		mutex_enter(&rp->r_os_lock);
238 		list_remove(&rp->r_open_streams, osp);
239 		mutex_exit(&rp->r_os_lock);
240 
241 		/* free up osp */
242 		mutex_destroy(&osp->os_sync_lock);
243 		osp->os_open_owner = NULL;
244 		kmem_free(osp, sizeof (*osp));
245 	} else {
246 		mutex_exit(&osp->os_sync_lock);
247 	}
248 }
249 
250 void
251 lock_owner_hold(nfs4_lock_owner_t *lop)
252 {
253 	mutex_enter(&lop->lo_lock);
254 	lop->lo_ref_count++;
255 	mutex_exit(&lop->lo_lock);
256 }
257 
258 /*
259  * Frees the lock owner if the ref count hits zero and
260  * the structure no longer has no locks.
261  */
262 void
263 lock_owner_rele(nfs4_lock_owner_t *lop)
264 {
265 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
266 		"lock_owner_rele"));
267 
268 	mutex_enter(&lop->lo_lock);
269 	lop->lo_ref_count--;
270 	if (lop->lo_ref_count == 0) {
271 		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
272 			"lock_owner_rele: freeing lock owner: "
273 			"%x", lop->lo_pid));
274 		lop->lo_valid = 0;
275 		/*
276 		 * If there are no references, the lock_owner should
277 		 * already be off the rnode's list.
278 		 */
279 		ASSERT(lop->lo_next_rnode == NULL);
280 		ASSERT(lop->lo_prev_rnode == NULL);
281 		ASSERT(!(lop->lo_flags & NFS4_LOCK_SEQID_INUSE));
282 		ASSERT(lop->lo_seqid_holder == NULL);
283 		mutex_exit(&lop->lo_lock);
284 
285 		/* free up lop */
286 		cv_destroy(&lop->lo_cv_seqid_sync);
287 		mutex_destroy(&lop->lo_lock);
288 		kmem_free(lop, sizeof (*lop));
289 	} else {
290 		mutex_exit(&lop->lo_lock);
291 	}
292 }
293 
294 /*
295  * This increments the open owner ref count if found.
296  * The argument 'just_created' determines whether we are looking for open
297  * owners with the 'oo_just_created' flag set or not.
298  */
299 nfs4_open_owner_t *
300 find_open_owner_nolock(cred_t *cr, int just_created, mntinfo4_t *mi)
301 {
302 	nfs4_open_owner_t	*oop = NULL, *next_oop;
303 	nfs4_oo_hash_bucket_t	*bucketp;
304 
305 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
306 	    "find_open_owner: cred %p, just_created %d",
307 	    (void*)cr, just_created));
308 
309 	ASSERT(mi != NULL);
310 	ASSERT(mutex_owned(&mi->mi_lock));
311 
312 	bucketp = lock_bucket(cr, mi);
313 
314 	/* got hash bucket, search through open owners */
315 	for (oop = list_head(&bucketp->b_oo_hash_list); oop != NULL; ) {
316 		mutex_enter(&oop->oo_lock);
317 		if (!crcmp(oop->oo_cred, cr) &&
318 		    (oop->oo_just_created == just_created ||
319 		    just_created == NFS4_JUST_CREATED)) {
320 			/* match */
321 			if (oop->oo_valid == 0) {
322 				/* reactivate the open owner */
323 				oop->oo_valid = 1;
324 				ASSERT(oop->oo_ref_count == 0);
325 			}
326 			oop->oo_ref_count++;
327 			mutex_exit(&oop->oo_lock);
328 			unlock_bucket(bucketp);
329 			return (oop);
330 		}
331 		next_oop = list_next(&bucketp->b_oo_hash_list, oop);
332 		if (oop->oo_valid == 0) {
333 			list_remove(&bucketp->b_oo_hash_list, oop);
334 
335 			/*
336 			 * Now we go ahead and put this open owner
337 			 * on the freed list.  This is our lazy method.
338 			 */
339 			nfs4_free_open_owner(oop, mi);
340 		}
341 
342 		mutex_exit(&oop->oo_lock);
343 		oop = next_oop;
344 	}
345 
346 	/* search through recently freed open owners */
347 	oop = find_freed_open_owner(cr, bucketp, mi);
348 
349 	unlock_bucket(bucketp);
350 
351 	return (oop);
352 }
353 
354 nfs4_open_owner_t *
355 find_open_owner(cred_t *cr, int just_created, mntinfo4_t *mi)
356 {
357 	nfs4_open_owner_t *oop;
358 
359 	mutex_enter(&mi->mi_lock);
360 	oop = find_open_owner_nolock(cr, just_created, mi);
361 	mutex_exit(&mi->mi_lock);
362 
363 	return (oop);
364 }
365 
366 /*
367  * This increments osp's ref count if found.
368  * Returns with 'os_sync_lock' held.
369  */
370 nfs4_open_stream_t *
371 find_open_stream(nfs4_open_owner_t *oop, rnode4_t *rp)
372 {
373 	nfs4_open_stream_t	*osp;
374 
375 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
376 		"find_open_stream"));
377 
378 	mutex_enter(&rp->r_os_lock);
379 	/* Now, no one can add or delete to rp's open streams list */
380 	for (osp = list_head(&rp->r_open_streams); osp != NULL;
381 	    osp = list_next(&rp->r_open_streams, osp)) {
382 		mutex_enter(&osp->os_sync_lock);
383 		if (osp->os_open_owner == oop && osp->os_valid != 0) {
384 			/* match */
385 			NFS4_DEBUG(nfs4_client_state_debug,
386 				(CE_NOTE, "find_open_stream "
387 				"got a match"));
388 
389 			osp->os_ref_count++;
390 			mutex_exit(&rp->r_os_lock);
391 			return (osp);
392 		}
393 		mutex_exit(&osp->os_sync_lock);
394 	}
395 
396 	mutex_exit(&rp->r_os_lock);
397 	return (NULL);
398 }
399 
400 /*
401  * Find the lock owner for the given file and process ID.  If "which" is
402  * LOWN_VALID_STATEID, require that the lock owner contain a valid stateid
403  * from the server.
404  *
405  * This increments the lock owner's ref count if found.  Returns NULL if
406  * there was no match.
407  */
408 nfs4_lock_owner_t *
409 find_lock_owner(rnode4_t *rp, pid_t pid, lown_which_t which)
410 {
411 	nfs4_lock_owner_t	*lop, *next_lop;
412 
413 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
414 		"find_lock_owner: pid %x, which %d", pid, which));
415 
416 	ASSERT(which == LOWN_ANY || which == LOWN_VALID_STATEID);
417 
418 	/* search by pid */
419 	mutex_enter(&rp->r_statev4_lock);
420 
421 	lop = rp->r_lo_head.lo_next_rnode;
422 	while (lop != &rp->r_lo_head) {
423 		mutex_enter(&lop->lo_lock);
424 		if (lop->lo_pid == pid && lop->lo_valid != 0 &&
425 		    !(lop->lo_flags & NFS4_BAD_SEQID_LOCK)) {
426 			if (which == LOWN_ANY ||
427 			    lop->lo_just_created != NFS4_JUST_CREATED) {
428 				/* Found a matching lock owner */
429 				NFS4_DEBUG(nfs4_client_state_debug,
430 					(CE_NOTE, "find_lock_owner: "
431 					"got a match"));
432 
433 				lop->lo_ref_count++;
434 				mutex_exit(&lop->lo_lock);
435 				mutex_exit(&rp->r_statev4_lock);
436 				return (lop);
437 			}
438 		}
439 		next_lop = lop->lo_next_rnode;
440 		mutex_exit(&lop->lo_lock);
441 		lop = next_lop;
442 	}
443 
444 	mutex_exit(&rp->r_statev4_lock);
445 	return (NULL);
446 }
447 
448 /*
449  * This returns the delegation stateid as 'sid'. Returns 1 if a successful
450  * delegation stateid was found, otherwise returns 0.
451  */
452 
453 static int
454 nfs4_get_deleg_stateid(rnode4_t *rp, nfs_opnum4 op, stateid4 *sid)
455 {
456 	ASSERT(!mutex_owned(&rp->r_statev4_lock));
457 
458 	mutex_enter(&rp->r_statev4_lock);
459 	if (((rp->r_deleg_type == OPEN_DELEGATE_WRITE && op == OP_WRITE) ||
460 	    (rp->r_deleg_type != OPEN_DELEGATE_NONE && op != OP_WRITE)) &&
461 	    !rp->r_deleg_return_pending) {
462 
463 		*sid = rp->r_deleg_stateid;
464 		mutex_exit(&rp->r_statev4_lock);
465 		return (1);
466 	}
467 	mutex_exit(&rp->r_statev4_lock);
468 	return (0);
469 }
470 
471 /*
472  * This returns the lock stateid as 'sid'. Returns 1 if a successful lock
473  * stateid was found, otherwise returns 0.
474  */
475 static int
476 nfs4_get_lock_stateid(rnode4_t *rp, pid_t pid, stateid4 *sid)
477 {
478 	nfs4_lock_owner_t *lop;
479 
480 	lop = find_lock_owner(rp, pid, LOWN_VALID_STATEID);
481 
482 	if (lop) {
483 		/*
484 		 * Found a matching lock owner, so use a lock
485 		 * stateid rather than an open stateid.
486 		 */
487 		mutex_enter(&lop->lo_lock);
488 		*sid = lop->lock_stateid;
489 		mutex_exit(&lop->lo_lock);
490 		lock_owner_rele(lop);
491 		return (1);
492 	}
493 
494 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
495 	    "nfs4_get_lock_stateid: no lop"));
496 	return (0);
497 }
498 
499 /*
500  * This returns the open stateid as 'sid'. Returns 1 if a successful open
501  * stateid was found, otherwise returns 0.
502  *
503  * Once the stateid is returned to the caller, it is no longer protected;
504  * so the caller must be prepared to handle OLD/BAD_STATEID where
505  * appropiate.
506  */
507 static int
508 nfs4_get_open_stateid(rnode4_t *rp, cred_t *cr, mntinfo4_t *mi, stateid4 *sid)
509 {
510 	nfs4_open_owner_t *oop;
511 	nfs4_open_stream_t *osp;
512 
513 	ASSERT(mi != NULL);
514 
515 	oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
516 	if (!oop) {
517 		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
518 		    "nfs4_get_open_stateid: no oop"));
519 		return (0);
520 	}
521 
522 	osp = find_open_stream(oop, rp);
523 	open_owner_rele(oop);
524 	if (!osp) {
525 		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
526 		    "nfs4_get_open_stateid: no osp"));
527 		return (0);
528 	}
529 
530 	if (osp->os_failed_reopen) {
531 		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
532 		    "nfs4_get_open_stateid: osp %p failed reopen",
533 		    (void *)osp));
534 		mutex_exit(&osp->os_sync_lock);
535 		open_stream_rele(osp, rp);
536 		return (0);
537 	}
538 	*sid = osp->open_stateid;
539 	mutex_exit(&osp->os_sync_lock);
540 	open_stream_rele(osp, rp);
541 	return (1);
542 }
543 
544 /*
545  * Returns the delegation stateid if this 'op' is OP_WRITE and the
546  * delegation we hold is a write delegation, OR this 'op' is not
547  * OP_WRITE and we have a delegation held (read or write), otherwise
548  * returns the lock stateid if there is a lock owner, otherwise
549  * returns the open stateid if there is a open stream, otherwise
550  * returns special stateid <seqid = 0, other = 0>.
551  *
552  * Used for WRITE operations.
553  */
554 stateid4
555 nfs4_get_w_stateid(cred_t *cr, rnode4_t *rp, pid_t pid, mntinfo4_t *mi,
556 	nfs_opnum4 op, nfs4_stateid_types_t *sid_tp)
557 {
558 	stateid4 sid;
559 
560 	if (nfs4_get_deleg_stateid(rp, op, &sid)) {
561 		if (!stateid4_cmp(&sid, &sid_tp->d_sid)) {
562 			sid_tp->cur_sid_type = DEL_SID;
563 			return (sid);
564 		}
565 	}
566 	if (nfs4_get_lock_stateid(rp, pid, &sid)) {
567 		if (!stateid4_cmp(&sid, &sid_tp->l_sid)) {
568 			sid_tp->cur_sid_type = LOCK_SID;
569 			return (sid);
570 		}
571 	}
572 	if (nfs4_get_open_stateid(rp, cr, mi, &sid)) {
573 		if (!stateid4_cmp(&sid, &sid_tp->o_sid)) {
574 			sid_tp->cur_sid_type = OPEN_SID;
575 			return (sid);
576 		}
577 	}
578 	bzero(&sid, sizeof (stateid4));
579 	sid_tp->cur_sid_type = SPEC_SID;
580 	return (sid);
581 }
582 
583 /*
584  * Returns the delegation stateid if this 'op' is OP_WRITE and the
585  * delegation we hold is a write delegation, OR this 'op' is not
586  * OP_WRITE and we have a delegation held (read or write), otherwise
587  * returns the lock stateid if there is a lock owner, otherwise
588  * returns the open stateid if there is a open stream, otherwise
589  * returns special stateid <seqid = 0, other = 0>.
590  *
591  * This also updates which stateid we are using in 'sid_tp', skips
592  * previously attempted stateids, and skips checking higher priority
593  * stateids than the current level as dictated by 'sid_tp->cur_sid_type'
594  * for async reads.
595  *
596  * Used for READ and SETATTR operations.
597  */
598 stateid4
599 nfs4_get_stateid(cred_t *cr, rnode4_t *rp, pid_t pid, mntinfo4_t *mi,
600 	nfs_opnum4 op, nfs4_stateid_types_t *sid_tp, bool_t async_read)
601 {
602 	stateid4 sid;
603 
604 	/*
605 	 * For asynchronous READs, do not attempt to retry from the start of
606 	 * the stateid priority list, just continue from where you last left
607 	 * off.
608 	 */
609 	if (async_read) {
610 		switch (sid_tp->cur_sid_type) {
611 		case NO_SID:
612 			break;
613 		case DEL_SID:
614 			goto lock_stateid;
615 		case LOCK_SID:
616 			goto open_stateid;
617 		case OPEN_SID:
618 			goto special_stateid;
619 		case SPEC_SID:
620 		default:
621 			cmn_err(CE_PANIC, "nfs4_get_stateid: illegal current "
622 			    "stateid type %d", sid_tp->cur_sid_type);
623 		}
624 	}
625 
626 	if (nfs4_get_deleg_stateid(rp, op, &sid)) {
627 		if (!stateid4_cmp(&sid, &sid_tp->d_sid)) {
628 			sid_tp->cur_sid_type = DEL_SID;
629 			return (sid);
630 		}
631 	}
632 lock_stateid:
633 	if (nfs4_get_lock_stateid(rp, pid, &sid)) {
634 		if (!stateid4_cmp(&sid, &sid_tp->l_sid)) {
635 			sid_tp->cur_sid_type = LOCK_SID;
636 			return (sid);
637 		}
638 	}
639 open_stateid:
640 	if (nfs4_get_open_stateid(rp, cr, mi, &sid)) {
641 		if (!stateid4_cmp(&sid, &sid_tp->o_sid)) {
642 			sid_tp->cur_sid_type = OPEN_SID;
643 			return (sid);
644 		}
645 	}
646 special_stateid:
647 	bzero(&sid, sizeof (stateid4));
648 	sid_tp->cur_sid_type = SPEC_SID;
649 	return	(sid);
650 }
651 
652 void
653 nfs4_set_lock_stateid(nfs4_lock_owner_t *lop, stateid4 stateid)
654 {
655 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
656 		"nfs4_set_lock_stateid"));
657 
658 	ASSERT(lop);
659 	ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);
660 
661 	mutex_enter(&lop->lo_lock);
662 	lop->lock_stateid = stateid;
663 	mutex_exit(&lop->lo_lock);
664 }
665 
666 /*
667  * Sequence number used when a new open owner is needed.
668  * This is used so as to not confuse the server.  Since a open owner
669  * is based off of cred, a cred could be re-used quickly, and the server
670  * may not release all state for a cred.
671  */
672 static uint64_t open_owner_seq_num = 0;
673 
674 uint64_t
675 nfs4_get_new_oo_name(void)
676 {
677 	return (atomic_add_64_nv(&open_owner_seq_num, 1));
678 }
679 
680 /*
681  * Create a new open owner and add it to the open owner hash table.
682  */
683 nfs4_open_owner_t *
684 create_open_owner(cred_t *cr, mntinfo4_t *mi)
685 {
686 	nfs4_open_owner_t	*oop;
687 	nfs4_oo_hash_bucket_t	*bucketp;
688 
689 	oop = kmem_alloc(sizeof (nfs4_open_owner_t), KM_SLEEP);
690 	/*
691 	 * Make sure the cred doesn't go away when we put this open owner
692 	 * on the free list, as well as make crcmp() a valid check.
693 	 */
694 	crhold(cr);
695 	oop->oo_cred = cr;
696 	mutex_init(&oop->oo_lock, NULL, MUTEX_DEFAULT, NULL);
697 	oop->oo_ref_count = 1;
698 	oop->oo_valid = 1;
699 	oop->oo_just_created = NFS4_JUST_CREATED;
700 	oop->oo_seqid = 0;
701 	oop->oo_seqid_inuse = 0;
702 	oop->oo_last_good_seqid = 0;
703 	oop->oo_last_good_op = TAG_NONE;
704 	oop->oo_cred_otw = NULL;
705 	cv_init(&oop->oo_cv_seqid_sync, NULL, CV_DEFAULT, NULL);
706 
707 	/*
708 	 * A Solaris open_owner is <oo_seq_num>
709 	 */
710 	oop->oo_name = nfs4_get_new_oo_name();
711 
712 	/* now add the struct into the cred hash table */
713 	ASSERT(mutex_owned(&mi->mi_lock));
714 	bucketp = lock_bucket(cr, mi);
715 	list_insert_head(&bucketp->b_oo_hash_list, oop);
716 	unlock_bucket(bucketp);
717 
718 	return (oop);
719 }
720 
721 /*
722  * Create a new open stream and it to the rnode's list.
723  * Increments the ref count on oop.
724  * Returns with 'os_sync_lock' held.
725  */
726 nfs4_open_stream_t *
727 create_open_stream(nfs4_open_owner_t *oop, rnode4_t *rp)
728 {
729 	nfs4_open_stream_t	*osp;
730 
731 #ifdef DEBUG
732 	mutex_enter(&oop->oo_lock);
733 	ASSERT(oop->oo_seqid_inuse);
734 	mutex_exit(&oop->oo_lock);
735 #endif
736 
737 	osp = kmem_alloc(sizeof (nfs4_open_stream_t), KM_SLEEP);
738 	osp->os_open_ref_count = 1;
739 	osp->os_mapcnt = 0;
740 	osp->os_ref_count = 2;
741 	osp->os_valid = 1;
742 	osp->os_open_owner = oop;
743 	osp->os_orig_oo_name = oop->oo_name;
744 	bzero(&osp->open_stateid, sizeof (stateid4));
745 	osp->os_share_acc_read = 0;
746 	osp->os_share_acc_write = 0;
747 	osp->os_mmap_read = 0;
748 	osp->os_mmap_write = 0;
749 	osp->os_share_deny_none = 0;
750 	osp->os_share_deny_read = 0;
751 	osp->os_share_deny_write = 0;
752 	osp->os_delegation = 0;
753 	osp->os_dc_openacc = 0;
754 	osp->os_final_close = 0;
755 	osp->os_pending_close = 0;
756 	osp->os_failed_reopen = 0;
757 	osp->os_force_close = 0;
758 	mutex_init(&osp->os_sync_lock, NULL, MUTEX_DEFAULT, NULL);
759 
760 	/* open owner gets a reference */
761 	open_owner_hold(oop);
762 
763 	/* now add the open stream to rp */
764 	mutex_enter(&rp->r_os_lock);
765 	mutex_enter(&osp->os_sync_lock);
766 	list_insert_head(&rp->r_open_streams, osp);
767 	mutex_exit(&rp->r_os_lock);
768 
769 	return (osp);
770 }
771 
772 /*
773  * Returns an open stream with 'os_sync_lock' held.
774  * If the open stream is found (rather than created), its
775  * 'os_open_ref_count' is bumped.
776  *
777  * There is no race with two threads entering this function
778  * and creating two open streams for the same <oop, rp> pair.
779  * This is because the open seqid sync must be acquired, thus
780  * only allowing one thread in at a time.
781  */
782 nfs4_open_stream_t *
783 find_or_create_open_stream(nfs4_open_owner_t *oop, rnode4_t *rp,
784 	int *created_osp)
785 {
786 	nfs4_open_stream_t *osp;
787 
788 #ifdef DEBUG
789 	mutex_enter(&oop->oo_lock);
790 	ASSERT(oop->oo_seqid_inuse);
791 	mutex_exit(&oop->oo_lock);
792 #endif
793 
794 	osp = find_open_stream(oop, rp);
795 	if (!osp) {
796 		osp = create_open_stream(oop, rp);
797 		if (osp)
798 			*created_osp = 1;
799 	} else {
800 		*created_osp = 0;
801 		osp->os_open_ref_count++;
802 	}
803 
804 	return (osp);
805 }
806 
807 static uint64_t lock_owner_seq_num = 0;
808 
809 /*
810  * Create a new lock owner and add it to the rnode's list.
811  * Assumes the rnode's r_statev4_lock is held.
812  * The created lock owner has a reference count of 2: one for the list and
813  * one for the caller to use.  Returns the lock owner locked down.
814  */
815 nfs4_lock_owner_t *
816 create_lock_owner(rnode4_t *rp, pid_t pid)
817 {
818 	nfs4_lock_owner_t	*lop;
819 
820 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
821 		"create_lock_owner: pid %x", pid));
822 
823 	ASSERT(mutex_owned(&rp->r_statev4_lock));
824 
825 	lop = kmem_alloc(sizeof (nfs4_lock_owner_t), KM_SLEEP);
826 	lop->lo_ref_count = 2;
827 	lop->lo_valid = 1;
828 	bzero(&lop->lock_stateid, sizeof (stateid4));
829 	lop->lo_pid = pid;
830 	lop->lock_seqid = 0;
831 	lop->lo_pending_rqsts = 0;
832 	lop->lo_just_created = NFS4_JUST_CREATED;
833 	lop->lo_flags = 0;
834 	lop->lo_seqid_holder = NULL;
835 
836 	/*
837 	 * A Solaris lock_owner is <seq_num><pid>
838 	 */
839 	lop->lock_owner_name.ln_seq_num =
840 		atomic_add_64_nv(&lock_owner_seq_num, 1);
841 	lop->lock_owner_name.ln_pid = pid;
842 
843 	cv_init(&lop->lo_cv_seqid_sync, NULL, CV_DEFAULT, NULL);
844 	mutex_init(&lop->lo_lock, NULL, MUTEX_DEFAULT, NULL);
845 
846 	mutex_enter(&lop->lo_lock);
847 
848 	/* now add the lock owner to rp */
849 	lop->lo_prev_rnode = &rp->r_lo_head;
850 	lop->lo_next_rnode = rp->r_lo_head.lo_next_rnode;
851 	rp->r_lo_head.lo_next_rnode->lo_prev_rnode = lop;
852 	rp->r_lo_head.lo_next_rnode = lop;
853 
854 	return (lop);
855 
856 }
857 
858 /*
859  * This sets the lock seqid of a lock owner.
860  */
861 void
862 nfs4_set_lock_seqid(seqid4 seqid, nfs4_lock_owner_t *lop)
863 {
864 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
865 		"nfs4_set_lock_seqid"));
866 
867 	ASSERT(lop != NULL);
868 	ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);
869 
870 	lop->lock_seqid = seqid;
871 }
872 
873 static void
874 nfs4_set_new_lock_owner_args(lock_owner4 *owner, pid_t pid)
875 {
876 	nfs4_lo_name_t *cast_namep;
877 
878 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
879 		"nfs4_set_new_lock_owner_args"));
880 
881 	owner->owner_len = sizeof (*cast_namep);
882 	owner->owner_val = kmem_alloc(owner->owner_len, KM_SLEEP);
883 	/*
884 	 * A Solaris lock_owner is <seq_num><pid>
885 	 */
886 	cast_namep = (nfs4_lo_name_t *)owner->owner_val;
887 	cast_namep->ln_seq_num = atomic_add_64_nv(&lock_owner_seq_num, 1);
888 	cast_namep->ln_pid = pid;
889 }
890 
891 /*
892  * Fill in the lock owner args.
893  */
894 void
895 nfs4_setlockowner_args(lock_owner4 *owner, rnode4_t *rp, pid_t pid)
896 {
897 	nfs4_lock_owner_t *lop;
898 
899 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
900 		"nfs4_setlockowner_args"));
901 
902 	/* This increments lop's ref count */
903 	lop = find_lock_owner(rp, pid, LOWN_VALID_STATEID);
904 
905 	if (!lop)
906 		goto make_up_args;
907 
908 	mutex_enter(&lop->lo_lock);
909 	owner->owner_len = sizeof (lop->lock_owner_name);
910 	owner->owner_val = kmem_alloc(owner->owner_len, KM_SLEEP);
911 	bcopy(&lop->lock_owner_name, owner->owner_val,
912 		owner->owner_len);
913 	mutex_exit(&lop->lo_lock);
914 	lock_owner_rele(lop);
915 	return;
916 
917 make_up_args:
918 	nfs4_set_new_lock_owner_args(owner, pid);
919 }
920 
921 /*
922  * This ends our use of the open owner's open seqid by setting
923  * the appropiate flags and issuing a cv_signal to wake up another
924  * thread waiting to use the open seqid.
925  */
926 
927 void
928 nfs4_end_open_seqid_sync(nfs4_open_owner_t *oop)
929 {
930 	mutex_enter(&oop->oo_lock);
931 	ASSERT(oop->oo_seqid_inuse);
932 	oop->oo_seqid_inuse = 0;
933 	cv_broadcast(&oop->oo_cv_seqid_sync);
934 	mutex_exit(&oop->oo_lock);
935 }
936 
937 /*
938  * This starts our use of the open owner's open seqid by setting
939  * the oo_seqid_inuse to true.  We will wait (forever) with a
940  * cv_wait() until we are woken up.
941  *
942  * Return values:
943  * 0		no problems
944  * EAGAIN	caller should retry (like a recovery retry)
945  */
946 int
947 nfs4_start_open_seqid_sync(nfs4_open_owner_t *oop, mntinfo4_t *mi)
948 {
949 	int error = 0;
950 #ifdef DEBUG
951 	static int ops = 0;		/* fault injection */
952 #endif
953 
954 #ifdef DEBUG
955 	if (seqid_sync_faults && curthread != mi->mi_recovthread &&
956 	    ++ops % 5 == 0)
957 		return (EAGAIN);
958 #endif
959 
960 	mutex_enter(&mi->mi_lock);
961 	if ((mi->mi_flags & MI4_RECOV_ACTIV) &&
962 	    curthread != mi->mi_recovthread)
963 		error = EAGAIN;
964 	mutex_exit(&mi->mi_lock);
965 	if (error != 0)
966 		goto done;
967 
968 	mutex_enter(&oop->oo_lock);
969 
970 	while (oop->oo_seqid_inuse) {
971 		NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE,
972 			"nfs4_start_open_seqid_sync waiting on cv"));
973 
974 		cv_wait(&oop->oo_cv_seqid_sync, &oop->oo_lock);
975 	}
976 
977 	oop->oo_seqid_inuse = 1;
978 
979 	mutex_exit(&oop->oo_lock);
980 
981 	mutex_enter(&mi->mi_lock);
982 	if ((mi->mi_flags & MI4_RECOV_ACTIV) &&
983 	    curthread != mi->mi_recovthread)
984 		error = EAGAIN;
985 	mutex_exit(&mi->mi_lock);
986 
987 	if (error == EAGAIN)
988 		nfs4_end_open_seqid_sync(oop);
989 
990 	NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE,
991 		"nfs4_start_open_seqid_sync: error=%d", error));
992 
993 done:
994 	return (error);
995 }
996 
/*
 * DEBUG-only counters of OPENs that nfs4_is_otw_open_necessary() let skip
 * the over-the-wire call: [0] counts skips via an existing open stream,
 * [1] counts skips via a held delegation.
 */
#ifdef	DEBUG
int bypass_otw[2];
#endif
1000 
/*
 * Decide whether an over-the-wire (OTW) OPEN is necessary, i.e. whether
 * the file has not already been opened with the access bits now being
 * asked for and no held delegation covers the request.
 * Note, this assumes that vp is backed by an rnode4 (VTOR4 is applied).
 *
 * oop			open owner making the request; its open seqid is
 *			expected to be in use (asserted on the skip path)
 * flag			open flags; only FREAD and FWRITE are examined
 * vp			vnode being opened
 * just_been_created	if non-zero, an existing open stream's counts are
 *			not bumped when the OTW open is skipped
 * errorp		set to 0 only when 0 is returned; left untouched
 *			when 1 is returned
 * acc			access bits (VREAD/VWRITE) checked on the
 *			delegation path
 * rsp			recovery state; must be non-NULL on the delegation
 *			path, where rsp->rs_sp (if set) has its state
 *			reference count incremented
 *
 * Returns 1 if the caller must go OTW, 0 if the OTW open can be skipped.
 */
int
nfs4_is_otw_open_necessary(nfs4_open_owner_t *oop, int flag, vnode_t *vp,
	int just_been_created, int *errorp, int acc, nfs4_recov_state_t *rsp)
{
	rnode4_t *rp;
	nfs4_open_stream_t *osp;
	open_delegation_type4 dt;

	rp = VTOR4(vp);

	/*
	 * Grab the delegation type.  This function is protected against
	 * the delegation being returned by virtue of start_op (called
	 * by nfs4open_otw) taking the r_deleg_recall_lock in read mode,
	 * delegreturn requires this lock in write mode to proceed.
	 */
	ASSERT(nfs_rw_lock_held(&rp->r_deleg_recall_lock, RW_READER));
	dt = get_dtype(rp);

	/* returns with 'os_sync_lock' held */
	osp = find_open_stream(oop, rp);

	if (osp) {
		uint32_t	do_otw = 0;

		/* a stream whose reopen failed always has to go OTW */
		if (osp->os_failed_reopen) {
			NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE,
			    "nfs4_is_otw_open_necessary: os_failed_reopen "
			    "set on osp %p, cr %p, rp %s", (void *)osp,
			    (void *)osp->os_open_owner->oo_cred,
			    rnode4info(rp)));
			do_otw = 1;
		}

		/*
		 * check access/deny bits: go OTW if the requested access
		 * is neither already open on this stream nor covered by
		 * the delegation type
		 */
		if (!do_otw && (flag & FREAD))
			if (osp->os_share_acc_read == 0 &&
			    dt == OPEN_DELEGATE_NONE)
				do_otw = 1;

		if (!do_otw && (flag & FWRITE))
			if (osp->os_share_acc_write == 0 &&
			    dt != OPEN_DELEGATE_WRITE)
				do_otw = 1;

		if (!do_otw) {
			NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
				"nfs4_is_otw_open_necessary: can skip this "
				"open OTW"));
			/* account for this open on the existing stream */
			if (!just_been_created) {
				osp->os_open_ref_count++;
				if (flag & FREAD)
					osp->os_share_acc_read++;
				if (flag & FWRITE)
					osp->os_share_acc_write++;
				osp->os_share_deny_none++;
			}

			/*
			 * Need to reset this bitfield for the possible case
			 * where we were going to OTW CLOSE the file, got a
			 * non-recoverable error, and before we could retry
			 * the CLOSE, OPENed the file again.
			 */
			ASSERT(osp->os_open_owner->oo_seqid_inuse);
			osp->os_final_close = 0;
			osp->os_force_close = 0;

			/* drop the lock and reference from find_open_stream */
			mutex_exit(&osp->os_sync_lock);
			open_stream_rele(osp, rp);

#ifdef	DEBUG
			bypass_otw[0]++;
#endif

			*errorp = 0;
			return (0);
		}
		/* must go OTW; falls through to the final return (1) */
		mutex_exit(&osp->os_sync_lock);
		open_stream_rele(osp, rp);

	} else if (dt != OPEN_DELEGATE_NONE) {
		/*
		 * Even if there isn't an open_stream yet, we may still be
		 * able to bypass the otw open if the client owns a delegation.
		 *
		 * If you are asking for WRITE, but I only have
		 * a read delegation, then you still have to go otw.
		 */

		if (flag & FWRITE && dt == OPEN_DELEGATE_READ)
			return (1);

		/*
		 * TODO - evaluate the nfsace4
		 */

		/*
		 * Check the access flags to make sure the caller
		 * had permission.
		 */
		if (flag & FREAD && !(acc & VREAD))
			return (1);

		if (flag & FWRITE && !(acc & VWRITE))
			return (1);

		/*
		 * create_open_stream will add a reference to oop,
		 * this will prevent the open_owner_rele done in
		 * nfs4open_otw from destroying the open_owner.
		 */

		/* returns with 'os_sync_lock' held */
		osp = create_open_stream(oop, rp);
		if (osp == NULL)
			return (1);

		/* the new stream runs on the delegation stateid */
		osp->open_stateid = rp->r_deleg_stateid;
		osp->os_delegation = 1;

		if (flag & FREAD)
			osp->os_share_acc_read++;
		if (flag & FWRITE)
			osp->os_share_acc_write++;

		osp->os_share_deny_none++;
		mutex_exit(&osp->os_sync_lock);

		/*
		 * Drop one of the two references create_open_stream()
		 * set up on the new stream.
		 */
		open_stream_rele(osp, rp);

		/* the open owner is now in use, no longer "just created" */
		mutex_enter(&oop->oo_lock);
		oop->oo_just_created = NFS4_PERM_CREATED;
		mutex_exit(&oop->oo_lock);

		ASSERT(rsp != NULL);
		if (rsp->rs_sp != NULL) {
			mutex_enter(&rsp->rs_sp->s_lock);
			nfs4_inc_state_ref_count_nolock(rsp->rs_sp,
							VTOMI4(vp));
			mutex_exit(&rsp->rs_sp->s_lock);
		}
#ifdef	DEBUG
		bypass_otw[1]++;
#endif

		*errorp = 0;
		return (0);
	}

	return (1);
}
1160 
1161 static open_delegation_type4
1162 get_dtype(rnode4_t *rp)
1163 {
1164 	open_delegation_type4 dt;
1165 
1166 	mutex_enter(&rp->r_statev4_lock);
1167 	ASSERT(!rp->r_deleg_return_inprog);
1168 	if (rp->r_deleg_return_pending)
1169 		dt = OPEN_DELEGATE_NONE;
1170 	else
1171 		dt = rp->r_deleg_type;
1172 	mutex_exit(&rp->r_statev4_lock);
1173 
1174 	return (dt);
1175 }
1176 
1177 /*
1178  * Fill in *locker with the lock state arguments for a LOCK call.  If
1179  * lop->lo_just_created == NFS4_JUST_CREATED, oop and osp must be non-NULL.
1180  * Caller must already hold the necessary seqid sync lock(s).
1181  */
1182 
1183 void
1184 nfs4_setup_lock_args(nfs4_lock_owner_t *lop, nfs4_open_owner_t *oop,
1185 	nfs4_open_stream_t *osp, clientid4 clientid, locker4 *locker)
1186 {
1187 	ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);
1188 	if (lop->lo_just_created == NFS4_JUST_CREATED) {
1189 		/* this is a new lock request */
1190 		open_to_lock_owner4 *nown;
1191 
1192 		ASSERT(oop != NULL);
1193 		ASSERT(osp != NULL);
1194 
1195 		locker->new_lock_owner = TRUE;
1196 		nown = &locker->locker4_u.open_owner;
1197 		nown->open_seqid = nfs4_get_open_seqid(oop) + 1;
1198 		mutex_enter(&osp->os_sync_lock);
1199 		nown->open_stateid = osp->open_stateid;
1200 		mutex_exit(&osp->os_sync_lock);
1201 		nown->lock_seqid = lop->lock_seqid; /* initial, so no +1 */
1202 
1203 		nown->lock_owner.clientid = clientid;
1204 		nown->lock_owner.owner_len = sizeof (lop->lock_owner_name);
1205 		nown->lock_owner.owner_val =
1206 			kmem_alloc(nown->lock_owner.owner_len, KM_SLEEP);
1207 		bcopy(&lop->lock_owner_name, nown->lock_owner.owner_val,
1208 			nown->lock_owner.owner_len);
1209 	} else {
1210 		exist_lock_owner4 *eown;
1211 		/* have an existing lock owner */
1212 
1213 		locker->new_lock_owner = FALSE;
1214 		eown = &locker->locker4_u.lock_owner;
1215 		mutex_enter(&lop->lo_lock);
1216 		eown->lock_stateid = lop->lock_stateid;
1217 		mutex_exit(&lop->lo_lock);
1218 		eown->lock_seqid = lop->lock_seqid + 1;
1219 	}
1220 }
1221 
1222 /*
1223  * This starts our use of the lock owner's lock seqid by setting
1224  * the lo_flags to NFS4_LOCK_SEQID_INUSE.  We will wait (forever)
1225  * with a cv_wait() until we are woken up.
1226  *
1227  * Return values:
1228  * 0		no problems
1229  * EAGAIN	caller should retry (like a recovery retry)
1230  */
1231 int
1232 nfs4_start_lock_seqid_sync(nfs4_lock_owner_t *lop, mntinfo4_t *mi)
1233 {
1234 	int error = 0;
1235 #ifdef DEBUG
1236 	static int ops = 0;		/* fault injection */
1237 #endif
1238 
1239 #ifdef DEBUG
1240 	if (seqid_sync_faults && curthread != mi->mi_recovthread &&
1241 	    ++ops % 7 == 0)
1242 		return (EAGAIN);
1243 #endif
1244 
1245 	mutex_enter(&mi->mi_lock);
1246 	if ((mi->mi_flags & MI4_RECOV_ACTIV) &&
1247 	    curthread != mi->mi_recovthread)
1248 		error = EAGAIN;
1249 	mutex_exit(&mi->mi_lock);
1250 	if (error != 0)
1251 		goto done;
1252 
1253 	mutex_enter(&lop->lo_lock);
1254 
1255 	ASSERT(lop->lo_seqid_holder != curthread);
1256 	while (lop->lo_flags & NFS4_LOCK_SEQID_INUSE) {
1257 		NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE,
1258 			"nfs4_start_lock_seqid_sync: waiting on cv"));
1259 
1260 		cv_wait(&lop->lo_cv_seqid_sync, &lop->lo_lock);
1261 	}
1262 	NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE, "nfs4_start_lock_seqid_sync: "
1263 		"NFS4_LOCK_SEQID_INUSE"));
1264 
1265 	lop->lo_flags |= NFS4_LOCK_SEQID_INUSE;
1266 	lop->lo_seqid_holder = curthread;
1267 	mutex_exit(&lop->lo_lock);
1268 
1269 	mutex_enter(&mi->mi_lock);
1270 	if ((mi->mi_flags & MI4_RECOV_ACTIV) &&
1271 	    curthread != mi->mi_recovthread)
1272 		error = EAGAIN;
1273 	mutex_exit(&mi->mi_lock);
1274 
1275 	if (error == EAGAIN)
1276 		nfs4_end_lock_seqid_sync(lop);
1277 
1278 	NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE,
1279 		"nfs4_start_lock_seqid_sync: error=%d", error));
1280 
1281 done:
1282 	return (error);
1283 }
1284 
1285 /*
1286  * This ends our use of the lock owner's lock seqid by setting
1287  * the appropiate flags and issuing a cv_signal to wake up another
1288  * thread waiting to use the lock seqid.
1289  */
1290 void
1291 nfs4_end_lock_seqid_sync(nfs4_lock_owner_t *lop)
1292 {
1293 	mutex_enter(&lop->lo_lock);
1294 	ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);
1295 	ASSERT(lop->lo_seqid_holder == curthread);
1296 	lop->lo_flags &= ~NFS4_LOCK_SEQID_INUSE;
1297 	lop->lo_seqid_holder = NULL;
1298 	cv_broadcast(&lop->lo_cv_seqid_sync);
1299 	mutex_exit(&lop->lo_lock);
1300 }
1301 
1302 /*
1303  * Returns a reference to a lock owner via lopp, which has its lock seqid
1304  * synchronization started.
1305  * If the lock owner is in the 'just_created' state, then we return its open
1306  * owner and open stream and start the open seqid synchronization.
1307  *
1308  * Return value:
1309  * NFS4_OK		no problems
1310  * NFS4ERR_DELAY	there is lost state to recover; caller should retry
1311  * NFS4ERR_IO		no open stream
1312  */
1313 nfsstat4
1314 nfs4_find_or_create_lock_owner(pid_t pid, rnode4_t *rp, cred_t *cr,
1315 	nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp,
1316 	nfs4_lock_owner_t **lopp)
1317 {
1318 	nfs4_lock_owner_t *lop, *next_lop;
1319 	mntinfo4_t *mi;
1320 	int error = 0;
1321 	nfsstat4 stat;
1322 
1323 	mi = VTOMI4(RTOV4(rp));
1324 
1325 	mutex_enter(&rp->r_statev4_lock);
1326 
1327 	lop = rp->r_lo_head.lo_next_rnode;
1328 	while (lop != &rp->r_lo_head) {
1329 		mutex_enter(&lop->lo_lock);
1330 		if (lop->lo_pid == pid && lop->lo_valid != 0) {
1331 			/* Found a matching lock owner */
1332 			NFS4_DEBUG(nfs4_client_state_debug,
1333 				(CE_NOTE, "nfs4_find_or_create_lock_owner: "
1334 				"got a match"));
1335 			lop->lo_ref_count++;
1336 			break;
1337 		}
1338 		next_lop = lop->lo_next_rnode;
1339 		mutex_exit(&lop->lo_lock);
1340 		lop = next_lop;
1341 	}
1342 
1343 	if (lop == &rp->r_lo_head) {
1344 		/* create temporary lock owner */
1345 		lop = create_lock_owner(rp, pid);
1346 	}
1347 	mutex_exit(&rp->r_statev4_lock);
1348 
1349 	/* Have a locked down lock owner struct now */
1350 	if (lop->lo_just_created != NFS4_JUST_CREATED) {
1351 		/* This is an existing lock owner */
1352 		*oopp = NULL;
1353 		*ospp = NULL;
1354 	} else {
1355 		/* Lock owner doesn't exist yet */
1356 
1357 		/* First grab open owner seqid synchronization */
1358 		mutex_exit(&lop->lo_lock);
1359 		*oopp = find_open_owner(cr, NFS4_PERM_CREATED, mi);
1360 		if (*oopp == NULL)
1361 			goto kill_new_lop;
1362 		error = nfs4_start_open_seqid_sync(*oopp, mi);
1363 		if (error == EAGAIN) {
1364 			stat = NFS4ERR_DELAY;
1365 			goto failed;
1366 		}
1367 		*ospp = find_open_stream(*oopp, rp);
1368 		if (*ospp == NULL) {
1369 			nfs4_end_open_seqid_sync(*oopp);
1370 			goto kill_new_lop;
1371 		}
1372 		if ((*ospp)->os_failed_reopen) {
1373 			mutex_exit(&(*ospp)->os_sync_lock);
1374 			NFS4_DEBUG((nfs4_open_stream_debug ||
1375 				    nfs4_client_lock_debug), (CE_NOTE,
1376 			    "nfs4_find_or_create_lock_owner: os_failed_reopen;"
1377 			    "osp %p, cr %p, rp %s", (void *)(*ospp),
1378 			    (void *)cr, rnode4info(rp)));
1379 			nfs4_end_open_seqid_sync(*oopp);
1380 			stat = NFS4ERR_IO;
1381 			goto failed;
1382 		}
1383 		mutex_exit(&(*ospp)->os_sync_lock);
1384 
1385 		/*
1386 		 * Now see if the lock owner has become permanent while we
1387 		 * had released our lock.
1388 		 */
1389 		mutex_enter(&lop->lo_lock);
1390 		if (lop->lo_just_created != NFS4_JUST_CREATED) {
1391 			nfs4_end_open_seqid_sync(*oopp);
1392 			open_stream_rele(*ospp, rp);
1393 			open_owner_rele(*oopp);
1394 			*oopp = NULL;
1395 			*ospp = NULL;
1396 		}
1397 	}
1398 	mutex_exit(&lop->lo_lock);
1399 
1400 	error = nfs4_start_lock_seqid_sync(lop, mi);
1401 	if (error == EAGAIN) {
1402 		if (*oopp != NULL)
1403 			nfs4_end_open_seqid_sync(*oopp);
1404 		stat = NFS4ERR_DELAY;
1405 		goto failed;
1406 	}
1407 	ASSERT(error == 0);
1408 
1409 	*lopp = lop;
1410 	return (NFS4_OK);
1411 
1412 kill_new_lop:
1413 	/*
1414 	 * A previous CLOSE was attempted but got EINTR, but the application
1415 	 * continued to use the unspecified state file descriptor.  But now the
1416 	 * open stream is gone (which could also destroy the open owner), hence
1417 	 * we can no longer continue.  The calling function should return EIO
1418 	 * to the application.
1419 	 */
1420 	NFS4_DEBUG(nfs4_lost_rqst_debug || nfs4_client_lock_debug,
1421 	    (CE_NOTE, "nfs4_find_or_create_lock_owner: destroy newly created "
1422 	    "lop %p, oop %p, osp %p", (void *)lop, (void *)(*oopp),
1423 	    (void *)(*ospp)));
1424 
1425 	nfs4_rnode_remove_lock_owner(rp, lop);
1426 	stat = NFS4ERR_IO;
1427 
1428 failed:
1429 	lock_owner_rele(lop);
1430 	if (*oopp) {
1431 		open_owner_rele(*oopp);
1432 		*oopp = NULL;
1433 	}
1434 	if (*ospp) {
1435 		open_stream_rele(*ospp, rp);
1436 		*ospp = NULL;
1437 	}
1438 	return (stat);
1439 }
1440 
1441 /*
1442  * This function grabs a recently freed open owner off of the freed open
1443  * owner list if there is a match on the cred 'cr'.  It returns NULL if no
1444  * such match is found.  It will set the 'oo_ref_count' and 'oo_valid' back
1445  * to both 1 (sane values) in the case a match is found.
1446  */
1447 static nfs4_open_owner_t *
1448 find_freed_open_owner(cred_t *cr, nfs4_oo_hash_bucket_t *bucketp,
1449 	mntinfo4_t *mi)
1450 {
1451 	nfs4_open_owner_t		*foop;
1452 
1453 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
1454 		"find_freed_open_owner: cred %p", (void*)cr));
1455 
1456 	ASSERT(mutex_owned(&mi->mi_lock));
1457 	ASSERT(mutex_owned(&bucketp->b_lock));
1458 
1459 	/* got hash bucket, search through freed open owners */
1460 	for (foop = list_head(&mi->mi_foo_list); foop != NULL;
1461 	    foop = list_next(&mi->mi_foo_list, foop)) {
1462 		if (!crcmp(foop->oo_cred, cr)) {
1463 			NFS4_DEBUG(nfs4_client_foo_debug, (CE_NOTE,
1464 				"find_freed_open_owner: got a match open owner "
1465 				"%p", (void *)foop));
1466 			foop->oo_ref_count = 1;
1467 			foop->oo_valid = 1;
1468 			list_remove(&mi->mi_foo_list, foop);
1469 			mi->mi_foo_num--;
1470 
1471 			/* now add the struct into the cred hash table */
1472 			list_insert_head(&bucketp->b_oo_hash_list, foop);
1473 			return (foop);
1474 		}
1475 	}
1476 
1477 	return (NULL);
1478 }
1479 
1480 /*
1481  * Insert the newly freed 'oop' into the mi's freed oop list,
1482  * always at the head of the list.  If we've already reached
1483  * our maximum allowed number of freed open owners (mi_foo_max),
1484  * then remove the LRU open owner on the list (namely the tail).
1485  */
1486 static void
1487 nfs4_free_open_owner(nfs4_open_owner_t *oop, mntinfo4_t *mi)
1488 {
1489 	nfs4_open_owner_t *lru_foop;
1490 
1491 	if (mi->mi_foo_num < mi->mi_foo_max) {
1492 		NFS4_DEBUG(nfs4_client_foo_debug, (CE_NOTE,
1493 			"nfs4_free_open_owner: num free %d, max free %d, "
1494 			"insert open owner %p for mntinfo4 %p",
1495 			mi->mi_foo_num, mi->mi_foo_max, (void *)oop,
1496 			(void *)mi));
1497 		list_insert_head(&mi->mi_foo_list, oop);
1498 		mi->mi_foo_num++;
1499 		return;
1500 	}
1501 
1502 	/* need to replace a freed open owner */
1503 
1504 	lru_foop = list_tail(&mi->mi_foo_list);
1505 
1506 	NFS4_DEBUG(nfs4_client_foo_debug, (CE_NOTE,
1507 	    "nfs4_free_open_owner: destroy %p, insert %p",
1508 	    (void *)lru_foop, (void *)oop));
1509 
1510 	list_remove(&mi->mi_foo_list, lru_foop);
1511 	nfs4_destroy_open_owner(lru_foop);
1512 
1513 	/* head always has latest freed oop */
1514 	list_insert_head(&mi->mi_foo_list, oop);
1515 }
1516 
1517 void
1518 nfs4_destroy_open_owner(nfs4_open_owner_t *oop)
1519 {
1520 	ASSERT(oop != NULL);
1521 
1522 	crfree(oop->oo_cred);
1523 	if (oop->oo_cred_otw)
1524 		crfree(oop->oo_cred_otw);
1525 	mutex_destroy(&oop->oo_lock);
1526 	cv_destroy(&oop->oo_cv_seqid_sync);
1527 	kmem_free(oop, sizeof (*oop));
1528 }
1529 
1530 seqid4
1531 nfs4_get_open_seqid(nfs4_open_owner_t *oop)
1532 {
1533 	ASSERT(oop->oo_seqid_inuse);
1534 	return (oop->oo_seqid);
1535 }
1536 
1537 /*
1538  * This set's the open seqid for a <open owner/ mntinfo4> pair.
1539  */
1540 void
1541 nfs4_set_open_seqid(seqid4 seqid, nfs4_open_owner_t *oop,
1542 	nfs4_tag_type_t tag_type)
1543 {
1544 	ASSERT(oop->oo_seqid_inuse);
1545 	oop->oo_seqid = seqid;
1546 	oop->oo_last_good_seqid = seqid;
1547 	oop->oo_last_good_op = tag_type;
1548 }
1549 
1550 /*
1551  * This bumps the current open seqid for the open owner 'oop'.
1552  */
1553 void
1554 nfs4_get_and_set_next_open_seqid(nfs4_open_owner_t *oop,
1555     nfs4_tag_type_t tag_type)
1556 {
1557 	ASSERT(oop->oo_seqid_inuse);
1558 	oop->oo_seqid++;
1559 	oop->oo_last_good_seqid = oop->oo_seqid;
1560 	oop->oo_last_good_op = tag_type;
1561 }
1562 
1563 /*
1564  * If no open owner was provided, this function takes the cred to find an
1565  * open owner within the given mntinfo4_t.  Either way we return the
1566  * open owner's OTW credential if it exists; otherwise returns the
1567  * supplied 'cr'.
1568  *
1569  * A hold is put on the returned credential, and it is up to the caller
1570  * to free the cred.
1571  */
1572 cred_t *
1573 nfs4_get_otw_cred(cred_t *cr, mntinfo4_t *mi, nfs4_open_owner_t *provided_oop)
1574 {
1575 	cred_t *ret_cr;
1576 	nfs4_open_owner_t *oop = provided_oop;
1577 
1578 	if (oop == NULL)
1579 		oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
1580 	if (oop != NULL) {
1581 		mutex_enter(&oop->oo_lock);
1582 		if (oop->oo_cred_otw)
1583 			ret_cr = oop->oo_cred_otw;
1584 		else
1585 			ret_cr = cr;
1586 		crhold(ret_cr);
1587 		mutex_exit(&oop->oo_lock);
1588 		if (provided_oop == NULL)
1589 			open_owner_rele(oop);
1590 	} else {
1591 		ret_cr = cr;
1592 		crhold(ret_cr);
1593 	}
1594 	return (ret_cr);
1595 }
1596 
1597 /*
1598  * Retrieves the next open stream in the rnode's list if an open stream
1599  * is provided; otherwise gets the first open stream in the list.
1600  * The open owner for that open stream is then retrieved, and if its
1601  * oo_cred_otw exists then it is returned; otherwise the provided 'cr'
1602  * is returned.  *osp is set to the 'found' open stream.
1603  *
1604  * Note: we don't set *osp to the open stream retrieved via the
1605  * optimized check since that won't necessarily be at the beginning
1606  * of the rnode list, and if that osp doesn't work we'd like to
1607  * check _all_ open streams (starting from the beginning of the
1608  * rnode list).
1609  */
1610 cred_t *
1611 nfs4_get_otw_cred_by_osp(rnode4_t *rp, cred_t *cr,
1612 	nfs4_open_stream_t **osp, bool_t *first_time, bool_t *last_time)
1613 {
1614 	nfs4_open_stream_t *next_osp = NULL;
1615 	cred_t *ret_cr;
1616 
1617 	ASSERT(cr != NULL);
1618 	/*
1619 	 * As an optimization, try to find the open owner
1620 	 * for the cred provided since that's most likely
1621 	 * to work.
1622 	 */
1623 	if (*first_time) {
1624 		nfs4_open_owner_t *oop;
1625 
1626 		oop = find_open_owner(cr, NFS4_PERM_CREATED, VTOMI4(RTOV4(rp)));
1627 		if (oop) {
1628 			next_osp = find_open_stream(oop, rp);
1629 			if (next_osp)
1630 				mutex_exit(&next_osp->os_sync_lock);
1631 			open_owner_rele(oop);
1632 		} else {
1633 			next_osp = NULL;
1634 		}
1635 	} else  {
1636 		int delay_rele = 0;
1637 
1638 		/* return the next open stream for this rnode */
1639 		mutex_enter(&rp->r_os_lock);
1640 		/* Now, no one can add or delete to rp's open streams list */
1641 
1642 		if (*osp) {
1643 			next_osp = list_next(&rp->r_open_streams, *osp);
1644 			/*
1645 			 * Delay the rele of *osp until after we drop
1646 			 * r_os_lock to not deadlock with oo_lock
1647 			 * via an open_stream_rele()->open_owner_rele().
1648 			 */
1649 			delay_rele = 1;
1650 		} else {
1651 			next_osp = list_head(&rp->r_open_streams);
1652 		}
1653 		if (next_osp) {
1654 			nfs4_open_stream_t *tmp_osp;
1655 
1656 			/* find the next valid open stream */
1657 			mutex_enter(&next_osp->os_sync_lock);
1658 			while (next_osp && !next_osp->os_valid) {
1659 				tmp_osp =
1660 				    list_next(&rp->r_open_streams, next_osp);
1661 				mutex_exit(&next_osp->os_sync_lock);
1662 				next_osp = tmp_osp;
1663 				if (next_osp)
1664 					mutex_enter(&next_osp->os_sync_lock);
1665 			}
1666 			if (next_osp) {
1667 				next_osp->os_ref_count++;
1668 				mutex_exit(&next_osp->os_sync_lock);
1669 			}
1670 		}
1671 		mutex_exit(&rp->r_os_lock);
1672 		if (delay_rele)
1673 			open_stream_rele(*osp, rp);
1674 	}
1675 
1676 	if (next_osp) {
1677 		nfs4_open_owner_t *oop;
1678 
1679 		oop = next_osp->os_open_owner;
1680 		mutex_enter(&oop->oo_lock);
1681 		if (oop->oo_cred_otw)
1682 			ret_cr = oop->oo_cred_otw;
1683 		else
1684 			ret_cr = cr;
1685 		crhold(ret_cr);
1686 		mutex_exit(&oop->oo_lock);
1687 		if (*first_time) {
1688 			open_stream_rele(next_osp, rp);
1689 			*osp = NULL;
1690 		} else
1691 			*osp = next_osp;
1692 	} else {
1693 		/* just return the cred provided to us */
1694 		if (*first_time != TRUE)
1695 			*last_time = TRUE;
1696 		*osp = NULL;
1697 		ret_cr = cr;
1698 		crhold(ret_cr);
1699 	}
1700 
1701 	if (*first_time)
1702 		*first_time = FALSE;
1703 	return (ret_cr);
1704 }
1705 
1706 void
1707 nfs4_init_stateid_types(nfs4_stateid_types_t *sid_tp)
1708 {
1709 	bzero(&sid_tp->d_sid, sizeof (stateid4));
1710 	bzero(&sid_tp->l_sid, sizeof (stateid4));
1711 	bzero(&sid_tp->o_sid, sizeof (stateid4));
1712 	sid_tp->cur_sid_type = NO_SID;
1713 }
1714 
1715 void
1716 nfs4_save_stateid(stateid4 *s1, nfs4_stateid_types_t *sid_tp)
1717 {
1718 	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1719 	    "nfs4_save_stateid: saved %s stateid",
1720 	    sid_tp->cur_sid_type == DEL_SID ? "delegation" :
1721 	    sid_tp->cur_sid_type == LOCK_SID ? "lock" :
1722 	    sid_tp->cur_sid_type == OPEN_SID ? "open" : "special"));
1723 
1724 	switch (sid_tp->cur_sid_type) {
1725 	case DEL_SID:
1726 		sid_tp->d_sid = *s1;
1727 		break;
1728 	case LOCK_SID:
1729 		sid_tp->l_sid = *s1;
1730 		break;
1731 	case OPEN_SID:
1732 		sid_tp->o_sid = *s1;
1733 		break;
1734 	case SPEC_SID:
1735 	default:
1736 		cmn_err(CE_PANIC, "nfs4_save_stateid: illegal "
1737 		    "stateid type %d", sid_tp->cur_sid_type);
1738 	}
1739 }
1740 
1741 /*
1742  * We got NFS4ERR_BAD_SEQID.  Setup some arguments to pass to recovery.
1743  * Caller is responsible for freeing.
1744  */
1745 nfs4_bseqid_entry_t *
1746 nfs4_create_bseqid_entry(nfs4_open_owner_t *oop, nfs4_lock_owner_t *lop,
1747     vnode_t *vp, pid_t pid, nfs4_tag_type_t tag, seqid4 seqid)
1748 {
1749 	nfs4_bseqid_entry_t	*bsep;
1750 
1751 	bsep = kmem_alloc(sizeof (*bsep), KM_SLEEP);
1752 	bsep->bs_oop = oop;
1753 	bsep->bs_lop = lop;
1754 	bsep->bs_vp = vp;
1755 	bsep->bs_pid = pid;
1756 	bsep->bs_tag = tag;
1757 	bsep->bs_seqid = seqid;
1758 
1759 	return (bsep);
1760 }
1761 
1762 void
1763 nfs4open_dg_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
1764 	nfs4_open_owner_t *oop, nfs4_open_stream_t *osp, cred_t *cr,
1765 	vnode_t *vp, int access_close, int deny_close)
1766 {
1767 	lost_rqstp->lr_putfirst = FALSE;
1768 
1769 	ASSERT(vp != NULL);
1770 	if (error == ETIMEDOUT || error == EINTR ||
1771 	    NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
1772 		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
1773 			"nfs4open_dg_save_lost_rqst: error %d", error));
1774 
1775 		lost_rqstp->lr_op = OP_OPEN_DOWNGRADE;
1776 		/*
1777 		 * The vp is held and rele'd via the recovery code.
1778 		 * See nfs4_save_lost_rqst.
1779 		 */
1780 		lost_rqstp->lr_vp = vp;
1781 		lost_rqstp->lr_dvp = NULL;
1782 		lost_rqstp->lr_oop = oop;
1783 		lost_rqstp->lr_osp = osp;
1784 		lost_rqstp->lr_lop = NULL;
1785 		lost_rqstp->lr_cr = cr;
1786 		lost_rqstp->lr_flk = NULL;
1787 		lost_rqstp->lr_dg_acc = access_close;
1788 		lost_rqstp->lr_dg_deny = deny_close;
1789 		lost_rqstp->lr_putfirst = FALSE;
1790 	} else {
1791 		lost_rqstp->lr_op = 0;
1792 	}
1793 }
1794 
1795 /*
1796  * Change the access and deny bits of an OPEN.
1797  * If recovery is needed, *recov_credpp is set to the cred used OTW,
1798  * a hold is placed on it, and *recov_seqidp is set to the seqid used OTW.
1799  */
1800 void
1801 nfs4_open_downgrade(int access_close, int deny_close, nfs4_open_owner_t *oop,
1802 	nfs4_open_stream_t *osp, vnode_t *vp, cred_t *cr, nfs4_lost_rqst_t *lrp,
1803 	nfs4_error_t *ep, cred_t **recov_credpp, seqid4 *recov_seqidp)
1804 {
1805 	mntinfo4_t		*mi;
1806 	int			downgrade_acc, downgrade_deny;
1807 	int			new_acc, new_deny;
1808 	COMPOUND4args_clnt	args;
1809 	COMPOUND4res_clnt	res;
1810 	OPEN_DOWNGRADE4res	*odg_res;
1811 	nfs_argop4		argop[3];
1812 	nfs_resop4		*resop;
1813 	rnode4_t		*rp;
1814 	bool_t			needrecov = FALSE;
1815 	int			doqueue = 1;
1816 	seqid4			seqid = 0;
1817 	cred_t			*cred_otw;
1818 	hrtime_t		t;
1819 
1820 	ASSERT(mutex_owned(&osp->os_sync_lock));
1821 #if DEBUG
1822 	mutex_enter(&oop->oo_lock);
1823 	ASSERT(oop->oo_seqid_inuse);
1824 	mutex_exit(&oop->oo_lock);
1825 #endif
1826 
1827 
1828 	if (access_close == 0 && deny_close == 0) {
1829 		nfs4_error_zinit(ep);
1830 		return;
1831 	}
1832 
1833 	cred_otw = nfs4_get_otw_cred(cr, VTOMI4(vp), oop);
1834 
1835 cred_retry:
1836 	nfs4_error_zinit(ep);
1837 	downgrade_acc = 0;
1838 	downgrade_deny = 0;
1839 	mi = VTOMI4(vp);
1840 	rp = VTOR4(vp);
1841 
1842 	/*
1843 	 * Check to see if the open stream got closed before we go OTW,
1844 	 * now that we have acquired the 'os_sync_lock'.
1845 	 */
1846 	if (!osp->os_valid) {
1847 		NFS4_DEBUG(nfs4_client_open_dg, (CE_NOTE, "nfs4_open_downgrade:"
1848 		    " open stream has already been closed, return success"));
1849 		/* error has already been set */
1850 		goto no_args_out;
1851 	}
1852 
1853 	/* If the file failed recovery, just quit. */
1854 	mutex_enter(&rp->r_statelock);
1855 	if (rp->r_flags & R4RECOVERR) {
1856 		mutex_exit(&rp->r_statelock);
1857 		ep->error = EIO;
1858 		goto no_args_out;
1859 	}
1860 	mutex_exit(&rp->r_statelock);
1861 
1862 	seqid = nfs4_get_open_seqid(oop) + 1;
1863 
1864 	NFS4_DEBUG(nfs4_client_open_dg, (CE_NOTE, "nfs4_open_downgrade:"
1865 	    "access_close %d, acc_read %"PRIu64" acc_write %"PRIu64"",
1866 	    access_close, osp->os_share_acc_read, osp->os_share_acc_write));
1867 
1868 	/* If we're closing the last READ, need to downgrade */
1869 	if ((access_close & FREAD) && (osp->os_share_acc_read == 1))
1870 		downgrade_acc |= OPEN4_SHARE_ACCESS_READ;
1871 
1872 	/* if we're closing the last WRITE, need to downgrade */
1873 	if ((access_close & FWRITE) && (osp->os_share_acc_write == 1))
1874 		downgrade_acc |= OPEN4_SHARE_ACCESS_WRITE;
1875 
1876 	downgrade_deny = OPEN4_SHARE_DENY_NONE;
1877 
1878 	new_acc = 0;
1879 	new_deny = 0;
1880 
1881 	/* set our new access and deny share bits */
1882 	if ((osp->os_share_acc_read > 0) &&
1883 	    !(downgrade_acc & OPEN4_SHARE_ACCESS_READ))
1884 		new_acc |= OPEN4_SHARE_ACCESS_READ;
1885 	if ((osp->os_share_acc_write > 0) &&
1886 	    !(downgrade_acc & OPEN4_SHARE_ACCESS_WRITE))
1887 		new_acc |= OPEN4_SHARE_ACCESS_WRITE;
1888 
1889 	new_deny = OPEN4_SHARE_DENY_NONE;
1890 
1891 	NFS4_DEBUG(nfs4_client_open_dg, (CE_NOTE, "nfs4_open_downgrade:"
1892 	    "downgrade acc 0x%x deny 0x%x", downgrade_acc, downgrade_deny));
1893 	NFS4_DEBUG(nfs4_client_open_dg, (CE_NOTE, "nfs4_open_downgrade:"
1894 	    "new acc 0x%x deny 0x%x", new_acc, new_deny));
1895 
1896 	/*
1897 	 * Check to see if we aren't actually doing any downgrade or
1898 	 * if this is the last 'close' but the file is still mmapped.
1899 	 * Skip this if this a lost request resend so we don't decrement
1900 	 * the osp's share counts more than once.
1901 	 */
1902 	if (!lrp &&
1903 	    ((downgrade_acc == 0 && downgrade_deny == 0) ||
1904 	    (new_acc == 0 && new_deny == 0))) {
1905 		/*
1906 		 * No downgrade to do, but still need to
1907 		 * update osp's os_share_* counts.
1908 		 */
1909 		NFS4_DEBUG(nfs4_client_open_dg, (CE_NOTE,
1910 		    "nfs4_open_downgrade: just lower the osp's count by %s",
1911 		    (access_close & FREAD) && (access_close & FWRITE) ?
1912 		    "read and write" : (access_close & FREAD) ? "read" :
1913 		    (access_close & FWRITE) ? "write" : "bogus"));
1914 		if (access_close & FREAD)
1915 			osp->os_share_acc_read--;
1916 		if (access_close & FWRITE)
1917 			osp->os_share_acc_write--;
1918 		osp->os_share_deny_none--;
1919 		nfs4_error_zinit(ep);
1920 
1921 		goto no_args_out;
1922 	}
1923 
1924 	if (osp->os_orig_oo_name != oop->oo_name) {
1925 		ep->error = EIO;
1926 		goto no_args_out;
1927 	}
1928 
1929 	/* setup the COMPOUND args */
1930 	if (lrp)
1931 		args.ctag = TAG_OPEN_DG_LOST;
1932 	else
1933 		args.ctag = TAG_OPEN_DG;
1934 
1935 	args.array_len = 3;
1936 	args.array = argop;
1937 
1938 	/* putfh */
1939 	argop[0].argop = OP_CPUTFH;
1940 	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
1941 
1942 	argop[1].argop = OP_GETATTR;
1943 	argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1944 	argop[1].nfs_argop4_u.opgetattr.mi = mi;
1945 
1946 	ASSERT(mutex_owned(&osp->os_sync_lock));
1947 	ASSERT(osp->os_delegation == FALSE);
1948 
1949 	/* open downgrade */
1950 	argop[2].argop = OP_OPEN_DOWNGRADE;
1951 	argop[2].nfs_argop4_u.opopen_downgrade.open_stateid = osp->open_stateid;
1952 	argop[2].nfs_argop4_u.opopen_downgrade.share_access = new_acc;
1953 	argop[2].nfs_argop4_u.opopen_downgrade.share_deny = new_deny;
1954 	argop[2].nfs_argop4_u.opopen_downgrade.seqid = seqid;
1955 
1956 	t = gethrtime();
1957 
1958 	rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep);
1959 
1960 	if (ep->error == 0 && nfs4_need_to_bump_seqid(&res))
1961 		nfs4_set_open_seqid(seqid, oop, args.ctag);
1962 
1963 	if ((ep->error == EACCES ||
1964 	    (ep->error == 0 && res.status == NFS4ERR_ACCESS)) &&
1965 	    cred_otw != cr) {
1966 		crfree(cred_otw);
1967 		cred_otw = cr;
1968 		crhold(cred_otw);
1969 		if (!ep->error)
1970 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1971 		goto cred_retry;
1972 	}
1973 
1974 	needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp);
1975 
1976 	if (needrecov && recov_credpp) {
1977 		*recov_credpp = cred_otw;
1978 		crhold(*recov_credpp);
1979 		if (recov_seqidp)
1980 			*recov_seqidp = seqid;
1981 	}
1982 
1983 	if (!ep->error && !res.status) {
1984 		/* get the open downgrade results */
1985 		resop = &res.array[2];
1986 		odg_res = &resop->nfs_resop4_u.opopen_downgrade;
1987 
1988 		osp->open_stateid = odg_res->open_stateid;
1989 
1990 		/* set the open streams new access/deny bits */
1991 		if (access_close & FREAD)
1992 			osp->os_share_acc_read--;
1993 		if (access_close & FWRITE)
1994 			osp->os_share_acc_write--;
1995 		osp->os_share_deny_none--;
1996 		osp->os_dc_openacc = new_acc;
1997 
1998 		nfs4_attr_cache(vp,
1999 				&res.array[1].nfs_resop4_u.opgetattr.ga_res,
2000 				t, cred_otw, TRUE, NULL);
2001 	}
2002 
2003 	if (!ep->error)
2004 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2005 
2006 no_args_out:
2007 	crfree(cred_otw);
2008 }
2009 
2010 /*
2011  * If an OPEN request gets ETIMEDOUT or EINTR (that includes bailing out
2012  * because the filesystem was forcibly unmounted) then we don't know if we
2013  * potentially left state dangling on the server, therefore the recovery
2014  * framework makes this call to resend the OPEN request and then undo it.
2015  */
2016 void
2017 nfs4_resend_open_otw(vnode_t **vpp, nfs4_lost_rqst_t *resend_rqstp,
2018 	nfs4_error_t *ep)
2019 {
2020 	COMPOUND4args_clnt	args;
2021 	COMPOUND4res_clnt	res;
2022 	nfs_argop4		argop[4];
2023 	GETFH4res		*gf_res = NULL;
2024 	OPEN4cargs		*open_args;
2025 	OPEN4res		*op_res;
2026 	char			*destcfp;
2027 	int			destclen;
2028 	nfs4_ga_res_t		*garp;
2029 	vnode_t			*dvp = NULL, *vp = NULL;
2030 	rnode4_t		*rp = NULL, *drp = NULL;
2031 	cred_t			*cr = NULL;
2032 	seqid4			seqid;
2033 	nfs4_open_owner_t	*oop = NULL;
2034 	nfs4_open_stream_t	*osp = NULL;
2035 	component4		*srcfp;
2036 	open_claim_type4	claim;
2037 	mntinfo4_t		*mi;
2038 	int			doqueue = 1;
2039 	bool_t			retry_open = FALSE;
2040 	int			created_osp = 0;
2041 	hrtime_t		t;
2042 	char 			*failed_msg = "";
2043 	int			fh_different;
2044 	int			reopen = 0;
2045 
2046 	nfs4_error_zinit(ep);
2047 
2048 	cr = resend_rqstp->lr_cr;
2049 	dvp = resend_rqstp->lr_dvp;
2050 
2051 	vp = *vpp;
2052 	if (vp) {
2053 		ASSERT(nfs4_consistent_type(vp));
2054 		rp = VTOR4(vp);
2055 	}
2056 
2057 	if (rp) {
2058 		/* If the file failed recovery, just quit. */
2059 		mutex_enter(&rp->r_statelock);
2060 		if (rp->r_flags & R4RECOVERR) {
2061 			mutex_exit(&rp->r_statelock);
2062 			ep->error = EIO;
2063 			return;
2064 		}
2065 		mutex_exit(&rp->r_statelock);
2066 	}
2067 
2068 	if (dvp) {
2069 		drp = VTOR4(dvp);
2070 		/* If the parent directory failed recovery, just quit. */
2071 		mutex_enter(&drp->r_statelock);
2072 		if (drp->r_flags & R4RECOVERR) {
2073 			mutex_exit(&drp->r_statelock);
2074 			ep->error = EIO;
2075 			return;
2076 		}
2077 		mutex_exit(&drp->r_statelock);
2078 	} else
2079 		reopen = 1;	/* NULL dvp means this is a reopen */
2080 
2081 	claim = resend_rqstp->lr_oclaim;
2082 	ASSERT(claim == CLAIM_NULL || claim == CLAIM_DELEGATE_CUR);
2083 
2084 	args.ctag = TAG_OPEN_LOST;
2085 	args.array_len = 4;
2086 	args.array = argop;
2087 
2088 	argop[0].argop = OP_CPUTFH;
2089 	if (reopen) {
2090 		ASSERT(vp != NULL);
2091 
2092 		mi = VTOMI4(vp);
2093 		/*
2094 		 * if this is a file mount then
2095 		 * use the mntinfo parentfh
2096 		 */
2097 		argop[0].nfs_argop4_u.opcputfh.sfh =
2098 			(vp->v_flag & VROOT) ? mi->mi_srvparentfh :
2099 						VTOSV(vp)->sv_dfh;
2100 		args.ctag = TAG_REOPEN_LOST;
2101 	} else {
2102 		argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh;
2103 		mi = VTOMI4(dvp);
2104 	}
2105 
2106 	argop[1].argop = OP_COPEN;
2107 	open_args = &argop[1].nfs_argop4_u.opcopen;
2108 	open_args->claim = claim;
2109 
2110 	/*
2111 	 * If we sent over a OPEN with CREATE then the only
2112 	 * thing we care about is to not leave dangling state
2113 	 * on the server, not whether the file we potentially
2114 	 * created remains on the server.  So even though the
2115 	 * lost open request specified a CREATE, we only wish
2116 	 * to do a non-CREATE OPEN.
2117 	 */
2118 	open_args->opentype = OPEN4_NOCREATE;
2119 
2120 	srcfp = &resend_rqstp->lr_ofile;
2121 	destclen = srcfp->utf8string_len;
2122 	destcfp = kmem_alloc(destclen + 1, KM_SLEEP);
2123 	bcopy(srcfp->utf8string_val, destcfp, destclen);
2124 	destcfp[destclen] = '\0';
2125 	if (claim == CLAIM_DELEGATE_CUR) {
2126 		open_args->open_claim4_u.delegate_cur_info.delegate_stateid =
2127 				resend_rqstp->lr_ostateid;
2128 		open_args->open_claim4_u.delegate_cur_info.cfile = destcfp;
2129 	} else {
2130 		open_args->open_claim4_u.cfile = destcfp;
2131 	}
2132 
2133 	open_args->share_access = resend_rqstp->lr_oacc;
2134 	open_args->share_deny = resend_rqstp->lr_odeny;
2135 	oop = resend_rqstp->lr_oop;
2136 	ASSERT(oop != NULL);
2137 
2138 	open_args->owner.clientid = mi2clientid(mi);
2139 	/* this length never changes */
2140 	open_args->owner.owner_len = sizeof (oop->oo_name);
2141 	open_args->owner.owner_val =
2142 	    kmem_alloc(open_args->owner.owner_len, KM_SLEEP);
2143 
2144 	ep->error = nfs4_start_open_seqid_sync(oop, mi);
2145 	ASSERT(ep->error == 0);		/* recov thread always succeeds */
2146 	/*
2147 	 * We can get away with not saving the seqid upon detection
2148 	 * of a lost request, and now just use the open owner's current
2149 	 * seqid since we only allow one op OTW per seqid and lost
2150 	 * requests are saved FIFO.
2151 	 */
2152 	seqid = nfs4_get_open_seqid(oop) + 1;
2153 	open_args->seqid = seqid;
2154 
2155 	bcopy(&oop->oo_name, open_args->owner.owner_val,
2156 	    open_args->owner.owner_len);
2157 
2158 	/* getfh */
2159 	argop[2].argop = OP_GETFH;
2160 
2161 	/* Construct the getattr part of the compound */
2162 	argop[3].argop = OP_GETATTR;
2163 	argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
2164 	argop[3].nfs_argop4_u.opgetattr.mi = mi;
2165 
2166 	res.array = NULL;
2167 
2168 	t = gethrtime();
2169 
2170 	rfs4call(mi, &args, &res, cr, &doqueue, 0, ep);
2171 
2172 	if (ep->error == 0 && nfs4_need_to_bump_seqid(&res))
2173 		nfs4_set_open_seqid(seqid, oop, args.ctag);
2174 
2175 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2176 	    "nfs4_resend_open_otw: error %d stat %d", ep->error, res.status));
2177 
2178 	if (ep->error || res.status)
2179 		goto err_out;
2180 
2181 	op_res = &res.array[1].nfs_resop4_u.opopen;
2182 	gf_res = &res.array[2].nfs_resop4_u.opgetfh;
2183 	garp = &res.array[3].nfs_resop4_u.opgetattr.ga_res;
2184 
2185 	if (!vp) {
2186 		int rnode_err = 0;
2187 		nfs4_sharedfh_t *sfh;
2188 
2189 		/*
2190 		 * If we can't decode all the attributes they are not usable,
2191 		 * just make the vnode.
2192 		 */
2193 
2194 		sfh = sfh4_get(&gf_res->object, VTOMI4(dvp));
2195 		*vpp = makenfs4node(sfh, garp, dvp->v_vfsp, t, cr, dvp,
2196 			fn_get(VTOSV(dvp)->sv_name,
2197 			open_args->open_claim4_u.cfile));
2198 		sfh4_rele(&sfh);
2199 		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2200 		    "nfs4_resend_open_otw: made vp %p for file %s",
2201 		    (void *)(*vpp), open_args->open_claim4_u.cfile));
2202 
2203 		if (ep->error)
2204 			PURGE_ATTRCACHE4(*vpp);
2205 
2206 		/*
2207 		 * For the newly created *vpp case, make sure the rnode
2208 		 * isn't bad before using it.
2209 		 */
2210 		mutex_enter(&(VTOR4(*vpp))->r_statelock);
2211 		if (VTOR4(*vpp)->r_flags & R4RECOVERR)
2212 			rnode_err = EIO;
2213 		mutex_exit(&(VTOR4(*vpp))->r_statelock);
2214 
2215 		if (rnode_err) {
2216 			NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2217 			    "nfs4_resend_open_otw: rp %p is bad",
2218 			    (void *)VTOR4(*vpp)));
2219 			ep->error = rnode_err;
2220 			goto err_out;
2221 		}
2222 
2223 		vp = *vpp;
2224 		rp = VTOR4(vp);
2225 	}
2226 
2227 	if (reopen) {
2228 		/*
2229 		 * Check if the path we reopened really is the same
2230 		 * file. We could end up in a situation where the file
2231 		 * was removed and a new file created with the same name.
2232 		 */
2233 		(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
2234 		fh_different =
2235 			(nfs4cmpfh(&rp->r_fh->sfh_fh, &gf_res->object) != 0);
2236 		if (fh_different) {
2237 			if (mi->mi_fh_expire_type == FH4_PERSISTENT ||
2238 			    mi->mi_fh_expire_type & FH4_NOEXPIRE_WITH_OPEN) {
2239 				/* Oops, we don't have the same file */
2240 				if (mi->mi_fh_expire_type == FH4_PERSISTENT)
2241 					failed_msg =
2242 					    "Couldn't reopen: Persistant "
2243 					    "file handle changed";
2244 				else
2245 					failed_msg =
2246 					    "Couldn't reopen: Volatile "
2247 					    "(no expire on open) file handle "
2248 					    "changed";
2249 
2250 				nfs4_end_open_seqid_sync(oop);
2251 				kmem_free(destcfp, destclen + 1);
2252 				nfs4args_copen_free(open_args);
2253 				(void) xdr_free(xdr_COMPOUND4res_clnt,
2254 						(caddr_t)&res);
2255 				nfs_rw_exit(&mi->mi_fh_lock);
2256 				nfs4_fail_recov(vp, failed_msg, ep->error,
2257 						ep->stat);
2258 				return;
2259 			} else {
2260 				/*
2261 				 * We have volatile file handles that don't
2262 				 * compare.  If the fids are the same then we
2263 				 * assume that the file handle expired but the
2264 				 * rnode still refers to the same file object.
2265 				 *
2266 				 * First check that we have fids or not.
2267 				 * If we don't we have a dumb server so we will
2268 				 * just assume every thing is ok for now.
2269 				 */
2270 				if (!ep->error &&
2271 				    garp->n4g_va.va_mask & AT_NODEID &&
2272 				    rp->r_attr.va_mask & AT_NODEID &&
2273 				    rp->r_attr.va_nodeid !=
2274 					garp->n4g_va.va_nodeid) {
2275 					/*
2276 					 * We have fids, but they don't
2277 					 * compare. So kill the file.
2278 					 */
2279 					failed_msg =
2280 					    "Couldn't reopen: file handle "
2281 					    "changed due to mismatched fids";
2282 					nfs4_end_open_seqid_sync(oop);
2283 					kmem_free(destcfp, destclen + 1);
2284 					nfs4args_copen_free(open_args);
2285 					(void) xdr_free(xdr_COMPOUND4res_clnt,
2286 							(caddr_t)&res);
2287 					nfs_rw_exit(&mi->mi_fh_lock);
2288 					nfs4_fail_recov(vp, failed_msg,
2289 							ep->error, ep->stat);
2290 					return;
2291 				} else {
2292 					/*
2293 					 * We have volatile file handles that
2294 					 * refer to the same file (at least
2295 					 * they have the same fid) or we don't
2296 					 * have fids so we can't tell. :(. We'll
2297 					 * be a kind and accepting client so
2298 					 * we'll update the rnode's file
2299 					 * handle with the otw handle.
2300 					 *
2301 					 * We need to drop mi->mi_fh_lock since
2302 					 * sfh4_update acquires it. Since there
2303 					 * is only one recovery thread there is
2304 					 * no race.
2305 					 */
2306 					nfs_rw_exit(&mi->mi_fh_lock);
2307 					sfh4_update(rp->r_fh, &gf_res->object);
2308 				}
2309 			}
2310 		} else {
2311 			nfs_rw_exit(&mi->mi_fh_lock);
2312 		}
2313 	}
2314 
2315 	ASSERT(nfs4_consistent_type(vp));
2316 
2317 	if (op_res->rflags & OPEN4_RESULT_CONFIRM)
2318 		nfs4open_confirm(vp, &seqid, &op_res->stateid, cr, TRUE,
2319 		    &retry_open, oop, TRUE, ep, NULL);
2320 	if (ep->error || ep->stat) {
2321 		nfs4_end_open_seqid_sync(oop);
2322 		kmem_free(destcfp, destclen + 1);
2323 		nfs4args_copen_free(open_args);
2324 		if (!ep->error)
2325 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2326 		return;
2327 	}
2328 
2329 	if (reopen) {
2330 		/*
2331 		 * Doing a reopen here so the osp should already exist.
2332 		 * If not, something changed or went very wrong.
2333 		 *
2334 		 * returns with 'os_sync_lock' held
2335 		 */
2336 		osp = find_open_stream(oop, rp);
2337 		if (!osp) {
2338 			NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2339 			    "nfs4_resend_open_otw: couldn't find osp"));
2340 			ep->error = EINVAL;
2341 			goto err_out;
2342 		}
2343 		osp->os_open_ref_count++;
2344 	} else {
2345 		mutex_enter(&oop->oo_lock);
2346 		oop->oo_just_created = NFS4_PERM_CREATED;
2347 		mutex_exit(&oop->oo_lock);
2348 
2349 		/* returns with 'os_sync_lock' held */
2350 		osp = find_or_create_open_stream(oop, rp, &created_osp);
2351 		if (!osp) {
2352 			NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2353 			    "nfs4_resend_open_otw: couldn't create osp"));
2354 			ep->error = EINVAL;
2355 			goto err_out;
2356 		}
2357 	}
2358 
2359 	osp->open_stateid = op_res->stateid;
2360 	osp->os_delegation = FALSE;
2361 	/*
2362 	 * Need to reset this bitfield for the possible case where we were
2363 	 * going to OTW CLOSE the file, got a non-recoverable error, and before
2364 	 * we could retry the CLOSE, OPENed the file again.
2365 	 */
2366 	ASSERT(osp->os_open_owner->oo_seqid_inuse);
2367 	osp->os_final_close = 0;
2368 	osp->os_force_close = 0;
2369 
2370 	if (!reopen) {
2371 		if (open_args->share_access & OPEN4_SHARE_ACCESS_READ)
2372 			osp->os_share_acc_read++;
2373 		if (open_args->share_access & OPEN4_SHARE_ACCESS_WRITE)
2374 			osp->os_share_acc_write++;
2375 		osp->os_share_deny_none++;
2376 	}
2377 
2378 	mutex_exit(&osp->os_sync_lock);
2379 	if (created_osp)
2380 		nfs4_inc_state_ref_count(mi);
2381 	open_stream_rele(osp, rp);
2382 
2383 	nfs4_end_open_seqid_sync(oop);
2384 
2385 	/* accept delegation, if any */
2386 	nfs4_delegation_accept(rp, claim, op_res, garp, cr);
2387 
2388 	kmem_free(destcfp, destclen + 1);
2389 	nfs4args_copen_free(open_args);
2390 
2391 	if (claim == CLAIM_DELEGATE_CUR)
2392 		nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
2393 	else
2394 		PURGE_ATTRCACHE4(vp);
2395 
2396 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2397 
2398 	ASSERT(nfs4_consistent_type(vp));
2399 
2400 	return;
2401 
2402 err_out:
2403 	nfs4_end_open_seqid_sync(oop);
2404 	kmem_free(destcfp, destclen + 1);
2405 	nfs4args_copen_free(open_args);
2406 	if (!ep->error)
2407 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2408 }
2409