1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2017 by Delphix. All rights reserved.
24 */
25
26 /*
27 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
28 * All Rights Reserved
29 */
30
31 #include <sys/param.h>
32 #include <sys/types.h>
33 #include <sys/systm.h>
34 #include <sys/thread.h>
35 #include <sys/t_lock.h>
36 #include <sys/time.h>
37 #include <sys/vnode.h>
38 #include <sys/vfs.h>
39 #include <sys/errno.h>
40 #include <sys/buf.h>
41 #include <sys/stat.h>
42 #include <sys/cred.h>
43 #include <sys/kmem.h>
44 #include <sys/debug.h>
45 #include <sys/dnlc.h>
46 #include <sys/vmsystm.h>
47 #include <sys/flock.h>
48 #include <sys/share.h>
49 #include <sys/cmn_err.h>
50 #include <sys/tiuser.h>
51 #include <sys/sysmacros.h>
52 #include <sys/callb.h>
53 #include <sys/acl.h>
54 #include <sys/kstat.h>
55 #include <sys/signal.h>
56 #include <sys/disp.h>
57 #include <sys/atomic.h>
58 #include <sys/list.h>
59 #include <sys/sdt.h>
60
61 #include <rpc/types.h>
62 #include <rpc/xdr.h>
63 #include <rpc/auth.h>
64 #include <rpc/clnt.h>
65
66 #include <nfs/nfs.h>
67 #include <nfs/nfs_clnt.h>
68 #include <nfs/nfs_acl.h>
69
70 #include <nfs/nfs4.h>
71 #include <nfs/rnode4.h>
72 #include <nfs/nfs4_clnt.h>
73
74 #include <vm/hat.h>
75 #include <vm/as.h>
76 #include <vm/page.h>
77 #include <vm/pvn.h>
78 #include <vm/seg.h>
79 #include <vm/seg_map.h>
80 #include <vm/seg_vn.h>
81
82 #include <sys/ddi.h>
83
84 /*
85 * Arguments to page-flush thread.
86 */
87 typedef struct {
88 vnode_t *vp;
89 cred_t *cr;
90 } pgflush_t;
91
#ifdef DEBUG
/* Debug switches; these exist only in DEBUG builds. */
int nfs4_client_lease_debug;
int nfs4_sharedfh_debug;
int nfs4_fname_debug;

/* temporary: panic if v_type is inconsistent with r_attr va_type */
int nfs4_vtype_debug;

/*
 * Thread-specific-data key.  nfs4_purge_stale_fh() asserts that the
 * current thread's slot for this key is NULL.
 */
uint_t nfs4_tsd_key;
#endif
102
103 static time_t nfs4_client_resumed = 0;
104 static callb_id_t cid = 0;
105
106 static int nfs4renew(nfs4_server_t *);
107 static void nfs4_attrcache_va(vnode_t *, nfs4_ga_res_t *, int);
108 static void nfs4_pgflush_thread(pgflush_t *);
109
110 static boolean_t nfs4_client_cpr_callb(void *, int);
111
112 struct mi4_globals {
113 kmutex_t mig_lock; /* lock protecting mig_list */
114 list_t mig_list; /* list of NFS v4 mounts in zone */
115 boolean_t mig_destructor_called;
116 };
117
118 static zone_key_t mi4_list_key;
119
120 /*
121 * Attributes caching:
122 *
123 * Attributes are cached in the rnode in struct vattr form.
124 * There is a time associated with the cached attributes (r_time_attr_inval)
125 * which tells whether the attributes are valid. The time is initialized
126 * to the difference between current time and the modify time of the vnode
127 * when new attributes are cached. This allows the attributes for
128 * files that have changed recently to be timed out sooner than for files
129 * that have not changed for a long time. There are minimum and maximum
130 * timeout values that can be set per mount point.
131 */
132
133 /*
134 * If a cache purge is in progress, wait for it to finish.
135 *
136 * The current thread must not be in the middle of an
137 * nfs4_start_op/nfs4_end_op region. Otherwise, there could be a deadlock
138 * between this thread, a recovery thread, and the page flush thread.
139 */
140 int
nfs4_waitfor_purge_complete(vnode_t * vp)141 nfs4_waitfor_purge_complete(vnode_t *vp)
142 {
143 rnode4_t *rp;
144 k_sigset_t smask;
145
146 rp = VTOR4(vp);
147 if ((rp->r_serial != NULL && rp->r_serial != curthread) ||
148 ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread)) {
149 mutex_enter(&rp->r_statelock);
150 sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT);
151 while ((rp->r_serial != NULL && rp->r_serial != curthread) ||
152 ((rp->r_flags & R4PGFLUSH) &&
153 rp->r_pgflush != curthread)) {
154 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
155 sigunintr(&smask);
156 mutex_exit(&rp->r_statelock);
157 return (EINTR);
158 }
159 }
160 sigunintr(&smask);
161 mutex_exit(&rp->r_statelock);
162 }
163 return (0);
164 }
165
166 /*
167 * Validate caches by checking cached attributes. If they have timed out,
168 * then get new attributes from the server. As a side effect, cache
169 * invalidation is done if the attributes have changed.
170 *
171 * If the attributes have not timed out and if there is a cache
172 * invalidation being done by some other thread, then wait until that
173 * thread has completed the cache invalidation.
174 */
175 int
nfs4_validate_caches(vnode_t * vp,cred_t * cr)176 nfs4_validate_caches(vnode_t *vp, cred_t *cr)
177 {
178 int error;
179 nfs4_ga_res_t gar;
180
181 if (ATTRCACHE4_VALID(vp)) {
182 error = nfs4_waitfor_purge_complete(vp);
183 if (error)
184 return (error);
185 return (0);
186 }
187
188 return (nfs4_getattr_otw(vp, &gar, cr, 0));
189 }
190
191 /*
192 * Fill in attribute from the cache.
193 * If valid, then return 0 to indicate that no error occurred,
194 * otherwise return 1 to indicate that an error occurred.
195 */
196 static int
nfs4_getattr_cache(vnode_t * vp,struct vattr * vap)197 nfs4_getattr_cache(vnode_t *vp, struct vattr *vap)
198 {
199 rnode4_t *rp;
200
201 rp = VTOR4(vp);
202 mutex_enter(&rp->r_statelock);
203 mutex_enter(&rp->r_statev4_lock);
204 if (ATTRCACHE4_VALID(vp)) {
205 mutex_exit(&rp->r_statev4_lock);
206 /*
207 * Cached attributes are valid
208 */
209 *vap = rp->r_attr;
210 mutex_exit(&rp->r_statelock);
211 return (0);
212 }
213 mutex_exit(&rp->r_statev4_lock);
214 mutex_exit(&rp->r_statelock);
215 return (1);
216 }
217
218
219 /*
220 * If returned error is ESTALE flush all caches. The nfs4_purge_caches()
221 * call is synchronous because all the pages were invalidated by the
222 * nfs4_invalidate_pages() call.
223 */
224 void
nfs4_purge_stale_fh(int errno,vnode_t * vp,cred_t * cr)225 nfs4_purge_stale_fh(int errno, vnode_t *vp, cred_t *cr)
226 {
227 struct rnode4 *rp = VTOR4(vp);
228
229 /* Ensure that the ..._end_op() call has been done */
230 ASSERT(tsd_get(nfs4_tsd_key) == NULL);
231
232 if (errno != ESTALE)
233 return;
234
235 mutex_enter(&rp->r_statelock);
236 rp->r_flags |= R4STALE;
237 if (!rp->r_error)
238 rp->r_error = errno;
239 mutex_exit(&rp->r_statelock);
240 if (nfs4_has_pages(vp))
241 nfs4_invalidate_pages(vp, (u_offset_t)0, cr);
242 nfs4_purge_caches(vp, NFS4_PURGE_DNLC, cr, FALSE);
243 }
244
245 /*
246 * Purge all of the various NFS `data' caches. If "asyncpg" is TRUE, the
247 * page purge is done asynchronously.
248 */
249 void
nfs4_purge_caches(vnode_t * vp,int purge_dnlc,cred_t * cr,int asyncpg)250 nfs4_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr, int asyncpg)
251 {
252 rnode4_t *rp;
253 char *contents;
254 vnode_t *xattr;
255 int size;
256 int pgflush; /* are we the page flush thread? */
257
258 /*
259 * Purge the DNLC for any entries which refer to this file.
260 */
261 if (vp->v_count > 1 &&
262 (vp->v_type == VDIR || purge_dnlc == NFS4_PURGE_DNLC))
263 dnlc_purge_vp(vp);
264
265 /*
266 * Clear any readdir state bits and purge the readlink response cache.
267 */
268 rp = VTOR4(vp);
269 mutex_enter(&rp->r_statelock);
270 rp->r_flags &= ~R4LOOKUP;
271 contents = rp->r_symlink.contents;
272 size = rp->r_symlink.size;
273 rp->r_symlink.contents = NULL;
274
275 xattr = rp->r_xattr_dir;
276 rp->r_xattr_dir = NULL;
277
278 /*
279 * Purge pathconf cache too.
280 */
281 rp->r_pathconf.pc4_xattr_valid = 0;
282 rp->r_pathconf.pc4_cache_valid = 0;
283
284 pgflush = (curthread == rp->r_pgflush);
285 mutex_exit(&rp->r_statelock);
286
287 if (contents != NULL) {
288
289 kmem_free((void *)contents, size);
290 }
291
292 if (xattr != NULL)
293 VN_RELE(xattr);
294
295 /*
296 * Flush the page cache. If the current thread is the page flush
297 * thread, don't initiate a new page flush. There's no need for
298 * it, and doing it correctly is hard.
299 */
300 if (nfs4_has_pages(vp) && !pgflush) {
301 if (!asyncpg) {
302 (void) nfs4_waitfor_purge_complete(vp);
303 nfs4_flush_pages(vp, cr);
304 } else {
305 pgflush_t *args;
306
307 /*
308 * We don't hold r_statelock while creating the
309 * thread, in case the call blocks. So we use a
310 * flag to indicate that a page flush thread is
311 * active.
312 */
313 mutex_enter(&rp->r_statelock);
314 if (rp->r_flags & R4PGFLUSH) {
315 mutex_exit(&rp->r_statelock);
316 } else {
317 rp->r_flags |= R4PGFLUSH;
318 mutex_exit(&rp->r_statelock);
319
320 args = kmem_alloc(sizeof (pgflush_t),
321 KM_SLEEP);
322 args->vp = vp;
323 VN_HOLD(args->vp);
324 args->cr = cr;
325 crhold(args->cr);
326 (void) zthread_create(NULL, 0,
327 nfs4_pgflush_thread, args, 0,
328 minclsyspri);
329 }
330 }
331 }
332
333 /*
334 * Flush the readdir response cache.
335 */
336 nfs4_purge_rddir_cache(vp);
337 }
338
339 /*
340 * Invalidate all pages for the given file, after writing back the dirty
341 * ones.
342 */
343
344 void
nfs4_flush_pages(vnode_t * vp,cred_t * cr)345 nfs4_flush_pages(vnode_t *vp, cred_t *cr)
346 {
347 int error;
348 rnode4_t *rp = VTOR4(vp);
349
350 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL);
351 if (error == ENOSPC || error == EDQUOT) {
352 mutex_enter(&rp->r_statelock);
353 if (!rp->r_error)
354 rp->r_error = error;
355 mutex_exit(&rp->r_statelock);
356 }
357 }
358
359 /*
360 * Page flush thread.
361 */
362
363 static void
nfs4_pgflush_thread(pgflush_t * args)364 nfs4_pgflush_thread(pgflush_t *args)
365 {
366 rnode4_t *rp = VTOR4(args->vp);
367
368 /* remember which thread we are, so we don't deadlock ourselves */
369 mutex_enter(&rp->r_statelock);
370 ASSERT(rp->r_pgflush == NULL);
371 rp->r_pgflush = curthread;
372 mutex_exit(&rp->r_statelock);
373
374 nfs4_flush_pages(args->vp, args->cr);
375
376 mutex_enter(&rp->r_statelock);
377 rp->r_pgflush = NULL;
378 rp->r_flags &= ~R4PGFLUSH;
379 cv_broadcast(&rp->r_cv);
380 mutex_exit(&rp->r_statelock);
381
382 VN_RELE(args->vp);
383 crfree(args->cr);
384 kmem_free(args, sizeof (pgflush_t));
385 zthread_exit();
386 }
387
388 /*
389 * Purge the readdir cache of all entries which are not currently
390 * being filled.
391 */
392 void
nfs4_purge_rddir_cache(vnode_t * vp)393 nfs4_purge_rddir_cache(vnode_t *vp)
394 {
395 rnode4_t *rp;
396
397 rp = VTOR4(vp);
398
399 mutex_enter(&rp->r_statelock);
400 rp->r_direof = NULL;
401 rp->r_flags &= ~R4LOOKUP;
402 rp->r_flags |= R4READDIRWATTR;
403 rddir4_cache_purge(rp);
404 mutex_exit(&rp->r_statelock);
405 }
406
407 /*
408 * Set attributes cache for given vnode using virtual attributes. There is
409 * no cache validation, but if the attributes are deemed to be stale, they
410 * are ignored. This corresponds to nfs3_attrcache().
411 *
412 * Set the timeout value on the attribute cache and fill it
413 * with the passed in attributes.
414 */
415 void
nfs4_attrcache_noinval(vnode_t * vp,nfs4_ga_res_t * garp,hrtime_t t)416 nfs4_attrcache_noinval(vnode_t *vp, nfs4_ga_res_t *garp, hrtime_t t)
417 {
418 rnode4_t *rp = VTOR4(vp);
419
420 mutex_enter(&rp->r_statelock);
421 if (rp->r_time_attr_saved <= t)
422 nfs4_attrcache_va(vp, garp, FALSE);
423 mutex_exit(&rp->r_statelock);
424 }
425
426 /*
427 * Use the passed in virtual attributes to check to see whether the
428 * data and metadata caches are valid, cache the new attributes, and
429 * then do the cache invalidation if required.
430 *
431 * The cache validation and caching of the new attributes is done
432 * atomically via the use of the mutex, r_statelock. If required,
433 * the cache invalidation is done atomically w.r.t. the cache
434 * validation and caching of the attributes via the pseudo lock,
435 * r_serial.
436 *
437 * This routine is used to do cache validation and attributes caching
438 * for operations with a single set of post operation attributes.
439 */
440
441 void
nfs4_attr_cache(vnode_t * vp,nfs4_ga_res_t * garp,hrtime_t t,cred_t * cr,int async,change_info4 * cinfo)442 nfs4_attr_cache(vnode_t *vp, nfs4_ga_res_t *garp,
443 hrtime_t t, cred_t *cr, int async,
444 change_info4 *cinfo)
445 {
446 rnode4_t *rp;
447 int mtime_changed = 0;
448 int ctime_changed = 0;
449 vsecattr_t *vsp;
450 int was_serial, set_time_cache_inval, recov;
451 vattr_t *vap = &garp->n4g_va;
452 mntinfo4_t *mi = VTOMI4(vp);
453 len_t preattr_rsize;
454 boolean_t writemodify_set = B_FALSE;
455 boolean_t cachepurge_set = B_FALSE;
456
457 ASSERT(mi->mi_vfsp->vfs_dev == garp->n4g_va.va_fsid);
458
459 /* Is curthread the recovery thread? */
460 mutex_enter(&mi->mi_lock);
461 recov = (VTOMI4(vp)->mi_recovthread == curthread);
462 mutex_exit(&mi->mi_lock);
463
464 rp = VTOR4(vp);
465 mutex_enter(&rp->r_statelock);
466 was_serial = (rp->r_serial == curthread);
467 if (rp->r_serial != NULL && !was_serial) {
468 /*
469 * Purge current attrs and bail out to avoid potential deadlock
470 * between another thread caching attrs (r_serial thread), this
471 * thread, and a thread trying to read or write pages.
472 */
473 PURGE_ATTRCACHE4_LOCKED(rp);
474 mutex_exit(&rp->r_statelock);
475 return;
476 }
477
478 /*
479 * If there is a page flush thread, the current thread needs to
480 * bail out, to prevent a possible deadlock between the current
481 * thread (which might be in a start_op/end_op region), the
482 * recovery thread, and the page flush thread. Expire the
483 * attribute cache, so that any attributes the current thread was
484 * going to set are not lost.
485 */
486 if ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread) {
487 PURGE_ATTRCACHE4_LOCKED(rp);
488 mutex_exit(&rp->r_statelock);
489 return;
490 }
491
492 if (rp->r_time_attr_saved > t) {
493 /*
494 * Attributes have been cached since these attributes were
495 * probably made. If there is an inconsistency in what is
496 * cached, mark them invalid. If not, don't act on them.
497 */
498 if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
499 PURGE_ATTRCACHE4_LOCKED(rp);
500 mutex_exit(&rp->r_statelock);
501 return;
502 }
503 set_time_cache_inval = 0;
504 if (cinfo) {
505 /*
506 * Only directory modifying callers pass non-NULL cinfo.
507 */
508 ASSERT(vp->v_type == VDIR);
509 /*
510 * If the cache timeout either doesn't exist or hasn't expired,
511 * and dir didn't changed on server before dirmod op
512 * and dir didn't change after dirmod op but before getattr
513 * then there's a chance that the client's cached data for
514 * this object is current (not stale). No immediate cache
515 * flush is required.
516 *
517 */
518 if ((! rp->r_time_cache_inval || t < rp->r_time_cache_inval) &&
519 cinfo->before == rp->r_change &&
520 (garp->n4g_change_valid &&
521 cinfo->after == garp->n4g_change)) {
522
523 /*
524 * If atomic isn't set, then the before/after info
525 * cannot be blindly trusted. For this case, we tell
526 * nfs4_attrcache_va to cache the attrs but also
527 * establish an absolute maximum cache timeout. When
528 * the timeout is reached, caches will be flushed.
529 */
530 if (! cinfo->atomic)
531 set_time_cache_inval = 1;
532 } else {
533
534 /*
535 * We're not sure exactly what changed, but we know
536 * what to do. flush all caches for dir. remove the
537 * attr timeout.
538 *
539 * a) timeout expired. flush all caches.
540 * b) r_change != cinfo.before. flush all caches.
541 * c) r_change == cinfo.before, but cinfo.after !=
542 * post-op getattr(change). flush all caches.
543 * d) post-op getattr(change) not provided by server.
544 * flush all caches.
545 */
546 mtime_changed = 1;
547 ctime_changed = 1;
548 rp->r_time_cache_inval = 0;
549 }
550 } else {
551 /*
552 * Write thread after writing data to file on remote server,
553 * will always set R4WRITEMODIFIED to indicate that file on
554 * remote server was modified with a WRITE operation and would
555 * have marked attribute cache as timed out. If R4WRITEMODIFIED
556 * is set, then do not check for mtime and ctime change.
557 */
558 if (!(rp->r_flags & R4WRITEMODIFIED)) {
559 if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
560 mtime_changed = 1;
561
562 if (rp->r_attr.va_ctime.tv_sec !=
563 vap->va_ctime.tv_sec ||
564 rp->r_attr.va_ctime.tv_nsec !=
565 vap->va_ctime.tv_nsec)
566 ctime_changed = 1;
567
568 /*
569 * If the change attribute was not provided by server
570 * or it differs, then flush all caches.
571 */
572 if (!garp->n4g_change_valid ||
573 rp->r_change != garp->n4g_change) {
574 mtime_changed = 1;
575 ctime_changed = 1;
576 }
577 } else {
578 writemodify_set = B_TRUE;
579 }
580 }
581
582 preattr_rsize = rp->r_size;
583
584 nfs4_attrcache_va(vp, garp, set_time_cache_inval);
585
586 /*
587 * If we have updated filesize in nfs4_attrcache_va, as soon as we
588 * drop statelock we will be in transition of purging all
589 * our caches and updating them. It is possible for another
590 * thread to pick this new file size and read in zeroed data.
591 * stall other threads till cache purge is complete.
592 */
593 if ((!cinfo) && (rp->r_size != preattr_rsize)) {
594 /*
595 * If R4WRITEMODIFIED was set and we have updated the file
596 * size, Server's returned file size need not necessarily
597 * be because of this Client's WRITE. We need to purge
598 * all caches.
599 */
600 if (writemodify_set)
601 mtime_changed = 1;
602
603 if (mtime_changed && !(rp->r_flags & R4INCACHEPURGE)) {
604 rp->r_flags |= R4INCACHEPURGE;
605 cachepurge_set = B_TRUE;
606 }
607 }
608
609 if (!mtime_changed && !ctime_changed) {
610 mutex_exit(&rp->r_statelock);
611 return;
612 }
613
614 rp->r_serial = curthread;
615
616 mutex_exit(&rp->r_statelock);
617
618 /*
619 * If we're the recov thread, then force async nfs4_purge_caches
620 * to avoid potential deadlock.
621 */
622 if (mtime_changed)
623 nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, recov ? 1 : async);
624
625 if ((rp->r_flags & R4INCACHEPURGE) && cachepurge_set) {
626 mutex_enter(&rp->r_statelock);
627 rp->r_flags &= ~R4INCACHEPURGE;
628 cv_broadcast(&rp->r_cv);
629 mutex_exit(&rp->r_statelock);
630 cachepurge_set = B_FALSE;
631 }
632
633 if (ctime_changed) {
634 (void) nfs4_access_purge_rp(rp);
635 if (rp->r_secattr != NULL) {
636 mutex_enter(&rp->r_statelock);
637 vsp = rp->r_secattr;
638 rp->r_secattr = NULL;
639 mutex_exit(&rp->r_statelock);
640 if (vsp != NULL)
641 nfs4_acl_free_cache(vsp);
642 }
643 }
644
645 if (!was_serial) {
646 mutex_enter(&rp->r_statelock);
647 rp->r_serial = NULL;
648 cv_broadcast(&rp->r_cv);
649 mutex_exit(&rp->r_statelock);
650 }
651 }
652
653 /*
654 * Set attributes cache for given vnode using virtual attributes.
655 *
656 * Set the timeout value on the attribute cache and fill it
657 * with the passed in attributes.
658 *
659 * The caller must be holding r_statelock.
660 */
661 static void
nfs4_attrcache_va(vnode_t * vp,nfs4_ga_res_t * garp,int set_cache_timeout)662 nfs4_attrcache_va(vnode_t *vp, nfs4_ga_res_t *garp, int set_cache_timeout)
663 {
664 rnode4_t *rp;
665 mntinfo4_t *mi;
666 hrtime_t delta;
667 hrtime_t now;
668 vattr_t *vap = &garp->n4g_va;
669
670 rp = VTOR4(vp);
671
672 ASSERT(MUTEX_HELD(&rp->r_statelock));
673 ASSERT(vap->va_mask == AT_ALL);
674
675 /* Switch to master before checking v_flag */
676 if (IS_SHADOW(vp, rp))
677 vp = RTOV4(rp);
678
679 now = gethrtime();
680
681 mi = VTOMI4(vp);
682
683 /*
684 * Only establish a new cache timeout (if requested). Never
685 * extend a timeout. Never clear a timeout. Clearing a timeout
686 * is done by nfs4_update_dircaches (ancestor in our call chain)
687 */
688 if (set_cache_timeout && ! rp->r_time_cache_inval)
689 rp->r_time_cache_inval = now + mi->mi_acdirmax;
690
691 /*
692 * Delta is the number of nanoseconds that we will
693 * cache the attributes of the file. It is based on
694 * the number of nanoseconds since the last time that
695 * we detected a change. The assumption is that files
696 * that changed recently are likely to change again.
697 * There is a minimum and a maximum for regular files
698 * and for directories which is enforced though.
699 *
700 * Using the time since last change was detected
701 * eliminates direct comparison or calculation
702 * using mixed client and server times. NFS does
703 * not make any assumptions regarding the client
704 * and server clocks being synchronized.
705 */
706 if (vap->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
707 vap->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
708 vap->va_size != rp->r_attr.va_size) {
709 rp->r_time_attr_saved = now;
710 }
711
712 if ((mi->mi_flags & MI4_NOAC) || (vp->v_flag & VNOCACHE))
713 delta = 0;
714 else {
715 delta = now - rp->r_time_attr_saved;
716 if (vp->v_type == VDIR) {
717 if (delta < mi->mi_acdirmin)
718 delta = mi->mi_acdirmin;
719 else if (delta > mi->mi_acdirmax)
720 delta = mi->mi_acdirmax;
721 } else {
722 if (delta < mi->mi_acregmin)
723 delta = mi->mi_acregmin;
724 else if (delta > mi->mi_acregmax)
725 delta = mi->mi_acregmax;
726 }
727 }
728 rp->r_time_attr_inval = now + delta;
729
730 rp->r_attr = *vap;
731 if (garp->n4g_change_valid)
732 rp->r_change = garp->n4g_change;
733
734 /*
735 * The attributes that were returned may be valid and can
736 * be used, but they may not be allowed to be cached.
737 * Reset the timers to cause immediate invalidation and
738 * clear r_change so no VERIFY operations will suceed
739 */
740 if (garp->n4g_attrwhy == NFS4_GETATTR_NOCACHE_OK) {
741 rp->r_time_attr_inval = now;
742 rp->r_time_attr_saved = now;
743 rp->r_change = 0;
744 }
745
746 /*
747 * If mounted_on_fileid returned AND the object is a stub,
748 * then set object's va_nodeid to the mounted over fid
749 * returned by server.
750 *
751 * If mounted_on_fileid not provided/supported, then
752 * just set it to 0 for now. Eventually it would be
753 * better to set it to a hashed version of FH. This
754 * would probably be good enough to provide a unique
755 * fid/d_ino within a dir.
756 *
757 * We don't need to carry mounted_on_fileid in the
758 * rnode as long as the client never requests fileid
759 * without also requesting mounted_on_fileid. For
760 * now, it stays.
761 */
762 if (garp->n4g_mon_fid_valid) {
763 rp->r_mntd_fid = garp->n4g_mon_fid;
764
765 if (RP_ISSTUB(rp))
766 rp->r_attr.va_nodeid = rp->r_mntd_fid;
767 }
768
769 /*
770 * Check to see if there are valid pathconf bits to
771 * cache in the rnode.
772 */
773 if (garp->n4g_ext_res) {
774 if (garp->n4g_ext_res->n4g_pc4.pc4_cache_valid) {
775 rp->r_pathconf = garp->n4g_ext_res->n4g_pc4;
776 } else {
777 if (garp->n4g_ext_res->n4g_pc4.pc4_xattr_valid) {
778 rp->r_pathconf.pc4_xattr_valid = TRUE;
779 rp->r_pathconf.pc4_xattr_exists =
780 garp->n4g_ext_res->n4g_pc4.pc4_xattr_exists;
781 }
782 }
783 }
784 /*
785 * Update the size of the file if there is no cached data or if
786 * the cached data is clean and there is no data being written
787 * out.
788 */
789 if (rp->r_size != vap->va_size &&
790 (!vn_has_cached_data(vp) ||
791 (!(rp->r_flags & R4DIRTY) && rp->r_count == 0))) {
792 rp->r_size = vap->va_size;
793 }
794 nfs_setswaplike(vp, vap);
795 rp->r_flags &= ~R4WRITEMODIFIED;
796 }
797
798 /*
799 * Get attributes over-the-wire and update attributes cache
800 * if no error occurred in the over-the-wire operation.
801 * Return 0 if successful, otherwise error.
802 */
803 int
nfs4_getattr_otw(vnode_t * vp,nfs4_ga_res_t * garp,cred_t * cr,int get_acl)804 nfs4_getattr_otw(vnode_t *vp, nfs4_ga_res_t *garp, cred_t *cr, int get_acl)
805 {
806 mntinfo4_t *mi = VTOMI4(vp);
807 hrtime_t t;
808 nfs4_recov_state_t recov_state;
809 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
810
811 recov_state.rs_flags = 0;
812 recov_state.rs_num_retry_despite_err = 0;
813
814 /* Save the original mount point security flavor */
815 (void) save_mnt_secinfo(mi->mi_curr_serv);
816
817 recov_retry:
818
819 if ((e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR,
820 &recov_state, NULL))) {
821 (void) check_mnt_secinfo(mi->mi_curr_serv, vp);
822 return (e.error);
823 }
824
825 t = gethrtime();
826
827 nfs4_getattr_otw_norecovery(vp, garp, &e, cr, get_acl);
828
829 if (nfs4_needs_recovery(&e, FALSE, vp->v_vfsp)) {
830 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
831 NULL, OP_GETATTR, NULL, NULL, NULL) == FALSE) {
832 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR,
833 &recov_state, 1);
834 goto recov_retry;
835 }
836 }
837
838 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 0);
839
840 if (!e.error) {
841 if (e.stat == NFS4_OK) {
842 nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL);
843 } else {
844 e.error = geterrno4(e.stat);
845
846 nfs4_purge_stale_fh(e.error, vp, cr);
847 }
848 }
849
850 /*
851 * If getattr a node that is a stub for a crossed
852 * mount point, keep the original secinfo flavor for
853 * the current file system, not the crossed one.
854 */
855 (void) check_mnt_secinfo(mi->mi_curr_serv, vp);
856
857 return (e.error);
858 }
859
860 /*
861 * Generate a compound to get attributes over-the-wire.
862 */
863 void
nfs4_getattr_otw_norecovery(vnode_t * vp,nfs4_ga_res_t * garp,nfs4_error_t * ep,cred_t * cr,int get_acl)864 nfs4_getattr_otw_norecovery(vnode_t *vp, nfs4_ga_res_t *garp,
865 nfs4_error_t *ep, cred_t *cr, int get_acl)
866 {
867 COMPOUND4args_clnt args;
868 COMPOUND4res_clnt res;
869 int doqueue;
870 rnode4_t *rp = VTOR4(vp);
871 nfs_argop4 argop[2];
872
873 args.ctag = TAG_GETATTR;
874
875 args.array_len = 2;
876 args.array = argop;
877
878 /* putfh */
879 argop[0].argop = OP_CPUTFH;
880 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
881
882 /* getattr */
883 /*
884 * Unlike nfs version 2 and 3, where getattr returns all the
885 * attributes, nfs version 4 returns only the ones explicitly
886 * asked for. This creates problems, as some system functions
887 * (e.g. cache check) require certain attributes and if the
888 * cached node lacks some attributes such as uid/gid, it can
889 * affect system utilities (e.g. "ls") that rely on the information
890 * to be there. This can lead to anything from system crashes to
891 * corrupted information processed by user apps.
892 * So to ensure that all bases are covered, request at least
893 * the AT_ALL attribute mask.
894 */
895 argop[1].argop = OP_GETATTR;
896 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
897 if (get_acl)
898 argop[1].nfs_argop4_u.opgetattr.attr_request |= FATTR4_ACL_MASK;
899 argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
900
901 doqueue = 1;
902
903 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, ep);
904
905 if (ep->error)
906 return;
907
908 if (res.status != NFS4_OK) {
909 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
910 return;
911 }
912
913 *garp = res.array[1].nfs_resop4_u.opgetattr.ga_res;
914
915 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
916 }
917
918 /*
919 * Return either cached or remote attributes. If get remote attr
920 * use them to check and invalidate caches, then cache the new attributes.
921 */
922 int
nfs4getattr(vnode_t * vp,vattr_t * vap,cred_t * cr)923 nfs4getattr(vnode_t *vp, vattr_t *vap, cred_t *cr)
924 {
925 int error;
926 rnode4_t *rp;
927 nfs4_ga_res_t gar;
928
929 ASSERT(nfs4_consistent_type(vp));
930
931 /*
932 * If we've got cached attributes, we're done, otherwise go
933 * to the server to get attributes, which will update the cache
934 * in the process. Either way, use the cached attributes for
935 * the caller's vattr_t.
936 *
937 * Note that we ignore the gar set by the OTW call: the attr caching
938 * code may make adjustments when storing to the rnode, and we want
939 * to see those changes here.
940 */
941 rp = VTOR4(vp);
942 error = 0;
943 mutex_enter(&rp->r_statelock);
944 if (!ATTRCACHE4_VALID(vp)) {
945 mutex_exit(&rp->r_statelock);
946 error = nfs4_getattr_otw(vp, &gar, cr, 0);
947 mutex_enter(&rp->r_statelock);
948 }
949
950 if (!error)
951 *vap = rp->r_attr;
952
953 /* Return the client's view of file size */
954 vap->va_size = rp->r_size;
955
956 mutex_exit(&rp->r_statelock);
957
958 ASSERT(nfs4_consistent_type(vp));
959
960 return (error);
961 }
962
963 int
nfs4_attr_otw(vnode_t * vp,nfs4_tag_type_t tag_type,nfs4_ga_res_t * garp,bitmap4 reqbitmap,cred_t * cr)964 nfs4_attr_otw(vnode_t *vp, nfs4_tag_type_t tag_type,
965 nfs4_ga_res_t *garp, bitmap4 reqbitmap, cred_t *cr)
966 {
967 COMPOUND4args_clnt args;
968 COMPOUND4res_clnt res;
969 int doqueue;
970 nfs_argop4 argop[2];
971 mntinfo4_t *mi = VTOMI4(vp);
972 bool_t needrecov = FALSE;
973 nfs4_recov_state_t recov_state;
974 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
975 nfs4_ga_ext_res_t *gerp;
976
977 recov_state.rs_flags = 0;
978 recov_state.rs_num_retry_despite_err = 0;
979
980 recov_retry:
981 args.ctag = tag_type;
982
983 args.array_len = 2;
984 args.array = argop;
985
986 e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR, &recov_state, NULL);
987 if (e.error)
988 return (e.error);
989
990 /* putfh */
991 argop[0].argop = OP_CPUTFH;
992 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
993
994 /* getattr */
995 argop[1].argop = OP_GETATTR;
996 argop[1].nfs_argop4_u.opgetattr.attr_request = reqbitmap;
997 argop[1].nfs_argop4_u.opgetattr.mi = mi;
998
999 doqueue = 1;
1000
1001 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
1002 "nfs4_attr_otw: %s call, rp %s", needrecov ? "recov" : "first",
1003 rnode4info(VTOR4(vp))));
1004
1005 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
1006
1007 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
1008 if (!needrecov && e.error) {
1009 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1010 needrecov);
1011 return (e.error);
1012 }
1013
1014 if (needrecov) {
1015 bool_t abort;
1016
1017 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1018 "nfs4_attr_otw: initiating recovery\n"));
1019
1020 abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
1021 NULL, OP_GETATTR, NULL, NULL, NULL);
1022 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1023 needrecov);
1024 if (!e.error) {
1025 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1026 e.error = geterrno4(res.status);
1027 }
1028 if (abort == FALSE)
1029 goto recov_retry;
1030 return (e.error);
1031 }
1032
1033 if (res.status) {
1034 e.error = geterrno4(res.status);
1035 } else {
1036 gerp = garp->n4g_ext_res;
1037 bcopy(&res.array[1].nfs_resop4_u.opgetattr.ga_res,
1038 garp, sizeof (nfs4_ga_res_t));
1039 garp->n4g_ext_res = gerp;
1040 if (garp->n4g_ext_res &&
1041 res.array[1].nfs_resop4_u.opgetattr.ga_res.n4g_ext_res)
1042 bcopy(res.array[1].nfs_resop4_u.opgetattr.
1043 ga_res.n4g_ext_res,
1044 garp->n4g_ext_res, sizeof (nfs4_ga_ext_res_t));
1045 }
1046 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1047 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1048 needrecov);
1049 return (e.error);
1050 }
1051
1052 /*
1053 * Asynchronous I/O parameters. nfs_async_threads is the high-water mark
1054 * for the demand-based allocation of async threads per-mount. The
1055 * nfs_async_timeout is the amount of time a thread will live after it
1056 * becomes idle, unless new I/O requests are received before the thread
1057 * dies. See nfs4_async_putpage and nfs4_async_start.
1058 */
1059
/* Forward declarations of the async thread start routines. */
static void nfs4_async_common_start(struct vfs *, int);
static void nfs4_async_start(struct vfs *);
static void nfs4_async_pgops_start(struct vfs *);
1063
1064 static void
free_async_args4(struct nfs4_async_reqs * args)1065 free_async_args4(struct nfs4_async_reqs *args)
1066 {
1067 rnode4_t *rp;
1068
1069 if (args->a_io != NFS4_INACTIVE) {
1070 rp = VTOR4(args->a_vp);
1071 mutex_enter(&rp->r_statelock);
1072 rp->r_count--;
1073 if (args->a_io == NFS4_PUTAPAGE ||
1074 args->a_io == NFS4_PAGEIO)
1075 rp->r_awcount--;
1076 cv_broadcast(&rp->r_cv);
1077 mutex_exit(&rp->r_statelock);
1078 VN_RELE(args->a_vp);
1079 }
1080 crfree(args->a_cred);
1081 kmem_free(args, sizeof (*args));
1082 }
1083
1084 /*
1085 * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
1086 * pageout(), running in the global zone, have legitimate reasons to do
1087 * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts. We avoid the problem by
1088 * use of a a per-mount "asynchronous requests manager thread" which is
1089 * signaled by the various asynchronous work routines when there is
1090 * asynchronous work to be done. It is responsible for creating new
1091 * worker threads if necessary, and notifying existing worker threads
1092 * that there is work to be done.
1093 *
1094 * In other words, it will "take the specifications from the customers and
1095 * give them to the engineers."
1096 *
1097 * Worker threads die off of their own accord if they are no longer
1098 * needed.
1099 *
1100 * This thread is killed when the zone is going away or the filesystem
1101 * is being unmounted.
1102 */
1103 void
nfs4_async_manager(vfs_t * vfsp)1104 nfs4_async_manager(vfs_t *vfsp)
1105 {
1106 callb_cpr_t cprinfo;
1107 mntinfo4_t *mi;
1108 uint_t max_threads;
1109
1110 mi = VFTOMI4(vfsp);
1111
1112 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1113 "nfs4_async_manager");
1114
1115 mutex_enter(&mi->mi_async_lock);
1116 /*
1117 * We want to stash the max number of threads that this mount was
1118 * allowed so we can use it later when the variable is set to zero as
1119 * part of the zone/mount going away.
1120 *
1121 * We want to be able to create at least one thread to handle
1122 * asynchronous inactive calls.
1123 */
1124 max_threads = MAX(mi->mi_max_threads, 1);
1125 /*
1126 * We don't want to wait for mi_max_threads to go to zero, since that
1127 * happens as part of a failed unmount, but this thread should only
1128 * exit when the mount is really going away.
1129 *
1130 * Once MI4_ASYNC_MGR_STOP is set, no more async operations will be
1131 * attempted: the various _async_*() functions know to do things
1132 * inline if mi_max_threads == 0. Henceforth we just drain out the
1133 * outstanding requests.
1134 *
1135 * Note that we still create zthreads even if we notice the zone is
1136 * shutting down (MI4_ASYNC_MGR_STOP is set); this may cause the zone
1137 * shutdown sequence to take slightly longer in some cases, but
1138 * doesn't violate the protocol, as all threads will exit as soon as
1139 * they're done processing the remaining requests.
1140 */
1141 for (;;) {
1142 while (mi->mi_async_req_count > 0) {
1143 /*
1144 * Paranoia: If the mount started out having
1145 * (mi->mi_max_threads == 0), and the value was
1146 * later changed (via a debugger or somesuch),
1147 * we could be confused since we will think we
1148 * can't create any threads, and the calling
1149 * code (which looks at the current value of
1150 * mi->mi_max_threads, now non-zero) thinks we
1151 * can.
1152 *
1153 * So, because we're paranoid, we create threads
1154 * up to the maximum of the original and the
1155 * current value. This means that future
1156 * (debugger-induced) alterations of
1157 * mi->mi_max_threads are ignored for our
1158 * purposes, but who told them they could change
1159 * random values on a live kernel anyhow?
1160 */
1161 if (mi->mi_threads[NFS4_ASYNC_QUEUE] <
1162 MAX(mi->mi_max_threads, max_threads)) {
1163 mi->mi_threads[NFS4_ASYNC_QUEUE]++;
1164 mutex_exit(&mi->mi_async_lock);
1165 MI4_HOLD(mi);
1166 VFS_HOLD(vfsp); /* hold for new thread */
1167 (void) zthread_create(NULL, 0, nfs4_async_start,
1168 vfsp, 0, minclsyspri);
1169 mutex_enter(&mi->mi_async_lock);
1170 } else if (mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] <
1171 NUM_ASYNC_PGOPS_THREADS) {
1172 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE]++;
1173 mutex_exit(&mi->mi_async_lock);
1174 MI4_HOLD(mi);
1175 VFS_HOLD(vfsp); /* hold for new thread */
1176 (void) zthread_create(NULL, 0,
1177 nfs4_async_pgops_start, vfsp, 0,
1178 minclsyspri);
1179 mutex_enter(&mi->mi_async_lock);
1180 }
1181 NFS4_WAKE_ASYNC_WORKER(mi->mi_async_work_cv);
1182 ASSERT(mi->mi_async_req_count != 0);
1183 mi->mi_async_req_count--;
1184 }
1185
1186 mutex_enter(&mi->mi_lock);
1187 if (mi->mi_flags & MI4_ASYNC_MGR_STOP) {
1188 mutex_exit(&mi->mi_lock);
1189 break;
1190 }
1191 mutex_exit(&mi->mi_lock);
1192
1193 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1194 cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
1195 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1196 }
1197
1198 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
1199 "nfs4_async_manager exiting for vfs %p\n", (void *)mi->mi_vfsp));
1200 /*
1201 * Let everyone know we're done.
1202 */
1203 mi->mi_manager_thread = NULL;
1204 /*
1205 * Wake up the inactive thread.
1206 */
1207 cv_broadcast(&mi->mi_inact_req_cv);
1208 /*
1209 * Wake up anyone sitting in nfs4_async_manager_stop()
1210 */
1211 cv_broadcast(&mi->mi_async_cv);
1212 /*
1213 * There is no explicit call to mutex_exit(&mi->mi_async_lock)
1214 * since CALLB_CPR_EXIT is actually responsible for releasing
1215 * 'mi_async_lock'.
1216 */
1217 CALLB_CPR_EXIT(&cprinfo);
1218 VFS_RELE(vfsp); /* release thread's hold */
1219 MI4_RELE(mi);
1220 zthread_exit();
1221 }
1222
1223 /*
1224 * Signal (and wait for) the async manager thread to clean up and go away.
1225 */
1226 void
nfs4_async_manager_stop(vfs_t * vfsp)1227 nfs4_async_manager_stop(vfs_t *vfsp)
1228 {
1229 mntinfo4_t *mi = VFTOMI4(vfsp);
1230
1231 mutex_enter(&mi->mi_async_lock);
1232 mutex_enter(&mi->mi_lock);
1233 mi->mi_flags |= MI4_ASYNC_MGR_STOP;
1234 mutex_exit(&mi->mi_lock);
1235 cv_broadcast(&mi->mi_async_reqs_cv);
1236 /*
1237 * Wait for the async manager thread to die.
1238 */
1239 while (mi->mi_manager_thread != NULL)
1240 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1241 mutex_exit(&mi->mi_async_lock);
1242 }
1243
1244 int
nfs4_async_readahead(vnode_t * vp,u_offset_t blkoff,caddr_t addr,struct seg * seg,cred_t * cr,void (* readahead)(vnode_t *,u_offset_t,caddr_t,struct seg *,cred_t *))1245 nfs4_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
1246 struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
1247 u_offset_t, caddr_t, struct seg *, cred_t *))
1248 {
1249 rnode4_t *rp;
1250 mntinfo4_t *mi;
1251 struct nfs4_async_reqs *args;
1252
1253 rp = VTOR4(vp);
1254 ASSERT(rp->r_freef == NULL);
1255
1256 mi = VTOMI4(vp);
1257
1258 /*
1259 * If addr falls in a different segment, don't bother doing readahead.
1260 */
1261 if (addr >= seg->s_base + seg->s_size)
1262 return (-1);
1263
1264 /*
1265 * If we can't allocate a request structure, punt on the readahead.
1266 */
1267 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1268 return (-1);
1269
1270 /*
1271 * If a lock operation is pending, don't initiate any new
1272 * readaheads. Otherwise, bump r_count to indicate the new
1273 * asynchronous I/O.
1274 */
1275 if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
1276 kmem_free(args, sizeof (*args));
1277 return (-1);
1278 }
1279 mutex_enter(&rp->r_statelock);
1280 rp->r_count++;
1281 mutex_exit(&rp->r_statelock);
1282 nfs_rw_exit(&rp->r_lkserlock);
1283
1284 args->a_next = NULL;
1285 #ifdef DEBUG
1286 args->a_queuer = curthread;
1287 #endif
1288 VN_HOLD(vp);
1289 args->a_vp = vp;
1290 ASSERT(cr != NULL);
1291 crhold(cr);
1292 args->a_cred = cr;
1293 args->a_io = NFS4_READ_AHEAD;
1294 args->a_nfs4_readahead = readahead;
1295 args->a_nfs4_blkoff = blkoff;
1296 args->a_nfs4_seg = seg;
1297 args->a_nfs4_addr = addr;
1298
1299 mutex_enter(&mi->mi_async_lock);
1300
1301 /*
1302 * If asyncio has been disabled, don't bother readahead.
1303 */
1304 if (mi->mi_max_threads == 0) {
1305 mutex_exit(&mi->mi_async_lock);
1306 goto noasync;
1307 }
1308
1309 /*
1310 * Link request structure into the async list and
1311 * wakeup async thread to do the i/o.
1312 */
1313 if (mi->mi_async_reqs[NFS4_READ_AHEAD] == NULL) {
1314 mi->mi_async_reqs[NFS4_READ_AHEAD] = args;
1315 mi->mi_async_tail[NFS4_READ_AHEAD] = args;
1316 } else {
1317 mi->mi_async_tail[NFS4_READ_AHEAD]->a_next = args;
1318 mi->mi_async_tail[NFS4_READ_AHEAD] = args;
1319 }
1320
1321 if (mi->mi_io_kstats) {
1322 mutex_enter(&mi->mi_lock);
1323 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1324 mutex_exit(&mi->mi_lock);
1325 }
1326
1327 mi->mi_async_req_count++;
1328 ASSERT(mi->mi_async_req_count != 0);
1329 cv_signal(&mi->mi_async_reqs_cv);
1330 mutex_exit(&mi->mi_async_lock);
1331 return (0);
1332
1333 noasync:
1334 mutex_enter(&rp->r_statelock);
1335 rp->r_count--;
1336 cv_broadcast(&rp->r_cv);
1337 mutex_exit(&rp->r_statelock);
1338 VN_RELE(vp);
1339 crfree(cr);
1340 kmem_free(args, sizeof (*args));
1341 return (-1);
1342 }
1343
1344 static void
nfs4_async_start(struct vfs * vfsp)1345 nfs4_async_start(struct vfs *vfsp)
1346 {
1347 nfs4_async_common_start(vfsp, NFS4_ASYNC_QUEUE);
1348 }
1349
1350 static void
nfs4_async_pgops_start(struct vfs * vfsp)1351 nfs4_async_pgops_start(struct vfs *vfsp)
1352 {
1353 nfs4_async_common_start(vfsp, NFS4_ASYNC_PGOPS_QUEUE);
1354 }
1355
1356 /*
1357 * The async queues for each mounted file system are arranged as a
1358 * set of queues, one for each async i/o type. Requests are taken
1359 * from the queues in a round-robin fashion. A number of consecutive
1360 * requests are taken from each queue before moving on to the next
1361 * queue. This functionality may allow the NFS Version 2 server to do
1362 * write clustering, even if the client is mixing writes and reads
1363 * because it will take multiple write requests from the queue
1364 * before processing any of the other async i/o types.
1365 *
1366 * XXX The nfs4_async_common_start thread is unsafe in the light of the present
1367 * model defined by cpr to suspend the system. Specifically over the
1368 * wire calls are cpr-unsafe. The thread should be reevaluated in
1369 * case of future updates to the cpr model.
1370 */
1371 static void
nfs4_async_common_start(struct vfs * vfsp,int async_queue)1372 nfs4_async_common_start(struct vfs *vfsp, int async_queue)
1373 {
1374 struct nfs4_async_reqs *args;
1375 mntinfo4_t *mi = VFTOMI4(vfsp);
1376 clock_t time_left = 1;
1377 callb_cpr_t cprinfo;
1378 int i;
1379 extern int nfs_async_timeout;
1380 int async_types;
1381 kcondvar_t *async_work_cv;
1382
1383 if (async_queue == NFS4_ASYNC_QUEUE) {
1384 async_types = NFS4_ASYNC_TYPES;
1385 async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_QUEUE];
1386 } else {
1387 async_types = NFS4_ASYNC_PGOPS_TYPES;
1388 async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE];
1389 }
1390
1391 /*
1392 * Dynamic initialization of nfs_async_timeout to allow nfs to be
1393 * built in an implementation independent manner.
1394 */
1395 if (nfs_async_timeout == -1)
1396 nfs_async_timeout = NFS_ASYNC_TIMEOUT;
1397
1398 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");
1399
1400 mutex_enter(&mi->mi_async_lock);
1401 for (;;) {
1402 /*
1403 * Find the next queue containing an entry. We start
1404 * at the current queue pointer and then round robin
1405 * through all of them until we either find a non-empty
1406 * queue or have looked through all of them.
1407 */
1408 for (i = 0; i < async_types; i++) {
1409 args = *mi->mi_async_curr[async_queue];
1410 if (args != NULL)
1411 break;
1412 mi->mi_async_curr[async_queue]++;
1413 if (mi->mi_async_curr[async_queue] ==
1414 &mi->mi_async_reqs[async_types]) {
1415 mi->mi_async_curr[async_queue] =
1416 &mi->mi_async_reqs[0];
1417 }
1418 }
1419 /*
1420 * If we didn't find a entry, then block until woken up
1421 * again and then look through the queues again.
1422 */
1423 if (args == NULL) {
1424 /*
1425 * Exiting is considered to be safe for CPR as well
1426 */
1427 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1428
1429 /*
1430 * Wakeup thread waiting to unmount the file
1431 * system only if all async threads are inactive.
1432 *
1433 * If we've timed-out and there's nothing to do,
1434 * then get rid of this thread.
1435 */
1436 if (mi->mi_max_threads == 0 || time_left <= 0) {
1437 --mi->mi_threads[async_queue];
1438
1439 if (mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 &&
1440 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0)
1441 cv_signal(&mi->mi_async_cv);
1442 CALLB_CPR_EXIT(&cprinfo);
1443 VFS_RELE(vfsp); /* release thread's hold */
1444 MI4_RELE(mi);
1445 zthread_exit();
1446 /* NOTREACHED */
1447 }
1448 time_left = cv_reltimedwait(async_work_cv,
1449 &mi->mi_async_lock, nfs_async_timeout,
1450 TR_CLOCK_TICK);
1451
1452 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1453
1454 continue;
1455 } else {
1456 time_left = 1;
1457 }
1458
1459 /*
1460 * Remove the request from the async queue and then
1461 * update the current async request queue pointer. If
1462 * the current queue is empty or we have removed enough
1463 * consecutive entries from it, then reset the counter
1464 * for this queue and then move the current pointer to
1465 * the next queue.
1466 */
1467 *mi->mi_async_curr[async_queue] = args->a_next;
1468 if (*mi->mi_async_curr[async_queue] == NULL ||
1469 --mi->mi_async_clusters[args->a_io] == 0) {
1470 mi->mi_async_clusters[args->a_io] =
1471 mi->mi_async_init_clusters;
1472 mi->mi_async_curr[async_queue]++;
1473 if (mi->mi_async_curr[async_queue] ==
1474 &mi->mi_async_reqs[async_types]) {
1475 mi->mi_async_curr[async_queue] =
1476 &mi->mi_async_reqs[0];
1477 }
1478 }
1479
1480 if (args->a_io != NFS4_INACTIVE && mi->mi_io_kstats) {
1481 mutex_enter(&mi->mi_lock);
1482 kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
1483 mutex_exit(&mi->mi_lock);
1484 }
1485
1486 mutex_exit(&mi->mi_async_lock);
1487
1488 /*
1489 * Obtain arguments from the async request structure.
1490 */
1491 if (args->a_io == NFS4_READ_AHEAD && mi->mi_max_threads > 0) {
1492 (*args->a_nfs4_readahead)(args->a_vp,
1493 args->a_nfs4_blkoff, args->a_nfs4_addr,
1494 args->a_nfs4_seg, args->a_cred);
1495 } else if (args->a_io == NFS4_PUTAPAGE) {
1496 (void) (*args->a_nfs4_putapage)(args->a_vp,
1497 args->a_nfs4_pp, args->a_nfs4_off,
1498 args->a_nfs4_len, args->a_nfs4_flags,
1499 args->a_cred);
1500 } else if (args->a_io == NFS4_PAGEIO) {
1501 (void) (*args->a_nfs4_pageio)(args->a_vp,
1502 args->a_nfs4_pp, args->a_nfs4_off,
1503 args->a_nfs4_len, args->a_nfs4_flags,
1504 args->a_cred);
1505 } else if (args->a_io == NFS4_READDIR) {
1506 (void) ((*args->a_nfs4_readdir)(args->a_vp,
1507 args->a_nfs4_rdc, args->a_cred));
1508 } else if (args->a_io == NFS4_COMMIT) {
1509 (*args->a_nfs4_commit)(args->a_vp, args->a_nfs4_plist,
1510 args->a_nfs4_offset, args->a_nfs4_count,
1511 args->a_cred);
1512 } else if (args->a_io == NFS4_INACTIVE) {
1513 nfs4_inactive_otw(args->a_vp, args->a_cred);
1514 }
1515
1516 /*
1517 * Now, release the vnode and free the credentials
1518 * structure.
1519 */
1520 free_async_args4(args);
1521 /*
1522 * Reacquire the mutex because it will be needed above.
1523 */
1524 mutex_enter(&mi->mi_async_lock);
1525 }
1526 }
1527
1528 /*
1529 * nfs4_inactive_thread - look for vnodes that need over-the-wire calls as
1530 * part of VOP_INACTIVE.
1531 */
1532
1533 void
nfs4_inactive_thread(mntinfo4_t * mi)1534 nfs4_inactive_thread(mntinfo4_t *mi)
1535 {
1536 struct nfs4_async_reqs *args;
1537 callb_cpr_t cprinfo;
1538 vfs_t *vfsp = mi->mi_vfsp;
1539
1540 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1541 "nfs4_inactive_thread");
1542
1543 for (;;) {
1544 mutex_enter(&mi->mi_async_lock);
1545 args = mi->mi_async_reqs[NFS4_INACTIVE];
1546 if (args == NULL) {
1547 mutex_enter(&mi->mi_lock);
1548 /*
1549 * We don't want to exit until the async manager is done
1550 * with its work; hence the check for mi_manager_thread
1551 * being NULL.
1552 *
1553 * The async manager thread will cv_broadcast() on
1554 * mi_inact_req_cv when it's done, at which point we'll
1555 * wake up and exit.
1556 */
1557 if (mi->mi_manager_thread == NULL)
1558 goto die;
1559 mi->mi_flags |= MI4_INACTIVE_IDLE;
1560 mutex_exit(&mi->mi_lock);
1561 cv_signal(&mi->mi_async_cv);
1562 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1563 cv_wait(&mi->mi_inact_req_cv, &mi->mi_async_lock);
1564 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1565 mutex_exit(&mi->mi_async_lock);
1566 } else {
1567 mutex_enter(&mi->mi_lock);
1568 mi->mi_flags &= ~MI4_INACTIVE_IDLE;
1569 mutex_exit(&mi->mi_lock);
1570 mi->mi_async_reqs[NFS4_INACTIVE] = args->a_next;
1571 mutex_exit(&mi->mi_async_lock);
1572 nfs4_inactive_otw(args->a_vp, args->a_cred);
1573 crfree(args->a_cred);
1574 kmem_free(args, sizeof (*args));
1575 }
1576 }
1577 die:
1578 mutex_exit(&mi->mi_lock);
1579 mi->mi_inactive_thread = NULL;
1580 cv_signal(&mi->mi_async_cv);
1581
1582 /*
1583 * There is no explicit call to mutex_exit(&mi->mi_async_lock) since
1584 * CALLB_CPR_EXIT is actually responsible for releasing 'mi_async_lock'.
1585 */
1586 CALLB_CPR_EXIT(&cprinfo);
1587
1588 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
1589 "nfs4_inactive_thread exiting for vfs %p\n", (void *)vfsp));
1590
1591 MI4_RELE(mi);
1592 zthread_exit();
1593 /* NOTREACHED */
1594 }
1595
1596 /*
1597 * nfs_async_stop:
1598 * Wait for all outstanding putpage operations and the inactive thread to
1599 * complete; nfs4_async_stop_sig() without interruptibility.
1600 */
1601 void
nfs4_async_stop(struct vfs * vfsp)1602 nfs4_async_stop(struct vfs *vfsp)
1603 {
1604 mntinfo4_t *mi = VFTOMI4(vfsp);
1605
1606 /*
1607 * Wait for all outstanding async operations to complete and for
1608 * worker threads to exit.
1609 */
1610 mutex_enter(&mi->mi_async_lock);
1611 mi->mi_max_threads = 0;
1612 NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
1613 while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 ||
1614 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0)
1615 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1616
1617 /*
1618 * Wait for the inactive thread to finish doing what it's doing. It
1619 * won't exit until the last reference to the vfs_t goes away.
1620 */
1621 if (mi->mi_inactive_thread != NULL) {
1622 mutex_enter(&mi->mi_lock);
1623 while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
1624 (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
1625 mutex_exit(&mi->mi_lock);
1626 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1627 mutex_enter(&mi->mi_lock);
1628 }
1629 mutex_exit(&mi->mi_lock);
1630 }
1631 mutex_exit(&mi->mi_async_lock);
1632 }
1633
1634 /*
1635 * nfs_async_stop_sig:
1636 * Wait for all outstanding putpage operations and the inactive thread to
1637 * complete. If a signal is delivered we will abort and return non-zero;
1638 * otherwise return 0. Since this routine is called from nfs4_unmount, we
1639 * need to make it interruptible.
1640 */
1641 int
nfs4_async_stop_sig(struct vfs * vfsp)1642 nfs4_async_stop_sig(struct vfs *vfsp)
1643 {
1644 mntinfo4_t *mi = VFTOMI4(vfsp);
1645 ushort_t omax;
1646 bool_t intr = FALSE;
1647
1648 /*
1649 * Wait for all outstanding putpage operations to complete and for
1650 * worker threads to exit.
1651 */
1652 mutex_enter(&mi->mi_async_lock);
1653 omax = mi->mi_max_threads;
1654 mi->mi_max_threads = 0;
1655 NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
1656 while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 ||
1657 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0) {
1658 if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock)) {
1659 intr = TRUE;
1660 goto interrupted;
1661 }
1662 }
1663
1664 /*
1665 * Wait for the inactive thread to finish doing what it's doing. It
1666 * won't exit until the a last reference to the vfs_t goes away.
1667 */
1668 if (mi->mi_inactive_thread != NULL) {
1669 mutex_enter(&mi->mi_lock);
1670 while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
1671 (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
1672 mutex_exit(&mi->mi_lock);
1673 if (!cv_wait_sig(&mi->mi_async_cv,
1674 &mi->mi_async_lock)) {
1675 intr = TRUE;
1676 goto interrupted;
1677 }
1678 mutex_enter(&mi->mi_lock);
1679 }
1680 mutex_exit(&mi->mi_lock);
1681 }
1682 interrupted:
1683 if (intr)
1684 mi->mi_max_threads = omax;
1685 mutex_exit(&mi->mi_async_lock);
1686
1687 return (intr);
1688 }
1689
1690 int
nfs4_async_putapage(vnode_t * vp,page_t * pp,u_offset_t off,size_t len,int flags,cred_t * cr,int (* putapage)(vnode_t *,page_t *,u_offset_t,size_t,int,cred_t *))1691 nfs4_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
1692 int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
1693 u_offset_t, size_t, int, cred_t *))
1694 {
1695 rnode4_t *rp;
1696 mntinfo4_t *mi;
1697 struct nfs4_async_reqs *args;
1698
1699 ASSERT(flags & B_ASYNC);
1700 ASSERT(vp->v_vfsp != NULL);
1701
1702 rp = VTOR4(vp);
1703 ASSERT(rp->r_count > 0);
1704
1705 mi = VTOMI4(vp);
1706
1707 /*
1708 * If we can't allocate a request structure, do the putpage
1709 * operation synchronously in this thread's context.
1710 */
1711 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1712 goto noasync;
1713
1714 args->a_next = NULL;
1715 #ifdef DEBUG
1716 args->a_queuer = curthread;
1717 #endif
1718 VN_HOLD(vp);
1719 args->a_vp = vp;
1720 ASSERT(cr != NULL);
1721 crhold(cr);
1722 args->a_cred = cr;
1723 args->a_io = NFS4_PUTAPAGE;
1724 args->a_nfs4_putapage = putapage;
1725 args->a_nfs4_pp = pp;
1726 args->a_nfs4_off = off;
1727 args->a_nfs4_len = (uint_t)len;
1728 args->a_nfs4_flags = flags;
1729
1730 mutex_enter(&mi->mi_async_lock);
1731
1732 /*
1733 * If asyncio has been disabled, then make a synchronous request.
1734 * This check is done a second time in case async io was diabled
1735 * while this thread was blocked waiting for memory pressure to
1736 * reduce or for the queue to drain.
1737 */
1738 if (mi->mi_max_threads == 0) {
1739 mutex_exit(&mi->mi_async_lock);
1740
1741 VN_RELE(vp);
1742 crfree(cr);
1743 kmem_free(args, sizeof (*args));
1744 goto noasync;
1745 }
1746
1747 /*
1748 * Link request structure into the async list and
1749 * wakeup async thread to do the i/o.
1750 */
1751 if (mi->mi_async_reqs[NFS4_PUTAPAGE] == NULL) {
1752 mi->mi_async_reqs[NFS4_PUTAPAGE] = args;
1753 mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1754 } else {
1755 mi->mi_async_tail[NFS4_PUTAPAGE]->a_next = args;
1756 mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1757 }
1758
1759 mutex_enter(&rp->r_statelock);
1760 rp->r_count++;
1761 rp->r_awcount++;
1762 mutex_exit(&rp->r_statelock);
1763
1764 if (mi->mi_io_kstats) {
1765 mutex_enter(&mi->mi_lock);
1766 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1767 mutex_exit(&mi->mi_lock);
1768 }
1769
1770 mi->mi_async_req_count++;
1771 ASSERT(mi->mi_async_req_count != 0);
1772 cv_signal(&mi->mi_async_reqs_cv);
1773 mutex_exit(&mi->mi_async_lock);
1774 return (0);
1775
1776 noasync:
1777
1778 if (curproc == proc_pageout || curproc == proc_fsflush) {
1779 /*
1780 * If we get here in the context of the pageout/fsflush,
1781 * or we have run out of memory or we're attempting to
1782 * unmount we refuse to do a sync write, because this may
1783 * hang pageout/fsflush and the machine. In this case,
1784 * we just re-mark the page as dirty and punt on the page.
1785 *
1786 * Make sure B_FORCE isn't set. We can re-mark the
1787 * pages as dirty and unlock the pages in one swoop by
1788 * passing in B_ERROR to pvn_write_done(). However,
1789 * we should make sure B_FORCE isn't set - we don't
1790 * want the page tossed before it gets written out.
1791 */
1792 if (flags & B_FORCE)
1793 flags &= ~(B_INVAL | B_FORCE);
1794 pvn_write_done(pp, flags | B_ERROR);
1795 return (0);
1796 }
1797
1798 if (nfs_zone() != mi->mi_zone) {
1799 /*
1800 * So this was a cross-zone sync putpage.
1801 *
1802 * We pass in B_ERROR to pvn_write_done() to re-mark the pages
1803 * as dirty and unlock them.
1804 *
1805 * We don't want to clear B_FORCE here as the caller presumably
1806 * knows what they're doing if they set it.
1807 */
1808 pvn_write_done(pp, flags | B_ERROR);
1809 return (EPERM);
1810 }
1811 return ((*putapage)(vp, pp, off, len, flags, cr));
1812 }
1813
1814 int
nfs4_async_pageio(vnode_t * vp,page_t * pp,u_offset_t io_off,size_t io_len,int flags,cred_t * cr,int (* pageio)(vnode_t *,page_t *,u_offset_t,size_t,int,cred_t *))1815 nfs4_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
1816 int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
1817 size_t, int, cred_t *))
1818 {
1819 rnode4_t *rp;
1820 mntinfo4_t *mi;
1821 struct nfs4_async_reqs *args;
1822
1823 ASSERT(flags & B_ASYNC);
1824 ASSERT(vp->v_vfsp != NULL);
1825
1826 rp = VTOR4(vp);
1827 ASSERT(rp->r_count > 0);
1828
1829 mi = VTOMI4(vp);
1830
1831 /*
1832 * If we can't allocate a request structure, do the pageio
1833 * request synchronously in this thread's context.
1834 */
1835 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1836 goto noasync;
1837
1838 args->a_next = NULL;
1839 #ifdef DEBUG
1840 args->a_queuer = curthread;
1841 #endif
1842 VN_HOLD(vp);
1843 args->a_vp = vp;
1844 ASSERT(cr != NULL);
1845 crhold(cr);
1846 args->a_cred = cr;
1847 args->a_io = NFS4_PAGEIO;
1848 args->a_nfs4_pageio = pageio;
1849 args->a_nfs4_pp = pp;
1850 args->a_nfs4_off = io_off;
1851 args->a_nfs4_len = (uint_t)io_len;
1852 args->a_nfs4_flags = flags;
1853
1854 mutex_enter(&mi->mi_async_lock);
1855
1856 /*
1857 * If asyncio has been disabled, then make a synchronous request.
1858 * This check is done a second time in case async io was diabled
1859 * while this thread was blocked waiting for memory pressure to
1860 * reduce or for the queue to drain.
1861 */
1862 if (mi->mi_max_threads == 0) {
1863 mutex_exit(&mi->mi_async_lock);
1864
1865 VN_RELE(vp);
1866 crfree(cr);
1867 kmem_free(args, sizeof (*args));
1868 goto noasync;
1869 }
1870
1871 /*
1872 * Link request structure into the async list and
1873 * wakeup async thread to do the i/o.
1874 */
1875 if (mi->mi_async_reqs[NFS4_PAGEIO] == NULL) {
1876 mi->mi_async_reqs[NFS4_PAGEIO] = args;
1877 mi->mi_async_tail[NFS4_PAGEIO] = args;
1878 } else {
1879 mi->mi_async_tail[NFS4_PAGEIO]->a_next = args;
1880 mi->mi_async_tail[NFS4_PAGEIO] = args;
1881 }
1882
1883 mutex_enter(&rp->r_statelock);
1884 rp->r_count++;
1885 rp->r_awcount++;
1886 mutex_exit(&rp->r_statelock);
1887
1888 if (mi->mi_io_kstats) {
1889 mutex_enter(&mi->mi_lock);
1890 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1891 mutex_exit(&mi->mi_lock);
1892 }
1893
1894 mi->mi_async_req_count++;
1895 ASSERT(mi->mi_async_req_count != 0);
1896 cv_signal(&mi->mi_async_reqs_cv);
1897 mutex_exit(&mi->mi_async_lock);
1898 return (0);
1899
1900 noasync:
1901 /*
1902 * If we can't do it ASYNC, for reads we do nothing (but cleanup
1903 * the page list), for writes we do it synchronously, except for
1904 * proc_pageout/proc_fsflush as described below.
1905 */
1906 if (flags & B_READ) {
1907 pvn_read_done(pp, flags | B_ERROR);
1908 return (0);
1909 }
1910
1911 if (curproc == proc_pageout || curproc == proc_fsflush) {
1912 /*
1913 * If we get here in the context of the pageout/fsflush,
1914 * we refuse to do a sync write, because this may hang
1915 * pageout/fsflush (and the machine). In this case, we just
1916 * re-mark the page as dirty and punt on the page.
1917 *
1918 * Make sure B_FORCE isn't set. We can re-mark the
1919 * pages as dirty and unlock the pages in one swoop by
1920 * passing in B_ERROR to pvn_write_done(). However,
1921 * we should make sure B_FORCE isn't set - we don't
1922 * want the page tossed before it gets written out.
1923 */
1924 if (flags & B_FORCE)
1925 flags &= ~(B_INVAL | B_FORCE);
1926 pvn_write_done(pp, flags | B_ERROR);
1927 return (0);
1928 }
1929
1930 if (nfs_zone() != mi->mi_zone) {
1931 /*
1932 * So this was a cross-zone sync pageio. We pass in B_ERROR
1933 * to pvn_write_done() to re-mark the pages as dirty and unlock
1934 * them.
1935 *
1936 * We don't want to clear B_FORCE here as the caller presumably
1937 * knows what they're doing if they set it.
1938 */
1939 pvn_write_done(pp, flags | B_ERROR);
1940 return (EPERM);
1941 }
1942 return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
1943 }
1944
1945 void
nfs4_async_readdir(vnode_t * vp,rddir4_cache * rdc,cred_t * cr,int (* readdir)(vnode_t *,rddir4_cache *,cred_t *))1946 nfs4_async_readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr,
1947 int (*readdir)(vnode_t *, rddir4_cache *, cred_t *))
1948 {
1949 rnode4_t *rp;
1950 mntinfo4_t *mi;
1951 struct nfs4_async_reqs *args;
1952
1953 rp = VTOR4(vp);
1954 ASSERT(rp->r_freef == NULL);
1955
1956 mi = VTOMI4(vp);
1957
1958 /*
1959 * If we can't allocate a request structure, skip the readdir.
1960 */
1961 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1962 goto noasync;
1963
1964 args->a_next = NULL;
1965 #ifdef DEBUG
1966 args->a_queuer = curthread;
1967 #endif
1968 VN_HOLD(vp);
1969 args->a_vp = vp;
1970 ASSERT(cr != NULL);
1971 crhold(cr);
1972 args->a_cred = cr;
1973 args->a_io = NFS4_READDIR;
1974 args->a_nfs4_readdir = readdir;
1975 args->a_nfs4_rdc = rdc;
1976
1977 mutex_enter(&mi->mi_async_lock);
1978
1979 /*
1980 * If asyncio has been disabled, then skip this request
1981 */
1982 if (mi->mi_max_threads == 0) {
1983 mutex_exit(&mi->mi_async_lock);
1984
1985 VN_RELE(vp);
1986 crfree(cr);
1987 kmem_free(args, sizeof (*args));
1988 goto noasync;
1989 }
1990
1991 /*
1992 * Link request structure into the async list and
1993 * wakeup async thread to do the i/o.
1994 */
1995 if (mi->mi_async_reqs[NFS4_READDIR] == NULL) {
1996 mi->mi_async_reqs[NFS4_READDIR] = args;
1997 mi->mi_async_tail[NFS4_READDIR] = args;
1998 } else {
1999 mi->mi_async_tail[NFS4_READDIR]->a_next = args;
2000 mi->mi_async_tail[NFS4_READDIR] = args;
2001 }
2002
2003 mutex_enter(&rp->r_statelock);
2004 rp->r_count++;
2005 mutex_exit(&rp->r_statelock);
2006
2007 if (mi->mi_io_kstats) {
2008 mutex_enter(&mi->mi_lock);
2009 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
2010 mutex_exit(&mi->mi_lock);
2011 }
2012
2013 mi->mi_async_req_count++;
2014 ASSERT(mi->mi_async_req_count != 0);
2015 cv_signal(&mi->mi_async_reqs_cv);
2016 mutex_exit(&mi->mi_async_lock);
2017 return;
2018
2019 noasync:
2020 mutex_enter(&rp->r_statelock);
2021 rdc->entries = NULL;
2022 /*
2023 * Indicate that no one is trying to fill this entry and
2024 * it still needs to be filled.
2025 */
2026 rdc->flags &= ~RDDIR;
2027 rdc->flags |= RDDIRREQ;
2028 rddir4_cache_rele(rp, rdc);
2029 mutex_exit(&rp->r_statelock);
2030 }
2031
2032 void
nfs4_async_commit(vnode_t * vp,page_t * plist,offset3 offset,count3 count,cred_t * cr,void (* commit)(vnode_t *,page_t *,offset3,count3,cred_t *))2033 nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
2034 cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3,
2035 cred_t *))
2036 {
2037 rnode4_t *rp;
2038 mntinfo4_t *mi;
2039 struct nfs4_async_reqs *args;
2040 page_t *pp;
2041
2042 rp = VTOR4(vp);
2043 mi = VTOMI4(vp);
2044
2045 /*
2046 * If we can't allocate a request structure, do the commit
2047 * operation synchronously in this thread's context.
2048 */
2049 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
2050 goto noasync;
2051
2052 args->a_next = NULL;
2053 #ifdef DEBUG
2054 args->a_queuer = curthread;
2055 #endif
2056 VN_HOLD(vp);
2057 args->a_vp = vp;
2058 ASSERT(cr != NULL);
2059 crhold(cr);
2060 args->a_cred = cr;
2061 args->a_io = NFS4_COMMIT;
2062 args->a_nfs4_commit = commit;
2063 args->a_nfs4_plist = plist;
2064 args->a_nfs4_offset = offset;
2065 args->a_nfs4_count = count;
2066
2067 mutex_enter(&mi->mi_async_lock);
2068
2069 /*
2070 * If asyncio has been disabled, then make a synchronous request.
2071 * This check is done a second time in case async io was diabled
2072 * while this thread was blocked waiting for memory pressure to
2073 * reduce or for the queue to drain.
2074 */
2075 if (mi->mi_max_threads == 0) {
2076 mutex_exit(&mi->mi_async_lock);
2077
2078 VN_RELE(vp);
2079 crfree(cr);
2080 kmem_free(args, sizeof (*args));
2081 goto noasync;
2082 }
2083
2084 /*
2085 * Link request structure into the async list and
2086 * wakeup async thread to do the i/o.
2087 */
2088 if (mi->mi_async_reqs[NFS4_COMMIT] == NULL) {
2089 mi->mi_async_reqs[NFS4_COMMIT] = args;
2090 mi->mi_async_tail[NFS4_COMMIT] = args;
2091 } else {
2092 mi->mi_async_tail[NFS4_COMMIT]->a_next = args;
2093 mi->mi_async_tail[NFS4_COMMIT] = args;
2094 }
2095
2096 mutex_enter(&rp->r_statelock);
2097 rp->r_count++;
2098 mutex_exit(&rp->r_statelock);
2099
2100 if (mi->mi_io_kstats) {
2101 mutex_enter(&mi->mi_lock);
2102 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
2103 mutex_exit(&mi->mi_lock);
2104 }
2105
2106 mi->mi_async_req_count++;
2107 ASSERT(mi->mi_async_req_count != 0);
2108 cv_signal(&mi->mi_async_reqs_cv);
2109 mutex_exit(&mi->mi_async_lock);
2110 return;
2111
2112 noasync:
2113 if (curproc == proc_pageout || curproc == proc_fsflush ||
2114 nfs_zone() != mi->mi_zone) {
2115 while (plist != NULL) {
2116 pp = plist;
2117 page_sub(&plist, pp);
2118 pp->p_fsdata = C_COMMIT;
2119 page_unlock(pp);
2120 }
2121 return;
2122 }
2123 (*commit)(vp, plist, offset, count, cr);
2124 }
2125
2126 /*
2127 * nfs4_async_inactive - hand off a VOP_INACTIVE call to a thread. The
2128 * reference to the vnode is handed over to the thread; the caller should
2129 * no longer refer to the vnode.
2130 *
2131 * Unlike most of the async routines, this handoff is needed for
2132 * correctness reasons, not just performance. So doing operations in the
2133 * context of the current thread is not an option.
2134 */
2135 void
nfs4_async_inactive(vnode_t * vp,cred_t * cr)2136 nfs4_async_inactive(vnode_t *vp, cred_t *cr)
2137 {
2138 mntinfo4_t *mi;
2139 struct nfs4_async_reqs *args;
2140 boolean_t signal_inactive_thread = B_FALSE;
2141
2142 mi = VTOMI4(vp);
2143
2144 args = kmem_alloc(sizeof (*args), KM_SLEEP);
2145 args->a_next = NULL;
2146 #ifdef DEBUG
2147 args->a_queuer = curthread;
2148 #endif
2149 args->a_vp = vp;
2150 ASSERT(cr != NULL);
2151 crhold(cr);
2152 args->a_cred = cr;
2153 args->a_io = NFS4_INACTIVE;
2154
2155 /*
2156 * Note that we don't check mi->mi_max_threads here, since we
2157 * *need* to get rid of this vnode regardless of whether someone
2158 * set nfs4_max_threads to zero in /etc/system.
2159 *
2160 * The manager thread knows about this and is willing to create
2161 * at least one thread to accommodate us.
2162 */
2163 mutex_enter(&mi->mi_async_lock);
2164 if (mi->mi_inactive_thread == NULL) {
2165 rnode4_t *rp;
2166 vnode_t *unldvp = NULL;
2167 char *unlname;
2168 cred_t *unlcred;
2169
2170 mutex_exit(&mi->mi_async_lock);
2171 /*
2172 * We just need to free up the memory associated with the
2173 * vnode, which can be safely done from within the current
2174 * context.
2175 */
2176 crfree(cr); /* drop our reference */
2177 kmem_free(args, sizeof (*args));
2178 rp = VTOR4(vp);
2179 mutex_enter(&rp->r_statelock);
2180 if (rp->r_unldvp != NULL) {
2181 unldvp = rp->r_unldvp;
2182 rp->r_unldvp = NULL;
2183 unlname = rp->r_unlname;
2184 rp->r_unlname = NULL;
2185 unlcred = rp->r_unlcred;
2186 rp->r_unlcred = NULL;
2187 }
2188 mutex_exit(&rp->r_statelock);
2189 /*
2190 * No need to explicitly throw away any cached pages. The
2191 * eventual r4inactive() will attempt a synchronous
2192 * VOP_PUTPAGE() which will immediately fail since the request
2193 * is coming from the wrong zone, and then will proceed to call
2194 * nfs4_invalidate_pages() which will clean things up for us.
2195 *
2196 * Throw away the delegation here so rp4_addfree()'s attempt to
2197 * return any existing delegations becomes a no-op.
2198 */
2199 if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
2200 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
2201 FALSE);
2202 (void) nfs4delegreturn(rp, NFS4_DR_DISCARD);
2203 nfs_rw_exit(&mi->mi_recovlock);
2204 }
2205 nfs4_clear_open_streams(rp);
2206
2207 rp4_addfree(rp, cr);
2208 if (unldvp != NULL) {
2209 kmem_free(unlname, MAXNAMELEN);
2210 VN_RELE(unldvp);
2211 crfree(unlcred);
2212 }
2213 return;
2214 }
2215
2216 if (mi->mi_manager_thread == NULL) {
2217 /*
2218 * We want to talk to the inactive thread.
2219 */
2220 signal_inactive_thread = B_TRUE;
2221 }
2222
2223 /*
2224 * Enqueue the vnode and wake up either the special thread (empty
2225 * list) or an async thread.
2226 */
2227 if (mi->mi_async_reqs[NFS4_INACTIVE] == NULL) {
2228 mi->mi_async_reqs[NFS4_INACTIVE] = args;
2229 mi->mi_async_tail[NFS4_INACTIVE] = args;
2230 signal_inactive_thread = B_TRUE;
2231 } else {
2232 mi->mi_async_tail[NFS4_INACTIVE]->a_next = args;
2233 mi->mi_async_tail[NFS4_INACTIVE] = args;
2234 }
2235 if (signal_inactive_thread) {
2236 cv_signal(&mi->mi_inact_req_cv);
2237 } else {
2238 mi->mi_async_req_count++;
2239 ASSERT(mi->mi_async_req_count != 0);
2240 cv_signal(&mi->mi_async_reqs_cv);
2241 }
2242
2243 mutex_exit(&mi->mi_async_lock);
2244 }
2245
/*
 * writerp4 - copy up to tcount bytes described by uio into the cached
 * pages of the file behind rp, one page-bounded chunk per iteration.
 *
 * rp		rnode being written; r_rwlock must be held as writer
 * base		address the data is copied to (used when !vpm_enable)
 * tcount	number of bytes to move; at most MAXBSIZE and uio_resid
 * uio		source of the data; uio_loffset supplies the file offset
 * pgcreated	nonzero when the caller already created the first page
 *
 * r_size is raised to cover the bytes written and R4DIRTY is set after
 * each chunk.  Returns 0 on success, otherwise the first error reported
 * by the copy routine or by segmap_fault().
 */
int
writerp4(rnode4_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated)
{
	int pagecreate;
	int n;
	int saved_n;
	caddr_t saved_base;
	u_offset_t offset;
	int error;
	int sm_error;
	vnode_t *vp = RTOV(rp);

	ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid);
	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER));
	if (!vpm_enable) {
		ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE);
	}

	/*
	 * Move bytes in at most PAGESIZE chunks.  We must avoid
	 * spanning pages in uiomove() because page faults may cause
	 * the cache to be invalidated out from under us.  The r_size is not
	 * updated until after the uiomove.  If we push the last page of a
	 * file before r_size is correct, we will lose the data written past
	 * the current (and invalid) r_size.
	 */
	do {
		offset = uio->uio_loffset;
		pagecreate = 0;

		/*
		 * n is the number of bytes required to satisfy the request
		 * or the number of bytes to fill out the page.
		 */
		n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount);

		/*
		 * Check to see if we can skip reading in the page
		 * and just allocate the memory.  We can do this
		 * if we are going to rewrite the entire mapping
		 * or if we are going to write to or beyond the current
		 * end of file from the beginning of the mapping.
		 *
		 * The read of r_size is now protected by r_statelock.
		 */
		mutex_enter(&rp->r_statelock);
		/*
		 * When pgcreated is nonzero the caller has already done
		 * a segmap_getmapflt with forcefault 0 and S_WRITE.  With
		 * segkpm this means we already have at least one page
		 * created and mapped at base.
		 */
		pagecreate = pgcreated ||
		    ((offset & PAGEOFFSET) == 0 &&
		    (n == PAGESIZE || ((offset + n) >= rp->r_size)));

		mutex_exit(&rp->r_statelock);

		if (!vpm_enable && pagecreate) {
			/*
			 * The last argument tells segmap_pagecreate() to
			 * always lock the page, as opposed to sometimes
			 * returning with the page locked.  This way we avoid a
			 * fault on the ensuing uiomove(), but also
			 * more importantly (to fix bug 1094402) we can
			 * call segmap_fault() to unlock the page in all
			 * cases.  An alternative would be to modify
			 * segmap_pagecreate() to tell us when it is
			 * locking a page, but that's a fairly major
			 * interface change.
			 */
			if (pgcreated == 0)
				(void) segmap_pagecreate(segkmap, base,
				    (uint_t)n, 1);
			/* Remembered for the F_SOFTUNLOCK call further down. */
			saved_base = base;
			saved_n = n;
		}

		/*
		 * The number of bytes of data in the last page cannot
		 * be accurately determined while the page is being
		 * uiomove'd to and the size of the file is being updated.
		 * Thus, inform threads which need to know accurately
		 * how much data is in the last page of the file.  They
		 * will not do the i/o immediately, but will arrange for
		 * the i/o to happen later when this modify operation
		 * will have finished.
		 */
		ASSERT(!(rp->r_flags & R4MODINPROGRESS));
		mutex_enter(&rp->r_statelock);
		rp->r_flags |= R4MODINPROGRESS;
		rp->r_modaddr = (offset & MAXBMASK);
		mutex_exit(&rp->r_statelock);

		if (vpm_enable) {
			/*
			 * Copy data.  If new pages are created, the part of
			 * the page that is not written will be initialized
			 * with zeros.
			 */
			error = vpm_data_copy(vp, offset, n, uio,
			    !pagecreate, NULL, 0, S_WRITE);
		} else {
			error = uiomove(base, n, UIO_WRITE, uio);
		}

		/*
		 * r_size is the maximum number of
		 * bytes known to be in the file.
		 * Make sure it is at least as high as the
		 * first unwritten byte pointed to by uio_loffset.
		 */
		mutex_enter(&rp->r_statelock);
		if (rp->r_size < uio->uio_loffset)
			rp->r_size = uio->uio_loffset;
		rp->r_flags &= ~R4MODINPROGRESS;
		rp->r_flags |= R4DIRTY;
		mutex_exit(&rp->r_statelock);

		/* n = # of bytes written */
		n = (int)(uio->uio_loffset - offset);

		if (!vpm_enable) {
			base += n;
		}

		tcount -= n;
		/*
		 * If we created pages w/o initializing them completely,
		 * we need to zero the part that wasn't set up.
		 * This happens in most EOF write cases and if
		 * we had some sort of error during the uiomove.
		 */
		if (!vpm_enable && pagecreate) {
			if ((uio->uio_loffset & PAGEOFFSET) || n == 0)
				(void) kzero(base, PAGESIZE - n);

			if (pgcreated) {
				/*
				 * Caller is responsible for this page,
				 * it was not created in this loop.
				 */
				pgcreated = 0;
			} else {
				/*
				 * For bug 1094402: segmap_pagecreate locks
				 * page.  Unlock it.  This also unlocks the
				 * pages allocated by page_create_va() in
				 * segmap_pagecreate().
				 */
				sm_error = segmap_fault(kas.a_hat, segkmap,
				    saved_base, saved_n,
				    F_SOFTUNLOCK, S_WRITE);
				/* An earlier copy error takes precedence. */
				if (error == 0)
					error = sm_error;
			}
		}
	} while (tcount > 0 && error == 0);

	return (error);
}
2407
2408 int
nfs4_putpages(vnode_t * vp,u_offset_t off,size_t len,int flags,cred_t * cr)2409 nfs4_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr)
2410 {
2411 rnode4_t *rp;
2412 page_t *pp;
2413 u_offset_t eoff;
2414 u_offset_t io_off;
2415 size_t io_len;
2416 int error;
2417 int rdirty;
2418 int err;
2419
2420 rp = VTOR4(vp);
2421 ASSERT(rp->r_count > 0);
2422
2423 if (!nfs4_has_pages(vp))
2424 return (0);
2425
2426 ASSERT(vp->v_type != VCHR);
2427
2428 /*
2429 * If R4OUTOFSPACE is set, then all writes turn into B_INVAL
2430 * writes. B_FORCE is set to force the VM system to actually
2431 * invalidate the pages, even if the i/o failed. The pages
2432 * need to get invalidated because they can't be written out
2433 * because there isn't any space left on either the server's
2434 * file system or in the user's disk quota. The B_FREE bit
2435 * is cleared to avoid confusion as to whether this is a
2436 * request to place the page on the freelist or to destroy
2437 * it.
2438 */
2439 if ((rp->r_flags & R4OUTOFSPACE) ||
2440 (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
2441 flags = (flags & ~B_FREE) | B_INVAL | B_FORCE;
2442
2443 if (len == 0) {
2444 /*
2445 * If doing a full file synchronous operation, then clear
2446 * the R4DIRTY bit. If a page gets dirtied while the flush
2447 * is happening, then R4DIRTY will get set again. The
2448 * R4DIRTY bit must get cleared before the flush so that
2449 * we don't lose this information.
2450 *
2451 * If there are no full file async write operations
2452 * pending and RDIRTY bit is set, clear it.
2453 */
2454 if (off == (u_offset_t)0 &&
2455 !(flags & B_ASYNC) &&
2456 (rp->r_flags & R4DIRTY)) {
2457 mutex_enter(&rp->r_statelock);
2458 rdirty = (rp->r_flags & R4DIRTY);
2459 rp->r_flags &= ~R4DIRTY;
2460 mutex_exit(&rp->r_statelock);
2461 } else if (flags & B_ASYNC && off == (u_offset_t)0) {
2462 mutex_enter(&rp->r_statelock);
2463 if (rp->r_flags & R4DIRTY && rp->r_awcount == 0) {
2464 rdirty = (rp->r_flags & R4DIRTY);
2465 rp->r_flags &= ~R4DIRTY;
2466 }
2467 mutex_exit(&rp->r_statelock);
2468 } else
2469 rdirty = 0;
2470
2471 /*
2472 * Search the entire vp list for pages >= off, and flush
2473 * the dirty pages.
2474 */
2475 error = pvn_vplist_dirty(vp, off, rp->r_putapage,
2476 flags, cr);
2477
2478 /*
2479 * If an error occurred and the file was marked as dirty
2480 * before and we aren't forcibly invalidating pages, then
2481 * reset the R4DIRTY flag.
2482 */
2483 if (error && rdirty &&
2484 (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) {
2485 mutex_enter(&rp->r_statelock);
2486 rp->r_flags |= R4DIRTY;
2487 mutex_exit(&rp->r_statelock);
2488 }
2489 } else {
2490 /*
2491 * Do a range from [off...off + len) looking for pages
2492 * to deal with.
2493 */
2494 error = 0;
2495 io_len = 0;
2496 eoff = off + len;
2497 mutex_enter(&rp->r_statelock);
2498 for (io_off = off; io_off < eoff && io_off < rp->r_size;
2499 io_off += io_len) {
2500 mutex_exit(&rp->r_statelock);
2501 /*
2502 * If we are not invalidating, synchronously
2503 * freeing or writing pages use the routine
2504 * page_lookup_nowait() to prevent reclaiming
2505 * them from the free list.
2506 */
2507 if ((flags & B_INVAL) || !(flags & B_ASYNC)) {
2508 pp = page_lookup(vp, io_off,
2509 (flags & (B_INVAL | B_FREE)) ?
2510 SE_EXCL : SE_SHARED);
2511 } else {
2512 pp = page_lookup_nowait(vp, io_off,
2513 (flags & B_FREE) ? SE_EXCL : SE_SHARED);
2514 }
2515
2516 if (pp == NULL || !pvn_getdirty(pp, flags))
2517 io_len = PAGESIZE;
2518 else {
2519 err = (*rp->r_putapage)(vp, pp, &io_off,
2520 &io_len, flags, cr);
2521 if (!error)
2522 error = err;
2523 /*
2524 * "io_off" and "io_len" are returned as
2525 * the range of pages we actually wrote.
2526 * This allows us to skip ahead more quickly
2527 * since several pages may've been dealt
2528 * with by this iteration of the loop.
2529 */
2530 }
2531 mutex_enter(&rp->r_statelock);
2532 }
2533 mutex_exit(&rp->r_statelock);
2534 }
2535
2536 return (error);
2537 }
2538
2539 void
nfs4_invalidate_pages(vnode_t * vp,u_offset_t off,cred_t * cr)2540 nfs4_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr)
2541 {
2542 rnode4_t *rp;
2543
2544 rp = VTOR4(vp);
2545 if (IS_SHADOW(vp, rp))
2546 vp = RTOV4(rp);
2547 mutex_enter(&rp->r_statelock);
2548 while (rp->r_flags & R4TRUNCATE)
2549 cv_wait(&rp->r_cv, &rp->r_statelock);
2550 rp->r_flags |= R4TRUNCATE;
2551 if (off == (u_offset_t)0) {
2552 rp->r_flags &= ~R4DIRTY;
2553 if (!(rp->r_flags & R4STALE))
2554 rp->r_error = 0;
2555 }
2556 rp->r_truncaddr = off;
2557 mutex_exit(&rp->r_statelock);
2558 (void) pvn_vplist_dirty(vp, off, rp->r_putapage,
2559 B_INVAL | B_TRUNC, cr);
2560 mutex_enter(&rp->r_statelock);
2561 rp->r_flags &= ~R4TRUNCATE;
2562 cv_broadcast(&rp->r_cv);
2563 mutex_exit(&rp->r_statelock);
2564 }
2565
2566 static int
nfs4_mnt_kstat_update(kstat_t * ksp,int rw)2567 nfs4_mnt_kstat_update(kstat_t *ksp, int rw)
2568 {
2569 mntinfo4_t *mi;
2570 struct mntinfo_kstat *mik;
2571 vfs_t *vfsp;
2572
2573 /* this is a read-only kstat. Bail out on a write */
2574 if (rw == KSTAT_WRITE)
2575 return (EACCES);
2576
2577
2578 /*
2579 * We don't want to wait here as kstat_chain_lock could be held by
2580 * dounmount(). dounmount() takes vfs_reflock before the chain lock
2581 * and thus could lead to a deadlock.
2582 */
2583 vfsp = (struct vfs *)ksp->ks_private;
2584
2585 mi = VFTOMI4(vfsp);
2586 mik = (struct mntinfo_kstat *)ksp->ks_data;
2587
2588 (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto);
2589
2590 mik->mik_vers = (uint32_t)mi->mi_vers;
2591 mik->mik_flags = mi->mi_flags;
2592 /*
2593 * The sv_secdata holds the flavor the client specifies.
2594 * If the client uses default and a security negotiation
2595 * occurs, sv_currsec will point to the current flavor
2596 * selected from the server flavor list.
2597 * sv_currsec is NULL if no security negotiation takes place.
2598 */
2599 mik->mik_secmod = mi->mi_curr_serv->sv_currsec ?
2600 mi->mi_curr_serv->sv_currsec->secmod :
2601 mi->mi_curr_serv->sv_secdata->secmod;
2602 mik->mik_curread = (uint32_t)mi->mi_curread;
2603 mik->mik_curwrite = (uint32_t)mi->mi_curwrite;
2604 mik->mik_retrans = mi->mi_retrans;
2605 mik->mik_timeo = mi->mi_timeo;
2606 mik->mik_acregmin = HR2SEC(mi->mi_acregmin);
2607 mik->mik_acregmax = HR2SEC(mi->mi_acregmax);
2608 mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin);
2609 mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax);
2610 mik->mik_noresponse = (uint32_t)mi->mi_noresponse;
2611 mik->mik_failover = (uint32_t)mi->mi_failover;
2612 mik->mik_remap = (uint32_t)mi->mi_remap;
2613
2614 (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname);
2615
2616 return (0);
2617 }
2618
2619 void
nfs4_mnt_kstat_init(struct vfs * vfsp)2620 nfs4_mnt_kstat_init(struct vfs *vfsp)
2621 {
2622 mntinfo4_t *mi = VFTOMI4(vfsp);
2623
2624 /*
2625 * PSARC 2001/697 Contract Private Interface
2626 * All nfs kstats are under SunMC contract
2627 * Please refer to the PSARC listed above and contact
2628 * SunMC before making any changes!
2629 *
2630 * Changes must be reviewed by Solaris File Sharing
2631 * Changes must be communicated to contract-2001-697@sun.com
2632 *
2633 */
2634
2635 mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev),
2636 NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id);
2637 if (mi->mi_io_kstats) {
2638 if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
2639 kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID);
2640 mi->mi_io_kstats->ks_lock = &mi->mi_lock;
2641 kstat_install(mi->mi_io_kstats);
2642 }
2643
2644 if ((mi->mi_ro_kstats = kstat_create_zone("nfs",
2645 getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW,
2646 sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) {
2647 if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
2648 kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID);
2649 mi->mi_ro_kstats->ks_update = nfs4_mnt_kstat_update;
2650 mi->mi_ro_kstats->ks_private = (void *)vfsp;
2651 kstat_install(mi->mi_ro_kstats);
2652 }
2653
2654 nfs4_mnt_recov_kstat_init(vfsp);
2655 }
2656
2657 void
nfs4_write_error(vnode_t * vp,int error,cred_t * cr)2658 nfs4_write_error(vnode_t *vp, int error, cred_t *cr)
2659 {
2660 mntinfo4_t *mi;
2661 clock_t now = ddi_get_lbolt();
2662
2663 mi = VTOMI4(vp);
2664 /*
2665 * In case of forced unmount, do not print any messages
2666 * since it can flood the console with error messages.
2667 */
2668 if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)
2669 return;
2670
2671 /*
2672 * If the mount point is dead, not recoverable, do not
2673 * print error messages that can flood the console.
2674 */
2675 if (mi->mi_flags & MI4_RECOV_FAIL)
2676 return;
2677
2678 /*
2679 * No use in flooding the console with ENOSPC
2680 * messages from the same file system.
2681 */
2682 if ((error != ENOSPC && error != EDQUOT) ||
2683 now - mi->mi_printftime > 0) {
2684 zoneid_t zoneid = mi->mi_zone->zone_id;
2685
2686 #ifdef DEBUG
2687 nfs_perror(error, "NFS%ld write error on host %s: %m.\n",
2688 mi->mi_vers, VTOR4(vp)->r_server->sv_hostname, NULL);
2689 #else
2690 nfs_perror(error, "NFS write error on host %s: %m.\n",
2691 VTOR4(vp)->r_server->sv_hostname, NULL);
2692 #endif
2693 if (error == ENOSPC || error == EDQUOT) {
2694 zcmn_err(zoneid, CE_CONT,
2695 "^File: userid=%d, groupid=%d\n",
2696 crgetuid(cr), crgetgid(cr));
2697 if (crgetuid(curthread->t_cred) != crgetuid(cr) ||
2698 crgetgid(curthread->t_cred) != crgetgid(cr)) {
2699 zcmn_err(zoneid, CE_CONT,
2700 "^User: userid=%d, groupid=%d\n",
2701 crgetuid(curthread->t_cred),
2702 crgetgid(curthread->t_cred));
2703 }
2704 mi->mi_printftime = now +
2705 nfs_write_error_interval * hz;
2706 }
2707 sfh4_printfhandle(VTOR4(vp)->r_fh);
2708 #ifdef DEBUG
2709 if (error == EACCES) {
2710 zcmn_err(zoneid, CE_CONT,
2711 "nfs_bio: cred is%s kcred\n",
2712 cr == kcred ? "" : " not");
2713 }
2714 #endif
2715 }
2716 }
2717
2718 /*
2719 * Return non-zero if the given file can be safely memory mapped. Locks
2720 * are safe if whole-file (length and offset are both zero).
2721 */
2722
2723 #define SAFE_LOCK(flk) ((flk).l_start == 0 && (flk).l_len == 0)
2724
2725 static int
nfs4_safemap(const vnode_t * vp)2726 nfs4_safemap(const vnode_t *vp)
2727 {
2728 locklist_t *llp, *next_llp;
2729 int safe = 1;
2730 rnode4_t *rp = VTOR4(vp);
2731
2732 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2733
2734 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: "
2735 "vp = %p", (void *)vp));
2736
2737 /*
2738 * Review all the locks for the vnode, both ones that have been
2739 * acquired and ones that are pending. We assume that
2740 * flk_active_locks_for_vp() has merged any locks that can be
2741 * merged (so that if a process has the entire file locked, it is
2742 * represented as a single lock).
2743 *
2744 * Note that we can't bail out of the loop if we find a non-safe
2745 * lock, because we have to free all the elements in the llp list.
2746 * We might be able to speed up this code slightly by not looking
2747 * at each lock's l_start and l_len fields once we've found a
2748 * non-safe lock.
2749 */
2750
2751 llp = flk_active_locks_for_vp(vp);
2752 while (llp) {
2753 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
2754 "nfs4_safemap: active lock (%" PRId64 ", %" PRId64 ")",
2755 llp->ll_flock.l_start, llp->ll_flock.l_len));
2756 if (!SAFE_LOCK(llp->ll_flock)) {
2757 safe = 0;
2758 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
2759 "nfs4_safemap: unsafe active lock (%" PRId64
2760 ", %" PRId64 ")", llp->ll_flock.l_start,
2761 llp->ll_flock.l_len));
2762 }
2763 next_llp = llp->ll_next;
2764 VN_RELE(llp->ll_vp);
2765 kmem_free(llp, sizeof (*llp));
2766 llp = next_llp;
2767 }
2768
2769 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: %s",
2770 safe ? "safe" : "unsafe"));
2771 return (safe);
2772 }
2773
2774 /*
2775 * Return whether there is a lost LOCK or LOCKU queued up for the given
2776 * file that would make an mmap request unsafe. cf. nfs4_safemap().
2777 */
2778
2779 bool_t
nfs4_map_lost_lock_conflict(vnode_t * vp)2780 nfs4_map_lost_lock_conflict(vnode_t *vp)
2781 {
2782 bool_t conflict = FALSE;
2783 nfs4_lost_rqst_t *lrp;
2784 mntinfo4_t *mi = VTOMI4(vp);
2785
2786 mutex_enter(&mi->mi_lock);
2787 for (lrp = list_head(&mi->mi_lost_state); lrp != NULL;
2788 lrp = list_next(&mi->mi_lost_state, lrp)) {
2789 if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU)
2790 continue;
2791 ASSERT(lrp->lr_vp != NULL);
2792 if (!VOP_CMP(lrp->lr_vp, vp, NULL))
2793 continue; /* different file */
2794 if (!SAFE_LOCK(*lrp->lr_flk)) {
2795 conflict = TRUE;
2796 break;
2797 }
2798 }
2799
2800 mutex_exit(&mi->mi_lock);
2801 return (conflict);
2802 }
2803
2804 /*
2805 * nfs_lockcompletion:
2806 *
2807 * If the vnode has a lock that makes it unsafe to cache the file, mark it
2808 * as non cachable (set VNOCACHE bit).
2809 */
2810
2811 void
nfs4_lockcompletion(vnode_t * vp,int cmd)2812 nfs4_lockcompletion(vnode_t *vp, int cmd)
2813 {
2814 rnode4_t *rp = VTOR4(vp);
2815
2816 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2817 ASSERT(!IS_SHADOW(vp, rp));
2818
2819 if (cmd == F_SETLK || cmd == F_SETLKW) {
2820
2821 if (!nfs4_safemap(vp)) {
2822 mutex_enter(&vp->v_lock);
2823 vp->v_flag |= VNOCACHE;
2824 mutex_exit(&vp->v_lock);
2825 } else {
2826 mutex_enter(&vp->v_lock);
2827 vp->v_flag &= ~VNOCACHE;
2828 mutex_exit(&vp->v_lock);
2829 }
2830 }
2831 /*
2832 * The cached attributes of the file are stale after acquiring
2833 * the lock on the file. They were updated when the file was
2834 * opened, but not updated when the lock was acquired. Therefore the
2835 * cached attributes are invalidated after the lock is obtained.
2836 */
2837 PURGE_ATTRCACHE4(vp);
2838 }
2839
2840 /* ARGSUSED */
2841 static void *
nfs4_mi_init(zoneid_t zoneid)2842 nfs4_mi_init(zoneid_t zoneid)
2843 {
2844 struct mi4_globals *mig;
2845
2846 mig = kmem_alloc(sizeof (*mig), KM_SLEEP);
2847 mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL);
2848 list_create(&mig->mig_list, sizeof (mntinfo4_t),
2849 offsetof(mntinfo4_t, mi_zone_node));
2850 mig->mig_destructor_called = B_FALSE;
2851 return (mig);
2852 }
2853
/*
 * Callback routine to tell all NFSv4 mounts in the zone to start tearing down
 * state and killing off threads.
 *
 * zoneid	zone being shut down
 * data		the zone's struct mi4_globals
 *
 * Each mount on the zone's list is processed and removed in turn; after
 * that every nfs4_server_t belonging to the zone is marked dead.
 */
/* ARGSUSED */
static void
nfs4_mi_shutdown(zoneid_t zoneid, void *data)
{
	struct mi4_globals *mig = data;
	mntinfo4_t *mi;
	nfs4_server_t *np;

	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
	    "nfs4_mi_shutdown zone %d\n", zoneid));
	ASSERT(mig != NULL);
	for (;;) {
		/*
		 * mig_lock is taken here and stays held until this mount
		 * has been removed from the list below.
		 */
		mutex_enter(&mig->mig_lock);
		mi = list_head(&mig->mig_list);
		if (mi == NULL) {
			mutex_exit(&mig->mig_lock);
			break;
		}

		NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
		    "nfs4_mi_shutdown stopping vfs %p\n", (void *)mi->mi_vfsp));
		/*
		 * purge the DNLC for this filesystem
		 */
		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
		/*
		 * Tell existing async worker threads to exit.
		 */
		mutex_enter(&mi->mi_async_lock);
		mi->mi_max_threads = 0;
		NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
		/*
		 * Set the appropriate flags, signal and wait for both the
		 * async manager and the inactive thread to exit when they're
		 * done with their current work.
		 */
		mutex_enter(&mi->mi_lock);
		mi->mi_flags |= (MI4_ASYNC_MGR_STOP|MI4_DEAD);
		mutex_exit(&mi->mi_lock);
		mutex_exit(&mi->mi_async_lock);
		if (mi->mi_manager_thread) {
			nfs4_async_manager_stop(mi->mi_vfsp);
		}
		if (mi->mi_inactive_thread) {
			mutex_enter(&mi->mi_async_lock);
			cv_signal(&mi->mi_inact_req_cv);
			/*
			 * Wait for the inactive thread to exit.
			 */
			while (mi->mi_inactive_thread != NULL) {
				cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
			}
			mutex_exit(&mi->mi_async_lock);
		}
		/*
		 * Wait for the recovery thread to complete, that is, it will
		 * signal when it is done using the "mi" structure and about
		 * to exit
		 */
		mutex_enter(&mi->mi_lock);
		while (mi->mi_in_recovery > 0)
			cv_wait(&mi->mi_cv_in_recov, &mi->mi_lock);
		mutex_exit(&mi->mi_lock);
		/*
		 * We're done when every mi has been done or the list is empty.
		 * This one is done, remove it from the list.
		 */
		list_remove(&mig->mig_list, mi);
		mutex_exit(&mig->mig_lock);
		zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFSV4);

		/*
		 * Release hold on vfs and mi done to prevent race with zone
		 * shutdown. This releases the hold in nfs4_mi_zonelist_add.
		 */
		VFS_RELE(mi->mi_vfsp);
		MI4_RELE(mi);
	}
	/*
	 * Tell each renew thread in the zone to exit
	 */
	mutex_enter(&nfs4_server_lst_lock);
	for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
		mutex_enter(&np->s_lock);
		if (np->zoneid == zoneid) {
			/*
			 * We add another hold onto the nfs4_server_t
			 * because this will make sure that the nfs4_server_t
			 * stays around until nfs4_callback_fini_zone destroys
			 * the zone. This way, the renew thread can
			 * unconditionally release its holds on the
			 * nfs4_server_t.
			 */
			np->s_refcnt++;
			nfs4_mark_srv_dead(np);
		}
		mutex_exit(&np->s_lock);
	}
	mutex_exit(&nfs4_server_lst_lock);
}
2958
2959 static void
nfs4_mi_free_globals(struct mi4_globals * mig)2960 nfs4_mi_free_globals(struct mi4_globals *mig)
2961 {
2962 list_destroy(&mig->mig_list); /* makes sure the list is empty */
2963 mutex_destroy(&mig->mig_lock);
2964 kmem_free(mig, sizeof (*mig));
2965 }
2966
2967 /* ARGSUSED */
2968 static void
nfs4_mi_destroy(zoneid_t zoneid,void * data)2969 nfs4_mi_destroy(zoneid_t zoneid, void *data)
2970 {
2971 struct mi4_globals *mig = data;
2972
2973 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2974 "nfs4_mi_destroy zone %d\n", zoneid));
2975 ASSERT(mig != NULL);
2976 mutex_enter(&mig->mig_lock);
2977 if (list_head(&mig->mig_list) != NULL) {
2978 /* Still waiting for VFS_FREEVFS() */
2979 mig->mig_destructor_called = B_TRUE;
2980 mutex_exit(&mig->mig_lock);
2981 return;
2982 }
2983 nfs4_mi_free_globals(mig);
2984 }
2985
2986 /*
2987 * Add an NFS mount to the per-zone list of NFS mounts.
2988 */
2989 void
nfs4_mi_zonelist_add(mntinfo4_t * mi)2990 nfs4_mi_zonelist_add(mntinfo4_t *mi)
2991 {
2992 struct mi4_globals *mig;
2993
2994 mig = zone_getspecific(mi4_list_key, mi->mi_zone);
2995 mutex_enter(&mig->mig_lock);
2996 list_insert_head(&mig->mig_list, mi);
2997 /*
2998 * hold added to eliminate race with zone shutdown -this will be
2999 * released in mi_shutdown
3000 */
3001 MI4_HOLD(mi);
3002 VFS_HOLD(mi->mi_vfsp);
3003 mutex_exit(&mig->mig_lock);
3004 }
3005
3006 /*
3007 * Remove an NFS mount from the per-zone list of NFS mounts.
3008 */
3009 int
nfs4_mi_zonelist_remove(mntinfo4_t * mi)3010 nfs4_mi_zonelist_remove(mntinfo4_t *mi)
3011 {
3012 struct mi4_globals *mig;
3013 int ret = 0;
3014
3015 mig = zone_getspecific(mi4_list_key, mi->mi_zone);
3016 mutex_enter(&mig->mig_lock);
3017 mutex_enter(&mi->mi_lock);
3018 /* if this mi is marked dead, then the zone already released it */
3019 if (!(mi->mi_flags & MI4_DEAD)) {
3020 list_remove(&mig->mig_list, mi);
3021 mutex_exit(&mi->mi_lock);
3022
3023 /* release the holds put on in zonelist_add(). */
3024 VFS_RELE(mi->mi_vfsp);
3025 MI4_RELE(mi);
3026 ret = 1;
3027 } else {
3028 mutex_exit(&mi->mi_lock);
3029 }
3030
3031 /*
3032 * We can be called asynchronously by VFS_FREEVFS() after the zone
3033 * shutdown/destroy callbacks have executed; if so, clean up the zone's
3034 * mi globals.
3035 */
3036 if (list_head(&mig->mig_list) == NULL &&
3037 mig->mig_destructor_called == B_TRUE) {
3038 nfs4_mi_free_globals(mig);
3039 return (ret);
3040 }
3041 mutex_exit(&mig->mig_lock);
3042 return (ret);
3043 }
3044
3045 void
nfs_free_mi4(mntinfo4_t * mi)3046 nfs_free_mi4(mntinfo4_t *mi)
3047 {
3048 nfs4_open_owner_t *foop;
3049 nfs4_oo_hash_bucket_t *bucketp;
3050 nfs4_debug_msg_t *msgp;
3051 int i;
3052 servinfo4_t *svp;
3053
3054 /*
3055 * Code introduced here should be carefully evaluated to make
3056 * sure none of the freed resources are accessed either directly
3057 * or indirectly after freeing them. For eg: Introducing calls to
3058 * NFS4_DEBUG that use mntinfo4_t structure member after freeing
3059 * the structure members or other routines calling back into NFS
3060 * accessing freed mntinfo4_t structure member.
3061 */
3062 mutex_enter(&mi->mi_lock);
3063 ASSERT(mi->mi_recovthread == NULL);
3064 ASSERT(mi->mi_flags & MI4_ASYNC_MGR_STOP);
3065 mutex_exit(&mi->mi_lock);
3066 mutex_enter(&mi->mi_async_lock);
3067 ASSERT(mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 &&
3068 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0);
3069 ASSERT(mi->mi_manager_thread == NULL);
3070 mutex_exit(&mi->mi_async_lock);
3071 if (mi->mi_io_kstats) {
3072 kstat_delete(mi->mi_io_kstats);
3073 mi->mi_io_kstats = NULL;
3074 }
3075 if (mi->mi_ro_kstats) {
3076 kstat_delete(mi->mi_ro_kstats);
3077 mi->mi_ro_kstats = NULL;
3078 }
3079 if (mi->mi_recov_ksp) {
3080 kstat_delete(mi->mi_recov_ksp);
3081 mi->mi_recov_ksp = NULL;
3082 }
3083 mutex_enter(&mi->mi_msg_list_lock);
3084 while (msgp = list_head(&mi->mi_msg_list)) {
3085 list_remove(&mi->mi_msg_list, msgp);
3086 nfs4_free_msg(msgp);
3087 }
3088 mutex_exit(&mi->mi_msg_list_lock);
3089 list_destroy(&mi->mi_msg_list);
3090 if (mi->mi_fname != NULL)
3091 fn_rele(&mi->mi_fname);
3092 if (mi->mi_rootfh != NULL)
3093 sfh4_rele(&mi->mi_rootfh);
3094 if (mi->mi_srvparentfh != NULL)
3095 sfh4_rele(&mi->mi_srvparentfh);
3096 svp = mi->mi_servers;
3097 sv4_free(svp);
3098 mutex_destroy(&mi->mi_lock);
3099 mutex_destroy(&mi->mi_async_lock);
3100 mutex_destroy(&mi->mi_msg_list_lock);
3101 mutex_destroy(&mi->mi_rnodes_lock);
3102 nfs_rw_destroy(&mi->mi_recovlock);
3103 nfs_rw_destroy(&mi->mi_rename_lock);
3104 nfs_rw_destroy(&mi->mi_fh_lock);
3105 cv_destroy(&mi->mi_failover_cv);
3106 cv_destroy(&mi->mi_async_reqs_cv);
3107 cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_QUEUE]);
3108 cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE]);
3109 cv_destroy(&mi->mi_async_cv);
3110 cv_destroy(&mi->mi_inact_req_cv);
3111 /*
3112 * Destroy the oo hash lists and mutexes for the cred hash table.
3113 */
3114 for (i = 0; i < NFS4_NUM_OO_BUCKETS; i++) {
3115 bucketp = &(mi->mi_oo_list[i]);
3116 /* Destroy any remaining open owners on the list */
3117 foop = list_head(&bucketp->b_oo_hash_list);
3118 while (foop != NULL) {
3119 list_remove(&bucketp->b_oo_hash_list, foop);
3120 nfs4_destroy_open_owner(foop);
3121 foop = list_head(&bucketp->b_oo_hash_list);
3122 }
3123 list_destroy(&bucketp->b_oo_hash_list);
3124 mutex_destroy(&bucketp->b_lock);
3125 }
3126 /*
3127 * Empty and destroy the freed open owner list.
3128 */
3129 foop = list_head(&mi->mi_foo_list);
3130 while (foop != NULL) {
3131 list_remove(&mi->mi_foo_list, foop);
3132 nfs4_destroy_open_owner(foop);
3133 foop = list_head(&mi->mi_foo_list);
3134 }
3135 list_destroy(&mi->mi_foo_list);
3136 list_destroy(&mi->mi_bseqid_list);
3137 list_destroy(&mi->mi_lost_state);
3138 list_destroy(&mi->mi_rnodes);
3139 avl_destroy(&mi->mi_filehandles);
3140 kmem_free(mi, sizeof (*mi));
3141 }
/*
 * Take a reference on the given mntinfo4_t by atomically incrementing
 * mi_count.  The reference is dropped again with mi_rele().
 */
void
mi_hold(mntinfo4_t *mi)
{
	atomic_inc_32(&mi->mi_count);
	/* The count must be non-zero once we hold a reference. */
	ASSERT(mi->mi_count != 0);
}
3148
3149 void
mi_rele(mntinfo4_t * mi)3150 mi_rele(mntinfo4_t *mi)
3151 {
3152 ASSERT(mi->mi_count != 0);
3153 if (atomic_dec_32_nv(&mi->mi_count) == 0) {
3154 nfs_free_mi4(mi);
3155 }
3156 }
3157
/*
 * The "notsupp" xattr cache vnode.  nfs4_clnt_init() runs vn_reinit() on
 * it so that its reference count starts at 1 and it never goes away.
 */
vnode_t nfs4_xattr_notsupp_vnode;
3159
/*
 * Initialize the NFSv4 client: bring up each client subsystem, register
 * the CPR (suspend/resume) callback, create the per-zone key for the
 * mntinfo4 list and set up the notsupp xattr cache vnode.
 * nfs4_clnt_fini() undoes this.
 */
void
nfs4_clnt_init(void)
{
	/*
	 * Subsystem initialization.  Return values of the routines that
	 * have one are deliberately discarded.
	 */
	nfs4_vnops_init();
	(void) nfs4_rnode_init();
	(void) nfs4_shadow_init();
	(void) nfs4_acache_init();
	(void) nfs4_subr_init();
	nfs4_acl_init();
	nfs_idmap_init();
	nfs4_callback_init();
	nfs4_secinfo_init();
#ifdef DEBUG
	/* Thread-specific data key, only present in DEBUG builds. */
	tsd_create(&nfs4_tsd_key, NULL);
#endif

	/*
	 * Add a CPR callback so that we can update client
	 * lease after a suspend and resume.  The returned id is kept in
	 * cid so that nfs4_clnt_fini() can remove the callback.
	 */
	cid = callb_add(nfs4_client_cpr_callb, 0, CB_CL_CPR_RPC, "nfs4");

	/* Per-zone hooks: nfs4_mi_init, nfs4_mi_shutdown, nfs4_mi_destroy. */
	zone_key_create(&mi4_list_key, nfs4_mi_init, nfs4_mi_shutdown,
	    nfs4_mi_destroy);

	/*
	 * Initialize the reference count of the notsupp xattr cache vnode to 1
	 * so that it never goes away (VOP_INACTIVE isn't called on it).
	 */
	vn_reinit(&nfs4_xattr_notsupp_vnode);
}
3191
/*
 * Tear down the NFSv4 client state set up by nfs4_clnt_init(): delete the
 * per-zone key, shut down the client subsystems and remove the CPR
 * callback if one was registered.
 */
void
nfs4_clnt_fini(void)
{
	/* The zone key is deleted first; its return value is ignored. */
	(void) zone_key_delete(mi4_list_key);
	nfs4_vnops_fini();
	(void) nfs4_rnode_fini();
	(void) nfs4_shadow_fini();
	(void) nfs4_acache_fini();
	(void) nfs4_subr_fini();
	nfs_idmap_fini();
	nfs4_callback_fini();
	nfs4_secinfo_fini();
#ifdef DEBUG
	/* Matches the tsd_create() done in DEBUG builds at init time. */
	tsd_destroy(&nfs4_tsd_key);
#endif
	/* cid is the id saved from callb_add(); only delete it if set. */
	if (cid)
		(void) callb_delete(cid);
}
3210
3211 /*ARGSUSED*/
3212 static boolean_t
nfs4_client_cpr_callb(void * arg,int code)3213 nfs4_client_cpr_callb(void *arg, int code)
3214 {
3215 /*
3216 * We get called for Suspend and Resume events.
3217 * For the suspend case we simply don't care!
3218 */
3219 if (code == CB_CODE_CPR_CHKPT) {
3220 return (B_TRUE);
3221 }
3222
3223 /*
3224 * When we get to here we are in the process of
3225 * resuming the system from a previous suspend.
3226 */
3227 nfs4_client_resumed = gethrestime_sec();
3228 return (B_TRUE);
3229 }
3230
/*
 * Lease renewal thread for the given nfs4_server_t.
 *
 * The thread loops with sp->s_lock held, sleeping on sp->cv_thread_exit
 * for a period derived from half of s_lease_time.  While there is no
 * state to protect (state_ref_count is zero) or the lease is not marked
 * NFS4_LEASE_VALID it only sleeps.  Otherwise, after each sleep it issues
 * a RENEW (nfs4renew()) if last_renewal_time did not change while it
 * slept, or if the system resumed from a suspend after the last renewal.
 *
 * The loop ends once s_thread_exit is NFS4_THREAD_EXIT.  The thread then
 * waits for s_otw_call_count to drain, drops two references on sp and
 * exits through zthread_exit(); it does not return.
 */
void
nfs4_renew_lease_thread(nfs4_server_t *sp)
{
	int error = 0;
	time_t tmp_last_renewal_time, tmp_time, tmp_now_time, kip_secs;
	clock_t tick_delay = 0;
	clock_t time_left = 0;
	callb_cpr_t cpr_info;
	kmutex_t cpr_lock;

	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
	    "nfs4_renew_lease_thread: acting on sp 0x%p", (void*)sp));
	/* cpr_lock protects cpr_info for the CALLB_CPR_* macros below. */
	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Lease");

	mutex_enter(&sp->s_lock);
	/* sp->s_lease_time is set via a GETATTR */
	sp->last_renewal_time = gethrestime_sec();
	sp->lease_valid = NFS4_LEASE_UNINITIALIZED;
	ASSERT(sp->s_refcnt >= 1);

	for (;;) {
		if (!sp->state_ref_count ||
		    sp->lease_valid != NFS4_LEASE_VALID) {

			/*
			 * Nothing to renew: sleep for half the lease time
			 * less three propagation delays, but at least one
			 * second.
			 */
			kip_secs = MAX((sp->s_lease_time >> 1) -
			    (3 * sp->propagation_delay.tv_sec), 1);

			tick_delay = SEC_TO_TICK(kip_secs);

			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
			    "nfs4_renew_lease_thread: no renew : thread "
			    "wait %ld secs", kip_secs));

			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
			    "nfs4_renew_lease_thread: no renew : "
			    "state_ref_count %d, lease_valid %d",
			    sp->state_ref_count, sp->lease_valid));

			/* Mark the thread CPR-safe for the duration of the wait. */
			mutex_enter(&cpr_lock);
			CALLB_CPR_SAFE_BEGIN(&cpr_info);
			mutex_exit(&cpr_lock);
			time_left = cv_reltimedwait(&sp->cv_thread_exit,
			    &sp->s_lock, tick_delay, TR_CLOCK_TICK);
			mutex_enter(&cpr_lock);
			CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
			mutex_exit(&cpr_lock);

			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
			    "nfs4_renew_lease_thread: no renew: "
			    "time left %ld", time_left));

			if (sp->s_thread_exit == NFS4_THREAD_EXIT)
				goto die;
			continue;
		}

		/* Snapshot used after the sleep to detect other renewals. */
		tmp_last_renewal_time = sp->last_renewal_time;

		/* Time already used up since the last renewal, plus slack. */
		tmp_time = gethrestime_sec() - sp->last_renewal_time +
		    (3 * sp->propagation_delay.tv_sec);

		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
		    "nfs4_renew_lease_thread: tmp_time %ld, "
		    "sp->last_renewal_time %ld", tmp_time,
		    sp->last_renewal_time));

		kip_secs = MAX((sp->s_lease_time >> 1) - tmp_time, 1);

		tick_delay = SEC_TO_TICK(kip_secs);

		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
		    "nfs4_renew_lease_thread: valid lease: sleep for %ld "
		    "secs", kip_secs));

		/* Mark the thread CPR-safe for the duration of the wait. */
		mutex_enter(&cpr_lock);
		CALLB_CPR_SAFE_BEGIN(&cpr_info);
		mutex_exit(&cpr_lock);
		time_left = cv_reltimedwait(&sp->cv_thread_exit, &sp->s_lock,
		    tick_delay, TR_CLOCK_TICK);
		mutex_enter(&cpr_lock);
		CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
		mutex_exit(&cpr_lock);

		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
		    "nfs4_renew_lease_thread: valid lease: time left %ld :"
		    "sp last_renewal_time %ld, nfs4_client_resumed %ld, "
		    "tmp_last_renewal_time %ld", time_left,
		    sp->last_renewal_time, nfs4_client_resumed,
		    tmp_last_renewal_time));

		if (sp->s_thread_exit == NFS4_THREAD_EXIT)
			goto die;

		if (tmp_last_renewal_time == sp->last_renewal_time ||
		    (nfs4_client_resumed != 0 &&
		    nfs4_client_resumed > sp->last_renewal_time)) {
			/*
			 * Issue RENEW op since we haven't renewed the lease
			 * since we slept.
			 */
			tmp_now_time = gethrestime_sec();
			error = nfs4renew(sp);
			/*
			 * Need to re-acquire sp's lock, nfs4renew()
			 * relinquishes it.
			 */
			mutex_enter(&sp->s_lock);

			/*
			 * See if someone changed s_thread_exit while we gave
			 * up s_lock.
			 */
			if (sp->s_thread_exit == NFS4_THREAD_EXIT)
				goto die;

			if (!error) {
				/*
				 * check to see if we implicitly renewed while
				 * we waited for a reply for our RENEW call.
				 */
				if (tmp_last_renewal_time ==
				    sp->last_renewal_time) {
					/* no implicit renew came */
					sp->last_renewal_time = tmp_now_time;
				} else {
					NFS4_DEBUG(nfs4_client_lease_debug,
					    (CE_NOTE, "renew_thread: did "
					    "implicit renewal before reply "
					    "from server for RENEW"));
				}
			} else {
				/* figure out error */
				NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
				    "renew_thread: nfs4renew returned error"
				    " %d", error));
			}

		}
	}

die:
	/* Reached with sp->s_lock held. */
	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
	    "nfs4_renew_lease_thread: thread exiting"));

	/* Wait for outstanding over-the-wire calls to finish. */
	while (sp->s_otw_call_count != 0) {
		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
		    "nfs4_renew_lease_thread: waiting for outstanding "
		    "otw calls to finish for sp 0x%p, current "
		    "s_otw_call_count %d", (void *)sp,
		    sp->s_otw_call_count));
		mutex_enter(&cpr_lock);
		CALLB_CPR_SAFE_BEGIN(&cpr_info);
		mutex_exit(&cpr_lock);
		cv_wait(&sp->s_cv_otw_count, &sp->s_lock);
		mutex_enter(&cpr_lock);
		CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
		mutex_exit(&cpr_lock);
	}
	mutex_exit(&sp->s_lock);

	nfs4_server_rele(sp);		/* free the thread's reference */
	nfs4_server_rele(sp);		/* free the list's reference */
	sp = NULL;

	/* NOTE(review): no goto in this function targets this label. */
done:
	mutex_enter(&cpr_lock);
	CALLB_CPR_EXIT(&cpr_info);	/* drops cpr_lock */
	mutex_destroy(&cpr_lock);

	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
	    "nfs4_renew_lease_thread: renew thread exit officially"));

	zthread_exit();
	/* NOT REACHED */
}
3407
3408 /*
3409 * Send out a RENEW op to the server.
3410 * Assumes sp is locked down.
3411 */
3412 static int
nfs4renew(nfs4_server_t * sp)3413 nfs4renew(nfs4_server_t *sp)
3414 {
3415 COMPOUND4args_clnt args;
3416 COMPOUND4res_clnt res;
3417 nfs_argop4 argop[1];
3418 int doqueue = 1;
3419 int rpc_error;
3420 cred_t *cr;
3421 mntinfo4_t *mi;
3422 timespec_t prop_time, after_time;
3423 int needrecov = FALSE;
3424 nfs4_recov_state_t recov_state;
3425 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3426
3427 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4renew"));
3428
3429 recov_state.rs_flags = 0;
3430 recov_state.rs_num_retry_despite_err = 0;
3431
3432 recov_retry:
3433 mi = sp->mntinfo4_list;
3434 VFS_HOLD(mi->mi_vfsp);
3435 mutex_exit(&sp->s_lock);
3436 ASSERT(mi != NULL);
3437
3438 e.error = nfs4_start_op(mi, NULL, NULL, &recov_state);
3439 if (e.error) {
3440 VFS_RELE(mi->mi_vfsp);
3441 return (e.error);
3442 }
3443
3444 /* Check to see if we're dealing with a marked-dead sp */
3445 mutex_enter(&sp->s_lock);
3446 if (sp->s_thread_exit == NFS4_THREAD_EXIT) {
3447 mutex_exit(&sp->s_lock);
3448 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3449 VFS_RELE(mi->mi_vfsp);
3450 return (0);
3451 }
3452
3453 /* Make sure mi hasn't changed on us */
3454 if (mi != sp->mntinfo4_list) {
3455 /* Must drop sp's lock to avoid a recursive mutex enter */
3456 mutex_exit(&sp->s_lock);
3457 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3458 VFS_RELE(mi->mi_vfsp);
3459 mutex_enter(&sp->s_lock);
3460 goto recov_retry;
3461 }
3462 mutex_exit(&sp->s_lock);
3463
3464 args.ctag = TAG_RENEW;
3465
3466 args.array_len = 1;
3467 args.array = argop;
3468
3469 argop[0].argop = OP_RENEW;
3470
3471 mutex_enter(&sp->s_lock);
3472 argop[0].nfs_argop4_u.oprenew.clientid = sp->clientid;
3473 cr = sp->s_cred;
3474 crhold(cr);
3475 mutex_exit(&sp->s_lock);
3476
3477 ASSERT(cr != NULL);
3478
3479 /* used to figure out RTT for sp */
3480 gethrestime(&prop_time);
3481
3482 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
3483 "nfs4renew: %s call, sp 0x%p", needrecov ? "recov" : "first",
3484 (void*)sp));
3485 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "before: %ld s %ld ns ",
3486 prop_time.tv_sec, prop_time.tv_nsec));
3487
3488 DTRACE_PROBE2(nfs4__renew__start, nfs4_server_t *, sp,
3489 mntinfo4_t *, mi);
3490
3491 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
3492 crfree(cr);
3493
3494 DTRACE_PROBE2(nfs4__renew__end, nfs4_server_t *, sp,
3495 mntinfo4_t *, mi);
3496
3497 gethrestime(&after_time);
3498
3499 mutex_enter(&sp->s_lock);
3500 sp->propagation_delay.tv_sec =
3501 MAX(1, after_time.tv_sec - prop_time.tv_sec);
3502 mutex_exit(&sp->s_lock);
3503
3504 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "after : %ld s %ld ns ",
3505 after_time.tv_sec, after_time.tv_nsec));
3506
3507 if (e.error == 0 && res.status == NFS4ERR_CB_PATH_DOWN) {
3508 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3509 nfs4_delegreturn_all(sp);
3510 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3511 VFS_RELE(mi->mi_vfsp);
3512 /*
3513 * If the server returns CB_PATH_DOWN, it has renewed
3514 * the lease and informed us that the callback path is
3515 * down. Since the lease is renewed, just return 0 and
3516 * let the renew thread proceed as normal.
3517 */
3518 return (0);
3519 }
3520
3521 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
3522 if (!needrecov && e.error) {
3523 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3524 VFS_RELE(mi->mi_vfsp);
3525 return (e.error);
3526 }
3527
3528 rpc_error = e.error;
3529
3530 if (needrecov) {
3531 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3532 "nfs4renew: initiating recovery\n"));
3533
3534 if (nfs4_start_recovery(&e, mi, NULL, NULL, NULL, NULL,
3535 OP_RENEW, NULL, NULL, NULL) == FALSE) {
3536 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3537 VFS_RELE(mi->mi_vfsp);
3538 if (!e.error)
3539 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3540 mutex_enter(&sp->s_lock);
3541 goto recov_retry;
3542 }
3543 /* fall through for res.status case */
3544 }
3545
3546 if (res.status) {
3547 if (res.status == NFS4ERR_LEASE_MOVED) {
3548 /*EMPTY*/
3549 /*
3550 * XXX need to try every mntinfo4 in sp->mntinfo4_list
3551 * to renew the lease on that server
3552 */
3553 }
3554 e.error = geterrno4(res.status);
3555 }
3556
3557 if (!rpc_error)
3558 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3559
3560 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3561
3562 VFS_RELE(mi->mi_vfsp);
3563
3564 return (e.error);
3565 }
3566
3567 void
nfs4_inc_state_ref_count(mntinfo4_t * mi)3568 nfs4_inc_state_ref_count(mntinfo4_t *mi)
3569 {
3570 nfs4_server_t *sp;
3571
3572 /* this locks down sp if it is found */
3573 sp = find_nfs4_server(mi);
3574
3575 if (sp != NULL) {
3576 nfs4_inc_state_ref_count_nolock(sp, mi);
3577 mutex_exit(&sp->s_lock);
3578 nfs4_server_rele(sp);
3579 }
3580 }
3581
3582 /*
3583 * Bump the number of OPEN files (ie: those with state) so we know if this
3584 * nfs4_server has any state to maintain a lease for or not.
3585 *
3586 * Also, marks the nfs4_server's lease valid if it hasn't been done so already.
3587 */
3588 void
nfs4_inc_state_ref_count_nolock(nfs4_server_t * sp,mntinfo4_t * mi)3589 nfs4_inc_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
3590 {
3591 ASSERT(mutex_owned(&sp->s_lock));
3592
3593 sp->state_ref_count++;
3594 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3595 "nfs4_inc_state_ref_count: state_ref_count now %d",
3596 sp->state_ref_count));
3597
3598 if (sp->lease_valid == NFS4_LEASE_UNINITIALIZED)
3599 sp->lease_valid = NFS4_LEASE_VALID;
3600
3601 /*
3602 * If this call caused the lease to be marked valid and/or
3603 * took the state_ref_count from 0 to 1, then start the time
3604 * on lease renewal.
3605 */
3606 if (sp->lease_valid == NFS4_LEASE_VALID && sp->state_ref_count == 1)
3607 sp->last_renewal_time = gethrestime_sec();
3608
3609 /* update the number of open files for mi */
3610 mi->mi_open_files++;
3611 }
3612
3613 void
nfs4_dec_state_ref_count(mntinfo4_t * mi)3614 nfs4_dec_state_ref_count(mntinfo4_t *mi)
3615 {
3616 nfs4_server_t *sp;
3617
3618 /* this locks down sp if it is found */
3619 sp = find_nfs4_server_all(mi, 1);
3620
3621 if (sp != NULL) {
3622 nfs4_dec_state_ref_count_nolock(sp, mi);
3623 mutex_exit(&sp->s_lock);
3624 nfs4_server_rele(sp);
3625 }
3626 }
3627
3628 /*
3629 * Decrement the number of OPEN files (ie: those with state) so we know if
3630 * this nfs4_server has any state to maintain a lease for or not.
3631 */
3632 void
nfs4_dec_state_ref_count_nolock(nfs4_server_t * sp,mntinfo4_t * mi)3633 nfs4_dec_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
3634 {
3635 ASSERT(mutex_owned(&sp->s_lock));
3636 ASSERT(sp->state_ref_count != 0);
3637 sp->state_ref_count--;
3638
3639 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3640 "nfs4_dec_state_ref_count: state ref count now %d",
3641 sp->state_ref_count));
3642
3643 mi->mi_open_files--;
3644 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3645 "nfs4_dec_state_ref_count: mi open files %d, v4 flags 0x%x",
3646 mi->mi_open_files, mi->mi_flags));
3647
3648 /* We don't have to hold the mi_lock to test mi_flags */
3649 if (mi->mi_open_files == 0 &&
3650 (mi->mi_flags & MI4_REMOVE_ON_LAST_CLOSE)) {
3651 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3652 "nfs4_dec_state_ref_count: remove mntinfo4 %p since "
3653 "we have closed the last open file", (void*)mi));
3654 nfs4_remove_mi_from_server(mi, sp);
3655 }
3656 }
3657
3658 bool_t
inlease(nfs4_server_t * sp)3659 inlease(nfs4_server_t *sp)
3660 {
3661 bool_t result;
3662
3663 ASSERT(mutex_owned(&sp->s_lock));
3664
3665 if (sp->lease_valid == NFS4_LEASE_VALID &&
3666 gethrestime_sec() < sp->last_renewal_time + sp->s_lease_time)
3667 result = TRUE;
3668 else
3669 result = FALSE;
3670
3671 return (result);
3672 }
3673
3674
3675 /*
3676 * Return non-zero if the given nfs4_server_t is going through recovery.
3677 */
3678
3679 int
nfs4_server_in_recovery(nfs4_server_t * sp)3680 nfs4_server_in_recovery(nfs4_server_t *sp)
3681 {
3682 return (nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER));
3683 }
3684
3685 /*
3686 * Compare two shared filehandle objects. Returns -1, 0, or +1, if the
3687 * first is less than, equal to, or greater than the second.
3688 */
3689
3690 int
sfh4cmp(const void * p1,const void * p2)3691 sfh4cmp(const void *p1, const void *p2)
3692 {
3693 const nfs4_sharedfh_t *sfh1 = (const nfs4_sharedfh_t *)p1;
3694 const nfs4_sharedfh_t *sfh2 = (const nfs4_sharedfh_t *)p2;
3695
3696 return (nfs4cmpfh(&sfh1->sfh_fh, &sfh2->sfh_fh));
3697 }
3698
3699 /*
3700 * Create a table for shared filehandle objects.
3701 */
3702
3703 void
sfh4_createtab(avl_tree_t * tab)3704 sfh4_createtab(avl_tree_t *tab)
3705 {
3706 avl_create(tab, sfh4cmp, sizeof (nfs4_sharedfh_t),
3707 offsetof(nfs4_sharedfh_t, sfh_tree));
3708 }
3709
3710 /*
3711 * Return a shared filehandle object for the given filehandle. The caller
3712 * is responsible for eventually calling sfh4_rele().
3713 */
3714
3715 nfs4_sharedfh_t *
sfh4_put(const nfs_fh4 * fh,mntinfo4_t * mi,nfs4_sharedfh_t * key)3716 sfh4_put(const nfs_fh4 *fh, mntinfo4_t *mi, nfs4_sharedfh_t *key)
3717 {
3718 nfs4_sharedfh_t *sfh, *nsfh;
3719 avl_index_t where;
3720 nfs4_sharedfh_t skey;
3721
3722 if (!key) {
3723 skey.sfh_fh = *fh;
3724 key = &skey;
3725 }
3726
3727 nsfh = kmem_alloc(sizeof (nfs4_sharedfh_t), KM_SLEEP);
3728 nsfh->sfh_fh.nfs_fh4_len = fh->nfs_fh4_len;
3729 /*
3730 * We allocate the largest possible filehandle size because it's
3731 * not that big, and it saves us from possibly having to resize the
3732 * buffer later.
3733 */
3734 nsfh->sfh_fh.nfs_fh4_val = kmem_alloc(NFS4_FHSIZE, KM_SLEEP);
3735 bcopy(fh->nfs_fh4_val, nsfh->sfh_fh.nfs_fh4_val, fh->nfs_fh4_len);
3736 mutex_init(&nsfh->sfh_lock, NULL, MUTEX_DEFAULT, NULL);
3737 nsfh->sfh_refcnt = 1;
3738 nsfh->sfh_flags = SFH4_IN_TREE;
3739 nsfh->sfh_mi = mi;
3740 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, "sfh4_get: new object (%p)",
3741 (void *)nsfh));
3742
3743 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3744 sfh = avl_find(&mi->mi_filehandles, key, &where);
3745 if (sfh != NULL) {
3746 mutex_enter(&sfh->sfh_lock);
3747 sfh->sfh_refcnt++;
3748 mutex_exit(&sfh->sfh_lock);
3749 nfs_rw_exit(&mi->mi_fh_lock);
3750 /* free our speculative allocs */
3751 kmem_free(nsfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
3752 kmem_free(nsfh, sizeof (nfs4_sharedfh_t));
3753 return (sfh);
3754 }
3755
3756 avl_insert(&mi->mi_filehandles, nsfh, where);
3757 nfs_rw_exit(&mi->mi_fh_lock);
3758
3759 return (nsfh);
3760 }
3761
3762 /*
3763 * Return a shared filehandle object for the given filehandle. The caller
3764 * is responsible for eventually calling sfh4_rele().
3765 */
3766
3767 nfs4_sharedfh_t *
sfh4_get(const nfs_fh4 * fh,mntinfo4_t * mi)3768 sfh4_get(const nfs_fh4 *fh, mntinfo4_t *mi)
3769 {
3770 nfs4_sharedfh_t *sfh;
3771 nfs4_sharedfh_t key;
3772
3773 ASSERT(fh->nfs_fh4_len <= NFS4_FHSIZE);
3774
3775 #ifdef DEBUG
3776 if (nfs4_sharedfh_debug) {
3777 nfs4_fhandle_t fhandle;
3778
3779 fhandle.fh_len = fh->nfs_fh4_len;
3780 bcopy(fh->nfs_fh4_val, fhandle.fh_buf, fhandle.fh_len);
3781 zcmn_err(mi->mi_zone->zone_id, CE_NOTE, "sfh4_get:");
3782 nfs4_printfhandle(&fhandle);
3783 }
3784 #endif
3785
3786 /*
3787 * If there's already an object for the given filehandle, bump the
3788 * reference count and return it. Otherwise, create a new object
3789 * and add it to the AVL tree.
3790 */
3791
3792 key.sfh_fh = *fh;
3793
3794 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
3795 sfh = avl_find(&mi->mi_filehandles, &key, NULL);
3796 if (sfh != NULL) {
3797 mutex_enter(&sfh->sfh_lock);
3798 sfh->sfh_refcnt++;
3799 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3800 "sfh4_get: found existing %p, new refcnt=%d",
3801 (void *)sfh, sfh->sfh_refcnt));
3802 mutex_exit(&sfh->sfh_lock);
3803 nfs_rw_exit(&mi->mi_fh_lock);
3804 return (sfh);
3805 }
3806 nfs_rw_exit(&mi->mi_fh_lock);
3807
3808 return (sfh4_put(fh, mi, &key));
3809 }
3810
3811 /*
3812 * Get a reference to the given shared filehandle object.
3813 */
3814
3815 void
sfh4_hold(nfs4_sharedfh_t * sfh)3816 sfh4_hold(nfs4_sharedfh_t *sfh)
3817 {
3818 ASSERT(sfh->sfh_refcnt > 0);
3819
3820 mutex_enter(&sfh->sfh_lock);
3821 sfh->sfh_refcnt++;
3822 NFS4_DEBUG(nfs4_sharedfh_debug,
3823 (CE_NOTE, "sfh4_hold %p, new refcnt=%d",
3824 (void *)sfh, sfh->sfh_refcnt));
3825 mutex_exit(&sfh->sfh_lock);
3826 }
3827
3828 /*
3829 * Release a reference to the given shared filehandle object and null out
3830 * the given pointer.
3831 */
3832
3833 void
sfh4_rele(nfs4_sharedfh_t ** sfhpp)3834 sfh4_rele(nfs4_sharedfh_t **sfhpp)
3835 {
3836 mntinfo4_t *mi;
3837 nfs4_sharedfh_t *sfh = *sfhpp;
3838
3839 ASSERT(sfh->sfh_refcnt > 0);
3840
3841 mutex_enter(&sfh->sfh_lock);
3842 if (sfh->sfh_refcnt > 1) {
3843 sfh->sfh_refcnt--;
3844 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3845 "sfh4_rele %p, new refcnt=%d",
3846 (void *)sfh, sfh->sfh_refcnt));
3847 mutex_exit(&sfh->sfh_lock);
3848 goto finish;
3849 }
3850 mutex_exit(&sfh->sfh_lock);
3851
3852 /*
3853 * Possibly the last reference, so get the lock for the table in
3854 * case it's time to remove the object from the table.
3855 */
3856 mi = sfh->sfh_mi;
3857 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3858 mutex_enter(&sfh->sfh_lock);
3859 sfh->sfh_refcnt--;
3860 if (sfh->sfh_refcnt > 0) {
3861 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3862 "sfh4_rele %p, new refcnt=%d",
3863 (void *)sfh, sfh->sfh_refcnt));
3864 mutex_exit(&sfh->sfh_lock);
3865 nfs_rw_exit(&mi->mi_fh_lock);
3866 goto finish;
3867 }
3868
3869 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3870 "sfh4_rele %p, last ref", (void *)sfh));
3871 if (sfh->sfh_flags & SFH4_IN_TREE) {
3872 avl_remove(&mi->mi_filehandles, sfh);
3873 sfh->sfh_flags &= ~SFH4_IN_TREE;
3874 }
3875 mutex_exit(&sfh->sfh_lock);
3876 nfs_rw_exit(&mi->mi_fh_lock);
3877 mutex_destroy(&sfh->sfh_lock);
3878 kmem_free(sfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
3879 kmem_free(sfh, sizeof (nfs4_sharedfh_t));
3880
3881 finish:
3882 *sfhpp = NULL;
3883 }
3884
3885 /*
3886 * Update the filehandle for the given shared filehandle object.
3887 */
3888
3889 int nfs4_warn_dupfh = 0; /* if set, always warn about dup fhs below */
3890
3891 void
sfh4_update(nfs4_sharedfh_t * sfh,const nfs_fh4 * newfh)3892 sfh4_update(nfs4_sharedfh_t *sfh, const nfs_fh4 *newfh)
3893 {
3894 mntinfo4_t *mi = sfh->sfh_mi;
3895 nfs4_sharedfh_t *dupsfh;
3896 avl_index_t where;
3897 nfs4_sharedfh_t key;
3898
3899 #ifdef DEBUG
3900 mutex_enter(&sfh->sfh_lock);
3901 ASSERT(sfh->sfh_refcnt > 0);
3902 mutex_exit(&sfh->sfh_lock);
3903 #endif
3904 ASSERT(newfh->nfs_fh4_len <= NFS4_FHSIZE);
3905
3906 /*
3907 * The basic plan is to remove the shared filehandle object from
3908 * the table, update it to have the new filehandle, then reinsert
3909 * it.
3910 */
3911
3912 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3913 mutex_enter(&sfh->sfh_lock);
3914 if (sfh->sfh_flags & SFH4_IN_TREE) {
3915 avl_remove(&mi->mi_filehandles, sfh);
3916 sfh->sfh_flags &= ~SFH4_IN_TREE;
3917 }
3918 mutex_exit(&sfh->sfh_lock);
3919 sfh->sfh_fh.nfs_fh4_len = newfh->nfs_fh4_len;
3920 bcopy(newfh->nfs_fh4_val, sfh->sfh_fh.nfs_fh4_val,
3921 sfh->sfh_fh.nfs_fh4_len);
3922
3923 /*
3924 * XXX If there is already a shared filehandle object with the new
3925 * filehandle, we're in trouble, because the rnode code assumes
3926 * that there is only one shared filehandle object for a given
3927 * filehandle. So issue a warning (for read-write mounts only)
3928 * and don't try to re-insert the given object into the table.
3929 * Hopefully the given object will quickly go away and everyone
3930 * will use the new object.
3931 */
3932 key.sfh_fh = *newfh;
3933 dupsfh = avl_find(&mi->mi_filehandles, &key, &where);
3934 if (dupsfh != NULL) {
3935 if (!(mi->mi_vfsp->vfs_flag & VFS_RDONLY) || nfs4_warn_dupfh) {
3936 zcmn_err(mi->mi_zone->zone_id, CE_WARN, "sfh4_update: "
3937 "duplicate filehandle detected");
3938 sfh4_printfhandle(dupsfh);
3939 }
3940 } else {
3941 avl_insert(&mi->mi_filehandles, sfh, where);
3942 mutex_enter(&sfh->sfh_lock);
3943 sfh->sfh_flags |= SFH4_IN_TREE;
3944 mutex_exit(&sfh->sfh_lock);
3945 }
3946 nfs_rw_exit(&mi->mi_fh_lock);
3947 }
3948
3949 /*
3950 * Copy out the current filehandle for the given shared filehandle object.
3951 */
3952
3953 void
sfh4_copyval(const nfs4_sharedfh_t * sfh,nfs4_fhandle_t * fhp)3954 sfh4_copyval(const nfs4_sharedfh_t *sfh, nfs4_fhandle_t *fhp)
3955 {
3956 mntinfo4_t *mi = sfh->sfh_mi;
3957
3958 ASSERT(sfh->sfh_refcnt > 0);
3959
3960 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
3961 fhp->fh_len = sfh->sfh_fh.nfs_fh4_len;
3962 ASSERT(fhp->fh_len <= NFS4_FHSIZE);
3963 bcopy(sfh->sfh_fh.nfs_fh4_val, fhp->fh_buf, fhp->fh_len);
3964 nfs_rw_exit(&mi->mi_fh_lock);
3965 }
3966
3967 /*
3968 * Print out the filehandle for the given shared filehandle object.
3969 */
3970
3971 void
sfh4_printfhandle(const nfs4_sharedfh_t * sfh)3972 sfh4_printfhandle(const nfs4_sharedfh_t *sfh)
3973 {
3974 nfs4_fhandle_t fhandle;
3975
3976 sfh4_copyval(sfh, &fhandle);
3977 nfs4_printfhandle(&fhandle);
3978 }
3979
3980 /*
3981 * Compare 2 fnames. Returns -1 if the first is "less" than the second, 0
3982 * if they're the same, +1 if the first is "greater" than the second. The
3983 * caller (or whoever's calling the AVL package) is responsible for
3984 * handling locking issues.
3985 */
3986
3987 static int
fncmp(const void * p1,const void * p2)3988 fncmp(const void *p1, const void *p2)
3989 {
3990 const nfs4_fname_t *f1 = p1;
3991 const nfs4_fname_t *f2 = p2;
3992 int res;
3993
3994 res = strcmp(f1->fn_name, f2->fn_name);
3995 /*
3996 * The AVL package wants +/-1, not arbitrary positive or negative
3997 * integers.
3998 */
3999 if (res > 0)
4000 res = 1;
4001 else if (res < 0)
4002 res = -1;
4003 return (res);
4004 }
4005
/*
 * Get or create an fname with the given name, as a child of the given
 * fname.  The caller is responsible for eventually releasing the reference
 * (fn_rele()).  parent may be NULL, in which case no lookup is done and a
 * new, parentless fname is always created.
 *
 *	parent	fname whose fn_children tree is searched/extended, or NULL
 *	name	single component name; copied into the new fname
 *	sfh	shared filehandle the fname must be associated with; a new
 *		fname takes its own hold on it
 *
 * Returns the fname with a reference held for the caller.
 */

nfs4_fname_t *
fn_get(nfs4_fname_t *parent, char *name, nfs4_sharedfh_t *sfh)
{
	nfs4_fname_t key;
	nfs4_fname_t *fnp;
	avl_index_t where;

	/* Only fn_name is filled in; that is all fncmp() looks at. */
	key.fn_name = name;

	/*
	 * If there's already an fname registered with the given name, bump
	 * its reference count and return it.  Otherwise, create a new one
	 * and add it to the parent's AVL tree.
	 *
	 * fname entries we are looking for should match both name
	 * and sfh stored in the fname.
	 */
again:
	if (parent != NULL) {
		mutex_enter(&parent->fn_lock);
		fnp = avl_find(&parent->fn_children, &key, &where);
		if (fnp != NULL) {
			/*
			 * This hold on fnp is released below later,
			 * in case this is not the fnp we want.
			 */
			fn_hold(fnp);

			if (fnp->fn_sfh == sfh) {
				/*
				 * We have found our entry.
				 * The hold taken above becomes the
				 * caller's reference.
				 */
				mutex_exit(&parent->fn_lock);
				return (fnp);
			}

			/*
			 * We have found an entry that has a mismatching
			 * fn_sfh. This could be a stale entry due to
			 * server side rename. We will remove this entry
			 * and make sure no such entries exist.
			 */
			mutex_exit(&parent->fn_lock);
			mutex_enter(&fnp->fn_lock);
			/* Recheck the parent now that fnp is locked. */
			if (fnp->fn_parent == parent) {
				/*
				 * Remove ourselves from parent's
				 * fn_children tree.
				 */
				mutex_enter(&parent->fn_lock);
				avl_remove(&parent->fn_children, fnp);
				mutex_exit(&parent->fn_lock);
				/* Drops fnp's hold on parent, NULLs fn_parent. */
				fn_rele(&fnp->fn_parent);
			}
			mutex_exit(&fnp->fn_lock);
			/* Drop the temporary hold and search again. */
			fn_rele(&fnp);
			goto again;
		}
		/*
		 * Not found: fall through to the creation code with
		 * parent->fn_lock still held.
		 */
	}

	fnp = kmem_alloc(sizeof (nfs4_fname_t), KM_SLEEP);
	mutex_init(&fnp->fn_lock, NULL, MUTEX_DEFAULT, NULL);
	fnp->fn_parent = parent;
	/* The child keeps a reference on its parent. */
	if (parent != NULL)
		fn_hold(parent);
	fnp->fn_len = strlen(name);
	ASSERT(fnp->fn_len < MAXNAMELEN);
	fnp->fn_name = kmem_alloc(fnp->fn_len + 1, KM_SLEEP);
	(void) strcpy(fnp->fn_name, name);
	/* This is the reference handed back to the caller. */
	fnp->fn_refcnt = 1;

	/*
	 * This hold on sfh is later released
	 * when we do the final fn_rele() on this fname.
	 */
	sfh4_hold(sfh);
	fnp->fn_sfh = sfh;

	avl_create(&fnp->fn_children, fncmp, sizeof (nfs4_fname_t),
	    offsetof(nfs4_fname_t, fn_tree));
	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
	    "fn_get %p:%s, a new nfs4_fname_t!",
	    (void *)fnp, fnp->fn_name));
	if (parent != NULL) {
		/*
		 * parent->fn_lock has been held since the avl_find() that
		 * produced `where'.
		 */
		avl_insert(&parent->fn_children, fnp, where);
		mutex_exit(&parent->fn_lock);
	}

	return (fnp);
}
4103
/*
 * Take a reference on the given fname by atomically incrementing
 * fn_refcnt.  The reference is dropped again with fn_rele().
 */
void
fn_hold(nfs4_fname_t *fnp)
{
	atomic_inc_32(&fnp->fn_refcnt);
	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
	    "fn_hold %p:%s, new refcnt=%d",
	    (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
}
4112
4113 /*
4114 * Decrement the reference count of the given fname, and destroy it if its
4115 * reference count goes to zero. Nulls out the given pointer.
4116 */
4117
4118 void
fn_rele(nfs4_fname_t ** fnpp)4119 fn_rele(nfs4_fname_t **fnpp)
4120 {
4121 nfs4_fname_t *parent;
4122 uint32_t newref;
4123 nfs4_fname_t *fnp;
4124
4125 recur:
4126 fnp = *fnpp;
4127 *fnpp = NULL;
4128
4129 mutex_enter(&fnp->fn_lock);
4130 parent = fnp->fn_parent;
4131 if (parent != NULL)
4132 mutex_enter(&parent->fn_lock); /* prevent new references */
4133 newref = atomic_dec_32_nv(&fnp->fn_refcnt);
4134 if (newref > 0) {
4135 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4136 "fn_rele %p:%s, new refcnt=%d",
4137 (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
4138 if (parent != NULL)
4139 mutex_exit(&parent->fn_lock);
4140 mutex_exit(&fnp->fn_lock);
4141 return;
4142 }
4143
4144 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4145 "fn_rele %p:%s, last reference, deleting...",
4146 (void *)fnp, fnp->fn_name));
4147 if (parent != NULL) {
4148 avl_remove(&parent->fn_children, fnp);
4149 mutex_exit(&parent->fn_lock);
4150 }
4151 kmem_free(fnp->fn_name, fnp->fn_len + 1);
4152 sfh4_rele(&fnp->fn_sfh);
4153 mutex_destroy(&fnp->fn_lock);
4154 avl_destroy(&fnp->fn_children);
4155 kmem_free(fnp, sizeof (nfs4_fname_t));
4156 /*
4157 * Recursivly fn_rele the parent.
4158 * Use goto instead of a recursive call to avoid stack overflow.
4159 */
4160 if (parent != NULL) {
4161 fnpp = &parent;
4162 goto recur;
4163 }
4164 }
4165
4166 /*
4167 * Returns the single component name of the given fname, in a MAXNAMELEN
4168 * string buffer, which the caller is responsible for freeing. Note that
4169 * the name may become invalid as a result of fn_move().
4170 */
4171
4172 char *
fn_name(nfs4_fname_t * fnp)4173 fn_name(nfs4_fname_t *fnp)
4174 {
4175 char *name;
4176
4177 ASSERT(fnp->fn_len < MAXNAMELEN);
4178 name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
4179 mutex_enter(&fnp->fn_lock);
4180 (void) strcpy(name, fnp->fn_name);
4181 mutex_exit(&fnp->fn_lock);
4182
4183 return (name);
4184 }
4185
4186
4187 /*
4188 * fn_path_realloc
4189 *
4190 * This function, used only by fn_path, constructs
4191 * a new string which looks like "prepend" + "/" + "current".
4192 * by allocating a new string and freeing the old one.
4193 */
4194 static void
fn_path_realloc(char ** curses,char * prepend)4195 fn_path_realloc(char **curses, char *prepend)
4196 {
4197 int len, curlen = 0;
4198 char *news;
4199
4200 if (*curses == NULL) {
4201 /*
4202 * Prime the pump, allocate just the
4203 * space for prepend and return that.
4204 */
4205 len = strlen(prepend) + 1;
4206 news = kmem_alloc(len, KM_SLEEP);
4207 (void) strncpy(news, prepend, len);
4208 } else {
4209 /*
4210 * Allocate the space for a new string
4211 * +1 +1 is for the "/" and the NULL
4212 * byte at the end of it all.
4213 */
4214 curlen = strlen(*curses);
4215 len = curlen + strlen(prepend) + 1 + 1;
4216 news = kmem_alloc(len, KM_SLEEP);
4217 (void) strncpy(news, prepend, len);
4218 (void) strcat(news, "/");
4219 (void) strcat(news, *curses);
4220 kmem_free(*curses, curlen + 1);
4221 }
4222 *curses = news;
4223 }
4224
4225 /*
4226 * Returns the path name (starting from the fs root) for the given fname.
4227 * The caller is responsible for freeing. Note that the path may be or
4228 * become invalid as a result of fn_move().
4229 */
4230
4231 char *
fn_path(nfs4_fname_t * fnp)4232 fn_path(nfs4_fname_t *fnp)
4233 {
4234 char *path;
4235 nfs4_fname_t *nextfnp;
4236
4237 if (fnp == NULL)
4238 return (NULL);
4239
4240 path = NULL;
4241
4242 /* walk up the tree constructing the pathname. */
4243
4244 fn_hold(fnp); /* adjust for later rele */
4245 do {
4246 mutex_enter(&fnp->fn_lock);
4247 /*
4248 * Add fn_name in front of the current path
4249 */
4250 fn_path_realloc(&path, fnp->fn_name);
4251 nextfnp = fnp->fn_parent;
4252 if (nextfnp != NULL)
4253 fn_hold(nextfnp);
4254 mutex_exit(&fnp->fn_lock);
4255 fn_rele(&fnp);
4256 fnp = nextfnp;
4257 } while (fnp != NULL);
4258
4259 return (path);
4260 }
4261
4262 /*
4263 * Return a reference to the parent of the given fname, which the caller is
4264 * responsible for eventually releasing.
4265 */
4266
4267 nfs4_fname_t *
fn_parent(nfs4_fname_t * fnp)4268 fn_parent(nfs4_fname_t *fnp)
4269 {
4270 nfs4_fname_t *parent;
4271
4272 mutex_enter(&fnp->fn_lock);
4273 parent = fnp->fn_parent;
4274 if (parent != NULL)
4275 fn_hold(parent);
4276 mutex_exit(&fnp->fn_lock);
4277
4278 return (parent);
4279 }
4280
4281 /*
4282 * Update fnp so that its parent is newparent and its name is newname.
4283 */
4284
4285 void
fn_move(nfs4_fname_t * fnp,nfs4_fname_t * newparent,char * newname)4286 fn_move(nfs4_fname_t *fnp, nfs4_fname_t *newparent, char *newname)
4287 {
4288 nfs4_fname_t *parent, *tmpfnp;
4289 ssize_t newlen;
4290 nfs4_fname_t key;
4291 avl_index_t where;
4292
4293 /*
4294 * This assert exists to catch the client trying to rename
4295 * a dir to be a child of itself. This happened at a recent
4296 * bakeoff against a 3rd party (broken) server which allowed
4297 * the rename to succeed. If it trips it means that:
4298 * a) the code in nfs4rename that detects this case is broken
4299 * b) the server is broken (since it allowed the bogus rename)
4300 *
4301 * For non-DEBUG kernels, prepare for a recursive mutex_enter
4302 * panic below from: mutex_enter(&newparent->fn_lock);
4303 */
4304 ASSERT(fnp != newparent);
4305
4306 /*
4307 * Remove fnp from its current parent, change its name, then add it
4308 * to newparent. It might happen that fnp was replaced by another
4309 * nfs4_fname_t with the same fn_name in parent->fn_children.
4310 * In such case, fnp->fn_parent is NULL and we skip the removal
4311 * of fnp from its current parent.
4312 */
4313 mutex_enter(&fnp->fn_lock);
4314 parent = fnp->fn_parent;
4315 if (parent != NULL) {
4316 mutex_enter(&parent->fn_lock);
4317 avl_remove(&parent->fn_children, fnp);
4318 mutex_exit(&parent->fn_lock);
4319 fn_rele(&fnp->fn_parent);
4320 }
4321
4322 newlen = strlen(newname);
4323 if (newlen != fnp->fn_len) {
4324 ASSERT(newlen < MAXNAMELEN);
4325 kmem_free(fnp->fn_name, fnp->fn_len + 1);
4326 fnp->fn_name = kmem_alloc(newlen + 1, KM_SLEEP);
4327 fnp->fn_len = newlen;
4328 }
4329 (void) strcpy(fnp->fn_name, newname);
4330
4331 again:
4332 mutex_enter(&newparent->fn_lock);
4333 key.fn_name = fnp->fn_name;
4334 tmpfnp = avl_find(&newparent->fn_children, &key, &where);
4335 if (tmpfnp != NULL) {
4336 /*
4337 * This could be due to a file that was unlinked while
4338 * open, or perhaps the rnode is in the free list. Remove
4339 * it from newparent and let it go away on its own. The
4340 * contorted code is to deal with lock order issues and
4341 * race conditions.
4342 */
4343 fn_hold(tmpfnp);
4344 mutex_exit(&newparent->fn_lock);
4345 mutex_enter(&tmpfnp->fn_lock);
4346 if (tmpfnp->fn_parent == newparent) {
4347 mutex_enter(&newparent->fn_lock);
4348 avl_remove(&newparent->fn_children, tmpfnp);
4349 mutex_exit(&newparent->fn_lock);
4350 fn_rele(&tmpfnp->fn_parent);
4351 }
4352 mutex_exit(&tmpfnp->fn_lock);
4353 fn_rele(&tmpfnp);
4354 goto again;
4355 }
4356 fnp->fn_parent = newparent;
4357 fn_hold(newparent);
4358 avl_insert(&newparent->fn_children, fnp, where);
4359 mutex_exit(&newparent->fn_lock);
4360 mutex_exit(&fnp->fn_lock);
4361 }
4362
#ifdef DEBUG
/*
 * Check that the vnode's type agrees with the type recorded in its rnode's
 * cached attributes.  The check is only made when nfs4_vtype_debug is set
 * and neither type is VNON.  Returns non-zero (always 1) if the type
 * information makes sense; otherwise panics.
 */
int
nfs4_consistent_type(vnode_t *vp)
{
	rnode4_t *rp = VTOR4(vp);

	/* Checking disabled. */
	if (!nfs4_vtype_debug)
		return (1);

	/* A type of VNON on either side is not treated as a mismatch. */
	if (vp->v_type == VNON || rp->r_attr.va_type == VNON)
		return (1);

	if (vp->v_type != rp->r_attr.va_type) {
		cmn_err(CE_PANIC, "vnode %p type mismatch; v_type=%d, "
		    "rnode attr type=%d", (void *)vp, vp->v_type,
		    rp->r_attr.va_type);
	}

	return (1);
}
#endif /* DEBUG */
4383