xref: /netbsd/sys/kern/vfs_trans.c (revision 4dedf99c)
1 /*	$NetBSD: vfs_trans.c,v 1.70 2022/11/04 11:20:39 hannken Exp $	*/
2 
3 /*-
4  * Copyright (c) 2007, 2020 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Juergen Hannken-Illjes.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD: vfs_trans.c,v 1.70 2022/11/04 11:20:39 hannken Exp $");
34 
35 /*
36  * File system transaction operations.
37  */
38 
39 #ifdef _KERNEL_OPT
40 #include "opt_ddb.h"
41 #endif
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/atomic.h>
46 #include <sys/buf.h>
47 #include <sys/hash.h>
48 #include <sys/kmem.h>
49 #include <sys/mount.h>
50 #include <sys/pserialize.h>
51 #include <sys/vnode.h>
52 #include <sys/fstrans.h>
53 #include <sys/proc.h>
54 #include <sys/pool.h>
55 
56 #include <miscfs/deadfs/deadfs.h>
57 #include <miscfs/specfs/specdev.h>
58 
/* Size handed to hashinit() for the mount info hash table. */
#define FSTRANS_MOUNT_HASHSIZE	32

/*
 * Kind of transaction lock an lwp holds or asks for.  grant_lock()
 * decides in which file system states each kind is granted.
 */
enum fstrans_lock_type {
	FSTRANS_LAZY = 0,		/* Granted while not suspended */
	FSTRANS_SHARED = 1		/* Granted while not suspending */
};
65 
66 struct fscow_handler {
67 	LIST_ENTRY(fscow_handler) ch_list;
68 	int (*ch_func)(void *, struct buf *, bool);
69 	void *ch_arg;
70 };
71 struct fstrans_lwp_info {
72 	struct fstrans_lwp_info *fli_succ;
73 	struct lwp *fli_self;
74 	struct mount *fli_mount;
75 	struct fstrans_lwp_info *fli_alias;
76 	struct fstrans_mount_info *fli_mountinfo;
77 	int fli_trans_cnt;
78 	int fli_alias_cnt;
79 	int fli_cow_cnt;
80 	enum fstrans_lock_type fli_lock_type;
81 	LIST_ENTRY(fstrans_lwp_info) fli_list;
82 };
83 struct fstrans_mount_info {
84 	enum fstrans_state fmi_state;
85 	unsigned int fmi_ref_cnt;
86 	bool fmi_gone;
87 	bool fmi_cow_change;
88 	SLIST_ENTRY(fstrans_mount_info) fmi_hash;
89 	LIST_HEAD(, fscow_handler) fmi_cow_handler;
90 	struct mount *fmi_mount;
91 	struct fstrans_mount_info *fmi_lower_info;
92 	struct lwp *fmi_owner;
93 };
94 SLIST_HEAD(fstrans_mount_hashhead, fstrans_mount_info);
95 
96 static kmutex_t vfs_suspend_lock	/* Serialize suspensions. */
97     __cacheline_aligned;
98 static kmutex_t fstrans_lock		/* Fstrans big lock. */
99     __cacheline_aligned;
100 static kcondvar_t fstrans_state_cv;	/* Fstrans or cow state changed. */
101 static kcondvar_t fstrans_count_cv;	/* Fstrans or cow count changed. */
102 static pserialize_t fstrans_psz;	/* Pserialize state. */
103 static LIST_HEAD(fstrans_lwp_head, fstrans_lwp_info) fstrans_fli_head;
104 					/* List of all fstrans_lwp_info. */
105 static pool_cache_t fstrans_lwp_cache;	/* Cache of fstrans_lwp_info. */
106 
107 static u_long fstrans_mount_hashmask;
108 static struct fstrans_mount_hashhead *fstrans_mount_hashtab;
109 static int fstrans_gone_count;		/* Number of fstrans_mount_info gone. */
110 
111 static inline uint32_t fstrans_mount_hash(struct mount *);
112 static inline struct fstrans_mount_info *fstrans_mount_get(struct mount *);
113 static void fstrans_mount_dtor(struct fstrans_mount_info *);
114 static void fstrans_clear_lwp_info(void);
115 static inline struct fstrans_lwp_info *
116     fstrans_get_lwp_info(struct mount *, bool);
117 static struct fstrans_lwp_info *fstrans_alloc_lwp_info(struct mount *);
118 static int fstrans_lwp_pcc(void *, void *, int);
119 static void fstrans_lwp_pcd(void *, void *);
120 static inline int _fstrans_start(struct mount *, enum fstrans_lock_type, int);
121 static bool grant_lock(const struct fstrans_mount_info *,
122     const enum fstrans_lock_type);
123 static bool state_change_done(const struct fstrans_mount_info *);
124 static bool cow_state_change_done(const struct fstrans_mount_info *);
125 static void cow_change_enter(struct fstrans_mount_info *);
126 static void cow_change_done(struct fstrans_mount_info *);
127 
128 /*
129  * Initialize.
130  */
131 void
fstrans_init(void)132 fstrans_init(void)
133 {
134 
135 	mutex_init(&vfs_suspend_lock, MUTEX_DEFAULT, IPL_NONE);
136 	mutex_init(&fstrans_lock, MUTEX_DEFAULT, IPL_NONE);
137 	cv_init(&fstrans_state_cv, "fstchg");
138 	cv_init(&fstrans_count_cv, "fstcnt");
139 	fstrans_psz = pserialize_create();
140 	LIST_INIT(&fstrans_fli_head);
141 	fstrans_lwp_cache = pool_cache_init(sizeof(struct fstrans_lwp_info),
142 	    coherency_unit, 0, 0, "fstlwp", NULL, IPL_NONE,
143 	    fstrans_lwp_pcc, fstrans_lwp_pcd, NULL);
144 	KASSERT(fstrans_lwp_cache != NULL);
145 	fstrans_mount_hashtab = hashinit(FSTRANS_MOUNT_HASHSIZE, HASH_SLIST,
146 	    true, &fstrans_mount_hashmask);
147 }
148 
149 /*
150  * pool_cache constructor for fstrans_lwp_info.  Updating the global list
151  * produces cache misses on MP.  Minimise by keeping free entries on list.
152  */
153 int
fstrans_lwp_pcc(void * arg,void * obj,int flags)154 fstrans_lwp_pcc(void *arg, void *obj, int flags)
155 {
156 	struct fstrans_lwp_info *fli = obj;
157 
158 	memset(fli, 0, sizeof(*fli));
159 
160 	mutex_enter(&fstrans_lock);
161 	LIST_INSERT_HEAD(&fstrans_fli_head, fli, fli_list);
162 	mutex_exit(&fstrans_lock);
163 
164 	return 0;
165 }
166 
167 /*
168  * pool_cache destructor
169  */
170 void
fstrans_lwp_pcd(void * arg,void * obj)171 fstrans_lwp_pcd(void *arg, void *obj)
172 {
173 	struct fstrans_lwp_info *fli = obj;
174 
175 	mutex_enter(&fstrans_lock);
176 	LIST_REMOVE(fli, fli_list);
177 	mutex_exit(&fstrans_lock);
178 }
179 
180 /*
181  * Deallocate lwp state.
182  */
183 void
fstrans_lwp_dtor(lwp_t * l)184 fstrans_lwp_dtor(lwp_t *l)
185 {
186 	struct fstrans_lwp_info *fli, *fli_next;
187 
188 	if (l->l_fstrans == NULL)
189 		return;
190 
191 	mutex_enter(&fstrans_lock);
192 	for (fli = l->l_fstrans; fli; fli = fli_next) {
193 		KASSERT(fli->fli_trans_cnt == 0);
194 		KASSERT(fli->fli_cow_cnt == 0);
195 		KASSERT(fli->fli_self == l);
196 		if (fli->fli_mount != NULL)
197 			fstrans_mount_dtor(fli->fli_mountinfo);
198 		fli_next = fli->fli_succ;
199 		fli->fli_alias_cnt = 0;
200 		fli->fli_mount = NULL;
201 		fli->fli_alias = NULL;
202 		fli->fli_mountinfo = NULL;
203 		fli->fli_self = NULL;
204 	}
205 	mutex_exit(&fstrans_lock);
206 
207 	for (fli = l->l_fstrans; fli; fli = fli_next) {
208 		fli_next = fli->fli_succ;
209 		pool_cache_put(fstrans_lwp_cache, fli);
210 	}
211 	l->l_fstrans = NULL;
212 }
213 
214 /*
215  * mount pointer to hash
216  */
217 static inline uint32_t
fstrans_mount_hash(struct mount * mp)218 fstrans_mount_hash(struct mount *mp)
219 {
220 
221 	return hash32_buf(&mp, sizeof(mp), HASH32_BUF_INIT) &
222 	    fstrans_mount_hashmask;
223 }
224 
225 /*
226  * retrieve fstrans_mount_info by mount or NULL
227  */
228 static inline struct fstrans_mount_info *
fstrans_mount_get(struct mount * mp)229 fstrans_mount_get(struct mount *mp)
230 {
231 	uint32_t indx;
232 	struct fstrans_mount_info *fmi, *fmi_lower;
233 
234 	KASSERT(mutex_owned(&fstrans_lock));
235 
236 	indx = fstrans_mount_hash(mp);
237 	SLIST_FOREACH(fmi, &fstrans_mount_hashtab[indx], fmi_hash) {
238 		if (fmi->fmi_mount == mp) {
239 			if (__predict_false(mp->mnt_lower != NULL &&
240 			    fmi->fmi_lower_info == NULL)) {
241 				/*
242 				 * Intern the lower/lowest mount into
243 				 * this mount info on first lookup.
244 				 */
245 				KASSERT(fmi->fmi_ref_cnt == 1);
246 
247 				fmi_lower = fstrans_mount_get(mp->mnt_lower);
248 				if (fmi_lower && fmi_lower->fmi_lower_info)
249 					fmi_lower = fmi_lower->fmi_lower_info;
250 				if (fmi_lower == NULL)
251 					return NULL;
252 				fmi->fmi_lower_info = fmi_lower;
253 				fmi->fmi_lower_info->fmi_ref_cnt += 1;
254 			}
255 			return fmi;
256 		}
257 	}
258 
259 	return NULL;
260 }
261 
262 /*
263  * Dereference mount state.
264  */
265 static void
fstrans_mount_dtor(struct fstrans_mount_info * fmi)266 fstrans_mount_dtor(struct fstrans_mount_info *fmi)
267 {
268 
269 	KASSERT(mutex_owned(&fstrans_lock));
270 
271 	KASSERT(fmi != NULL);
272 	fmi->fmi_ref_cnt -= 1;
273 	if (__predict_true(fmi->fmi_ref_cnt > 0)) {
274 		return;
275 	}
276 
277 	KASSERT(fmi->fmi_state == FSTRANS_NORMAL);
278 	KASSERT(LIST_FIRST(&fmi->fmi_cow_handler) == NULL);
279 	KASSERT(fmi->fmi_owner == NULL);
280 
281 	if (fmi->fmi_lower_info)
282 		fstrans_mount_dtor(fmi->fmi_lower_info);
283 
284 	KASSERT(fstrans_gone_count > 0);
285 	fstrans_gone_count -= 1;
286 
287 	KASSERT(fmi->fmi_mount->mnt_lower == NULL);
288 
289 	kmem_free(fmi->fmi_mount, sizeof(*fmi->fmi_mount));
290 	kmem_free(fmi, sizeof(*fmi));
291 }
292 
293 /*
294  * Allocate mount state.
295  */
296 int
fstrans_mount(struct mount * mp)297 fstrans_mount(struct mount *mp)
298 {
299 	uint32_t indx;
300 	struct fstrans_mount_info *newfmi;
301 
302 	indx = fstrans_mount_hash(mp);
303 
304 	newfmi = kmem_alloc(sizeof(*newfmi), KM_SLEEP);
305 	newfmi->fmi_state = FSTRANS_NORMAL;
306 	newfmi->fmi_ref_cnt = 1;
307 	newfmi->fmi_gone = false;
308 	LIST_INIT(&newfmi->fmi_cow_handler);
309 	newfmi->fmi_cow_change = false;
310 	newfmi->fmi_mount = mp;
311 	newfmi->fmi_lower_info = NULL;
312 	newfmi->fmi_owner = NULL;
313 
314 	mutex_enter(&fstrans_lock);
315 	SLIST_INSERT_HEAD(&fstrans_mount_hashtab[indx], newfmi, fmi_hash);
316 	mutex_exit(&fstrans_lock);
317 
318 	return 0;
319 }
320 
321 /*
322  * Deallocate mount state.
323  */
324 void
fstrans_unmount(struct mount * mp)325 fstrans_unmount(struct mount *mp)
326 {
327 	uint32_t indx;
328 	struct fstrans_mount_info *fmi;
329 
330 	indx = fstrans_mount_hash(mp);
331 
332 	mutex_enter(&fstrans_lock);
333 	fmi = fstrans_mount_get(mp);
334 	KASSERT(fmi != NULL);
335 	fmi->fmi_gone = true;
336 	SLIST_REMOVE(&fstrans_mount_hashtab[indx],
337 	    fmi, fstrans_mount_info, fmi_hash);
338 	fstrans_gone_count += 1;
339 	fstrans_mount_dtor(fmi);
340 	mutex_exit(&fstrans_lock);
341 }
342 
343 /*
344  * Clear mount entries whose mount is gone.
345  */
346 static void
fstrans_clear_lwp_info(void)347 fstrans_clear_lwp_info(void)
348 {
349 	struct fstrans_lwp_info **p, *fli, *tofree = NULL;
350 
351 	/*
352 	 * Scan our list clearing entries whose mount is gone.
353 	 */
354 	mutex_enter(&fstrans_lock);
355 	for (p = &curlwp->l_fstrans; *p; ) {
356 		fli = *p;
357 		if (fli->fli_mount != NULL &&
358 		    fli->fli_mountinfo->fmi_gone &&
359 		    fli->fli_trans_cnt == 0 &&
360 		    fli->fli_cow_cnt == 0 &&
361 		    fli->fli_alias_cnt == 0) {
362 			*p = (*p)->fli_succ;
363 			fstrans_mount_dtor(fli->fli_mountinfo);
364 			if (fli->fli_alias) {
365 				KASSERT(fli->fli_alias->fli_alias_cnt > 0);
366 				fli->fli_alias->fli_alias_cnt--;
367 			}
368 			fli->fli_mount = NULL;
369 			fli->fli_alias = NULL;
370 			fli->fli_mountinfo = NULL;
371 			fli->fli_self = NULL;
372 			p = &curlwp->l_fstrans;
373 			fli->fli_succ = tofree;
374 			tofree = fli;
375 		} else {
376 			p = &(*p)->fli_succ;
377 		}
378 	}
379 #ifdef DIAGNOSTIC
380 	for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ)
381 		if (fli->fli_alias != NULL)
382 			KASSERT(fli->fli_alias->fli_self == curlwp);
383 #endif /* DIAGNOSTIC */
384 	mutex_exit(&fstrans_lock);
385 
386 	while (tofree != NULL) {
387 		fli = tofree;
388 		tofree = fli->fli_succ;
389 		pool_cache_put(fstrans_lwp_cache, fli);
390 	}
391 }
392 
393 /*
394  * Allocate and return per lwp info for this mount.
395  */
396 static struct fstrans_lwp_info *
fstrans_alloc_lwp_info(struct mount * mp)397 fstrans_alloc_lwp_info(struct mount *mp)
398 {
399 	struct fstrans_lwp_info *fli, *fli_lower;
400 	struct fstrans_mount_info *fmi;
401 
402 	for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ) {
403 		if (fli->fli_mount == mp)
404 			return fli;
405 	}
406 
407 	/*
408 	 * Lookup mount info and get lower mount per lwp info.
409 	 */
410 	mutex_enter(&fstrans_lock);
411 	fmi = fstrans_mount_get(mp);
412 	if (fmi == NULL) {
413 		mutex_exit(&fstrans_lock);
414 		return NULL;
415 	}
416 	fmi->fmi_ref_cnt += 1;
417 	mutex_exit(&fstrans_lock);
418 
419 	if (fmi->fmi_lower_info) {
420 		fli_lower =
421 		    fstrans_alloc_lwp_info(fmi->fmi_lower_info->fmi_mount);
422 		if (fli_lower == NULL) {
423 			mutex_enter(&fstrans_lock);
424 			fstrans_mount_dtor(fmi);
425 			mutex_exit(&fstrans_lock);
426 
427 			return NULL;
428 		}
429 	} else {
430 		fli_lower = NULL;
431 	}
432 
433 	/*
434 	 * Allocate a new entry.
435 	 */
436 	fli = pool_cache_get(fstrans_lwp_cache, PR_WAITOK);
437 	KASSERT(fli->fli_trans_cnt == 0);
438 	KASSERT(fli->fli_cow_cnt == 0);
439 	KASSERT(fli->fli_alias_cnt == 0);
440 	KASSERT(fli->fli_mount == NULL);
441 	KASSERT(fli->fli_alias == NULL);
442 	KASSERT(fli->fli_mountinfo == NULL);
443 	KASSERT(fli->fli_self == NULL);
444 
445 	/*
446 	 * Attach the mount info and alias.
447 	 */
448 
449 	fli->fli_self = curlwp;
450 	fli->fli_mount = mp;
451 	fli->fli_mountinfo = fmi;
452 
453 	fli->fli_succ = curlwp->l_fstrans;
454 	curlwp->l_fstrans = fli;
455 
456 	if (fli_lower) {
457 		fli->fli_alias = fli_lower;
458 		fli->fli_alias->fli_alias_cnt++;
459 		fli = fli->fli_alias;
460 	}
461 
462 	return fli;
463 }
464 
465 /*
466  * Retrieve the per lwp info for this mount allocating if necessary.
467  */
468 static inline struct fstrans_lwp_info *
fstrans_get_lwp_info(struct mount * mp,bool do_alloc)469 fstrans_get_lwp_info(struct mount *mp, bool do_alloc)
470 {
471 	struct fstrans_lwp_info *fli;
472 
473 	/*
474 	 * Scan our list for a match.
475 	 */
476 	for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ) {
477 		if (fli->fli_mount == mp) {
478 			KASSERT(mp->mnt_lower == NULL ||
479 			    fli->fli_alias != NULL);
480 			if (fli->fli_alias != NULL)
481 				fli = fli->fli_alias;
482 			break;
483 		}
484 	}
485 
486 	if (do_alloc) {
487 		if (__predict_false(fli == NULL))
488 			fli = fstrans_alloc_lwp_info(mp);
489 	}
490 
491 	return fli;
492 }
493 
494 /*
495  * Check if this lock type is granted at this state.
496  */
497 static bool
grant_lock(const struct fstrans_mount_info * fmi,const enum fstrans_lock_type type)498 grant_lock(const struct fstrans_mount_info *fmi,
499     const enum fstrans_lock_type type)
500 {
501 
502 	if (__predict_true(fmi->fmi_state == FSTRANS_NORMAL))
503 		return true;
504 	if (fmi->fmi_owner == curlwp)
505 		return true;
506 	if  (fmi->fmi_state == FSTRANS_SUSPENDING && type == FSTRANS_LAZY)
507 		return true;
508 
509 	return false;
510 }
511 
512 /*
513  * Start a transaction.  If this thread already has a transaction on this
514  * file system increment the reference counter.
515  */
516 static inline int
_fstrans_start(struct mount * mp,enum fstrans_lock_type lock_type,int wait)517 _fstrans_start(struct mount *mp, enum fstrans_lock_type lock_type, int wait)
518 {
519 	int s;
520 	struct fstrans_lwp_info *fli;
521 	struct fstrans_mount_info *fmi;
522 
523 	ASSERT_SLEEPABLE();
524 
525 	fli = fstrans_get_lwp_info(mp, true);
526 	if (fli == NULL)
527 		return 0;
528 	fmi = fli->fli_mountinfo;
529 
530 	if (fli->fli_trans_cnt > 0) {
531 		fli->fli_trans_cnt += 1;
532 
533 		return 0;
534 	}
535 
536 	s = pserialize_read_enter();
537 	if (__predict_true(grant_lock(fmi, lock_type))) {
538 		fli->fli_trans_cnt = 1;
539 		fli->fli_lock_type = lock_type;
540 		pserialize_read_exit(s);
541 
542 		return 0;
543 	}
544 	pserialize_read_exit(s);
545 
546 	if (! wait)
547 		return EBUSY;
548 
549 	mutex_enter(&fstrans_lock);
550 	while (! grant_lock(fmi, lock_type))
551 		cv_wait(&fstrans_state_cv, &fstrans_lock);
552 	fli->fli_trans_cnt = 1;
553 	fli->fli_lock_type = lock_type;
554 	mutex_exit(&fstrans_lock);
555 
556 	return 0;
557 }
558 
559 void
fstrans_start(struct mount * mp)560 fstrans_start(struct mount *mp)
561 {
562 	int error __diagused;
563 
564 	error = _fstrans_start(mp, FSTRANS_SHARED, 1);
565 	KASSERT(error == 0);
566 }
567 
568 int
fstrans_start_nowait(struct mount * mp)569 fstrans_start_nowait(struct mount *mp)
570 {
571 
572 	return _fstrans_start(mp, FSTRANS_SHARED, 0);
573 }
574 
575 void
fstrans_start_lazy(struct mount * mp)576 fstrans_start_lazy(struct mount *mp)
577 {
578 	int error __diagused;
579 
580 	error = _fstrans_start(mp, FSTRANS_LAZY, 1);
581 	KASSERT(error == 0);
582 }
583 
584 /*
585  * Finish a transaction.
586  */
587 void
fstrans_done(struct mount * mp)588 fstrans_done(struct mount *mp)
589 {
590 	int s;
591 	struct fstrans_lwp_info *fli;
592 	struct fstrans_mount_info *fmi;
593 
594 	fli = fstrans_get_lwp_info(mp, false);
595 	if (fli == NULL)
596 		return;
597 	fmi = fli->fli_mountinfo;
598 	KASSERT(fli->fli_trans_cnt > 0);
599 
600 	if (fli->fli_trans_cnt > 1) {
601 		fli->fli_trans_cnt -= 1;
602 
603 		return;
604 	}
605 
606 	if (__predict_false(fstrans_gone_count > 0))
607 		fstrans_clear_lwp_info();
608 
609 	s = pserialize_read_enter();
610 	if (__predict_true(fmi->fmi_state == FSTRANS_NORMAL)) {
611 		fli->fli_trans_cnt = 0;
612 		pserialize_read_exit(s);
613 
614 		return;
615 	}
616 	pserialize_read_exit(s);
617 
618 	mutex_enter(&fstrans_lock);
619 	fli->fli_trans_cnt = 0;
620 	cv_signal(&fstrans_count_cv);
621 	mutex_exit(&fstrans_lock);
622 }
623 
624 /*
625  * Check if we hold an lock.
626  */
627 int
fstrans_held(struct mount * mp)628 fstrans_held(struct mount *mp)
629 {
630 	struct fstrans_lwp_info *fli;
631 	struct fstrans_mount_info *fmi;
632 
633 	KASSERT(mp != dead_rootmount);
634 
635 	fli = fstrans_get_lwp_info(mp, false);
636 	if (fli == NULL)
637 		return 0;
638 	fmi = fli->fli_mountinfo;
639 
640 	return (fli->fli_trans_cnt > 0 || fmi->fmi_owner == curlwp);
641 }
642 
643 /*
644  * Check if this thread has an exclusive lock.
645  */
646 int
fstrans_is_owner(struct mount * mp)647 fstrans_is_owner(struct mount *mp)
648 {
649 	struct fstrans_lwp_info *fli;
650 	struct fstrans_mount_info *fmi;
651 
652 	KASSERT(mp != dead_rootmount);
653 
654 	fli = fstrans_get_lwp_info(mp, false);
655 	if (fli == NULL)
656 		return 0;
657 	fmi = fli->fli_mountinfo;
658 
659 	return (fmi->fmi_owner == curlwp);
660 }
661 
662 /*
663  * True, if no thread is in a transaction not granted at the current state.
664  */
665 static bool
state_change_done(const struct fstrans_mount_info * fmi)666 state_change_done(const struct fstrans_mount_info *fmi)
667 {
668 	struct fstrans_lwp_info *fli;
669 
670 	KASSERT(mutex_owned(&fstrans_lock));
671 
672 	LIST_FOREACH(fli, &fstrans_fli_head, fli_list) {
673 		if (fli->fli_mountinfo != fmi)
674 			continue;
675 		if (fli->fli_trans_cnt == 0)
676 			continue;
677 		if (fli->fli_self == curlwp)
678 			continue;
679 		if (grant_lock(fmi, fli->fli_lock_type))
680 			continue;
681 
682 		return false;
683 	}
684 
685 	return true;
686 }
687 
688 /*
689  * Set new file system state.
690  */
691 int
fstrans_setstate(struct mount * mp,enum fstrans_state new_state)692 fstrans_setstate(struct mount *mp, enum fstrans_state new_state)
693 {
694 	int error;
695 	enum fstrans_state old_state;
696 	struct fstrans_lwp_info *fli;
697 	struct fstrans_mount_info *fmi;
698 
699 	KASSERT(mp != dead_rootmount);
700 
701 	fli = fstrans_get_lwp_info(mp, true);
702 	if (fli == NULL)
703 		return ENOENT;
704 	fmi = fli->fli_mountinfo;
705 	old_state = fmi->fmi_state;
706 	if (old_state == new_state)
707 		return 0;
708 
709 	mutex_enter(&fstrans_lock);
710 	fmi->fmi_state = new_state;
711 	pserialize_perform(fstrans_psz);
712 
713 	/*
714 	 * All threads see the new state now.
715 	 * Wait for transactions invalid at this state to leave.
716 	 */
717 	error = 0;
718 	while (! state_change_done(fmi)) {
719 		error = cv_wait_sig(&fstrans_count_cv, &fstrans_lock);
720 		if (error) {
721 			new_state = fmi->fmi_state = FSTRANS_NORMAL;
722 			break;
723 		}
724 	}
725 	if (old_state != new_state) {
726 		if (old_state == FSTRANS_NORMAL) {
727 			KASSERT(fmi->fmi_owner == NULL);
728 			fmi->fmi_owner = curlwp;
729 		}
730 		if (new_state == FSTRANS_NORMAL) {
731 			KASSERT(fmi->fmi_owner == curlwp);
732 			fmi->fmi_owner = NULL;
733 		}
734 	}
735 	cv_broadcast(&fstrans_state_cv);
736 	mutex_exit(&fstrans_lock);
737 
738 	return error;
739 }
740 
741 /*
742  * Get current file system state.
743  */
744 enum fstrans_state
fstrans_getstate(struct mount * mp)745 fstrans_getstate(struct mount *mp)
746 {
747 	struct fstrans_lwp_info *fli;
748 	struct fstrans_mount_info *fmi;
749 
750 	KASSERT(mp != dead_rootmount);
751 
752 	fli = fstrans_get_lwp_info(mp, true);
753 	KASSERT(fli != NULL);
754 	fmi = fli->fli_mountinfo;
755 
756 	return fmi->fmi_state;
757 }
758 
759 /*
760  * Request a filesystem to suspend all operations.
761  */
762 int
vfs_suspend(struct mount * mp,int nowait)763 vfs_suspend(struct mount *mp, int nowait)
764 {
765 	struct fstrans_lwp_info *fli;
766 	int error;
767 
768 	if (mp == dead_rootmount)
769 		return EOPNOTSUPP;
770 
771 	fli = fstrans_get_lwp_info(mp, true);
772 	if (fli == NULL)
773 		return ENOENT;
774 
775 	if (nowait) {
776 		if (!mutex_tryenter(&vfs_suspend_lock))
777 			return EWOULDBLOCK;
778 	} else
779 		mutex_enter(&vfs_suspend_lock);
780 
781 	if ((error = VFS_SUSPENDCTL(fli->fli_mount, SUSPEND_SUSPEND)) != 0) {
782 		mutex_exit(&vfs_suspend_lock);
783 		return error;
784 	}
785 
786 	if ((mp->mnt_iflag & IMNT_GONE) != 0) {
787 		vfs_resume(mp);
788 		return ENOENT;
789 	}
790 
791 	return 0;
792 }
793 
794 /*
795  * Request a filesystem to resume all operations.
796  */
797 void
vfs_resume(struct mount * mp)798 vfs_resume(struct mount *mp)
799 {
800 	struct fstrans_lwp_info *fli;
801 
802 	KASSERT(mp != dead_rootmount);
803 
804 	fli = fstrans_get_lwp_info(mp, false);
805 	mp = fli->fli_mount;
806 
807 	VFS_SUSPENDCTL(mp, SUSPEND_RESUME);
808 	mutex_exit(&vfs_suspend_lock);
809 }
810 
811 
812 /*
813  * True, if no thread is running a cow handler.
814  */
815 static bool
cow_state_change_done(const struct fstrans_mount_info * fmi)816 cow_state_change_done(const struct fstrans_mount_info *fmi)
817 {
818 	struct fstrans_lwp_info *fli;
819 
820 	KASSERT(mutex_owned(&fstrans_lock));
821 	KASSERT(fmi->fmi_cow_change);
822 
823 	LIST_FOREACH(fli, &fstrans_fli_head, fli_list) {
824 		if (fli->fli_mount != fmi->fmi_mount)
825 			continue;
826 		if (fli->fli_cow_cnt == 0)
827 			continue;
828 
829 		return false;
830 	}
831 
832 	return true;
833 }
834 
835 /*
836  * Prepare for changing this mounts cow list.
837  * Returns with fstrans_lock locked.
838  */
839 static void
cow_change_enter(struct fstrans_mount_info * fmi)840 cow_change_enter(struct fstrans_mount_info *fmi)
841 {
842 
843 	mutex_enter(&fstrans_lock);
844 
845 	/*
846 	 * Wait for other threads changing the list.
847 	 */
848 	while (fmi->fmi_cow_change)
849 		cv_wait(&fstrans_state_cv, &fstrans_lock);
850 
851 	/*
852 	 * Wait until all threads are aware of a state change.
853 	 */
854 	fmi->fmi_cow_change = true;
855 	pserialize_perform(fstrans_psz);
856 
857 	while (! cow_state_change_done(fmi))
858 		cv_wait(&fstrans_count_cv, &fstrans_lock);
859 }
860 
861 /*
862  * Done changing this mounts cow list.
863  */
864 static void
cow_change_done(struct fstrans_mount_info * fmi)865 cow_change_done(struct fstrans_mount_info *fmi)
866 {
867 
868 	KASSERT(mutex_owned(&fstrans_lock));
869 
870 	fmi->fmi_cow_change = false;
871 	pserialize_perform(fstrans_psz);
872 
873 	cv_broadcast(&fstrans_state_cv);
874 
875 	mutex_exit(&fstrans_lock);
876 }
877 
878 /*
879  * Add a handler to this mount.
880  */
881 int
fscow_establish(struct mount * mp,int (* func)(void *,struct buf *,bool),void * arg)882 fscow_establish(struct mount *mp, int (*func)(void *, struct buf *, bool),
883     void *arg)
884 {
885 	struct fstrans_mount_info *fmi;
886 	struct fscow_handler *newch;
887 
888 	KASSERT(mp != dead_rootmount);
889 
890 	mutex_enter(&fstrans_lock);
891 	fmi = fstrans_mount_get(mp);
892 	KASSERT(fmi != NULL);
893 	fmi->fmi_ref_cnt += 1;
894 	mutex_exit(&fstrans_lock);
895 
896 	newch = kmem_alloc(sizeof(*newch), KM_SLEEP);
897 	newch->ch_func = func;
898 	newch->ch_arg = arg;
899 
900 	cow_change_enter(fmi);
901 	LIST_INSERT_HEAD(&fmi->fmi_cow_handler, newch, ch_list);
902 	cow_change_done(fmi);
903 
904 	return 0;
905 }
906 
907 /*
908  * Remove a handler from this mount.
909  */
910 int
fscow_disestablish(struct mount * mp,int (* func)(void *,struct buf *,bool),void * arg)911 fscow_disestablish(struct mount *mp, int (*func)(void *, struct buf *, bool),
912     void *arg)
913 {
914 	struct fstrans_mount_info *fmi;
915 	struct fscow_handler *hp = NULL;
916 
917 	KASSERT(mp != dead_rootmount);
918 
919 	mutex_enter(&fstrans_lock);
920 	fmi = fstrans_mount_get(mp);
921 	KASSERT(fmi != NULL);
922 	mutex_exit(&fstrans_lock);
923 
924 	cow_change_enter(fmi);
925 	LIST_FOREACH(hp, &fmi->fmi_cow_handler, ch_list)
926 		if (hp->ch_func == func && hp->ch_arg == arg)
927 			break;
928 	if (hp != NULL) {
929 		LIST_REMOVE(hp, ch_list);
930 		kmem_free(hp, sizeof(*hp));
931 	}
932 	fstrans_mount_dtor(fmi);
933 	cow_change_done(fmi);
934 
935 	return hp ? 0 : EINVAL;
936 }
937 
938 /*
939  * Check for need to copy block that is about to be written.
940  */
941 int
fscow_run(struct buf * bp,bool data_valid)942 fscow_run(struct buf *bp, bool data_valid)
943 {
944 	int error, s;
945 	struct mount *mp;
946 	struct fstrans_lwp_info *fli;
947 	struct fstrans_mount_info *fmi;
948 	struct fscow_handler *hp;
949 
950 	/*
951 	 * First check if we need run the copy-on-write handler.
952 	 */
953 	if ((bp->b_flags & B_COWDONE))
954 		return 0;
955 	if (bp->b_vp == NULL) {
956 		bp->b_flags |= B_COWDONE;
957 		return 0;
958 	}
959 	if (bp->b_vp->v_type == VBLK)
960 		mp = spec_node_getmountedfs(bp->b_vp);
961 	else
962 		mp = bp->b_vp->v_mount;
963 	if (mp == NULL || mp == dead_rootmount) {
964 		bp->b_flags |= B_COWDONE;
965 		return 0;
966 	}
967 
968 	fli = fstrans_get_lwp_info(mp, true);
969 	KASSERT(fli != NULL);
970 	fmi = fli->fli_mountinfo;
971 
972 	/*
973 	 * On non-recursed run check if other threads
974 	 * want to change the list.
975 	 */
976 	if (fli->fli_cow_cnt == 0) {
977 		s = pserialize_read_enter();
978 		if (__predict_false(fmi->fmi_cow_change)) {
979 			pserialize_read_exit(s);
980 			mutex_enter(&fstrans_lock);
981 			while (fmi->fmi_cow_change)
982 				cv_wait(&fstrans_state_cv, &fstrans_lock);
983 			fli->fli_cow_cnt = 1;
984 			mutex_exit(&fstrans_lock);
985 		} else {
986 			fli->fli_cow_cnt = 1;
987 			pserialize_read_exit(s);
988 		}
989 	} else
990 		fli->fli_cow_cnt += 1;
991 
992 	/*
993 	 * Run all copy-on-write handlers, stop on error.
994 	 */
995 	error = 0;
996 	LIST_FOREACH(hp, &fmi->fmi_cow_handler, ch_list)
997 		if ((error = (*hp->ch_func)(hp->ch_arg, bp, data_valid)) != 0)
998 			break;
999  	if (error == 0)
1000  		bp->b_flags |= B_COWDONE;
1001 
1002 	/*
1003 	 * Check if other threads want to change the list.
1004 	 */
1005 	if (fli->fli_cow_cnt > 1) {
1006 		fli->fli_cow_cnt -= 1;
1007 	} else {
1008 		s = pserialize_read_enter();
1009 		if (__predict_false(fmi->fmi_cow_change)) {
1010 			pserialize_read_exit(s);
1011 			mutex_enter(&fstrans_lock);
1012 			fli->fli_cow_cnt = 0;
1013 			cv_signal(&fstrans_count_cv);
1014 			mutex_exit(&fstrans_lock);
1015 		} else {
1016 			fli->fli_cow_cnt = 0;
1017 			pserialize_read_exit(s);
1018 		}
1019 	}
1020 
1021 	return error;
1022 }
1023 
1024 #if defined(DDB)
1025 void fstrans_dump(int);
1026 
1027 static void
fstrans_print_lwp(struct proc * p,struct lwp * l,int verbose)1028 fstrans_print_lwp(struct proc *p, struct lwp *l, int verbose)
1029 {
1030 	char prefix[9];
1031 	struct fstrans_lwp_info *fli;
1032 
1033 	snprintf(prefix, sizeof(prefix), "%d.%d", p->p_pid, l->l_lid);
1034 	LIST_FOREACH(fli, &fstrans_fli_head, fli_list) {
1035 		if (fli->fli_self != l)
1036 			continue;
1037 		if (fli->fli_trans_cnt == 0 && fli->fli_cow_cnt == 0) {
1038 			if (! verbose)
1039 				continue;
1040 		}
1041 		printf("%-8s", prefix);
1042 		if (verbose)
1043 			printf(" @%p", fli);
1044 		if (fli->fli_mount == dead_rootmount)
1045 			printf(" <dead>");
1046 		else if (fli->fli_mount != NULL)
1047 			printf(" (%s)", fli->fli_mount->mnt_stat.f_mntonname);
1048 		else
1049 			printf(" NULL");
1050 		if (fli->fli_alias != NULL) {
1051 			struct mount *amp = fli->fli_alias->fli_mount;
1052 
1053 			printf(" alias");
1054 			if (verbose)
1055 				printf(" @%p", fli->fli_alias);
1056 			if (amp == NULL)
1057 				printf(" NULL");
1058 			else
1059 				printf(" (%s)", amp->mnt_stat.f_mntonname);
1060 		}
1061 		if (fli->fli_mountinfo && fli->fli_mountinfo->fmi_gone)
1062 			printf(" gone");
1063 		if (fli->fli_trans_cnt == 0) {
1064 			printf(" -");
1065 		} else {
1066 			switch (fli->fli_lock_type) {
1067 			case FSTRANS_LAZY:
1068 				printf(" lazy");
1069 				break;
1070 			case FSTRANS_SHARED:
1071 				printf(" shared");
1072 				break;
1073 			default:
1074 				printf(" %#x", fli->fli_lock_type);
1075 				break;
1076 			}
1077 		}
1078 		printf(" %d cow %d alias %d\n",
1079 		    fli->fli_trans_cnt, fli->fli_cow_cnt, fli->fli_alias_cnt);
1080 		prefix[0] = '\0';
1081 	}
1082 }
1083 
1084 static void
fstrans_print_mount(struct mount * mp,int verbose)1085 fstrans_print_mount(struct mount *mp, int verbose)
1086 {
1087 	uint32_t indx;
1088 	struct fstrans_mount_info *fmi;
1089 
1090 	indx = fstrans_mount_hash(mp);
1091 	SLIST_FOREACH(fmi, &fstrans_mount_hashtab[indx], fmi_hash)
1092 		if (fmi->fmi_mount == mp)
1093 			break;
1094 
1095 	if (!verbose && (fmi == NULL || fmi->fmi_state == FSTRANS_NORMAL))
1096 		return;
1097 
1098 	printf("%-16s ", mp->mnt_stat.f_mntonname);
1099 	if (fmi == NULL) {
1100 		printf("(null)\n");
1101 		return;
1102 	}
1103 	printf("owner %p ", fmi->fmi_owner);
1104 	switch (fmi->fmi_state) {
1105 	case FSTRANS_NORMAL:
1106 		printf("state normal\n");
1107 		break;
1108 	case FSTRANS_SUSPENDING:
1109 		printf("state suspending\n");
1110 		break;
1111 	case FSTRANS_SUSPENDED:
1112 		printf("state suspended\n");
1113 		break;
1114 	default:
1115 		printf("state %#x\n", fmi->fmi_state);
1116 		break;
1117 	}
1118 }
1119 
1120 void
fstrans_dump(int full)1121 fstrans_dump(int full)
1122 {
1123 	const struct proclist_desc *pd;
1124 	struct proc *p;
1125 	struct lwp *l;
1126 	struct mount *mp;
1127 
1128 	printf("Fstrans locks by lwp:\n");
1129 	for (pd = proclists; pd->pd_list != NULL; pd++)
1130 		PROCLIST_FOREACH(p, pd->pd_list)
1131 			LIST_FOREACH(l, &p->p_lwps, l_sibling)
1132 				fstrans_print_lwp(p, l, full == 1);
1133 
1134 	printf("Fstrans state by mount:\n");
1135 	for (mp = _mountlist_next(NULL); mp; mp = _mountlist_next(mp))
1136 		fstrans_print_mount(mp, full == 1);
1137 }
1138 #endif /* defined(DDB) */
1139