xref: /openbsd/sys/kern/vfs_sync.c (revision 5a0ec814)
/*	$OpenBSD: vfs_sync.c,v 1.73 2024/10/18 05:52:32 miod Exp $	*/

/*
 *  Portions of this code are:
 *
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Syncer daemon
 */

#include <sys/queue.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/time.h>

/*
 * The workitem queue.
 */
#define SYNCER_MAXDELAY	32		/* maximum sync delay time */
#define SYNCER_DEFAULT	30		/* default sync delay time */
int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
int syncdelay = SYNCER_DEFAULT;		/* time to delay syncing vnodes */

int syncer_delayno = 0;
long syncer_mask;
LIST_HEAD(synclist, vnode);
static struct synclist *syncer_workitem_pending;

struct proc *syncerproc;
int syncer_chan;

/*
 * The workitem queue.
 *
 * It is useful to delay writes of file data and filesystem metadata
 * for tens of seconds so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed. To realize this,
 * we append vnodes to a "workitem" queue. When running with a soft
 * updates implementation, most pending metadata dependencies should
 * not wait for more than a few seconds. Thus, mounted block devices
 * are delayed only about half the time that file data is delayed.
 * Similarly, directory updates are more critical, so are only delayed
 * about a third the time that file data is delayed. Thus, there are
 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
 * one each second (driven off the filesystem syncer process). The
 * syncer_delayno variable indicates the next queue that is to be processed.
 * Items that need to be processed soon are placed in this queue:
 *
 *	syncer_workitem_pending[syncer_delayno]
 *
 * A delay of fifteen seconds is done by placing the request fifteen
 * entries later in the queue:
 *
 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 *
 */
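
#if 0
/*
 * Illustrative sketch, not part of the kernel build: a minimal userland
 * model of the delay-wheel indexing described above, assuming a 32-slot
 * table (syncer_mask == 31).  A vnode delayed 15 seconds while slot 25
 * is being drained lands in slot (25 + 15) & 31 == 8; the mask makes the
 * index wrap around the end of the array instead of running off it.
 */
#include <stdio.h>

int
main(void)
{
	long syncer_mask = 31;		/* table size 32, mask is size - 1 */
	int syncer_delayno = 25;	/* slot currently being drained */
	int delay = 15;			/* requested delay in seconds */

	/* Prints "slot 8": (25 + 15) & 31 wraps past the array end. */
	printf("slot %ld\n", (syncer_delayno + delay) & syncer_mask);
	return 0;
}
#endif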

void
vn_initialize_syncerd(void)
{
	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, M_WAITOK,
	    &syncer_mask);
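	/*
	 * hashinit() rounds the table size up to a power of two and
	 * returns size - 1 through &syncer_mask, so recompute
	 * syncer_maxdelay from the mask to keep the wheel index
	 * arithmetic consistent with the actual table size.
	 */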
	syncer_maxdelay = syncer_mask + 1;
}

/*
 * Add an item to the syncer work queue.
 */
void
vn_syncer_add_to_worklist(struct vnode *vp, int delay)
{
	int s, slot;

	if (delay > syncer_maxdelay - 2)
		delay = syncer_maxdelay - 2;
	slot = (syncer_delayno + delay) & syncer_mask;

	s = splbio();
	if (vp->v_bioflag & VBIOONSYNCLIST)
		LIST_REMOVE(vp, v_synclist);

	vp->v_bioflag |= VBIOONSYNCLIST;
	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
	splx(s);
}

/*
 * System filesystem synchronizer daemon.
 */
void
syncer_thread(void *arg)
{
	uint64_t elapsed, start;
	struct proc *p = curproc;
	struct synclist *slp;
	struct vnode *vp;
	int s;

	for (;;) {
		start = getnsecuptime();

		/*
		 * Push files whose dirty time has expired.
		 */
		s = splbio();
		slp = &syncer_workitem_pending[syncer_delayno];

		syncer_delayno += 1;
		if (syncer_delayno == syncer_maxdelay)
			syncer_delayno = 0;

		while ((vp = LIST_FIRST(slp)) != NULL) {
			if (vget(vp, LK_EXCLUSIVE | LK_NOWAIT)) {
				/*
				 * If we fail to get the lock, we move this
				 * vnode one second ahead in time.
				 * XXX - no good, but the best we can do.
				 */
				vn_syncer_add_to_worklist(vp, 1);
				continue;
			}
			splx(s);
			(void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
			vput(vp);
			s = splbio();
			if (LIST_FIRST(slp) == vp) {
				/*
				 * Note: disk vps can remain on the
				 * worklist too with no dirty blocks, but
				 * since sync_fsync() moves them to a
				 * different slot we are safe.
				 */
#ifdef DIAGNOSTIC
				if (LIST_FIRST(&vp->v_dirtyblkhd) == NULL &&
				    vp->v_type != VBLK) {
					vprint("fsync failed", vp);
					if (vp->v_mount != NULL)
						printf("mounted on: %s\n",
						    vp->v_mount->mnt_stat.f_mntonname);
					panic("%s: fsync failed", __func__);
				}
#endif /* DIAGNOSTIC */
				/*
				 * Put us back on the worklist.  The worklist
				 * routine will remove us from our current
				 * position and then add us back in at a later
				 * position.
				 */
				vn_syncer_add_to_worklist(vp, syncdelay);
			}

			sched_pause(yield);
		}

		splx(s);

		/*
		 * If it has taken us less than a second to process the
		 * current work, then wait. Otherwise start right over
		 * again. We can still lose time if any single round
		 * takes more than two seconds, but it does not really
		 * matter as we are just trying to generally pace the
		 * filesystem activity.
		 */
		elapsed = getnsecuptime() - start;
		if (elapsed < SEC_TO_NSEC(1)) {
			tsleep_nsec(&syncer_chan, PPAUSE, "syncer",
			    SEC_TO_NSEC(1) - elapsed);
		}
	}
}
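
#if 0
/*
 * Illustrative sketch, not part of the kernel build: the pacing pattern
 * used by syncer_thread() above, modelled in userland with POSIX clocks.
 * Each iteration measures its own cost and sleeps only for the remainder
 * of the one-second period, so cheap rounds do not spin the wheel faster
 * and expensive rounds simply run back-to-back.  paced_loop() and
 * nsec_uptime() are hypothetical names invented for this sketch.
 */
#include <stdint.h>
#include <time.h>

#define PERIOD_NSEC	1000000000ULL

static uint64_t
nsec_uptime(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

static void
paced_loop(void (*work)(void))
{
	uint64_t start, elapsed, remain;
	struct timespec rest;

	for (;;) {
		start = nsec_uptime();
		work();			/* drain one wheel slot */
		elapsed = nsec_uptime() - start;
		if (elapsed < PERIOD_NSEC) {
			remain = PERIOD_NSEC - elapsed;
			rest.tv_sec = remain / 1000000000ULL;
			rest.tv_nsec = remain % 1000000000ULL;
			nanosleep(&rest, NULL);
		}
	}
}
#endif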

/* Routine to create and manage a filesystem syncer vnode. */
int   sync_fsync(void *);
int   sync_inactive(void *);
int   sync_print(void *);

const struct vops sync_vops = {
	.vop_close	= nullop,
	.vop_fsync	= sync_fsync,
	.vop_inactive	= sync_inactive,
	.vop_reclaim	= nullop,
	.vop_lock	= nullop,
	.vop_unlock	= nullop,
	.vop_islocked	= nullop,
	.vop_print	= sync_print,

	.vop_abortop	= NULL,
	.vop_access	= NULL,
	.vop_advlock	= NULL,
	.vop_bmap	= NULL,
	.vop_bwrite	= NULL,
	.vop_create	= NULL,
	.vop_getattr	= NULL,
	.vop_ioctl	= NULL,
	.vop_link	= NULL,
	.vop_lookup	= NULL,
	.vop_mknod	= NULL,
	.vop_open	= NULL,
	.vop_pathconf	= NULL,
	.vop_read	= NULL,
	.vop_readdir	= NULL,
	.vop_readlink	= NULL,
	.vop_remove	= eopnotsupp,
	.vop_rename	= NULL,
	.vop_revoke	= NULL,
	.vop_mkdir	= NULL,
	.vop_rmdir	= NULL,
	.vop_setattr	= NULL,
	.vop_strategy	= NULL,
	.vop_symlink	= NULL,
	.vop_write	= NULL,
	.vop_kqfilter	= NULL
};

/*
 * Create a new filesystem syncer vnode for the specified mount point.
 */
int
vfs_allocate_syncvnode(struct mount *mp)
{
	struct vnode *vp;
	static long start, incr, next;
	int error;

	/* Allocate a new vnode */
	if ((error = getnewvnode(VT_VFS, mp, &sync_vops, &vp)) != 0) {
		mp->mnt_syncer = NULL;
		return (error);
	}
	vp->v_writecount = 1;
	vp->v_type = VNON;
	/*
	 * Place the vnode onto the syncer worklist. We attempt to
	 * scatter them about on the list so that they will go off
	 * at evenly distributed times even if all the filesystems
	 * are mounted at once.
	 */
	next += incr;
	if (next == 0 || next > syncer_maxdelay) {
		start /= 2;
		incr /= 2;
		if (start == 0) {
			start = syncer_maxdelay / 2;
			incr = syncer_maxdelay;
		}
		next = start;
	}
	vn_syncer_add_to_worklist(vp, next);
	mp->mnt_syncer = vp;
	return (0);
}
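
#if 0
/*
 * Illustrative sketch, not part of the kernel build: the start/incr/next
 * scattering above, simulated in userland for syncer_maxdelay == 32.
 * Successive mounts are assigned delays 16, 8, 24, 4, 12, 20, 28, 2, ...
 * so the per-filesystem syncer vnodes come due in different seconds
 * instead of bunching up in a single wheel slot.
 */
#include <stdio.h>

int
main(void)
{
	static long start, incr, next;	/* zero-initialized, as in the kernel */
	int syncer_maxdelay = 32, i;

	for (i = 0; i < 8; i++) {
		next += incr;
		if (next == 0 || next > syncer_maxdelay) {
			start /= 2;
			incr /= 2;
			if (start == 0) {
				start = syncer_maxdelay / 2;
				incr = syncer_maxdelay;
			}
			next = start;
		}
		printf("mount %d -> delay %ld\n", i, next);
	}
	return 0;
}
#endif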

/*
 * Do a lazy sync of the filesystem.
 */
int
sync_fsync(void *v)
{
	struct vop_fsync_args *ap = v;
	struct vnode *syncvp = ap->a_vp;
	struct mount *mp = syncvp->v_mount;
	int asyncflag;

	/*
	 * We only need to do something if this is a lazy evaluation.
	 */
	if (ap->a_waitfor != MNT_LAZY)
		return (0);

	/*
	 * Move ourselves to the back of the sync list.
	 */
	vn_syncer_add_to_worklist(syncvp, syncdelay);

	/*
	 * Walk the list of vnodes pushing all that are dirty and
	 * not already on the sync list.
	 */
	if (vfs_busy(mp, VB_READ|VB_NOWAIT) == 0) {
		asyncflag = mp->mnt_flag & MNT_ASYNC;
		mp->mnt_flag &= ~MNT_ASYNC;
		VFS_SYNC(mp, MNT_LAZY, 0, ap->a_cred, ap->a_p);
		if (asyncflag)
			mp->mnt_flag |= MNT_ASYNC;
		vfs_unbusy(mp);
	}

	return (0);
}

/*
 * The syncer vnode is no longer needed and is being decommissioned.
 */
int
sync_inactive(void *v)
{
	struct vop_inactive_args *ap = v;

	struct vnode *vp = ap->a_vp;
	int s;

	if (vp->v_usecount == 0) {
		VOP_UNLOCK(vp);
		return (0);
	}

	vp->v_mount->mnt_syncer = NULL;

	s = splbio();

	LIST_REMOVE(vp, v_synclist);
	vp->v_bioflag &= ~VBIOONSYNCLIST;

	splx(s);

	vp->v_writecount = 0;
	vput(vp);

	return (0);
}

/*
 * Print out a syncer vnode.
 */
int
sync_print(void *v)
{
#if defined(DEBUG) || defined(DIAGNOSTIC) || defined(VFSLCKDEBUG)
	printf("syncer vnode\n");
#endif

	return (0);
}