1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/errno.h>
32 #include <sys/kmem.h>
33 #include <sys/vnode.h>
34 #include <sys/vfs_opreg.h>
35 #include <sys/swap.h>
36 #include <sys/sysmacros.h>
37 #include <sys/buf.h>
38 #include <sys/callb.h>
39 #include <sys/debug.h>
40 #include <vm/seg.h>
41 #include <sys/fs/swapnode.h>
42 #include <fs/fs_subr.h>
43 #include <sys/cmn_err.h>
44 #include <sys/mem_config.h>
45 #include <sys/atomic.h>
46 
47 extern const fs_operation_def_t swap_vnodeops_template[];
48 
/*
 * swapfs_minfree is the amount of physical memory (actually remaining
 * availrmem) that we want to keep free for the rest of the system.  This
 * means that swapfs can only grow to availrmem - swapfs_minfree.  It can
 * be set either to a constant value or to a fraction of installed
 * physical memory.  It is computed by swapfs_recalc(), which is first
 * called from swapinit().
 *
 * Users who want to change the amount of memory that can be used as swap
 * space should do so by setting swapfs_desfree at boot time,
 * not swapfs_minfree.
 */
60 
61 pgcnt_t swapfs_desfree = 0;
62 pgcnt_t swapfs_minfree = 0;
63 pgcnt_t swapfs_reserve = 0;
64 
65 #ifdef SWAPFS_DEBUG
66 int swapfs_debug;
67 #endif /* SWAPFS_DEBUG */
68 
69 
70 static int swapfs_vpcount;
71 static kmutex_t swapfs_lock;
72 static struct async_reqs *sw_ar, *sw_pendlist, *sw_freelist;
73 
74 static struct vnode **swap_vnodes;	/* ptr's to swap vnodes */
75 
76 static void swap_init_mem_config(void);
77 
78 static pgcnt_t initial_swapfs_desfree;
79 static pgcnt_t initial_swapfs_minfree;
80 static pgcnt_t initial_swapfs_reserve;
81 
82 static int swap_sync(struct vfs *vfsp, short flag, struct cred *cr);
83 
84 static void
85 swapfs_recalc_save_initial(void)
86 {
87 	initial_swapfs_desfree = swapfs_desfree;
88 	initial_swapfs_minfree = swapfs_minfree;
89 	initial_swapfs_reserve = swapfs_reserve;
90 }
91 
92 static int
93 swapfs_recalc(pgcnt_t pgs)
94 {
95 	pgcnt_t new_swapfs_desfree;
96 	pgcnt_t new_swapfs_minfree;
97 	pgcnt_t new_swapfs_reserve;
98 
99 	new_swapfs_desfree = initial_swapfs_desfree;
100 	new_swapfs_minfree = initial_swapfs_minfree;
101 	new_swapfs_reserve = initial_swapfs_reserve;
102 
103 	if (new_swapfs_desfree == 0)
104 		new_swapfs_desfree = btopr(7 * 512 * 1024); /* 3-1/2Mb */;
105 
106 	if (new_swapfs_minfree == 0) {
107 		/*
108 		 * We set this lower than we'd like here, 2Mb, because we
109 		 * always boot on swapfs. It's up to a safer value,
110 		 * swapfs_desfree, when/if we add physical swap devices
111 		 * in swapadd(). Users who want to change the amount of
112 		 * memory that can be used as swap space should do so by
113 		 * setting swapfs_desfree at boot time, not swapfs_minfree.
114 		 * However, swapfs_minfree is tunable by install as a
115 		 * workaround for bugid 1147463.
116 		 */
117 		new_swapfs_minfree = MAX(btopr(2 * 1024 * 1024), pgs >> 3);
118 	}
119 
120 	/*
121 	 * priv processes can reserve memory as swap as long as availrmem
122 	 * remains greater than swapfs_minfree; in the case of non-priv
123 	 * processes, memory can be reserved as swap only if availrmem
124 	 * doesn't fall below (swapfs_minfree + swapfs_reserve). Thus,
125 	 * swapfs_reserve amount of memswap is not available to non-priv
126 	 * processes. This protects daemons such as automounter dying
127 	 * as a result of application processes eating away almost entire
128 	 * membased swap. This safeguard becomes useless if apps are run
129 	 * with root access.
130 	 *
131 	 * set swapfs_reserve to a minimum of 4Mb or 1/128 of physmem whichever
132 	 * is greater up to the limit of 128 MB.
133 	 */
134 	if (new_swapfs_reserve == 0)
135 		new_swapfs_reserve = MIN(btopr(128 * 1024 * 1024),
136 		    MAX(btopr(4 * 1024 * 1024), pgs >> 7));
137 
138 	/* Test basic numeric viability. */
139 	if (new_swapfs_minfree > pgs)
140 		return (0);
141 
142 	/* Equivalent test to anon_resvmem() check. */
143 	if (availrmem < new_swapfs_minfree) {
144 		/*
145 		 * If ism pages are being used, then there must be agreement
146 		 * between these two policies.
147 		 */
148 		if ((availrmem > segspt_minfree) && (segspt_minfree > 0)) {
149 			new_swapfs_minfree = segspt_minfree;
150 		} else {
151 			return (0);
152 		}
153 	}
154 
155 	swapfs_desfree = new_swapfs_desfree;
156 	swapfs_minfree = new_swapfs_minfree;
157 	swapfs_reserve = new_swapfs_reserve;
158 
159 	return (1);
160 }
161 
162 /*ARGSUSED1*/
163 int
164 swapinit(int fstype, char *name)
165 {							/* reserve for mp */
166 	ssize_t sw_freelist_size = klustsize / PAGESIZE * 2;
167 	int i, error;
168 
169 	static const fs_operation_def_t swap_vfsops[] = {
170 		VFSNAME_SYNC, { .vfs_sync = swap_sync },
171 		NULL, NULL
172 	};
173 
174 	SWAPFS_PRINT(SWAP_SUBR, "swapinit\n", 0, 0, 0, 0, 0);
175 	mutex_init(&swapfs_lock, NULL, MUTEX_DEFAULT, NULL);
176 
177 	swap_vnodes = kmem_zalloc(MAX_SWAP_VNODES * sizeof (struct vnode *),
178 	    KM_SLEEP);
179 
180 	swapfs_recalc_save_initial();
181 	if (!swapfs_recalc(physmem))
182 		cmn_err(CE_PANIC, "swapfs_minfree(%lu) > physmem(%lu)",
183 		    swapfs_minfree, physmem);
184 
185 	/*
186 	 * Arrange for a callback on memory size change.
187 	 */
188 	swap_init_mem_config();
189 
190 	sw_ar = (struct async_reqs *)
191 	    kmem_zalloc(sw_freelist_size*sizeof (struct async_reqs), KM_SLEEP);
192 
193 	error = vfs_setfsops(fstype, swap_vfsops, NULL);
194 	if (error != 0) {
195 		cmn_err(CE_WARN, "swapinit: bad vfs ops template");
196 		return (error);
197 	}
198 
199 	error = vn_make_ops(name, swap_vnodeops_template, &swap_vnodeops);
200 	if (error != 0) {
201 		(void) vfs_freevfsops_by_type(fstype);
202 		cmn_err(CE_WARN, "swapinit: bad vnode ops template");
203 		return (error);
204 	}
205 	sw_freelist = sw_ar;
206 	for (i = 0; i < sw_freelist_size - 1; i++)
207 		sw_ar[i].a_next = &sw_ar[i + 1];
208 
209 	return (0);
210 }
211 
212 /*
213  * Get a swapfs vnode corresponding to the specified identifier.
214  */
215 struct vnode *
216 swapfs_getvp(ulong_t vidx)
217 {
218 	struct vnode *vp;
219 
220 	vp = swap_vnodes[vidx];
221 	if (vp) {
222 		return (vp);
223 	}
224 
225 	mutex_enter(&swapfs_lock);
226 	vp = swap_vnodes[vidx];
227 	if (vp == NULL) {
228 		vp = vn_alloc(KM_SLEEP);
229 		vn_setops(vp, swap_vnodeops);
230 		vp->v_type = VREG;
231 		vp->v_flag |= (VISSWAP|VISSWAPFS);
232 		swap_vnodes[vidx] = vp;
233 		swapfs_vpcount++;
234 	}
235 	mutex_exit(&swapfs_lock);
236 	return (vp);
237 }
238 
239 int swap_lo;
240 
241 /*ARGSUSED*/
242 static int
243 swap_sync(struct vfs *vfsp, short flag, struct cred *cr)
244 {
245 	struct vnode *vp;
246 	int i;
247 
248 	if (!(flag & SYNC_ALL))
249 		return (1);
250 
251 	/*
252 	 * assumes that we are the only one left to access this so that
253 	 * no need to use swapfs_lock (since it's staticly defined)
254 	 */
255 	for (i = 0; i < MAX_SWAP_VNODES; i++) {
256 		vp = swap_vnodes[i];
257 		if (vp) {
258 			VN_HOLD(vp);
259 			(void) VOP_PUTPAGE(vp, (offset_t)0, 0,
260 			    (B_ASYNC | B_FREE), kcred);
261 			VN_RELE(vp);
262 		}
263 	}
264 	return (0);
265 }
266 
267 extern int sw_pending_size;
268 
269 /*
270  * Take an async request off the pending queue
271  */
272 struct async_reqs *
273 sw_getreq()
274 {
275 	struct async_reqs *arg;
276 
277 	mutex_enter(&swapfs_lock);
278 	arg = sw_pendlist;
279 	if (arg) {
280 		sw_pendlist = arg->a_next;
281 		arg->a_next = NULL;
282 		sw_pending_size -= PAGESIZE;
283 	}
284 	ASSERT(sw_pending_size >= 0);
285 	mutex_exit(&swapfs_lock);
286 	return (arg);
287 }
288 
289 /*
290  * Put an async request on the pending queue
291  */
292 void
293 sw_putreq(struct async_reqs *arg)
294 {
295 	/* Hold onto it */
296 	VN_HOLD(arg->a_vp);
297 
298 	mutex_enter(&swapfs_lock);
299 	arg->a_next = sw_pendlist;
300 	sw_pendlist = arg;
301 	sw_pending_size += PAGESIZE;
302 	mutex_exit(&swapfs_lock);
303 }
304 
305 /*
306  * Put an async request back on the pending queue
307  */
308 void
309 sw_putbackreq(struct async_reqs *arg)
310 {
311 	mutex_enter(&swapfs_lock);
312 	arg->a_next = sw_pendlist;
313 	sw_pendlist = arg;
314 	sw_pending_size += PAGESIZE;
315 	mutex_exit(&swapfs_lock);
316 }
317 
318 /*
319  * Take an async request structure off the free list
320  */
321 struct async_reqs *
322 sw_getfree()
323 {
324 	struct async_reqs *arg;
325 
326 	mutex_enter(&swapfs_lock);
327 	arg = sw_freelist;
328 	if (arg) {
329 		sw_freelist = arg->a_next;
330 		arg->a_next = NULL;
331 	}
332 	mutex_exit(&swapfs_lock);
333 	return (arg);
334 }
335 
336 /*
337  * Put an async request structure on the free list
338  */
339 void
340 sw_putfree(struct async_reqs *arg)
341 {
342 	/* Release our hold - should have locked the page by now */
343 	VN_RELE(arg->a_vp);
344 
345 	mutex_enter(&swapfs_lock);
346 	arg->a_next = sw_freelist;
347 	sw_freelist = arg;
348 	mutex_exit(&swapfs_lock);
349 }
350 
351 static pgcnt_t swapfs_pending_delete;
352 
353 /*ARGSUSED*/
354 static void
355 swap_mem_config_post_add(
356 	void *arg,
357 	pgcnt_t delta_swaps)
358 {
359 	(void) swapfs_recalc(physmem - swapfs_pending_delete);
360 }
361 
362 /*ARGSUSED*/
363 static int
364 swap_mem_config_pre_del(
365 	void *arg,
366 	pgcnt_t delta_swaps)
367 {
368 	pgcnt_t nv;
369 
370 	nv = atomic_add_long_nv(&swapfs_pending_delete, (spgcnt_t)delta_swaps);
371 	if (!swapfs_recalc(physmem - nv)) {
372 		/*
373 		 * Tidy-up is done by the call to post_del which
374 		 * is always made.
375 		 */
376 		return (EBUSY);
377 	}
378 	return (0);
379 }
380 
381 /*ARGSUSED*/
382 static void
383 swap_mem_config_post_del(
384 	void *arg,
385 	pgcnt_t delta_swaps,
386 	int cancelled)
387 {
388 	pgcnt_t nv;
389 
390 	nv = atomic_add_long_nv(&swapfs_pending_delete, -(spgcnt_t)delta_swaps);
391 	(void) swapfs_recalc(physmem - nv);
392 }
393 
/*
 * Callback vector handed to kphysm_setup_func_register() by
 * swap_init_mem_config().  The entries are positional: the interface
 * version followed by the post-add, pre-delete and post-delete callbacks
 * defined above.
 */
static kphysm_setup_vector_t swap_mem_config_vec = {
	KPHYSM_SETUP_VECTOR_VERSION,
	swap_mem_config_post_add,
	swap_mem_config_pre_del,
	swap_mem_config_post_del,
};
400 
401 static void
402 swap_init_mem_config(void)
403 {
404 	int ret;
405 
406 	ret = kphysm_setup_func_register(&swap_mem_config_vec, (void *)NULL);
407 	ASSERT(ret == 0);
408 }
409