xref: /openbsd/sys/uvm/uvm_pager.c (revision 1819b7e2)
1 /*	$OpenBSD: uvm_pager.c,v 1.91 2023/08/11 17:53:22 mpi Exp $	*/
2 /*	$NetBSD: uvm_pager.c,v 1.36 2000/11/27 18:26:41 chs Exp $	*/
3 
4 /*
5  * Copyright (c) 1997 Charles D. Cranor and Washington University.
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  *
28  * from: Id: uvm_pager.c,v 1.1.2.23 1998/02/02 20:38:06 chuck Exp
29  */
30 
31 /*
32  * uvm_pager.c: generic functions used to assist the pagers.
33  */
34 
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/malloc.h>
38 #include <sys/pool.h>
39 #include <sys/buf.h>
40 #include <sys/atomic.h>
41 
42 #include <uvm/uvm.h>
43 
44 const struct uvm_pagerops *uvmpagerops[] = {
45 	&aobj_pager,
46 	&uvm_deviceops,
47 	&uvm_vnodeops,
48 };
49 
50 /*
51  * the pager map: provides KVA for I/O
52  *
53  * Each uvm_pseg has room for MAX_PAGER_SEGS pager I/O segments of
54  * MAXBSIZE bytes each.
55  *
56  * The number of uvm_pseg instances is dynamic: they live in the array
57  * psegs, of which at most PSEG_NUMSEGS can exist.
58  *
59  * psegs[0] and psegs[1] always exist (so that the pager can always map
60  * in pages); their segments 0 and 1 are reserved for the pagedaemon.
61  *
62  * Any other pseg is created automatically when no space is available
63  * and destroyed automatically when it is no longer in use.
64  */
65 #define MAX_PAGER_SEGS	16
66 #define PSEG_NUMSEGS	(PAGER_MAP_SIZE / MAX_PAGER_SEGS / MAXBSIZE)
67 struct uvm_pseg {
68 	/* Start of virtual space; 0 if not inited. */
69 	vaddr_t	start;
70 	/* Bitmap of the segments in use in this pseg. */
71 	int	use;
72 };
73 struct	mutex uvm_pseg_lck;
74 struct	uvm_pseg psegs[PSEG_NUMSEGS];
75 
76 #define UVM_PSEG_FULL(pseg)	((pseg)->use == (1 << MAX_PAGER_SEGS) - 1)
77 #define UVM_PSEG_EMPTY(pseg)	((pseg)->use == 0)
78 #define UVM_PSEG_INUSE(pseg,id)	(((pseg)->use & (1 << (id))) != 0)
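/*
 * For illustration: segment "id" of a pseg covers the KVA range
 *
 *	[pseg->start + id * MAXBSIZE, pseg->start + (id + 1) * MAXBSIZE)
 *
 * and bit "id" of pseg->use records whether that slot is handed out.
 * With MAX_PAGER_SEGS == 16 a pseg is full once use == 0xffff, which is
 * exactly the value UVM_PSEG_FULL() tests against.
 */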
79 
80 void		uvm_pseg_init(struct uvm_pseg *);
81 vaddr_t		uvm_pseg_get(int);
82 void		uvm_pseg_release(vaddr_t);
83 
84 /*
85  * uvm_pager_init: init pagers (at boot time)
86  */
87 void
88 uvm_pager_init(void)
89 {
90 	int lcv;
91 
92 	/* init pager map */
93 	uvm_pseg_init(&psegs[0]);
94 	uvm_pseg_init(&psegs[1]);
95 	mtx_init(&uvm_pseg_lck, IPL_VM);
96 
97 	/* init ASYNC I/O queue */
98 	TAILQ_INIT(&uvm.aio_done);
99 
100 	/* call pager init functions */
101 	for (lcv = 0 ; lcv < sizeof(uvmpagerops)/sizeof(struct uvm_pagerops *);
102 	    lcv++) {
103 		if (uvmpagerops[lcv]->pgo_init)
104 			uvmpagerops[lcv]->pgo_init();
105 	}
106 }
107 
108 /*
109  * Initialize a uvm_pseg.
110  *
111  * May fail, in which case seg->start == 0.
112  *
113  * Caller locks uvm_pseg_lck.
114  */
115 void
116 uvm_pseg_init(struct uvm_pseg *pseg)
117 {
118 	KASSERT(pseg->start == 0);
119 	KASSERT(pseg->use == 0);
120 	pseg->start = (vaddr_t)km_alloc(MAX_PAGER_SEGS * MAXBSIZE,
121 	    &kv_any, &kp_none, &kd_trylock);
122 }
123 
124 /*
125  * Acquire a pager map segment.
126  *
127  * Returns a vaddr for paging. 0 on failure.
128  *
129  * Caller does not lock.
130  */
131 vaddr_t
132 uvm_pseg_get(int flags)
133 {
134 	int i;
135 	struct uvm_pseg *pseg;
136 
137 	/*
138 	 * XXX Prevent lock ordering issue in uvm_unmap_detach().  A real
139 	 * fix would be to move the KERNEL_LOCK() out of uvm_unmap_detach().
140 	 *
141 	 *  witness_checkorder() at witness_checkorder+0xba0
142 	 *  __mp_lock() at __mp_lock+0x5f
143 	 *  uvm_unmap_detach() at uvm_unmap_detach+0xc5
144 	 *  uvm_map() at uvm_map+0x857
145 	 *  uvm_km_valloc_try() at uvm_km_valloc_try+0x65
146 	 *  uvm_pseg_get() at uvm_pseg_get+0x6f
147 	 *  uvm_pagermapin() at uvm_pagermapin+0x45
148 	 *  uvn_io() at uvn_io+0xcf
149 	 *  uvn_get() at uvn_get+0x156
150 	 *  uvm_fault_lower() at uvm_fault_lower+0x28a
151 	 *  uvm_fault() at uvm_fault+0x1b3
152 	 *  upageflttrap() at upageflttrap+0x62
153 	 */
154 	KERNEL_LOCK();
155 	mtx_enter(&uvm_pseg_lck);
156 
157 pager_seg_restart:
158 	/* Find first pseg that has room. */
159 	for (pseg = &psegs[0]; pseg != &psegs[PSEG_NUMSEGS]; pseg++) {
160 		if (UVM_PSEG_FULL(pseg))
161 			continue;
162 
163 		if (pseg->start == 0) {
164 			/* Need initialization. */
165 			uvm_pseg_init(pseg);
166 			if (pseg->start == 0)
167 				goto pager_seg_fail;
168 		}
169 
170 		/* Keep indexes 0,1 reserved for pagedaemon. */
171 		if ((pseg == &psegs[0] || pseg == &psegs[1]) &&
172 		    (curproc != uvm.pagedaemon_proc))
173 			i = 2;
174 		else
175 			i = 0;
176 
177 		for (; i < MAX_PAGER_SEGS; i++) {
178 			if (!UVM_PSEG_INUSE(pseg, i)) {
179 				pseg->use |= 1 << i;
180 				mtx_leave(&uvm_pseg_lck);
181 				KERNEL_UNLOCK();
182 				return pseg->start + i * MAXBSIZE;
183 			}
184 		}
185 	}
186 
187 pager_seg_fail:
188 	if ((flags & UVMPAGER_MAPIN_WAITOK) != 0) {
189 		msleep_nsec(&psegs, &uvm_pseg_lck, PVM, "pagerseg", INFSLP);
190 		goto pager_seg_restart;
191 	}
192 
193 	mtx_leave(&uvm_pseg_lck);
194 	KERNEL_UNLOCK();
195 	return 0;
196 }
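
/*
 * In short: with UVMPAGER_MAPIN_WAITOK the caller sleeps on &psegs until
 * uvm_pseg_release() below does a wakeup() and the scan above is retried;
 * without it, 0 comes back as soon as every segment of every pseg is in
 * use and no additional pseg can be set up.
 */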
197 
198 /*
199  * Release a pager map segment.
200  *
201  * Caller does not lock.
202  *
203  * Deallocates pseg if it is no longer in use.
204  */
205 void
206 uvm_pseg_release(vaddr_t segaddr)
207 {
208 	int id;
209 	struct uvm_pseg *pseg;
210 	vaddr_t va = 0;
211 
212 	mtx_enter(&uvm_pseg_lck);
213 	for (pseg = &psegs[0]; pseg != &psegs[PSEG_NUMSEGS]; pseg++) {
214 		if (pseg->start <= segaddr &&
215 		    segaddr < pseg->start + MAX_PAGER_SEGS * MAXBSIZE)
216 			break;
217 	}
218 	KASSERT(pseg != &psegs[PSEG_NUMSEGS]);
219 
220 	id = (segaddr - pseg->start) / MAXBSIZE;
221 	KASSERT(id >= 0 && id < MAX_PAGER_SEGS);
222 
223 	/* test for no remainder */
224 	KDASSERT(segaddr == pseg->start + id * MAXBSIZE);
225 
226 
227 	KASSERT(UVM_PSEG_INUSE(pseg, id));
228 
229 	pseg->use &= ~(1 << id);
230 	wakeup(&psegs);
231 
232 	if ((pseg != &psegs[0] && pseg != &psegs[1]) && UVM_PSEG_EMPTY(pseg)) {
233 		va = pseg->start;
234 		pseg->start = 0;
235 	}
236 
237 	mtx_leave(&uvm_pseg_lck);
238 
239 	if (va) {
240 		km_free((void *)va, MAX_PAGER_SEGS * MAXBSIZE,
241 		    &kv_any, &kp_none);
242 	}
243 }
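
/*
 * A sketch of how the two halves pair up (uvm_pagermapin() and
 * uvm_pagermapout() below are the only callers in this file):
 *
 *	kva = uvm_pseg_get(flags);	reserve one MAXBSIZE slot, 0 on failure
 *	...
 *	pmap_enter() the pages at kva, do the I/O, pmap_remove() them
 *	...
 *	uvm_pseg_release(kva);		slot freed; an empty pseg other than
 *					psegs[0]/psegs[1] is km_free()d too
 */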
244 
245 /*
246  * uvm_pagermapin: map pages into KVA for I/O that needs mappings
247  *
248  * We reserve a blank segment of pager KVA (via uvm_pseg_get()) and then
249  * use pmap_enter() to put the mappings in by hand.
250  */
251 vaddr_t
252 uvm_pagermapin(struct vm_page **pps, int npages, int flags)
253 {
254 	vaddr_t kva, cva;
255 	vm_prot_t prot;
256 	vsize_t size;
257 	struct vm_page *pp;
258 
259 #if defined(__HAVE_PMAP_DIRECT)
260 	/*
261 	 * Use direct mappings for single page, unless there is a risk
262 	 * of aliasing.
263 	 */
264 	if (npages == 1 && PMAP_PREFER_ALIGN() == 0) {
265 		KASSERT(pps[0]);
266 		KASSERT(pps[0]->pg_flags & PG_BUSY);
267 		return pmap_map_direct(pps[0]);
268 	}
269 #endif
270 
271 	prot = PROT_READ;
272 	if (flags & UVMPAGER_MAPIN_READ)
273 		prot |= PROT_WRITE;
274 	size = ptoa(npages);
275 
276 	KASSERT(size <= MAXBSIZE);
277 
278 	kva = uvm_pseg_get(flags);
279 	if (kva == 0)
280 		return 0;
281 
282 	for (cva = kva ; size != 0 ; size -= PAGE_SIZE, cva += PAGE_SIZE) {
283 		pp = *pps++;
284 		KASSERT(pp);
285 		KASSERT(pp->pg_flags & PG_BUSY);
286 		/* Allow pmap_enter to fail. */
287 		if (pmap_enter(pmap_kernel(), cva, VM_PAGE_TO_PHYS(pp),
288 		    prot, PMAP_WIRED | PMAP_CANFAIL | prot) != 0) {
289 			pmap_remove(pmap_kernel(), kva, cva);
290 			pmap_update(pmap_kernel());
291 			uvm_pseg_release(kva);
292 			return 0;
293 		}
294 	}
295 	pmap_update(pmap_kernel());
296 	return kva;
297 }
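
/*
 * Typical use, as a rough sketch only (the real call sites live in the
 * vnode and swap pagers, not in this file):
 *
 *	kva = uvm_pagermapin(pps, npages,
 *	    UVMPAGER_MAPIN_WAITOK | UVMPAGER_MAPIN_READ);
 *	if (kva == 0)
 *		return VM_PAGER_AGAIN;		pager map exhausted
 *	... point the buf's b_data at kva and start the I/O ...
 *	uvm_pagermapout(kva, npages);		once the I/O has completed
 *
 * UVMPAGER_MAPIN_READ means the backing store is read *into* the pages,
 * so the mapping must allow PROT_WRITE; plain pageouts get a read-only
 * mapping.
 */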
298 
299 /*
300  * uvm_pagermapout: remove KVA mapping
301  *
302  * We remove the mappings by hand and then release the pager KVA segment.
303  */
304 void
305 uvm_pagermapout(vaddr_t kva, int npages)
306 {
307 #if defined(__HAVE_PMAP_DIRECT)
308 	/*
309 	 * Use direct mappings for single page, unless there is a risk
310 	 * of aliasing.
311 	 */
312 	if (npages == 1 && PMAP_PREFER_ALIGN() == 0) {
313 		pmap_unmap_direct(kva);
314 		return;
315 	}
316 #endif
317 
318 	pmap_remove(pmap_kernel(), kva, kva + ((vsize_t)npages << PAGE_SHIFT));
319 	pmap_update(pmap_kernel());
320 	uvm_pseg_release(kva);
321 
322 }
323 
324 /*
325  * uvm_mk_pcluster
326  *
327  * generic "make 'pager put' cluster" function.  a pager can either
328  * [1] set pgo_mk_pcluster to NULL (never cluster), [2] set it to this
329  * generic function, or [3] set it to a pager specific function.
330  *
331  * => caller must lock object _and_ pagequeues (since we need to look
332  *    at active vs. inactive bits, etc.)
333  * => caller must make center page busy and write-protect it
334  * => we mark all cluster pages busy for the caller
335  * => the caller must unbusy all pages (and check wanted/released
336  *    status if it drops the object lock)
337  * => flags:
338  *      PGO_ALLPAGES:  all pages in object are valid targets
339  *      !PGO_ALLPAGES: use "lo" and "hi" to limit range of cluster
340  *      PGO_DOACTCLUST: include active pages in cluster.
341  *	PGO_FREE: set the PG_RELEASED bits on the cluster so they'll be freed
342  *		in async io (caller must clean on error).
343  *        NOTE: the caller should clear PG_CLEANCHK bits if PGO_DOACTCLUST.
344  *              PG_CLEANCHK is only a hint, but clearing will help reduce
345  *		the number of calls we make to the pmap layer.
346  */
347 
348 struct vm_page **
349 uvm_mk_pcluster(struct uvm_object *uobj, struct vm_page **pps, int *npages,
350     struct vm_page *center, int flags, voff_t mlo, voff_t mhi)
351 {
352 	struct vm_page **ppsp, *pclust;
353 	voff_t lo, hi, curoff;
354 	int center_idx, forward, incr;
355 
356 	/*
357 	 * center page should already be busy and write protected.  XXX:
358 	 * suppose page is wired?  if we lock, then a process could
359 	 * fault/block on it.  if we don't lock, a process could write the
360 	 * pages in the middle of an I/O.  (consider an msync()).  let's
361 	 * lock it for now (better to delay than corrupt data?).
362 	 */
363 	/* get cluster boundaries, check sanity, and apply our limits as well.*/
364 	uobj->pgops->pgo_cluster(uobj, center->offset, &lo, &hi);
365 	if ((flags & PGO_ALLPAGES) == 0) {
366 		if (lo < mlo)
367 			lo = mlo;
368 		if (hi > mhi)
369 			hi = mhi;
370 	}
371 	if ((hi - lo) >> PAGE_SHIFT > *npages) { /* pps too small, bail out! */
372 		pps[0] = center;
373 		*npages = 1;
374 		return pps;
375 	}
376 
377 	/* now determine the center and attempt to cluster around the edges */
378 	center_idx = (center->offset - lo) >> PAGE_SHIFT;
379 	pps[center_idx] = center;	/* plug in the center page */
380 	ppsp = &pps[center_idx];
381 	*npages = 1;
382 
383 	/*
384 	 * attempt to cluster around the left [backward], and then
385 	 * the right side [forward].
386 	 *
387 	 * note that for inactive pages (pages that have been deactivated)
388 	 * there are no valid mappings and PG_CLEAN should be up to date.
389 	 * [i.e. there is no need to query the pmap with pmap_is_modified
390 	 * since there are no mappings].
391 	 */
392 	for (forward  = 0 ; forward <= 1 ; forward++) {
393 		incr = forward ? PAGE_SIZE : -PAGE_SIZE;
394 		curoff = center->offset + incr;
395 		for ( ;(forward == 0 && curoff >= lo) ||
396 		       (forward && curoff < hi);
397 		      curoff += incr) {
398 
399 			pclust = uvm_pagelookup(uobj, curoff); /* lookup page */
400 			if (pclust == NULL) {
401 				break;			/* no page */
402 			}
403 			/* handle active pages */
404 			/* NOTE: inactive pages don't have pmap mappings */
405 			if ((pclust->pg_flags & PQ_INACTIVE) == 0) {
406 				if ((flags & PGO_DOACTCLUST) == 0) {
407 					/* don't want mapped pages at all */
408 					break;
409 				}
410 
411 				/* make sure "clean" bit is sync'd */
412 				if ((pclust->pg_flags & PG_CLEANCHK) == 0) {
413 					if ((pclust->pg_flags & (PG_CLEAN|PG_BUSY))
414 					   == PG_CLEAN &&
415 					   pmap_is_modified(pclust))
416 						atomic_clearbits_int(
417 						    &pclust->pg_flags,
418 						    PG_CLEAN);
419 					/* now checked */
420 					atomic_setbits_int(&pclust->pg_flags,
421 					    PG_CLEANCHK);
422 				}
423 			}
424 
425 			/* is page available for cleaning and does it need it */
426 			if ((pclust->pg_flags & (PG_CLEAN|PG_BUSY)) != 0) {
427 				break;	/* page is already clean or is busy */
428 			}
429 
430 			/* yes!   enroll the page in our array */
431 			atomic_setbits_int(&pclust->pg_flags, PG_BUSY);
432 			UVM_PAGE_OWN(pclust, "uvm_mk_pcluster");
433 
434 			/*
435 			 * If we want to free after io is done, and we're
436 			 * async, set the released flag
437 			 */
438 			if ((flags & (PGO_FREE|PGO_SYNCIO)) == PGO_FREE)
439 				atomic_setbits_int(&pclust->pg_flags,
440 				    PG_RELEASED);
441 
442 			/* XXX: protect wired page?   see above comment. */
443 			pmap_page_protect(pclust, PROT_READ);
444 			if (!forward) {
445 				ppsp--;			/* back up one page */
446 				*ppsp = pclust;
447 			} else {
448 				/* move forward one page */
449 				ppsp[*npages] = pclust;
450 			}
451 			(*npages)++;
452 		}
453 	}
454 
455 	/*
456 	 * done!  return the cluster array to the caller!!!
457 	 */
458 	return ppsp;
459 }
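
/*
 * For illustration: with room for 8 pages in pps and a center page that
 * clusters one page backward and two pages forward, the result is
 *
 *	pps:  [ unused | back | center | fwd1 | fwd2 | unused ... ]
 *	                 ^returned ppsp, *npages == 4
 *
 * i.e. the returned pointer lands inside the caller's array and only the
 * *npages entries starting there are valid (and busy).
 */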
460 
461 /*
462  * uvm_pager_put: high level pageout routine
463  *
464  * we want to pageout page "pg" to backing store, clustering if
465  * possible.
466  *
467  * => page queues must be locked by caller
468  * => if page is not swap-backed, then "uobj" points to the object
469  *	backing it.
470  * => if page is swap-backed, then "uobj" should be NULL.
471  * => "pg" should be PG_BUSY (by caller), and !PG_CLEAN
472  *    for swap-backed memory, "pg" can be NULL if there is no page
473  *    of interest [sometimes the case for the pagedaemon]
474  * => "ppsp_ptr" should point to an array of npages vm_page pointers
475  *	for possible cluster building
476  * => flags (first two for non-swap-backed pages)
477  *	PGO_ALLPAGES: all pages in uobj are valid targets
478  *	PGO_DOACTCLUST: include "PQ_ACTIVE" pages as valid targets
479  *	PGO_SYNCIO: do SYNC I/O (no async)
480  *	PGO_PDFREECLUST: pagedaemon: drop cluster on successful I/O
481  *	PGO_FREE: tell the aio daemon to free pages in the async case.
482  * => start/stop: if (uobj && !PGO_ALLPAGES) limit targets to this range
483  *		  if (!uobj) start is the (daddr_t) of the starting swapblk
484  * => return state:
485  *	1. we return the VM_PAGER status code of the pageout
486  *	2. we return with the page queues unlocked
487  *	3. on errors we always drop the cluster.   thus, if we return
488  *		!PEND, !OK, then the caller only has to worry about
489  *		un-busying the main page (not the cluster pages).
490  *	4. on success, if !PGO_PDFREECLUST, we return the cluster
491  *		with all pages busy (caller must un-busy and check
492  *		wanted/released flags).
493  */
494 int
495 uvm_pager_put(struct uvm_object *uobj, struct vm_page *pg,
496     struct vm_page ***ppsp_ptr, int *npages, int flags,
497     voff_t start, voff_t stop)
498 {
499 	int result;
500 	daddr_t swblk;
501 	struct vm_page **ppsp = *ppsp_ptr;
502 
503 	/*
504 	 * note that uobj is null  if we are doing a swap-backed pageout.
505 	 * note that uobj is !null if we are doing normal object pageout.
506 	 * note that the page queues must be locked to cluster.
507 	 */
508 	if (uobj) {	/* if !swap-backed */
509 		/*
510 		 * attempt to build a cluster for pageout using its
511 		 * make-put-cluster function (if it has one).
512 		 */
513 		if (uobj->pgops->pgo_mk_pcluster) {
514 			ppsp = uobj->pgops->pgo_mk_pcluster(uobj, ppsp,
515 			    npages, pg, flags, start, stop);
516 			*ppsp_ptr = ppsp;  /* update caller's pointer */
517 		} else {
518 			ppsp[0] = pg;
519 			*npages = 1;
520 		}
521 
522 		swblk = 0;		/* XXX: keep gcc happy */
523 	} else {
524 		/*
525 		 * for swap-backed pageout, the caller (the pagedaemon) has
526 		 * already built the cluster for us.   the starting swap
527 		 * block we are writing to has been passed in as "start."
528 		 * "pg" could be NULL if there is no page we are especially
529 		 * interested in (in which case the whole cluster gets dropped
530 		 * in the event of an error or a sync "done").
531 		 */
532 		swblk = start;
533 		/* ppsp and npages should be ok */
534 	}
535 
536 	/* now that we've clustered we can unlock the page queues */
537 	uvm_unlock_pageq();
538 
539 	/*
540 	 * now attempt the I/O.   if we have a failure and we are
541 	 * clustered, we will drop the cluster and try again.
542 	 */
543 ReTry:
544 	if (uobj) {
545 		result = uobj->pgops->pgo_put(uobj, ppsp, *npages, flags);
546 	} else {
547 		/* XXX daddr_t -> int */
548 		result = uvm_swap_put(swblk, ppsp, *npages, flags);
549 	}
550 
551 	/*
552 	 * we have attempted the I/O.
553 	 *
554 	 * if the I/O was a success then:
555 	 * 	if !PGO_PDFREECLUST, we return the cluster to the
556 	 *		caller (who must un-busy all pages)
557 	 *	else we un-busy cluster pages for the pagedaemon
558 	 *
559 	 * if I/O is pending (async i/o) then we return the pending code.
560 	 * [in this case the async i/o done function must clean up when
561 	 *  i/o is done...]
562 	 */
563 	if (result == VM_PAGER_PEND || result == VM_PAGER_OK) {
564 		if (result == VM_PAGER_OK && (flags & PGO_PDFREECLUST)) {
565 			/* drop cluster */
566 			if (*npages > 1 || pg == NULL)
567 				uvm_pager_dropcluster(uobj, pg, ppsp, npages,
568 				    PGO_PDFREECLUST);
569 		}
570 		return (result);
571 	}
572 
573 	/*
574 	 * a pager error occurred (even after dropping the cluster, if there
575 	 * was one).  give up! the caller only has one page ("pg")
576 	 * to worry about.
577 	 */
578 	if (*npages > 1 || pg == NULL) {
579 		uvm_pager_dropcluster(uobj, pg, ppsp, npages, PGO_REALLOCSWAP);
580 
581 		/*
582 		 * for failed swap-backed pageouts with a "pg",
583 		 * we need to reset pg's swslot to either:
584 		 * "swblk" (for transient errors, so we can retry),
585 		 * or 0 (for hard errors).
586 		 */
587 		if (uobj == NULL && pg != NULL) {
588 			/* XXX daddr_t -> int */
589 			int nswblk = (result == VM_PAGER_AGAIN) ? swblk : 0;
590 			if (pg->pg_flags & PQ_ANON) {
591 				rw_enter(pg->uanon->an_lock, RW_WRITE);
592 				pg->uanon->an_swslot = nswblk;
593 				rw_exit(pg->uanon->an_lock);
594 			} else {
595 				rw_enter(pg->uobject->vmobjlock, RW_WRITE);
596 				uao_set_swslot(pg->uobject,
597 					       pg->offset >> PAGE_SHIFT,
598 					       nswblk);
599 				rw_exit(pg->uobject->vmobjlock);
600 			}
601 		}
602 		if (result == VM_PAGER_AGAIN) {
603 			/*
604 			 * for transient failures, free all the swslots that
605 			 * we're not going to retry with.
606 			 */
607 			if (uobj == NULL) {
608 				if (pg) {
609 					/* XXX daddr_t -> int */
610 					uvm_swap_free(swblk + 1, *npages - 1);
611 				} else {
612 					/* XXX daddr_t -> int */
613 					uvm_swap_free(swblk, *npages);
614 				}
615 			}
616 			if (pg) {
617 				ppsp[0] = pg;
618 				*npages = 1;
619 				goto ReTry;
620 			}
621 		} else if (uobj == NULL) {
622 			/*
623 			 * for hard errors on swap-backed pageouts,
624 			 * mark the swslots as bad.  note that we do not
625 			 * free swslots that we mark bad.
626 			 */
627 			/* XXX daddr_t -> int */
628 			uvm_swap_markbad(swblk, *npages);
629 		}
630 	}
631 
632 	/*
633 	 * any error that was not retried above ends up here: hand the
634 	 * pager's error code back to the caller, who only has the one
635 	 * page ("pg") left to worry about.
636 	 */
637 
638 	return result;
639 }
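
/*
 * A rough caller-side sketch following the rules above (the flag mix and
 * the error handling here are illustrative only):
 *
 *	uvm_lock_pageq();
 *	result = uvm_pager_put(uobj, pg, &ppsp, &npages,
 *	    PGO_DOACTCLUST | PGO_PDFREECLUST, start, stop);
 *	(the page queues are now unlocked)
 *
 *	VM_PAGER_OK:	with PGO_PDFREECLUST the cluster has already been
 *			un-busied; only "pg" itself is left to deal with
 *	VM_PAGER_PEND:	async I/O is in flight; the aiodone path finishes
 *			the cluster later
 *	anything else:	the cluster was dropped; un-busy "pg" and decide
 *			whether to retry
 */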
640 
641 /*
642  * uvm_pager_dropcluster: drop a cluster we have built (because we
643  * got an error, or, if PGO_PDFREECLUST we are un-busying the
644  * cluster pages on behalf of the pagedaemon).
645  *
646  * => uobj, if non-null, is a non-swap-backed object
647  * => page queues are not locked
648  * => pg is our page of interest (the one we clustered around, can be null)
649  * => ppsp/npages is our current cluster
650  * => flags: PGO_PDFREECLUST: pageout was a success: un-busy cluster
651  *	pages on behalf of the pagedaemon.
652  *           PGO_REALLOCSWAP: drop previously allocated swap slots for
653  *		clustered swap-backed pages (except for "pg" if !NULL)
654  *		"swblk" is the start of swap alloc (e.g. for ppsp[0])
655  *		[only meaningful if swap-backed (uobj == NULL)]
656  */
657 
658 void
659 uvm_pager_dropcluster(struct uvm_object *uobj, struct vm_page *pg,
660     struct vm_page **ppsp, int *npages, int flags)
661 {
662 	int lcv;
663 
664 	KASSERT(uobj == NULL || rw_write_held(uobj->vmobjlock));
665 
666 	/* drop all pages but "pg" */
667 	for (lcv = 0 ; lcv < *npages ; lcv++) {
668 		/* skip "pg" or empty slot */
669 		if (ppsp[lcv] == pg || ppsp[lcv] == NULL)
670 			continue;
671 
672 		/*
673 		 * Note that PQ_ANON bit can't change as long as we are holding
674 		 * the PG_BUSY bit (so there is no need to lock the page
675 		 * queues to test it).
676 		 */
677 		if (!uobj) {
678 			if (ppsp[lcv]->pg_flags & PQ_ANON) {
679 				rw_enter(ppsp[lcv]->uanon->an_lock, RW_WRITE);
680 				if (flags & PGO_REALLOCSWAP)
681 					  /* zap swap block */
682 					  ppsp[lcv]->uanon->an_swslot = 0;
683 			} else {
684 				rw_enter(ppsp[lcv]->uobject->vmobjlock,
685 				    RW_WRITE);
686 				if (flags & PGO_REALLOCSWAP)
687 					uao_set_swslot(ppsp[lcv]->uobject,
688 					    ppsp[lcv]->offset >> PAGE_SHIFT, 0);
689 			}
690 		}
691 
692 		/* did someone want the page while we had it busy-locked? */
693 		if (ppsp[lcv]->pg_flags & PG_WANTED) {
694 			wakeup(ppsp[lcv]);
695 		}
696 
697 		/* if page was released, release it.  otherwise un-busy it */
698 		if (ppsp[lcv]->pg_flags & PG_RELEASED &&
699 		    ppsp[lcv]->pg_flags & PQ_ANON) {
700 				/* kills anon and frees pg */
701 				uvm_anon_release(ppsp[lcv]->uanon);
702 				continue;
703 		} else {
704 			/*
705 			 * if we were planning on async io then we would
706 			 * have PG_RELEASED set, clear that with the others.
707 			 */
708 			atomic_clearbits_int(&ppsp[lcv]->pg_flags,
709 			    PG_BUSY|PG_WANTED|PG_FAKE|PG_RELEASED);
710 			UVM_PAGE_OWN(ppsp[lcv], NULL);
711 		}
712 
713 		/*
714 		 * if we are operating on behalf of the pagedaemon and we
715 		 * had a successful pageout update the page!
716 		 */
717 		if (flags & PGO_PDFREECLUST) {
718 			pmap_clear_reference(ppsp[lcv]);
719 			pmap_clear_modify(ppsp[lcv]);
720 			atomic_setbits_int(&ppsp[lcv]->pg_flags, PG_CLEAN);
721 		}
722 
723 		/* if anonymous cluster, unlock object and move on */
724 		if (!uobj) {
725 			if (ppsp[lcv]->pg_flags & PQ_ANON)
726 				rw_exit(ppsp[lcv]->uanon->an_lock);
727 			else
728 				rw_exit(ppsp[lcv]->uobject->vmobjlock);
729 		}
730 	}
731 }
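
/*
 * The two call sites in uvm_pager_put() above show the two modes:
 *
 *	uvm_pager_dropcluster(uobj, pg, ppsp, npages, PGO_PDFREECLUST);
 *		successful pagedaemon pageout: un-busy the cluster pages
 *		and mark them clean
 *	uvm_pager_dropcluster(uobj, pg, ppsp, npages, PGO_REALLOCSWAP);
 *		failed pageout: un-busy the cluster pages and, for
 *		swap-backed ones, zap their swap slots
 *
 * in both modes "pg" itself is skipped and left to the caller.
 */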
732 
733 /*
734  * interrupt-context iodone handler for single-buf i/os
735  * or the top-level buf of a nested-buf i/o.
736  *
737  * => must be at splbio().
738  */
739 
740 void
741 uvm_aio_biodone(struct buf *bp)
742 {
743 	splassert(IPL_BIO);
744 
745 	/* reset b_iodone for when this is a single-buf i/o. */
746 	bp->b_iodone = uvm_aio_aiodone;
747 
748 	mtx_enter(&uvm.aiodoned_lock);
749 	TAILQ_INSERT_TAIL(&uvm.aio_done, bp, b_freelist);
750 	wakeup(&uvm.aiodoned);
751 	mtx_leave(&uvm.aiodoned_lock);
752 }
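
/*
 * How an async pageout gets here, sketched from the callers' side (the
 * setup lives in the vnode and swap pagers, not in this file):
 *
 *	bp->b_iodone = uvm_aio_biodone;	set before the write is started
 *	... strategy routine starts the async write ...
 *
 * biodone() runs b_iodone at IPL_BIO, i.e. uvm_aio_biodone() above, which
 * re-points b_iodone at uvm_aio_aiodone(), queues the buf on uvm.aio_done
 * and wakes the aiodone daemon; that thread later runs the reset b_iodone
 * in its own context to finish off the pages.
 */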
753 
754 void
755 uvm_aio_aiodone_pages(struct vm_page **pgs, int npages, boolean_t write,
756     int error)
757 {
758 	struct vm_page *pg;
759 	struct rwlock *slock;
760 	boolean_t swap;
761 	int i, swslot;
762 
763 	slock = NULL;
764 	pg = pgs[0];
765 	swap = (pg->uanon != NULL && pg->uobject == NULL) ||
766 		(pg->pg_flags & PQ_AOBJ) != 0;
767 
768 	KASSERT(swap);
769 	KASSERT(write);
770 
771 	if (error) {
772 		if (pg->uobject != NULL) {
773 			swslot = uao_find_swslot(pg->uobject,
774 			    pg->offset >> PAGE_SHIFT);
775 		} else {
776 			swslot = pg->uanon->an_swslot;
777 		}
778 		KASSERT(swslot);
779 	}
780 
781 	for (i = 0; i < npages; i++) {
782 		int anon_disposed = 0;
783 
784 		pg = pgs[i];
785 		KASSERT((pg->pg_flags & PG_FAKE) == 0);
786 
787 		/*
788 		 * lock each page's object (or anon) individually since
789 		 * each page may need a different lock.
790 		 */
791 		if (pg->uobject != NULL) {
792 			slock = pg->uobject->vmobjlock;
793 		} else {
794 			slock = pg->uanon->an_lock;
795 		}
796 		rw_enter(slock, RW_WRITE);
797 		anon_disposed = (pg->pg_flags & PG_RELEASED) != 0;
798 		KASSERT(!anon_disposed || pg->uobject != NULL ||
799 		    pg->uanon->an_ref == 0);
800 		uvm_lock_pageq();
801 
802 		/*
803 		 * if this was a successful write,
804 		 * mark the page PG_CLEAN.
805 		 */
806 		if (!error) {
807 			pmap_clear_reference(pg);
808 			pmap_clear_modify(pg);
809 			atomic_setbits_int(&pg->pg_flags, PG_CLEAN);
810 		}
811 
812 		/*
813 		 * unlock everything for this page now.
814 		 */
815 		if (pg->uobject == NULL && anon_disposed) {
816 			uvm_unlock_pageq();
817 			uvm_anon_release(pg->uanon);
818 		} else {
819 			uvm_page_unbusy(&pg, 1);
820 			uvm_unlock_pageq();
821 			rw_exit(slock);
822 		}
823 	}
824 
825 	if (error) {
826 		uvm_swap_markbad(swslot, npages);
827 	}
828 }
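
/*
 * Note that the KASSERTs above restrict this path to asynchronous writes
 * of swap-backed pages (anon or aobj); on error the whole run of swap
 * slots is marked bad rather than retried.
 */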
829 
830 /*
831  * uvm_aio_aiodone: do iodone processing for async i/os.
832  * this should be called in thread context, not interrupt context.
833  */
834 void
835 uvm_aio_aiodone(struct buf *bp)
836 {
837 	int npages = bp->b_bufsize >> PAGE_SHIFT;
838 	struct vm_page *pgs[MAXPHYS >> PAGE_SHIFT];
839 	int i, error;
840 	boolean_t write;
841 
842 	KASSERT(npages <= MAXPHYS >> PAGE_SHIFT);
843 	splassert(IPL_BIO);
844 
845 	error = (bp->b_flags & B_ERROR) ? (bp->b_error ? bp->b_error : EIO) : 0;
846 	write = (bp->b_flags & B_READ) == 0;
847 
848 	for (i = 0; i < npages; i++)
849 		pgs[i] = uvm_atopg((vaddr_t)bp->b_data +
850 		    ((vsize_t)i << PAGE_SHIFT));
851 	uvm_pagermapout((vaddr_t)bp->b_data, npages);
852 #ifdef UVM_SWAP_ENCRYPT
853 	/*
854 	 * XXX - assumes that we only get ASYNC writes. used to be above.
855 	 */
856 	if (pgs[0]->pg_flags & PQ_ENCRYPT) {
857 		uvm_swap_freepages(pgs, npages);
858 		goto freed;
859 	}
860 #endif /* UVM_SWAP_ENCRYPT */
861 
862 	uvm_aio_aiodone_pages(pgs, npages, write, error);
863 
864 #ifdef UVM_SWAP_ENCRYPT
865 freed:
866 #endif
867 	pool_put(&bufpool, bp);
868 }
869