xref: /openbsd/sys/uvm/uvm_pdaemon.c (revision 6b5aed99)
1 /*	$OpenBSD: uvm_pdaemon.c,v 1.134 2025/01/25 08:55:52 mpi Exp $	*/
2 /*	$NetBSD: uvm_pdaemon.c,v 1.23 2000/08/20 10:24:14 bjh21 Exp $	*/
3 
4 /*
5  * Copyright (c) 1997 Charles D. Cranor and Washington University.
6  * Copyright (c) 1991, 1993, The Regents of the University of California.
7  *
8  * All rights reserved.
9  *
10  * This code is derived from software contributed to Berkeley by
11  * The Mach Operating System project at Carnegie-Mellon University.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  * 3. Neither the name of the University nor the names of its contributors
22  *    may be used to endorse or promote products derived from this software
23  *    without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  *
37  *	@(#)vm_pageout.c        8.5 (Berkeley) 2/14/94
38  * from: Id: uvm_pdaemon.c,v 1.1.2.32 1998/02/06 05:26:30 chs Exp
39  *
40  *
41  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
42  * All rights reserved.
43  *
44  * Permission to use, copy, modify and distribute this software and
45  * its documentation is hereby granted, provided that both the copyright
46  * notice and this permission notice appear in all copies of the
47  * software, derivative works or modified versions, and any portions
48  * thereof, and that both notices appear in supporting documentation.
49  *
50  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
51  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
52  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
53  *
54  * Carnegie Mellon requests users of this software to return to
55  *
56  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
57  *  School of Computer Science
58  *  Carnegie Mellon University
59  *  Pittsburgh PA 15213-3890
60  *
61  * any improvements or extensions that they make and grant Carnegie the
62  * rights to redistribute these changes.
63  */
64 
65 /*
66  * uvm_pdaemon.c: the page daemon
67  */
68 
69 #include <sys/param.h>
70 #include <sys/systm.h>
71 #include <sys/kernel.h>
72 #include <sys/pool.h>
73 #include <sys/proc.h>
74 #include <sys/buf.h>
75 #include <sys/mount.h>
76 #include <sys/atomic.h>
77 
78 #ifdef HIBERNATE
79 #include <sys/hibernate.h>
80 #endif
81 
82 #include <uvm/uvm.h>
83 
84 #include "drm.h"
85 
86 #if NDRM > 0
87 extern unsigned long drmbackoff(long);
88 #endif
89 
90 /*
91  * UVMPD_NUMDIRTYREACTS is how many dirty pages the pagedaemon will reactivate
92  * in a pass thru the inactive list when swap is full.  the value should be
93  * "small"... if it's too large we'll cycle the active pages thru the inactive
94  * queue too quickly for them to be referenced and avoid being freed.
95  */
96 
97 #define UVMPD_NUMDIRTYREACTS 16
98 
99 
100 /*
101  * local prototypes
102  */
103 
104 struct rwlock	*uvmpd_trylockowner(struct vm_page *);
105 void		uvmpd_scan(struct uvm_pmalloc *, int, int);
106 int		uvmpd_scan_inactive(struct uvm_pmalloc *, int);
107 void		uvmpd_scan_active(struct uvm_pmalloc *, int, int);
108 void		uvmpd_tune(void);
109 void		uvmpd_drop(struct pglist *);
110 int		uvmpd_dropswap(struct vm_page *);
111 
112 /*
113  * uvm_wait: wait (sleep) for the page daemon to free some pages
114  *
115  * => should be called with all locks released
116  * => should _not_ be called by the page daemon (to avoid deadlock)
117  */
118 
119 void
120 uvm_wait(const char *wmsg)
121 {
122 	uint64_t timo = INFSLP;
123 
124 #ifdef DIAGNOSTIC
125 	if (curproc == &proc0)
126 		panic("%s: cannot sleep for memory during boot", __func__);
127 #endif
128 
129 	/*
130 	 * check for page daemon going to sleep (waiting for itself)
131 	 */
132 	if (curproc == uvm.pagedaemon_proc) {
133 		printf("uvm_wait emergency bufbackoff\n");
134 		if (bufbackoff(NULL, 4) >= 4)
135 			return;
136 		/*
137 		 * now we have a problem: the pagedaemon wants to go to
138 		 * sleep until it frees more memory.   but how can it
139 		 * free more memory if it is asleep?  that is a deadlock.
140 		 * we have two options:
141 		 *  [1] panic now
142 		 *  [2] put a timeout on the sleep, thus causing the
143 		 *      pagedaemon to only pause (rather than sleep forever)
144 		 *
145 		 * note that option [2] will only help us if we get lucky
146 		 * and some other process on the system breaks the deadlock
147 		 * by exiting or freeing memory (thus allowing the pagedaemon
148 		 * to continue).  for now we panic if DEBUG is defined,
149 		 * otherwise we hope for the best with option [2] (better
150 		 * yet, this should never happen in the first place!).
151 		 */
152 
153 		printf("pagedaemon: deadlock detected!\n");
154 		timo = MSEC_TO_NSEC(125);	/* set timeout */
155 #if defined(DEBUG)
156 		/* DEBUG: panic so we can debug it */
157 		panic("pagedaemon deadlock");
158 #endif
159 	}
160 
161 	uvm_lock_fpageq();
162 	wakeup(&uvm.pagedaemon);		/* wake the daemon! */
163 	msleep_nsec(&uvmexp.free, &uvm.fpageqlock, PVM | PNORELOCK, wmsg, timo);
164 }
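/*
 * Illustrative only (not part of the original file): a minimal sketch of
 * the retry pattern uvm_wait() exists for.  The object `uobj', the offset
 * and the wait-message "pdsleep" are made up for the example; the caller
 * holds no locks, as required by the comment above.
 */
#if 0
	struct vm_page *pg;

	while ((pg = uvm_pagealloc(uobj, offset, NULL, 0)) == NULL)
		uvm_wait("pdsleep");	/* wake the daemon, sleep on uvmexp.free */
#endif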
165 
166 /*
167  * uvmpd_tune: tune paging parameters
168  */
169 void
170 uvmpd_tune(void)
171 {
172 	int val;
173 
174 	val = uvmexp.npages / 30;
175 
176 	/* XXX:  what are these values good for? */
177 	val = max(val, (16*1024) >> PAGE_SHIFT);
178 
179 	/* Make sure there's always a user page free. */
180 	if (val < uvmexp.reserve_kernel + 1)
181 		val = uvmexp.reserve_kernel + 1;
182 	uvmexp.freemin = val;
183 
184 	/* Calculate free target. */
185 	val = (uvmexp.freemin * 4) / 3;
186 	if (val <= uvmexp.freemin)
187 		val = uvmexp.freemin + 1;
188 	uvmexp.freetarg = val;
189 
190 	uvmexp.wiredmax = uvmexp.npages / 3;
191 }
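/*
 * Worked example (illustrative, not from the original file): assume 4KB
 * pages, uvmexp.npages = 262144 (~1GB managed by UVM) and a small
 * uvmexp.reserve_kernel.  Then:
 *
 *	val      = 262144 / 30          = 8738 pages  (~34MB)
 *	floor    = (16*1024) >> 12      = 4 pages     (no effect here)
 *	freemin  = 8738 pages
 *	freetarg = (8738 * 4) / 3       = 11650 pages (~45MB)
 *	wiredmax = 262144 / 3           = 87381 pages
 */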
192 
193 /*
194  * Indicate to the page daemon that a nowait call failed and it should
195  * recover at least some memory in the most restricted region (assumed
196  * to be dma_constraint).
197  */
198 struct uvm_pmalloc nowait_pma;
199 
200 static inline int
201 uvmpd_pma_done(struct uvm_pmalloc *pma)
202 {
203 	if (pma == NULL || (pma->pm_flags & UVM_PMA_FREED))
204 		return 1;
205 	return 0;
206 }
207 
208 /*
209  * uvm_pageout: the main loop for the pagedaemon
210  */
211 void
212 uvm_pageout(void *arg)
213 {
214 	struct uvm_constraint_range constraint;
215 	struct uvm_pmalloc *pma;
216 	int shortage, inactive_shortage;
217 
218 	/* ensure correct priority and set paging parameters... */
219 	uvm.pagedaemon_proc = curproc;
220 	(void) spl0();
221 	uvmpd_tune();
222 
223 	/*
224 	 * XXX realistically, this is what our nowait callers probably
225 	 * care about.
226 	 */
227 	nowait_pma.pm_constraint = dma_constraint;
228 	nowait_pma.pm_size = (16 << PAGE_SHIFT); /* XXX */
229 	nowait_pma.pm_flags = 0;
230 
231 	for (;;) {
232 		long size;
233 
234 		uvm_lock_fpageq();
235 		if (TAILQ_EMPTY(&uvm.pmr_control.allocs) || uvmexp.paging > 0) {
236 			msleep_nsec(&uvm.pagedaemon, &uvm.fpageqlock, PVM,
237 			    "pgdaemon", INFSLP);
238 			uvmexp.pdwoke++;
239 		}
240 
241 		if ((pma = TAILQ_FIRST(&uvm.pmr_control.allocs)) != NULL) {
242 			pma->pm_flags |= UVM_PMA_BUSY;
243 			constraint = pma->pm_constraint;
244 		} else {
245 			constraint = no_constraint;
246 		}
247 		/* How many pages do we need to free during this round? */
248 		shortage = uvmexp.freetarg -
249 		    (uvmexp.free + uvmexp.paging) + BUFPAGES_DEFICIT;
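		/*
		 * Worked example with made-up numbers (not from the original
		 * file): freetarg = 512, free = 300, paging = 100 and
		 * BUFPAGES_DEFICIT = 40 give
		 *	shortage = 512 - (300 + 100) + 40 = 152
		 * pages still to be freed this round; a zero or negative
		 * result means the free target is already met.
		 */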
250 		uvm_unlock_fpageq();
251 
252 		/*
253 		 * now lock page queues and recompute inactive count
254 		 */
255 		uvm_lock_pageq();
256 		uvmexp.inactarg = (uvmexp.active + uvmexp.inactive) / 3;
257 		if (uvmexp.inactarg <= uvmexp.freetarg) {
258 			uvmexp.inactarg = uvmexp.freetarg + 1;
259 		}
260 		inactive_shortage =
261 			uvmexp.inactarg - uvmexp.inactive - BUFPAGES_INACT;
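		/*
		 * Worked example with made-up numbers (not from the original
		 * file): active = 2000, inactive = 700 and BUFPAGES_INACT =
		 * 100 give inactarg = (2000 + 700) / 3 = 900 (left alone,
		 * assuming it already exceeds freetarg), so
		 *	inactive_shortage = 900 - 700 - 100 = 100
		 * pages should be moved from the active to the inactive list.
		 */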
262 		uvm_unlock_pageq();
263 
264 		size = 0;
265 		if (pma != NULL)
266 			size += pma->pm_size >> PAGE_SHIFT;
267 		if (shortage > 0)
268 			size += shortage;
269 
270 		if (size == 0) {
271 			/*
272 			 * Since the inactive target just got updated
273 			 * above, both `size' and `inactive_shortage' can
274 			 * be 0.
275 			 */
276 			if (inactive_shortage) {
277 				uvm_lock_pageq();
278 				uvmpd_scan_active(NULL, 0, inactive_shortage);
279 				uvm_unlock_pageq();
280 			}
281 			continue;
282 		}
283 
284 		/* Reclaim pages from the buffer cache if possible. */
285 		shortage -= bufbackoff(&constraint, size * 2);
286 #if NDRM > 0
287 		shortage -= drmbackoff(size * 2);
288 #endif
289 		if (shortage > 0)
290 			shortage -= uvm_pmr_cache_drain();
291 
292 		/*
293 		 * scan if needed
294 		 */
295 		uvm_lock_pageq();
296 		if (!uvmpd_pma_done(pma) ||
297 		    (shortage > 0) || (inactive_shortage > 0)) {
298 			uvmpd_scan(pma, shortage, inactive_shortage);
299 		}
300 
301 		/*
302 		 * if there's any free memory to be had,
303 		 * wake up any waiters.
304 		 */
305 		uvm_lock_fpageq();
306 		if (uvmexp.free > uvmexp.reserve_kernel || uvmexp.paging == 0) {
307 			wakeup(&uvmexp.free);
308 		}
309 
310 		if (pma != NULL) {
311 			/*
312 			 * XXX If UVM_PMA_FREED isn't set, no pages
313 			 * were freed.  Should we set UVM_PMA_FAIL in
314 			 * that case?
315 			 */
316 			pma->pm_flags &= ~UVM_PMA_BUSY;
317 			if (pma->pm_flags & UVM_PMA_FREED) {
318 				pma->pm_flags &= ~UVM_PMA_LINKED;
319 				TAILQ_REMOVE(&uvm.pmr_control.allocs, pma, pmq);
320 				wakeup(pma);
321 			}
322 		}
323 		uvm_unlock_fpageq();
324 
325 		/*
326 		 * scan done.  unlock page queues (the only lock we are holding)
327 		 */
328 		uvm_unlock_pageq();
329 
330 		sched_pause(yield);
331 	}
332 	/*NOTREACHED*/
333 }
334 
335 
336 /*
337  * uvm_aiodone_daemon:  main loop for the aiodone daemon.
338  */
339 void
340 uvm_aiodone_daemon(void *arg)
341 {
342 	int s, npages;
343 	struct buf *bp, *nbp;
344 
345 	uvm.aiodoned_proc = curproc;
346 	KERNEL_UNLOCK();
347 
348 	for (;;) {
349 		/*
350 		 * Check for done aio structures. If we've got structures to
351 		 * process, do so. Otherwise sleep while avoiding races.
352 		 */
353 		mtx_enter(&uvm.aiodoned_lock);
354 		while ((bp = TAILQ_FIRST(&uvm.aio_done)) == NULL)
355 			msleep_nsec(&uvm.aiodoned, &uvm.aiodoned_lock,
356 			    PVM, "aiodoned", INFSLP);
357 		/* Take the list for ourselves. */
358 		TAILQ_INIT(&uvm.aio_done);
359 		mtx_leave(&uvm.aiodoned_lock);
360 
361 		/* process each i/o that's done. */
362 		npages = 0;
363 		KERNEL_LOCK();
364 		while (bp != NULL) {
365 			if (bp->b_flags & B_PDAEMON) {
366 				npages += bp->b_bufsize >> PAGE_SHIFT;
367 			}
368 			nbp = TAILQ_NEXT(bp, b_freelist);
369 			s = splbio();	/* b_iodone must be called at splbio */
370 			(*bp->b_iodone)(bp);
371 			splx(s);
372 			bp = nbp;
373 
374 			sched_pause(yield);
375 		}
376 		KERNEL_UNLOCK();
377 
378 		uvm_lock_fpageq();
379 		atomic_sub_int(&uvmexp.paging, npages);
380 		wakeup(uvmexp.free <= uvmexp.reserve_kernel ? &uvm.pagedaemon :
381 		    &uvmexp.free);
382 		uvm_unlock_fpageq();
383 	}
384 }
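/*
 * Worked example (illustrative, not from the original file): when a 64KB
 * pagedaemon-issued write (B_PDAEMON set) completes on a machine with 4KB
 * pages, the loop above adds b_bufsize >> PAGE_SHIFT = 16 to npages, and
 * those 16 pages are then subtracted from uvmexp.paging before the wakeup.
 */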
385 
386 /*
387  * uvmpd_trylockowner: trylock the page's owner.
388  *
389  * => return the locked rwlock on success.  otherwise, return NULL.
390  */
391 struct rwlock *
392 uvmpd_trylockowner(struct vm_page *pg)
393 {
394 
395 	struct uvm_object *uobj = pg->uobject;
396 	struct rwlock *slock;
397 
398 	if (uobj != NULL) {
399 		slock = uobj->vmobjlock;
400 	} else {
401 		struct vm_anon *anon = pg->uanon;
402 
403 		KASSERT(anon != NULL);
404 		slock = anon->an_lock;
405 	}
406 
407 	if (rw_enter(slock, RW_WRITE|RW_NOSLEEP)) {
408 		return NULL;
409 	}
410 
411 	return slock;
412 }
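/*
 * Illustrative only (not part of the original file): the calling pattern
 * the scan loops below rely on -- try-lock the owner, skip the page when
 * the lock is contended, and drop the lock once done with the page.  The
 * variable `pg' and the surrounding loop are assumed.
 */
#if 0
	struct rwlock *slock;

	slock = uvmpd_trylockowner(pg);
	if (slock == NULL)
		continue;	/* owner busy: move on to the next page */
	/* ... examine, free or requeue pg while its owner is locked ... */
	rw_exit(slock);
#endif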
413 
414 /*
415  * uvmpd_dropswap: free any swap allocated to this page.
416  *
417  * => called with owner locked.
418  * => return 1 if a page had an associated slot.
419  */
420 int
421 uvmpd_dropswap(struct vm_page *pg)
422 {
423 	struct vm_anon *anon = pg->uanon;
424 	int slot, result = 0;
425 
426 	if ((pg->pg_flags & PQ_ANON) && anon->an_swslot) {
427 		uvm_swap_free(anon->an_swslot, 1);
428 		anon->an_swslot = 0;
429 		result = 1;
430 	} else if (pg->pg_flags & PQ_AOBJ) {
431 		slot = uao_dropswap(pg->uobject, pg->offset >> PAGE_SHIFT);
432 		if (slot)
433 			result = 1;
434 	}
435 
436 	return result;
437 }
438 
439 /*
440  * Return 1 if the page `p' belongs to the memory range described by
441  * 'constraint', 0 otherwise.
442  */
443 static inline int
444 uvmpd_match_constraint(struct vm_page *p,
445     struct uvm_constraint_range *constraint)
446 {
447 	paddr_t paddr;
448 
449 	paddr = atop(VM_PAGE_TO_PHYS(p));
450 	if (paddr >= constraint->ucr_low && paddr < constraint->ucr_high)
451 		return 1;
452 
453 	return 0;
454 }
455 
456 /*
457  * uvmpd_scan_inactive: scan an inactive list for pages to clean or free.
458  *
459  * => called with page queues locked
460  * => we work on meeting our free target by converting inactive pages
461  *    into free pages.
462  * => we handle the building of swap-backed clusters
463  * => we return TRUE if we are exiting because we met our target
464  */
465 int
466 uvmpd_scan_inactive(struct uvm_pmalloc *pma, int shortage)
467 {
468 	struct pglist *pglst = &uvm.page_inactive;
469 	int result, freed = 0;
470 	struct vm_page *p, *nextpg;
471 	struct uvm_object *uobj;
472 	struct vm_page *pps[SWCLUSTPAGES], **ppsp;
473 	int npages;
474 	struct vm_page *swpps[SWCLUSTPAGES]; 	/* XXX: see below */
475 	struct rwlock *slock;
476 	int swnpages, swcpages;				/* XXX: see below */
477 	int swslot;
478 	struct vm_anon *anon;
479 	boolean_t swap_backed;
480 	vaddr_t start;
481 	int dirtyreacts;
482 
483 	/*
484 	 * swslot is non-zero if we are building a swap cluster.  we want
485 	 * to stay in the loop while we have a page to scan or we have
486 	 * a swap-cluster to build.
487 	 */
488 	swslot = 0;
489 	swnpages = swcpages = 0;
490 	dirtyreacts = 0;
491 	p = NULL;
492 
493 	/*
494 	 * If a thread is waiting for us to release memory from a specific
495 	 * memory range start with the first page on the list that fits in
496 	 * it.
497 	 */
498 	TAILQ_FOREACH(p, pglst, pageq) {
499 		if (uvmpd_pma_done(pma) ||
500 		    uvmpd_match_constraint(p, &pma->pm_constraint))
501 			break;
502 	}
503 
504 	for (; p != NULL || swslot != 0; p = nextpg) {
505 		/*
506 		 * note that p can be NULL iff we have traversed the whole
507 		 * list and need to do one final swap-backed clustered pageout.
508 		 */
509 		uobj = NULL;
510 		anon = NULL;
511 		if (p) {
512 			/*
513 			 * see if we've met our target
514 			 */
515 			if ((uvmpd_pma_done(pma) &&
516 			    (uvmexp.paging >= (shortage - freed))) ||
517 			    dirtyreacts == UVMPD_NUMDIRTYREACTS) {
518 				if (swslot == 0) {
519 					/* exit now if no swap-i/o pending */
520 					break;
521 				}
522 
523 				/* set p to null to signal final swap i/o */
524 				p = NULL;
525 				nextpg = NULL;
526 			}
527 		}
528 		if (p) {	/* if (we have a new page to consider) */
529 			/*
530 			 * we are below target and have a new page to consider.
531 			 */
532 			uvmexp.pdscans++;
533 			nextpg = TAILQ_NEXT(p, pageq);
534 
535 			/*
536 			 * If we are not short on memory and only interested
537 			 * in releasing pages from a given memory range, do not
538 			 * bother with other pages.
539 			 */
540 			if (uvmexp.paging >= (shortage - freed) &&
541 			    !uvmpd_pma_done(pma) &&
542 			    !uvmpd_match_constraint(p, &pma->pm_constraint))
543 				continue;
544 
545 			anon = p->uanon;
546 			uobj = p->uobject;
547 
548 			/*
549 			 * first we attempt to lock the object that this page
550 			 * belongs to.  if our attempt fails we skip on to
551 			 * the next page (no harm done).  it is important to
552 			 * "try" locking the object as we are locking in the
553 			 * wrong order (pageq -> object) and we don't want to
554 			 * deadlock.
555 			 */
556 			slock = uvmpd_trylockowner(p);
557 			if (slock == NULL) {
558 				continue;
559 			}
560 
561 			/*
562 			 * move referenced pages back to active queue
563 			 * and skip to next page.
564 			 */
565 			if (pmap_is_referenced(p)) {
566 				uvm_pageactivate(p);
567 				rw_exit(slock);
568 				uvmexp.pdreact++;
569 				continue;
570 			}
571 
572 			if (p->pg_flags & PG_BUSY) {
573 				rw_exit(slock);
574 				uvmexp.pdbusy++;
575 				continue;
576 			}
577 
578 			/* does the page belong to an object? */
579 			if (uobj != NULL) {
580 				uvmexp.pdobscan++;
581 			} else {
582 				KASSERT(anon != NULL);
583 				uvmexp.pdanscan++;
584 			}
585 
586 			/*
587 			 * we now have the page queues locked.
588 			 * the page is not busy.   if the page is clean we
589 			 * can free it now and continue.
590 			 */
591 			if (p->pg_flags & PG_CLEAN) {
592 				if (p->pg_flags & PQ_SWAPBACKED) {
593 					/* this page now lives only in swap */
594 					atomic_inc_int(&uvmexp.swpgonly);
595 				}
596 
597 				/* zap all mappings with pmap_page_protect... */
598 				pmap_page_protect(p, PROT_NONE);
599 				uvm_pagefree(p);
600 				freed++;
601 
602 				if (anon) {
603 
604 					/*
605 					 * an anonymous page can only be clean
606 					 * if it has backing store assigned.
607 					 */
608 
609 					KASSERT(anon->an_swslot != 0);
610 
611 					/* remove from object */
612 					anon->an_page = NULL;
613 				}
614 				rw_exit(slock);
615 				continue;
616 			}
617 
618 			/*
619 			 * this page is dirty, skip it if we'll have met our
620 			 * free target when all the current pageouts complete.
621 			 */
622 			if (uvmpd_pma_done(pma) &&
623 			    (uvmexp.paging > (shortage - freed))) {
624 				rw_exit(slock);
625 				continue;
626 			}
627 
628 			/*
629 			 * this page is dirty, but we can't page it out
630 			 * since swap is full of pages that live only in swap.
631 			 * reactivate it so that we eventually cycle
632 			 * all pages thru the inactive queue.
633 			 */
634 			if ((p->pg_flags & PQ_SWAPBACKED) && uvm_swapisfull()) {
635 				dirtyreacts++;
636 				uvm_pageactivate(p);
637 				rw_exit(slock);
638 				continue;
639 			}
640 
641 			/*
642 			 * if the page is swap-backed and dirty and swap space
643 			 * is full, free any swap allocated to the page
644 			 * so that other pages can be paged out.
645 			 */
646 			if ((p->pg_flags & PQ_SWAPBACKED) && uvm_swapisfilled())
647 				uvmpd_dropswap(p);
648 
649 			/*
650 			 * the page we are looking at is dirty.   we must
651 			 * clean it before it can be freed.  to do this we
652 			 * first mark the page busy so that no one else will
653 			 * touch the page.   we write protect all the mappings
654 			 * of the page so that no one touches it while it is
655 			 * in I/O.
656 			 */
657 
658 			swap_backed = ((p->pg_flags & PQ_SWAPBACKED) != 0);
659 			atomic_setbits_int(&p->pg_flags, PG_BUSY);
660 			UVM_PAGE_OWN(p, "scan_inactive");
661 			pmap_page_protect(p, PROT_READ);
662 			uvmexp.pgswapout++;
663 
664 			/*
665 			 * for swap-backed pages we need to (re)allocate
666 			 * swap space.
667 			 */
668 			if (swap_backed) {
669 				/* free old swap slot (if any) */
670 				uvmpd_dropswap(p);
671 
672 				/* start new cluster (if necessary) */
673 				if (swslot == 0) {
674 					swnpages = SWCLUSTPAGES;
675 					swslot = uvm_swap_alloc(&swnpages,
676 					    TRUE);
677 					if (swslot == 0) {
678 						/* no swap?  give up! */
679 						atomic_clearbits_int(
680 						    &p->pg_flags,
681 						    PG_BUSY);
682 						UVM_PAGE_OWN(p, NULL);
683 						rw_exit(slock);
684 						continue;
685 					}
686 					swcpages = 0;	/* cluster is empty */
687 				}
688 
689 				/* add block to cluster */
690 				swpps[swcpages] = p;
691 				if (anon)
692 					anon->an_swslot = swslot + swcpages;
693 				else
694 					uao_set_swslot(uobj,
695 					    p->offset >> PAGE_SHIFT,
696 					    swslot + swcpages);
697 				swcpages++;
698 				rw_exit(slock);
699 
700 				/* cluster not full yet? */
701 				if (swcpages < swnpages)
702 					continue;
703 			}
704 		} else {
705 			/* if p == NULL we must be doing a last swap i/o */
706 			swap_backed = TRUE;
707 		}
708 
709 		/*
710 		 * now consider doing the pageout.
711 		 *
712 		 * for swap-backed pages, we do the pageout if we have either
713 		 * filled the cluster (in which case swnpages == swcpages) or
714 		 * run out of pages (p == NULL).
715 		 *
716 		 * for object pages, we always do the pageout.
717 		 */
718 		if (swap_backed) {
719 			/* starting I/O now... set up for it */
720 			npages = swcpages;
721 			ppsp = swpps;
722 			/* for swap-backed pages only */
723 			start = (vaddr_t) swslot;
724 
725 			/* if this is final pageout we could have a few
726 			 * extra swap blocks */
727 			if (swcpages < swnpages) {
728 				uvm_swap_free(swslot + swcpages,
729 				    (swnpages - swcpages));
730 			}
731 		} else {
732 			/* normal object pageout */
733 			ppsp = pps;
734 			npages = sizeof(pps) / sizeof(struct vm_page *);
735 			/* not looked at because PGO_ALLPAGES is set */
736 			start = 0;
737 		}
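		/*
		 * Worked example (illustrative, not from the original file):
		 * if a cluster of swnpages = 16 slots was allocated but the
		 * scan ran out of pages at swcpages = 5, the pageout below
		 * writes those 5 pages and the uvm_swap_free() call above
		 * returns the 11 unused slots (swslot + 5 through swslot + 15).
		 */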
738 
739 		/*
740 		 * now do the pageout.
741 		 *
742 		 * for swap_backed pages we have already built the cluster.
743 		 * for !swap_backed pages, uvm_pager_put will call the object's
744 		 * "make put cluster" function to build a cluster on our behalf.
745 		 *
746 		 * we pass the PGO_PDFREECLUST flag to uvm_pager_put to instruct
747 		 * it to free the cluster pages for us on a successful I/O (it
748 		 * always does this for un-successful I/O requests).  this
749 		 * allows us to do clustered pageout without having to deal
750 		 * with cluster pages at this level.
751 		 *
752 		 * note locking semantics of uvm_pager_put with PGO_PDFREECLUST:
753 		 *  IN: locked: page queues
754 		 * OUT: locked:
755 		 *     !locked: pageqs
756 		 */
757 
758 		uvmexp.pdpageouts++;
759 		result = uvm_pager_put(swap_backed ? NULL : uobj, p,
760 		    &ppsp, &npages, PGO_ALLPAGES|PGO_PDFREECLUST, start, 0);
761 
762 		/*
763 		 * if we did i/o to swap, zero swslot to indicate that we are
764 		 * no longer building a swap-backed cluster.
765 		 */
766 
767 		if (swap_backed)
768 			swslot = 0;		/* done with this cluster */
769 
770 		/*
771 		 * first, we check for VM_PAGER_PEND which means that the
772 		 * async I/O is in progress and the async I/O done routine
773 		 * will clean up after us.   in this case we move on to the
774 		 * next page.
775 		 *
776 		 * there is a very remote chance that the pending async i/o can
777 		 * finish _before_ we get here.   if that happens, our page "p"
778 		 * may no longer be on the inactive queue.   so we verify this
779 		 * when determining the next page (starting over at the head if
780 		 * we've lost our inactive page).
781 		 */
782 
783 		if (result == VM_PAGER_PEND) {
784 			atomic_add_int(&uvmexp.paging, npages);
785 			uvm_lock_pageq();
786 			uvmexp.pdpending++;
787 			if (p) {
788 				if (p->pg_flags & PQ_INACTIVE)
789 					nextpg = TAILQ_NEXT(p, pageq);
790 				else
791 					nextpg = TAILQ_FIRST(pglst);
792 			} else {
793 				nextpg = NULL;
794 			}
795 			continue;
796 		}
797 
798 		/* clean up "p" if we have one */
799 		if (p) {
800 			/*
801 			 * the I/O request to "p" is done and uvm_pager_put
802 			 * has freed any cluster pages it may have allocated
803 			 * during I/O.  all that is left for us to do is
804 			 * clean up page "p" (which is still PG_BUSY).
805 			 *
806 			 * our result could be one of the following:
807 			 *   VM_PAGER_OK: successful pageout
808 			 *
809 			 *   VM_PAGER_AGAIN: tmp resource shortage, we skip
810 			 *     to next page
811 			 *   VM_PAGER_{FAIL,ERROR,BAD}: an error.   we
812 			 *     "reactivate" page to get it out of the way (it
813 			 *     will eventually drift back into the inactive
814 			 *     queue for a retry).
815 			 *   VM_PAGER_UNLOCK: should never see this as it is
816 			 *     only valid for "get" operations
817 			 */
818 
819 			/* relock p's object: page queues not locked yet, so
820 			 * no need for "try" */
821 
822 			/* !swap_backed case: already locked... */
823 			if (swap_backed) {
824 				rw_enter(slock, RW_WRITE);
825 			}
826 
827 #ifdef DIAGNOSTIC
828 			if (result == VM_PAGER_UNLOCK)
829 				panic("pagedaemon: pageout returned "
830 				    "invalid 'unlock' code");
831 #endif
832 
833 			/* handle PG_WANTED now */
834 			if (p->pg_flags & PG_WANTED)
835 				wakeup(p);
836 
837 			atomic_clearbits_int(&p->pg_flags, PG_BUSY|PG_WANTED);
838 			UVM_PAGE_OWN(p, NULL);
839 
840 			/* released during I/O? Can only happen for anons */
841 			if (p->pg_flags & PG_RELEASED) {
842 				KASSERT(anon != NULL);
843 				/*
844 				 * remove page so we can get nextpg,
845 				 * also zero out anon so we don't use
846 				 * it after the free.
847 				 */
848 				anon->an_page = NULL;
849 				p->uanon = NULL;
850 
851 				uvm_anfree(anon);	/* kills anon */
852 				pmap_page_protect(p, PROT_NONE);
853 				anon = NULL;
854 				uvm_lock_pageq();
855 				nextpg = TAILQ_NEXT(p, pageq);
856 				/* free released page */
857 				uvm_pagefree(p);
858 			} else {	/* page was not released during I/O */
859 				uvm_lock_pageq();
860 				nextpg = TAILQ_NEXT(p, pageq);
861 				if (result != VM_PAGER_OK) {
862 					/* pageout was a failure... */
863 					if (result != VM_PAGER_AGAIN)
864 						uvm_pageactivate(p);
865 					pmap_clear_reference(p);
866 				} else {
867 					/* pageout was a success... */
868 					pmap_clear_reference(p);
869 					pmap_clear_modify(p);
870 					atomic_setbits_int(&p->pg_flags,
871 					    PG_CLEAN);
872 				}
873 			}
874 
875 			/*
876 			 * drop object lock (if there is an object left).   do
877 			 * a safety check of nextpg to make sure it is on the
878 			 * inactive queue (it should be since PG_BUSY pages on
879 			 * the inactive queue can't be re-queued [note: not
880 			 * true for active queue]).
881 			 */
882 			rw_exit(slock);
883 
884 			if (nextpg && (nextpg->pg_flags & PQ_INACTIVE) == 0) {
885 				nextpg = TAILQ_FIRST(pglst);	/* reload! */
886 			}
887 		} else {
888 			/*
889 			 * if p is null in this loop, make sure it stays null
890 			 * in the next loop.
891 			 */
892 			nextpg = NULL;
893 
894 			/*
895 			 * lock page queues here just so they're always locked
896 			 * at the end of the loop.
897 			 */
898 			uvm_lock_pageq();
899 		}
900 	}
901 
902 	return freed;
903 }
904 
905 /*
906  * uvmpd_scan: scan the page queues and attempt to meet our targets.
907  *
908  * => called with pageq's locked
909  */
910 
911 void
912 uvmpd_scan(struct uvm_pmalloc *pma, int shortage, int inactive_shortage)
913 {
914 	int swap_shortage, pages_freed;
915 
916 	MUTEX_ASSERT_LOCKED(&uvm.pageqlock);
917 
918 	uvmexp.pdrevs++;		/* counter */
919 
920 
921 #ifdef __HAVE_PMAP_COLLECT
922 	/*
923 	 * swap out some processes if we are below our free target.
924 	 * we need to unlock the page queues for this.
925 	 */
926 	if (shortage > 0) {
927 		uvmexp.pdswout++;
928 		uvm_unlock_pageq();
929 		shortage -= uvm_swapout_threads();
930 		uvm_lock_pageq();
931 	}
932 #endif
933 
934 	/*
935 	 * now we want to work on meeting our targets.   first we work on our
936 	 * free target by converting inactive pages into free pages.  then
937 	 * we work on meeting our inactive target by converting active pages
938 	 * to inactive ones.
939 	 */
940 	pages_freed = uvmpd_scan_inactive(pma, shortage);
941 	uvmexp.pdfreed += pages_freed;
942 	shortage -= pages_freed;
943 
944 	/*
945 	 * we have done the scan to get free pages.   now we work on meeting
946 	 * our inactive target.
947 	 *
948 	 * detect if we're not going to be able to page anything out
949 	 * until we free some swap resources from active pages.
950 	 */
951 	swap_shortage = 0;
952 	if ((shortage > 0) && uvm_swapisfilled() && !uvm_swapisfull() &&
953 	    pages_freed == 0) {
954 		swap_shortage = shortage;
955 	}
956 
957 	uvmpd_scan_active(pma, swap_shortage, inactive_shortage);
958 }
959 
960 void
961 uvmpd_scan_active(struct uvm_pmalloc *pma, int swap_shortage,
962     int inactive_shortage)
963 {
964 	struct vm_page *p, *nextpg;
965 	struct rwlock *slock;
966 
967 	MUTEX_ASSERT_LOCKED(&uvm.pageqlock);
968 
969 	for (p = TAILQ_FIRST(&uvm.page_active);
970 	     p != NULL && (inactive_shortage > 0 || swap_shortage > 0);
971 	     p = nextpg) {
972 		nextpg = TAILQ_NEXT(p, pageq);
973 		if (p->pg_flags & PG_BUSY) {
974 			continue;
975 		}
976 
977 		/*
978 		 * If we couldn't release enough pages from a given memory
979 		 * range try to deactivate them first...
980 		 *
981 		 * ...unless we are low on swap slots, in such case we are
982 		 * probably OOM and want to release swap resources as quickly
983 		 * as possible.
984 		 */
985 		if (inactive_shortage > 0 && swap_shortage == 0 &&
986 		    !uvmpd_pma_done(pma) &&
987 		    !uvmpd_match_constraint(p, &pma->pm_constraint))
988 			continue;
989 
990 		/*
991 		 * lock the page's owner.
992 		 */
993 		slock = uvmpd_trylockowner(p);
994 		if (slock == NULL) {
995 			continue;
996 		}
997 
998 		/*
999 		 * skip this page if it's busy.
1000 		 */
1001 		if ((p->pg_flags & PG_BUSY) != 0) {
1002 			rw_exit(slock);
1003 			continue;
1004 		}
1005 
1006 		/*
1007 		 * if there's a shortage of swap, free any swap allocated
1008 		 * to this page so that other pages can be paged out.
1009 		 */
1010 		if (swap_shortage > 0) {
1011 			if (uvmpd_dropswap(p)) {
1012 				atomic_clearbits_int(&p->pg_flags, PG_CLEAN);
1013 				swap_shortage--;
1014 			}
1015 		}
1016 
1017 		/*
1018 		 * deactivate this page if there's a shortage of
1019 		 * inactive pages.
1020 		 */
1021 		if (inactive_shortage > 0) {
1022 			/* no need to check wire_count as pg is "active" */
1023 			uvm_pagedeactivate(p);
1024 			uvmexp.pddeact++;
1025 			inactive_shortage--;
1026 		}
1027 
1028 		/*
1029 		 * we're done with this page.
1030 		 */
1031 		rw_exit(slock);
1032 	}
1033 }
1034 
1035 #ifdef HIBERNATE
1036 
1037 /*
1038  * uvmpd_drop: drop clean pages from list
1039  */
1040 void
1041 uvmpd_drop(struct pglist *pglst)
1042 {
1043 	struct vm_page *p, *nextpg;
1044 
1045 	for (p = TAILQ_FIRST(pglst); p != NULL; p = nextpg) {
1046 		nextpg = TAILQ_NEXT(p, pageq);
1047 
1048 		if (p->pg_flags & PQ_ANON || p->uobject == NULL)
1049 			continue;
1050 
1051 		if (p->pg_flags & PG_BUSY)
1052 			continue;
1053 
1054 		if (p->pg_flags & PG_CLEAN) {
1055 			struct uvm_object * uobj = p->uobject;
1056 
1057 			rw_enter(uobj->vmobjlock, RW_WRITE);
1058 			uvm_lock_pageq();
1059 			/*
1060 			 * we now have the page queues locked.
1061 			 * the page is not busy.   if the page is clean we
1062 			 * can free it now and continue.
1063 			 */
1064 			if (p->pg_flags & PG_CLEAN) {
1065 				if (p->pg_flags & PQ_SWAPBACKED) {
1066 					/* this page now lives only in swap */
1067 					atomic_inc_int(&uvmexp.swpgonly);
1068 				}
1069 
1070 				/* zap all mappings with pmap_page_protect... */
1071 				pmap_page_protect(p, PROT_NONE);
1072 				uvm_pagefree(p);
1073 			}
1074 			uvm_unlock_pageq();
1075 			rw_exit(uobj->vmobjlock);
1076 		}
1077 	}
1078 }
1079 
1080 void
1081 uvmpd_hibernate(void)
1082 {
1083 	uvmpd_drop(&uvm.page_inactive);
1084 	uvmpd_drop(&uvm.page_active);
1085 }
1086 
1087 #endif
1088