1 /*	$OpenBSD: uvm_pmemrange.c,v 1.76 2024/11/08 15:54:33 mpi Exp $	*/
2 
3 /*
4  * Copyright (c) 2024 Martin Pieuchot <mpi@openbsd.org>
5  * Copyright (c) 2009, 2010 Ariane van der Steldt <ariane@stack.nl>
6  *
7  * Permission to use, copy, modify, and distribute this software for any
8  * purpose with or without fee is hereby granted, provided that the above
9  * copyright notice and this permission notice appear in all copies.
10  *
11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18  */
19 
20 #include <sys/param.h>
21 #include <sys/systm.h>
22 #include <uvm/uvm.h>
23 #include <sys/malloc.h>
24 #include <sys/kernel.h>
25 #include <sys/proc.h>
26 #include <sys/mount.h>
27 
28 /*
29  * 2 trees: addr tree and size tree.
30  *
31  * The allocator keeps chunks of free pages (called a range).
32  * Two pages are part of the same range if:
33  * - all pages in between are part of that range,
34  * - they are of the same memory type (zeroed or non-zeroed),
35  * - they are part of the same pmemrange.
36  * A pmemrange is a range of memory which is part of the same vm_physseg
37  * and has a use-count.
38  *
39  * addr tree is vm_page[0].objt
40  * size tree is vm_page[1].objt
41  *
42  * The size tree is not used for memory ranges of 1 page; instead, such
43  * ranges are kept on the single queue through vm_page[0].pageq.
44  *
45  * vm_page[0].fpgsz describes the length of a free range. Two adjacent ranges
46  * are joined, unless:
47  * - they have pages in between them which are not free
48  * - they belong to different memtypes (zeroed vs dirty memory)
49  * - they are in different pmemrange areas (ISA vs non-ISA memory for instance)
50  * - they are not a continuation of the same array
51  * The latter issue is caused by vm_physseg ordering and splitting from the
52  * MD initialization machinery. The MD code is dependent on freelists and
53  * happens to split ISA memory from non-ISA memory.
54  * (Note: freelists die die die!)
55  *
56  * uvm_page_init guarantees that every vm_physseg contains an array of
57  * struct vm_page. Also, uvm_page_physload allocates an array of struct
58  * vm_page. This code depends on that array. The array may break across
59  * vm_physseg boundaries.
60  */
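
/*
 * Illustrative sketch, not part of the original source: how a free range
 * of npages pages overlays its bookkeeping on the vm_page array described
 * above.  pg[0].objt carries the address-tree node, pg[1].objt the
 * size-tree node (hence the "pg + 1" arithmetic throughout this file) and
 * pg[0].fpgsz holds the length.  uvm_pmr_layout_sketch() is a hypothetical
 * name; the real insertion path is uvm_pmr_insert_addr() and
 * uvm_pmr_insert_size() below.
 */
#if 0
static void
uvm_pmr_layout_sketch(struct uvm_pmemrange *pmr, struct vm_page *pg,
    psize_t npages)
{
	int memtype = uvm_pmr_pg_to_memtype(pg);

	pg[0].fpgsz = npages;		/* length lives in the first page */
	RBT_INSERT(uvm_pmr_addr, &pmr->addr, &pg[0]);
	pmr->nsegs++;
	if (npages == 1)		/* 1-page ranges skip the size tree */
		TAILQ_INSERT_TAIL(&pmr->single[memtype], &pg[0], pageq);
	else
		RBT_INSERT(uvm_pmr_size, &pmr->size[memtype], &pg[1]);
}
#endif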
61 
62 /*
63  * Validate the flags of the page. (Used in asserts.)
64  * Any free page must have the PQ_FREE flag set.
65  * Free pages may be zeroed.
66  * Pmap flags are left untouched.
67  *
68  * The PQ_FREE flag is not checked here: by not checking, we can easily use
69  * this check on pages that are being freed.
70  */
71 #define VALID_FLAGS(pg_flags)						\
72 	(((pg_flags) & ~(PQ_FREE|PG_ZERO|PG_PMAPMASK)) == 0x0)
73 
74 /* Tree comparators. */
75 int	uvm_pmemrange_addr_cmp(const struct uvm_pmemrange *,
76 	    const struct uvm_pmemrange *);
77 int	uvm_pmemrange_use_cmp(struct uvm_pmemrange *, struct uvm_pmemrange *);
78 int	uvm_pmr_pg_to_memtype(struct vm_page *);
79 
80 #ifdef DDB
81 void	uvm_pmr_print(void);
82 #endif
83 
84 static inline int
85 in_pagedaemon(int allowsyncer)
86 {
87 	if (curcpu()->ci_idepth > 0)
88 		return 0;
89 	if (curproc == uvm.pagedaemon_proc)
90 		return 1;
91 	/* XXX why is the syncer allowed to use the pagedaemon's reserve? */
92 	if (allowsyncer && (curproc == syncerproc))
93 		return 1;
94 	return 0;
95 }
96 
97 /*
98  * Memory types. The page flags are used to derive what the current memory
99  * type of a page is.
100  */
101 int
102 uvm_pmr_pg_to_memtype(struct vm_page *pg)
103 {
104 	if (pg->pg_flags & PG_ZERO)
105 		return UVM_PMR_MEMTYPE_ZERO;
106 	/* Default: dirty memory. */
107 	return UVM_PMR_MEMTYPE_DIRTY;
108 }
109 
110 /* Trees. */
111 RBT_GENERATE(uvm_pmr_addr, vm_page, objt, uvm_pmr_addr_cmp);
112 RBT_GENERATE(uvm_pmr_size, vm_page, objt, uvm_pmr_size_cmp);
113 RBT_GENERATE(uvm_pmemrange_addr, uvm_pmemrange, pmr_addr,
114     uvm_pmemrange_addr_cmp);
115 
116 /* Validation. */
117 #ifdef DEBUG
118 void	uvm_pmr_assertvalid(struct uvm_pmemrange *pmr);
119 #else
120 #define uvm_pmr_assertvalid(pmr)	do {} while (0)
121 #endif
122 
123 psize_t			 uvm_pmr_get1page(psize_t, int, struct pglist *,
124 			    paddr_t, paddr_t, int);
125 
126 struct uvm_pmemrange	*uvm_pmr_allocpmr(void);
127 struct vm_page		*uvm_pmr_nfindsz(struct uvm_pmemrange *, psize_t, int);
128 struct vm_page		*uvm_pmr_nextsz(struct uvm_pmemrange *,
129 			    struct vm_page *, int);
130 void			 uvm_pmr_pnaddr(struct uvm_pmemrange *pmr,
131 			    struct vm_page *pg, struct vm_page **pg_prev,
132 			    struct vm_page **pg_next);
133 struct vm_page		*uvm_pmr_findnextsegment(struct uvm_pmemrange *,
134 			    struct vm_page *, paddr_t);
135 struct vm_page		*uvm_pmr_findprevsegment(struct uvm_pmemrange *,
136 			    struct vm_page *, paddr_t);
137 psize_t			 uvm_pmr_remove_1strange(struct pglist *, paddr_t,
138 			    struct vm_page **, int);
139 psize_t			 uvm_pmr_remove_1strange_reverse(struct pglist *,
140     			    paddr_t *);
141 void			 uvm_pmr_split(paddr_t);
142 struct uvm_pmemrange	*uvm_pmemrange_find(paddr_t);
143 struct uvm_pmemrange	*uvm_pmemrange_use_insert(struct uvm_pmemrange_use *,
144 			    struct uvm_pmemrange *);
145 psize_t			 pow2divide(psize_t, psize_t);
146 struct vm_page		*uvm_pmr_rootupdate(struct uvm_pmemrange *,
147 			    struct vm_page *, paddr_t, paddr_t, int);
148 
149 /*
150  * Computes num/denom and rounds it up to the next power-of-2.
151  *
152  * This is a division function which calculates an approximation of
153  * num/denom, with result =~ num/denom. It is meant to be fast and doesn't
154  * have to be accurate.
155  *
156  * Providing too large a value makes the allocator slightly faster, at the
157  * risk of hitting the failure case more often. Providing too small a value
158  * makes the allocator a bit slower, but less likely to hit a failure case.
159  */
160 psize_t
161 pow2divide(psize_t num, psize_t denom)
162 {
163 	int rshift;
164 
165 	for (rshift = 0; num > denom; rshift++, denom <<= 1)
166 		;
167 	return (paddr_t)1 << rshift;
168 }
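
/*
 * Worked example (illustrative, not part of the original source):
 * pow2divide(100, 4) doubles denom 4 -> 8 -> 16 -> 32 -> 64 -> 128,
 * i.e. five shifts, and returns 1 << 5 = 32, the next power-of-2 above
 * 100 / 4 = 25.
 */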
169 
170 /*
171  * Predicate: lhs is a subrange of rhs.
172  *
173  * If rhs_low == 0: don't care about lower bound.
174  * If rhs_high == 0: don't care about upper bound.
175  */
176 #define PMR_IS_SUBRANGE_OF(lhs_low, lhs_high, rhs_low, rhs_high)	\
177 	(((rhs_low) == 0 || (lhs_low) >= (rhs_low)) &&			\
178 	((rhs_high) == 0 || (lhs_high) <= (rhs_high)))
179 
180 /*
181  * Predicate: lhs intersects with rhs.
182  *
183  * If rhs_low == 0: don't care about lower bound.
184  * If rhs_high == 0: don't care about upper bound.
185  * Ranges don't intersect if they don't have any page in common; array
186  * semantics mean that < instead of <= should be used here.
187  */
188 #define PMR_INTERSECTS_WITH(lhs_low, lhs_high, rhs_low, rhs_high)	\
189 	(((rhs_low) == 0 || (rhs_low) < (lhs_high)) &&			\
190 	((rhs_high) == 0 || (lhs_low) < (rhs_high)))
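
/*
 * Example (illustrative, not part of the original source): the arguments
 * are half-open page-number intervals [low, high), so
 * PMR_INTERSECTS_WITH(10, 20, 20, 30) is false (page 20 belongs only to
 * the second range), while PMR_INTERSECTS_WITH(10, 21, 20, 30) is true.
 */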
191 
192 /*
193  * Align to power-of-2 alignment.
194  */
195 #define PMR_ALIGN(pgno, align)						\
196 	(((pgno) + ((align) - 1)) & ~((align) - 1))
197 #define PMR_ALIGN_DOWN(pgno, align)					\
198 	((pgno) & ~((align) - 1))
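
/*
 * Example (illustrative, not part of the original source): with align = 4,
 * PMR_ALIGN(5, 4) == 8 and PMR_ALIGN_DOWN(5, 4) == 4; page numbers that
 * are already aligned are left unchanged by both macros.
 */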
199 
200 
201 /*
202  * Comparator: sort by address ascending.
203  */
204 int
205 uvm_pmemrange_addr_cmp(const struct uvm_pmemrange *lhs,
206     const struct uvm_pmemrange *rhs)
207 {
208 	return lhs->low < rhs->low ? -1 : lhs->low > rhs->low;
209 }
210 
211 /*
212  * Comparator: sort by use ascending.
213  *
214  * The higher the use value of a range, the more devices need memory in
215  * this range. Therefore allocate from the range with the lowest use first.
216  */
217 int
218 uvm_pmemrange_use_cmp(struct uvm_pmemrange *lhs, struct uvm_pmemrange *rhs)
219 {
220 	int result;
221 
222 	result = lhs->use < rhs->use ? -1 : lhs->use > rhs->use;
223 	if (result == 0)
224 		result = uvm_pmemrange_addr_cmp(lhs, rhs);
225 	return result;
226 }
227 
228 int
229 uvm_pmr_addr_cmp(const struct vm_page *lhs, const struct vm_page *rhs)
230 {
231 	paddr_t lhs_addr, rhs_addr;
232 
233 	lhs_addr = VM_PAGE_TO_PHYS(lhs);
234 	rhs_addr = VM_PAGE_TO_PHYS(rhs);
235 
236 	return (lhs_addr < rhs_addr ? -1 : lhs_addr > rhs_addr);
237 }
238 
239 int
240 uvm_pmr_size_cmp(const struct vm_page *lhs, const struct vm_page *rhs)
241 {
242 	psize_t lhs_size, rhs_size;
243 	int cmp;
244 
245 	/* Using second tree, so we receive pg[1] instead of pg[0]. */
246 	lhs_size = (lhs - 1)->fpgsz;
247 	rhs_size = (rhs - 1)->fpgsz;
248 
249 	cmp = (lhs_size < rhs_size ? -1 : lhs_size > rhs_size);
250 	if (cmp == 0)
251 		cmp = uvm_pmr_addr_cmp(lhs - 1, rhs - 1);
252 	return cmp;
253 }
254 
255 /*
256  * Find the first range of free pages that is at least sz pages long.
257  */
258 struct vm_page *
259 uvm_pmr_nfindsz(struct uvm_pmemrange *pmr, psize_t sz, int mti)
260 {
261 	struct	vm_page *node, *best;
262 
263 	KASSERT(sz >= 1);
264 
265 	if (sz == 1 && !TAILQ_EMPTY(&pmr->single[mti]))
266 		return TAILQ_FIRST(&pmr->single[mti]);
267 
268 	node = RBT_ROOT(uvm_pmr_size, &pmr->size[mti]);
269 	best = NULL;
270 	while (node != NULL) {
271 		if ((node - 1)->fpgsz >= sz) {
272 			best = (node - 1);
273 			node = RBT_LEFT(uvm_objtree, node);
274 		} else
275 			node = RBT_RIGHT(uvm_objtree, node);
276 	}
277 	return best;
278 }
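
/*
 * Usage sketch (illustrative, not part of the original source): find the
 * first free range of at least 8 zeroed pages, falling back to dirty
 * memory if none exists.  uvm_pmr_nfindsz_example() is a hypothetical
 * name used only for this sketch.
 */
#if 0
static struct vm_page *
uvm_pmr_nfindsz_example(struct uvm_pmemrange *pmr)
{
	struct vm_page *pg;

	pg = uvm_pmr_nfindsz(pmr, 8, UVM_PMR_MEMTYPE_ZERO);
	if (pg == NULL)
		pg = uvm_pmr_nfindsz(pmr, 8, UVM_PMR_MEMTYPE_DIRTY);
	return pg;
}
#endif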
279 
280 /*
281  * Finds the next range. The next range has a size >= pg->fpgsz.
282  * Returns NULL if no more ranges are available.
283  */
284 struct vm_page *
285 uvm_pmr_nextsz(struct uvm_pmemrange *pmr, struct vm_page *pg, int mt)
286 {
287 	struct vm_page *npg;
288 
289 	KASSERT(pmr != NULL && pg != NULL);
290 	if (pg->fpgsz == 1) {
291 		if (TAILQ_NEXT(pg, pageq) != NULL)
292 			return TAILQ_NEXT(pg, pageq);
293 		else
294 			npg = RBT_MIN(uvm_pmr_size, &pmr->size[mt]);
295 	} else
296 		npg = RBT_NEXT(uvm_pmr_size, pg + 1);
297 
298 	return npg == NULL ? NULL : npg - 1;
299 }
300 
301 /*
302  * Finds the previous and next ranges relative to the (uninserted) pg range.
303  *
304  * *pg_prev == NULL if no previous range that can join with pg is
305  * 	available.
306  * *pg_next == NULL if no next range that can join with pg is
307  * 	available.
308  */
309 void
310 uvm_pmr_pnaddr(struct uvm_pmemrange *pmr, struct vm_page *pg,
311     struct vm_page **pg_prev, struct vm_page **pg_next)
312 {
313 	KASSERT(pg_prev != NULL && pg_next != NULL);
314 
315 	*pg_next = RBT_NFIND(uvm_pmr_addr, &pmr->addr, pg);
316 	if (*pg_next == NULL)
317 		*pg_prev = RBT_MAX(uvm_pmr_addr, &pmr->addr);
318 	else
319 		*pg_prev = RBT_PREV(uvm_pmr_addr, *pg_next);
320 
321 	KDASSERT(*pg_next == NULL ||
322 	    VM_PAGE_TO_PHYS(*pg_next) > VM_PAGE_TO_PHYS(pg));
323 	KDASSERT(*pg_prev == NULL ||
324 	    VM_PAGE_TO_PHYS(*pg_prev) < VM_PAGE_TO_PHYS(pg));
325 
326 	/* Reset if not contig. */
327 	if (*pg_prev != NULL &&
328 	    (atop(VM_PAGE_TO_PHYS(*pg_prev)) + (*pg_prev)->fpgsz
329 	    != atop(VM_PAGE_TO_PHYS(pg)) ||
330 	    *pg_prev + (*pg_prev)->fpgsz != pg || /* Array broke. */
331 	    uvm_pmr_pg_to_memtype(*pg_prev) != uvm_pmr_pg_to_memtype(pg)))
332 		*pg_prev = NULL;
333 	if (*pg_next != NULL &&
334 	    (atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz
335 	    != atop(VM_PAGE_TO_PHYS(*pg_next)) ||
336 	    pg + pg->fpgsz != *pg_next || /* Array broke. */
337 	    uvm_pmr_pg_to_memtype(*pg_next) != uvm_pmr_pg_to_memtype(pg)))
338 		*pg_next = NULL;
339 	return;
340 }
341 
342 /*
343  * Remove a range from the address tree.
344  * Address tree maintains pmr counters.
345  */
346 void
347 uvm_pmr_remove_addr(struct uvm_pmemrange *pmr, struct vm_page *pg)
348 {
349 	KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg);
350 	KDASSERT(pg->pg_flags & PQ_FREE);
351 	RBT_REMOVE(uvm_pmr_addr, &pmr->addr, pg);
352 
353 	pmr->nsegs--;
354 }
355 /*
356  * Remove a range from the size tree.
357  */
358 void
359 uvm_pmr_remove_size(struct uvm_pmemrange *pmr, struct vm_page *pg)
360 {
361 	int memtype;
362 #ifdef DEBUG
363 	struct vm_page *i;
364 #endif
365 
366 	KDASSERT(pg->fpgsz >= 1);
367 	KDASSERT(pg->pg_flags & PQ_FREE);
368 	memtype = uvm_pmr_pg_to_memtype(pg);
369 
370 	if (pg->fpgsz == 1) {
371 #ifdef DEBUG
372 		TAILQ_FOREACH(i, &pmr->single[memtype], pageq) {
373 			if (i == pg)
374 				break;
375 		}
376 		KDASSERT(i == pg);
377 #endif
378 		TAILQ_REMOVE(&pmr->single[memtype], pg, pageq);
379 	} else {
380 		KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[memtype],
381 		    pg + 1) == pg + 1);
382 		RBT_REMOVE(uvm_pmr_size, &pmr->size[memtype], pg + 1);
383 	}
384 }
385 /* Remove from both trees. */
386 void
387 uvm_pmr_remove(struct uvm_pmemrange *pmr, struct vm_page *pg)
388 {
389 	uvm_pmr_assertvalid(pmr);
390 	uvm_pmr_remove_size(pmr, pg);
391 	uvm_pmr_remove_addr(pmr, pg);
392 	uvm_pmr_assertvalid(pmr);
393 }
394 
395 /*
396  * Insert the range described in pg.
397  * Returns the range thus created (which may be joined with the previous and
398  * next ranges).
399  * If no_join, the caller guarantees that the range cannot possibly join
400  * with adjacent ranges.
401  */
402 struct vm_page *
403 uvm_pmr_insert_addr(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join)
404 {
405 	struct vm_page *prev, *next;
406 
407 #ifdef DEBUG
408 	struct vm_page *i;
409 	int mt;
410 #endif
411 
412 	KDASSERT(pg->pg_flags & PQ_FREE);
413 	KDASSERT(pg->fpgsz >= 1);
414 
415 #ifdef DEBUG
416 	for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
417 		TAILQ_FOREACH(i, &pmr->single[mt], pageq)
418 			KDASSERT(i != pg);
419 		if (pg->fpgsz > 1) {
420 			KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[mt],
421 			    pg + 1) == NULL);
422 		}
423 		KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == NULL);
424 	}
425 #endif
426 
427 	if (!no_join) {
428 		uvm_pmr_pnaddr(pmr, pg, &prev, &next);
429 		if (next != NULL) {
430 			uvm_pmr_remove_size(pmr, next);
431 			uvm_pmr_remove_addr(pmr, next);
432 			pg->fpgsz += next->fpgsz;
433 			next->fpgsz = 0;
434 		}
435 		if (prev != NULL) {
436 			uvm_pmr_remove_size(pmr, prev);
437 			prev->fpgsz += pg->fpgsz;
438 			pg->fpgsz = 0;
439 			return prev;
440 		}
441 	}
442 
443 	RBT_INSERT(uvm_pmr_addr, &pmr->addr, pg);
444 
445 	pmr->nsegs++;
446 
447 	return pg;
448 }
449 /*
450  * Insert the range described in pg into the size tree.
451  * Ranges of a single page are placed on the single-page queue
452  * instead of the size tree.
453  * Page must already be in the address tree.
454  */
455 void
456 uvm_pmr_insert_size(struct uvm_pmemrange *pmr, struct vm_page *pg)
457 {
458 	int memtype;
459 #ifdef DEBUG
460 	struct vm_page *i;
461 	int mti;
462 #endif
463 
464 	KDASSERT(pg->fpgsz >= 1);
465 	KDASSERT(pg->pg_flags & PQ_FREE);
466 
467 	memtype = uvm_pmr_pg_to_memtype(pg);
468 #ifdef DEBUG
469 	for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) {
470 		TAILQ_FOREACH(i, &pmr->single[mti], pageq)
471 			KDASSERT(i != pg);
472 		if (pg->fpgsz > 1) {
473 			KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[mti],
474 			    pg + 1) == NULL);
475 		}
476 		KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg);
477 	}
478 	for (i = pg; i < pg + pg->fpgsz; i++)
479 		KASSERT(uvm_pmr_pg_to_memtype(i) == memtype);
480 #endif
481 
482 	if (pg->fpgsz == 1)
483 		TAILQ_INSERT_TAIL(&pmr->single[memtype], pg, pageq);
484 	else
485 		RBT_INSERT(uvm_pmr_size, &pmr->size[memtype], pg + 1);
486 }
487 /* Insert in both trees. */
488 struct vm_page *
489 uvm_pmr_insert(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join)
490 {
491 	uvm_pmr_assertvalid(pmr);
492 	pg = uvm_pmr_insert_addr(pmr, pg, no_join);
493 	uvm_pmr_insert_size(pmr, pg);
494 	uvm_pmr_assertvalid(pmr);
495 	return pg;
496 }
497 
498 /*
499  * Find the last page that is part of this segment.
500  * => pg: the range at which to start the search.
501  * => boundary: the page number boundary specification (0 = no boundary).
502  * => pmr: the pmemrange of the page.
503  *
504  * This function returns the page 1 before the next range, so if you want
505  * the next range, you need to run TAILQ_NEXT(result, pageq) after calling.
506  * The reason is that this way, the length of the segment is easily
507  * calculated using: atop(result) - atop(pg) + 1.
508  * Hence this function also never returns NULL.
509  */
510 struct vm_page *
511 uvm_pmr_findnextsegment(struct uvm_pmemrange *pmr,
512     struct vm_page *pg, paddr_t boundary)
513 {
514 	paddr_t	first_boundary;
515 	struct	vm_page *next;
516 	struct	vm_page *prev;
517 
518 	KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)) &&
519 	    pmr->high > atop(VM_PAGE_TO_PHYS(pg)));
520 	if (boundary != 0) {
521 		first_boundary =
522 		    PMR_ALIGN(atop(VM_PAGE_TO_PHYS(pg)) + 1, boundary);
523 	} else
524 		first_boundary = 0;
525 
526 	/*
527 	 * Increase next until it hits the first page of the next segment.
528 	 *
529 	 * While loop checks the following:
530 	 * - next != NULL	we have not reached the end of pgl
531 	 * - boundary == 0 || next < first_boundary
532 	 *			we do not cross a boundary
533 	 * - atop(prev) + 1 == atop(next)
534 	 *			still in the same segment
535 	 * - low <= last
536 	 * - high > last	still in the same memory range
537 	 * - memtype is equal	allocator is unable to view different memtypes
538 	 *			as part of the same segment
539 	 * - prev + 1 == next	no array breakage occurs
540 	 */
541 	prev = pg;
542 	next = TAILQ_NEXT(prev, pageq);
543 	while (next != NULL &&
544 	    (boundary == 0 || atop(VM_PAGE_TO_PHYS(next)) < first_boundary) &&
545 	    atop(VM_PAGE_TO_PHYS(prev)) + 1 == atop(VM_PAGE_TO_PHYS(next)) &&
546 	    pmr->low <= atop(VM_PAGE_TO_PHYS(next)) &&
547 	    pmr->high > atop(VM_PAGE_TO_PHYS(next)) &&
548 	    uvm_pmr_pg_to_memtype(prev) == uvm_pmr_pg_to_memtype(next) &&
549 	    prev + 1 == next) {
550 		prev = next;
551 		next = TAILQ_NEXT(prev, pageq);
552 	}
553 
554 	/*
555 	 * End of this segment.
556 	 */
557 	return prev;
558 }
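
/*
 * Usage sketch (illustrative, not part of the original source): because
 * the function returns the last page of the segment rather than the first
 * page of the next one, the segment length follows directly from the
 * physical addresses.  uvm_pmr_seglen_example() is a hypothetical name;
 * pg is assumed to be on a pglist being dismantled, as in
 * uvm_pmr_remove_1strange().
 */
#if 0
static psize_t
uvm_pmr_seglen_example(struct uvm_pmemrange *pmr, struct vm_page *pg)
{
	struct vm_page *last;

	last = uvm_pmr_findnextsegment(pmr, pg, 0);
	return atop(VM_PAGE_TO_PHYS(last)) - atop(VM_PAGE_TO_PHYS(pg)) + 1;
}
#endif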
559 
560 /*
561  * Find the first page that is part of this segment.
562  * => pg: the range at which to start the search.
563  * => boundary: the page number boundary specification (0 = no boundary).
564  * => pmr: the pmemrange of the page.
565  *
566  * This function returns the page 1 after the previous range, so if you want
567  * the previous range, you need to run TAILQ_NEXT(result, pageq) after calling.
568  * The reason is that this way, the length of the segment is easily
569  * calculated using: atop(pg) - atop(result) + 1.
570  * Hence this function also never returns NULL.
571  */
572 struct vm_page *
573 uvm_pmr_findprevsegment(struct uvm_pmemrange *pmr,
574     struct vm_page *pg, paddr_t boundary)
575 {
576 	paddr_t	first_boundary;
577 	struct	vm_page *next;
578 	struct	vm_page *prev;
579 
580 	KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)) &&
581 	    pmr->high > atop(VM_PAGE_TO_PHYS(pg)));
582 	if (boundary != 0) {
583 		first_boundary =
584 		    PMR_ALIGN_DOWN(atop(VM_PAGE_TO_PHYS(pg)), boundary);
585 	} else
586 		first_boundary = 0;
587 
588 	/*
589 	 * Increase next until it hits the first page of the previous segment.
590 	 *
591 	 * While loop checks the following:
592 	 * - next != NULL	we have not reached the end of pgl
593 	 * - boundary == 0 || next >= first_boundary
594 	 *			we do not cross a boundary
595 	 * - atop(prev) - 1 == atop(next)
596 	 *			still in the same segment
597 	 * - low <= last
598 	 * - high > last	still in the same memory range
599 	 * - memtype is equal	allocator is unable to view different memtypes
600 	 *			as part of the same segment
601 	 * - prev - 1 == next	no array breakage occurs
602 	 */
603 	prev = pg;
604 	next = TAILQ_NEXT(prev, pageq);
605 	while (next != NULL &&
606 	    (boundary == 0 || atop(VM_PAGE_TO_PHYS(next)) >= first_boundary) &&
607 	    atop(VM_PAGE_TO_PHYS(prev)) - 1 == atop(VM_PAGE_TO_PHYS(next)) &&
608 	    pmr->low <= atop(VM_PAGE_TO_PHYS(next)) &&
609 	    pmr->high > atop(VM_PAGE_TO_PHYS(next)) &&
610 	    uvm_pmr_pg_to_memtype(prev) == uvm_pmr_pg_to_memtype(next) &&
611 	    prev - 1 == next) {
612 		prev = next;
613 		next = TAILQ_NEXT(prev, pageq);
614 	}
615 
616 	/*
617 	 * Start of this segment.
618 	 */
619 	return prev;
620 }
621 
622 /*
623  * Remove the first segment of contiguous pages from pgl.
624  * A segment ends if it crosses boundary (unless boundary = 0) or
625  * if it would enter a different uvm_pmemrange.
626  *
627  * Work: the page range that the caller is currently working with.
628  * May be null.
629  *
630  * If is_desperate is non-zero, the smallest segment is erased. Otherwise,
631  * the first segment is erased (which, if called by uvm_pmr_getpages(),
632  * probably is the smallest or very close to it).
633  */
634 psize_t
635 uvm_pmr_remove_1strange(struct pglist *pgl, paddr_t boundary,
636     struct vm_page **work, int is_desperate)
637 {
638 	struct vm_page *start, *end, *iter, *iter_end, *inserted, *lowest;
639 	psize_t count;
640 	struct uvm_pmemrange *pmr, *pmr_iter;
641 
642 	KASSERT(!TAILQ_EMPTY(pgl));
643 
644 	/*
645 	 * Initialize to first page.
646 	 * Unless desperate scan finds a better candidate, this is what'll be
647 	 * erased.
648 	 */
649 	start = TAILQ_FIRST(pgl);
650 	pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(start)));
651 	end = uvm_pmr_findnextsegment(pmr, start, boundary);
652 
653 	/*
654 	 * If we are desperate, we _really_ want to get rid of the smallest
655 	 * element (rather than a close match to the smallest element).
656 	 */
657 	if (is_desperate) {
658 		/* Linear search for smallest segment. */
659 		pmr_iter = pmr;
660 		for (iter = TAILQ_NEXT(end, pageq);
661 		    iter != NULL && start != end;
662 		    iter = TAILQ_NEXT(iter_end, pageq)) {
663 			/*
664 			 * Only update pmr if it doesn't match current
665 			 * iteration.
666 			 */
667 			if (pmr->low > atop(VM_PAGE_TO_PHYS(iter)) ||
668 			    pmr->high <= atop(VM_PAGE_TO_PHYS(iter))) {
669 				pmr_iter = uvm_pmemrange_find(atop(
670 				    VM_PAGE_TO_PHYS(iter)));
671 			}
672 
673 			iter_end = uvm_pmr_findnextsegment(pmr_iter, iter,
674 			    boundary);
675 
676 			/*
677 			 * Current iteration is smaller than best match so
678 			 * far; update.
679 			 */
680 			if (VM_PAGE_TO_PHYS(iter_end) - VM_PAGE_TO_PHYS(iter) <
681 			    VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) {
682 				start = iter;
683 				end = iter_end;
684 				pmr = pmr_iter;
685 			}
686 		}
687 	}
688 
689 	/*
690 	 * Calculate count and end of the list.
691 	 */
692 	count = atop(VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) + 1;
693 	lowest = start;
694 	end = TAILQ_NEXT(end, pageq);
695 
696 	/*
697 	 * Actually remove the range of pages.
698 	 *
699 	 * Sadly, this cannot be done using pointer iteration:
700 	 * vm_physseg is not guaranteed to be sorted on address, hence
701 	 * uvm_page_init() may not have initialized its array sorted by
702 	 * page number.
703 	 */
704 	for (iter = start; iter != end; iter = iter_end) {
705 		iter_end = TAILQ_NEXT(iter, pageq);
706 		TAILQ_REMOVE(pgl, iter, pageq);
707 	}
708 
709 	lowest->fpgsz = count;
710 	inserted = uvm_pmr_insert(pmr, lowest, 0);
711 
712 	/*
713 	 * If the caller was working on a range and this function modified
714 	 * that range, update the pointer.
715 	 */
716 	if (work != NULL && *work != NULL &&
717 	    atop(VM_PAGE_TO_PHYS(inserted)) <= atop(VM_PAGE_TO_PHYS(*work)) &&
718 	    atop(VM_PAGE_TO_PHYS(inserted)) + inserted->fpgsz >
719 	    atop(VM_PAGE_TO_PHYS(*work)))
720 		*work = inserted;
721 	return count;
722 }
723 
724 /*
725  * Remove the first segment of contiguous pages from a pgl
726  * with the list elements in reverse order of physaddr.
727  *
728  * A segment ends if it would enter a different uvm_pmemrange.
729  *
730  * Stores starting physical address of the segment in pstart.
731  */
732 psize_t
733 uvm_pmr_remove_1strange_reverse(struct pglist *pgl, paddr_t *pstart)
734 {
735 	struct vm_page *start, *end, *iter, *iter_end, *lowest;
736 	psize_t count;
737 	struct uvm_pmemrange *pmr;
738 
739 	KASSERT(!TAILQ_EMPTY(pgl));
740 
741 	start = TAILQ_FIRST(pgl);
742 	pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(start)));
743 	end = uvm_pmr_findprevsegment(pmr, start, 0);
744 
745 	KASSERT(end <= start);
746 
747 	/*
748 	 * Calculate count and end of the list.
749 	 */
750 	count = atop(VM_PAGE_TO_PHYS(start) - VM_PAGE_TO_PHYS(end)) + 1;
751 	lowest = end;
752 	end = TAILQ_NEXT(end, pageq);
753 
754 	/*
755 	 * Actually remove the range of pages.
756 	 *
757 	 * Sadly, this cannot be done using pointer iteration:
758 	 * vm_physseg is not guaranteed to be sorted on address, hence
759 	 * uvm_page_init() may not have initialized its array sorted by
760 	 * page number.
761 	 */
762 	for (iter = start; iter != end; iter = iter_end) {
763 		iter_end = TAILQ_NEXT(iter, pageq);
764 		TAILQ_REMOVE(pgl, iter, pageq);
765 	}
766 
767 	lowest->fpgsz = count;
768 	(void) uvm_pmr_insert(pmr, lowest, 0);
769 
770 	*pstart = VM_PAGE_TO_PHYS(lowest);
771 	return count;
772 }
773 
774 /*
775  * Extract a number of pages from a segment of free pages.
776  * Called by uvm_pmr_getpages.
777  *
778  * Returns the segment that was created from pages left over at the tail
779  * of the removed set of pages, or NULL if no pages were left at the tail.
780  */
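/*
 * Worked example (illustrative, not part of the original source): if pg
 * describes free page numbers [100, 110) and the caller extracts
 * [103, 106), then before_sz = 3 and after_sz = 4; pages 103-105 are
 * appended to result, pg keeps [100, 103) and the returned segment
 * covers [106, 110).
 */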
781 struct vm_page *
782 uvm_pmr_extract_range(struct uvm_pmemrange *pmr, struct vm_page *pg,
783     paddr_t start, paddr_t end, struct pglist *result)
784 {
785 	struct vm_page *after, *pg_i;
786 	psize_t before_sz, after_sz;
787 #ifdef DEBUG
788 	psize_t i;
789 #endif
790 
791 	KDASSERT(end > start);
792 	KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)));
793 	KDASSERT(pmr->high >= atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz);
794 	KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) <= start);
795 	KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz >= end);
796 
797 	before_sz = start - atop(VM_PAGE_TO_PHYS(pg));
798 	after_sz = atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz - end;
799 	KDASSERT(before_sz + after_sz + (end - start) == pg->fpgsz);
800 	uvm_pmr_assertvalid(pmr);
801 
802 	uvm_pmr_remove_size(pmr, pg);
803 	if (before_sz == 0)
804 		uvm_pmr_remove_addr(pmr, pg);
805 	after = pg + before_sz + (end - start);
806 
807 	/* Add selected pages to result. */
808 	for (pg_i = pg + before_sz; pg_i != after; pg_i++) {
809 		KASSERT(pg_i->pg_flags & PQ_FREE);
810 		pg_i->fpgsz = 0;
811 		TAILQ_INSERT_TAIL(result, pg_i, pageq);
812 	}
813 
814 	/* Before handling. */
815 	if (before_sz > 0) {
816 		pg->fpgsz = before_sz;
817 		uvm_pmr_insert_size(pmr, pg);
818 	}
819 
820 	/* After handling. */
821 	if (after_sz > 0) {
822 #ifdef DEBUG
823 		for (i = 0; i < after_sz; i++) {
824 			KASSERT(!uvm_pmr_isfree(after + i));
825 		}
826 #endif
827 		KDASSERT(atop(VM_PAGE_TO_PHYS(after)) == end);
828 		after->fpgsz = after_sz;
829 		after = uvm_pmr_insert_addr(pmr, after, 1);
830 		uvm_pmr_insert_size(pmr, after);
831 	}
832 
833 	uvm_pmr_assertvalid(pmr);
834 	return (after_sz > 0 ? after : NULL);
835 }
836 
837 /*
838  * Acquire a number of pages.
839  *
840  * count:	the number of pages returned
841  * start:	lowest page number
842  * end:		highest page number +1
843  * 		(start = end = 0: no limitation)
844  * align:	power-of-2 alignment constraint (align = 1: no alignment)
845  * boundary:	power-of-2 boundary (boundary = 0: no boundary)
846  * maxseg:	maximum number of segments to return
847  * flags:	UVM_PLA_* flags
848  * result:	returned pages storage (uses pageq)
849  */
850 int
851 uvm_pmr_getpages(psize_t count, paddr_t start, paddr_t end, paddr_t align,
852     paddr_t boundary, int maxseg, int flags, struct pglist *result)
853 {
854 	struct	uvm_pmemrange *pmr;	/* Iterate memory ranges. */
855 	struct	vm_page *found, *f_next; /* Iterate chunks. */
856 	psize_t	fcount;			/* Current found pages. */
857 	int	fnsegs;			/* Current segment counter. */
858 	int	try, start_try;
859 	psize_t	search[3];
860 	paddr_t	fstart, fend;		/* Pages to be taken from found. */
861 	int	memtype;		/* Requested memtype. */
862 	int	memtype_init;		/* Best memtype. */
863 	int	desperate;		/* True if allocation failed. */
864 #ifdef DIAGNOSTIC
865 	struct	vm_page *diag_prev;	/* Used during validation. */
866 #endif /* DIAGNOSTIC */
867 
868 	/*
869 	 * Validate arguments.
870 	 */
871 	KASSERT(count > 0);
872 	KASSERT(start == 0 || end == 0 || start < end);
873 	KASSERT(align >= 1);
874 	KASSERT(powerof2(align));
875 	KASSERT(maxseg > 0);
876 	KASSERT(boundary == 0 || powerof2(boundary));
877 	KASSERT(boundary == 0 || maxseg * boundary >= count);
878 	KASSERT(TAILQ_EMPTY(result));
879 	KASSERT(!(flags & UVM_PLA_WAITOK) ^ !(flags & UVM_PLA_NOWAIT));
880 
881 	/*
882 	 * TRYCONTIG is a noop if you only want a single segment.
883 	 * Remove it if that's the case: otherwise it'll deny the fast
884 	 * allocation.
885 	 */
886 	if (maxseg == 1 || count == 1)
887 		flags &= ~UVM_PLA_TRYCONTIG;
888 
889 	/*
890 	 * Configure search.
891 	 *
892 	 * search[0] is one segment, only used in UVM_PLA_TRYCONTIG case.
893 	 * search[1] is multiple segments, chosen to fulfill the search in
894 	 *   approximately even-sized segments.
895 	 *   This is a good trade-off between slightly reduced allocation speed
896 	 *   and less fragmentation.
897 	 * search[2] is the worst case, in which all segments are evaluated.
898 	 *   This provides the least fragmentation, but makes the search
899  *   possibly longer (although when this case is selected, speed no
900  *   longer matters much).
901 	 *
902 	 * The exception is when maxseg == 1: since we can only fulfill that
903  * with one segment of count pages, only a single search type has to
904 	 * be attempted.
905 	 */
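	/*
	 * Worked example (illustrative, not part of the original source):
	 * with count = 100, maxseg = 4 and no UVM_PLA_TRYCONTIG, the code
	 * below picks search[0] = 100, search[1] = pow2divide(100, 4) = 32,
	 * search[2] = 1 and start_try = 1, so the scan first looks for free
	 * ranges of at least 32 pages and only then falls back to any size.
	 */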
906 	if (maxseg == 1 || count == 1) {
907 		start_try = 2;
908 		search[2] = count;
909 	} else if (maxseg >= count && (flags & UVM_PLA_TRYCONTIG) == 0) {
910 		start_try = 2;
911 		search[2] = 1;
912 	} else {
913 		start_try = 0;
914 		search[0] = count;
915 		search[1] = pow2divide(count, maxseg);
916 		search[2] = 1;
917 		if ((flags & UVM_PLA_TRYCONTIG) == 0)
918 			start_try = 1;
919 		if (search[1] >= search[0]) {
920 			search[1] = search[0];
921 			start_try = 1;
922 		}
923 		if (search[2] >= search[start_try]) {
924 			start_try = 2;
925 		}
926 	}
927 
928 	/*
929 	 * Memory type: if zeroed memory is requested, traverse the zero set.
930 	 * Otherwise, traverse the dirty set.
931 	 *
932 	 * The memtype iterator is reinitialized to memtype_init on entrance
933 	 * of a pmemrange.
934 	 */
935 	if (flags & UVM_PLA_ZERO)
936 		memtype_init = UVM_PMR_MEMTYPE_ZERO;
937 	else
938 		memtype_init = UVM_PMR_MEMTYPE_DIRTY;
939 
940 	/*
941 	 * Initially, we're not desperate.
942 	 *
943 	 * Note that if we return from a sleep, we are still desperate.
944 	 * Chances are that memory pressure is still high, so resetting
945 	 * seems over-optimistic to me.
946 	 */
947 	desperate = 0;
948 
949 	uvm_lock_fpageq();
950 retry:		/* Return point after sleeping. */
951 	/*
952 	 * check to see if we need to generate some free pages waking
953 	 * the pagedaemon.
954 	 */
955 	if ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freemin ||
956 	    ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freetarg &&
957 	    (uvmexp.inactive + BUFPAGES_INACT) < uvmexp.inactarg))
958 		wakeup(&uvm.pagedaemon);
959 
960 	/*
961 	 * fail if any of these conditions is true:
962 	 * [1]  there really are no free pages, or
963 	 * [2]  only kernel "reserved" pages remain and
964 	 *        the UVM_PLA_USERESERVE flag wasn't used.
965 	 * [3]  only pagedaemon "reserved" pages remain and
966 	 *        the requestor isn't the pagedaemon nor the syncer.
967 	 */
968 	if ((uvmexp.free <= (uvmexp.reserve_kernel + count)) &&
969 	    !(flags & UVM_PLA_USERESERVE)) {
970 		uvm_unlock_fpageq();
971 		return ENOMEM;
972 	}
973 
974 	if ((uvmexp.free <= (uvmexp.reserve_pagedaemon + count)) &&
975 	    !in_pagedaemon(1)) {
976 	    	uvm_unlock_fpageq();
977 		if (flags & UVM_PLA_WAITOK) {
978 			uvm_wait("uvm_pmr_getpages");
979 			uvm_lock_fpageq();
980 			goto retry;
981 		}
982 		return ENOMEM;
983 	}
984 
985 	fcount = 0;
986 	fnsegs = 0;
987 
988 retry_desperate:
989 	/*
990 	 * If we just want any page(s), go for the really fast option.
991 	 */
992 	if (count <= maxseg && align == 1 && boundary == 0 &&
993 	    (flags & UVM_PLA_TRYCONTIG) == 0) {
994 		fcount += uvm_pmr_get1page(count - fcount, memtype_init,
995 		    result, start, end, 0);
996 
997 		/*
998 		 * If we found sufficient pages, go to the success exit code.
999 		 *
1000 		 * Otherwise, go immediately to fail, since we collected
1001 		 * all we could anyway.
1002 		 */
1003 		if (fcount == count)
1004 			goto out;
1005 		else
1006 			goto fail;
1007 	}
1008 
1009 	/*
1010 	 * The heart of the contig case.
1011 	 *
1012 	 * The code actually looks like this:
1013 	 *
1014 	 * foreach (struct pmemrange) {
1015 	 *	foreach (memtype) {
1016 	 *		foreach(try) {
1017 	 *			foreach (free range of memtype in pmemrange,
1018 	 *			    starting at search[try]) {
1019 	 *				while (range has space left)
1020 	 *					take from range
1021 	 *			}
1022 	 *		}
1023 	 *	}
1024 	 *
1025 	 *	if next pmemrange has higher usecount than current:
1026 	 *		enter desperate case (which will drain the pmemranges
1027 	 *		until empty prior to moving to the next one)
1028 	 * }
1029 	 *
1030 	 * When desperate is activated, try always starts at the highest
1031 	 * value. The memtype loop is using a goto ReScanMemtype.
1032 	 * The try loop is using a goto ReScan.
1033 	 * The 'range has space left' loop uses label DrainFound.
1034 	 *
1035 	 * Writing them all as loops would take up a lot of screen space in
1036 	 * the form of indentation and some parts are easier to express
1037 	 * using the labels.
1038 	 */
1039 
1040 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
1041 		/* Empty range. */
1042 		if (pmr->nsegs == 0)
1043 			continue;
1044 
1045 		/* Outside requested range. */
1046 		if (!PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end))
1047 			continue;
1048 
1049 		memtype = memtype_init;
1050 
1051 rescan_memtype:	/* Return point at memtype++. */
1052 		try = start_try;
1053 
1054 rescan:		/* Return point at try++. */
1055 		for (found = uvm_pmr_nfindsz(pmr, search[try], memtype);
1056 		    found != NULL;
1057 		    found = f_next) {
1058 			f_next = uvm_pmr_nextsz(pmr, found, memtype);
1059 
1060 			fstart = atop(VM_PAGE_TO_PHYS(found));
1061 			if (start != 0)
1062 				fstart = MAX(start, fstart);
1063 drain_found:
1064 			/*
1065 			 * Throw away the first segment if fnsegs == maxseg
1066 			 *
1067 			 * Note that f_next is still valid after this call,
1068 			 * since we only allocated from entries before f_next.
1069 			 * We don't revisit the entries we already extracted
1070 			 * from unless we entered the desperate case.
1071 			 */
1072 			if (fnsegs == maxseg) {
1073 				fnsegs--;
1074 				fcount -=
1075 				    uvm_pmr_remove_1strange(result, boundary,
1076 				    &found, desperate);
1077 			}
1078 
1079 			fstart = PMR_ALIGN(fstart, align);
1080 			fend = atop(VM_PAGE_TO_PHYS(found)) + found->fpgsz;
1081 			if (end != 0)
1082 				fend = MIN(end, fend);
1083 			if (boundary != 0) {
1084 				fend =
1085 				    MIN(fend, PMR_ALIGN(fstart + 1, boundary));
1086 			}
1087 			if (fstart >= fend)
1088 				continue;
1089 			if (fend - fstart > count - fcount)
1090 				fend = fstart + (count - fcount);
1091 
1092 			fcount += fend - fstart;
1093 			fnsegs++;
1094 			found = uvm_pmr_extract_range(pmr, found,
1095 			    fstart, fend, result);
1096 
1097 			if (fcount == count)
1098 				goto out;
1099 
1100 			/*
1101 			 * If there's still space left in found, try to
1102 			 * fully drain it prior to continuing.
1103 			 */
1104 			if (found != NULL) {
1105 				fstart = fend;
1106 				goto drain_found;
1107 			}
1108 		}
1109 
1110 		/* Try a smaller search now. */
1111 		if (++try < nitems(search))
1112 			goto rescan;
1113 
1114 		/*
1115 		 * Exhaust all memory types prior to going to the next memory
1116 		 * segment.
1117 		 * This means that zero-vs-dirty are eaten prior to moving
1118 		 * to a pmemrange with a higher use-count.
1119 		 *
1120 		 * Code is basically a difficult way of writing:
1121 		 * memtype = memtype_init;
1122 		 * do {
1123 		 *	...;
1124 		 *	memtype += 1;
1125 		 *	memtype %= MEMTYPE_MAX;
1126 		 * } while (memtype != memtype_init);
1127 		 */
1128 		memtype += 1;
1129 		if (memtype == UVM_PMR_MEMTYPE_MAX)
1130 			memtype = 0;
1131 		if (memtype != memtype_init)
1132 			goto rescan_memtype;
1133 
1134 		/*
1135 		 * If not desperate, enter desperate case prior to eating all
1136 		 * the good stuff in the next range.
1137 		 */
1138 		if (!desperate && TAILQ_NEXT(pmr, pmr_use) != NULL &&
1139 		    TAILQ_NEXT(pmr, pmr_use)->use != pmr->use)
1140 			break;
1141 	}
1142 
1143 	/*
1144 	 * Not enough memory of the requested type available. Fall back to
1145 	 * less good memory that we'll clean up better later.
1146 	 *
1147 	 * This algorithm is not very smart though; it just starts scanning
1148 	 * a differently typed range, but the nicer ranges of the previous
1149 	 * iteration may fall out. Hence there is a small chance of a false
1150 	 * negative.
1151 	 *
1152 	 * When desperate: scan all sizes starting at the smallest
1153 	 * (start_try = 1) and do not consider UVM_PLA_TRYCONTIG (which may
1154 	 * allow us to hit the fast path now).
1155 	 *
1156 	 * Also, because we will revisit entries we scanned before, we need
1157 	 * to reset the page queue, or we may end up releasing entries in
1158 	 * such a way as to invalidate f_next.
1159 	 */
1160 	if (!desperate) {
1161 		desperate = 1;
1162 		start_try = nitems(search) - 1;
1163 		flags &= ~UVM_PLA_TRYCONTIG;
1164 
1165 		while (!TAILQ_EMPTY(result))
1166 			uvm_pmr_remove_1strange(result, 0, NULL, 0);
1167 		fnsegs = 0;
1168 		fcount = 0;
1169 		goto retry_desperate;
1170 	}
1171 
1172 fail:
1173 	/* Allocation failed. */
1174 	/* XXX: claim from memory reserve here */
1175 
1176 	while (!TAILQ_EMPTY(result))
1177 		uvm_pmr_remove_1strange(result, 0, NULL, 0);
1178 
1179 	if (flags & UVM_PLA_WAITOK) {
1180 		if (uvm_wait_pla(ptoa(start), ptoa(end) - 1, ptoa(count),
1181 		    flags & UVM_PLA_FAILOK) == 0)
1182 			goto retry;
1183 		KASSERT(flags & UVM_PLA_FAILOK);
1184 	} else if (!(flags & UVM_PLA_NOWAKE)) {
1185 		struct uvm_pmalloc *pma = &nowait_pma;
1186 
1187 		if (!(nowait_pma.pm_flags & UVM_PMA_LINKED)) {
1188 			nowait_pma.pm_flags = UVM_PMA_LINKED;
1189 			TAILQ_INSERT_TAIL(&uvm.pmr_control.allocs, pma, pmq);
1190 			wakeup(&uvm.pagedaemon);
1191 		}
1192 	}
1193 	uvm_unlock_fpageq();
1194 
1195 	return ENOMEM;
1196 
1197 out:
1198 	/* Allocation successful. */
1199 	uvmexp.free -= fcount;
1200 
1201 	uvm_unlock_fpageq();
1202 
1203 	/* Update statistics and zero pages if UVM_PLA_ZERO. */
1204 #ifdef DIAGNOSTIC
1205 	fnsegs = 0;
1206 	fcount = 0;
1207 	diag_prev = NULL;
1208 #endif /* DIAGNOSTIC */
1209 	TAILQ_FOREACH(found, result, pageq) {
1210 		atomic_clearbits_int(&found->pg_flags, PG_PMAPMASK);
1211 
1212 		if (found->pg_flags & PG_ZERO) {
1213 			uvm_lock_fpageq();
1214 			uvmexp.zeropages--;
1215 			if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
1216 				wakeup(&uvmexp.zeropages);
1217 			uvm_unlock_fpageq();
1218 		}
1219 		if (flags & UVM_PLA_ZERO) {
1220 			if (found->pg_flags & PG_ZERO)
1221 				uvmexp.pga_zerohit++;
1222 			else {
1223 				uvmexp.pga_zeromiss++;
1224 				uvm_pagezero(found);
1225 			}
1226 		}
1227 		atomic_clearbits_int(&found->pg_flags, PG_ZERO|PQ_FREE);
1228 
1229 		found->uobject = NULL;
1230 		found->uanon = NULL;
1231 		found->pg_version++;
1232 
1233 		/*
1234 		 * Validate that the page matches the range criteria.
1235 		 */
1236 		KDASSERT(start == 0 || atop(VM_PAGE_TO_PHYS(found)) >= start);
1237 		KDASSERT(end == 0 || atop(VM_PAGE_TO_PHYS(found)) < end);
1238 
1239 #ifdef DIAGNOSTIC
1240 		/*
1241 		 * Update fcount (# found pages) and
1242 		 * fnsegs (# found segments) counters.
1243 		 */
1244 		if (diag_prev == NULL ||
1245 		    /* new segment if it contains a hole */
1246 		    atop(VM_PAGE_TO_PHYS(diag_prev)) + 1 !=
1247 		    atop(VM_PAGE_TO_PHYS(found)) ||
1248 		    /* new segment if it crosses boundary */
1249 		    (atop(VM_PAGE_TO_PHYS(diag_prev)) & ~(boundary - 1)) !=
1250 		    (atop(VM_PAGE_TO_PHYS(found)) & ~(boundary - 1)))
1251 			fnsegs++;
1252 		fcount++;
1253 
1254 		diag_prev = found;
1255 #endif /* DIAGNOSTIC */
1256 	}
1257 
1258 #ifdef DIAGNOSTIC
1259 	/*
1260 	 * Panic on algorithm failure.
1261 	 */
1262 	if (fcount != count || fnsegs > maxseg) {
1263 		panic("pmemrange allocation error: "
1264 		    "allocated %ld pages in %d segments, "
1265 		    "but request was %ld pages in %d segments",
1266 		    fcount, fnsegs, count, maxseg);
1267 	}
1268 #endif /* DIAGNOSTIC */
1269 
1270 	return 0;
1271 }
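
/*
 * Usage sketch (illustrative, not part of the original source): allocate
 * four zeroed, physically contiguous pages below 16MB.  The limit, the
 * flag combination and the name uvm_pmr_getpages_example() are assumptions
 * made only for this example.
 */
#if 0
static int
uvm_pmr_getpages_example(struct pglist *pgl)
{
	TAILQ_INIT(pgl);
	/* 4 pages, page numbers [0, atop(16MB)), align 1, 1 segment. */
	return uvm_pmr_getpages(4, 0, atop(0x1000000), 1, 0, 1,
	    UVM_PLA_WAITOK | UVM_PLA_ZERO, pgl);
}
#endif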
1272 
1273 /*
1274  * Acquire a single page.
1275  *
1276  * flags:	UVM_PLA_* flags
1277  * result:	returned page.
1278  */
1279 struct vm_page *
1280 uvm_pmr_getone(int flags)
1281 {
1282 	struct vm_page *pg;
1283 	struct pglist pgl;
1284 
1285 	TAILQ_INIT(&pgl);
1286 	if (uvm_pmr_getpages(1, 0, 0, 1, 0, 1, flags, &pgl) != 0)
1287 		return NULL;
1288 
1289 	pg = TAILQ_FIRST(&pgl);
1290 	KASSERT(pg != NULL && TAILQ_NEXT(pg, pageq) == NULL);
1291 
1292 	return pg;
1293 }
1294 
1295 /*
1296  * Free a number of contig pages (invoked by uvm_page_init).
1297  */
1298 void
1299 uvm_pmr_freepages(struct vm_page *pg, psize_t count)
1300 {
1301 	struct uvm_pmemrange *pmr;
1302 	psize_t i, pmr_count;
1303 	struct vm_page *firstpg = pg;
1304 
1305 	for (i = 0; i < count; i++) {
1306 		KASSERT(atop(VM_PAGE_TO_PHYS(&pg[i])) ==
1307 		    atop(VM_PAGE_TO_PHYS(pg)) + i);
1308 
1309 		if (!((pg[i].pg_flags & PQ_FREE) == 0 &&
1310 		    VALID_FLAGS(pg[i].pg_flags))) {
1311 			printf("Flags: 0x%x, will panic now.\n",
1312 			    pg[i].pg_flags);
1313 		}
1314 		KASSERT((pg[i].pg_flags & PQ_FREE) == 0 &&
1315 		    VALID_FLAGS(pg[i].pg_flags));
1316 		atomic_setbits_int(&pg[i].pg_flags, PQ_FREE);
1317 		atomic_clearbits_int(&pg[i].pg_flags, PG_ZERO);
1318 	}
1319 
1320 	uvm_lock_fpageq();
1321 
1322 	for (i = count; i > 0; i -= pmr_count) {
1323 		pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg)));
1324 		KASSERT(pmr != NULL);
1325 
1326 		pmr_count = MIN(i, pmr->high - atop(VM_PAGE_TO_PHYS(pg)));
1327 		pg->fpgsz = pmr_count;
1328 		uvm_pmr_insert(pmr, pg, 0);
1329 
1330 		uvmexp.free += pmr_count;
1331 		pg += pmr_count;
1332 	}
1333 	wakeup(&uvmexp.free);
1334 	if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
1335 		wakeup(&uvmexp.zeropages);
1336 
1337 	uvm_wakeup_pla(VM_PAGE_TO_PHYS(firstpg), ptoa(count));
1338 
1339 	uvm_unlock_fpageq();
1340 }
1341 
1342 /*
1343  * Free all pages in the queue.
1344  */
1345 void
1346 uvm_pmr_freepageq(struct pglist *pgl)
1347 {
1348 	struct vm_page *pg;
1349 	paddr_t pstart;
1350 	psize_t plen;
1351 
1352 	TAILQ_FOREACH(pg, pgl, pageq) {
1353 		if (!((pg->pg_flags & PQ_FREE) == 0 &&
1354 		    VALID_FLAGS(pg->pg_flags))) {
1355 			printf("Flags: 0x%x, will panic now.\n",
1356 			    pg->pg_flags);
1357 		}
1358 		KASSERT((pg->pg_flags & PQ_FREE) == 0 &&
1359 		    VALID_FLAGS(pg->pg_flags));
1360 		atomic_setbits_int(&pg->pg_flags, PQ_FREE);
1361 		atomic_clearbits_int(&pg->pg_flags, PG_ZERO);
1362 	}
1363 
1364 	uvm_lock_fpageq();
1365 	while (!TAILQ_EMPTY(pgl)) {
1366 		pg = TAILQ_FIRST(pgl);
1367 		if (pg == TAILQ_NEXT(pg, pageq) + 1) {
1368 			/*
1369 			 * If pg is one behind the position of the
1370 			 * next page in the list in the page array,
1371 			 * try going backwards instead of forward.
1372 			 */
1373 			plen = uvm_pmr_remove_1strange_reverse(pgl, &pstart);
1374 		} else {
1375 			pstart = VM_PAGE_TO_PHYS(TAILQ_FIRST(pgl));
1376 			plen = uvm_pmr_remove_1strange(pgl, 0, NULL, 0);
1377 		}
1378 		uvmexp.free += plen;
1379 
1380 		uvm_wakeup_pla(pstart, ptoa(plen));
1381 	}
1382 	wakeup(&uvmexp.free);
1383 	if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
1384 		wakeup(&uvmexp.zeropages);
1385 	uvm_unlock_fpageq();
1386 
1387 	return;
1388 }
1389 
1390 /*
1391  * Store a pmemrange in the list.
1392  *
1393  * The list is sorted by use.
1394  */
1395 struct uvm_pmemrange *
1396 uvm_pmemrange_use_insert(struct uvm_pmemrange_use *useq,
1397     struct uvm_pmemrange *pmr)
1398 {
1399 	struct uvm_pmemrange *iter;
1400 	int cmp = 1;
1401 
1402 	TAILQ_FOREACH(iter, useq, pmr_use) {
1403 		cmp = uvm_pmemrange_use_cmp(pmr, iter);
1404 		if (cmp == 0)
1405 			return iter;
1406 		if (cmp == -1)
1407 			break;
1408 	}
1409 
1410 	if (iter == NULL)
1411 		TAILQ_INSERT_TAIL(useq, pmr, pmr_use);
1412 	else
1413 		TAILQ_INSERT_BEFORE(iter, pmr, pmr_use);
1414 	return NULL;
1415 }
1416 
1417 #ifdef DEBUG
1418 /*
1419  * Validation of the whole pmemrange.
1420  * Called with fpageq locked.
1421  */
1422 void
1423 uvm_pmr_assertvalid(struct uvm_pmemrange *pmr)
1424 {
1425 	struct vm_page *prev, *next, *i, *xref;
1426 	int lcv, mti;
1427 
1428 	/* Empty range */
1429 	if (pmr->nsegs == 0)
1430 		return;
1431 
1432 	/* Validate address tree. */
1433 	RBT_FOREACH(i, uvm_pmr_addr, &pmr->addr) {
1434 		/* Validate the range. */
1435 		KASSERT(i->fpgsz > 0);
1436 		KASSERT(atop(VM_PAGE_TO_PHYS(i)) >= pmr->low);
1437 		KASSERT(atop(VM_PAGE_TO_PHYS(i)) + i->fpgsz
1438 		    <= pmr->high);
1439 
1440 		/* Validate each page in this range. */
1441 		for (lcv = 0; lcv < i->fpgsz; lcv++) {
1442 			/*
1443 			 * Only the first page has a size specification.
1444 			 * Rest is size 0.
1445 			 */
1446 			KASSERT(lcv == 0 || i[lcv].fpgsz == 0);
1447 			/*
1448 			 * Flag check.
1449 			 */
1450 			KASSERT(VALID_FLAGS(i[lcv].pg_flags) &&
1451 			    (i[lcv].pg_flags & PQ_FREE) == PQ_FREE);
1452 			/*
1453 			 * Free pages are:
1454 			 * - not wired
1455 			 * - have no vm_anon
1456 			 * - have no uvm_object
1457 			 */
1458 			KASSERT(i[lcv].wire_count == 0);
1459 			KASSERT(i[lcv].uanon == (void*)0xdeadbeef ||
1460 			    i[lcv].uanon == NULL);
1461 			KASSERT(i[lcv].uobject == (void*)0xdeadbeef ||
1462 			    i[lcv].uobject == NULL);
1463 			/*
1464 			 * Pages in a single range always have the same
1465 			 * memtype.
1466 			 */
1467 			KASSERT(uvm_pmr_pg_to_memtype(&i[0]) ==
1468 			    uvm_pmr_pg_to_memtype(&i[lcv]));
1469 		}
1470 
1471 		/* Check that it shouldn't be joined with its predecessor. */
1472 		prev = RBT_PREV(uvm_pmr_addr, i);
1473 		if (prev != NULL) {
1474 			KASSERT(uvm_pmr_pg_to_memtype(i) !=
1475 			    uvm_pmr_pg_to_memtype(prev) ||
1476 			    atop(VM_PAGE_TO_PHYS(i)) >
1477 			    atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz ||
1478 			    prev + prev->fpgsz != i);
1479 		}
1480 
1481 		/* Assert i is in the size tree as well. */
1482 		if (i->fpgsz == 1) {
1483 			TAILQ_FOREACH(xref,
1484 			    &pmr->single[uvm_pmr_pg_to_memtype(i)], pageq) {
1485 				if (xref == i)
1486 					break;
1487 			}
1488 			KASSERT(xref == i);
1489 		} else {
1490 			KASSERT(RBT_FIND(uvm_pmr_size,
1491 			    &pmr->size[uvm_pmr_pg_to_memtype(i)], i + 1) ==
1492 			    i + 1);
1493 		}
1494 	}
1495 
1496 	/* Validate size tree. */
1497 	for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) {
1498 		for (i = uvm_pmr_nfindsz(pmr, 1, mti); i != NULL; i = next) {
1499 			next = uvm_pmr_nextsz(pmr, i, mti);
1500 			if (next != NULL) {
1501 				KASSERT(i->fpgsz <=
1502 				    next->fpgsz);
1503 			}
1504 
1505 			/* Assert i is in the addr tree as well. */
1506 			KASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, i) == i);
1507 
1508 			/* Assert i is of the correct memory type. */
1509 			KASSERT(uvm_pmr_pg_to_memtype(i) == mti);
1510 		}
1511 	}
1512 
1513 	/* Validate nsegs statistic. */
1514 	lcv = 0;
1515 	RBT_FOREACH(i, uvm_pmr_addr, &pmr->addr)
1516 		lcv++;
1517 	KASSERT(pmr->nsegs == lcv);
1518 }
1519 #endif /* DEBUG */
1520 
1521 /*
1522  * Split pmr at split point pageno.
1523  * Called with fpageq unlocked.
1524  *
1525  * Split is only applied if a pmemrange spans pageno.
1526  */
1527 void
1528 uvm_pmr_split(paddr_t pageno)
1529 {
1530 	struct uvm_pmemrange *pmr, *drain;
1531 	struct vm_page *rebuild, *prev, *next;
1532 	psize_t prev_sz;
1533 
1534 	uvm_lock_fpageq();
1535 	pmr = uvm_pmemrange_find(pageno);
1536 	if (pmr == NULL || !(pmr->low < pageno)) {
1537 		/* No split required. */
1538 		uvm_unlock_fpageq();
1539 		return;
1540 	}
1541 
1542 	KASSERT(pmr->low < pageno);
1543 	KASSERT(pmr->high > pageno);
1544 
1545 	/*
1546 	 * uvm_pmr_allocpmr() calls into malloc() which in turn calls into
1547 	 * uvm_kmemalloc which calls into pmemrange, making the locking
1548 	 * a bit hard, so we just race!
1549 	 */
1550 	uvm_unlock_fpageq();
1551 	drain = uvm_pmr_allocpmr();
1552 	uvm_lock_fpageq();
1553 	pmr = uvm_pmemrange_find(pageno);
1554 	if (pmr == NULL || !(pmr->low < pageno)) {
1555 		/*
1556 		 * We lost the race since someone else ran this or a related
1557 		 * function; however, this should be triggered very rarely, so
1558 		 * we just leak the pmr.
1559 		 */
1560 		printf("uvm_pmr_split: lost one pmr\n");
1561 		uvm_unlock_fpageq();
1562 		return;
1563 	}
1564 
1565 	drain->low = pageno;
1566 	drain->high = pmr->high;
1567 	drain->use = pmr->use;
1568 
1569 	uvm_pmr_assertvalid(pmr);
1570 	uvm_pmr_assertvalid(drain);
1571 	KASSERT(drain->nsegs == 0);
1572 
1573 	RBT_FOREACH(rebuild, uvm_pmr_addr, &pmr->addr) {
1574 		if (atop(VM_PAGE_TO_PHYS(rebuild)) >= pageno)
1575 			break;
1576 	}
1577 	if (rebuild == NULL)
1578 		prev = RBT_MAX(uvm_pmr_addr, &pmr->addr);
1579 	else
1580 		prev = RBT_PREV(uvm_pmr_addr, rebuild);
1581 	KASSERT(prev == NULL || atop(VM_PAGE_TO_PHYS(prev)) < pageno);
1582 
1583 	/*
1584 	 * Handle free chunk that spans the split point.
1585 	 */
1586 	if (prev != NULL &&
1587 	    atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz > pageno) {
1588 		psize_t before, after;
1589 
1590 		KASSERT(atop(VM_PAGE_TO_PHYS(prev)) < pageno);
1591 
1592 		uvm_pmr_remove(pmr, prev);
1593 		prev_sz = prev->fpgsz;
1594 		before = pageno - atop(VM_PAGE_TO_PHYS(prev));
1595 		after = atop(VM_PAGE_TO_PHYS(prev)) + prev_sz - pageno;
1596 
1597 		KASSERT(before > 0);
1598 		KASSERT(after > 0);
1599 
1600 		prev->fpgsz = before;
1601 		uvm_pmr_insert(pmr, prev, 1);
1602 		(prev + before)->fpgsz = after;
1603 		uvm_pmr_insert(drain, prev + before, 1);
1604 	}
1605 
1606 	/* Move free chunks that no longer fall in the range. */
1607 	for (; rebuild != NULL; rebuild = next) {
1608 		next = RBT_NEXT(uvm_pmr_addr, rebuild);
1609 
1610 		uvm_pmr_remove(pmr, rebuild);
1611 		uvm_pmr_insert(drain, rebuild, 1);
1612 	}
1613 
1614 	pmr->high = pageno;
1615 	uvm_pmr_assertvalid(pmr);
1616 	uvm_pmr_assertvalid(drain);
1617 
1618 	RBT_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, drain);
1619 	uvm_pmemrange_use_insert(&uvm.pmr_control.use, drain);
1620 	uvm_unlock_fpageq();
1621 }
1622 
1623 /*
1624  * Increase the usage counter for the given range of memory.
1625  *
1626  * The higher the use count of a given range of memory, the harder the
1627  * allocator tries to avoid allocating from it.
1628  *
1629  * Addresses here are in paddr_t, not page-numbers.
1630  * The lowest and highest allowed addresses are specified.
1631  */
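/*
 * Usage sketch (illustrative, not part of the original source): marking
 * the low 16MB, e.g. ISA DMA-able memory, as constrained would look like
 * uvm_pmr_use_inc(0, 0x00FFFFFF); afterwards the allocator prefers the
 * remaining ranges and only falls back to the low region under pressure.
 */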
1632 void
1633 uvm_pmr_use_inc(paddr_t low, paddr_t high)
1634 {
1635 	struct uvm_pmemrange *pmr;
1636 	paddr_t sz;
1637 
1638 	/* pmr uses page numbers, translate low and high. */
1639 	high++;
1640 	high = atop(trunc_page(high));
1641 	low = atop(round_page(low));
1642 	uvm_pmr_split(low);
1643 	uvm_pmr_split(high);
1644 
1645 	sz = 0;
1646 	uvm_lock_fpageq();
1647 	/* Increase use count on segments in range. */
1648 	RBT_FOREACH(pmr, uvm_pmemrange_addr, &uvm.pmr_control.addr) {
1649 		if (PMR_IS_SUBRANGE_OF(pmr->low, pmr->high, low, high)) {
1650 			TAILQ_REMOVE(&uvm.pmr_control.use, pmr, pmr_use);
1651 			pmr->use++;
1652 			sz += pmr->high - pmr->low;
1653 			uvm_pmemrange_use_insert(&uvm.pmr_control.use, pmr);
1654 		}
1655 		uvm_pmr_assertvalid(pmr);
1656 	}
1657 	uvm_unlock_fpageq();
1658 
1659 	KASSERT(sz >= high - low);
1660 }
1661 
1662 /*
1663  * Allocate a pmemrange.
1664  *
1665  * If called from uvm_page_init, uvm_pageboot_alloc is used.
1666  * If called after uvm_init, malloc is used.
1667  * (And if called in between, you're dead.)
1668  */
1669 struct uvm_pmemrange *
1670 uvm_pmr_allocpmr(void)
1671 {
1672 	struct uvm_pmemrange *nw;
1673 	int i;
1674 
1675 	/* We're only ever hitting the !uvm.page_init_done case for now. */
1676 	if (!uvm.page_init_done) {
1677 		nw = (struct uvm_pmemrange *)
1678 		    uvm_pageboot_alloc(sizeof(struct uvm_pmemrange));
1679 	} else {
1680 		nw = malloc(sizeof(struct uvm_pmemrange),
1681 		    M_VMMAP, M_NOWAIT);
1682 	}
1683 	KASSERT(nw != NULL);
1684 	memset(nw, 0, sizeof(struct uvm_pmemrange));
1685 	RBT_INIT(uvm_pmr_addr, &nw->addr);
1686 	for (i = 0; i < UVM_PMR_MEMTYPE_MAX; i++) {
1687 		RBT_INIT(uvm_pmr_size, &nw->size[i]);
1688 		TAILQ_INIT(&nw->single[i]);
1689 	}
1690 	return nw;
1691 }
1692 
1693 /*
1694  * Initialization of pmr.
1695  * Called by uvm_page_init.
1696  *
1697  * Sets up pmemranges.
1698  */
1699 void
1700 uvm_pmr_init(void)
1701 {
1702 	struct uvm_pmemrange *new_pmr;
1703 	int i;
1704 
1705 	TAILQ_INIT(&uvm.pmr_control.use);
1706 	RBT_INIT(uvm_pmemrange_addr, &uvm.pmr_control.addr);
1707 	TAILQ_INIT(&uvm.pmr_control.allocs);
1708 
1709 	/* By default, one range for the entire address space. */
1710 	new_pmr = uvm_pmr_allocpmr();
1711 	new_pmr->low = 0;
1712 	new_pmr->high = atop((paddr_t)-1) + 1;
1713 
1714 	RBT_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, new_pmr);
1715 	uvm_pmemrange_use_insert(&uvm.pmr_control.use, new_pmr);
1716 
1717 	for (i = 0; uvm_md_constraints[i] != NULL; i++) {
1718 		uvm_pmr_use_inc(uvm_md_constraints[i]->ucr_low,
1719 	    	    uvm_md_constraints[i]->ucr_high);
1720 	}
1721 }
1722 
1723 /*
1724  * Find the pmemrange that contains the given page number.
1725  *
1726  * (Manually traverses the binary tree, because that is cheaper on stack
1727  * usage.)
1728  */
1729 struct uvm_pmemrange *
1730 uvm_pmemrange_find(paddr_t pageno)
1731 {
1732 	struct uvm_pmemrange *pmr;
1733 
1734 	pmr = RBT_ROOT(uvm_pmemrange_addr, &uvm.pmr_control.addr);
1735 	while (pmr != NULL) {
1736 		if (pmr->low > pageno)
1737 			pmr = RBT_LEFT(uvm_pmemrange_addr, pmr);
1738 		else if (pmr->high <= pageno)
1739 			pmr = RBT_RIGHT(uvm_pmemrange_addr, pmr);
1740 		else
1741 			break;
1742 	}
1743 
1744 	return pmr;
1745 }
1746 
1747 #if defined(DDB) || defined(DEBUG)
1748 /*
1749  * Return true if the given page is in any of the free lists.
1750  * Used by uvm_page_printit.
1751  * This function is safe, even if the page is not on the freeq.
1752  * Note: does not apply locking, only called from ddb.
1753  */
1754 int
1755 uvm_pmr_isfree(struct vm_page *pg)
1756 {
1757 	struct vm_page *r;
1758 	struct uvm_pmemrange *pmr;
1759 
1760 	pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg)));
1761 	if (pmr == NULL)
1762 		return 0;
1763 	r = RBT_NFIND(uvm_pmr_addr, &pmr->addr, pg);
1764 	if (r == NULL)
1765 		r = RBT_MAX(uvm_pmr_addr, &pmr->addr);
1766 	else if (r != pg)
1767 		r = RBT_PREV(uvm_pmr_addr, r);
1768 	if (r == NULL)
1769 		return 0; /* Empty tree. */
1770 
1771 	KDASSERT(atop(VM_PAGE_TO_PHYS(r)) <= atop(VM_PAGE_TO_PHYS(pg)));
1772 	return atop(VM_PAGE_TO_PHYS(r)) + r->fpgsz >
1773 	    atop(VM_PAGE_TO_PHYS(pg));
1774 }
1775 #endif /* DDB || DEBUG */
1776 
1777 /*
1778  * Given a root of a tree, find a range which intersects start, end and
1779  * is of the same memtype.
1780  *
1781  * Page must be in the address tree.
1782  */
1783 struct vm_page*
1784 uvm_pmr_rootupdate(struct uvm_pmemrange *pmr, struct vm_page *init_root,
1785     paddr_t start, paddr_t end, int memtype)
1786 {
1787 	int	direction;
1788 	struct	vm_page *root;
1789 	struct	vm_page *high, *high_next;
1790 	struct	vm_page *low, *low_next;
1791 
1792 	KDASSERT(pmr != NULL && init_root != NULL);
1793 	root = init_root;
1794 
1795 	/* Which direction to use for searching. */
1796 	if (start != 0 && atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz <= start)
1797 		direction =  1;
1798 	else if (end != 0 && atop(VM_PAGE_TO_PHYS(root)) >= end)
1799 		direction = -1;
1800 	else /* nothing to do */
1801 		return root;
1802 
1803 	/* First, update root to fall within the chosen range. */
1804 	while (root && !PMR_INTERSECTS_WITH(
1805 	    atop(VM_PAGE_TO_PHYS(root)),
1806 	    atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz,
1807 	    start, end)) {
1808 		if (direction == 1)
1809 			root = RBT_RIGHT(uvm_objtree, root);
1810 		else
1811 			root = RBT_LEFT(uvm_objtree, root);
1812 	}
1813 	if (root == NULL || uvm_pmr_pg_to_memtype(root) == memtype)
1814 		return root;
1815 
1816 	/*
1817 	 * Root is valid, but of the wrong memtype.
1818 	 *
1819 	 * Try to find a range that has the given memtype in the subtree
1820 	 * (memtype mismatches are costly, either because the conversion
1821 	 * is expensive, or a later allocation will need to do the opposite
1822 	 * conversion, which will be expensive).
1823 	 *
1824 	 *
1825 	 * First, simply increase the address until we hit something we can use.
1826 	 * Cache the upper page, so we can page-walk later.
1827 	 */
1828 	high = root;
1829 	high_next = RBT_RIGHT(uvm_objtree, high);
1830 	while (high_next != NULL && PMR_INTERSECTS_WITH(
1831 	    atop(VM_PAGE_TO_PHYS(high_next)),
1832 	    atop(VM_PAGE_TO_PHYS(high_next)) + high_next->fpgsz,
1833 	    start, end)) {
1834 		high = high_next;
1835 		if (uvm_pmr_pg_to_memtype(high) == memtype)
1836 			return high;
1837 		high_next = RBT_RIGHT(uvm_objtree, high);
1838 	}
1839 
1840 	/*
1841 	 * Second, decrease the address until we hit something we can use.
1842 	 * Cache the lower page, so we can page-walk later.
1843 	 */
1844 	low = root;
1845 	low_next = RBT_LEFT(uvm_objtree, low);
1846 	while (low_next != NULL && PMR_INTERSECTS_WITH(
1847 	    atop(VM_PAGE_TO_PHYS(low_next)),
1848 	    atop(VM_PAGE_TO_PHYS(low_next)) + low_next->fpgsz,
1849 	    start, end)) {
1850 		low = low_next;
1851 		if (uvm_pmr_pg_to_memtype(low) == memtype)
1852 			return low;
1853 		low_next = RBT_LEFT(uvm_objtree, low);
1854 	}
1855 
1856 	if (low == high)
1857 		return NULL;
1858 
1859 	/* No hits. Walk the address tree until we find something usable. */
1860 	for (low = RBT_NEXT(uvm_pmr_addr, low);
1861 	    low != high;
1862 	    low = RBT_NEXT(uvm_pmr_addr, low)) {
1863 		KDASSERT(PMR_IS_SUBRANGE_OF(atop(VM_PAGE_TO_PHYS(low)),
1864 	    	    atop(VM_PAGE_TO_PHYS(low)) + low->fpgsz,
1865 	    	    start, end));
1866 		if (uvm_pmr_pg_to_memtype(low) == memtype)
1867 			return low;
1868 	}
1869 
1870 	/* Nothing found. */
1871 	return NULL;
1872 }
1873 
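/*
 * Summary of the search above: the root is first walked down until it
 * intersects [start, end); if its memtype is wrong, the right spine and then
 * the left spine of the subtree are scanned for an intersecting range of the
 * requested memtype, and as a last resort the address tree is walked in
 * order between the lowest and highest intersecting ranges found.
 */
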
1874 /*
1875  * Allocate any page, the fastest way. Page number constraints only.
1876  */
1877 psize_t
1878 uvm_pmr_get1page(psize_t count, int memtype_init, struct pglist *result,
1879     paddr_t start, paddr_t end, int memtype_only)
1880 {
1881 	struct	uvm_pmemrange *pmr;
1882 	struct	vm_page *found, *splitpg;
1883 	psize_t	fcount;
1884 	int	memtype;
1885 
1886 	fcount = 0;
1887 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
1888 		/* We're done. */
1889 		if (fcount == count)
1890 			break;
1891 
1892 		/* Outside requested range. */
1893 		if (!(start == 0 && end == 0) &&
1894 		    !PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end))
1895 			continue;
1896 
1897 		/* Range is empty. */
1898 		if (pmr->nsegs == 0)
1899 			continue;
1900 
1901 		/* Loop over all memtypes, starting at memtype_init. */
1902 		memtype = memtype_init;
1903 		while (fcount != count) {
1904 			found = TAILQ_FIRST(&pmr->single[memtype]);
1905 			/*
1906 			 * If found is outside the range, walk the list
1907 			 * until we find something that intersects with
1908 			 * boundaries.
1909 			 */
1910 			while (found && !PMR_INTERSECTS_WITH(
1911 			    atop(VM_PAGE_TO_PHYS(found)),
1912 			    atop(VM_PAGE_TO_PHYS(found)) + 1,
1913 			    start, end))
1914 				found = TAILQ_NEXT(found, pageq);
1915 
1916 			if (found == NULL) {
1917 				/*
1918 				 * Check if the size tree contains a range
1919 				 * that intersects with the boundaries. As the
1920 				 * allocation is for any page, try the smallest
1921 				 * range so that large ranges are preserved for
1922 				 * more constrained cases. Only one entry is
1923 				 * checked here, to avoid a brute-force search.
1924 				 *
1925 				 * Note that a size tree gives pg[1] instead of
1926 				 * pg[0].
1927 				 */
1928 				found = RBT_MIN(uvm_pmr_size,
1929 				    &pmr->size[memtype]);
1930 				if (found != NULL) {
1931 					found--;
1932 					if (!PMR_INTERSECTS_WITH(
1933 					    atop(VM_PAGE_TO_PHYS(found)),
1934 					    atop(VM_PAGE_TO_PHYS(found)) +
1935 					    found->fpgsz, start, end))
1936 						found = NULL;
1937 				}
1938 			}
1939 			if (found == NULL) {
1940 				/*
1941 				 * Try address-guided search to meet the page
1942 				 * number constraints.
1943 				 */
1944 				found = RBT_ROOT(uvm_pmr_addr, &pmr->addr);
1945 				if (found != NULL) {
1946 					found = uvm_pmr_rootupdate(pmr, found,
1947 					    start, end, memtype);
1948 				}
1949 			}
1950 			if (found != NULL) {
1951 				uvm_pmr_assertvalid(pmr);
1952 				uvm_pmr_remove_size(pmr, found);
1953 
1954 				/*
1955 				 * If the page intersects the end, then it'll
1956 				 * need splitting.
1957 				 *
1958 				 * Note that we don't need to split if the page
1959 				 * intersects start: the drain function will
1960 				 * simply stop on hitting start.
1961 				 */
1962 				if (end != 0 && atop(VM_PAGE_TO_PHYS(found)) +
1963 				    found->fpgsz > end) {
1964 					psize_t splitsz =
1965 					    atop(VM_PAGE_TO_PHYS(found)) +
1966 					    found->fpgsz - end;
1967 
1968 					uvm_pmr_remove_addr(pmr, found);
1969 					uvm_pmr_assertvalid(pmr);
1970 					found->fpgsz -= splitsz;
1971 					splitpg = found + found->fpgsz;
1972 					splitpg->fpgsz = splitsz;
1973 					uvm_pmr_insert(pmr, splitpg, 1);
1974 
1975 					/*
1976 					 * At this point, splitpg and found
1977 					 * could normally be joined again, but
1978 					 * we explicitly suppress the join
1979 					 * because we are about to take pages
1980 					 * from found.
1981 					 */
1982 					KASSERT(start == 0 ||
1983 					    atop(VM_PAGE_TO_PHYS(found)) +
1984 					    found->fpgsz > start);
1985 					uvm_pmr_insert_addr(pmr, found, 1);
1986 				}
1987 
1988 				/*
1989 				 * Fetch pages from the end.
1990 				 * If the range is larger than the requested
1991 				 * number of pages, this saves us an addr-tree
1992 				 * update.
1993 				 *
1994 				 * Since we take from the end and insert at
1995 				 * the head, the ranges are preserved.
1996 				 */
1997 				while (found->fpgsz > 0 && fcount < count &&
1998 				    (start == 0 ||
1999 				    atop(VM_PAGE_TO_PHYS(found)) +
2000 				    found->fpgsz > start)) {
2001 					found->fpgsz--;
2002 					fcount++;
2003 					TAILQ_INSERT_HEAD(result,
2004 					    &found[found->fpgsz], pageq);
2005 				}
2006 				if (found->fpgsz > 0) {
2007 					uvm_pmr_insert_size(pmr, found);
2008 					KDASSERT(fcount == count);
2009 					uvm_pmr_assertvalid(pmr);
2010 					return fcount;
2011 				}
2012 
2013 				/*
2014 				 * Delayed addr-tree removal.
2015 				 */
2016 				uvm_pmr_remove_addr(pmr, found);
2017 				uvm_pmr_assertvalid(pmr);
2018 			} else {
2019 				if (memtype_only)
2020 					break;
2021 				/*
2022 				 * Skip to the next memtype.
2023 				 */
2024 				memtype += 1;
2025 				if (memtype == UVM_PMR_MEMTYPE_MAX)
2026 					memtype = 0;
2027 				if (memtype == memtype_init)
2028 					break;
2029 			}
2030 		}
2031 	}
2032 
2033 	/*
2034 	 * Search finished.
2035 	 *
2036 	 * Ran out of ranges before enough pages were gathered, or we hit the
2037 	 * case where found->fpgsz == count - fcount, in which case the
2038 	 * above exit condition didn't trigger.
2039 	 *
2040 	 * On failure, caller will free the pages.
2041 	 */
2042 	return fcount;
2043 }
2044 
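/*
 * Illustrative call (not compiled): this mirrors uvm_pagezero_thread() below,
 * which grabs up to 16 dirty pages from anywhere in memory without falling
 * back to another memtype.  The caller holds the fpageq lock and receives the
 * pages on the given pglist.
 */
#if 0
static psize_t
uvm_pmr_get1page_example(struct pglist *pgl)
{
	return uvm_pmr_get1page(16, UVM_PMR_MEMTYPE_DIRTY, pgl, 0, 0, 1);
}
#endif
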
2045 #ifdef DDB
2046 /*
2047  * Print information about pmemrange.
2048  * Does not do locking (so either call it from DDB or acquire the fpageq
2049  * lock before invoking).
2050  */
2051 void
2052 uvm_pmr_print(void)
2053 {
2054 	struct	uvm_pmemrange *pmr;
2055 	struct	vm_page *pg;
2056 	psize_t	size[UVM_PMR_MEMTYPE_MAX];
2057 	psize_t	free;
2058 	int	useq_len;
2059 	int	mt;
2060 
2061 	printf("Ranges, use queue:\n");
2062 	useq_len = 0;
2063 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
2064 		useq_len++;
2065 		free = 0;
2066 		for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
2067 			pg = RBT_MAX(uvm_pmr_size, &pmr->size[mt]);
2068 			if (pg != NULL)
2069 				pg--;
2070 			else
2071 				pg = TAILQ_FIRST(&pmr->single[mt]);
2072 			size[mt] = (pg == NULL ? 0 : pg->fpgsz);
2073 
2074 			RBT_FOREACH(pg, uvm_pmr_addr, &pmr->addr)
2075 				free += pg->fpgsz;
2076 		}
2077 
2078 		printf("* [0x%lx-0x%lx] use=%d nsegs=%ld",
2079 		    (unsigned long)pmr->low, (unsigned long)pmr->high,
2080 		    pmr->use, (unsigned long)pmr->nsegs);
2081 		for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
2082 			printf(" maxsegsz[%d]=0x%lx", mt,
2083 			    (unsigned long)size[mt]);
2084 		}
2085 		printf(" free=0x%lx\n", (unsigned long)free);
2086 	}
2087 	printf("#ranges = %d\n", useq_len);
2088 }
2089 #endif
2090 
2091 /*
2092  * uvm_wait_pla: wait (sleep) for the page daemon to free some pages
2093  * in a specific physmem area.
2094  *
2095  * Returns ENOMEM if the pagedaemon failed to free any pages.
2096  * If not failok, the failure is logged and the wait is retried.
2097  *
2098  * Must be called with fpageq locked.
2099  */
2100 int
2101 uvm_wait_pla(paddr_t low, paddr_t high, paddr_t size, int failok)
2102 {
2103 	struct uvm_pmalloc pma;
2104 	const char *wmsg = "pmrwait";
2105 
2106 	KASSERT(curcpu()->ci_idepth == 0);
2107 
2108 	if (curproc == uvm.pagedaemon_proc) {
2109 		/*
2110 		 * This is not that uncommon when the pagedaemon is trying
2111 		 * to flush out a large mmapped file. VOP_WRITE will circle
2112 		 * back through the buffer cache and try to get more memory.
2113 		 * The pagedaemon starts by calling bufbackoff, but we can
2114 		 * easily use up that reserve in a single scan iteration.
2115 		 */
2116 		uvm_unlock_fpageq();
2117 		if (bufbackoff(NULL, atop(size)) >= atop(size)) {
2118 			uvm_lock_fpageq();
2119 			return 0;
2120 		}
2121 		uvm_lock_fpageq();
2122 
2123 		/*
2124 		 * XXX detect pagedaemon deadlock - see comment in
2125 		 * uvm_wait(), as this is exactly the same issue.
2126 		 */
2127 		printf("pagedaemon: wait_pla deadlock detected!\n");
2128 		msleep_nsec(&uvmexp.free, &uvm.fpageqlock, PVM, wmsg,
2129 		    MSEC_TO_NSEC(125));
2130 #if defined(DEBUG)
2131 		/* DEBUG: panic so we can debug it */
2132 		panic("wait_pla pagedaemon deadlock");
2133 #endif
2134 		return 0;
2135 	}
2136 
2137 	for (;;) {
2138 		pma.pm_constraint.ucr_low = low;
2139 		pma.pm_constraint.ucr_high = high;
2140 		pma.pm_size = size;
2141 		pma.pm_flags = UVM_PMA_LINKED;
2142 		TAILQ_INSERT_TAIL(&uvm.pmr_control.allocs, &pma, pmq);
2143 
2144 		wakeup(&uvm.pagedaemon);		/* wake the daemon! */
2145 		while (pma.pm_flags & (UVM_PMA_LINKED | UVM_PMA_BUSY))
2146 			msleep_nsec(&pma, &uvm.fpageqlock, PVM, wmsg, INFSLP);
2147 
2148 		if (!(pma.pm_flags & UVM_PMA_FREED) &&
2149 		    pma.pm_flags & UVM_PMA_FAIL) {
2150 			if (failok)
2151 				return ENOMEM;
2152 			printf("uvm_wait: failed to free %ld pages between "
2153 			    "0x%lx-0x%lx\n", atop(size), low, high);
2154 		} else
2155 			return 0;
2156 	}
2157 	/* UNREACHABLE */
2158 }
2159 
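/*
 * Illustrative caller sketch (not compiled; the parameter names are
 * placeholders): uvm_wait_pla() is called with the fpageq lock held, and
 * ENOMEM is only possible when failok is set.
 */
#if 0
static int
uvm_wait_pla_example(paddr_t low, paddr_t high, paddr_t size, int failok)
{
	int error;

	uvm_lock_fpageq();
	error = uvm_wait_pla(low, high, size, failok);
	uvm_unlock_fpageq();
	return error;
}
#endif
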
2160 /*
2161  * Wake up uvm_pmalloc sleepers.
2162  */
2163 void
2164 uvm_wakeup_pla(paddr_t low, psize_t len)
2165 {
2166 	struct uvm_pmalloc *pma, *pma_next;
2167 	paddr_t high;
2168 
2169 	high = low + len;
2170 
2171 	/* Wake specific allocations waiting for this memory. */
2172 	for (pma = TAILQ_FIRST(&uvm.pmr_control.allocs); pma != NULL;
2173 	    pma = pma_next) {
2174 		pma_next = TAILQ_NEXT(pma, pmq);
2175 
2176 		if (low < pma->pm_constraint.ucr_high &&
2177 		    high > pma->pm_constraint.ucr_low) {
2178 			pma->pm_flags |= UVM_PMA_FREED;
2179 			if (!(pma->pm_flags & UVM_PMA_BUSY)) {
2180 				pma->pm_flags &= ~UVM_PMA_LINKED;
2181 				TAILQ_REMOVE(&uvm.pmr_control.allocs, pma,
2182 				    pmq);
2183 				wakeup(pma);
2184 			}
2185 		}
2186 	}
2187 }
2188 
2189 void
2190 uvm_pagezero_thread(void *arg)
2191 {
2192 	struct pglist pgl;
2193 	struct vm_page *pg;
2194 	int count;
2195 
2196 	/* Run at the lowest possible priority. */
2197 	curproc->p_p->ps_nice = NZERO + PRIO_MAX;
2198 
2199 	KERNEL_UNLOCK();
2200 
2201 	TAILQ_INIT(&pgl);
2202 	for (;;) {
2203 		uvm_lock_fpageq();
2204 		while (uvmexp.zeropages >= UVM_PAGEZERO_TARGET ||
2205 		    (count = uvm_pmr_get1page(16, UVM_PMR_MEMTYPE_DIRTY,
2206 		     &pgl, 0, 0, 1)) == 0) {
2207 			msleep_nsec(&uvmexp.zeropages, &uvm.fpageqlock,
2208 			    MAXPRI, "pgzero", INFSLP);
2209 		}
2210 		uvm_unlock_fpageq();
2211 
2212 		TAILQ_FOREACH(pg, &pgl, pageq) {
2213 			uvm_pagezero(pg);
2214 			atomic_setbits_int(&pg->pg_flags, PG_ZERO);
2215 		}
2216 
2217 		uvm_lock_fpageq();
2218 		while (!TAILQ_EMPTY(&pgl))
2219 			uvm_pmr_remove_1strange(&pgl, 0, NULL, 0);
2220 		uvmexp.zeropages += count;
2221 		uvm_unlock_fpageq();
2222 
2223 		yield();
2224 	}
2225 }
2226 
2227 #if defined(MULTIPROCESSOR) && defined(__HAVE_UVM_PERCPU)
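/*
 * Per-CPU page cache: each CPU keeps two magazines holding up to
 * UVM_PMR_CACHEMAGSZ pages each.  uvm_pmr_cache_get() and
 * uvm_pmr_cache_put() work on the active magazine and swap to the other one
 * when it runs empty or full; magazines are refilled from, and drained back
 * to, the global allocator.
 */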
2228 int
2229 uvm_pmr_cache_alloc(struct uvm_pmr_cache_item *upci)
2230 {
2231 	struct vm_page *pg;
2232 	struct pglist pgl;
2233 	int flags = UVM_PLA_NOWAIT|UVM_PLA_NOWAKE;
2234 	int npages = UVM_PMR_CACHEMAGSZ;
2235 
2236 	splassert(IPL_VM);
2237 	KASSERT(upci->upci_npages == 0);
2238 
2239 	TAILQ_INIT(&pgl);
2240 	if (uvm_pmr_getpages(npages, 0, 0, 1, 0, npages, flags, &pgl))
2241 		return -1;
2242 
2243 	while ((pg = TAILQ_FIRST(&pgl)) != NULL) {
2244 		TAILQ_REMOVE(&pgl, pg, pageq);
2245 		upci->upci_pages[upci->upci_npages] = pg;
2246 		upci->upci_npages++;
2247 	}
2248 	atomic_add_int(&uvmexp.percpucaches, npages);
2249 
2250 	return 0;
2251 }
2252 
2253 struct vm_page *
2254 uvm_pmr_cache_get(int flags)
2255 {
2256 	struct uvm_pmr_cache *upc = &curcpu()->ci_uvm;
2257 	struct uvm_pmr_cache_item *upci;
2258 	struct vm_page *pg;
2259 	int s;
2260 
2261 	s = splvm();
2262 	upci = &upc->upc_magz[upc->upc_actv];
2263 	if (upci->upci_npages == 0) {
2264 		unsigned int prev;
2265 
2266 		prev = (upc->upc_actv == 0) ?  1 : 0;
2267 		upci = &upc->upc_magz[prev];
2268 		if (upci->upci_npages == 0) {
2269 			atomic_inc_int(&uvmexp.pcpmiss);
2270 			if (uvm_pmr_cache_alloc(upci)) {
2271 				splx(s);
2272 				return uvm_pmr_getone(flags);
2273 			}
2274 		}
2275 		/* Swap magazines */
2276 		upc->upc_actv = prev;
2277 	} else {
2278 		atomic_inc_int(&uvmexp.pcphit);
2279 	}
2280 
2281 	atomic_dec_int(&uvmexp.percpucaches);
2282 	upci->upci_npages--;
2283 	pg = upci->upci_pages[upci->upci_npages];
2284 	splx(s);
2285 
2286 	if (flags & UVM_PLA_ZERO)
2287 		uvm_pagezero(pg);
2288 
2289 	return pg;
2290 }
2291 
2292 unsigned int
2293 uvm_pmr_cache_free(struct uvm_pmr_cache_item *upci)
2294 {
2295 	struct pglist pgl;
2296 	unsigned int i;
2297 
2298 	splassert(IPL_VM);
2299 
2300 	TAILQ_INIT(&pgl);
2301 	for (i = 0; i < upci->upci_npages; i++)
2302 		TAILQ_INSERT_TAIL(&pgl, upci->upci_pages[i], pageq);
2303 
2304 	uvm_pmr_freepageq(&pgl);
2305 
2306 	atomic_sub_int(&uvmexp.percpucaches, upci->upci_npages);
2307 	upci->upci_npages = 0;
2308 	memset(upci->upci_pages, 0, sizeof(upci->upci_pages));
2309 
2310 	return i;
2311 }
2312 
2313 void
2314 uvm_pmr_cache_put(struct vm_page *pg)
2315 {
2316 	struct uvm_pmr_cache *upc = &curcpu()->ci_uvm;
2317 	struct uvm_pmr_cache_item *upci;
2318 	struct uvm_pmemrange *pmr;
2319 	int s;
2320 
2321 	/*
2322 	 * Always give low pages back to the allocator so as not to
2323 	 * accelerate their exhaustion.
2324 	 */
2325 	pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg)));
2326 	if (pmr->use > 0) {
2327 		uvm_pmr_freepages(pg, 1);
2328 		return;
2329 	}
2330 
2331 	s = splvm();
2332 	upci = &upc->upc_magz[upc->upc_actv];
2333 	if (upci->upci_npages >= UVM_PMR_CACHEMAGSZ) {
2334 		unsigned int prev;
2335 
2336 		prev = (upc->upc_actv == 0) ?  1 : 0;
2337 		upci = &upc->upc_magz[prev];
2338 		if (upci->upci_npages > 0)
2339 			uvm_pmr_cache_free(upci);
2340 
2341 		/* Swap magazines */
2342 		upc->upc_actv = prev;
2343 		KASSERT(upci->upci_npages == 0);
2344 	}
2345 
2346 	upci->upci_pages[upci->upci_npages] = pg;
2347 	upci->upci_npages++;
2348 	atomic_inc_int(&uvmexp.percpucaches);
2349 	splx(s);
2350 }
2351 
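/*
 * Illustrative single-page fast path (not compiled): allocate a zeroed page
 * through the per-CPU cache and release it the same way.
 */
#if 0
static void
uvm_pmr_cache_example(void)
{
	struct vm_page *pg;

	pg = uvm_pmr_cache_get(UVM_PLA_ZERO);
	if (pg != NULL)
		uvm_pmr_cache_put(pg);
}
#endif
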
2352 unsigned int
2353 uvm_pmr_cache_drain(void)
2354 {
2355 	struct uvm_pmr_cache *upc = &curcpu()->ci_uvm;
2356 	unsigned int freed = 0;
2357 	int s;
2358 
2359 	s = splvm();
2360 	freed += uvm_pmr_cache_free(&upc->upc_magz[0]);
2361 	freed += uvm_pmr_cache_free(&upc->upc_magz[1]);
2362 	splx(s);
2363 
2364 	return freed;
2365 }
2366 
2367 #else /* !(MULTIPROCESSOR && __HAVE_UVM_PERCPU) */
2368 
2369 struct vm_page *
2370 uvm_pmr_cache_get(int flags)
2371 {
2372 	return uvm_pmr_getone(flags);
2373 }
2374 
2375 void
2376 uvm_pmr_cache_put(struct vm_page *pg)
2377 {
2378 	uvm_pmr_freepages(pg, 1);
2379 }
2380 
2381 unsigned int
2382 uvm_pmr_cache_drain(void)
2383 {
2384 	return 0;
2385 }
2386 #endif
2387