1 /*	$OpenBSD: uvm_pmemrange.c,v 1.66 2024/05/01 12:54:27 mpi Exp $	*/
2 
3 /*
4  * Copyright (c) 2024 Martin Pieuchot <mpi@openbsd.org>
5  * Copyright (c) 2009, 2010 Ariane van der Steldt <ariane@stack.nl>
6  *
7  * Permission to use, copy, modify, and distribute this software for any
8  * purpose with or without fee is hereby granted, provided that the above
9  * copyright notice and this permission notice appear in all copies.
10  *
11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18  */
19 
20 #include <sys/param.h>
21 #include <sys/systm.h>
22 #include <uvm/uvm.h>
23 #include <sys/malloc.h>
24 #include <sys/kernel.h>
25 #include <sys/proc.h>
26 #include <sys/mount.h>
27 
28 /*
29  * 2 trees: addr tree and size tree.
30  *
31  * The allocator keeps chunks of free pages (called a range).
32  * Two pages are part of the same range if:
33  * - all pages in between are part of that range,
34  * - they are of the same memory type (zeroed or non-zeroed),
35  * - they are part of the same pmemrange.
36  * A pmemrange is a range of memory which is part of the same vm_physseg
37  * and has a use-count.
38  *
39  * addr tree is vm_page[0].objt
40  * size tree is vm_page[1].objt
41  *
42  * The size tree is not used for memory ranges of 1 page; instead,
43  * single-page ranges are kept on a queue, linked via vm_page[0].pageq
44  *
45  * vm_page[0].fpgsz describes the length of a free range. Two adjacent ranges
46  * are joined, unless:
47  * - they have pages in between them which are not free
48  * - they belong to different memtypes (zeroed vs dirty memory)
49  * - they are in different pmemrange areas (ISA vs non-ISA memory for instance)
50  * - they are not a continuation of the same array
51  * The latter issue is caused by vm_physseg ordering and splitting from the
52  * MD initialization machinery. The MD code is dependent on freelists and
53  * happens to split ISA memory from non-ISA memory.
54  * (Note: freelists die die die!)
55  *
56  * uvm_page_init guarantees that every vm_physseg contains an array of
57  * struct vm_page. Also, uvm_page_physload allocates an array of struct
58  * vm_page. This code depends on that array. The array may break across
59  * vm_physseg boundaries.
60  */
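
/*
 * Informal sketch of the representation: a free range of N contiguous
 * pages starting at pg is encoded as
 *
 *	pg[0].fpgsz = N		length of the range (pages 1..N-1 keep 0)
 *	pg[0].objt		node in the address tree
 *	pg[1].objt		node in the size tree (only used if N > 1)
 *	pg[0].pageq		single-page queue linkage (only if N == 1)
 *
 * and all N pages share the memtype derived by uvm_pmr_pg_to_memtype().
 */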
61 
62 /*
63  * Validate the flags of the page. (Used in asserts.)
64  * Any free page must have the PQ_FREE flag set.
65  * Free pages may be zeroed.
66  * Pmap flags are left untouched.
67  *
68  * The PQ_FREE flag is not checked here: by not checking, we can easily use
69  * this check in pages which are freed.
70  */
71 #define VALID_FLAGS(pg_flags)						\
72 	(((pg_flags) & ~(PQ_FREE|PG_ZERO|PG_PMAPMASK)) == 0x0)
73 
74 /* Tree comparators. */
75 int	uvm_pmemrange_addr_cmp(const struct uvm_pmemrange *,
76 	    const struct uvm_pmemrange *);
77 int	uvm_pmemrange_use_cmp(struct uvm_pmemrange *, struct uvm_pmemrange *);
78 int	uvm_pmr_pg_to_memtype(struct vm_page *);
79 
80 #ifdef DDB
81 void	uvm_pmr_print(void);
82 #endif
83 
84 /*
85  * Memory types. The page flags are used to derive what the current memory
86  * type of a page is.
87  */
88 int
89 uvm_pmr_pg_to_memtype(struct vm_page *pg)
90 {
91 	if (pg->pg_flags & PG_ZERO)
92 		return UVM_PMR_MEMTYPE_ZERO;
93 	/* Default: dirty memory. */
94 	return UVM_PMR_MEMTYPE_DIRTY;
95 }
96 
97 /* Trees. */
98 RBT_GENERATE(uvm_pmr_addr, vm_page, objt, uvm_pmr_addr_cmp);
99 RBT_GENERATE(uvm_pmr_size, vm_page, objt, uvm_pmr_size_cmp);
100 RBT_GENERATE(uvm_pmemrange_addr, uvm_pmemrange, pmr_addr,
101     uvm_pmemrange_addr_cmp);
102 
103 /* Validation. */
104 #ifdef DEBUG
105 void	uvm_pmr_assertvalid(struct uvm_pmemrange *pmr);
106 #else
107 #define uvm_pmr_assertvalid(pmr)	do {} while (0)
108 #endif
109 
110 psize_t			 uvm_pmr_get1page(psize_t, int, struct pglist *,
111 			    paddr_t, paddr_t, int);
112 
113 struct uvm_pmemrange	*uvm_pmr_allocpmr(void);
114 struct vm_page		*uvm_pmr_nfindsz(struct uvm_pmemrange *, psize_t, int);
115 struct vm_page		*uvm_pmr_nextsz(struct uvm_pmemrange *,
116 			    struct vm_page *, int);
117 void			 uvm_pmr_pnaddr(struct uvm_pmemrange *pmr,
118 			    struct vm_page *pg, struct vm_page **pg_prev,
119 			    struct vm_page **pg_next);
120 struct vm_page		*uvm_pmr_findnextsegment(struct uvm_pmemrange *,
121 			    struct vm_page *, paddr_t);
122 struct vm_page		*uvm_pmr_findprevsegment(struct uvm_pmemrange *,
123 			    struct vm_page *, paddr_t);
124 psize_t			 uvm_pmr_remove_1strange(struct pglist *, paddr_t,
125 			    struct vm_page **, int);
126 psize_t			 uvm_pmr_remove_1strange_reverse(struct pglist *,
127     			    paddr_t *);
128 void			 uvm_pmr_split(paddr_t);
129 struct uvm_pmemrange	*uvm_pmemrange_find(paddr_t);
130 struct uvm_pmemrange	*uvm_pmemrange_use_insert(struct uvm_pmemrange_use *,
131 			    struct uvm_pmemrange *);
132 psize_t			 pow2divide(psize_t, psize_t);
133 struct vm_page		*uvm_pmr_rootupdate(struct uvm_pmemrange *,
134 			    struct vm_page *, paddr_t, paddr_t, int);
135 
136 /*
137  * Computes num/denom and rounds it up to the next power-of-2.
138  *
139  * This is a division function which calculates an approximation of
140  * num/denom, with result =~ num/denom. It is meant to be fast and doesn't
141  * have to be accurate.
142  *
143  * Providing too large a value makes the allocator slightly faster, at the
144  * risk of hitting the failure case more often. Providing too small a value
145  * makes the allocator a bit slower, but less likely to hit a failure case.
146  */
147 psize_t
148 pow2divide(psize_t num, psize_t denom)
149 {
150 	int rshift;
151 
152 	for (rshift = 0; num > denom; rshift++, denom <<= 1)
153 		;
154 	return (paddr_t)1 << rshift;
155 }
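
/*
 * Worked examples of the approximation:
 *	pow2divide(8, 2)   == 4		exact
 *	pow2divide(100, 3) == 64	100/3 =~ 33, rounded up to 2^6
 * uvm_pmr_getpages() uses search[1] = pow2divide(count, maxseg) to pick
 * an approximate per-segment size for the multi-segment search.
 */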
156 
157 /*
158  * Predicate: lhs is a subrange of rhs.
159  *
160  * If rhs_low == 0: don't care about lower bound.
161  * If rhs_high == 0: don't care about upper bound.
162  */
163 #define PMR_IS_SUBRANGE_OF(lhs_low, lhs_high, rhs_low, rhs_high)	\
164 	(((rhs_low) == 0 || (lhs_low) >= (rhs_low)) &&			\
165 	((rhs_high) == 0 || (lhs_high) <= (rhs_high)))
166 
167 /*
168  * Predicate: lhs intersects with rhs.
169  *
170  * If rhs_low == 0: don't care about lower bound.
171  * If rhs_high == 0: don't care about upper bound.
172  * Ranges don't intersect if they don't have any page in common, array
173  * semantics mean that < instead of <= should be used here.
174  */
175 #define PMR_INTERSECTS_WITH(lhs_low, lhs_high, rhs_low, rhs_high)	\
176 	(((rhs_low) == 0 || (rhs_low) < (lhs_high)) &&			\
177 	((rhs_high) == 0 || (lhs_low) < (rhs_high)))
178 
179 /*
180  * Align to power-of-2 alignment.
181  */
182 #define PMR_ALIGN(pgno, align)						\
183 	(((pgno) + ((align) - 1)) & ~((align) - 1))
184 #define PMR_ALIGN_DOWN(pgno, align)					\
185 	((pgno) & ~((align) - 1))
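
/*
 * For illustration, with align = 4 (a power of 2, as required by the
 * KASSERTs in uvm_pmr_getpages()):
 *	PMR_ALIGN(5, 4)      == 8	rounded up to the next multiple
 *	PMR_ALIGN(8, 4)      == 8	already aligned, unchanged
 *	PMR_ALIGN_DOWN(7, 4) == 4	rounded down to the previous multiple
 */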
186 
187 
188 /*
189  * Comparator: sort by address ascending.
190  */
191 int
192 uvm_pmemrange_addr_cmp(const struct uvm_pmemrange *lhs,
193     const struct uvm_pmemrange *rhs)
194 {
195 	return lhs->low < rhs->low ? -1 : lhs->low > rhs->low;
196 }
197 
198 /*
199  * Comparator: sort by use ascending.
200  *
201  * The higher the use value of a range, the more devices need memory in
202  * this range. Therefore allocate from the range with the lowest use first.
203  */
204 int
205 uvm_pmemrange_use_cmp(struct uvm_pmemrange *lhs, struct uvm_pmemrange *rhs)
206 {
207 	int result;
208 
209 	result = lhs->use < rhs->use ? -1 : lhs->use > rhs->use;
210 	if (result == 0)
211 		result = uvm_pmemrange_addr_cmp(lhs, rhs);
212 	return result;
213 }
214 
215 int
216 uvm_pmr_addr_cmp(const struct vm_page *lhs, const struct vm_page *rhs)
217 {
218 	paddr_t lhs_addr, rhs_addr;
219 
220 	lhs_addr = VM_PAGE_TO_PHYS(lhs);
221 	rhs_addr = VM_PAGE_TO_PHYS(rhs);
222 
223 	return (lhs_addr < rhs_addr ? -1 : lhs_addr > rhs_addr);
224 }
225 
226 int
227 uvm_pmr_size_cmp(const struct vm_page *lhs, const struct vm_page *rhs)
228 {
229 	psize_t lhs_size, rhs_size;
230 	int cmp;
231 
232 	/* Using second tree, so we receive pg[1] instead of pg[0]. */
233 	lhs_size = (lhs - 1)->fpgsz;
234 	rhs_size = (rhs - 1)->fpgsz;
235 
236 	cmp = (lhs_size < rhs_size ? -1 : lhs_size > rhs_size);
237 	if (cmp == 0)
238 		cmp = uvm_pmr_addr_cmp(lhs - 1, rhs - 1);
239 	return cmp;
240 }
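
/*
 * Note on the off-by-one: the size tree embeds its node in the second
 * page of a range (vm_page[1].objt), so size-tree operations pass pg + 1
 * (e.g. RBT_INSERT(uvm_pmr_size, &pmr->size[memtype], pg + 1)) and the
 * comparator above steps back with (lhs - 1)/(rhs - 1) to reach the
 * first page, which holds fpgsz.
 */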
241 
242 /*
243  * Find the first range of free pages that is at least sz pages long.
244  */
245 struct vm_page *
246 uvm_pmr_nfindsz(struct uvm_pmemrange *pmr, psize_t sz, int mti)
247 {
248 	struct	vm_page *node, *best;
249 
250 	KASSERT(sz >= 1);
251 
252 	if (sz == 1 && !TAILQ_EMPTY(&pmr->single[mti]))
253 		return TAILQ_FIRST(&pmr->single[mti]);
254 
255 	node = RBT_ROOT(uvm_pmr_size, &pmr->size[mti]);
256 	best = NULL;
257 	while (node != NULL) {
258 		if ((node - 1)->fpgsz >= sz) {
259 			best = (node - 1);
260 			node = RBT_LEFT(uvm_objtree, node);
261 		} else
262 			node = RBT_RIGHT(uvm_objtree, node);
263 	}
264 	return best;
265 }
266 
267 /*
268  * Finds the next range. The next range has a size >= pg->fpgsz.
269  * Returns NULL if no more ranges are available.
270  */
271 struct vm_page *
272 uvm_pmr_nextsz(struct uvm_pmemrange *pmr, struct vm_page *pg, int mt)
273 {
274 	struct vm_page *npg;
275 
276 	KASSERT(pmr != NULL && pg != NULL);
277 	if (pg->fpgsz == 1) {
278 		if (TAILQ_NEXT(pg, pageq) != NULL)
279 			return TAILQ_NEXT(pg, pageq);
280 		else
281 			npg = RBT_MIN(uvm_pmr_size, &pmr->size[mt]);
282 	} else
283 		npg = RBT_NEXT(uvm_pmr_size, pg + 1);
284 
285 	return npg == NULL ? NULL : npg - 1;
286 }
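
/*
 * Typical iteration over all free ranges of one memtype, smallest first
 * (this is the pattern uvm_pmr_assertvalid() uses):
 *
 *	for (pg = uvm_pmr_nfindsz(pmr, 1, mti); pg != NULL; pg = next) {
 *		next = uvm_pmr_nextsz(pmr, pg, mti);
 *		...
 *	}
 */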
287 
288 /*
289  * Finds the previous and next ranges relative to the (uninserted) pg range.
290  *
291  * *pg_prev == NULL if no previous range that can join with pg is
292  * 	available.
293  * *pg_next == NULL if no next range that can join with pg is
294  * 	available.
295  */
296 void
297 uvm_pmr_pnaddr(struct uvm_pmemrange *pmr, struct vm_page *pg,
298     struct vm_page **pg_prev, struct vm_page **pg_next)
299 {
300 	KASSERT(pg_prev != NULL && pg_next != NULL);
301 
302 	*pg_next = RBT_NFIND(uvm_pmr_addr, &pmr->addr, pg);
303 	if (*pg_next == NULL)
304 		*pg_prev = RBT_MAX(uvm_pmr_addr, &pmr->addr);
305 	else
306 		*pg_prev = RBT_PREV(uvm_pmr_addr, *pg_next);
307 
308 	KDASSERT(*pg_next == NULL ||
309 	    VM_PAGE_TO_PHYS(*pg_next) > VM_PAGE_TO_PHYS(pg));
310 	KDASSERT(*pg_prev == NULL ||
311 	    VM_PAGE_TO_PHYS(*pg_prev) < VM_PAGE_TO_PHYS(pg));
312 
313 	/* Reset if not contig. */
314 	if (*pg_prev != NULL &&
315 	    (atop(VM_PAGE_TO_PHYS(*pg_prev)) + (*pg_prev)->fpgsz
316 	    != atop(VM_PAGE_TO_PHYS(pg)) ||
317 	    *pg_prev + (*pg_prev)->fpgsz != pg || /* Array broke. */
318 	    uvm_pmr_pg_to_memtype(*pg_prev) != uvm_pmr_pg_to_memtype(pg)))
319 		*pg_prev = NULL;
320 	if (*pg_next != NULL &&
321 	    (atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz
322 	    != atop(VM_PAGE_TO_PHYS(*pg_next)) ||
323 	    pg + pg->fpgsz != *pg_next || /* Array broke. */
324 	    uvm_pmr_pg_to_memtype(*pg_next) != uvm_pmr_pg_to_memtype(pg)))
325 		*pg_next = NULL;
326 	return;
327 }
328 
329 /*
330  * Remove a range from the address tree.
331  * Address tree maintains pmr counters.
332  */
333 void
334 uvm_pmr_remove_addr(struct uvm_pmemrange *pmr, struct vm_page *pg)
335 {
336 	KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg);
337 	KDASSERT(pg->pg_flags & PQ_FREE);
338 	RBT_REMOVE(uvm_pmr_addr, &pmr->addr, pg);
339 
340 	pmr->nsegs--;
341 }
342 /*
343  * Remove a range from the size tree.
344  */
345 void
346 uvm_pmr_remove_size(struct uvm_pmemrange *pmr, struct vm_page *pg)
347 {
348 	int memtype;
349 #ifdef DEBUG
350 	struct vm_page *i;
351 #endif
352 
353 	KDASSERT(pg->fpgsz >= 1);
354 	KDASSERT(pg->pg_flags & PQ_FREE);
355 	memtype = uvm_pmr_pg_to_memtype(pg);
356 
357 	if (pg->fpgsz == 1) {
358 #ifdef DEBUG
359 		TAILQ_FOREACH(i, &pmr->single[memtype], pageq) {
360 			if (i == pg)
361 				break;
362 		}
363 		KDASSERT(i == pg);
364 #endif
365 		TAILQ_REMOVE(&pmr->single[memtype], pg, pageq);
366 	} else {
367 		KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[memtype],
368 		    pg + 1) == pg + 1);
369 		RBT_REMOVE(uvm_pmr_size, &pmr->size[memtype], pg + 1);
370 	}
371 }
372 /* Remove from both trees. */
373 void
374 uvm_pmr_remove(struct uvm_pmemrange *pmr, struct vm_page *pg)
375 {
376 	uvm_pmr_assertvalid(pmr);
377 	uvm_pmr_remove_size(pmr, pg);
378 	uvm_pmr_remove_addr(pmr, pg);
379 	uvm_pmr_assertvalid(pmr);
380 }
381 
382 /*
383  * Insert the range described in pg.
384  * Returns the range thus created (which may be joined with the previous and
385  * next ranges).
386  * If no_join, the caller guarantees that the range cannot possibly join
387  * with adjacent ranges.
388  */
389 struct vm_page *
390 uvm_pmr_insert_addr(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join)
391 {
392 	struct vm_page *prev, *next;
393 
394 #ifdef DEBUG
395 	struct vm_page *i;
396 	int mt;
397 #endif
398 
399 	KDASSERT(pg->pg_flags & PQ_FREE);
400 	KDASSERT(pg->fpgsz >= 1);
401 
402 #ifdef DEBUG
403 	for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
404 		TAILQ_FOREACH(i, &pmr->single[mt], pageq)
405 			KDASSERT(i != pg);
406 		if (pg->fpgsz > 1) {
407 			KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[mt],
408 			    pg + 1) == NULL);
409 		}
410 		KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == NULL);
411 	}
412 #endif
413 
414 	if (!no_join) {
415 		uvm_pmr_pnaddr(pmr, pg, &prev, &next);
416 		if (next != NULL) {
417 			uvm_pmr_remove_size(pmr, next);
418 			uvm_pmr_remove_addr(pmr, next);
419 			pg->fpgsz += next->fpgsz;
420 			next->fpgsz = 0;
421 		}
422 		if (prev != NULL) {
423 			uvm_pmr_remove_size(pmr, prev);
424 			prev->fpgsz += pg->fpgsz;
425 			pg->fpgsz = 0;
426 			return prev;
427 		}
428 	}
429 
430 	RBT_INSERT(uvm_pmr_addr, &pmr->addr, pg);
431 
432 	pmr->nsegs++;
433 
434 	return pg;
435 }
436 /*
437  * Insert the range described in pg.
438  * Returns the range thus created (which may be joined with the previous and
439  * next ranges).
440  * Page must already be in the address tree.
441  */
442 void
443 uvm_pmr_insert_size(struct uvm_pmemrange *pmr, struct vm_page *pg)
444 {
445 	int memtype;
446 #ifdef DEBUG
447 	struct vm_page *i;
448 	int mti;
449 #endif
450 
451 	KDASSERT(pg->fpgsz >= 1);
452 	KDASSERT(pg->pg_flags & PQ_FREE);
453 
454 	memtype = uvm_pmr_pg_to_memtype(pg);
455 #ifdef DEBUG
456 	for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) {
457 		TAILQ_FOREACH(i, &pmr->single[mti], pageq)
458 			KDASSERT(i != pg);
459 		if (pg->fpgsz > 1) {
460 			KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[mti],
461 			    pg + 1) == NULL);
462 		}
463 		KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg);
464 	}
465 	for (i = pg; i < pg + pg->fpgsz; i++)
466 		KASSERT(uvm_pmr_pg_to_memtype(i) == memtype);
467 #endif
468 
469 	if (pg->fpgsz == 1)
470 		TAILQ_INSERT_TAIL(&pmr->single[memtype], pg, pageq);
471 	else
472 		RBT_INSERT(uvm_pmr_size, &pmr->size[memtype], pg + 1);
473 }
474 /* Insert in both trees. */
475 struct vm_page *
476 uvm_pmr_insert(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join)
477 {
478 	uvm_pmr_assertvalid(pmr);
479 	pg = uvm_pmr_insert_addr(pmr, pg, no_join);
480 	uvm_pmr_insert_size(pmr, pg);
481 	uvm_pmr_assertvalid(pmr);
482 	return pg;
483 }
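
/*
 * Joining example: if pages [100, 110) already form a free dirty range
 * and a 5-page dirty range starting at page 110 is inserted with
 * no_join == 0, uvm_pmr_pnaddr() reports the existing range as the
 * previous neighbour and the result is a single range at page 100 with
 * fpgsz == 15.  Ranges of different memtypes, ranges with a hole
 * between them and ranges from different vm_page arrays are never
 * joined.
 */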
484 
485 /*
486  * Find the last page that is part of this segment.
487  * => pg: the range at which to start the search.
488  * => boundary: the page number boundary specification (0 = no boundary).
489  * => pmr: the pmemrange of the page.
490  *
491  * This function returns 1 page before the next range, so if you want the
492  * next range, you need to run TAILQ_NEXT(result, pageq) after calling.
493  * The reason is that this way, the length of the segment is easily
494  * calculated using: atop(result) - atop(pg) + 1.
495  * Hence this function also never returns NULL.
496  */
497 struct vm_page *
498 uvm_pmr_findnextsegment(struct uvm_pmemrange *pmr,
499     struct vm_page *pg, paddr_t boundary)
500 {
501 	paddr_t	first_boundary;
502 	struct	vm_page *next;
503 	struct	vm_page *prev;
504 
505 	KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)) &&
506 	    pmr->high > atop(VM_PAGE_TO_PHYS(pg)));
507 	if (boundary != 0) {
508 		first_boundary =
509 		    PMR_ALIGN(atop(VM_PAGE_TO_PHYS(pg)) + 1, boundary);
510 	} else
511 		first_boundary = 0;
512 
513 	/*
514 	 * Increase next until it hits the first page of the next segment.
515 	 *
516 	 * While loop checks the following:
517 	 * - next != NULL	we have not reached the end of pgl
518 	 * - boundary == 0 || next < first_boundary
519 	 *			we do not cross a boundary
520 	 * - atop(prev) + 1 == atop(next)
521 	 *			still in the same segment
522 	 * - low <= last
523 	 * - high > last	still in the same memory range
524 	 * - memtype is equal	allocator is unable to view different memtypes
525 	 *			as part of the same segment
526 	 * - prev + 1 == next	no array breakage occurs
527 	 */
528 	prev = pg;
529 	next = TAILQ_NEXT(prev, pageq);
530 	while (next != NULL &&
531 	    (boundary == 0 || atop(VM_PAGE_TO_PHYS(next)) < first_boundary) &&
532 	    atop(VM_PAGE_TO_PHYS(prev)) + 1 == atop(VM_PAGE_TO_PHYS(next)) &&
533 	    pmr->low <= atop(VM_PAGE_TO_PHYS(next)) &&
534 	    pmr->high > atop(VM_PAGE_TO_PHYS(next)) &&
535 	    uvm_pmr_pg_to_memtype(prev) == uvm_pmr_pg_to_memtype(next) &&
536 	    prev + 1 == next) {
537 		prev = next;
538 		next = TAILQ_NEXT(prev, pageq);
539 	}
540 
541 	/*
542 	 * End of this segment.
543 	 */
544 	return prev;
545 }
546 
547 /*
548  * Find the first page that is part of this segment.
549  * => pg: the range at which to start the search.
550  * => boundary: the page number boundary specification (0 = no boundary).
551  * => pmr: the pmemrange of the page.
552  *
553  * This function returns 1 page after the previous range, so if you want the
554  * previous range, you need to run TAILQ_NEXT(result, pageq) after calling.
555  * The reason is that this way, the length of the segment is easily
556  * calculated using: atop(pg) - atop(result) + 1.
557  * Hence this function also never returns NULL.
558  */
559 struct vm_page *
560 uvm_pmr_findprevsegment(struct uvm_pmemrange *pmr,
561     struct vm_page *pg, paddr_t boundary)
562 {
563 	paddr_t	first_boundary;
564 	struct	vm_page *next;
565 	struct	vm_page *prev;
566 
567 	KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)) &&
568 	    pmr->high > atop(VM_PAGE_TO_PHYS(pg)));
569 	if (boundary != 0) {
570 		first_boundary =
571 		    PMR_ALIGN_DOWN(atop(VM_PAGE_TO_PHYS(pg)), boundary);
572 	} else
573 		first_boundary = 0;
574 
575 	/*
576 	 * Increase next until it hits the first page of the previous segment.
577 	 *
578 	 * While loop checks the following:
579 	 * - next != NULL	we have not reached the end of pgl
580 	 * - boundary == 0 || next >= first_boundary
581 	 *			we do not cross a boundary
582 	 * - atop(prev) - 1 == atop(next)
583 	 *			still in the same segment
584 	 * - low <= last
585 	 * - high > last	still in the same memory range
586 	 * - memtype is equal	allocator is unable to view different memtypes
587 	 *			as part of the same segment
588 	 * - prev - 1 == next	no array breakage occurs
589 	 */
590 	prev = pg;
591 	next = TAILQ_NEXT(prev, pageq);
592 	while (next != NULL &&
593 	    (boundary == 0 || atop(VM_PAGE_TO_PHYS(next)) >= first_boundary) &&
594 	    atop(VM_PAGE_TO_PHYS(prev)) - 1 == atop(VM_PAGE_TO_PHYS(next)) &&
595 	    pmr->low <= atop(VM_PAGE_TO_PHYS(next)) &&
596 	    pmr->high > atop(VM_PAGE_TO_PHYS(next)) &&
597 	    uvm_pmr_pg_to_memtype(prev) == uvm_pmr_pg_to_memtype(next) &&
598 	    prev - 1 == next) {
599 		prev = next;
600 		next = TAILQ_NEXT(prev, pageq);
601 	}
602 
603 	/*
604 	 * Start of this segment.
605 	 */
606 	return prev;
607 }
608 
609 /*
610  * Remove the first segment of contiguous pages from pgl.
611  * A segment ends if it crosses boundary (unless boundary = 0) or
612  * if it would enter a different uvm_pmemrange.
613  *
614  * Work: the page range that the caller is currently working with.
615  * May be NULL.
616  *
617  * If is_desperate is non-zero, the smallest segment is erased. Otherwise,
618  * the first segment is erased (which, if called by uvm_pmr_getpages(),
619  * probably is the smallest or very close to it).
620  */
621 psize_t
622 uvm_pmr_remove_1strange(struct pglist *pgl, paddr_t boundary,
623     struct vm_page **work, int is_desperate)
624 {
625 	struct vm_page *start, *end, *iter, *iter_end, *inserted, *lowest;
626 	psize_t count;
627 	struct uvm_pmemrange *pmr, *pmr_iter;
628 
629 	KASSERT(!TAILQ_EMPTY(pgl));
630 
631 	/*
632 	 * Initialize to first page.
633 	 * Unless desperate scan finds a better candidate, this is what'll be
634 	 * erased.
635 	 */
636 	start = TAILQ_FIRST(pgl);
637 	pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(start)));
638 	end = uvm_pmr_findnextsegment(pmr, start, boundary);
639 
640 	/*
641 	 * If we are desperate, we _really_ want to get rid of the smallest
642 	 * element (rather than a close match to the smallest element).
643 	 */
644 	if (is_desperate) {
645 		/* Linear search for smallest segment. */
646 		pmr_iter = pmr;
647 		for (iter = TAILQ_NEXT(end, pageq);
648 		    iter != NULL && start != end;
649 		    iter = TAILQ_NEXT(iter_end, pageq)) {
650 			/*
651 			 * Only update pmr if it doesn't match current
652 			 * iteration.
653 			 */
654 			if (pmr->low > atop(VM_PAGE_TO_PHYS(iter)) ||
655 			    pmr->high <= atop(VM_PAGE_TO_PHYS(iter))) {
656 				pmr_iter = uvm_pmemrange_find(atop(
657 				    VM_PAGE_TO_PHYS(iter)));
658 			}
659 
660 			iter_end = uvm_pmr_findnextsegment(pmr_iter, iter,
661 			    boundary);
662 
663 			/*
664 			 * Current iteration is smaller than best match so
665 			 * far; update.
666 			 */
667 			if (VM_PAGE_TO_PHYS(iter_end) - VM_PAGE_TO_PHYS(iter) <
668 			    VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) {
669 				start = iter;
670 				end = iter_end;
671 				pmr = pmr_iter;
672 			}
673 		}
674 	}
675 
676 	/*
677 	 * Calculate count and end of the list.
678 	 */
679 	count = atop(VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) + 1;
680 	lowest = start;
681 	end = TAILQ_NEXT(end, pageq);
682 
683 	/*
684 	 * Actually remove the range of pages.
685 	 *
686 	 * Sadly, this cannot be done using pointer iteration:
687 	 * vm_physseg is not guaranteed to be sorted on address, hence
688 	 * uvm_page_init() may not have initialized its array sorted by
689 	 * page number.
690 	 */
691 	for (iter = start; iter != end; iter = iter_end) {
692 		iter_end = TAILQ_NEXT(iter, pageq);
693 		TAILQ_REMOVE(pgl, iter, pageq);
694 	}
695 
696 	lowest->fpgsz = count;
697 	inserted = uvm_pmr_insert(pmr, lowest, 0);
698 
699 	/*
700 	 * If the caller was working on a range and this function modified
701 	 * that range, update the pointer.
702 	 */
703 	if (work != NULL && *work != NULL &&
704 	    atop(VM_PAGE_TO_PHYS(inserted)) <= atop(VM_PAGE_TO_PHYS(*work)) &&
705 	    atop(VM_PAGE_TO_PHYS(inserted)) + inserted->fpgsz >
706 	    atop(VM_PAGE_TO_PHYS(*work)))
707 		*work = inserted;
708 	return count;
709 }
710 
711 /*
712  * Remove the first segment of contiguous pages from a pgl
713  * with the list elements in reverse order of physaddr.
714  *
715  * A segment ends if it would enter a different uvm_pmemrange.
716  *
717  * Stores starting physical address of the segment in pstart.
718  */
719 psize_t
720 uvm_pmr_remove_1strange_reverse(struct pglist *pgl, paddr_t *pstart)
721 {
722 	struct vm_page *start, *end, *iter, *iter_end, *lowest;
723 	psize_t count;
724 	struct uvm_pmemrange *pmr;
725 
726 	KASSERT(!TAILQ_EMPTY(pgl));
727 
728 	start = TAILQ_FIRST(pgl);
729 	pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(start)));
730 	end = uvm_pmr_findprevsegment(pmr, start, 0);
731 
732 	KASSERT(end <= start);
733 
734 	/*
735 	 * Calculate count and end of the list.
736 	 */
737 	count = atop(VM_PAGE_TO_PHYS(start) - VM_PAGE_TO_PHYS(end)) + 1;
738 	lowest = end;
739 	end = TAILQ_NEXT(end, pageq);
740 
741 	/*
742 	 * Actually remove the range of pages.
743 	 *
744 	 * Sadly, this cannot be done using pointer iteration:
745 	 * vm_physseg is not guaranteed to be sorted on address, hence
746 	 * uvm_page_init() may not have initialized its array sorted by
747 	 * page number.
748 	 */
749 	for (iter = start; iter != end; iter = iter_end) {
750 		iter_end = TAILQ_NEXT(iter, pageq);
751 		TAILQ_REMOVE(pgl, iter, pageq);
752 	}
753 
754 	lowest->fpgsz = count;
755 	(void) uvm_pmr_insert(pmr, lowest, 0);
756 
757 	*pstart = VM_PAGE_TO_PHYS(lowest);
758 	return count;
759 }
760 
761 /*
762  * Extract a number of pages from a segment of free pages.
763  * Called by uvm_pmr_getpages.
764  *
765  * Returns the segment that was created from pages left over at the tail
766  * of the removed set of pages, or NULL if no pages were left at the tail.
767  */
768 struct vm_page *
769 uvm_pmr_extract_range(struct uvm_pmemrange *pmr, struct vm_page *pg,
770     paddr_t start, paddr_t end, struct pglist *result)
771 {
772 	struct vm_page *after, *pg_i;
773 	psize_t before_sz, after_sz;
774 #ifdef DEBUG
775 	psize_t i;
776 #endif
777 
778 	KDASSERT(end > start);
779 	KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)));
780 	KDASSERT(pmr->high >= atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz);
781 	KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) <= start);
782 	KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz >= end);
783 
784 	before_sz = start - atop(VM_PAGE_TO_PHYS(pg));
785 	after_sz = atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz - end;
786 	KDASSERT(before_sz + after_sz + (end - start) == pg->fpgsz);
787 	uvm_pmr_assertvalid(pmr);
788 
789 	uvm_pmr_remove_size(pmr, pg);
790 	if (before_sz == 0)
791 		uvm_pmr_remove_addr(pmr, pg);
792 	after = pg + before_sz + (end - start);
793 
794 	/* Add selected pages to result. */
795 	for (pg_i = pg + before_sz; pg_i != after; pg_i++) {
796 		KASSERT(pg_i->pg_flags & PQ_FREE);
797 		pg_i->fpgsz = 0;
798 		TAILQ_INSERT_TAIL(result, pg_i, pageq);
799 	}
800 
801 	/* Before handling. */
802 	if (before_sz > 0) {
803 		pg->fpgsz = before_sz;
804 		uvm_pmr_insert_size(pmr, pg);
805 	}
806 
807 	/* After handling. */
808 	if (after_sz > 0) {
809 #ifdef DEBUG
810 		for (i = 0; i < after_sz; i++) {
811 			KASSERT(!uvm_pmr_isfree(after + i));
812 		}
813 #endif
814 		KDASSERT(atop(VM_PAGE_TO_PHYS(after)) == end);
815 		after->fpgsz = after_sz;
816 		after = uvm_pmr_insert_addr(pmr, after, 1);
817 		uvm_pmr_insert_size(pmr, after);
818 	}
819 
820 	uvm_pmr_assertvalid(pmr);
821 	return (after_sz > 0 ? after : NULL);
822 }
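
/*
 * Worked example: for a free range pg covering page numbers [100, 110)
 * (fpgsz == 10), extracting [103, 107) gives
 *	before_sz = 103 - 100      = 3	pg keeps [100, 103)
 *	after_sz  = 100 + 10 - 107 = 3	a new range covers [107, 110)
 * and the four pages of [103, 107) are appended to result.
 */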
823 
824 /*
825  * Indicate to the page daemon that a nowait call failed and it should
826  * recover at least some memory in the most restricted region (assumed
827  * to be dma_constraint).
828  */
829 extern volatile int uvm_nowait_failed;
830 
831 /*
832  * Acquire a number of pages.
833  *
834  * count:	the number of pages returned
835  * start:	lowest page number
836  * end:		highest page number +1
837  * 		(start = end = 0: no limitation)
838  * align:	power-of-2 alignment constraint (align = 1: no alignment)
839  * boundary:	power-of-2 boundary (boundary = 0: no boundary)
840  * maxseg:	maximum number of segments to return
841  * flags:	UVM_PLA_* flags
842  * result:	returned pages storage (uses pageq)
843  */
844 int
845 uvm_pmr_getpages(psize_t count, paddr_t start, paddr_t end, paddr_t align,
846     paddr_t boundary, int maxseg, int flags, struct pglist *result)
847 {
848 	struct	uvm_pmemrange *pmr;	/* Iterate memory ranges. */
849 	struct	vm_page *found, *f_next; /* Iterate chunks. */
850 	psize_t	fcount;			/* Current found pages. */
851 	int	fnsegs;			/* Current segment counter. */
852 	int	try, start_try;
853 	psize_t	search[3];
854 	paddr_t	fstart, fend;		/* Pages to be taken from found. */
855 	int	memtype;		/* Requested memtype. */
856 	int	memtype_init;		/* Best memtype. */
857 	int	desperate;		/* True if allocation failed. */
858 #ifdef DIAGNOSTIC
859 	struct	vm_page *diag_prev;	/* Used during validation. */
860 #endif /* DIAGNOSTIC */
861 
862 	/*
863 	 * Validate arguments.
864 	 */
865 	KASSERT(count > 0);
866 	KASSERT(start == 0 || end == 0 || start < end);
867 	KASSERT(align >= 1);
868 	KASSERT(powerof2(align));
869 	KASSERT(maxseg > 0);
870 	KASSERT(boundary == 0 || powerof2(boundary));
871 	KASSERT(boundary == 0 || maxseg * boundary >= count);
872 	KASSERT(TAILQ_EMPTY(result));
873 	KASSERT(!(flags & UVM_PLA_WAITOK) ^ !(flags & UVM_PLA_NOWAIT));
874 
875 	/*
876 	 * TRYCONTIG is a noop if you only want a single segment.
877 	 * Remove it if that's the case: otherwise it'll deny the fast
878 	 * allocation.
879 	 */
880 	if (maxseg == 1 || count == 1)
881 		flags &= ~UVM_PLA_TRYCONTIG;
882 
883 	/*
884 	 * Configure search.
885 	 *
886 	 * search[0] is one segment, only used in UVM_PLA_TRYCONTIG case.
887 	 * search[1] is multiple segments, chosen to fulfill the search in
888 	 *   approximately even-sized segments.
889 	 *   This is a good trade-off between slightly reduced allocation speed
890 	 *   and less fragmentation.
891 	 * search[2] is the worst case, in which all segments are evaluated.
892 	 *   This provides the least fragmentation, but makes the search
893  *   possibly longer (although when this case is selected, that
894  *   hardly matters anymore).
895 	 *
896 	 * The exception is when maxseg == 1: since we can only fulfill that
897 	 * with one segment of size pages, only a single search type has to
898 	 * be attempted.
899 	 */
900 	if (maxseg == 1 || count == 1) {
901 		start_try = 2;
902 		search[2] = count;
903 	} else if (maxseg >= count && (flags & UVM_PLA_TRYCONTIG) == 0) {
904 		start_try = 2;
905 		search[2] = 1;
906 	} else {
907 		start_try = 0;
908 		search[0] = count;
909 		search[1] = pow2divide(count, maxseg);
910 		search[2] = 1;
911 		if ((flags & UVM_PLA_TRYCONTIG) == 0)
912 			start_try = 1;
913 		if (search[1] >= search[0]) {
914 			search[1] = search[0];
915 			start_try = 1;
916 		}
917 		if (search[2] >= search[start_try]) {
918 			start_try = 2;
919 		}
920 	}
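
	/*
	 * Example: count = 100, maxseg = 4, UVM_PLA_TRYCONTIG not set:
	 * search[0] = 100, search[1] = pow2divide(100, 4) = 32,
	 * search[2] = 1 and start_try = 1, so the scan first looks for
	 * chunks of at least 32 pages and only falls back to single-page
	 * chunks in the last try.
	 */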
921 
922 	/*
923 	 * Memory type: if zeroed memory is requested, traverse the zero set.
924 	 * Otherwise, traverse the dirty set.
925 	 *
926 	 * The memtype iterator is reinitialized to memtype_init on entrance
927 	 * of a pmemrange.
928 	 */
929 	if (flags & UVM_PLA_ZERO)
930 		memtype_init = UVM_PMR_MEMTYPE_ZERO;
931 	else
932 		memtype_init = UVM_PMR_MEMTYPE_DIRTY;
933 
934 	/*
935 	 * Initially, we're not desperate.
936 	 *
937 	 * Note that if we return from a sleep, we are still desperate.
938 	 * Chances are that memory pressure is still high, so resetting
939 	 * seems over-optimistic to me.
940 	 */
941 	desperate = 0;
942 
943 again:
944 	uvm_lock_fpageq();
945 
946 	/*
947 	 * check to see if we need to generate some free pages by waking
948 	 * the pagedaemon.
949 	 */
950 	if ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freemin ||
951 	    ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freetarg &&
952 	    (uvmexp.inactive + BUFPAGES_INACT) < uvmexp.inactarg))
953 		wakeup(&uvm.pagedaemon);
954 
955 	/*
956 	 * fail if any of these conditions is true:
957 	 * [1]  there really are no free pages, or
958 	 * [2]  only kernel "reserved" pages remain and
959 	 *        the UVM_PLA_USERESERVE flag wasn't used.
960 	 * [3]  only pagedaemon "reserved" pages remain and
961 	 *        the requestor is neither the pagedaemon nor the syncer.
962 	 */
963 	if ((uvmexp.free <= (uvmexp.reserve_kernel + count)) &&
964 	    !(flags & UVM_PLA_USERESERVE)) {
965 		uvm_unlock_fpageq();
966 		return ENOMEM;
967 	}
968 
969 	if ((uvmexp.free <= (uvmexp.reserve_pagedaemon + count)) &&
970 	    (curproc != uvm.pagedaemon_proc) && (curproc != syncerproc)) {
971 		uvm_unlock_fpageq();
972 		if (flags & UVM_PLA_WAITOK) {
973 			uvm_wait("uvm_pmr_getpages");
974 			goto again;
975 		}
976 		return ENOMEM;
977 	}
978 
979 retry:		/* Return point after sleeping. */
980 	fcount = 0;
981 	fnsegs = 0;
982 
983 retry_desperate:
984 	/*
985 	 * If we just want any page(s), go for the really fast option.
986 	 */
987 	if (count <= maxseg && align == 1 && boundary == 0 &&
988 	    (flags & UVM_PLA_TRYCONTIG) == 0) {
989 		fcount += uvm_pmr_get1page(count - fcount, memtype_init,
990 		    result, start, end, 0);
991 
992 		/*
993 		 * If we found sufficient pages, go to the success exit code.
994 		 *
995 		 * Otherwise, go immediately to fail, since we collected
996 		 * all we could anyway.
997 		 */
998 		if (fcount == count)
999 			goto out;
1000 		else
1001 			goto fail;
1002 	}
1003 
1004 	/*
1005 	 * The heart of the contig case.
1006 	 *
1007 	 * The code actually looks like this:
1008 	 *
1009 	 * foreach (struct pmemrange) {
1010 	 *	foreach (memtype) {
1011 	 *		foreach(try) {
1012 	 *			foreach (free range of memtype in pmemrange,
1013 	 *			    starting at search[try]) {
1014 	 *				while (range has space left)
1015 	 *					take from range
1016 	 *			}
1017 	 *		}
1018 	 *	}
1019 	 *
1020 	 *	if next pmemrange has higher usecount than current:
1021 	 *		enter desperate case (which will drain the pmemranges
1022 	 *		until empty prior to moving to the next one)
1023 	 * }
1024 	 *
1025 	 * When desperate is activated, try always starts at the highest
1026 	 * value. The memtype loop is using a goto ReScanMemtype.
1027 	 * The try loop is using a goto ReScan.
1028 	 * The 'range has space left' loop uses label DrainFound.
1029 	 *
1030 	 * Writing them all as loops would take up a lot of screen space in
1031 	 * the form of indentation and some parts are easier to express
1032 	 * using the labels.
1033 	 */
1034 
1035 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
1036 		/* Empty range. */
1037 		if (pmr->nsegs == 0)
1038 			continue;
1039 
1040 		/* Outside requested range. */
1041 		if (!PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end))
1042 			continue;
1043 
1044 		memtype = memtype_init;
1045 
1046 rescan_memtype:	/* Return point at memtype++. */
1047 		try = start_try;
1048 
1049 rescan:		/* Return point at try++. */
1050 		for (found = uvm_pmr_nfindsz(pmr, search[try], memtype);
1051 		    found != NULL;
1052 		    found = f_next) {
1053 			f_next = uvm_pmr_nextsz(pmr, found, memtype);
1054 
1055 			fstart = atop(VM_PAGE_TO_PHYS(found));
1056 			if (start != 0)
1057 				fstart = MAX(start, fstart);
1058 drain_found:
1059 			/*
1060 			 * Throw away the first segment if fnsegs == maxseg
1061 			 *
1062 			 * Note that f_next is still valid after this call,
1063 			 * since we only allocated from entries before f_next.
1064 			 * We don't revisit the entries we already extracted
1065 			 * from unless we entered the desperate case.
1066 			 */
1067 			if (fnsegs == maxseg) {
1068 				fnsegs--;
1069 				fcount -=
1070 				    uvm_pmr_remove_1strange(result, boundary,
1071 				    &found, desperate);
1072 			}
1073 
1074 			fstart = PMR_ALIGN(fstart, align);
1075 			fend = atop(VM_PAGE_TO_PHYS(found)) + found->fpgsz;
1076 			if (end != 0)
1077 				fend = MIN(end, fend);
1078 			if (boundary != 0) {
1079 				fend =
1080 				    MIN(fend, PMR_ALIGN(fstart + 1, boundary));
1081 			}
1082 			if (fstart >= fend)
1083 				continue;
1084 			if (fend - fstart > count - fcount)
1085 				fend = fstart + (count - fcount);
1086 
1087 			fcount += fend - fstart;
1088 			fnsegs++;
1089 			found = uvm_pmr_extract_range(pmr, found,
1090 			    fstart, fend, result);
1091 
1092 			if (fcount == count)
1093 				goto out;
1094 
1095 			/*
1096 			 * If there's still space left in found, try to
1097 			 * fully drain it prior to continuing.
1098 			 */
1099 			if (found != NULL) {
1100 				fstart = fend;
1101 				goto drain_found;
1102 			}
1103 		}
1104 
1105 		/* Try a smaller search now. */
1106 		if (++try < nitems(search))
1107 			goto rescan;
1108 
1109 		/*
1110 		 * Exhaust all memory types prior to going to the next memory
1111 		 * segment.
1112 		 * This means that zero-vs-dirty are eaten prior to moving
1113 		 * to a pmemrange with a higher use-count.
1114 		 *
1115 		 * Code is basically a difficult way of writing:
1116 		 * memtype = memtype_init;
1117 		 * do {
1118 		 *	...;
1119 		 *	memtype += 1;
1120 		 *	memtype %= MEMTYPE_MAX;
1121 		 * } while (memtype != memtype_init);
1122 		 */
1123 		memtype += 1;
1124 		if (memtype == UVM_PMR_MEMTYPE_MAX)
1125 			memtype = 0;
1126 		if (memtype != memtype_init)
1127 			goto rescan_memtype;
1128 
1129 		/*
1130 		 * If not desperate, enter desperate case prior to eating all
1131 		 * the good stuff in the next range.
1132 		 */
1133 		if (!desperate && TAILQ_NEXT(pmr, pmr_use) != NULL &&
1134 		    TAILQ_NEXT(pmr, pmr_use)->use != pmr->use)
1135 			break;
1136 	}
1137 
1138 	/*
1139 	 * Not enough memory of the requested type available. Fall back to
1140 	 * less good memory that we'll clean up better later.
1141 	 *
1142 	 * This algorithm is not very smart though, it just starts scanning
1143 	 * This algorithm is not very smart though: it just starts scanning
1144 	 * iteration may fall out. Hence there is a small chance of a false
1145 	 * negative.
1146 	 *
1147 	 * When desperate: scan all sizes, starting at the smallest, i.e.
1148 	 * start_try = nitems(search) - 1, and do not consider
1149 	 * UVM_PLA_TRYCONTIG (which may allow us to hit the fast path now).
1150 	 *
1151 	 * Also, because we will revisit entries we scanned before, we need
1152 	 * to reset the page queue, or we may end up releasing entries in
1153 	 * such a way as to invalidate f_next.
1154 	 */
1155 	if (!desperate) {
1156 		desperate = 1;
1157 		start_try = nitems(search) - 1;
1158 		flags &= ~UVM_PLA_TRYCONTIG;
1159 
1160 		while (!TAILQ_EMPTY(result))
1161 			uvm_pmr_remove_1strange(result, 0, NULL, 0);
1162 		fnsegs = 0;
1163 		fcount = 0;
1164 		goto retry_desperate;
1165 	}
1166 
1167 fail:
1168 	/* Allocation failed. */
1169 	/* XXX: claim from memory reserve here */
1170 
1171 	while (!TAILQ_EMPTY(result))
1172 		uvm_pmr_remove_1strange(result, 0, NULL, 0);
1173 
1174 	if (flags & UVM_PLA_WAITOK) {
1175 		if (uvm_wait_pla(ptoa(start), ptoa(end) - 1, ptoa(count),
1176 		    flags & UVM_PLA_FAILOK) == 0)
1177 			goto retry;
1178 		KASSERT(flags & UVM_PLA_FAILOK);
1179 	} else {
1180 		if (!(flags & UVM_PLA_NOWAKE)) {
1181 			uvm_nowait_failed = 1;
1182 			wakeup(&uvm.pagedaemon);
1183 		}
1184 	}
1185 	uvm_unlock_fpageq();
1186 
1187 	return ENOMEM;
1188 
1189 out:
1190 	/* Allocation successful. */
1191 	uvmexp.free -= fcount;
1192 
1193 	uvm_unlock_fpageq();
1194 
1195 	/* Update statistics and zero pages if UVM_PLA_ZERO. */
1196 #ifdef DIAGNOSTIC
1197 	fnsegs = 0;
1198 	fcount = 0;
1199 	diag_prev = NULL;
1200 #endif /* DIAGNOSTIC */
1201 	TAILQ_FOREACH(found, result, pageq) {
1202 		atomic_clearbits_int(&found->pg_flags, PG_PMAPMASK);
1203 
1204 		if (found->pg_flags & PG_ZERO) {
1205 			uvm_lock_fpageq();
1206 			uvmexp.zeropages--;
1207 			if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
1208 				wakeup(&uvmexp.zeropages);
1209 			uvm_unlock_fpageq();
1210 		}
1211 		if (flags & UVM_PLA_ZERO) {
1212 			if (found->pg_flags & PG_ZERO)
1213 				uvmexp.pga_zerohit++;
1214 			else {
1215 				uvmexp.pga_zeromiss++;
1216 				uvm_pagezero(found);
1217 			}
1218 		}
1219 		atomic_clearbits_int(&found->pg_flags, PG_ZERO|PQ_FREE);
1220 
1221 		found->uobject = NULL;
1222 		found->uanon = NULL;
1223 		found->pg_version++;
1224 
1225 		/*
1226 		 * Validate that the page matches the range criterion.
1227 		 */
1228 		KDASSERT(start == 0 || atop(VM_PAGE_TO_PHYS(found)) >= start);
1229 		KDASSERT(end == 0 || atop(VM_PAGE_TO_PHYS(found)) < end);
1230 
1231 #ifdef DIAGNOSTIC
1232 		/*
1233 		 * Update fcount (# found pages) and
1234 		 * fnsegs (# found segments) counters.
1235 		 */
1236 		if (diag_prev == NULL ||
1237 		    /* new segment if it contains a hole */
1238 		    atop(VM_PAGE_TO_PHYS(diag_prev)) + 1 !=
1239 		    atop(VM_PAGE_TO_PHYS(found)) ||
1240 		    /* new segment if it crosses boundary */
1241 		    (atop(VM_PAGE_TO_PHYS(diag_prev)) & ~(boundary - 1)) !=
1242 		    (atop(VM_PAGE_TO_PHYS(found)) & ~(boundary - 1)))
1243 			fnsegs++;
1244 		fcount++;
1245 
1246 		diag_prev = found;
1247 #endif /* DIAGNOSTIC */
1248 	}
1249 
1250 #ifdef DIAGNOSTIC
1251 	/*
1252 	 * Panic on algorithm failure.
1253 	 */
1254 	if (fcount != count || fnsegs > maxseg) {
1255 		panic("pmemrange allocation error: "
1256 		    "allocated %ld pages in %d segments, "
1257 		    "but request was %ld pages in %d segments",
1258 		    fcount, fnsegs, count, maxseg);
1259 	}
1260 #endif /* DIAGNOSTIC */
1261 
1262 	return 0;
1263 }
1264 
1265 /*
1266  * Acquire a single page.
1267  *
1268  * flags:	UVM_PLA_* flags
1269  * result:	returned page.
1270  */
1271 struct vm_page *
1272 uvm_pmr_getone(int flags)
1273 {
1274 	struct vm_page *pg;
1275 	struct pglist pgl;
1276 
1277 	TAILQ_INIT(&pgl);
1278 	if (uvm_pmr_getpages(1, 0, 0, 1, 0, 1, flags, &pgl) != 0)
1279 		return NULL;
1280 
1281 	pg = TAILQ_FIRST(&pgl);
1282 	KASSERT(pg != NULL && TAILQ_NEXT(pg, pageq) == NULL);
1283 
1284 	return pg;
1285 }
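
/*
 * Usage sketch (illustrative only, kept out of the build): how a caller
 * might request four zeroed pages below a hypothetical 4 GB physical
 * limit and release them again.  The limit and the helper name are
 * examples; real callers derive their bounds from a uvm_constraint_range
 * such as dma_constraint.
 */
#if 0
static int
uvm_pmr_example_alloc(void)
{
	struct pglist pgl;
	int error;

	TAILQ_INIT(&pgl);
	/* count, start/end (page numbers), align, boundary, maxseg, flags */
	error = uvm_pmr_getpages(4, 0, atop(0x100000000ULL), 1, 0, 4,
	    UVM_PLA_WAITOK | UVM_PLA_ZERO, &pgl);
	if (error != 0)
		return error;

	/* The pages are linked through pageq; use them, then release. */
	uvm_pmr_freepageq(&pgl);
	return 0;
}
#endif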
1286 
1287 /*
1288  * Free a number of contig pages (invoked by uvm_page_init).
1289  */
1290 void
1291 uvm_pmr_freepages(struct vm_page *pg, psize_t count)
1292 {
1293 	struct uvm_pmemrange *pmr;
1294 	psize_t i, pmr_count;
1295 	struct vm_page *firstpg = pg;
1296 
1297 	for (i = 0; i < count; i++) {
1298 		KASSERT(atop(VM_PAGE_TO_PHYS(&pg[i])) ==
1299 		    atop(VM_PAGE_TO_PHYS(pg)) + i);
1300 
1301 		if (!((pg[i].pg_flags & PQ_FREE) == 0 &&
1302 		    VALID_FLAGS(pg[i].pg_flags))) {
1303 			printf("Flags: 0x%x, will panic now.\n",
1304 			    pg[i].pg_flags);
1305 		}
1306 		KASSERT((pg[i].pg_flags & PQ_FREE) == 0 &&
1307 		    VALID_FLAGS(pg[i].pg_flags));
1308 		atomic_setbits_int(&pg[i].pg_flags, PQ_FREE);
1309 		atomic_clearbits_int(&pg[i].pg_flags, PG_ZERO);
1310 	}
1311 
1312 	uvm_lock_fpageq();
1313 
1314 	for (i = count; i > 0; i -= pmr_count) {
1315 		pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg)));
1316 		KASSERT(pmr != NULL);
1317 
1318 		pmr_count = MIN(i, pmr->high - atop(VM_PAGE_TO_PHYS(pg)));
1319 		pg->fpgsz = pmr_count;
1320 		uvm_pmr_insert(pmr, pg, 0);
1321 
1322 		uvmexp.free += pmr_count;
1323 		pg += pmr_count;
1324 	}
1325 	wakeup(&uvmexp.free);
1326 	if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
1327 		wakeup(&uvmexp.zeropages);
1328 
1329 	uvm_wakeup_pla(VM_PAGE_TO_PHYS(firstpg), ptoa(count));
1330 
1331 	uvm_unlock_fpageq();
1332 }
1333 
1334 /*
1335  * Free all pages in the queue.
1336  */
1337 void
1338 uvm_pmr_freepageq(struct pglist *pgl)
1339 {
1340 	struct vm_page *pg;
1341 	paddr_t pstart;
1342 	psize_t plen;
1343 
1344 	TAILQ_FOREACH(pg, pgl, pageq) {
1345 		if (!((pg->pg_flags & PQ_FREE) == 0 &&
1346 		    VALID_FLAGS(pg->pg_flags))) {
1347 			printf("Flags: 0x%x, will panic now.\n",
1348 			    pg->pg_flags);
1349 		}
1350 		KASSERT((pg->pg_flags & PQ_FREE) == 0 &&
1351 		    VALID_FLAGS(pg->pg_flags));
1352 		atomic_setbits_int(&pg->pg_flags, PQ_FREE);
1353 		atomic_clearbits_int(&pg->pg_flags, PG_ZERO);
1354 	}
1355 
1356 	uvm_lock_fpageq();
1357 	while (!TAILQ_EMPTY(pgl)) {
1358 		pg = TAILQ_FIRST(pgl);
1359 		if (pg == TAILQ_NEXT(pg, pageq) + 1) {
1360 			/*
1361 			 * If pg is one behind the position of the
1362 			 * next page in the list in the page array,
1363 			 * try going backwards instead of forward.
1364 			 */
1365 			plen = uvm_pmr_remove_1strange_reverse(pgl, &pstart);
1366 		} else {
1367 			pstart = VM_PAGE_TO_PHYS(TAILQ_FIRST(pgl));
1368 			plen = uvm_pmr_remove_1strange(pgl, 0, NULL, 0);
1369 		}
1370 		uvmexp.free += plen;
1371 
1372 		uvm_wakeup_pla(pstart, ptoa(plen));
1373 	}
1374 	wakeup(&uvmexp.free);
1375 	if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
1376 		wakeup(&uvmexp.zeropages);
1377 	uvm_unlock_fpageq();
1378 
1379 	return;
1380 }
1381 
1382 /*
1383  * Store a pmemrange in the list.
1384  *
1385  * The list is sorted by use.
1386  */
1387 struct uvm_pmemrange *
1388 uvm_pmemrange_use_insert(struct uvm_pmemrange_use *useq,
1389     struct uvm_pmemrange *pmr)
1390 {
1391 	struct uvm_pmemrange *iter;
1392 	int cmp = 1;
1393 
1394 	TAILQ_FOREACH(iter, useq, pmr_use) {
1395 		cmp = uvm_pmemrange_use_cmp(pmr, iter);
1396 		if (cmp == 0)
1397 			return iter;
1398 		if (cmp == -1)
1399 			break;
1400 	}
1401 
1402 	if (iter == NULL)
1403 		TAILQ_INSERT_TAIL(useq, pmr, pmr_use);
1404 	else
1405 		TAILQ_INSERT_BEFORE(iter, pmr, pmr_use);
1406 	return NULL;
1407 }
1408 
1409 #ifdef DEBUG
1410 /*
1411  * Validation of the whole pmemrange.
1412  * Called with fpageq locked.
1413  */
1414 void
1415 uvm_pmr_assertvalid(struct uvm_pmemrange *pmr)
1416 {
1417 	struct vm_page *prev, *next, *i, *xref;
1418 	int lcv, mti;
1419 
1420 	/* Empty range */
1421 	if (pmr->nsegs == 0)
1422 		return;
1423 
1424 	/* Validate address tree. */
1425 	RBT_FOREACH(i, uvm_pmr_addr, &pmr->addr) {
1426 		/* Validate the range. */
1427 		KASSERT(i->fpgsz > 0);
1428 		KASSERT(atop(VM_PAGE_TO_PHYS(i)) >= pmr->low);
1429 		KASSERT(atop(VM_PAGE_TO_PHYS(i)) + i->fpgsz
1430 		    <= pmr->high);
1431 
1432 		/* Validate each page in this range. */
1433 		for (lcv = 0; lcv < i->fpgsz; lcv++) {
1434 			/*
1435 			 * Only the first page has a size specification.
1436 			 * Rest is size 0.
1437 			 */
1438 			KASSERT(lcv == 0 || i[lcv].fpgsz == 0);
1439 			/*
1440 			 * Flag check.
1441 			 */
1442 			KASSERT(VALID_FLAGS(i[lcv].pg_flags) &&
1443 			    (i[lcv].pg_flags & PQ_FREE) == PQ_FREE);
1444 			/*
1445 			 * Free pages are:
1446 			 * - not wired
1447 			 * - have no vm_anon
1448 			 * - have no uvm_object
1449 			 */
1450 			KASSERT(i[lcv].wire_count == 0);
1451 			KASSERT(i[lcv].uanon == (void*)0xdeadbeef ||
1452 			    i[lcv].uanon == NULL);
1453 			KASSERT(i[lcv].uobject == (void*)0xdeadbeef ||
1454 			    i[lcv].uobject == NULL);
1455 			/*
1456 			 * Pages in a single range always have the same
1457 			 * memtype.
1458 			 */
1459 			KASSERT(uvm_pmr_pg_to_memtype(&i[0]) ==
1460 			    uvm_pmr_pg_to_memtype(&i[lcv]));
1461 		}
1462 
1463 		/* Check that it shouldn't be joined with its predecessor. */
1464 		prev = RBT_PREV(uvm_pmr_addr, i);
1465 		if (prev != NULL) {
1466 			KASSERT(uvm_pmr_pg_to_memtype(i) !=
1467 			    uvm_pmr_pg_to_memtype(prev) ||
1468 			    atop(VM_PAGE_TO_PHYS(i)) >
1469 			    atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz ||
1470 			    prev + prev->fpgsz != i);
1471 		}
1472 
1473 		/* Assert i is in the size tree as well. */
1474 		if (i->fpgsz == 1) {
1475 			TAILQ_FOREACH(xref,
1476 			    &pmr->single[uvm_pmr_pg_to_memtype(i)], pageq) {
1477 				if (xref == i)
1478 					break;
1479 			}
1480 			KASSERT(xref == i);
1481 		} else {
1482 			KASSERT(RBT_FIND(uvm_pmr_size,
1483 			    &pmr->size[uvm_pmr_pg_to_memtype(i)], i + 1) ==
1484 			    i + 1);
1485 		}
1486 	}
1487 
1488 	/* Validate size tree. */
1489 	for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) {
1490 		for (i = uvm_pmr_nfindsz(pmr, 1, mti); i != NULL; i = next) {
1491 			next = uvm_pmr_nextsz(pmr, i, mti);
1492 			if (next != NULL) {
1493 				KASSERT(i->fpgsz <=
1494 				    next->fpgsz);
1495 			}
1496 
1497 			/* Assert i is in the addr tree as well. */
1498 			KASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, i) == i);
1499 
1500 			/* Assert i is of the correct memory type. */
1501 			KASSERT(uvm_pmr_pg_to_memtype(i) == mti);
1502 		}
1503 	}
1504 
1505 	/* Validate nsegs statistic. */
1506 	lcv = 0;
1507 	RBT_FOREACH(i, uvm_pmr_addr, &pmr->addr)
1508 		lcv++;
1509 	KASSERT(pmr->nsegs == lcv);
1510 }
1511 #endif /* DEBUG */
1512 
1513 /*
1514  * Split pmr at split point pageno.
1515  * Called with fpageq unlocked.
1516  *
1517  * Split is only applied if a pmemrange spans pageno.
1518  */
1519 void
1520 uvm_pmr_split(paddr_t pageno)
1521 {
1522 	struct uvm_pmemrange *pmr, *drain;
1523 	struct vm_page *rebuild, *prev, *next;
1524 	psize_t prev_sz;
1525 
1526 	uvm_lock_fpageq();
1527 	pmr = uvm_pmemrange_find(pageno);
1528 	if (pmr == NULL || !(pmr->low < pageno)) {
1529 		/* No split required. */
1530 		uvm_unlock_fpageq();
1531 		return;
1532 	}
1533 
1534 	KASSERT(pmr->low < pageno);
1535 	KASSERT(pmr->high > pageno);
1536 
1537 	/*
1538 	 * uvm_pmr_allocpmr() calls into malloc() which in turn calls into
1539 	 * uvm_kmemalloc which calls into pmemrange, making the locking
1540 	 * a bit hard, so we just race!
1541 	 */
1542 	uvm_unlock_fpageq();
1543 	drain = uvm_pmr_allocpmr();
1544 	uvm_lock_fpageq();
1545 	pmr = uvm_pmemrange_find(pageno);
1546 	if (pmr == NULL || !(pmr->low < pageno)) {
1547 		/*
1548 		 * We lost the race since someone else ran this or a related
1549 		 * function, however this should be triggered very rarely so
1550 		 * we just leak the pmr.
1551 		 */
1552 		printf("uvm_pmr_split: lost one pmr\n");
1553 		uvm_unlock_fpageq();
1554 		return;
1555 	}
1556 
1557 	drain->low = pageno;
1558 	drain->high = pmr->high;
1559 	drain->use = pmr->use;
1560 
1561 	uvm_pmr_assertvalid(pmr);
1562 	uvm_pmr_assertvalid(drain);
1563 	KASSERT(drain->nsegs == 0);
1564 
1565 	RBT_FOREACH(rebuild, uvm_pmr_addr, &pmr->addr) {
1566 		if (atop(VM_PAGE_TO_PHYS(rebuild)) >= pageno)
1567 			break;
1568 	}
1569 	if (rebuild == NULL)
1570 		prev = RBT_MAX(uvm_pmr_addr, &pmr->addr);
1571 	else
1572 		prev = RBT_PREV(uvm_pmr_addr, rebuild);
1573 	KASSERT(prev == NULL || atop(VM_PAGE_TO_PHYS(prev)) < pageno);
1574 
1575 	/*
1576 	 * Handle free chunk that spans the split point.
1577 	 */
1578 	if (prev != NULL &&
1579 	    atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz > pageno) {
1580 		psize_t before, after;
1581 
1582 		KASSERT(atop(VM_PAGE_TO_PHYS(prev)) < pageno);
1583 
1584 		uvm_pmr_remove(pmr, prev);
1585 		prev_sz = prev->fpgsz;
1586 		before = pageno - atop(VM_PAGE_TO_PHYS(prev));
1587 		after = atop(VM_PAGE_TO_PHYS(prev)) + prev_sz - pageno;
1588 
1589 		KASSERT(before > 0);
1590 		KASSERT(after > 0);
1591 
1592 		prev->fpgsz = before;
1593 		uvm_pmr_insert(pmr, prev, 1);
1594 		(prev + before)->fpgsz = after;
1595 		uvm_pmr_insert(drain, prev + before, 1);
1596 	}
1597 
1598 	/* Move free chunks that no longer fall in the range. */
1599 	for (; rebuild != NULL; rebuild = next) {
1600 		next = RBT_NEXT(uvm_pmr_addr, rebuild);
1601 
1602 		uvm_pmr_remove(pmr, rebuild);
1603 		uvm_pmr_insert(drain, rebuild, 1);
1604 	}
1605 
1606 	pmr->high = pageno;
1607 	uvm_pmr_assertvalid(pmr);
1608 	uvm_pmr_assertvalid(drain);
1609 
1610 	RBT_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, drain);
1611 	uvm_pmemrange_use_insert(&uvm.pmr_control.use, drain);
1612 	uvm_unlock_fpageq();
1613 }
1614 
1615 /*
1616  * Increase the usage counter for the given range of memory.
1617  *
1618  * The more usage counters a given range of memory has, the harder the
1619  * allocator tries not to allocate from it.
1620  *
1621  * Addresses here are in paddr_t, not page-numbers.
1622  * The lowest and highest allowed address are specified.
1623  */
1624 void
1625 uvm_pmr_use_inc(paddr_t low, paddr_t high)
1626 {
1627 	struct uvm_pmemrange *pmr;
1628 	paddr_t sz;
1629 
1630 	/* pmr uses page numbers, translate low and high. */
1631 	high++;
1632 	high = atop(trunc_page(high));
1633 	low = atop(round_page(low));
1634 	uvm_pmr_split(low);
1635 	uvm_pmr_split(high);
1636 
1637 	sz = 0;
1638 	uvm_lock_fpageq();
1639 	/* Increase use count on segments in range. */
1640 	RBT_FOREACH(pmr, uvm_pmemrange_addr, &uvm.pmr_control.addr) {
1641 		if (PMR_IS_SUBRANGE_OF(pmr->low, pmr->high, low, high)) {
1642 			TAILQ_REMOVE(&uvm.pmr_control.use, pmr, pmr_use);
1643 			pmr->use++;
1644 			sz += pmr->high - pmr->low;
1645 			uvm_pmemrange_use_insert(&uvm.pmr_control.use, pmr);
1646 		}
1647 		uvm_pmr_assertvalid(pmr);
1648 	}
1649 	uvm_unlock_fpageq();
1650 
1651 	KASSERT(sz >= high - low);
1652 }
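
/*
 * Worked example (illustrative, assuming 4 KB pages): a machine-dependent
 * constraint of [0, 0x00ffffff] (a 16 MB ISA-style DMA window) leads
 * uvm_pmr_init() to call uvm_pmr_use_inc(0, 0x00ffffff).  That rounds to
 * page numbers [0, 4096), splits the existing pmemrange at page 4096 and
 * increments the use counter of the low range, so later allocations
 * prefer memory above 16 MB.
 */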
1653 
1654 /*
1655  * Allocate a pmemrange.
1656  *
1657  * If called from uvm_page_init, the uvm_pageboot_alloc is used.
1658  * If called after uvm_init, malloc is used.
1659  * (And if called in between, you're dead.)
1660  */
1661 struct uvm_pmemrange *
1662 uvm_pmr_allocpmr(void)
1663 {
1664 	struct uvm_pmemrange *nw;
1665 	int i;
1666 
1667 	/* We're only ever hitting the !uvm.page_init_done case for now. */
1668 	if (!uvm.page_init_done) {
1669 		nw = (struct uvm_pmemrange *)
1670 		    uvm_pageboot_alloc(sizeof(struct uvm_pmemrange));
1671 	} else {
1672 		nw = malloc(sizeof(struct uvm_pmemrange),
1673 		    M_VMMAP, M_NOWAIT);
1674 	}
1675 	KASSERT(nw != NULL);
1676 	memset(nw, 0, sizeof(struct uvm_pmemrange));
1677 	RBT_INIT(uvm_pmr_addr, &nw->addr);
1678 	for (i = 0; i < UVM_PMR_MEMTYPE_MAX; i++) {
1679 		RBT_INIT(uvm_pmr_size, &nw->size[i]);
1680 		TAILQ_INIT(&nw->single[i]);
1681 	}
1682 	return nw;
1683 }
1684 
1685 /*
1686  * Initialization of pmr.
1687  * Called by uvm_page_init.
1688  *
1689  * Sets up pmemranges.
1690  */
1691 void
1692 uvm_pmr_init(void)
1693 {
1694 	struct uvm_pmemrange *new_pmr;
1695 	int i;
1696 
1697 	TAILQ_INIT(&uvm.pmr_control.use);
1698 	RBT_INIT(uvm_pmemrange_addr, &uvm.pmr_control.addr);
1699 	TAILQ_INIT(&uvm.pmr_control.allocs);
1700 
1701 	/* By default, one range for the entire address space. */
1702 	new_pmr = uvm_pmr_allocpmr();
1703 	new_pmr->low = 0;
1704 	new_pmr->high = atop((paddr_t)-1) + 1;
1705 
1706 	RBT_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, new_pmr);
1707 	uvm_pmemrange_use_insert(&uvm.pmr_control.use, new_pmr);
1708 
1709 	for (i = 0; uvm_md_constraints[i] != NULL; i++) {
1710 		uvm_pmr_use_inc(uvm_md_constraints[i]->ucr_low,
1711 	    	    uvm_md_constraints[i]->ucr_high);
1712 	}
1713 }
1714 
1715 /*
1716  * Find the pmemrange that contains the given page number.
1717  *
1718  * (Manually traverses the binary tree, because that is cheaper on stack
1719  * usage.)
1720  */
1721 struct uvm_pmemrange *
1722 uvm_pmemrange_find(paddr_t pageno)
1723 {
1724 	struct uvm_pmemrange *pmr;
1725 
1726 	pmr = RBT_ROOT(uvm_pmemrange_addr, &uvm.pmr_control.addr);
1727 	while (pmr != NULL) {
1728 		if (pmr->low > pageno)
1729 			pmr = RBT_LEFT(uvm_pmemrange_addr, pmr);
1730 		else if (pmr->high <= pageno)
1731 			pmr = RBT_RIGHT(uvm_pmemrange_addr, pmr);
1732 		else
1733 			break;
1734 	}
1735 
1736 	return pmr;
1737 }
1738 
1739 #if defined(DDB) || defined(DEBUG)
1740 /*
1741  * Return true if the given page is in any of the free lists.
1742  * Used by uvm_page_printit.
1743  * This function is safe, even if the page is not on the freeq.
1744  * Note: does not apply locking, only called from ddb.
1745  */
1746 int
1747 uvm_pmr_isfree(struct vm_page *pg)
1748 {
1749 	struct vm_page *r;
1750 	struct uvm_pmemrange *pmr;
1751 
1752 	pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg)));
1753 	if (pmr == NULL)
1754 		return 0;
1755 	r = RBT_NFIND(uvm_pmr_addr, &pmr->addr, pg);
1756 	if (r == NULL)
1757 		r = RBT_MAX(uvm_pmr_addr, &pmr->addr);
1758 	else if (r != pg)
1759 		r = RBT_PREV(uvm_pmr_addr, r);
1760 	if (r == NULL)
1761 		return 0; /* Empty tree. */
1762 
1763 	KDASSERT(atop(VM_PAGE_TO_PHYS(r)) <= atop(VM_PAGE_TO_PHYS(pg)));
1764 	return atop(VM_PAGE_TO_PHYS(r)) + r->fpgsz >
1765 	    atop(VM_PAGE_TO_PHYS(pg));
1766 }
1767 #endif /* DDB || DEBUG */
1768 
1769 /*
1770  * Given the root of a (sub)tree, find a range that intersects [start, end)
1771  * and is of the given memtype.
1772  *
1773  * Page must be in the address tree.
1774  */
1775 struct vm_page*
1776 uvm_pmr_rootupdate(struct uvm_pmemrange *pmr, struct vm_page *init_root,
1777     paddr_t start, paddr_t end, int memtype)
1778 {
1779 	int	direction;
1780 	struct	vm_page *root;
1781 	struct	vm_page *high, *high_next;
1782 	struct	vm_page *low, *low_next;
1783 
1784 	KDASSERT(pmr != NULL && init_root != NULL);
1785 	root = init_root;
1786 
1787 	/* Which direction to use for searching. */
1788 	if (start != 0 && atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz <= start)
1789 		direction =  1;
1790 	else if (end != 0 && atop(VM_PAGE_TO_PHYS(root)) >= end)
1791 		direction = -1;
1792 	else /* nothing to do */
1793 		return root;
1794 
1795 	/* First, update root to fall within the chosen range. */
1796 	while (root && !PMR_INTERSECTS_WITH(
1797 	    atop(VM_PAGE_TO_PHYS(root)),
1798 	    atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz,
1799 	    start, end)) {
1800 		if (direction == 1)
1801 			root = RBT_RIGHT(uvm_objtree, root);
1802 		else
1803 			root = RBT_LEFT(uvm_objtree, root);
1804 	}
1805 	if (root == NULL || uvm_pmr_pg_to_memtype(root) == memtype)
1806 		return root;
1807 
1808 	/*
1809 	 * Root is valid, but of the wrong memtype.
1810 	 *
1811 	 * Try to find a range that has the given memtype in the subtree
1812 	 * (memtype mismatches are costly, either because the conversion
1813 	 * is expensive, or a later allocation will need to do the opposite
1814 	 * conversion, which will be expensive).
1815 	 *
1817 	 * First, simply increase address until we hit something we can use.
1818 	 * Cache the upper page, so we can page-walk later.
1819 	 */
1820 	high = root;
1821 	high_next = RBT_RIGHT(uvm_objtree, high);
1822 	while (high_next != NULL && PMR_INTERSECTS_WITH(
1823 	    atop(VM_PAGE_TO_PHYS(high_next)),
1824 	    atop(VM_PAGE_TO_PHYS(high_next)) + high_next->fpgsz,
1825 	    start, end)) {
1826 		high = high_next;
1827 		if (uvm_pmr_pg_to_memtype(high) == memtype)
1828 			return high;
1829 		high_next = RBT_RIGHT(uvm_objtree, high);
1830 	}
1831 
1832 	/*
1833 	 * Second, decrease the address until we hit something we can use.
1834 	 * Cache the lower page, so we can page-walk later.
1835 	 */
1836 	low = root;
1837 	low_next = RBT_LEFT(uvm_objtree, low);
1838 	while (low_next != NULL && PMR_INTERSECTS_WITH(
1839 	    atop(VM_PAGE_TO_PHYS(low_next)),
1840 	    atop(VM_PAGE_TO_PHYS(low_next)) + low_next->fpgsz,
1841 	    start, end)) {
1842 		low = low_next;
1843 		if (uvm_pmr_pg_to_memtype(low) == memtype)
1844 			return low;
1845 		low_next = RBT_LEFT(uvm_objtree, low);
1846 	}
1847 
1848 	if (low == high)
1849 		return NULL;
1850 
1851 	/* No hits. Walk the address tree until we find something usable. */
1852 	for (low = RBT_NEXT(uvm_pmr_addr, low);
1853 	    low != high;
1854 	    low = RBT_NEXT(uvm_pmr_addr, low)) {
1855 		KDASSERT(PMR_IS_SUBRANGE_OF(atop(VM_PAGE_TO_PHYS(low)),
1856 		    atop(VM_PAGE_TO_PHYS(low)) + low->fpgsz,
1857 		    start, end));
1858 		if (uvm_pmr_pg_to_memtype(low) == memtype)
1859 			return low;
1860 	}
1861 
1862 	/* Nothing found. */
1863 	return NULL;
1864 }
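
/*
 * Example (illustrative sketch): the address-guided fallback in
 * uvm_pmr_get1page() below seeds this search with the root of the
 * address tree:
 *
 *	found = RBT_ROOT(uvm_pmr_addr, &pmr->addr);
 *	if (found != NULL)
 *		found = uvm_pmr_rootupdate(pmr, found, start, end, memtype);
 */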
1865 
1866 /*
1867  * Allocate any page, the fastest way. Page number constraints only.
1868  */
1869 psize_t
1870 uvm_pmr_get1page(psize_t count, int memtype_init, struct pglist *result,
1871     paddr_t start, paddr_t end, int memtype_only)
1872 {
1873 	struct	uvm_pmemrange *pmr;
1874 	struct	vm_page *found, *splitpg;
1875 	psize_t	fcount;
1876 	int	memtype;
1877 
1878 	fcount = 0;
1879 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
1880 		/* We're done. */
1881 		if (fcount == count)
1882 			break;
1883 
1884 		/* Outside requested range. */
1885 		if (!(start == 0 && end == 0) &&
1886 		    !PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end))
1887 			continue;
1888 
1889 		/* Range is empty. */
1890 		if (pmr->nsegs == 0)
1891 			continue;
1892 
1893 		/* Loop over all memtypes, starting at memtype_init. */
1894 		memtype = memtype_init;
1895 		while (fcount != count) {
1896 			found = TAILQ_FIRST(&pmr->single[memtype]);
1897 			/*
1898 			 * If found is outside the range, walk the list
1899 			 * until we find something that intersects with
1900 			 * boundaries.
1901 			 */
1902 			while (found && !PMR_INTERSECTS_WITH(
1903 			    atop(VM_PAGE_TO_PHYS(found)),
1904 			    atop(VM_PAGE_TO_PHYS(found)) + 1,
1905 			    start, end))
1906 				found = TAILQ_NEXT(found, pageq);
1907 
1908 			if (found == NULL) {
1909 				/*
1910 				 * Check if the size tree contains a range
1911 				 * that intersects with the boundaries. As the
1912 				 * allocation is for any page, try the smallest
1913 				 * range so that large ranges are preserved for
1914 				 * more constrained cases. Only one entry is
1915 				 * checked here, to avoid a brute-force search.
1916 				 *
1917 				 * Note that a size tree gives pg[1] instead of
1918 				 * pg[0].
1919 				 */
1920 				found = RBT_MIN(uvm_pmr_size,
1921 				    &pmr->size[memtype]);
1922 				if (found != NULL) {
1923 					found--;
1924 					if (!PMR_INTERSECTS_WITH(
1925 					    atop(VM_PAGE_TO_PHYS(found)),
1926 					    atop(VM_PAGE_TO_PHYS(found)) +
1927 					    found->fpgsz, start, end))
1928 						found = NULL;
1929 				}
1930 			}
1931 			if (found == NULL) {
1932 				/*
1933 				 * Try address-guided search to meet the page
1934 				 * number constraints.
1935 				 */
1936 				found = RBT_ROOT(uvm_pmr_addr, &pmr->addr);
1937 				if (found != NULL) {
1938 					found = uvm_pmr_rootupdate(pmr, found,
1939 					    start, end, memtype);
1940 				}
1941 			}
1942 			if (found != NULL) {
1943 				uvm_pmr_assertvalid(pmr);
1944 				uvm_pmr_remove_size(pmr, found);
1945 
1946 				/*
1947 				 * If the page intersects the end, then it'll
1948 				 * need splitting.
1949 				 *
1950 				 * Note that we don't need to split if the page
1951 				 * intersects start: the drain function will
1952 				 * simply stop on hitting start.
1953 				 */
1954 				if (end != 0 && atop(VM_PAGE_TO_PHYS(found)) +
1955 				    found->fpgsz > end) {
1956 					psize_t splitsz =
1957 					    atop(VM_PAGE_TO_PHYS(found)) +
1958 					    found->fpgsz - end;
1959 
1960 					uvm_pmr_remove_addr(pmr, found);
1961 					uvm_pmr_assertvalid(pmr);
1962 					found->fpgsz -= splitsz;
1963 					splitpg = found + found->fpgsz;
1964 					splitpg->fpgsz = splitsz;
1965 					uvm_pmr_insert(pmr, splitpg, 1);
1966 
1967 					/*
1968 					 * At this point, splitpg and found
1969 					 * actually should be joined.
1970 					 * But we explicitly disable that,
1971 					 * because we will start subtracting
1972 					 * from found.
1973 					 */
1974 					KASSERT(start == 0 ||
1975 					    atop(VM_PAGE_TO_PHYS(found)) +
1976 					    found->fpgsz > start);
1977 					uvm_pmr_insert_addr(pmr, found, 1);
1978 				}
1979 
1980 				/*
1981 				 * Fetch pages from the end.
1982 				 * If the range is larger than the requested
1983 				 * number of pages, this saves us an addr-tree
1984 				 * update.
1985 				 *
1986 				 * Since we take from the end and insert at
1987 				 * the head, existing ranges are preserved.
1988 				 */
1989 				while (found->fpgsz > 0 && fcount < count &&
1990 				    (start == 0 ||
1991 				    atop(VM_PAGE_TO_PHYS(found)) +
1992 				    found->fpgsz > start)) {
1993 					found->fpgsz--;
1994 					fcount++;
1995 					TAILQ_INSERT_HEAD(result,
1996 					    &found[found->fpgsz], pageq);
1997 				}
1998 				if (found->fpgsz > 0) {
1999 					uvm_pmr_insert_size(pmr, found);
2000 					KDASSERT(fcount == count);
2001 					uvm_pmr_assertvalid(pmr);
2002 					return fcount;
2003 				}
2004 
2005 				/*
2006 				 * Delayed addr-tree removal.
2007 				 */
2008 				uvm_pmr_remove_addr(pmr, found);
2009 				uvm_pmr_assertvalid(pmr);
2010 			} else {
2011 				if (memtype_only)
2012 					break;
2013 				/*
2014 				 * Skip to the next memtype.
2015 				 */
2016 				memtype += 1;
2017 				if (memtype == UVM_PMR_MEMTYPE_MAX)
2018 					memtype = 0;
2019 				if (memtype == memtype_init)
2020 					break;
2021 			}
2022 		}
2023 	}
2024 
2025 	/*
2026 	 * Search finished.
2027 	 *
2028 	 * Ran out of ranges before enough pages were gathered, or we hit the
2029 	 * case where found->fpgsz == count - fcount, in which case the
2030 	 * above exit condition didn't trigger.
2031 	 *
2032 	 * On failure, caller will free the pages.
2033 	 */
2034 	return fcount;
2035 }
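
/*
 * Example (illustrative sketch): grab up to 16 dirty pages without address
 * constraints onto a caller-initialized pglist, as the page zeroing thread
 * below does; the fpageq lock must be held across the call.
 *
 *	uvm_lock_fpageq();
 *	count = uvm_pmr_get1page(16, UVM_PMR_MEMTYPE_DIRTY, &pgl, 0, 0, 1);
 *	uvm_unlock_fpageq();
 */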
2036 
2037 #ifdef DDB
2038 /*
2039  * Print information about pmemrange.
2040  * Does not do locking (so either call it from DDB or acquire the fpageq
2041  * lock before invoking).
2042  */
2043 void
2044 uvm_pmr_print(void)
2045 {
2046 	struct	uvm_pmemrange *pmr;
2047 	struct	vm_page *pg;
2048 	psize_t	size[UVM_PMR_MEMTYPE_MAX];
2049 	psize_t	free;
2050 	int	useq_len;
2051 	int	mt;
2052 
2053 	printf("Ranges, use queue:\n");
2054 	useq_len = 0;
2055 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
2056 		useq_len++;
2057 		free = 0;
2058 		for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
2059 			pg = RBT_MAX(uvm_pmr_size, &pmr->size[mt]);
2060 			if (pg != NULL)
2061 				pg--;
2062 			else
2063 				pg = TAILQ_FIRST(&pmr->single[mt]);
2064 			size[mt] = (pg == NULL ? 0 : pg->fpgsz);
2065 
2066 			RBT_FOREACH(pg, uvm_pmr_addr, &pmr->addr)
2067 				free += pg->fpgsz;
2068 		}
2069 
2070 		printf("* [0x%lx-0x%lx] use=%d nsegs=%ld",
2071 		    (unsigned long)pmr->low, (unsigned long)pmr->high,
2072 		    pmr->use, (unsigned long)pmr->nsegs);
2073 		for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
2074 			printf(" maxsegsz[%d]=0x%lx", mt,
2075 			    (unsigned long)size[mt]);
2076 		}
2077 		printf(" free=0x%lx\n", (unsigned long)free);
2078 	}
2079 	printf("#ranges = %d\n", useq_len);
2080 }
2081 #endif
2082 
2083 /*
2084  * uvm_wait_pla: wait (sleep) for the page daemon to free some pages
2085  * in a specific physmem area.
2086  *
2087  * Returns ENOMEM if the pagedaemon failed to free any pages.
2088  * If not failok, failure will lead to panic.
2089  *
2090  * Must be called with fpageq locked.
2091  */
2092 int
2093 uvm_wait_pla(paddr_t low, paddr_t high, paddr_t size, int failok)
2094 {
2095 	struct uvm_pmalloc pma;
2096 	const char *wmsg = "pmrwait";
2097 
2098 	if (curproc == uvm.pagedaemon_proc) {
2099 		/*
2100 		 * This is not that uncommon when the pagedaemon is trying
2101 		 * to flush out a large mmapped file. VOP_WRITE will circle
2102 		 * back through the buffer cache and try to get more memory.
2103 		 * The pagedaemon starts by calling bufbackoff, but we can
2104 		 * easily use up that reserve in a single scan iteration.
2105 		 */
2106 		uvm_unlock_fpageq();
2107 		if (bufbackoff(NULL, atop(size)) == 0) {
2108 			uvm_lock_fpageq();
2109 			return 0;
2110 		}
2111 		uvm_lock_fpageq();
2112 
2113 		/*
2114 		 * XXX detect pagedaemon deadlock - see comment in
2115 		 * uvm_wait(), as this is exactly the same issue.
2116 		 */
2117 		printf("pagedaemon: wait_pla deadlock detected!\n");
2118 		msleep_nsec(&uvmexp.free, &uvm.fpageqlock, PVM, wmsg,
2119 		    MSEC_TO_NSEC(125));
2120 #if defined(DEBUG)
2121 		/* DEBUG: panic so we can debug it */
2122 		panic("wait_pla pagedaemon deadlock");
2123 #endif
2124 		return 0;
2125 	}
2126 
2127 	for (;;) {
2128 		pma.pm_constraint.ucr_low = low;
2129 		pma.pm_constraint.ucr_high = high;
2130 		pma.pm_size = size;
2131 		pma.pm_flags = UVM_PMA_LINKED;
2132 		TAILQ_INSERT_TAIL(&uvm.pmr_control.allocs, &pma, pmq);
2133 
2134 		wakeup(&uvm.pagedaemon);		/* wake the daemon! */
2135 		while (pma.pm_flags & (UVM_PMA_LINKED | UVM_PMA_BUSY))
2136 			msleep_nsec(&pma, &uvm.fpageqlock, PVM, wmsg, INFSLP);
2137 
2138 		if (!(pma.pm_flags & UVM_PMA_FREED) &&
2139 		    pma.pm_flags & UVM_PMA_FAIL) {
2140 			if (failok)
2141 				return ENOMEM;
2142 			printf("uvm_wait: failed to free %ld pages between "
2143 			    "0x%lx-0x%lx\n", atop(size), low, high);
2144 		} else
2145 			return 0;
2146 	}
2147 	/* UNREACHABLE */
2148 }
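
/*
 * Example (illustrative sketch; names and flag handling are assumptions):
 * a physical allocator would typically retry under the fpageq lock along
 * these lines, giving up only when the caller allows failure.
 *
 *	for (;;) {
 *		(attempt the allocation here; break out on success)
 *		if (uvm_wait_pla(ptoa(start), ptoa(end) - 1, ptoa(count),
 *		    flags & UVM_PLA_FAILOK) == ENOMEM)
 *			return ENOMEM;
 *	}
 */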
2149 
2150 /*
2151  * Wake up uvm_pmalloc sleepers.
2152  */
2153 void
2154 uvm_wakeup_pla(paddr_t low, psize_t len)
2155 {
2156 	struct uvm_pmalloc *pma, *pma_next;
2157 	paddr_t high;
2158 
2159 	high = low + len;
2160 
2161 	/* Wake specific allocations waiting for this memory. */
2162 	for (pma = TAILQ_FIRST(&uvm.pmr_control.allocs); pma != NULL;
2163 	    pma = pma_next) {
2164 		pma_next = TAILQ_NEXT(pma, pmq);
2165 
2166 		if (low < pma->pm_constraint.ucr_high &&
2167 		    high > pma->pm_constraint.ucr_low) {
2168 			pma->pm_flags |= UVM_PMA_FREED;
2169 			if (!(pma->pm_flags & UVM_PMA_BUSY)) {
2170 				pma->pm_flags &= ~UVM_PMA_LINKED;
2171 				TAILQ_REMOVE(&uvm.pmr_control.allocs, pma,
2172 				    pmq);
2173 				wakeup(pma);
2174 			}
2175 		}
2176 	}
2177 }
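
/*
 * Example (illustrative sketch; the exact call site is an assumption): the
 * page freeing path reports the physical span it just released so that
 * blocked allocations overlapping that span can be woken.
 *
 *	uvm_wakeup_pla(VM_PAGE_TO_PHYS(pg), ptoa(count));
 */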
2178 
2179 void
2180 uvm_pagezero_thread(void *arg)
2181 {
2182 	struct pglist pgl;
2183 	struct vm_page *pg;
2184 	int count;
2185 
2186 	/* Run at the lowest possible priority. */
2187 	curproc->p_p->ps_nice = NZERO + PRIO_MAX;
2188 
2189 	KERNEL_UNLOCK();
2190 
2191 	TAILQ_INIT(&pgl);
2192 	for (;;) {
2193 		uvm_lock_fpageq();
2194 		while (uvmexp.zeropages >= UVM_PAGEZERO_TARGET ||
2195 		    (count = uvm_pmr_get1page(16, UVM_PMR_MEMTYPE_DIRTY,
2196 		     &pgl, 0, 0, 1)) == 0) {
2197 			msleep_nsec(&uvmexp.zeropages, &uvm.fpageqlock,
2198 			    MAXPRI, "pgzero", INFSLP);
2199 		}
2200 		uvm_unlock_fpageq();
2201 
2202 		TAILQ_FOREACH(pg, &pgl, pageq) {
2203 			uvm_pagezero(pg);
2204 			atomic_setbits_int(&pg->pg_flags, PG_ZERO);
2205 		}
2206 
2207 		uvm_lock_fpageq();
2208 		while (!TAILQ_EMPTY(&pgl))
2209 			uvm_pmr_remove_1strange(&pgl, 0, NULL, 0);
2210 		uvmexp.zeropages += count;
2211 		uvm_unlock_fpageq();
2212 
2213 		yield();
2214 	}
2215 }
2216 
2217 #if defined(MULTIPROCESSOR) && defined(__HAVE_UVM_PERCPU)
2218 int
2219 uvm_pmr_cache_alloc(struct uvm_pmr_cache_item *upci)
2220 {
2221 	struct vm_page *pg;
2222 	struct pglist pgl;
2223 	int flags = UVM_PLA_NOWAIT|UVM_PLA_NOWAKE;
2224 	int npages = UVM_PMR_CACHEMAGSZ;
2225 
2226 	splassert(IPL_VM);
2227 	KASSERT(upci->upci_npages == 0);
2228 
2229 	TAILQ_INIT(&pgl);
2230 	if (uvm_pmr_getpages(npages, 0, 0, 1, 0, npages, flags, &pgl))
2231 		return -1;
2232 
2233 	while ((pg = TAILQ_FIRST(&pgl)) != NULL) {
2234 		TAILQ_REMOVE(&pgl, pg, pageq);
2235 		upci->upci_pages[upci->upci_npages] = pg;
2236 		upci->upci_npages++;
2237 	}
2238 	atomic_add_int(&uvmexp.percpucaches, npages);
2239 
2240 	return 0;
2241 }
2242 
2243 struct vm_page *
2244 uvm_pmr_cache_get(int flags)
2245 {
2246 	struct uvm_pmr_cache *upc = &curcpu()->ci_uvm;
2247 	struct uvm_pmr_cache_item *upci;
2248 	struct vm_page *pg;
2249 	int s;
2250 
2251 	s = splvm();
2252 	upci = &upc->upc_magz[upc->upc_actv];
2253 	if (upci->upci_npages == 0) {
2254 		unsigned int prev;
2255 
2256 		prev = (upc->upc_actv == 0) ?  1 : 0;
2257 		upci = &upc->upc_magz[prev];
2258 		if (upci->upci_npages == 0) {
2259 			atomic_inc_int(&uvmexp.pcpmiss);
2260 			if (uvm_pmr_cache_alloc(upci)) {
2261 				splx(s);
2262 				return uvm_pmr_getone(flags);
2263 			}
2264 		}
2265 		/* Swap magazines */
2266 		upc->upc_actv = prev;
2267 	} else {
2268 		atomic_inc_int(&uvmexp.pcphit);
2269 	}
2270 
2271 	atomic_dec_int(&uvmexp.percpucaches);
2272 	upci->upci_npages--;
2273 	pg = upci->upci_pages[upci->upci_npages];
2274 	splx(s);
2275 
2276 	if (flags & UVM_PLA_ZERO)
2277 		uvm_pagezero(pg);
2278 
2279 	return pg;
2280 }
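
/*
 * Example (illustrative sketch): a consumer takes a page from the per-CPU
 * cache; on a double miss the call transparently falls back to the global
 * allocator and may therefore return NULL, depending on the flags passed.
 *
 *	pg = uvm_pmr_cache_get(UVM_PLA_NOWAIT);
 *	if (pg == NULL)
 *		return ENOMEM;
 */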
2281 
2282 void
2283 uvm_pmr_cache_free(struct uvm_pmr_cache_item *upci)
2284 {
2285 	struct pglist pgl;
2286 	unsigned int i;
2287 
2288 	splassert(IPL_VM);
2289 
2290 	TAILQ_INIT(&pgl);
2291 	for (i = 0; i < upci->upci_npages; i++)
2292 		TAILQ_INSERT_TAIL(&pgl, upci->upci_pages[i], pageq);
2293 
2294 	uvm_pmr_freepageq(&pgl);
2295 
2296 	atomic_sub_int(&uvmexp.percpucaches, upci->upci_npages);
2297 	upci->upci_npages = 0;
2298 	memset(upci->upci_pages, 0, sizeof(upci->upci_pages));
2299 }
2300 
2301 void
2302 uvm_pmr_cache_put(struct vm_page *pg)
2303 {
2304 	struct uvm_pmr_cache *upc = &curcpu()->ci_uvm;
2305 	struct uvm_pmr_cache_item *upci;
2306 	int s;
2307 
2308 	s = splvm();
2309 	upci = &upc->upc_magz[upc->upc_actv];
2310 	if (upci->upci_npages >= UVM_PMR_CACHEMAGSZ) {
2311 		unsigned int prev;
2312 
2313 		prev = (upc->upc_actv == 0) ?  1 : 0;
2314 		upci = &upc->upc_magz[prev];
2315 		if (upci->upci_npages > 0)
2316 			uvm_pmr_cache_free(upci);
2317 
2318 		/* Swap magazines */
2319 		upc->upc_actv = prev;
2320 		KASSERT(upci->upci_npages == 0);
2321 	}
2322 
2323 	upci->upci_pages[upci->upci_npages] = pg;
2324 	upci->upci_npages++;
2325 	atomic_inc_int(&uvmexp.percpucaches);
2326 	splx(s);
2327 }
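
/*
 * Note on the two-magazine scheme above: when the active magazine is full,
 * the inactive one is flushed back to the global allocator and the two are
 * swapped, so a single put never frees more than UVM_PMR_CACHEMAGSZ pages
 * at once and matched get/put pairs keep hitting the per-CPU cache.
 */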
2328 
2329 void
2330 uvm_pmr_cache_drain(void)
2331 {
2332 	struct uvm_pmr_cache *upc = &curcpu()->ci_uvm;
2333 	int s;
2334 
2335 	s = splvm();
2336 	uvm_pmr_cache_free(&upc->upc_magz[0]);
2337 	uvm_pmr_cache_free(&upc->upc_magz[1]);
2338 	splx(s);
2339 }
2340 
2341 #else /* !(MULTIPROCESSOR && __HAVE_UVM_PERCPU) */
2342 
2343 struct vm_page *
2344 uvm_pmr_cache_get(int flags)
2345 {
2346 	return uvm_pmr_getone(flags);
2347 }
2348 
2349 void
2350 uvm_pmr_cache_put(struct vm_page *pg)
2351 {
2352 	uvm_pmr_freepages(pg, 1);
2353 }
2354 
2355 void
2356 uvm_pmr_cache_drain(void)
2357 {
2358 }
2359 #endif
2360