xref: /openbsd/sys/uvm/uvm_pmemrange.c (revision 9b7c3dbb)
1 /*	$OpenBSD: uvm_pmemrange.c,v 1.50 2016/01/29 11:50:40 tb Exp $	*/
2 
3 /*
4  * Copyright (c) 2009, 2010 Ariane van der Steldt <ariane@stack.nl>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/param.h>
20 #include <sys/systm.h>
21 #include <uvm/uvm.h>
22 #include <sys/malloc.h>
23 #include <sys/kernel.h>
24 #include <sys/kthread.h>
25 #include <sys/mount.h>
26 
27 /*
28  * 2 trees: addr tree and size tree.
29  *
30  * The allocator keeps chunks of free pages (called a range).
31  * Two pages are part of the same range if:
32  * - all pages in between are part of that range,
33  * - they are of the same memory type (zeroed or non-zeroed),
34  * - they are part of the same pmemrange.
35  * A pmemrange is a range of memory which is part of the same vm_physseg
36  * and has a use-count.
37  *
38  * addr tree is vm_page[0].objt
39  * size tree is vm_page[1].objt
40  *
41  * The size tree is not used for memory ranges of 1 page; instead, the
42  * single-page queue links through vm_page[0].pageq.
43  *
44  * vm_page[0].fpgsz describes the length of a free range. Two adjacent ranges
45  * are joined, unless:
46  * - they have pages in between them which are not free
47  * - they belong to different memtypes (zeroed vs dirty memory)
48  * - they are in different pmemrange areas (ISA vs non-ISA memory for instance)
49  * - they are not a continuation of the same array
50  * The latter issue is caused by vm_physseg ordering and splitting from the
51  * MD initialization machinery. The MD code is dependent on freelists and
52  * happens to split ISA memory from non-ISA memory.
53  * (Note: freelists die die die!)
54  *
55  * uvm_page_init guarantees that every vm_physseg contains an array of
56  * struct vm_page. Also, uvm_page_physload allocates an array of struct
57  * vm_page. This code depends on that array. The array may break across
58  * vm_physseg boundaries.
59  */
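/*
 * Example of the representation (see the asserts in uvm_pmr_assertvalid):
 * a free range of 3 pages starting at pg has pg[0].fpgsz == 3 while
 * pg[1].fpgsz and pg[2].fpgsz are 0; pg[0] sits in the addr tree and
 * pg[1] (i.e. pg + 1) in the size tree.
 */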
60 
61 /*
62  * Validate the flags of the page. (Used in asserts.)
63  * Any free page must have the PQ_FREE flag set.
64  * Free pages may be zeroed.
65  * Pmap flags are left untouched.
66  *
67  * The PQ_FREE flag is not checked here: by not checking, we can easily use
68  * this check on pages which are being freed.
69  */
70 #define VALID_FLAGS(pg_flags)						\
71 	(((pg_flags) & ~(PQ_FREE|PG_ZERO|PG_PMAPMASK)) == 0x0)
72 
73 /* Tree comparators. */
74 int	uvm_pmemrange_addr_cmp(struct uvm_pmemrange *, struct uvm_pmemrange *);
75 int	uvm_pmemrange_use_cmp(struct uvm_pmemrange *, struct uvm_pmemrange *);
76 int	uvm_pmr_addr_cmp(struct vm_page *, struct vm_page *);
77 int	uvm_pmr_size_cmp(struct vm_page *, struct vm_page *);
78 int	uvm_pmr_pg_to_memtype(struct vm_page *);
79 
80 #ifdef DDB
81 void	uvm_pmr_print(void);
82 #endif
83 
84 /*
85  * Memory types. The page flags are used to derive what the current memory
86  * type of a page is.
87  */
88 int
89 uvm_pmr_pg_to_memtype(struct vm_page *pg)
90 {
91 	if (pg->pg_flags & PG_ZERO)
92 		return UVM_PMR_MEMTYPE_ZERO;
93 	/* Default: dirty memory. */
94 	return UVM_PMR_MEMTYPE_DIRTY;
95 }
96 
97 /* Trees. */
98 RB_GENERATE(uvm_pmr_addr, vm_page, objt, uvm_pmr_addr_cmp);
99 RB_GENERATE(uvm_pmr_size, vm_page, objt, uvm_pmr_size_cmp);
100 RB_GENERATE(uvm_pmemrange_addr, uvm_pmemrange, pmr_addr,
101     uvm_pmemrange_addr_cmp);
102 
103 /* Validation. */
104 #ifdef DEBUG
105 void	uvm_pmr_assertvalid(struct uvm_pmemrange *pmr);
106 #else
107 #define uvm_pmr_assertvalid(pmr)	do {} while (0)
108 #endif
109 
110 psize_t			 uvm_pmr_get1page(psize_t, int, struct pglist *,
111 			    paddr_t, paddr_t, int);
112 
113 struct uvm_pmemrange	*uvm_pmr_allocpmr(void);
114 struct vm_page		*uvm_pmr_nfindsz(struct uvm_pmemrange *, psize_t, int);
115 struct vm_page		*uvm_pmr_nextsz(struct uvm_pmemrange *,
116 			    struct vm_page *, int);
117 void			 uvm_pmr_pnaddr(struct uvm_pmemrange *pmr,
118 			    struct vm_page *pg, struct vm_page **pg_prev,
119 			    struct vm_page **pg_next);
120 struct vm_page		*uvm_pmr_findnextsegment(struct uvm_pmemrange *,
121 			    struct vm_page *, paddr_t);
122 psize_t			 uvm_pmr_remove_1strange(struct pglist *, paddr_t,
123 			    struct vm_page **, int);
124 void			 uvm_pmr_split(paddr_t);
125 struct uvm_pmemrange	*uvm_pmemrange_find(paddr_t);
126 struct uvm_pmemrange	*uvm_pmemrange_use_insert(struct uvm_pmemrange_use *,
127 			    struct uvm_pmemrange *);
128 psize_t			 pow2divide(psize_t, psize_t);
129 struct vm_page		*uvm_pmr_rootupdate(struct uvm_pmemrange *,
130 			    struct vm_page *, paddr_t, paddr_t, int);
131 
132 /*
133  * Computes num/denom and rounds it up to the next power-of-2.
134  *
135  * This is a division function which calculates an approximation of
136  * num/denom, with result =~ num/denom. It is meant to be fast and doesn't
137  * have to be accurate.
138  *
139  * Providing too large a value makes the allocator slightly faster, at the
140  * risk of hitting the failure case more often. Providing too small a value
141  * makes the allocator a bit slower, but less likely to hit a failure case.
142  */
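/*
 * Worked example: pow2divide(10, 4) doubles denom 4 -> 8 -> 16 while
 * counting rshift up to 2, so it returns 1 << 2 == 4; 10/4 = 2.5 rounded
 * up to the next power of two.
 */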
143 psize_t
144 pow2divide(psize_t num, psize_t denom)
145 {
146 	int rshift;
147 
148 	for (rshift = 0; num > denom; rshift++, denom <<= 1)
149 		;
150 	return (paddr_t)1 << rshift;
151 }
152 
153 /*
154  * Predicate: lhs is a subrange of rhs.
155  *
156  * If rhs_low == 0: don't care about lower bound.
157  * If rhs_high == 0: don't care about upper bound.
158  */
159 #define PMR_IS_SUBRANGE_OF(lhs_low, lhs_high, rhs_low, rhs_high)	\
160 	(((rhs_low) == 0 || (lhs_low) >= (rhs_low)) &&			\
161 	((rhs_high) == 0 || (lhs_high) <= (rhs_high)))
162 
163 /*
164  * Predicate: lhs intersects with rhs.
165  *
166  * If rhs_low == 0: don't care about lower bound.
167  * If rhs_high == 0: don't care about upper bound.
168  * Ranges don't intersect if they have no page in common; because the
169  * upper bound is exclusive (array semantics), < instead of <= is used here.
170  */
171 #define PMR_INTERSECTS_WITH(lhs_low, lhs_high, rhs_low, rhs_high)	\
172 	(((rhs_low) == 0 || (rhs_low) < (lhs_high)) &&			\
173 	((rhs_high) == 0 || (lhs_low) < (rhs_high)))
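/*
 * Example: PMR_INTERSECTS_WITH(0, 4, 4, 8) is false; page 4 is the
 * exclusive upper bound of the first range and the first page of the
 * second, so the ranges share no page.
 */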
174 
175 /*
176  * Align to power-of-2 alignment.
177  */
178 #define PMR_ALIGN(pgno, align)						\
179 	(((pgno) + ((align) - 1)) & ~((align) - 1))
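/*
 * Example: PMR_ALIGN(5, 4) == 8 and PMR_ALIGN(8, 4) == 8 (align must be a
 * power of two).
 */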
180 
181 
182 /*
183  * Comparator: sort by address ascending.
184  */
185 int
186 uvm_pmemrange_addr_cmp(struct uvm_pmemrange *lhs, struct uvm_pmemrange *rhs)
187 {
188 	return lhs->low < rhs->low ? -1 : lhs->low > rhs->low;
189 }
190 
191 /*
192  * Comparator: sort by use ascending.
193  *
194  * The higher the use value of a range, the more devices need memory in
195  * this range. Therefore allocate from the range with the lowest use first.
196  */
197 int
198 uvm_pmemrange_use_cmp(struct uvm_pmemrange *lhs, struct uvm_pmemrange *rhs)
199 {
200 	int result;
201 
202 	result = lhs->use < rhs->use ? -1 : lhs->use > rhs->use;
203 	if (result == 0)
204 		result = uvm_pmemrange_addr_cmp(lhs, rhs);
205 	return result;
206 }
207 
208 int
209 uvm_pmr_addr_cmp(struct vm_page *lhs, struct vm_page *rhs)
210 {
211 	paddr_t lhs_addr, rhs_addr;
212 
213 	lhs_addr = VM_PAGE_TO_PHYS(lhs);
214 	rhs_addr = VM_PAGE_TO_PHYS(rhs);
215 
216 	return (lhs_addr < rhs_addr ? -1 : lhs_addr > rhs_addr);
217 }
218 
219 int
220 uvm_pmr_size_cmp(struct vm_page *lhs, struct vm_page *rhs)
221 {
222 	psize_t lhs_size, rhs_size;
223 	int cmp;
224 
225 	/* Using second tree, so we receive pg[1] instead of pg[0]. */
226 	lhs_size = (lhs - 1)->fpgsz;
227 	rhs_size = (rhs - 1)->fpgsz;
228 
229 	cmp = (lhs_size < rhs_size ? -1 : lhs_size > rhs_size);
230 	if (cmp == 0)
231 		cmp = uvm_pmr_addr_cmp(lhs - 1, rhs - 1);
232 	return cmp;
233 }
234 
235 /*
236  * Find the first range of free pages that is at least sz pages long.
237  */
238 struct vm_page *
239 uvm_pmr_nfindsz(struct uvm_pmemrange *pmr, psize_t sz, int mti)
240 {
241 	struct	vm_page *node, *best;
242 
243 	KASSERT(sz >= 1);
244 
245 	if (sz == 1 && !TAILQ_EMPTY(&pmr->single[mti]))
246 		return TAILQ_FIRST(&pmr->single[mti]);
247 
248 	node = RB_ROOT(&pmr->size[mti]);
249 	best = NULL;
250 	while (node != NULL) {
251 		if ((node - 1)->fpgsz >= sz) {
252 			best = (node - 1);
253 			node = RB_LEFT(node, objt);
254 		} else
255 			node = RB_RIGHT(node, objt);
256 	}
257 	return best;
258 }
259 
260 /*
261  * Finds the next range. The next range has a size >= pg->fpgsz.
262  * Returns NULL if no more ranges are available.
263  */
264 struct vm_page *
265 uvm_pmr_nextsz(struct uvm_pmemrange *pmr, struct vm_page *pg, int mt)
266 {
267 	struct vm_page *npg;
268 
269 	KASSERT(pmr != NULL && pg != NULL);
270 	if (pg->fpgsz == 1) {
271 		if (TAILQ_NEXT(pg, pageq) != NULL)
272 			return TAILQ_NEXT(pg, pageq);
273 		else
274 			npg = RB_MIN(uvm_pmr_size, &pmr->size[mt]);
275 	} else
276 		npg = RB_NEXT(uvm_pmr_size, &pmr->size[mt], pg + 1);
277 
278 	return npg == NULL ? NULL : npg - 1;
279 }
280 
281 /*
282  * Finds the previous and next ranges relative to the (uninserted) pg range.
283  *
284  * *pg_prev == NULL if no previous range that can join with pg is
285  * 	available.
286  * *pg_next == NULL if no next range that can join with pg is
287  * 	available.
288  */
289 void
290 uvm_pmr_pnaddr(struct uvm_pmemrange *pmr, struct vm_page *pg,
291     struct vm_page **pg_prev, struct vm_page **pg_next)
292 {
293 	KASSERT(pg_prev != NULL && pg_next != NULL);
294 
295 	*pg_next = RB_NFIND(uvm_pmr_addr, &pmr->addr, pg);
296 	if (*pg_next == NULL)
297 		*pg_prev = RB_MAX(uvm_pmr_addr, &pmr->addr);
298 	else
299 		*pg_prev = RB_PREV(uvm_pmr_addr, &pmr->addr, *pg_next);
300 
301 	KDASSERT(*pg_next == NULL ||
302 	    VM_PAGE_TO_PHYS(*pg_next) > VM_PAGE_TO_PHYS(pg));
303 	KDASSERT(*pg_prev == NULL ||
304 	    VM_PAGE_TO_PHYS(*pg_prev) < VM_PAGE_TO_PHYS(pg));
305 
306 	/* Reset if not contig. */
307 	if (*pg_prev != NULL &&
308 	    (atop(VM_PAGE_TO_PHYS(*pg_prev)) + (*pg_prev)->fpgsz
309 	    != atop(VM_PAGE_TO_PHYS(pg)) ||
310 	    *pg_prev + (*pg_prev)->fpgsz != pg || /* Array broke. */
311 	    uvm_pmr_pg_to_memtype(*pg_prev) != uvm_pmr_pg_to_memtype(pg)))
312 		*pg_prev = NULL;
313 	if (*pg_next != NULL &&
314 	    (atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz
315 	    != atop(VM_PAGE_TO_PHYS(*pg_next)) ||
316 	    pg + pg->fpgsz != *pg_next || /* Array broke. */
317 	    uvm_pmr_pg_to_memtype(*pg_next) != uvm_pmr_pg_to_memtype(pg)))
318 		*pg_next = NULL;
319 	return;
320 }
321 
322 /*
323  * Remove a range from the address tree.
324  * Address tree maintains pmr counters.
325  */
326 void
327 uvm_pmr_remove_addr(struct uvm_pmemrange *pmr, struct vm_page *pg)
328 {
329 	KDASSERT(RB_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg);
330 	KDASSERT(pg->pg_flags & PQ_FREE);
331 	RB_REMOVE(uvm_pmr_addr, &pmr->addr, pg);
332 
333 	pmr->nsegs--;
334 }
335 /*
336  * Remove a range from the size tree.
337  */
338 void
339 uvm_pmr_remove_size(struct uvm_pmemrange *pmr, struct vm_page *pg)
340 {
341 	int memtype;
342 #ifdef DEBUG
343 	struct vm_page *i;
344 #endif
345 
346 	KDASSERT(pg->fpgsz >= 1);
347 	KDASSERT(pg->pg_flags & PQ_FREE);
348 	memtype = uvm_pmr_pg_to_memtype(pg);
349 
350 	if (pg->fpgsz == 1) {
351 #ifdef DEBUG
352 		TAILQ_FOREACH(i, &pmr->single[memtype], pageq) {
353 			if (i == pg)
354 				break;
355 		}
356 		KDASSERT(i == pg);
357 #endif
358 		TAILQ_REMOVE(&pmr->single[memtype], pg, pageq);
359 	} else {
360 		KDASSERT(RB_FIND(uvm_pmr_size, &pmr->size[memtype],
361 		    pg + 1) == pg + 1);
362 		RB_REMOVE(uvm_pmr_size, &pmr->size[memtype], pg + 1);
363 	}
364 }
365 /* Remove from both trees. */
366 void
367 uvm_pmr_remove(struct uvm_pmemrange *pmr, struct vm_page *pg)
368 {
369 	uvm_pmr_assertvalid(pmr);
370 	uvm_pmr_remove_size(pmr, pg);
371 	uvm_pmr_remove_addr(pmr, pg);
372 	uvm_pmr_assertvalid(pmr);
373 }
374 
375 /*
376  * Insert the range described in pg.
377  * Returns the range thus created (which may be joined with the previous and
378  * next ranges).
379  * If no_join, the caller guarantees that the range cannot possibly join
380  * with adjacent ranges.
381  */
382 struct vm_page *
383 uvm_pmr_insert_addr(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join)
384 {
385 	struct vm_page *prev, *next;
386 
387 #ifdef DEBUG
388 	struct vm_page *i;
389 	int mt;
390 #endif
391 
392 	KDASSERT(pg->pg_flags & PQ_FREE);
393 	KDASSERT(pg->fpgsz >= 1);
394 
395 #ifdef DEBUG
396 	for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
397 		TAILQ_FOREACH(i, &pmr->single[mt], pageq)
398 			KDASSERT(i != pg);
399 		if (pg->fpgsz > 1) {
400 			KDASSERT(RB_FIND(uvm_pmr_size, &pmr->size[mt],
401 			    pg + 1) == NULL);
402 		}
403 		KDASSERT(RB_FIND(uvm_pmr_addr, &pmr->addr, pg) == NULL);
404 	}
405 #endif
406 
407 	if (!no_join) {
408 		uvm_pmr_pnaddr(pmr, pg, &prev, &next);
409 		if (next != NULL) {
410 			uvm_pmr_remove_size(pmr, next);
411 			uvm_pmr_remove_addr(pmr, next);
412 			pg->fpgsz += next->fpgsz;
413 			next->fpgsz = 0;
414 		}
415 		if (prev != NULL) {
416 			uvm_pmr_remove_size(pmr, prev);
417 			prev->fpgsz += pg->fpgsz;
418 			pg->fpgsz = 0;
419 			return prev;
420 		}
421 	}
422 
423 	RB_INSERT(uvm_pmr_addr, &pmr->addr, pg);
424 
425 	pmr->nsegs++;
426 
427 	return pg;
428 }
429 /*
430  * Insert the range described in pg into the size tree, or into the
431  * single-page queue if the range is only one page long.
432  * No joining is done here; joining happens in uvm_pmr_insert_addr.
433  * Page must already be in the address tree.
434  */
435 void
436 uvm_pmr_insert_size(struct uvm_pmemrange *pmr, struct vm_page *pg)
437 {
438 	int memtype;
439 #ifdef DEBUG
440 	struct vm_page *i;
441 	int mti;
442 #endif
443 
444 	KDASSERT(pg->fpgsz >= 1);
445 	KDASSERT(pg->pg_flags & PQ_FREE);
446 
447 	memtype = uvm_pmr_pg_to_memtype(pg);
448 #ifdef DEBUG
449 	for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) {
450 		TAILQ_FOREACH(i, &pmr->single[mti], pageq)
451 			KDASSERT(i != pg);
452 		if (pg->fpgsz > 1) {
453 			KDASSERT(RB_FIND(uvm_pmr_size, &pmr->size[mti],
454 			    pg + 1) == NULL);
455 		}
456 		KDASSERT(RB_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg);
457 	}
458 	for (i = pg; i < pg + pg->fpgsz; i++)
459 		KASSERT(uvm_pmr_pg_to_memtype(i) == memtype);
460 #endif
461 
462 	if (pg->fpgsz == 1)
463 		TAILQ_INSERT_TAIL(&pmr->single[memtype], pg, pageq);
464 	else
465 		RB_INSERT(uvm_pmr_size, &pmr->size[memtype], pg + 1);
466 }
467 /* Insert in both trees. */
468 struct vm_page *
469 uvm_pmr_insert(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join)
470 {
471 	uvm_pmr_assertvalid(pmr);
472 	pg = uvm_pmr_insert_addr(pmr, pg, no_join);
473 	uvm_pmr_insert_size(pmr, pg);
474 	uvm_pmr_assertvalid(pmr);
475 	return pg;
476 }
477 
478 /*
479  * Find the last page that is part of this segment.
480  * => pg: the range at which to start the search.
481  * => boundary: the page number boundary specification (0 = no boundary).
482  * => pmr: the pmemrange of the page.
483  *
484  * This function returns the last page of the segment (one entry before
485  * the next range); to get the next range, run TAILQ_NEXT(result, pageq).
486  * The reason is that this way, the length of the segment is easily
487  * calculated using: atop(result) - atop(pg) + 1.
488  * Hence this function also never returns NULL.
489  */
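/*
 * Example (illustrative): with boundary == 4 and pg at page number 2 in a
 * run of free pages 2-6, first_boundary becomes 4; the walk stops at page
 * 3, so the returned segment covers pages 2-3 (length 2).
 */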
490 struct vm_page *
491 uvm_pmr_findnextsegment(struct uvm_pmemrange *pmr,
492     struct vm_page *pg, paddr_t boundary)
493 {
494 	paddr_t	first_boundary;
495 	struct	vm_page *next;
496 	struct	vm_page *prev;
497 
498 	KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)) &&
499 	    pmr->high > atop(VM_PAGE_TO_PHYS(pg)));
500 	if (boundary != 0) {
501 		first_boundary =
502 		    PMR_ALIGN(atop(VM_PAGE_TO_PHYS(pg)) + 1, boundary);
503 	} else
504 		first_boundary = 0;
505 
506 	/*
507 	 * Increase next until it hits the first page of the next segment.
508 	 *
509 	 * While loop checks the following:
510 	 * - next != NULL	we have not reached the end of pgl
511 	 * - boundary == 0 || next < first_boundary
512 	 *			we do not cross a boundary
513 	 * - atop(prev) + 1 == atop(next)
514 	 *			still in the same segment
515 	 * - low <= last
516 	 * - high > last	still in the same memory range
517 	 * - memtype is equal	allocator is unable to view different memtypes
518 	 *			as part of the same segment
519 	 * - prev + 1 == next	no array breakage occurs
520 	 */
521 	prev = pg;
522 	next = TAILQ_NEXT(prev, pageq);
523 	while (next != NULL &&
524 	    (boundary == 0 || atop(VM_PAGE_TO_PHYS(next)) < first_boundary) &&
525 	    atop(VM_PAGE_TO_PHYS(prev)) + 1 == atop(VM_PAGE_TO_PHYS(next)) &&
526 	    pmr->low <= atop(VM_PAGE_TO_PHYS(next)) &&
527 	    pmr->high > atop(VM_PAGE_TO_PHYS(next)) &&
528 	    uvm_pmr_pg_to_memtype(prev) == uvm_pmr_pg_to_memtype(next) &&
529 	    prev + 1 == next) {
530 		prev = next;
531 		next = TAILQ_NEXT(prev, pageq);
532 	}
533 
534 	/*
535 	 * End of this segment.
536 	 */
537 	return prev;
538 }
539 
540 /*
541  * Remove the first segment of contiguous pages from pgl.
542  * A segment ends if it crosses boundary (unless boundary = 0) or
543  * if it would enter a different uvm_pmemrange.
544  *
545  * Work: the page range that the caller is currently working with.
546  * May be null.
547  *
548  * If is_desperate is non-zero, the smallest segment is erased. Otherwise,
549  * the first segment is erased (which, if called by uvm_pmr_getpages(),
550  * probably is the smallest or very close to it).
551  */
552 psize_t
553 uvm_pmr_remove_1strange(struct pglist *pgl, paddr_t boundary,
554     struct vm_page **work, int is_desperate)
555 {
556 	struct vm_page *start, *end, *iter, *iter_end, *inserted;
557 	psize_t count;
558 	struct uvm_pmemrange *pmr, *pmr_iter;
559 
560 	KASSERT(!TAILQ_EMPTY(pgl));
561 
562 	/*
563 	 * Initialize to first page.
564 	 * Unless desperate scan finds a better candidate, this is what'll be
565 	 * erased.
566 	 */
567 	start = TAILQ_FIRST(pgl);
568 	pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(start)));
569 	end = uvm_pmr_findnextsegment(pmr, start, boundary);
570 
571 	/*
572 	 * If we are desperate, we _really_ want to get rid of the smallest
573 	 * element (rather than a close match to the smallest element).
574 	 */
575 	if (is_desperate) {
576 		/* Linear search for smallest segment. */
577 		pmr_iter = pmr;
578 		for (iter = TAILQ_NEXT(end, pageq);
579 		    iter != NULL && start != end;
580 		    iter = TAILQ_NEXT(iter_end, pageq)) {
581 			/*
582 			 * Only update pmr if it doesn't match current
583 			 * iteration.
584 			 */
585 			if (pmr->low > atop(VM_PAGE_TO_PHYS(iter)) ||
586 			    pmr->high <= atop(VM_PAGE_TO_PHYS(iter))) {
587 				pmr_iter = uvm_pmemrange_find(atop(
588 				    VM_PAGE_TO_PHYS(iter)));
589 			}
590 
591 			iter_end = uvm_pmr_findnextsegment(pmr_iter, iter,
592 			    boundary);
593 
594 			/*
595 			 * Current iteration is smaller than best match so
596 			 * far; update.
597 			 */
598 			if (VM_PAGE_TO_PHYS(iter_end) - VM_PAGE_TO_PHYS(iter) <
599 			    VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) {
600 				start = iter;
601 				end = iter_end;
602 				pmr = pmr_iter;
603 			}
604 		}
605 	}
606 
607 	/*
608 	 * Calculate count and end of the list.
609 	 */
610 	count = atop(VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) + 1;
611 	end = TAILQ_NEXT(end, pageq);
612 
613 	/*
614 	 * Actually remove the range of pages.
615 	 *
616 	 * Sadly, this cannot be done using pointer iteration:
617 	 * vm_physseg is not guaranteed to be sorted on address, hence
618 	 * uvm_page_init() may not have initialized its array sorted by
619 	 * page number.
620 	 */
621 	for (iter = start; iter != end; iter = iter_end) {
622 		iter_end = TAILQ_NEXT(iter, pageq);
623 		TAILQ_REMOVE(pgl, iter, pageq);
624 	}
625 
626 	start->fpgsz = count;
627 	inserted = uvm_pmr_insert(pmr, start, 0);
628 
629 	/*
630 	 * If the caller was working on a range and this function modified
631 	 * that range, update the pointer.
632 	 */
633 	if (work != NULL && *work != NULL &&
634 	    atop(VM_PAGE_TO_PHYS(inserted)) <= atop(VM_PAGE_TO_PHYS(*work)) &&
635 	    atop(VM_PAGE_TO_PHYS(inserted)) + inserted->fpgsz >
636 	    atop(VM_PAGE_TO_PHYS(*work)))
637 		*work = inserted;
638 	return count;
639 }
640 
641 /*
642  * Extract a number of pages from a segment of free pages.
643  * Called by uvm_pmr_getpages.
644  *
645  * Returns the segment that was created from pages left over at the tail
646  * of the remove set of pages, or NULL if no pages were left at the tail.
647  */
648 struct vm_page *
649 uvm_pmr_extract_range(struct uvm_pmemrange *pmr, struct vm_page *pg,
650     paddr_t start, paddr_t end, struct pglist *result)
651 {
652 	struct vm_page *after, *pg_i;
653 	psize_t before_sz, after_sz;
654 #ifdef DEBUG
655 	psize_t i;
656 #endif
657 
658 	KDASSERT(end > start);
659 	KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)));
660 	KDASSERT(pmr->high >= atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz);
661 	KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) <= start);
662 	KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz >= end);
663 
664 	before_sz = start - atop(VM_PAGE_TO_PHYS(pg));
665 	after_sz = atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz - end;
666 	KDASSERT(before_sz + after_sz + (end - start) == pg->fpgsz);
667 	uvm_pmr_assertvalid(pmr);
668 
669 	uvm_pmr_remove_size(pmr, pg);
670 	if (before_sz == 0)
671 		uvm_pmr_remove_addr(pmr, pg);
672 	after = pg + before_sz + (end - start);
673 
674 	/* Add selected pages to result. */
675 	for (pg_i = pg + before_sz; pg_i != after; pg_i++) {
676 		KASSERT(pg_i->pg_flags & PQ_FREE);
677 		pg_i->fpgsz = 0;
678 		TAILQ_INSERT_TAIL(result, pg_i, pageq);
679 	}
680 
681 	/* Before handling. */
682 	if (before_sz > 0) {
683 		pg->fpgsz = before_sz;
684 		uvm_pmr_insert_size(pmr, pg);
685 	}
686 
687 	/* After handling. */
688 	if (after_sz > 0) {
689 #ifdef DEBUG
690 		for (i = 0; i < after_sz; i++) {
691 			KASSERT(!uvm_pmr_isfree(after + i));
692 		}
693 #endif
694 		KDASSERT(atop(VM_PAGE_TO_PHYS(after)) == end);
695 		after->fpgsz = after_sz;
696 		after = uvm_pmr_insert_addr(pmr, after, 1);
697 		uvm_pmr_insert_size(pmr, after);
698 	}
699 
700 	uvm_pmr_assertvalid(pmr);
701 	return (after_sz > 0 ? after : NULL);
702 }
703 
704 /*
705  * Acquire a number of pages.
706  *
707  * count:	the number of pages returned
708  * start:	lowest page number
709  * end:		highest page number +1
710  * 		(start = end = 0: no limitation)
711  * align:	power-of-2 alignment constraint (align = 1: no alignment)
712  * boundary:	power-of-2 boundary (boundary = 0: no boundary)
713  * maxseg:	maximum number of segments to return
714  * flags:	UVM_PLA_* flags
715  * result:	returned pages storage (uses pageq)
716  */
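/*
 * Illustrative call (a sketch, not taken from an actual caller): to pull
 * 16 physically contiguous pages from below 16MB one could use
 *
 *	struct pglist pgl;
 *	int error;
 *
 *	TAILQ_INIT(&pgl);
 *	error = uvm_pmr_getpages(16, 0, atop(16 * 1024 * 1024), 1, 0, 1,
 *	    UVM_PLA_WAITOK, &pgl);
 *
 * maxseg == 1 forces a single segment; the fpageq lock is taken inside,
 * so the caller must not hold it.
 */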
717 int
718 uvm_pmr_getpages(psize_t count, paddr_t start, paddr_t end, paddr_t align,
719     paddr_t boundary, int maxseg, int flags, struct pglist *result)
720 {
721 	struct	uvm_pmemrange *pmr;	/* Iterate memory ranges. */
722 	struct	vm_page *found, *f_next; /* Iterate chunks. */
723 	psize_t	fcount;			/* Current found pages. */
724 	int	fnsegs;			/* Current segment counter. */
725 	int	try, start_try;
726 	psize_t	search[3];
727 	paddr_t	fstart, fend;		/* Pages to be taken from found. */
728 	int	memtype;		/* Requested memtype. */
729 	int	memtype_init;		/* Best memtype. */
730 	int	desperate;		/* True if allocation failed. */
731 #ifdef DIAGNOSTIC
732 	struct	vm_page *diag_prev;	/* Used during validation. */
733 #endif /* DIAGNOSTIC */
734 
735 	/*
736 	 * Validate arguments.
737 	 */
738 	KASSERT(count > 0);
739 	KASSERT(start == 0 || end == 0 || start < end);
740 	KASSERT(align >= 1);
741 	KASSERT(powerof2(align));
742 	KASSERT(maxseg > 0);
743 	KASSERT(boundary == 0 || powerof2(boundary));
744 	KASSERT(boundary == 0 || maxseg * boundary >= count);
745 	KASSERT(TAILQ_EMPTY(result));
746 
747 	/*
748 	 * TRYCONTIG is a noop if you only want a single segment.
749 	 * Remove it if that's the case: otherwise it'll deny the fast
750 	 * allocation.
751 	 */
752 	if (maxseg == 1 || count == 1)
753 		flags &= ~UVM_PLA_TRYCONTIG;
754 
755 	/*
756 	 * Configure search.
757 	 *
758 	 * search[0] is one segment, only used in UVM_PLA_TRYCONTIG case.
759 	 * search[1] is multiple segments, chosen to fulfill the search in
760 	 *   approximately even-sized segments.
761 	 *   This is a good trade-off between slightly reduced allocation speed
762 	 *   and less fragmentation.
763 	 * search[2] is the worst case, in which all segments are evaluated.
764 	 *   This provides the least fragmentation, but makes the search
765  *   possibly longer (although when this search is selected, search
766  *   time no longer matters much).
767 	 *
768 	 * The exception is when maxseg == 1: since we can only fulfill that
769 	 * with one segment of size pages, only a single search type has to
770 	 * be attempted.
771 	 */
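	/*
	 * Example (illustrative): count == 32, maxseg == 4 gives
	 * search = { 32, 8, 1 }: first look for a 32 page range, then for
	 * ranges of at least pow2divide(32, 4) == 8 pages, finally any size.
	 */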
772 	if (maxseg == 1 || count == 1) {
773 		start_try = 2;
774 		search[2] = count;
775 	} else if (maxseg >= count && (flags & UVM_PLA_TRYCONTIG) == 0) {
776 		start_try = 2;
777 		search[2] = 1;
778 	} else {
779 		start_try = 0;
780 		search[0] = count;
781 		search[1] = pow2divide(count, maxseg);
782 		search[2] = 1;
783 		if ((flags & UVM_PLA_TRYCONTIG) == 0)
784 			start_try = 1;
785 		if (search[1] >= search[0]) {
786 			search[1] = search[0];
787 			start_try = 1;
788 		}
789 		if (search[2] >= search[start_try]) {
790 			start_try = 2;
791 		}
792 	}
793 
794 	/*
795 	 * Memory type: if zeroed memory is requested, traverse the zero set.
796 	 * Otherwise, traverse the dirty set.
797 	 *
798 	 * The memtype iterator is reinitialized to memtype_init on entrance
799 	 * of a pmemrange.
800 	 */
801 	if (flags & UVM_PLA_ZERO)
802 		memtype_init = UVM_PMR_MEMTYPE_ZERO;
803 	else
804 		memtype_init = UVM_PMR_MEMTYPE_DIRTY;
805 
806 	/*
807 	 * Initially, we're not desperate.
808 	 *
809 	 * Note that if we return from a sleep, we are still desperate.
810 	 * Chances are that memory pressure is still high, so resetting
811 	 * seems over-optimistic to me.
812 	 */
813 	desperate = 0;
814 
815 	uvm_lock_fpageq();
816 
817 retry:		/* Return point after sleeping. */
818 	fcount = 0;
819 	fnsegs = 0;
820 
821 retry_desperate:
822 	/*
823 	 * If we just want any page(s), go for the really fast option.
824 	 */
825 	if (count <= maxseg && align == 1 && boundary == 0 &&
826 	    (flags & UVM_PLA_TRYCONTIG) == 0) {
827 		fcount += uvm_pmr_get1page(count - fcount, memtype_init,
828 		    result, start, end, 0);
829 
830 		/*
831 		 * If we found sufficient pages, go to the success exit code.
832 		 *
833 		 * Otherwise, go immediately to fail, since we collected
834 		 * all we could anyway.
835 		 */
836 		if (fcount == count)
837 			goto out;
838 		else
839 			goto fail;
840 	}
841 
842 	/*
843 	 * The heart of the contig case.
844 	 *
845 	 * The code actually looks like this:
846 	 *
847 	 * foreach (struct pmemrange) {
848 	 *	foreach (memtype) {
849 	 *		foreach(try) {
850 	 *			foreach (free range of memtype in pmemrange,
851 	 *			    starting at search[try]) {
852 	 *				while (range has space left)
853 	 *					take from range
854 	 *			}
855 	 *		}
856 	 *	}
857 	 *
858 	 *	if next pmemrange has higher usecount than current:
859 	 *		enter desperate case (which will drain the pmemranges
860 	 *		until empty prior to moving to the next one)
861 	 * }
862 	 *
863 	 * When desperate is activated, try always starts at the highest
864 	 * value. The memtype loop uses a goto to the rescan_memtype label.
865 	 * The try loop uses a goto to the rescan label.
866 	 * The 'range has space left' loop uses the drain_found label.
867 	 *
868 	 * Writing them all as loops would take up a lot of screen space in
869 	 * the form of indentation and some parts are easier to express
870 	 * using the labels.
871 	 */
872 
873 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
874 		/* Empty range. */
875 		if (pmr->nsegs == 0)
876 			continue;
877 
878 		/* Outside requested range. */
879 		if (!PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end))
880 			continue;
881 
882 		memtype = memtype_init;
883 
884 rescan_memtype:	/* Return point at memtype++. */
885 		try = start_try;
886 
887 rescan:		/* Return point at try++. */
888 		for (found = uvm_pmr_nfindsz(pmr, search[try], memtype);
889 		    found != NULL;
890 		    found = f_next) {
891 			f_next = uvm_pmr_nextsz(pmr, found, memtype);
892 
893 			fstart = atop(VM_PAGE_TO_PHYS(found));
894 			if (start != 0)
895 				fstart = MAX(start, fstart);
896 drain_found:
897 			/*
898 			 * Throw away the first segment if fnsegs == maxseg
899 			 *
900 			 * Note that f_next is still valid after this call,
901 			 * since we only allocated from entries before f_next.
902 			 * We don't revisit the entries we already extracted
903 			 * from unless we entered the desperate case.
904 			 */
905 			if (fnsegs == maxseg) {
906 				fnsegs--;
907 				fcount -=
908 				    uvm_pmr_remove_1strange(result, boundary,
909 				    &found, desperate);
910 			}
911 
912 			fstart = PMR_ALIGN(fstart, align);
913 			fend = atop(VM_PAGE_TO_PHYS(found)) + found->fpgsz;
914 			if (end != 0)
915 				fend = MIN(end, fend);
916 			if (boundary != 0) {
917 				fend =
918 				    MIN(fend, PMR_ALIGN(fstart + 1, boundary));
919 			}
920 			if (fstart >= fend)
921 				continue;
922 			if (fend - fstart > count - fcount)
923 				fend = fstart + (count - fcount);
924 
925 			fcount += fend - fstart;
926 			fnsegs++;
927 			found = uvm_pmr_extract_range(pmr, found,
928 			    fstart, fend, result);
929 
930 			if (fcount == count)
931 				goto out;
932 
933 			/*
934 			 * If there's still space left in found, try to
935 			 * fully drain it prior to continuing.
936 			 */
937 			if (found != NULL) {
938 				fstart = fend;
939 				goto drain_found;
940 			}
941 		}
942 
943 		/* Try a smaller search now. */
944 		if (++try < nitems(search))
945 			goto rescan;
946 
947 		/*
948 		 * Exhaust all memory types prior to going to the next memory
949 		 * segment.
950 		 * This means that zero-vs-dirty are eaten prior to moving
951 		 * to a pmemrange with a higher use-count.
952 		 *
953 		 * Code is basically a difficult way of writing:
954 		 * memtype = memtype_init;
955 		 * do {
956 		 *	...;
957 		 *	memtype += 1;
958 		 *	memtype %= MEMTYPE_MAX;
959 		 * } while (memtype != memtype_init);
960 		 */
961 		memtype += 1;
962 		if (memtype == UVM_PMR_MEMTYPE_MAX)
963 			memtype = 0;
964 		if (memtype != memtype_init)
965 			goto rescan_memtype;
966 
967 		/*
968 		 * If not desperate, enter desperate case prior to eating all
969 		 * the good stuff in the next range.
970 		 */
971 		if (!desperate && TAILQ_NEXT(pmr, pmr_use) != NULL &&
972 		    TAILQ_NEXT(pmr, pmr_use)->use != pmr->use)
973 			break;
974 	}
975 
976 	/*
977 	 * Not enough memory of the requested type available. Fall back to
978 	 * less good memory that we'll clean up better later.
979 	 *
980 	 * This algorithm is not very smart though, it just starts scanning
981 	 * a different typed range, but the nicer ranges of the previous
982 	 * iteration may fall out. Hence there is a small chance of a false
983 	 * negative.
984 	 *
985 	 * When desperate: scan all sizes starting at the smallest
986 	 * (search size 1) and do not consider UVM_PLA_TRYCONTIG (which may
987 	 * allow us to hit the fast path now).
988 	 *
989 	 * Also, because we will revisit entries we scanned before, we need
990 	 * to reset the page queue, or we may end up releasing entries in
991 	 * such a way as to invalidate f_next.
992 	 */
993 	if (!desperate) {
994 		desperate = 1;
995 		start_try = nitems(search) - 1;
996 		flags &= ~UVM_PLA_TRYCONTIG;
997 
998 		while (!TAILQ_EMPTY(result))
999 			uvm_pmr_remove_1strange(result, 0, NULL, 0);
1000 		fnsegs = 0;
1001 		fcount = 0;
1002 		goto retry_desperate;
1003 	}
1004 
1005 fail:
1006 	/* Allocation failed. */
1007 	/* XXX: claim from memory reserve here */
1008 
1009 	while (!TAILQ_EMPTY(result))
1010 		uvm_pmr_remove_1strange(result, 0, NULL, 0);
1011 
1012 	if (flags & UVM_PLA_WAITOK) {
1013 		if (uvm_wait_pla(ptoa(start), ptoa(end) - 1, ptoa(count),
1014 		    flags & UVM_PLA_FAILOK) == 0)
1015 			goto retry;
1016 		KASSERT(flags & UVM_PLA_FAILOK);
1017 	} else
1018 		wakeup(&uvm.pagedaemon);
1019 	uvm_unlock_fpageq();
1020 
1021 	return ENOMEM;
1022 
1023 out:
1024 	/* Allocation successful. */
1025 	uvmexp.free -= fcount;
1026 
1027 	uvm_unlock_fpageq();
1028 
1029 	/* Update statistics and zero pages if UVM_PLA_ZERO. */
1030 #ifdef DIAGNOSTIC
1031 	fnsegs = 0;
1032 	fcount = 0;
1033 	diag_prev = NULL;
1034 #endif /* DIAGNOSTIC */
1035 	TAILQ_FOREACH(found, result, pageq) {
1036 		atomic_clearbits_int(&found->pg_flags, PG_PMAPMASK);
1037 
1038 		if (found->pg_flags & PG_ZERO) {
1039 			uvm_lock_fpageq();
1040 			uvmexp.zeropages--;
1041 			if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
1042 				wakeup(&uvmexp.zeropages);
1043 			uvm_unlock_fpageq();
1044 		}
1045 		if (flags & UVM_PLA_ZERO) {
1046 			if (found->pg_flags & PG_ZERO)
1047 				uvmexp.pga_zerohit++;
1048 			else {
1049 				uvmexp.pga_zeromiss++;
1050 				uvm_pagezero(found);
1051 			}
1052 		}
1053 		atomic_clearbits_int(&found->pg_flags, PG_ZERO|PQ_FREE);
1054 
1055 		found->uobject = NULL;
1056 		found->uanon = NULL;
1057 		found->pg_version++;
1058 
1059 		/*
1060 		 * Validate that the page matches the range criteria.
1061 		 */
1062 		KDASSERT(start == 0 || atop(VM_PAGE_TO_PHYS(found)) >= start);
1063 		KDASSERT(end == 0 || atop(VM_PAGE_TO_PHYS(found)) < end);
1064 
1065 #ifdef DIAGNOSTIC
1066 		/*
1067 		 * Update fcount (# found pages) and
1068 		 * fnsegs (# found segments) counters.
1069 		 */
1070 		if (diag_prev == NULL ||
1071 		    /* new segment if it contains a hole */
1072 		    atop(VM_PAGE_TO_PHYS(diag_prev)) + 1 !=
1073 		    atop(VM_PAGE_TO_PHYS(found)) ||
1074 		    /* new segment if it crosses boundary */
1075 		    (atop(VM_PAGE_TO_PHYS(diag_prev)) & ~(boundary - 1)) !=
1076 		    (atop(VM_PAGE_TO_PHYS(found)) & ~(boundary - 1)))
1077 			fnsegs++;
1078 		fcount++;
1079 
1080 		diag_prev = found;
1081 #endif /* DIAGNOSTIC */
1082 	}
1083 
1084 #ifdef DIAGNOSTIC
1085 	/*
1086 	 * Panic on algorithm failure.
1087 	 */
1088 	if (fcount != count || fnsegs > maxseg) {
1089 		panic("pmemrange allocation error: "
1090 		    "allocated %ld pages in %d segments, "
1091 		    "but request was %ld pages in %d segments",
1092 		    fcount, fnsegs, count, maxseg);
1093 	}
1094 #endif /* DIAGNOSTIC */
1095 
1096 	return 0;
1097 }
1098 
1099 /*
1100  * Free a number of contig pages (invoked by uvm_page_init).
1101  */
1102 void
1103 uvm_pmr_freepages(struct vm_page *pg, psize_t count)
1104 {
1105 	struct uvm_pmemrange *pmr;
1106 	psize_t i, pmr_count;
1107 	struct vm_page *firstpg = pg;
1108 
1109 	for (i = 0; i < count; i++) {
1110 		KASSERT(atop(VM_PAGE_TO_PHYS(&pg[i])) ==
1111 		    atop(VM_PAGE_TO_PHYS(pg)) + i);
1112 
1113 		if (!((pg[i].pg_flags & PQ_FREE) == 0 &&
1114 		    VALID_FLAGS(pg[i].pg_flags))) {
1115 			printf("Flags: 0x%x, will panic now.\n",
1116 			    pg[i].pg_flags);
1117 		}
1118 		KASSERT((pg[i].pg_flags & PQ_FREE) == 0 &&
1119 		    VALID_FLAGS(pg[i].pg_flags));
1120 		atomic_setbits_int(&pg[i].pg_flags, PQ_FREE);
1121 		atomic_clearbits_int(&pg[i].pg_flags, PG_ZERO);
1122 	}
1123 
1124 	uvm_lock_fpageq();
1125 
1126 	for (i = count; i > 0; i -= pmr_count) {
1127 		pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg)));
1128 		KASSERT(pmr != NULL);
1129 
1130 		pmr_count = MIN(i, pmr->high - atop(VM_PAGE_TO_PHYS(pg)));
1131 		pg->fpgsz = pmr_count;
1132 		uvm_pmr_insert(pmr, pg, 0);
1133 
1134 		uvmexp.free += pmr_count;
1135 		pg += pmr_count;
1136 	}
1137 	wakeup(&uvmexp.free);
1138 	if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
1139 		wakeup(&uvmexp.zeropages);
1140 
1141 	uvm_wakeup_pla(VM_PAGE_TO_PHYS(firstpg), ptoa(count));
1142 
1143 	uvm_unlock_fpageq();
1144 }
1145 
1146 /*
1147  * Free all pages in the queue.
1148  */
1149 void
1150 uvm_pmr_freepageq(struct pglist *pgl)
1151 {
1152 	struct vm_page *pg;
1153 	paddr_t pstart;
1154 	psize_t plen;
1155 
1156 	TAILQ_FOREACH(pg, pgl, pageq) {
1157 		if (!((pg->pg_flags & PQ_FREE) == 0 &&
1158 		    VALID_FLAGS(pg->pg_flags))) {
1159 			printf("Flags: 0x%x, will panic now.\n",
1160 			    pg->pg_flags);
1161 		}
1162 		KASSERT((pg->pg_flags & PQ_FREE) == 0 &&
1163 		    VALID_FLAGS(pg->pg_flags));
1164 		atomic_setbits_int(&pg->pg_flags, PQ_FREE);
1165 		atomic_clearbits_int(&pg->pg_flags, PG_ZERO);
1166 	}
1167 
1168 	uvm_lock_fpageq();
1169 	while (!TAILQ_EMPTY(pgl)) {
1170 		pstart = VM_PAGE_TO_PHYS(TAILQ_FIRST(pgl));
1171 		plen = uvm_pmr_remove_1strange(pgl, 0, NULL, 0);
1172 		uvmexp.free += plen;
1173 
1174 		uvm_wakeup_pla(pstart, ptoa(plen));
1175 	}
1176 	wakeup(&uvmexp.free);
1177 	if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
1178 		wakeup(&uvmexp.zeropages);
1179 	uvm_unlock_fpageq();
1180 
1181 	return;
1182 }
1183 
1184 /*
1185  * Store a pmemrange in the list.
1186  *
1187  * The list is sorted by use.
1188  */
1189 struct uvm_pmemrange *
1190 uvm_pmemrange_use_insert(struct uvm_pmemrange_use *useq,
1191     struct uvm_pmemrange *pmr)
1192 {
1193 	struct uvm_pmemrange *iter;
1194 	int cmp = 1;
1195 
1196 	TAILQ_FOREACH(iter, useq, pmr_use) {
1197 		cmp = uvm_pmemrange_use_cmp(pmr, iter);
1198 		if (cmp == 0)
1199 			return iter;
1200 		if (cmp == -1)
1201 			break;
1202 	}
1203 
1204 	if (iter == NULL)
1205 		TAILQ_INSERT_TAIL(useq, pmr, pmr_use);
1206 	else
1207 		TAILQ_INSERT_BEFORE(iter, pmr, pmr_use);
1208 	return NULL;
1209 }
1210 
1211 #ifdef DEBUG
1212 /*
1213  * Validation of the whole pmemrange.
1214  * Called with fpageq locked.
1215  */
1216 void
1217 uvm_pmr_assertvalid(struct uvm_pmemrange *pmr)
1218 {
1219 	struct vm_page *prev, *next, *i, *xref;
1220 	int lcv, mti;
1221 
1222 	/* Empty range */
1223 	if (pmr->nsegs == 0)
1224 		return;
1225 
1226 	/* Validate address tree. */
1227 	RB_FOREACH(i, uvm_pmr_addr, &pmr->addr) {
1228 		/* Validate the range. */
1229 		KASSERT(i->fpgsz > 0);
1230 		KASSERT(atop(VM_PAGE_TO_PHYS(i)) >= pmr->low);
1231 		KASSERT(atop(VM_PAGE_TO_PHYS(i)) + i->fpgsz
1232 		    <= pmr->high);
1233 
1234 		/* Validate each page in this range. */
1235 		for (lcv = 0; lcv < i->fpgsz; lcv++) {
1236 			/*
1237 			 * Only the first page has a size specification.
1238 			 * Rest is size 0.
1239 			 */
1240 			KASSERT(lcv == 0 || i[lcv].fpgsz == 0);
1241 			/*
1242 			 * Flag check.
1243 			 */
1244 			KASSERT(VALID_FLAGS(i[lcv].pg_flags) &&
1245 			    (i[lcv].pg_flags & PQ_FREE) == PQ_FREE);
1246 			/*
1247 			 * Free pages are:
1248 			 * - not wired
1249 			 * - have no vm_anon
1250 			 * - have no uvm_object
1251 			 */
1252 			KASSERT(i[lcv].wire_count == 0);
1253 			KASSERT(i[lcv].uanon == (void*)0xdeadbeef ||
1254 			    i[lcv].uanon == NULL);
1255 			KASSERT(i[lcv].uobject == (void*)0xdeadbeef ||
1256 			    i[lcv].uobject == NULL);
1257 			/*
1258 			 * Pages in a single range always have the same
1259 			 * memtype.
1260 			 */
1261 			KASSERT(uvm_pmr_pg_to_memtype(&i[0]) ==
1262 			    uvm_pmr_pg_to_memtype(&i[lcv]));
1263 		}
1264 
1265 		/* Check that it shouldn't be joined with its predecessor. */
1266 		prev = RB_PREV(uvm_pmr_addr, &pmr->addr, i);
1267 		if (prev != NULL) {
1268 			KASSERT(uvm_pmr_pg_to_memtype(i) !=
1269 			    uvm_pmr_pg_to_memtype(prev) ||
1270 			    atop(VM_PAGE_TO_PHYS(i)) >
1271 			    atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz ||
1272 			    prev + prev->fpgsz != i);
1273 		}
1274 
1275 		/* Assert i is in the size tree as well. */
1276 		if (i->fpgsz == 1) {
1277 			TAILQ_FOREACH(xref,
1278 			    &pmr->single[uvm_pmr_pg_to_memtype(i)], pageq) {
1279 				if (xref == i)
1280 					break;
1281 			}
1282 			KASSERT(xref == i);
1283 		} else {
1284 			KASSERT(RB_FIND(uvm_pmr_size,
1285 			    &pmr->size[uvm_pmr_pg_to_memtype(i)], i + 1) ==
1286 			    i + 1);
1287 		}
1288 	}
1289 
1290 	/* Validate size tree. */
1291 	for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) {
1292 		for (i = uvm_pmr_nfindsz(pmr, 1, mti); i != NULL; i = next) {
1293 			next = uvm_pmr_nextsz(pmr, i, mti);
1294 			if (next != NULL) {
1295 				KASSERT(i->fpgsz <=
1296 				    next->fpgsz);
1297 			}
1298 
1299 			/* Assert i is in the addr tree as well. */
1300 			KASSERT(RB_FIND(uvm_pmr_addr, &pmr->addr, i) == i);
1301 
1302 			/* Assert i is of the correct memory type. */
1303 			KASSERT(uvm_pmr_pg_to_memtype(i) == mti);
1304 		}
1305 	}
1306 
1307 	/* Validate nsegs statistic. */
1308 	lcv = 0;
1309 	RB_FOREACH(i, uvm_pmr_addr, &pmr->addr)
1310 		lcv++;
1311 	KASSERT(pmr->nsegs == lcv);
1312 }
1313 #endif /* DEBUG */
1314 
1315 /*
1316  * Split pmr at split point pageno.
1317  * Called with fpageq unlocked.
1318  *
1319  * Split is only applied if a pmemrange spans pageno.
1320  */
1321 void
1322 uvm_pmr_split(paddr_t pageno)
1323 {
1324 	struct uvm_pmemrange *pmr, *drain;
1325 	struct vm_page *rebuild, *prev, *next;
1326 	psize_t prev_sz;
1327 
1328 	uvm_lock_fpageq();
1329 	pmr = uvm_pmemrange_find(pageno);
1330 	if (pmr == NULL || !(pmr->low < pageno)) {
1331 		/* No split required. */
1332 		uvm_unlock_fpageq();
1333 		return;
1334 	}
1335 
1336 	KASSERT(pmr->low < pageno);
1337 	KASSERT(pmr->high > pageno);
1338 
1339 	/*
1340 	 * uvm_pmr_allocpmr() calls into malloc() which in turn calls into
1341 	 * uvm_kmemalloc which calls into pmemrange, making the locking
1342 	 * a bit hard, so we just race!
1343 	 */
1344 	uvm_unlock_fpageq();
1345 	drain = uvm_pmr_allocpmr();
1346 	uvm_lock_fpageq();
1347 	pmr = uvm_pmemrange_find(pageno);
1348 	if (pmr == NULL || !(pmr->low < pageno)) {
1349 		/*
1350 		 * We lost the race since someone else ran this or a related
1351 		 * function, however this should be triggered very rarely so
1352 		 * we just leak the pmr.
1353 		 */
1354 		printf("uvm_pmr_split: lost one pmr\n");
1355 		uvm_unlock_fpageq();
1356 		return;
1357 	}
1358 
1359 	drain->low = pageno;
1360 	drain->high = pmr->high;
1361 	drain->use = pmr->use;
1362 
1363 	uvm_pmr_assertvalid(pmr);
1364 	uvm_pmr_assertvalid(drain);
1365 	KASSERT(drain->nsegs == 0);
1366 
1367 	RB_FOREACH(rebuild, uvm_pmr_addr, &pmr->addr) {
1368 		if (atop(VM_PAGE_TO_PHYS(rebuild)) >= pageno)
1369 			break;
1370 	}
1371 	if (rebuild == NULL)
1372 		prev = RB_MAX(uvm_pmr_addr, &pmr->addr);
1373 	else
1374 		prev = RB_PREV(uvm_pmr_addr, &pmr->addr, rebuild);
1375 	KASSERT(prev == NULL || atop(VM_PAGE_TO_PHYS(prev)) < pageno);
1376 
1377 	/*
1378 	 * Handle free chunk that spans the split point.
1379 	 */
1380 	if (prev != NULL &&
1381 	    atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz > pageno) {
1382 		psize_t before, after;
1383 
1384 		KASSERT(atop(VM_PAGE_TO_PHYS(prev)) < pageno);
1385 
1386 		uvm_pmr_remove(pmr, prev);
1387 		prev_sz = prev->fpgsz;
1388 		before = pageno - atop(VM_PAGE_TO_PHYS(prev));
1389 		after = atop(VM_PAGE_TO_PHYS(prev)) + prev_sz - pageno;
1390 
1391 		KASSERT(before > 0);
1392 		KASSERT(after > 0);
1393 
1394 		prev->fpgsz = before;
1395 		uvm_pmr_insert(pmr, prev, 1);
1396 		(prev + before)->fpgsz = after;
1397 		uvm_pmr_insert(drain, prev + before, 1);
1398 	}
1399 
1400 	/* Move free chunks that no longer fall in the range. */
1401 	for (; rebuild != NULL; rebuild = next) {
1402 		next = RB_NEXT(uvm_pmr_addr, &pmr->addr, rebuild);
1403 
1404 		uvm_pmr_remove(pmr, rebuild);
1405 		uvm_pmr_insert(drain, rebuild, 1);
1406 	}
1407 
1408 	pmr->high = pageno;
1409 	uvm_pmr_assertvalid(pmr);
1410 	uvm_pmr_assertvalid(drain);
1411 
1412 	RB_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, drain);
1413 	uvm_pmemrange_use_insert(&uvm.pmr_control.use, drain);
1414 	uvm_unlock_fpageq();
1415 }
1416 
1417 /*
1418  * Increase the usage counter for the given range of memory.
1419  *
1420  * The higher the use count of a given range of memory, the harder the
1421  * allocator tries to avoid allocating from it.
1422  *
1423  * Addresses here are in paddr_t, not page-numbers.
1424  * The lowest and highest allowed address are specified.
1425  */
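/*
 * (uvm_pmr_init() calls this for every uvm_md_constraints entry, which is
 * typically where machine-dependent DMA ranges get their higher use count.)
 */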
1426 void
1427 uvm_pmr_use_inc(paddr_t low, paddr_t high)
1428 {
1429 	struct uvm_pmemrange *pmr;
1430 	paddr_t sz;
1431 
1432 	/* pmr uses page numbers, translate low and high. */
1433 	high++;
1434 	high = atop(trunc_page(high));
1435 	low = atop(round_page(low));
1436 	uvm_pmr_split(low);
1437 	uvm_pmr_split(high);
1438 
1439 	sz = 0;
1440 	uvm_lock_fpageq();
1441 	/* Increase use count on segments in range. */
1442 	RB_FOREACH(pmr, uvm_pmemrange_addr, &uvm.pmr_control.addr) {
1443 		if (PMR_IS_SUBRANGE_OF(pmr->low, pmr->high, low, high)) {
1444 			TAILQ_REMOVE(&uvm.pmr_control.use, pmr, pmr_use);
1445 			pmr->use++;
1446 			sz += pmr->high - pmr->low;
1447 			uvm_pmemrange_use_insert(&uvm.pmr_control.use, pmr);
1448 		}
1449 		uvm_pmr_assertvalid(pmr);
1450 	}
1451 	uvm_unlock_fpageq();
1452 
1453 	KASSERT(sz >= high - low);
1454 }
1455 
1456 /*
1457  * Allocate a pmemrange.
1458  *
1459  * If called from uvm_page_init, uvm_pageboot_alloc is used.
1460  * If called after uvm_init, malloc is used.
1461  * (And if called in between, you're dead.)
1462  */
1463 struct uvm_pmemrange *
1464 uvm_pmr_allocpmr(void)
1465 {
1466 	struct uvm_pmemrange *nw;
1467 	int i;
1468 
1469 	/* We're only ever hitting the !uvm.page_init_done case for now. */
1470 	if (!uvm.page_init_done) {
1471 		nw = (struct uvm_pmemrange *)
1472 		    uvm_pageboot_alloc(sizeof(struct uvm_pmemrange));
1473 	} else {
1474 		nw = malloc(sizeof(struct uvm_pmemrange),
1475 		    M_VMMAP, M_NOWAIT);
1476 	}
1477 	KASSERT(nw != NULL);
1478 	memset(nw, 0, sizeof(struct uvm_pmemrange));
1479 	RB_INIT(&nw->addr);
1480 	for (i = 0; i < UVM_PMR_MEMTYPE_MAX; i++) {
1481 		RB_INIT(&nw->size[i]);
1482 		TAILQ_INIT(&nw->single[i]);
1483 	}
1484 	return nw;
1485 }
1486 
1487 /*
1488  * Initialization of pmr.
1489  * Called by uvm_page_init.
1490  *
1491  * Sets up pmemranges.
1492  */
1493 void
1494 uvm_pmr_init(void)
1495 {
1496 	struct uvm_pmemrange *new_pmr;
1497 	int i;
1498 
1499 	TAILQ_INIT(&uvm.pmr_control.use);
1500 	RB_INIT(&uvm.pmr_control.addr);
1501 	TAILQ_INIT(&uvm.pmr_control.allocs);
1502 
1503 	/* By default, one range for the entire address space. */
1504 	new_pmr = uvm_pmr_allocpmr();
1505 	new_pmr->low = 0;
1506 	new_pmr->high = atop((paddr_t)-1) + 1;
1507 
1508 	RB_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, new_pmr);
1509 	uvm_pmemrange_use_insert(&uvm.pmr_control.use, new_pmr);
1510 
1511 	for (i = 0; uvm_md_constraints[i] != NULL; i++) {
1512 		uvm_pmr_use_inc(uvm_md_constraints[i]->ucr_low,
1513 	    	    uvm_md_constraints[i]->ucr_high);
1514 	}
1515 }
1516 
1517 /*
1518  * Find the pmemrange that contains the given page number.
1519  *
1520  * (Manually traverses the binary tree, because that is cheaper on stack
1521  * usage.)
1522  */
1523 struct uvm_pmemrange *
1524 uvm_pmemrange_find(paddr_t pageno)
1525 {
1526 	struct uvm_pmemrange *pmr;
1527 
1528 	pmr = RB_ROOT(&uvm.pmr_control.addr);
1529 	while (pmr != NULL) {
1530 		if (pmr->low > pageno)
1531 			pmr = RB_LEFT(pmr, pmr_addr);
1532 		else if (pmr->high <= pageno)
1533 			pmr = RB_RIGHT(pmr, pmr_addr);
1534 		else
1535 			break;
1536 	}
1537 
1538 	return pmr;
1539 }
1540 
1541 #if defined(DDB) || defined(DEBUG)
1542 /*
1543  * Return true if the given page is in any of the free lists.
1544  * Used by uvm_page_printit.
1545  * This function is safe, even if the page is not on the freeq.
1546  * Note: does not apply locking, only called from ddb.
1547  */
1548 int
1549 uvm_pmr_isfree(struct vm_page *pg)
1550 {
1551 	struct vm_page *r;
1552 	struct uvm_pmemrange *pmr;
1553 
1554 	pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg)));
1555 	if (pmr == NULL)
1556 		return 0;
1557 	r = RB_NFIND(uvm_pmr_addr, &pmr->addr, pg);
1558 	if (r == NULL)
1559 		r = RB_MAX(uvm_pmr_addr, &pmr->addr);
1560 	else if (r != pg)
1561 		r = RB_PREV(uvm_pmr_addr, &pmr->addr, r);
1562 	if (r == NULL)
1563 		return 0; /* Empty tree. */
1564 
1565 	KDASSERT(atop(VM_PAGE_TO_PHYS(r)) <= atop(VM_PAGE_TO_PHYS(pg)));
1566 	return atop(VM_PAGE_TO_PHYS(r)) + r->fpgsz >
1567 	    atop(VM_PAGE_TO_PHYS(pg));
1568 }
1569 #endif /* DDB || DEBUG */
1570 
1571 /*
1572  * Given a root of a tree, find a range which intersects start, end and
1573  * is of the same memtype.
1574  *
1575  * Page must be in the address tree.
1576  */
1577 struct vm_page*
1578 uvm_pmr_rootupdate(struct uvm_pmemrange *pmr, struct vm_page *init_root,
1579     paddr_t start, paddr_t end, int memtype)
1580 {
1581 	int	direction;
1582 	struct	vm_page *root;
1583 	struct	vm_page *high, *high_next;
1584 	struct	vm_page *low, *low_next;
1585 
1586 	KDASSERT(pmr != NULL && init_root != NULL);
1587 	root = init_root;
1588 
1589 	/* Which direction to use for searching. */
1590 	if (start != 0 && atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz <= start)
1591 		direction =  1;
1592 	else if (end != 0 && atop(VM_PAGE_TO_PHYS(root)) >= end)
1593 		direction = -1;
1594 	else /* nothing to do */
1595 		return root;
1596 
1597 	/* First, update root to fall within the chosen range. */
1598 	while (root && !PMR_INTERSECTS_WITH(
1599 	    atop(VM_PAGE_TO_PHYS(root)),
1600 	    atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz,
1601 	    start, end)) {
1602 		if (direction == 1)
1603 			root = RB_RIGHT(root, objt);
1604 		else
1605 			root = RB_LEFT(root, objt);
1606 	}
1607 	if (root == NULL || uvm_pmr_pg_to_memtype(root) == memtype)
1608 		return root;
1609 
1610 	/*
1611 	 * Root is valid, but of the wrong memtype.
1612 	 *
1613 	 * Try to find a range that has the given memtype in the subtree
1614 	 * (memtype mismatches are costly, either because the conversion
1615 	 * is expensive, or a later allocation will need to do the opposite
1616 	 * conversion, which will be expensive).
1617 	 *
1618 	 *
1619 	 * First, simply increase address until we hit something we can use.
1620 	 * Cache the upper page, so we can page-walk later.
1621 	 */
1622 	high = root;
1623 	high_next = RB_RIGHT(high, objt);
1624 	while (high_next != NULL && PMR_INTERSECTS_WITH(
1625 	    atop(VM_PAGE_TO_PHYS(high_next)),
1626 	    atop(VM_PAGE_TO_PHYS(high_next)) + high_next->fpgsz,
1627 	    start, end)) {
1628 		high = high_next;
1629 		if (uvm_pmr_pg_to_memtype(high) == memtype)
1630 			return high;
1631 		high_next = RB_RIGHT(high, objt);
1632 	}
1633 
1634 	/*
1635 	 * Second, decrease the address until we hit something we can use.
1636 	 * Cache the lower page, so we can page-walk later.
1637 	 */
1638 	low = root;
1639 	low_next = RB_LEFT(low, objt);
1640 	while (low_next != NULL && PMR_INTERSECTS_WITH(
1641 	    atop(VM_PAGE_TO_PHYS(low_next)),
1642 	    atop(VM_PAGE_TO_PHYS(low_next)) + low_next->fpgsz,
1643 	    start, end)) {
1644 		low = low_next;
1645 		if (uvm_pmr_pg_to_memtype(low) == memtype)
1646 			return low;
1647 		low_next = RB_LEFT(low, objt);
1648 	}
1649 
1650 	if (low == high)
1651 		return NULL;
1652 
1653 	/* No hits. Walk the address tree until we find something usable. */
1654 	for (low = RB_NEXT(uvm_pmr_addr, &pmr->addr, low);
1655 	    low != high;
1656 	    low = RB_NEXT(uvm_pmr_addr, &pmr->addr, low)) {
1657 		KDASSERT(PMR_IS_SUBRANGE_OF(atop(VM_PAGE_TO_PHYS(low)),
1658 	    	    atop(VM_PAGE_TO_PHYS(low)) + low->fpgsz,
1659 	    	    start, end));
1660 		if (uvm_pmr_pg_to_memtype(low) == memtype)
1661 			return low;
1662 	}
1663 
1664 	/* Nothing found. */
1665 	return NULL;
1666 }
1667 
1668 /*
1669  * Allocate any page, the fastest way. Page number constraints only.
1670  */
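/*
 * Returns the number of pages placed on result, which may be less than
 * count. Called with the fpageq lock held.
 */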
1671 psize_t
1672 uvm_pmr_get1page(psize_t count, int memtype_init, struct pglist *result,
1673     paddr_t start, paddr_t end, int memtype_only)
1674 {
1675 	struct	uvm_pmemrange *pmr;
1676 	struct	vm_page *found, *splitpg;
1677 	psize_t	fcount;
1678 	int	memtype;
1679 
1680 	fcount = 0;
1681 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
1682 		/* We're done. */
1683 		if (fcount == count)
1684 			break;
1685 
1686 		/* Outside requested range. */
1687 		if (!(start == 0 && end == 0) &&
1688 		    !PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end))
1689 			continue;
1690 
1691 		/* Range is empty. */
1692 		if (pmr->nsegs == 0)
1693 			continue;
1694 
1695 		/* Loop over all memtypes, starting at memtype_init. */
1696 		memtype = memtype_init;
1697 		while (fcount != count) {
1698 			found = TAILQ_FIRST(&pmr->single[memtype]);
1699 			/*
1700 			 * If found is outside the range, walk the list
1701 			 * until we find something that intersects with
1702 			 * boundaries.
1703 			 */
1704 			while (found && !PMR_INTERSECTS_WITH(
1705 			    atop(VM_PAGE_TO_PHYS(found)),
1706 			    atop(VM_PAGE_TO_PHYS(found)) + 1,
1707 			    start, end))
1708 				found = TAILQ_NEXT(found, pageq);
1709 
1710 			if (found == NULL) {
1711 				/*
1712 				 * Check if the size tree contains a range
1713 				 * that intersects with the boundaries. As the
1714 				 * allocation is for any page, try the smallest
1715 				 * range so that large ranges are preserved for
1716 				 * more constrained cases. Only one entry is
1717 				 * checked here, to avoid a brute-force search.
1718 				 *
1719 				 * Note that a size tree gives pg[1] instead of
1720 				 * pg[0].
1721 				 */
1722 				found = RB_MIN(uvm_pmr_size,
1723 				    &pmr->size[memtype]);
1724 				if (found != NULL) {
1725 					found--;
1726 					if (!PMR_INTERSECTS_WITH(
1727 					    atop(VM_PAGE_TO_PHYS(found)),
1728 					    atop(VM_PAGE_TO_PHYS(found)) +
1729 					    found->fpgsz, start, end))
1730 						found = NULL;
1731 				}
1732 			}
1733 			if (found == NULL) {
1734 				/*
1735 				 * Try address-guided search to meet the page
1736 				 * number constraints.
1737 				 */
1738 				found = RB_ROOT(&pmr->addr);
1739 				if (found != NULL) {
1740 					found = uvm_pmr_rootupdate(pmr, found,
1741 					    start, end, memtype);
1742 				}
1743 			}
1744 			if (found != NULL) {
1745 				uvm_pmr_assertvalid(pmr);
1746 				uvm_pmr_remove_size(pmr, found);
1747 
1748 				/*
1749 				 * If the page intersects the end, then it'll
1750 				 * need splitting.
1751 				 *
1752 				 * Note that we don't need to split if the page
1753 				 * intersects start: the drain function will
1754 				 * simply stop on hitting start.
1755 				 */
1756 				if (end != 0 && atop(VM_PAGE_TO_PHYS(found)) +
1757 				    found->fpgsz > end) {
1758 					psize_t splitsz =
1759 					    atop(VM_PAGE_TO_PHYS(found)) +
1760 					    found->fpgsz - end;
1761 
1762 					uvm_pmr_remove_addr(pmr, found);
1763 					uvm_pmr_assertvalid(pmr);
1764 					found->fpgsz -= splitsz;
1765 					splitpg = found + found->fpgsz;
1766 					splitpg->fpgsz = splitsz;
1767 					uvm_pmr_insert(pmr, splitpg, 1);
1768 
1769 					/*
1770 					 * At this point, splitpg and found
1771 					 * actually should be joined.
1772 					 * But we explicitly disable that,
1773 					 * because we will start subtracting
1774 					 * from found.
1775 					 */
1776 					KASSERT(start == 0 ||
1777 					    atop(VM_PAGE_TO_PHYS(found)) +
1778 					    found->fpgsz > start);
1779 					uvm_pmr_insert_addr(pmr, found, 1);
1780 				}
1781 
1782 				/*
1783 				 * Fetch pages from the end.
1784 				 * If the range is larger than the requested
1785 				 * number of pages, this saves us an addr-tree
1786 				 * update.
1787 				 *
1788 				 * Since we take from the end and insert at
1789 				 * the head, existing ranges stay intact.
1790 				 */
1791 				while (found->fpgsz > 0 && fcount < count &&
1792 				    (start == 0 ||
1793 				    atop(VM_PAGE_TO_PHYS(found)) +
1794 				    found->fpgsz > start)) {
1795 					found->fpgsz--;
1796 					fcount++;
1797 					TAILQ_INSERT_HEAD(result,
1798 					    &found[found->fpgsz], pageq);
1799 				}
1800 				if (found->fpgsz > 0) {
1801 					uvm_pmr_insert_size(pmr, found);
1802 					KDASSERT(fcount == count);
1803 					uvm_pmr_assertvalid(pmr);
1804 					return fcount;
1805 				}
1806 
1807 				/*
1808 				 * Delayed addr-tree removal.
1809 				 */
1810 				uvm_pmr_remove_addr(pmr, found);
1811 				uvm_pmr_assertvalid(pmr);
1812 			} else {
1813 				if (memtype_only)
1814 					break;
1815 				/*
1816 				 * Skip to the next memtype.
1817 				 */
1818 				memtype += 1;
1819 				if (memtype == UVM_PMR_MEMTYPE_MAX)
1820 					memtype = 0;
1821 				if (memtype == memtype_init)
1822 					break;
1823 			}
1824 		}
1825 	}
1826 
1827 	/*
1828 	 * Search finished.
1829 	 *
1830 	 * Ran out of ranges before enough pages were gathered, or we hit the
1831 	 * case where found->fpgsz == count - fcount, in which case the
1832 	 * above exit condition didn't trigger.
1833 	 *
1834 	 * On failure, caller will free the pages.
1835 	 */
1836 	return fcount;
1837 }
1838 
1839 #ifdef DDB
1840 /*
1841  * Print information about pmemrange.
1842  * Does not do locking (so either call it from DDB or acquire fpageq lock
1843  * before invoking).
1844  */
1845 void
1846 uvm_pmr_print(void)
1847 {
1848 	struct	uvm_pmemrange *pmr;
1849 	struct	vm_page *pg;
1850 	psize_t	size[UVM_PMR_MEMTYPE_MAX];
1851 	psize_t	free;
1852 	int	useq_len;
1853 	int	mt;
1854 
1855 	printf("Ranges, use queue:\n");
1856 	useq_len = 0;
1857 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
1858 		useq_len++;
1859 		free = 0;
1860 		for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
1861 			pg = RB_MAX(uvm_pmr_size, &pmr->size[mt]);
1862 			if (pg != NULL)
1863 				pg--;
1864 			else
1865 				pg = TAILQ_FIRST(&pmr->single[mt]);
1866 			size[mt] = (pg == NULL ? 0 : pg->fpgsz);
1867 
1868 			RB_FOREACH(pg, uvm_pmr_addr, &pmr->addr)
1869 				free += pg->fpgsz;
1870 		}
1871 
1872 		printf("* [0x%lx-0x%lx] use=%d nsegs=%ld",
1873 		    (unsigned long)pmr->low, (unsigned long)pmr->high,
1874 		    pmr->use, (unsigned long)pmr->nsegs);
1875 		for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
1876 			printf(" maxsegsz[%d]=0x%lx", mt,
1877 			    (unsigned long)size[mt]);
1878 		}
1879 		printf(" free=0x%lx\n", (unsigned long)free);
1880 	}
1881 	printf("#ranges = %d\n", useq_len);
1882 }
1883 #endif
1884 
1885 /*
1886  * uvm_wait_pla: wait (sleep) for the page daemon to free some pages
1887  * in a specific physmem area.
1888  *
1889  * Returns ENOMEM if the pagedaemon failed to free any pages.
1890  * If not failok, the function keeps retrying until the pages are freed.
1891  *
1892  * Must be called with fpageq locked.
1893  */
1894 int
1895 uvm_wait_pla(paddr_t low, paddr_t high, paddr_t size, int failok)
1896 {
1897 	struct uvm_pmalloc pma;
1898 	const char *wmsg = "pmrwait";
1899 
1900 	if (curproc == uvm.pagedaemon_proc) {
1901 		/*
1902 		 * This is not that uncommon when the pagedaemon is trying
1903 		 * to flush out a large mmapped file. VOP_WRITE will circle
1904 		 * back through the buffer cache and try to get more memory.
1905 		 * The pagedaemon starts by calling bufbackoff, but we can
1906 		 * easily use up that reserve in a single scan iteration.
1907 		 */
1908 		uvm_unlock_fpageq();
1909 		if (bufbackoff(NULL, atop(size)) == 0) {
1910 			uvm_lock_fpageq();
1911 			return 0;
1912 		}
1913 		uvm_lock_fpageq();
1914 
1915 		/*
1916 		 * XXX detect pagedaemon deadlock - see comment in
1917 		 * uvm_wait(), as this is exactly the same issue.
1918 		 */
1919 		printf("pagedaemon: wait_pla deadlock detected!\n");
1920 		msleep(&uvmexp.free, &uvm.fpageqlock, PVM, wmsg, hz >> 3);
1921 #if defined(DEBUG)
1922 		/* DEBUG: panic so we can debug it */
1923 		panic("wait_pla pagedaemon deadlock");
1924 #endif
1925 		return 0;
1926 	}
1927 
1928 	for (;;) {
1929 		pma.pm_constraint.ucr_low = low;
1930 		pma.pm_constraint.ucr_high = high;
1931 		pma.pm_size = size;
1932 		pma.pm_flags = UVM_PMA_LINKED;
1933 		TAILQ_INSERT_TAIL(&uvm.pmr_control.allocs, &pma, pmq);
1934 
1935 		wakeup(&uvm.pagedaemon);		/* wake the daemon! */
1936 		while (pma.pm_flags & (UVM_PMA_LINKED | UVM_PMA_BUSY))
1937 			msleep(&pma, &uvm.fpageqlock, PVM, wmsg, 0);
1938 
1939 		if (!(pma.pm_flags & UVM_PMA_FREED) &&
1940 		    pma.pm_flags & UVM_PMA_FAIL) {
1941 			if (failok)
1942 				return ENOMEM;
1943 			printf("uvm_wait: failed to free %ld pages between "
1944 			    "0x%lx-0x%lx\n", atop(size), low, high);
1945 		} else
1946 			return 0;
1947 	}
1948 	/* UNREACHABLE */
1949 }
1950 
1951 /*
1952  * Wake up uvm_pmalloc sleepers.
1953  */
1954 void
1955 uvm_wakeup_pla(paddr_t low, psize_t len)
1956 {
1957 	struct uvm_pmalloc *pma, *pma_next;
1958 	paddr_t high;
1959 
1960 	high = low + len;
1961 
1962 	/* Wake specific allocations waiting for this memory. */
1963 	for (pma = TAILQ_FIRST(&uvm.pmr_control.allocs); pma != NULL;
1964 	    pma = pma_next) {
1965 		pma_next = TAILQ_NEXT(pma, pmq);
1966 
1967 		if (low < pma->pm_constraint.ucr_high &&
1968 		    high > pma->pm_constraint.ucr_low) {
1969 			pma->pm_flags |= UVM_PMA_FREED;
1970 			if (!(pma->pm_flags & UVM_PMA_BUSY)) {
1971 				pma->pm_flags &= ~UVM_PMA_LINKED;
1972 				TAILQ_REMOVE(&uvm.pmr_control.allocs, pma,
1973 				    pmq);
1974 				wakeup(pma);
1975 			}
1976 		}
1977 	}
1978 }
1979 
1980 void
1981 uvm_pagezero_thread(void *arg)
1982 {
1983 	struct pglist pgl;
1984 	struct vm_page *pg;
1985 	int count;
1986 
1987 	/* Run at the lowest possible priority. */
1988 	curproc->p_p->ps_nice = NZERO + PRIO_MAX;
1989 
1990 	KERNEL_UNLOCK();
1991 
1992 	TAILQ_INIT(&pgl);
1993 	for (;;) {
1994 		uvm_lock_fpageq();
1995 		while (uvmexp.zeropages >= UVM_PAGEZERO_TARGET ||
1996 		    (count = uvm_pmr_get1page(16, UVM_PMR_MEMTYPE_DIRTY,
1997 		     &pgl, 0, 0, 1)) == 0) {
1998 			msleep(&uvmexp.zeropages, &uvm.fpageqlock, MAXPRI,
1999 			    "pgzero", 0);
2000 		}
2001 		uvm_unlock_fpageq();
2002 
2003 		TAILQ_FOREACH(pg, &pgl, pageq) {
2004 			uvm_pagezero(pg);
2005 			atomic_setbits_int(&pg->pg_flags, PG_ZERO);
2006 		}
2007 
2008 		uvm_lock_fpageq();
2009 		while (!TAILQ_EMPTY(&pgl))
2010 			uvm_pmr_remove_1strange(&pgl, 0, NULL, 0);
2011 		uvmexp.zeropages += count;
2012  		uvm_unlock_fpageq();
2013 
2014 		yield();
2015 	}
2016 }
2017