1 /* $OpenBSD: uvm_pmemrange.c,v 1.66 2024/05/01 12:54:27 mpi Exp $ */
2
3 /*
4 * Copyright (c) 2024 Martin Pieuchot <mpi@openbsd.org>
5 * Copyright (c) 2009, 2010 Ariane van der Steldt <ariane@stack.nl>
6 *
7 * Permission to use, copy, modify, and distribute this software for any
8 * purpose with or without fee is hereby granted, provided that the above
9 * copyright notice and this permission notice appear in all copies.
10 *
11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18 */
19
20 #include <sys/param.h>
21 #include <sys/systm.h>
22 #include <uvm/uvm.h>
23 #include <sys/malloc.h>
24 #include <sys/kernel.h>
25 #include <sys/proc.h>
26 #include <sys/mount.h>
27
28 /*
29 * 2 trees: addr tree and size tree.
30 *
31 * The allocator keeps chunks of free pages (called a range).
32 * Two pages are part of the same range if:
33 * - all pages in between are part of that range,
34 * - they are of the same memory type (zeroed or non-zeroed),
35 * - they are part of the same pmemrange.
36 * A pmemrange is a range of memory which is part of the same vm_physseg
37 * and has a use-count.
38 *
39 * addr tree is vm_page[0].objt
40 * size tree is vm_page[1].objt
41 *
42  * The size tree is not used for memory ranges of 1 page; instead,
43  * single pages are kept on a queue through vm_page[0].pageq
44 *
45 * vm_page[0].fpgsz describes the length of a free range. Two adjacent ranges
46 * are joined, unless:
47 * - they have pages in between them which are not free
48 * - they belong to different memtypes (zeroed vs dirty memory)
49 * - they are in different pmemrange areas (ISA vs non-ISA memory for instance)
50 * - they are not a continuation of the same array
51 * The latter issue is caused by vm_physseg ordering and splitting from the
52  * MD initialization machinery. The MD code is dependent on freelists and
53 * happens to split ISA memory from non-ISA memory.
54 * (Note: freelists die die die!)
55 *
56 * uvm_page_init guarantees that every vm_physseg contains an array of
57 * struct vm_page. Also, uvm_page_physload allocates an array of struct
58 * vm_page. This code depends on that array. The array may break across
59  * vm_physseg boundaries.
60 */
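/*
 * Example (illustrative): a free range of three pages starting at pg is
 * kept as
 *	pg[0].fpgsz == 3	length of the free range
 *	pg[0].objt		entry in the address tree (pmr->addr)
 *	pg[1].objt		entry in the size tree (pmr->size[memtype])
 * whereas a free range of exactly one page sits on pmr->single[memtype]
 * through pg[0].pageq instead of the size tree.
 */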
61
62 /*
63 * Validate the flags of the page. (Used in asserts.)
64 * Any free page must have the PQ_FREE flag set.
65 * Free pages may be zeroed.
66 * Pmap flags are left untouched.
67 *
68 * The PQ_FREE flag is not checked here: by not checking, we can easily use
69  * this check on pages that are being freed.
70 */
71 #define VALID_FLAGS(pg_flags) \
72 (((pg_flags) & ~(PQ_FREE|PG_ZERO|PG_PMAPMASK)) == 0x0)
73
74 /* Tree comparators. */
75 int uvm_pmemrange_addr_cmp(const struct uvm_pmemrange *,
76 const struct uvm_pmemrange *);
77 int uvm_pmemrange_use_cmp(struct uvm_pmemrange *, struct uvm_pmemrange *);
78 int uvm_pmr_pg_to_memtype(struct vm_page *);
79
80 #ifdef DDB
81 void uvm_pmr_print(void);
82 #endif
83
84 /*
85 * Memory types. The page flags are used to derive what the current memory
86 * type of a page is.
87 */
88 int
89 uvm_pmr_pg_to_memtype(struct vm_page *pg)
90 {
91 if (pg->pg_flags & PG_ZERO)
92 return UVM_PMR_MEMTYPE_ZERO;
93 /* Default: dirty memory. */
94 return UVM_PMR_MEMTYPE_DIRTY;
95 }
96
97 /* Trees. */
98 RBT_GENERATE(uvm_pmr_addr, vm_page, objt, uvm_pmr_addr_cmp);
99 RBT_GENERATE(uvm_pmr_size, vm_page, objt, uvm_pmr_size_cmp);
100 RBT_GENERATE(uvm_pmemrange_addr, uvm_pmemrange, pmr_addr,
101 uvm_pmemrange_addr_cmp);
102
103 /* Validation. */
104 #ifdef DEBUG
105 void uvm_pmr_assertvalid(struct uvm_pmemrange *pmr);
106 #else
107 #define uvm_pmr_assertvalid(pmr) do {} while (0)
108 #endif
109
110 psize_t uvm_pmr_get1page(psize_t, int, struct pglist *,
111 paddr_t, paddr_t, int);
112
113 struct uvm_pmemrange *uvm_pmr_allocpmr(void);
114 struct vm_page *uvm_pmr_nfindsz(struct uvm_pmemrange *, psize_t, int);
115 struct vm_page *uvm_pmr_nextsz(struct uvm_pmemrange *,
116 struct vm_page *, int);
117 void uvm_pmr_pnaddr(struct uvm_pmemrange *pmr,
118 struct vm_page *pg, struct vm_page **pg_prev,
119 struct vm_page **pg_next);
120 struct vm_page *uvm_pmr_findnextsegment(struct uvm_pmemrange *,
121 struct vm_page *, paddr_t);
122 struct vm_page *uvm_pmr_findprevsegment(struct uvm_pmemrange *,
123 struct vm_page *, paddr_t);
124 psize_t uvm_pmr_remove_1strange(struct pglist *, paddr_t,
125 struct vm_page **, int);
126 psize_t uvm_pmr_remove_1strange_reverse(struct pglist *,
127 paddr_t *);
128 void uvm_pmr_split(paddr_t);
129 struct uvm_pmemrange *uvm_pmemrange_find(paddr_t);
130 struct uvm_pmemrange *uvm_pmemrange_use_insert(struct uvm_pmemrange_use *,
131 struct uvm_pmemrange *);
132 psize_t pow2divide(psize_t, psize_t);
133 struct vm_page *uvm_pmr_rootupdate(struct uvm_pmemrange *,
134 struct vm_page *, paddr_t, paddr_t, int);
135
136 /*
137 * Computes num/denom and rounds it up to the next power-of-2.
138 *
139 * This is a division function which calculates an approximation of
140 * num/denom, with result =~ num/denom. It is meant to be fast and doesn't
141 * have to be accurate.
142 *
143 * Providing too large a value makes the allocator slightly faster, at the
144 * risk of hitting the failure case more often. Providing too small a value
145 * makes the allocator a bit slower, but less likely to hit a failure case.
146 */
147 psize_t
148 pow2divide(psize_t num, psize_t denom)
149 {
150 int rshift;
151
152 for (rshift = 0; num > denom; rshift++, denom <<= 1)
153 ;
154 return (paddr_t)1 << rshift;
155 }
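/*
 * Example (illustrative): pow2divide(10, 3) doubles denom 3 -> 6 -> 12
 * while rshift counts 0 -> 1 -> 2, so the result is 1 << 2 == 4,
 * i.e. 10/3 rounded up to a power of two.
 */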
156
157 /*
158  * Predicate: lhs is a subrange of rhs.
159 *
160 * If rhs_low == 0: don't care about lower bound.
161 * If rhs_high == 0: don't care about upper bound.
162 */
163 #define PMR_IS_SUBRANGE_OF(lhs_low, lhs_high, rhs_low, rhs_high) \
164 (((rhs_low) == 0 || (lhs_low) >= (rhs_low)) && \
165 ((rhs_high) == 0 || (lhs_high) <= (rhs_high)))
166
167 /*
168 * Predicate: lhs intersects with rhs.
169 *
170 * If rhs_low == 0: don't care about lower bound.
171 * If rhs_high == 0: don't care about upper bound.
172  * Ranges don't intersect if they don't have any page in common; array
173  * semantics (exclusive upper bound) mean that < instead of <= is used here.
174 */
175 #define PMR_INTERSECTS_WITH(lhs_low, lhs_high, rhs_low, rhs_high) \
176 (((rhs_low) == 0 || (rhs_low) < (lhs_high)) && \
177 ((rhs_high) == 0 || (lhs_low) < (rhs_high)))
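/*
 * Example (illustrative): PMR_INTERSECTS_WITH(10, 20, 15, 30) is true
 * (pages 15-19 are common to both ranges), whereas
 * PMR_INTERSECTS_WITH(10, 20, 20, 30) is false: with exclusive upper
 * bounds the ranges merely touch and share no page.
 */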
178
179 /*
180 * Align to power-of-2 alignment.
181 */
182 #define PMR_ALIGN(pgno, align) \
183 (((pgno) + ((align) - 1)) & ~((align) - 1))
184 #define PMR_ALIGN_DOWN(pgno, align) \
185 ((pgno) & ~((align) - 1))
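/*
 * Example (illustrative): PMR_ALIGN(5, 4) == 8 and PMR_ALIGN_DOWN(5, 4) == 4;
 * already aligned page numbers are unchanged, e.g. PMR_ALIGN(8, 4) == 8.
 */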
186
187
188 /*
189 * Comparator: sort by address ascending.
190 */
191 int
192 uvm_pmemrange_addr_cmp(const struct uvm_pmemrange *lhs,
193 const struct uvm_pmemrange *rhs)
194 {
195 return lhs->low < rhs->low ? -1 : lhs->low > rhs->low;
196 }
197
198 /*
199 * Comparator: sort by use ascending.
200 *
201 * The higher the use value of a range, the more devices need memory in
202 * this range. Therefore allocate from the range with the lowest use first.
203 */
204 int
205 uvm_pmemrange_use_cmp(struct uvm_pmemrange *lhs, struct uvm_pmemrange *rhs)
206 {
207 int result;
208
209 result = lhs->use < rhs->use ? -1 : lhs->use > rhs->use;
210 if (result == 0)
211 result = uvm_pmemrange_addr_cmp(lhs, rhs);
212 return result;
213 }
214
215 int
216 uvm_pmr_addr_cmp(const struct vm_page *lhs, const struct vm_page *rhs)
217 {
218 paddr_t lhs_addr, rhs_addr;
219
220 lhs_addr = VM_PAGE_TO_PHYS(lhs);
221 rhs_addr = VM_PAGE_TO_PHYS(rhs);
222
223 return (lhs_addr < rhs_addr ? -1 : lhs_addr > rhs_addr);
224 }
225
226 int
227 uvm_pmr_size_cmp(const struct vm_page *lhs, const struct vm_page *rhs)
228 {
229 psize_t lhs_size, rhs_size;
230 int cmp;
231
232 /* Using second tree, so we receive pg[1] instead of pg[0]. */
233 lhs_size = (lhs - 1)->fpgsz;
234 rhs_size = (rhs - 1)->fpgsz;
235
236 cmp = (lhs_size < rhs_size ? -1 : lhs_size > rhs_size);
237 if (cmp == 0)
238 cmp = uvm_pmr_addr_cmp(lhs - 1, rhs - 1);
239 return cmp;
240 }
241
242 /*
243 * Find the first range of free pages that is at least sz pages long.
244 */
245 struct vm_page *
246 uvm_pmr_nfindsz(struct uvm_pmemrange *pmr, psize_t sz, int mti)
247 {
248 struct vm_page *node, *best;
249
250 KASSERT(sz >= 1);
251
252 if (sz == 1 && !TAILQ_EMPTY(&pmr->single[mti]))
253 return TAILQ_FIRST(&pmr->single[mti]);
254
255 node = RBT_ROOT(uvm_pmr_size, &pmr->size[mti]);
256 best = NULL;
257 while (node != NULL) {
258 if ((node - 1)->fpgsz >= sz) {
259 best = (node - 1);
260 node = RBT_LEFT(uvm_objtree, node);
261 } else
262 node = RBT_RIGHT(uvm_objtree, node);
263 }
264 return best;
265 }
266
267 /*
268 * Finds the next range. The next range has a size >= pg->fpgsz.
269 * Returns NULL if no more ranges are available.
270 */
271 struct vm_page *
272 uvm_pmr_nextsz(struct uvm_pmemrange *pmr, struct vm_page *pg, int mt)
273 {
274 struct vm_page *npg;
275
276 KASSERT(pmr != NULL && pg != NULL);
277 if (pg->fpgsz == 1) {
278 if (TAILQ_NEXT(pg, pageq) != NULL)
279 return TAILQ_NEXT(pg, pageq);
280 else
281 npg = RBT_MIN(uvm_pmr_size, &pmr->size[mt]);
282 } else
283 npg = RBT_NEXT(uvm_pmr_size, pg + 1);
284
285 return npg == NULL ? NULL : npg - 1;
286 }
287
288 /*
289 * Finds the previous and next ranges relative to the (uninserted) pg range.
290 *
291  * *pg_prev == NULL if no previous range that can join with pg is
292  * available.
293  * *pg_next == NULL if no next range that can join with pg is
294  * available.
295 */
296 void
297 uvm_pmr_pnaddr(struct uvm_pmemrange *pmr, struct vm_page *pg,
298 struct vm_page **pg_prev, struct vm_page **pg_next)
299 {
300 KASSERT(pg_prev != NULL && pg_next != NULL);
301
302 *pg_next = RBT_NFIND(uvm_pmr_addr, &pmr->addr, pg);
303 if (*pg_next == NULL)
304 *pg_prev = RBT_MAX(uvm_pmr_addr, &pmr->addr);
305 else
306 *pg_prev = RBT_PREV(uvm_pmr_addr, *pg_next);
307
308 KDASSERT(*pg_next == NULL ||
309 VM_PAGE_TO_PHYS(*pg_next) > VM_PAGE_TO_PHYS(pg));
310 KDASSERT(*pg_prev == NULL ||
311 VM_PAGE_TO_PHYS(*pg_prev) < VM_PAGE_TO_PHYS(pg));
312
313 /* Reset if not contig. */
314 if (*pg_prev != NULL &&
315 (atop(VM_PAGE_TO_PHYS(*pg_prev)) + (*pg_prev)->fpgsz
316 != atop(VM_PAGE_TO_PHYS(pg)) ||
317 *pg_prev + (*pg_prev)->fpgsz != pg || /* Array broke. */
318 uvm_pmr_pg_to_memtype(*pg_prev) != uvm_pmr_pg_to_memtype(pg)))
319 *pg_prev = NULL;
320 if (*pg_next != NULL &&
321 (atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz
322 != atop(VM_PAGE_TO_PHYS(*pg_next)) ||
323 pg + pg->fpgsz != *pg_next || /* Array broke. */
324 uvm_pmr_pg_to_memtype(*pg_next) != uvm_pmr_pg_to_memtype(pg)))
325 *pg_next = NULL;
326 return;
327 }
328
329 /*
330 * Remove a range from the address tree.
331 * Address tree maintains pmr counters.
332 */
333 void
334 uvm_pmr_remove_addr(struct uvm_pmemrange *pmr, struct vm_page *pg)
335 {
336 KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg);
337 KDASSERT(pg->pg_flags & PQ_FREE);
338 RBT_REMOVE(uvm_pmr_addr, &pmr->addr, pg);
339
340 pmr->nsegs--;
341 }
342 /*
343 * Remove a range from the size tree.
344 */
345 void
346 uvm_pmr_remove_size(struct uvm_pmemrange *pmr, struct vm_page *pg)
347 {
348 int memtype;
349 #ifdef DEBUG
350 struct vm_page *i;
351 #endif
352
353 KDASSERT(pg->fpgsz >= 1);
354 KDASSERT(pg->pg_flags & PQ_FREE);
355 memtype = uvm_pmr_pg_to_memtype(pg);
356
357 if (pg->fpgsz == 1) {
358 #ifdef DEBUG
359 TAILQ_FOREACH(i, &pmr->single[memtype], pageq) {
360 if (i == pg)
361 break;
362 }
363 KDASSERT(i == pg);
364 #endif
365 TAILQ_REMOVE(&pmr->single[memtype], pg, pageq);
366 } else {
367 KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[memtype],
368 pg + 1) == pg + 1);
369 RBT_REMOVE(uvm_pmr_size, &pmr->size[memtype], pg + 1);
370 }
371 }
372 /* Remove from both trees. */
373 void
374 uvm_pmr_remove(struct uvm_pmemrange *pmr, struct vm_page *pg)
375 {
376 uvm_pmr_assertvalid(pmr);
377 uvm_pmr_remove_size(pmr, pg);
378 uvm_pmr_remove_addr(pmr, pg);
379 uvm_pmr_assertvalid(pmr);
380 }
381
382 /*
383 * Insert the range described in pg.
384 * Returns the range thus created (which may be joined with the previous and
385 * next ranges).
386 * If no_join, the caller guarantees that the range cannot possibly join
387 * with adjacent ranges.
388 */
389 struct vm_page *
390 uvm_pmr_insert_addr(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join)
391 {
392 struct vm_page *prev, *next;
393
394 #ifdef DEBUG
395 struct vm_page *i;
396 int mt;
397 #endif
398
399 KDASSERT(pg->pg_flags & PQ_FREE);
400 KDASSERT(pg->fpgsz >= 1);
401
402 #ifdef DEBUG
403 for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
404 TAILQ_FOREACH(i, &pmr->single[mt], pageq)
405 KDASSERT(i != pg);
406 if (pg->fpgsz > 1) {
407 KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[mt],
408 pg + 1) == NULL);
409 }
410 KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == NULL);
411 }
412 #endif
413
414 if (!no_join) {
415 uvm_pmr_pnaddr(pmr, pg, &prev, &next);
416 if (next != NULL) {
417 uvm_pmr_remove_size(pmr, next);
418 uvm_pmr_remove_addr(pmr, next);
419 pg->fpgsz += next->fpgsz;
420 next->fpgsz = 0;
421 }
422 if (prev != NULL) {
423 uvm_pmr_remove_size(pmr, prev);
424 prev->fpgsz += pg->fpgsz;
425 pg->fpgsz = 0;
426 return prev;
427 }
428 }
429
430 RBT_INSERT(uvm_pmr_addr, &pmr->addr, pg);
431
432 pmr->nsegs++;
433
434 return pg;
435 }
436 /*
437  * Insert the size of the range described in pg into the size tree
438  * (or, for a single page, onto the single-page queue).
439  * The page must already be in the address tree; no joining with
440  * adjacent ranges is performed here.
441 */
442 void
443 uvm_pmr_insert_size(struct uvm_pmemrange *pmr, struct vm_page *pg)
444 {
445 int memtype;
446 #ifdef DEBUG
447 struct vm_page *i;
448 int mti;
449 #endif
450
451 KDASSERT(pg->fpgsz >= 1);
452 KDASSERT(pg->pg_flags & PQ_FREE);
453
454 memtype = uvm_pmr_pg_to_memtype(pg);
455 #ifdef DEBUG
456 for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) {
457 TAILQ_FOREACH(i, &pmr->single[mti], pageq)
458 KDASSERT(i != pg);
459 if (pg->fpgsz > 1) {
460 KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[mti],
461 pg + 1) == NULL);
462 }
463 KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg);
464 }
465 for (i = pg; i < pg + pg->fpgsz; i++)
466 KASSERT(uvm_pmr_pg_to_memtype(i) == memtype);
467 #endif
468
469 if (pg->fpgsz == 1)
470 TAILQ_INSERT_TAIL(&pmr->single[memtype], pg, pageq);
471 else
472 RBT_INSERT(uvm_pmr_size, &pmr->size[memtype], pg + 1);
473 }
474 /* Insert in both trees. */
475 struct vm_page *
476 uvm_pmr_insert(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join)
477 {
478 uvm_pmr_assertvalid(pmr);
479 pg = uvm_pmr_insert_addr(pmr, pg, no_join);
480 uvm_pmr_insert_size(pmr, pg);
481 uvm_pmr_assertvalid(pmr);
482 return pg;
483 }
484
485 /*
486 * Find the last page that is part of this segment.
487 * => pg: the range at which to start the search.
488 * => boundary: the page number boundary specification (0 = no boundary).
489 * => pmr: the pmemrange of the page.
490 *
491  * This function returns the page 1 before the next range, so if you want
492  * the next range, you need to run TAILQ_NEXT(result, pageq) after calling.
493 * The reason is that this way, the length of the segment is easily
494 * calculated using: atop(result) - atop(pg) + 1.
495 * Hence this function also never returns NULL.
496 */
497 struct vm_page *
498 uvm_pmr_findnextsegment(struct uvm_pmemrange *pmr,
499 struct vm_page *pg, paddr_t boundary)
500 {
501 paddr_t first_boundary;
502 struct vm_page *next;
503 struct vm_page *prev;
504
505 KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)) &&
506 pmr->high > atop(VM_PAGE_TO_PHYS(pg)));
507 if (boundary != 0) {
508 first_boundary =
509 PMR_ALIGN(atop(VM_PAGE_TO_PHYS(pg)) + 1, boundary);
510 } else
511 first_boundary = 0;
512
513 /*
514 * Increase next until it hits the first page of the next segment.
515 *
516 * While loop checks the following:
517 * - next != NULL we have not reached the end of pgl
518 * - boundary == 0 || next < first_boundary
519 * we do not cross a boundary
520 * - atop(prev) + 1 == atop(next)
521 * still in the same segment
522 * - low <= last
523 * - high > last still in the same memory range
524 * - memtype is equal allocator is unable to view different memtypes
525 * as part of the same segment
526 * - prev + 1 == next no array breakage occurs
527 */
528 prev = pg;
529 next = TAILQ_NEXT(prev, pageq);
530 while (next != NULL &&
531 (boundary == 0 || atop(VM_PAGE_TO_PHYS(next)) < first_boundary) &&
532 atop(VM_PAGE_TO_PHYS(prev)) + 1 == atop(VM_PAGE_TO_PHYS(next)) &&
533 pmr->low <= atop(VM_PAGE_TO_PHYS(next)) &&
534 pmr->high > atop(VM_PAGE_TO_PHYS(next)) &&
535 uvm_pmr_pg_to_memtype(prev) == uvm_pmr_pg_to_memtype(next) &&
536 prev + 1 == next) {
537 prev = next;
538 next = TAILQ_NEXT(prev, pageq);
539 }
540
541 /*
542 * End of this segment.
543 */
544 return prev;
545 }
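/*
 * Example (illustrative): for four physically contiguous pages queued in
 * order on pgl starting at pg, uvm_pmr_findnextsegment(pmr, pg, 0) returns
 * pg + 3 and the segment length follows as
 * atop(VM_PAGE_TO_PHYS(pg + 3)) - atop(VM_PAGE_TO_PHYS(pg)) + 1 == 4.
 */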
546
547 /*
548 * Find the first page that is part of this segment.
549 * => pg: the range at which to start the search.
550 * => boundary: the page number boundary specification (0 = no boundary).
551 * => pmr: the pmemrange of the page.
552 *
553  * This function returns the page 1 after the previous range, so if you want
554  * the previous range, you need to run TAILQ_NEXT(result, pageq) after calling.
555 * The reason is that this way, the length of the segment is easily
556 * calculated using: atop(pg) - atop(result) + 1.
557 * Hence this function also never returns NULL.
558 */
559 struct vm_page *
560 uvm_pmr_findprevsegment(struct uvm_pmemrange *pmr,
561 struct vm_page *pg, paddr_t boundary)
562 {
563 paddr_t first_boundary;
564 struct vm_page *next;
565 struct vm_page *prev;
566
567 KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)) &&
568 pmr->high > atop(VM_PAGE_TO_PHYS(pg)));
569 if (boundary != 0) {
570 first_boundary =
571 PMR_ALIGN_DOWN(atop(VM_PAGE_TO_PHYS(pg)), boundary);
572 } else
573 first_boundary = 0;
574
575 /*
576 * Increase next until it hits the first page of the previous segment.
577 *
578 * While loop checks the following:
579 * - next != NULL we have not reached the end of pgl
580 * - boundary == 0 || next >= first_boundary
581 * we do not cross a boundary
582 * - atop(prev) - 1 == atop(next)
583 * still in the same segment
584 * - low <= last
585 * - high > last still in the same memory range
586 * - memtype is equal allocator is unable to view different memtypes
587 * as part of the same segment
588 * - prev - 1 == next no array breakage occurs
589 */
590 prev = pg;
591 next = TAILQ_NEXT(prev, pageq);
592 while (next != NULL &&
593 (boundary == 0 || atop(VM_PAGE_TO_PHYS(next)) >= first_boundary) &&
594 atop(VM_PAGE_TO_PHYS(prev)) - 1 == atop(VM_PAGE_TO_PHYS(next)) &&
595 pmr->low <= atop(VM_PAGE_TO_PHYS(next)) &&
596 pmr->high > atop(VM_PAGE_TO_PHYS(next)) &&
597 uvm_pmr_pg_to_memtype(prev) == uvm_pmr_pg_to_memtype(next) &&
598 prev - 1 == next) {
599 prev = next;
600 next = TAILQ_NEXT(prev, pageq);
601 }
602
603 /*
604 * Start of this segment.
605 */
606 return prev;
607 }
608
609 /*
610 * Remove the first segment of contiguous pages from pgl.
611 * A segment ends if it crosses boundary (unless boundary = 0) or
612 * if it would enter a different uvm_pmemrange.
613 *
614 * Work: the page range that the caller is currently working with.
615 * May be null.
616 *
617 * If is_desperate is non-zero, the smallest segment is erased. Otherwise,
618 * the first segment is erased (which, if called by uvm_pmr_getpages(),
619 * probably is the smallest or very close to it).
620 */
621 psize_t
622 uvm_pmr_remove_1strange(struct pglist *pgl, paddr_t boundary,
623 struct vm_page **work, int is_desperate)
624 {
625 struct vm_page *start, *end, *iter, *iter_end, *inserted, *lowest;
626 psize_t count;
627 struct uvm_pmemrange *pmr, *pmr_iter;
628
629 KASSERT(!TAILQ_EMPTY(pgl));
630
631 /*
632 * Initialize to first page.
633 * Unless desperate scan finds a better candidate, this is what'll be
634 * erased.
635 */
636 start = TAILQ_FIRST(pgl);
637 pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(start)));
638 end = uvm_pmr_findnextsegment(pmr, start, boundary);
639
640 /*
641 * If we are desperate, we _really_ want to get rid of the smallest
642 * element (rather than a close match to the smallest element).
643 */
644 if (is_desperate) {
645 /* Linear search for smallest segment. */
646 pmr_iter = pmr;
647 for (iter = TAILQ_NEXT(end, pageq);
648 iter != NULL && start != end;
649 iter = TAILQ_NEXT(iter_end, pageq)) {
650 /*
651 * Only update pmr if it doesn't match current
652 * iteration.
653 */
654 if (pmr->low > atop(VM_PAGE_TO_PHYS(iter)) ||
655 pmr->high <= atop(VM_PAGE_TO_PHYS(iter))) {
656 pmr_iter = uvm_pmemrange_find(atop(
657 VM_PAGE_TO_PHYS(iter)));
658 }
659
660 iter_end = uvm_pmr_findnextsegment(pmr_iter, iter,
661 boundary);
662
663 /*
664 * Current iteration is smaller than best match so
665 * far; update.
666 */
667 if (VM_PAGE_TO_PHYS(iter_end) - VM_PAGE_TO_PHYS(iter) <
668 VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) {
669 start = iter;
670 end = iter_end;
671 pmr = pmr_iter;
672 }
673 }
674 }
675
676 /*
677 * Calculate count and end of the list.
678 */
679 count = atop(VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) + 1;
680 lowest = start;
681 end = TAILQ_NEXT(end, pageq);
682
683 /*
684 * Actually remove the range of pages.
685 *
686 * Sadly, this cannot be done using pointer iteration:
687 * vm_physseg is not guaranteed to be sorted on address, hence
688 * uvm_page_init() may not have initialized its array sorted by
689 * page number.
690 */
691 for (iter = start; iter != end; iter = iter_end) {
692 iter_end = TAILQ_NEXT(iter, pageq);
693 TAILQ_REMOVE(pgl, iter, pageq);
694 }
695
696 lowest->fpgsz = count;
697 inserted = uvm_pmr_insert(pmr, lowest, 0);
698
699 /*
700 * If the caller was working on a range and this function modified
701 * that range, update the pointer.
702 */
703 if (work != NULL && *work != NULL &&
704 atop(VM_PAGE_TO_PHYS(inserted)) <= atop(VM_PAGE_TO_PHYS(*work)) &&
705 atop(VM_PAGE_TO_PHYS(inserted)) + inserted->fpgsz >
706 atop(VM_PAGE_TO_PHYS(*work)))
707 *work = inserted;
708 return count;
709 }
710
711 /*
712 * Remove the first segment of contiguous pages from a pgl
713 * with the list elements in reverse order of physaddr.
714 *
715 * A segment ends if it would enter a different uvm_pmemrange.
716 *
717 * Stores starting physical address of the segment in pstart.
718 */
719 psize_t
720 uvm_pmr_remove_1strange_reverse(struct pglist *pgl, paddr_t *pstart)
721 {
722 struct vm_page *start, *end, *iter, *iter_end, *lowest;
723 psize_t count;
724 struct uvm_pmemrange *pmr;
725
726 KASSERT(!TAILQ_EMPTY(pgl));
727
728 start = TAILQ_FIRST(pgl);
729 pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(start)));
730 end = uvm_pmr_findprevsegment(pmr, start, 0);
731
732 KASSERT(end <= start);
733
734 /*
735 * Calculate count and end of the list.
736 */
737 count = atop(VM_PAGE_TO_PHYS(start) - VM_PAGE_TO_PHYS(end)) + 1;
738 lowest = end;
739 end = TAILQ_NEXT(end, pageq);
740
741 /*
742 * Actually remove the range of pages.
743 *
744 * Sadly, this cannot be done using pointer iteration:
745 * vm_physseg is not guaranteed to be sorted on address, hence
746 * uvm_page_init() may not have initialized its array sorted by
747 * page number.
748 */
749 for (iter = start; iter != end; iter = iter_end) {
750 iter_end = TAILQ_NEXT(iter, pageq);
751 TAILQ_REMOVE(pgl, iter, pageq);
752 }
753
754 lowest->fpgsz = count;
755 (void) uvm_pmr_insert(pmr, lowest, 0);
756
757 *pstart = VM_PAGE_TO_PHYS(lowest);
758 return count;
759 }
760
761 /*
762 * Extract a number of pages from a segment of free pages.
763 * Called by uvm_pmr_getpages.
764 *
765 * Returns the segment that was created from pages left over at the tail
766  * of the removed set of pages, or NULL if no pages were left at the tail.
767 */
768 struct vm_page *
769 uvm_pmr_extract_range(struct uvm_pmemrange *pmr, struct vm_page *pg,
770 paddr_t start, paddr_t end, struct pglist *result)
771 {
772 struct vm_page *after, *pg_i;
773 psize_t before_sz, after_sz;
774 #ifdef DEBUG
775 psize_t i;
776 #endif
777
778 KDASSERT(end > start);
779 KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)));
780 KDASSERT(pmr->high >= atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz);
781 KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) <= start);
782 KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz >= end);
783
784 before_sz = start - atop(VM_PAGE_TO_PHYS(pg));
785 after_sz = atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz - end;
786 KDASSERT(before_sz + after_sz + (end - start) == pg->fpgsz);
787 uvm_pmr_assertvalid(pmr);
788
789 uvm_pmr_remove_size(pmr, pg);
790 if (before_sz == 0)
791 uvm_pmr_remove_addr(pmr, pg);
792 after = pg + before_sz + (end - start);
793
794 /* Add selected pages to result. */
795 for (pg_i = pg + before_sz; pg_i != after; pg_i++) {
796 KASSERT(pg_i->pg_flags & PQ_FREE);
797 pg_i->fpgsz = 0;
798 TAILQ_INSERT_TAIL(result, pg_i, pageq);
799 }
800
801 /* Before handling. */
802 if (before_sz > 0) {
803 pg->fpgsz = before_sz;
804 uvm_pmr_insert_size(pmr, pg);
805 }
806
807 /* After handling. */
808 if (after_sz > 0) {
809 #ifdef DEBUG
810 for (i = 0; i < after_sz; i++) {
811 KASSERT(!uvm_pmr_isfree(after + i));
812 }
813 #endif
814 KDASSERT(atop(VM_PAGE_TO_PHYS(after)) == end);
815 after->fpgsz = after_sz;
816 after = uvm_pmr_insert_addr(pmr, after, 1);
817 uvm_pmr_insert_size(pmr, after);
818 }
819
820 uvm_pmr_assertvalid(pmr);
821 return (after_sz > 0 ? after : NULL);
822 }
823
824 /*
825 * Indicate to the page daemon that a nowait call failed and it should
826 * recover at least some memory in the most restricted region (assumed
827 * to be dma_constraint).
828 */
829 extern volatile int uvm_nowait_failed;
830
831 /*
832 * Acquire a number of pages.
833 *
834 * count: the number of pages returned
835 * start: lowest page number
836 * end: highest page number +1
837 * (start = end = 0: no limitation)
838 * align: power-of-2 alignment constraint (align = 1: no alignment)
839 * boundary: power-of-2 boundary (boundary = 0: no boundary)
840 * maxseg: maximum number of segments to return
841 * flags: UVM_PLA_* flags
842 * result: returned pages storage (uses pageq)
843 */
844 int
845 uvm_pmr_getpages(psize_t count, paddr_t start, paddr_t end, paddr_t align,
846 paddr_t boundary, int maxseg, int flags, struct pglist *result)
847 {
848 struct uvm_pmemrange *pmr; /* Iterate memory ranges. */
849 struct vm_page *found, *f_next; /* Iterate chunks. */
850 psize_t fcount; /* Current found pages. */
851 int fnsegs; /* Current segment counter. */
852 int try, start_try;
853 psize_t search[3];
854 paddr_t fstart, fend; /* Pages to be taken from found. */
855 int memtype; /* Requested memtype. */
856 int memtype_init; /* Best memtype. */
857 int desperate; /* True if allocation failed. */
858 #ifdef DIAGNOSTIC
859 struct vm_page *diag_prev; /* Used during validation. */
860 #endif /* DIAGNOSTIC */
861
862 /*
863 * Validate arguments.
864 */
865 KASSERT(count > 0);
866 KASSERT(start == 0 || end == 0 || start < end);
867 KASSERT(align >= 1);
868 KASSERT(powerof2(align));
869 KASSERT(maxseg > 0);
870 KASSERT(boundary == 0 || powerof2(boundary));
871 KASSERT(boundary == 0 || maxseg * boundary >= count);
872 KASSERT(TAILQ_EMPTY(result));
873 KASSERT(!(flags & UVM_PLA_WAITOK) ^ !(flags & UVM_PLA_NOWAIT));
874
875 /*
876 * TRYCONTIG is a noop if you only want a single segment.
877 * Remove it if that's the case: otherwise it'll deny the fast
878 * allocation.
879 */
880 if (maxseg == 1 || count == 1)
881 flags &= ~UVM_PLA_TRYCONTIG;
882
883 /*
884 * Configure search.
885 *
886 * search[0] is one segment, only used in UVM_PLA_TRYCONTIG case.
887 * search[1] is multiple segments, chosen to fulfill the search in
888 * approximately even-sized segments.
889 * This is a good trade-off between slightly reduced allocation speed
890 * and less fragmentation.
891 * search[2] is the worst case, in which all segments are evaluated.
892 * This provides the least fragmentation, but makes the search
893  * possibly longer (although by the time it is selected, that hardly
894  * matters anymore).
895 *
896 * The exception is when maxseg == 1: since we can only fulfill that
897 * with one segment of size pages, only a single search type has to
898 * be attempted.
899 */
900 if (maxseg == 1 || count == 1) {
901 start_try = 2;
902 search[2] = count;
903 } else if (maxseg >= count && (flags & UVM_PLA_TRYCONTIG) == 0) {
904 start_try = 2;
905 search[2] = 1;
906 } else {
907 start_try = 0;
908 search[0] = count;
909 search[1] = pow2divide(count, maxseg);
910 search[2] = 1;
911 if ((flags & UVM_PLA_TRYCONTIG) == 0)
912 start_try = 1;
913 if (search[1] >= search[0]) {
914 search[1] = search[0];
915 start_try = 1;
916 }
917 if (search[2] >= search[start_try]) {
918 start_try = 2;
919 }
920 }
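	/*
	 * Example (illustrative): count == 100, maxseg == 8 and no
	 * UVM_PLA_TRYCONTIG yields search[] = { 100, 16, 1 } and
	 * start_try == 1, i.e. the scan starts at free ranges of at
	 * least pow2divide(100, 8) == 16 pages and only falls back to
	 * single-page ranges in the worst case.
	 */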
921
922 /*
923 * Memory type: if zeroed memory is requested, traverse the zero set.
924 * Otherwise, traverse the dirty set.
925 *
926 * The memtype iterator is reinitialized to memtype_init on entrance
927 * of a pmemrange.
928 */
929 if (flags & UVM_PLA_ZERO)
930 memtype_init = UVM_PMR_MEMTYPE_ZERO;
931 else
932 memtype_init = UVM_PMR_MEMTYPE_DIRTY;
933
934 /*
935 * Initially, we're not desperate.
936 *
937 * Note that if we return from a sleep, we are still desperate.
938 * Chances are that memory pressure is still high, so resetting
939 * seems over-optimistic to me.
940 */
941 desperate = 0;
942
943 again:
944 uvm_lock_fpageq();
945
946 /*
947  * check to see if we need to generate some free pages by waking
948  * the pagedaemon.
949 */
950 if ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freemin ||
951 ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freetarg &&
952 (uvmexp.inactive + BUFPAGES_INACT) < uvmexp.inactarg))
953 wakeup(&uvm.pagedaemon);
954
955 /*
956 * fail if any of these conditions is true:
957 * [1] there really are no free pages, or
958 * [2] only kernel "reserved" pages remain and
959 * the UVM_PLA_USERESERVE flag wasn't used.
960 * [3] only pagedaemon "reserved" pages remain and
961 * the requestor isn't the pagedaemon nor the syncer.
962 */
963 if ((uvmexp.free <= (uvmexp.reserve_kernel + count)) &&
964 !(flags & UVM_PLA_USERESERVE)) {
965 uvm_unlock_fpageq();
966 return ENOMEM;
967 }
968
969 if ((uvmexp.free <= (uvmexp.reserve_pagedaemon + count)) &&
970 (curproc != uvm.pagedaemon_proc) && (curproc != syncerproc)) {
971 uvm_unlock_fpageq();
972 if (flags & UVM_PLA_WAITOK) {
973 uvm_wait("uvm_pmr_getpages");
974 goto again;
975 }
976 return ENOMEM;
977 }
978
979 retry: /* Return point after sleeping. */
980 fcount = 0;
981 fnsegs = 0;
982
983 retry_desperate:
984 /*
985 * If we just want any page(s), go for the really fast option.
986 */
987 if (count <= maxseg && align == 1 && boundary == 0 &&
988 (flags & UVM_PLA_TRYCONTIG) == 0) {
989 fcount += uvm_pmr_get1page(count - fcount, memtype_init,
990 result, start, end, 0);
991
992 /*
993 * If we found sufficient pages, go to the success exit code.
994 *
995 * Otherwise, go immediately to fail, since we collected
996 * all we could anyway.
997 */
998 if (fcount == count)
999 goto out;
1000 else
1001 goto fail;
1002 }
1003
1004 /*
1005 * The heart of the contig case.
1006 *
1007 * The code actually looks like this:
1008 *
1009 * foreach (struct pmemrange) {
1010 * foreach (memtype) {
1011 * foreach(try) {
1012 * foreach (free range of memtype in pmemrange,
1013 * starting at search[try]) {
1014 * while (range has space left)
1015 * take from range
1016 * }
1017 * }
1018 * }
1019 *
1020 * if next pmemrange has higher usecount than current:
1021 * enter desperate case (which will drain the pmemranges
1022 * until empty prior to moving to the next one)
1023 * }
1024 *
1025 * When desperate is activated, try always starts at the highest
1026 * value. The memtype loop is using a goto ReScanMemtype.
1027 * The try loop is using a goto ReScan.
1028 * The 'range has space left' loop uses label DrainFound.
1029 *
1030 * Writing them all as loops would take up a lot of screen space in
1031 * the form of indentation and some parts are easier to express
1032 * using the labels.
1033 */
1034
1035 TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
1036 /* Empty range. */
1037 if (pmr->nsegs == 0)
1038 continue;
1039
1040 /* Outside requested range. */
1041 if (!PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end))
1042 continue;
1043
1044 memtype = memtype_init;
1045
1046 rescan_memtype: /* Return point at memtype++. */
1047 try = start_try;
1048
1049 rescan: /* Return point at try++. */
1050 for (found = uvm_pmr_nfindsz(pmr, search[try], memtype);
1051 found != NULL;
1052 found = f_next) {
1053 f_next = uvm_pmr_nextsz(pmr, found, memtype);
1054
1055 fstart = atop(VM_PAGE_TO_PHYS(found));
1056 if (start != 0)
1057 fstart = MAX(start, fstart);
1058 drain_found:
1059 /*
1060 * Throw away the first segment if fnsegs == maxseg
1061 *
1062 * Note that f_next is still valid after this call,
1063 * since we only allocated from entries before f_next.
1064 * We don't revisit the entries we already extracted
1065 * from unless we entered the desperate case.
1066 */
1067 if (fnsegs == maxseg) {
1068 fnsegs--;
1069 fcount -=
1070 uvm_pmr_remove_1strange(result, boundary,
1071 &found, desperate);
1072 }
1073
1074 fstart = PMR_ALIGN(fstart, align);
1075 fend = atop(VM_PAGE_TO_PHYS(found)) + found->fpgsz;
1076 if (end != 0)
1077 fend = MIN(end, fend);
1078 if (boundary != 0) {
1079 fend =
1080 MIN(fend, PMR_ALIGN(fstart + 1, boundary));
1081 }
1082 if (fstart >= fend)
1083 continue;
1084 if (fend - fstart > count - fcount)
1085 fend = fstart + (count - fcount);
1086
1087 fcount += fend - fstart;
1088 fnsegs++;
1089 found = uvm_pmr_extract_range(pmr, found,
1090 fstart, fend, result);
1091
1092 if (fcount == count)
1093 goto out;
1094
1095 /*
1096 * If there's still space left in found, try to
1097 * fully drain it prior to continuing.
1098 */
1099 if (found != NULL) {
1100 fstart = fend;
1101 goto drain_found;
1102 }
1103 }
1104
1105 /* Try a smaller search now. */
1106 if (++try < nitems(search))
1107 goto rescan;
1108
1109 /*
1110 * Exhaust all memory types prior to going to the next memory
1111 * segment.
1112 * This means that zero-vs-dirty are eaten prior to moving
1113 * to a pmemrange with a higher use-count.
1114 *
1115 * Code is basically a difficult way of writing:
1116 * memtype = memtype_init;
1117 * do {
1118 * ...;
1119 * memtype += 1;
1120 * memtype %= MEMTYPE_MAX;
1121 * } while (memtype != memtype_init);
1122 */
1123 memtype += 1;
1124 if (memtype == UVM_PMR_MEMTYPE_MAX)
1125 memtype = 0;
1126 if (memtype != memtype_init)
1127 goto rescan_memtype;
1128
1129 /*
1130 * If not desperate, enter desperate case prior to eating all
1131 * the good stuff in the next range.
1132 */
1133 if (!desperate && TAILQ_NEXT(pmr, pmr_use) != NULL &&
1134 TAILQ_NEXT(pmr, pmr_use)->use != pmr->use)
1135 break;
1136 }
1137
1138 /*
1139 * Not enough memory of the requested type available. Fall back to
1140 * less good memory that we'll clean up better later.
1141 *
1142  * This algorithm is not very smart, though: it just starts scanning
1143 * a different typed range, but the nicer ranges of the previous
1144 * iteration may fall out. Hence there is a small chance of a false
1145 * negative.
1146 *
1147 * When desperate: scan all sizes starting at the smallest
1148 * (start_try = 1) and do not consider UVM_PLA_TRYCONTIG (which may
1149 * allow us to hit the fast path now).
1150 *
1151 * Also, because we will revisit entries we scanned before, we need
1152 * to reset the page queue, or we may end up releasing entries in
1153 * such a way as to invalidate f_next.
1154 */
1155 if (!desperate) {
1156 desperate = 1;
1157 start_try = nitems(search) - 1;
1158 flags &= ~UVM_PLA_TRYCONTIG;
1159
1160 while (!TAILQ_EMPTY(result))
1161 uvm_pmr_remove_1strange(result, 0, NULL, 0);
1162 fnsegs = 0;
1163 fcount = 0;
1164 goto retry_desperate;
1165 }
1166
1167 fail:
1168 /* Allocation failed. */
1169 /* XXX: claim from memory reserve here */
1170
1171 while (!TAILQ_EMPTY(result))
1172 uvm_pmr_remove_1strange(result, 0, NULL, 0);
1173
1174 if (flags & UVM_PLA_WAITOK) {
1175 if (uvm_wait_pla(ptoa(start), ptoa(end) - 1, ptoa(count),
1176 flags & UVM_PLA_FAILOK) == 0)
1177 goto retry;
1178 KASSERT(flags & UVM_PLA_FAILOK);
1179 } else {
1180 if (!(flags & UVM_PLA_NOWAKE)) {
1181 uvm_nowait_failed = 1;
1182 wakeup(&uvm.pagedaemon);
1183 }
1184 }
1185 uvm_unlock_fpageq();
1186
1187 return ENOMEM;
1188
1189 out:
1190 /* Allocation successful. */
1191 uvmexp.free -= fcount;
1192
1193 uvm_unlock_fpageq();
1194
1195 /* Update statistics and zero pages if UVM_PLA_ZERO. */
1196 #ifdef DIAGNOSTIC
1197 fnsegs = 0;
1198 fcount = 0;
1199 diag_prev = NULL;
1200 #endif /* DIAGNOSTIC */
1201 TAILQ_FOREACH(found, result, pageq) {
1202 atomic_clearbits_int(&found->pg_flags, PG_PMAPMASK);
1203
1204 if (found->pg_flags & PG_ZERO) {
1205 uvm_lock_fpageq();
1206 uvmexp.zeropages--;
1207 if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
1208 wakeup(&uvmexp.zeropages);
1209 uvm_unlock_fpageq();
1210 }
1211 if (flags & UVM_PLA_ZERO) {
1212 if (found->pg_flags & PG_ZERO)
1213 uvmexp.pga_zerohit++;
1214 else {
1215 uvmexp.pga_zeromiss++;
1216 uvm_pagezero(found);
1217 }
1218 }
1219 atomic_clearbits_int(&found->pg_flags, PG_ZERO|PQ_FREE);
1220
1221 found->uobject = NULL;
1222 found->uanon = NULL;
1223 found->pg_version++;
1224
1225 /*
1226  * Validate that the page matches the range criteria.
1227 */
1228 KDASSERT(start == 0 || atop(VM_PAGE_TO_PHYS(found)) >= start);
1229 KDASSERT(end == 0 || atop(VM_PAGE_TO_PHYS(found)) < end);
1230
1231 #ifdef DIAGNOSTIC
1232 /*
1233 * Update fcount (# found pages) and
1234 * fnsegs (# found segments) counters.
1235 */
1236 if (diag_prev == NULL ||
1237 /* new segment if it contains a hole */
1238 atop(VM_PAGE_TO_PHYS(diag_prev)) + 1 !=
1239 atop(VM_PAGE_TO_PHYS(found)) ||
1240 /* new segment if it crosses boundary */
1241 (atop(VM_PAGE_TO_PHYS(diag_prev)) & ~(boundary - 1)) !=
1242 (atop(VM_PAGE_TO_PHYS(found)) & ~(boundary - 1)))
1243 fnsegs++;
1244 fcount++;
1245
1246 diag_prev = found;
1247 #endif /* DIAGNOSTIC */
1248 }
1249
1250 #ifdef DIAGNOSTIC
1251 /*
1252 * Panic on algorithm failure.
1253 */
1254 if (fcount != count || fnsegs > maxseg) {
1255 panic("pmemrange allocation error: "
1256 "allocated %ld pages in %d segments, "
1257 "but request was %ld pages in %d segments",
1258 fcount, fnsegs, count, maxseg);
1259 }
1260 #endif /* DIAGNOSTIC */
1261
1262 return 0;
1263 }
1264
1265 /*
1266 * Acquire a single page.
1267 *
1268 * flags: UVM_PLA_* flags
1269 * result: returned page.
1270 */
1271 struct vm_page *
1272 uvm_pmr_getone(int flags)
1273 {
1274 struct vm_page *pg;
1275 struct pglist pgl;
1276
1277 TAILQ_INIT(&pgl);
1278 if (uvm_pmr_getpages(1, 0, 0, 1, 0, 1, flags, &pgl) != 0)
1279 return NULL;
1280
1281 pg = TAILQ_FIRST(&pgl);
1282 KASSERT(pg != NULL && TAILQ_NEXT(pg, pageq) == NULL);
1283
1284 return pg;
1285 }
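/*
 * Hypothetical multi-page call (illustrative only, not taken from a real
 * caller): allocating 16 zeroed pages below 16MB in at most 4 segments
 * could look like
 *
 *	struct pglist pgl;
 *
 *	TAILQ_INIT(&pgl);
 *	if (uvm_pmr_getpages(16, 0, atop(0x1000000), 1, 0, 4,
 *	    UVM_PLA_WAITOK | UVM_PLA_ZERO, &pgl) != 0)
 *		return ENOMEM;
 *
 * The pages are returned on pgl via their pageq entries.
 */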
1286
1287 /*
1288 * Free a number of contig pages (invoked by uvm_page_init).
1289 */
1290 void
1291 uvm_pmr_freepages(struct vm_page *pg, psize_t count)
1292 {
1293 struct uvm_pmemrange *pmr;
1294 psize_t i, pmr_count;
1295 struct vm_page *firstpg = pg;
1296
1297 for (i = 0; i < count; i++) {
1298 KASSERT(atop(VM_PAGE_TO_PHYS(&pg[i])) ==
1299 atop(VM_PAGE_TO_PHYS(pg)) + i);
1300
1301 if (!((pg[i].pg_flags & PQ_FREE) == 0 &&
1302 VALID_FLAGS(pg[i].pg_flags))) {
1303 printf("Flags: 0x%x, will panic now.\n",
1304 pg[i].pg_flags);
1305 }
1306 KASSERT((pg[i].pg_flags & PQ_FREE) == 0 &&
1307 VALID_FLAGS(pg[i].pg_flags));
1308 atomic_setbits_int(&pg[i].pg_flags, PQ_FREE);
1309 atomic_clearbits_int(&pg[i].pg_flags, PG_ZERO);
1310 }
1311
1312 uvm_lock_fpageq();
1313
1314 for (i = count; i > 0; i -= pmr_count) {
1315 pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg)));
1316 KASSERT(pmr != NULL);
1317
1318 pmr_count = MIN(i, pmr->high - atop(VM_PAGE_TO_PHYS(pg)));
1319 pg->fpgsz = pmr_count;
1320 uvm_pmr_insert(pmr, pg, 0);
1321
1322 uvmexp.free += pmr_count;
1323 pg += pmr_count;
1324 }
1325 wakeup(&uvmexp.free);
1326 if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
1327 wakeup(&uvmexp.zeropages);
1328
1329 uvm_wakeup_pla(VM_PAGE_TO_PHYS(firstpg), ptoa(count));
1330
1331 uvm_unlock_fpageq();
1332 }
1333
1334 /*
1335 * Free all pages in the queue.
1336 */
1337 void
1338 uvm_pmr_freepageq(struct pglist *pgl)
1339 {
1340 struct vm_page *pg;
1341 paddr_t pstart;
1342 psize_t plen;
1343
1344 TAILQ_FOREACH(pg, pgl, pageq) {
1345 if (!((pg->pg_flags & PQ_FREE) == 0 &&
1346 VALID_FLAGS(pg->pg_flags))) {
1347 printf("Flags: 0x%x, will panic now.\n",
1348 pg->pg_flags);
1349 }
1350 KASSERT((pg->pg_flags & PQ_FREE) == 0 &&
1351 VALID_FLAGS(pg->pg_flags));
1352 atomic_setbits_int(&pg->pg_flags, PQ_FREE);
1353 atomic_clearbits_int(&pg->pg_flags, PG_ZERO);
1354 }
1355
1356 uvm_lock_fpageq();
1357 while (!TAILQ_EMPTY(pgl)) {
1358 pg = TAILQ_FIRST(pgl);
1359 if (pg == TAILQ_NEXT(pg, pageq) + 1) {
1360 /*
1361 * If pg is one behind the position of the
1362 * next page in the list in the page array,
1363 * try going backwards instead of forward.
1364 */
1365 plen = uvm_pmr_remove_1strange_reverse(pgl, &pstart);
1366 } else {
1367 pstart = VM_PAGE_TO_PHYS(TAILQ_FIRST(pgl));
1368 plen = uvm_pmr_remove_1strange(pgl, 0, NULL, 0);
1369 }
1370 uvmexp.free += plen;
1371
1372 uvm_wakeup_pla(pstart, ptoa(plen));
1373 }
1374 wakeup(&uvmexp.free);
1375 if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
1376 wakeup(&uvmexp.zeropages);
1377 uvm_unlock_fpageq();
1378
1379 return;
1380 }
1381
1382 /*
1383 * Store a pmemrange in the list.
1384 *
1385 * The list is sorted by use.
1386 */
1387 struct uvm_pmemrange *
1388 uvm_pmemrange_use_insert(struct uvm_pmemrange_use *useq,
1389 struct uvm_pmemrange *pmr)
1390 {
1391 struct uvm_pmemrange *iter;
1392 int cmp = 1;
1393
1394 TAILQ_FOREACH(iter, useq, pmr_use) {
1395 cmp = uvm_pmemrange_use_cmp(pmr, iter);
1396 if (cmp == 0)
1397 return iter;
1398 if (cmp == -1)
1399 break;
1400 }
1401
1402 if (iter == NULL)
1403 TAILQ_INSERT_TAIL(useq, pmr, pmr_use);
1404 else
1405 TAILQ_INSERT_BEFORE(iter, pmr, pmr_use);
1406 return NULL;
1407 }
1408
1409 #ifdef DEBUG
1410 /*
1411 * Validation of the whole pmemrange.
1412 * Called with fpageq locked.
1413 */
1414 void
1415 uvm_pmr_assertvalid(struct uvm_pmemrange *pmr)
1416 {
1417 struct vm_page *prev, *next, *i, *xref;
1418 int lcv, mti;
1419
1420 /* Empty range */
1421 if (pmr->nsegs == 0)
1422 return;
1423
1424 /* Validate address tree. */
1425 RBT_FOREACH(i, uvm_pmr_addr, &pmr->addr) {
1426 /* Validate the range. */
1427 KASSERT(i->fpgsz > 0);
1428 KASSERT(atop(VM_PAGE_TO_PHYS(i)) >= pmr->low);
1429 KASSERT(atop(VM_PAGE_TO_PHYS(i)) + i->fpgsz
1430 <= pmr->high);
1431
1432 /* Validate each page in this range. */
1433 for (lcv = 0; lcv < i->fpgsz; lcv++) {
1434 /*
1435 * Only the first page has a size specification.
1436 * Rest is size 0.
1437 */
1438 KASSERT(lcv == 0 || i[lcv].fpgsz == 0);
1439 /*
1440 * Flag check.
1441 */
1442 KASSERT(VALID_FLAGS(i[lcv].pg_flags) &&
1443 (i[lcv].pg_flags & PQ_FREE) == PQ_FREE);
1444 /*
1445 * Free pages are:
1446 * - not wired
1447 * - have no vm_anon
1448 * - have no uvm_object
1449 */
1450 KASSERT(i[lcv].wire_count == 0);
1451 KASSERT(i[lcv].uanon == (void*)0xdeadbeef ||
1452 i[lcv].uanon == NULL);
1453 KASSERT(i[lcv].uobject == (void*)0xdeadbeef ||
1454 i[lcv].uobject == NULL);
1455 /*
1456 * Pages in a single range always have the same
1457 * memtype.
1458 */
1459 KASSERT(uvm_pmr_pg_to_memtype(&i[0]) ==
1460 uvm_pmr_pg_to_memtype(&i[lcv]));
1461 }
1462
1463 /* Check that it shouldn't be joined with its predecessor. */
1464 prev = RBT_PREV(uvm_pmr_addr, i);
1465 if (prev != NULL) {
1466 KASSERT(uvm_pmr_pg_to_memtype(i) !=
1467 uvm_pmr_pg_to_memtype(prev) ||
1468 atop(VM_PAGE_TO_PHYS(i)) >
1469 atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz ||
1470 prev + prev->fpgsz != i);
1471 }
1472
1473 /* Assert i is in the size tree as well. */
1474 if (i->fpgsz == 1) {
1475 TAILQ_FOREACH(xref,
1476 &pmr->single[uvm_pmr_pg_to_memtype(i)], pageq) {
1477 if (xref == i)
1478 break;
1479 }
1480 KASSERT(xref == i);
1481 } else {
1482 KASSERT(RBT_FIND(uvm_pmr_size,
1483 &pmr->size[uvm_pmr_pg_to_memtype(i)], i + 1) ==
1484 i + 1);
1485 }
1486 }
1487
1488 /* Validate size tree. */
1489 for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) {
1490 for (i = uvm_pmr_nfindsz(pmr, 1, mti); i != NULL; i = next) {
1491 next = uvm_pmr_nextsz(pmr, i, mti);
1492 if (next != NULL) {
1493 KASSERT(i->fpgsz <=
1494 next->fpgsz);
1495 }
1496
1497 /* Assert i is in the addr tree as well. */
1498 KASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, i) == i);
1499
1500 /* Assert i is of the correct memory type. */
1501 KASSERT(uvm_pmr_pg_to_memtype(i) == mti);
1502 }
1503 }
1504
1505 /* Validate nsegs statistic. */
1506 lcv = 0;
1507 RBT_FOREACH(i, uvm_pmr_addr, &pmr->addr)
1508 lcv++;
1509 KASSERT(pmr->nsegs == lcv);
1510 }
1511 #endif /* DEBUG */
1512
1513 /*
1514 * Split pmr at split point pageno.
1515 * Called with fpageq unlocked.
1516 *
1517 * Split is only applied if a pmemrange spans pageno.
1518 */
1519 void
1520 uvm_pmr_split(paddr_t pageno)
1521 {
1522 struct uvm_pmemrange *pmr, *drain;
1523 struct vm_page *rebuild, *prev, *next;
1524 psize_t prev_sz;
1525
1526 uvm_lock_fpageq();
1527 pmr = uvm_pmemrange_find(pageno);
1528 if (pmr == NULL || !(pmr->low < pageno)) {
1529 /* No split required. */
1530 uvm_unlock_fpageq();
1531 return;
1532 }
1533
1534 KASSERT(pmr->low < pageno);
1535 KASSERT(pmr->high > pageno);
1536
1537 /*
1538 * uvm_pmr_allocpmr() calls into malloc() which in turn calls into
1539 * uvm_kmemalloc which calls into pmemrange, making the locking
1540 * a bit hard, so we just race!
1541 */
1542 uvm_unlock_fpageq();
1543 drain = uvm_pmr_allocpmr();
1544 uvm_lock_fpageq();
1545 pmr = uvm_pmemrange_find(pageno);
1546 if (pmr == NULL || !(pmr->low < pageno)) {
1547 /*
1548  * We lost the race: someone else ran this or a related
1549  * function. However, this should be triggered very rarely, so
1550  * we just leak the pmr.
1551 */
1552 printf("uvm_pmr_split: lost one pmr\n");
1553 uvm_unlock_fpageq();
1554 return;
1555 }
1556
1557 drain->low = pageno;
1558 drain->high = pmr->high;
1559 drain->use = pmr->use;
1560
1561 uvm_pmr_assertvalid(pmr);
1562 uvm_pmr_assertvalid(drain);
1563 KASSERT(drain->nsegs == 0);
1564
1565 RBT_FOREACH(rebuild, uvm_pmr_addr, &pmr->addr) {
1566 if (atop(VM_PAGE_TO_PHYS(rebuild)) >= pageno)
1567 break;
1568 }
1569 if (rebuild == NULL)
1570 prev = RBT_MAX(uvm_pmr_addr, &pmr->addr);
1571 else
1572 prev = RBT_PREV(uvm_pmr_addr, rebuild);
1573 KASSERT(prev == NULL || atop(VM_PAGE_TO_PHYS(prev)) < pageno);
1574
1575 /*
1576 * Handle free chunk that spans the split point.
1577 */
1578 if (prev != NULL &&
1579 atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz > pageno) {
1580 psize_t before, after;
1581
1582 KASSERT(atop(VM_PAGE_TO_PHYS(prev)) < pageno);
1583
1584 uvm_pmr_remove(pmr, prev);
1585 prev_sz = prev->fpgsz;
1586 before = pageno - atop(VM_PAGE_TO_PHYS(prev));
1587 after = atop(VM_PAGE_TO_PHYS(prev)) + prev_sz - pageno;
1588
1589 KASSERT(before > 0);
1590 KASSERT(after > 0);
1591
1592 prev->fpgsz = before;
1593 uvm_pmr_insert(pmr, prev, 1);
1594 (prev + before)->fpgsz = after;
1595 uvm_pmr_insert(drain, prev + before, 1);
1596 }
1597
1598 /* Move free chunks that no longer fall in the range. */
1599 for (; rebuild != NULL; rebuild = next) {
1600 next = RBT_NEXT(uvm_pmr_addr, rebuild);
1601
1602 uvm_pmr_remove(pmr, rebuild);
1603 uvm_pmr_insert(drain, rebuild, 1);
1604 }
1605
1606 pmr->high = pageno;
1607 uvm_pmr_assertvalid(pmr);
1608 uvm_pmr_assertvalid(drain);
1609
1610 RBT_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, drain);
1611 uvm_pmemrange_use_insert(&uvm.pmr_control.use, drain);
1612 uvm_unlock_fpageq();
1613 }
1614
1615 /*
1616 * Increase the usage counter for the given range of memory.
1617 *
1618  * The higher the usage counter of a given range of memory, the harder
1619  * the allocator will try not to allocate from it.
1620 *
1621 * Addresses here are in paddr_t, not page-numbers.
1622 * The lowest and highest allowed address are specified.
1623 */
1624 void
1625 uvm_pmr_use_inc(paddr_t low, paddr_t high)
1626 {
1627 struct uvm_pmemrange *pmr;
1628 paddr_t sz;
1629
1630 /* pmr uses page numbers, translate low and high. */
1631 high++;
1632 high = atop(trunc_page(high));
1633 low = atop(round_page(low));
1634 uvm_pmr_split(low);
1635 uvm_pmr_split(high);
1636
1637 sz = 0;
1638 uvm_lock_fpageq();
1639 /* Increase use count on segments in range. */
1640 RBT_FOREACH(pmr, uvm_pmemrange_addr, &uvm.pmr_control.addr) {
1641 if (PMR_IS_SUBRANGE_OF(pmr->low, pmr->high, low, high)) {
1642 TAILQ_REMOVE(&uvm.pmr_control.use, pmr, pmr_use);
1643 pmr->use++;
1644 sz += pmr->high - pmr->low;
1645 uvm_pmemrange_use_insert(&uvm.pmr_control.use, pmr);
1646 }
1647 uvm_pmr_assertvalid(pmr);
1648 }
1649 uvm_unlock_fpageq();
1650
1651 KASSERT(sz >= high - low);
1652 }
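/*
 * Example (illustrative): uvm_pmr_use_inc(0, 0xffffff) translates the
 * byte range to page numbers 0 .. atop(0x1000000), splits the pmemrange
 * spanning atop(0x1000000) at that page number, and then increases the
 * use counter of every pmemrange that lies entirely below it.
 */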
1653
1654 /*
1655 * Allocate a pmemrange.
1656 *
1657 * If called from uvm_page_init, the uvm_pageboot_alloc is used.
1658 * If called after uvm_init, malloc is used.
1659 * (And if called in between, you're dead.)
1660 */
1661 struct uvm_pmemrange *
1662 uvm_pmr_allocpmr(void)
1663 {
1664 struct uvm_pmemrange *nw;
1665 int i;
1666
1667 /* We're only ever hitting the !uvm.page_init_done case for now. */
1668 if (!uvm.page_init_done) {
1669 nw = (struct uvm_pmemrange *)
1670 uvm_pageboot_alloc(sizeof(struct uvm_pmemrange));
1671 } else {
1672 nw = malloc(sizeof(struct uvm_pmemrange),
1673 M_VMMAP, M_NOWAIT);
1674 }
1675 KASSERT(nw != NULL);
1676 memset(nw, 0, sizeof(struct uvm_pmemrange));
1677 RBT_INIT(uvm_pmr_addr, &nw->addr);
1678 for (i = 0; i < UVM_PMR_MEMTYPE_MAX; i++) {
1679 RBT_INIT(uvm_pmr_size, &nw->size[i]);
1680 TAILQ_INIT(&nw->single[i]);
1681 }
1682 return nw;
1683 }
1684
1685 /*
1686 * Initialization of pmr.
1687 * Called by uvm_page_init.
1688 *
1689 * Sets up pmemranges.
1690 */
1691 void
1692 uvm_pmr_init(void)
1693 {
1694 struct uvm_pmemrange *new_pmr;
1695 int i;
1696
1697 TAILQ_INIT(&uvm.pmr_control.use);
1698 RBT_INIT(uvm_pmemrange_addr, &uvm.pmr_control.addr);
1699 TAILQ_INIT(&uvm.pmr_control.allocs);
1700
1701 /* By default, one range for the entire address space. */
1702 new_pmr = uvm_pmr_allocpmr();
1703 new_pmr->low = 0;
1704 new_pmr->high = atop((paddr_t)-1) + 1;
1705
1706 RBT_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, new_pmr);
1707 uvm_pmemrange_use_insert(&uvm.pmr_control.use, new_pmr);
1708
1709 for (i = 0; uvm_md_constraints[i] != NULL; i++) {
1710 uvm_pmr_use_inc(uvm_md_constraints[i]->ucr_low,
1711 uvm_md_constraints[i]->ucr_high);
1712 }
1713 }
1714
1715 /*
1716 * Find the pmemrange that contains the given page number.
1717 *
1718 * (Manually traverses the binary tree, because that is cheaper on stack
1719 * usage.)
1720 */
1721 struct uvm_pmemrange *
1722 uvm_pmemrange_find(paddr_t pageno)
1723 {
1724 struct uvm_pmemrange *pmr;
1725
1726 pmr = RBT_ROOT(uvm_pmemrange_addr, &uvm.pmr_control.addr);
1727 while (pmr != NULL) {
1728 if (pmr->low > pageno)
1729 pmr = RBT_LEFT(uvm_pmemrange_addr, pmr);
1730 else if (pmr->high <= pageno)
1731 pmr = RBT_RIGHT(uvm_pmemrange_addr, pmr);
1732 else
1733 break;
1734 }
1735
1736 return pmr;
1737 }
1738
1739 #if defined(DDB) || defined(DEBUG)
1740 /*
1741 * Return true if the given page is in any of the free lists.
1742 * Used by uvm_page_printit.
1743 * This function is safe, even if the page is not on the freeq.
1744 * Note: does not apply locking, only called from ddb.
1745 */
1746 int
1747 uvm_pmr_isfree(struct vm_page *pg)
1748 {
1749 struct vm_page *r;
1750 struct uvm_pmemrange *pmr;
1751
1752 pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg)));
1753 if (pmr == NULL)
1754 return 0;
1755 r = RBT_NFIND(uvm_pmr_addr, &pmr->addr, pg);
1756 if (r == NULL)
1757 r = RBT_MAX(uvm_pmr_addr, &pmr->addr);
1758 else if (r != pg)
1759 r = RBT_PREV(uvm_pmr_addr, r);
1760 if (r == NULL)
1761 return 0; /* Empty tree. */
1762
1763 KDASSERT(atop(VM_PAGE_TO_PHYS(r)) <= atop(VM_PAGE_TO_PHYS(pg)));
1764 return atop(VM_PAGE_TO_PHYS(r)) + r->fpgsz >
1765 atop(VM_PAGE_TO_PHYS(pg));
1766 }
1767 #endif /* DDB || DEBUG */
1768
1769 /*
1770 * Given a root of a tree, find a range which intersects start, end and
1771 * is of the same memtype.
1772 *
1773 * Page must be in the address tree.
1774 */
1775 struct vm_page*
1776 uvm_pmr_rootupdate(struct uvm_pmemrange *pmr, struct vm_page *init_root,
1777 paddr_t start, paddr_t end, int memtype)
1778 {
1779 int direction;
1780 struct vm_page *root;
1781 struct vm_page *high, *high_next;
1782 struct vm_page *low, *low_next;
1783
1784 KDASSERT(pmr != NULL && init_root != NULL);
1785 root = init_root;
1786
1787 /* Which direction to use for searching. */
1788 if (start != 0 && atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz <= start)
1789 direction = 1;
1790 else if (end != 0 && atop(VM_PAGE_TO_PHYS(root)) >= end)
1791 direction = -1;
1792 else /* nothing to do */
1793 return root;
1794
1795 /* First, update root to fall within the chosen range. */
1796 while (root && !PMR_INTERSECTS_WITH(
1797 atop(VM_PAGE_TO_PHYS(root)),
1798 atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz,
1799 start, end)) {
1800 if (direction == 1)
1801 root = RBT_RIGHT(uvm_objtree, root);
1802 else
1803 root = RBT_LEFT(uvm_objtree, root);
1804 }
1805 if (root == NULL || uvm_pmr_pg_to_memtype(root) == memtype)
1806 return root;
1807
1808 /*
1809 * Root is valid, but of the wrong memtype.
1810 *
1811 * Try to find a range that has the given memtype in the subtree
1812 * (memtype mismatches are costly, either because the conversion
1813 * is expensive, or a later allocation will need to do the opposite
1814 * conversion, which will be expensive).
1815 *
1816 *
1817 * First, simply increase address until we hit something we can use.
1818 * Cache the upper page, so we can page-walk later.
1819 */
1820 high = root;
1821 high_next = RBT_RIGHT(uvm_objtree, high);
1822 while (high_next != NULL && PMR_INTERSECTS_WITH(
1823 atop(VM_PAGE_TO_PHYS(high_next)),
1824 atop(VM_PAGE_TO_PHYS(high_next)) + high_next->fpgsz,
1825 start, end)) {
1826 high = high_next;
1827 if (uvm_pmr_pg_to_memtype(high) == memtype)
1828 return high;
1829 high_next = RBT_RIGHT(uvm_objtree, high);
1830 }
1831
1832 /*
1833 * Second, decrease the address until we hit something we can use.
1834 * Cache the lower page, so we can page-walk later.
1835 */
1836 low = root;
1837 low_next = RBT_LEFT(uvm_objtree, low);
1838 while (low_next != NULL && PMR_INTERSECTS_WITH(
1839 atop(VM_PAGE_TO_PHYS(low_next)),
1840 atop(VM_PAGE_TO_PHYS(low_next)) + low_next->fpgsz,
1841 start, end)) {
1842 low = low_next;
1843 if (uvm_pmr_pg_to_memtype(low) == memtype)
1844 return low;
1845 low_next = RBT_LEFT(uvm_objtree, low);
1846 }
1847
1848 if (low == high)
1849 return NULL;
1850
1851 /* No hits. Walk the address tree until we find something usable. */
1852 for (low = RBT_NEXT(uvm_pmr_addr, low);
1853 low != high;
1854 low = RBT_NEXT(uvm_pmr_addr, low)) {
1855 KDASSERT(PMR_IS_SUBRANGE_OF(atop(VM_PAGE_TO_PHYS(low)),
1856 atop(VM_PAGE_TO_PHYS(low)) + low->fpgsz,
1857 start, end));
1858 if (uvm_pmr_pg_to_memtype(low) == memtype)
1859 return low;
1860 }
1861
1862 /* Nothing found. */
1863 return NULL;
1864 }
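/*
 * The walk above relies on the convention, visible in the start/end checks
 * throughout this file, that a boundary of 0 means "unconstrained".  Below
 * is a stand-alone sketch of that intersection test (hypothetical helper,
 * for illustration only; the real PMR_INTERSECTS_WITH macro is defined
 * earlier in this file and may differ in form).
 */
#if 0
static inline int
pmr_example_intersects(paddr_t lo, paddr_t hi, paddr_t start, paddr_t end)
{
	if (start != 0 && hi <= start)
		return 0;	/* range lies entirely below the constraint */
	if (end != 0 && lo >= end)
		return 0;	/* range lies entirely above the constraint */
	return 1;		/* [lo, hi) overlaps [start, end) */
}
#endif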
1865
1866 /*
1867 * Allocate any page, the fastest way. Page number constraints only.
1868 */
1869 psize_t
1870 uvm_pmr_get1page(psize_t count, int memtype_init, struct pglist *result,
1871 paddr_t start, paddr_t end, int memtype_only)
1872 {
1873 struct uvm_pmemrange *pmr;
1874 struct vm_page *found, *splitpg;
1875 psize_t fcount;
1876 int memtype;
1877
1878 fcount = 0;
1879 TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
1880 /* We're done. */
1881 if (fcount == count)
1882 break;
1883
1884 /* Outside requested range. */
1885 if (!(start == 0 && end == 0) &&
1886 !PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end))
1887 continue;
1888
1889 /* Range is empty. */
1890 if (pmr->nsegs == 0)
1891 continue;
1892
1893 /* Loop over all memtypes, starting at memtype_init. */
1894 memtype = memtype_init;
1895 while (fcount != count) {
1896 found = TAILQ_FIRST(&pmr->single[memtype]);
1897 /*
1898 * If found is outside the range, walk the list
1899 * until we find something that intersects with
1900 * boundaries.
1901 */
1902 while (found && !PMR_INTERSECTS_WITH(
1903 atop(VM_PAGE_TO_PHYS(found)),
1904 atop(VM_PAGE_TO_PHYS(found)) + 1,
1905 start, end))
1906 found = TAILQ_NEXT(found, pageq);
1907
1908 if (found == NULL) {
1909 /*
1910 * Check if the size tree contains a range
1911 * that intersects with the boundaries. As the
1912 * allocation is for any page, try the smallest
1913 * range so that large ranges are preserved for
1914 * more constrained cases. Only one entry is
1915 * checked here, to avoid a brute-force search.
1916 *
1917 * Note that a size tree gives pg[1] instead of
1918 * pg[0].
1919 */
1920 found = RBT_MIN(uvm_pmr_size,
1921 &pmr->size[memtype]);
1922 if (found != NULL) {
1923 found--;
1924 if (!PMR_INTERSECTS_WITH(
1925 atop(VM_PAGE_TO_PHYS(found)),
1926 atop(VM_PAGE_TO_PHYS(found)) +
1927 found->fpgsz, start, end))
1928 found = NULL;
1929 }
1930 }
1931 if (found == NULL) {
1932 /*
1933 * Try address-guided search to meet the page
1934 * number constraints.
1935 */
1936 found = RBT_ROOT(uvm_pmr_addr, &pmr->addr);
1937 if (found != NULL) {
1938 found = uvm_pmr_rootupdate(pmr, found,
1939 start, end, memtype);
1940 }
1941 }
1942 if (found != NULL) {
1943 uvm_pmr_assertvalid(pmr);
1944 uvm_pmr_remove_size(pmr, found);
1945
1946 /*
1947 * If the page intersects the end, then it'll
1948 * need splitting.
1949 *
1950 * Note that we don't need to split if the page
1951 * intersects start: the page-draining loop below
1952 * simply stops when it hits start.
1953 */
1954 if (end != 0 && atop(VM_PAGE_TO_PHYS(found)) +
1955 found->fpgsz > end) {
1956 psize_t splitsz =
1957 atop(VM_PAGE_TO_PHYS(found)) +
1958 found->fpgsz - end;
1959
1960 uvm_pmr_remove_addr(pmr, found);
1961 uvm_pmr_assertvalid(pmr);
1962 found->fpgsz -= splitsz;
1963 splitpg = found + found->fpgsz;
1964 splitpg->fpgsz = splitsz;
1965 uvm_pmr_insert(pmr, splitpg, 1);
1966
1967 /*
1968 * At this point, splitpg and found
1969 * actually should be joined.
1970 * But we explicitly disable that,
1971 * because we will start subtracting
1972 * from found.
1973 */
1974 KASSERT(start == 0 ||
1975 atop(VM_PAGE_TO_PHYS(found)) +
1976 found->fpgsz > start);
1977 uvm_pmr_insert_addr(pmr, found, 1);
1978 }
1979
1980 /*
1981 * Fetch pages from the end.
1982 * If the range is larger than the requested
1983 * number of pages, this saves us an addr-tree
1984 * update.
1985 *
1986 * Since we take from the end and insert at
1987 * the head, the remaining ranges stay intact.
1988 */
1989 while (found->fpgsz > 0 && fcount < count &&
1990 (start == 0 ||
1991 atop(VM_PAGE_TO_PHYS(found)) +
1992 found->fpgsz > start)) {
1993 found->fpgsz--;
1994 fcount++;
1995 TAILQ_INSERT_HEAD(result,
1996 &found[found->fpgsz], pageq);
1997 }
1998 if (found->fpgsz > 0) {
1999 uvm_pmr_insert_size(pmr, found);
2000 KDASSERT(fcount == count);
2001 uvm_pmr_assertvalid(pmr);
2002 return fcount;
2003 }
2004
2005 /*
2006 * Delayed addr-tree removal.
2007 */
2008 uvm_pmr_remove_addr(pmr, found);
2009 uvm_pmr_assertvalid(pmr);
2010 } else {
2011 if (memtype_only)
2012 break;
2013 /*
2014 * Skip to the next memtype.
2015 */
2016 memtype += 1;
2017 if (memtype == UVM_PMR_MEMTYPE_MAX)
2018 memtype = 0;
2019 if (memtype == memtype_init)
2020 break;
2021 }
2022 }
2023 }
2024
2025 /*
2026 * Search finished.
2027 *
2028 * Ran out of ranges before enough pages were gathered, or we hit the
2029 * case where found->fpgsz == count - fcount, in which case the
2030 * above exit condition didn't trigger.
2031 *
2032 * On failure, caller will free the pages.
2033 */
2034 return fcount;
2035 }
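/*
 * Minimal caller sketch for the helper above (illustrative only; it mirrors
 * the call in uvm_pagezero_thread below).  The free-page queue lock must be
 * held, and the caller owns whatever pages ended up on the result list even
 * when fewer than the requested count were found.
 */
#if 0
	struct pglist pgl;
	psize_t got;

	TAILQ_INIT(&pgl);
	uvm_lock_fpageq();
	/* Up to 8 dirty pages, no address constraint, this memtype only. */
	got = uvm_pmr_get1page(8, UVM_PMR_MEMTYPE_DIRTY, &pgl, 0, 0, 1);
	uvm_unlock_fpageq();
	if (got < 8) {
		/* Partial result: the pages on pgl must still be freed. */
	}
#endif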
2036
2037 #ifdef DDB
2038 /*
2039 * Print information about pmemrange.
2040 * Does not do locking (so either call it from DDB or acquire the fpageq
2041 * lock before invoking).
2042 */
2043 void
2044 uvm_pmr_print(void)
2045 {
2046 struct uvm_pmemrange *pmr;
2047 struct vm_page *pg;
2048 psize_t size[UVM_PMR_MEMTYPE_MAX];
2049 psize_t free;
2050 int useq_len;
2051 int mt;
2052
2053 printf("Ranges, use queue:\n");
2054 useq_len = 0;
2055 TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
2056 useq_len++;
2057 free = 0;
2058 for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
2059 pg = RBT_MAX(uvm_pmr_size, &pmr->size[mt]);
2060 if (pg != NULL)
2061 pg--;
2062 else
2063 pg = TAILQ_FIRST(&pmr->single[mt]);
2064 size[mt] = (pg == NULL ? 0 : pg->fpgsz);
2065
2066 RBT_FOREACH(pg, uvm_pmr_addr, &pmr->addr)
2067 free += pg->fpgsz;
2068 }
2069
2070 printf("* [0x%lx-0x%lx] use=%d nsegs=%ld",
2071 (unsigned long)pmr->low, (unsigned long)pmr->high,
2072 pmr->use, (unsigned long)pmr->nsegs);
2073 for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
2074 printf(" maxsegsz[%d]=0x%lx", mt,
2075 (unsigned long)size[mt]);
2076 }
2077 printf(" free=0x%lx\n", (unsigned long)free);
2078 }
2079 printf("#ranges = %d\n", useq_len);
2080 }
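/*
 * Example of the output produced above, with made-up values; the format
 * follows the printf calls in uvm_pmr_print():
 *
 *	Ranges, use queue:
 *	* [0x0-0x100000] use=0 nsegs=42 maxsegsz[0]=0x2000 maxsegsz[1]=0x400 free=0x8000
 *	#ranges = 1
 */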
2081 #endif
2082
2083 /*
2084 * uvm_wait_pla: wait (sleep) for the page daemon to free some pages
2085 * in a specific physmem area.
2086 *
2087 * Returns ENOMEM if the pagedaemon failed to free any pages.
2088 * If not failok, the request is retried (with a warning printed) until
2088 * it can be satisfied.
2089 *
2090 * Must be called with fpageq locked.
2091 */
2092 int
2093 uvm_wait_pla(paddr_t low, paddr_t high, paddr_t size, int failok)
2094 {
2095 struct uvm_pmalloc pma;
2096 const char *wmsg = "pmrwait";
2097
2098 if (curproc == uvm.pagedaemon_proc) {
2099 /*
2100 * This is not that uncommon when the pagedaemon is trying
2101 * to flush out a large mmapped file. VOP_WRITE will circle
2102 * back through the buffer cache and try to get more memory.
2103 * The pagedaemon starts by calling bufbackoff, but we can
2104 * easily use up that reserve in a single scan iteration.
2105 */
2106 uvm_unlock_fpageq();
2107 if (bufbackoff(NULL, atop(size)) == 0) {
2108 uvm_lock_fpageq();
2109 return 0;
2110 }
2111 uvm_lock_fpageq();
2112
2113 /*
2114 * XXX detect pagedaemon deadlock - see comment in
2115 * uvm_wait(), as this is exactly the same issue.
2116 */
2117 printf("pagedaemon: wait_pla deadlock detected!\n");
2118 msleep_nsec(&uvmexp.free, &uvm.fpageqlock, PVM, wmsg,
2119 MSEC_TO_NSEC(125));
2120 #if defined(DEBUG)
2121 /* DEBUG: panic so we can debug it */
2122 panic("wait_pla pagedaemon deadlock");
2123 #endif
2124 return 0;
2125 }
2126
2127 for (;;) {
2128 pma.pm_constraint.ucr_low = low;
2129 pma.pm_constraint.ucr_high = high;
2130 pma.pm_size = size;
2131 pma.pm_flags = UVM_PMA_LINKED;
2132 TAILQ_INSERT_TAIL(&uvm.pmr_control.allocs, &pma, pmq);
2133
2134 wakeup(&uvm.pagedaemon); /* wake the daemon! */
2135 while (pma.pm_flags & (UVM_PMA_LINKED | UVM_PMA_BUSY))
2136 msleep_nsec(&pma, &uvm.fpageqlock, PVM, wmsg, INFSLP);
2137
2138 if (!(pma.pm_flags & UVM_PMA_FREED) &&
2139 pma.pm_flags & UVM_PMA_FAIL) {
2140 if (failok)
2141 return ENOMEM;
2142 printf("uvm_wait: failed to free %ld pages between "
2143 "0x%lx-0x%lx\n", atop(size), low, high);
2144 } else
2145 return 0;
2146 }
2147 /* UNREACHABLE */
2148 }
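/*
 * Sketch of the intended calling pattern (hypothetical caller;
 * "try_allocation" is a placeholder for the real allocation attempt).
 * The fpageq lock is held across both the attempt and the wait, and the
 * low/high/size arguments are byte quantities, as the atop() conversions
 * above suggest.
 */
#if 0
	while (try_allocation(low, high, size) != 0) {
		if (uvm_wait_pla(low, high, size, failok) == ENOMEM)
			return ENOMEM;	/* pagedaemon could not free enough */
	}
#endif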
2149
2150 /*
2151 * Wake up uvm_pmalloc sleepers.
2152 */
2153 void
2154 uvm_wakeup_pla(paddr_t low, psize_t len)
2155 {
2156 struct uvm_pmalloc *pma, *pma_next;
2157 paddr_t high;
2158
2159 high = low + len;
2160
2161 /* Wake specific allocations waiting for this memory. */
2162 for (pma = TAILQ_FIRST(&uvm.pmr_control.allocs); pma != NULL;
2163 pma = pma_next) {
2164 pma_next = TAILQ_NEXT(pma, pmq);
2165
2166 if (low < pma->pm_constraint.ucr_high &&
2167 high > pma->pm_constraint.ucr_low) {
2168 pma->pm_flags |= UVM_PMA_FREED;
2169 if (!(pma->pm_flags & UVM_PMA_BUSY)) {
2170 pma->pm_flags &= ~UVM_PMA_LINKED;
2171 TAILQ_REMOVE(&uvm.pmr_control.allocs, pma,
2172 pmq);
2173 wakeup(pma);
2174 }
2175 }
2176 }
2177 }
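/*
 * Worked example for the overlap test above (illustrative numbers):
 * freeing [0x100000, 0x200000) wakes a sleeper constrained to
 * [0x180000, 0x300000), since 0x100000 < 0x300000 and 0x200000 > 0x180000,
 * but not one constrained to [0x200000, 0x300000): ranges that merely
 * touch fail the "high > ucr_low" half of the test.
 */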
2178
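/*
 * uvm_pagezero_thread: kernel thread that pre-zeroes free pages.
 *
 * Pulls small batches of dirty free pages off the free lists, zeroes
 * them with the free-page queue unlocked, marks them PG_ZERO and puts
 * them back, until uvmexp.zeropages reaches UVM_PAGEZERO_TARGET.
 */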
2179 void
2180 uvm_pagezero_thread(void *arg)
2181 {
2182 struct pglist pgl;
2183 struct vm_page *pg;
2184 int count;
2185
2186 /* Run at the lowest possible priority. */
2187 curproc->p_p->ps_nice = NZERO + PRIO_MAX;
2188
2189 KERNEL_UNLOCK();
2190
2191 TAILQ_INIT(&pgl);
2192 for (;;) {
2193 uvm_lock_fpageq();
2194 while (uvmexp.zeropages >= UVM_PAGEZERO_TARGET ||
2195 (count = uvm_pmr_get1page(16, UVM_PMR_MEMTYPE_DIRTY,
2196 &pgl, 0, 0, 1)) == 0) {
2197 msleep_nsec(&uvmexp.zeropages, &uvm.fpageqlock,
2198 MAXPRI, "pgzero", INFSLP);
2199 }
2200 uvm_unlock_fpageq();
2201
2202 TAILQ_FOREACH(pg, &pgl, pageq) {
2203 uvm_pagezero(pg);
2204 atomic_setbits_int(&pg->pg_flags, PG_ZERO);
2205 }
2206
2207 uvm_lock_fpageq();
2208 while (!TAILQ_EMPTY(&pgl))
2209 uvm_pmr_remove_1strange(&pgl, 0, NULL, 0);
2210 uvmexp.zeropages += count;
2211 uvm_unlock_fpageq();
2212
2213 yield();
2214 }
2215 }
2216
2217 #if defined(MULTIPROCESSOR) && defined(__HAVE_UVM_PERCPU)
2218 int
2219 uvm_pmr_cache_alloc(struct uvm_pmr_cache_item *upci)
2220 {
2221 struct vm_page *pg;
2222 struct pglist pgl;
2223 int flags = UVM_PLA_NOWAIT|UVM_PLA_NOWAKE;
2224 int npages = UVM_PMR_CACHEMAGSZ;
2225
2226 splassert(IPL_VM);
2227 KASSERT(upci->upci_npages == 0);
2228
2229 TAILQ_INIT(&pgl);
2230 if (uvm_pmr_getpages(npages, 0, 0, 1, 0, npages, flags, &pgl))
2231 return -1;
2232
2233 while ((pg = TAILQ_FIRST(&pgl)) != NULL) {
2234 TAILQ_REMOVE(&pgl, pg, pageq);
2235 upci->upci_pages[upci->upci_npages] = pg;
2236 upci->upci_npages++;
2237 }
2238 atomic_add_int(&uvmexp.percpucaches, npages);
2239
2240 return 0;
2241 }
2242
2243 struct vm_page *
2244 uvm_pmr_cache_get(int flags)
2245 {
2246 struct uvm_pmr_cache *upc = &curcpu()->ci_uvm;
2247 struct uvm_pmr_cache_item *upci;
2248 struct vm_page *pg;
2249 int s;
2250
2251 s = splvm();
2252 upci = &upc->upc_magz[upc->upc_actv];
2253 if (upci->upci_npages == 0) {
2254 unsigned int prev;
2255
2256 prev = (upc->upc_actv == 0) ? 1 : 0;
2257 upci = &upc->upc_magz[prev];
2258 if (upci->upci_npages == 0) {
2259 atomic_inc_int(&uvmexp.pcpmiss);
2260 if (uvm_pmr_cache_alloc(upci)) {
2261 splx(s);
2262 return uvm_pmr_getone(flags);
2263 }
2264 }
2265 /* Swap magazines */
2266 upc->upc_actv = prev;
2267 } else {
2268 atomic_inc_int(&uvmexp.pcphit);
2269 }
2270
2271 atomic_dec_int(&uvmexp.percpucaches);
2272 upci->upci_npages--;
2273 pg = upci->upci_pages[upci->upci_npages];
2274 splx(s);
2275
2276 if (flags & UVM_PLA_ZERO)
2277 uvm_pagezero(pg);
2278
2279 return pg;
2280 }
2281
2282 void
2283 uvm_pmr_cache_free(struct uvm_pmr_cache_item *upci)
2284 {
2285 struct pglist pgl;
2286 unsigned int i;
2287
2288 splassert(IPL_VM);
2289
2290 TAILQ_INIT(&pgl);
2291 for (i = 0; i < upci->upci_npages; i++)
2292 TAILQ_INSERT_TAIL(&pgl, upci->upci_pages[i], pageq);
2293
2294 uvm_pmr_freepageq(&pgl);
2295
2296 atomic_sub_int(&uvmexp.percpucaches, upci->upci_npages);
2297 upci->upci_npages = 0;
2298 memset(upci->upci_pages, 0, sizeof(upci->upci_pages));
2299 }
2300
2301 void
2302 uvm_pmr_cache_put(struct vm_page *pg)
2303 {
2304 struct uvm_pmr_cache *upc = &curcpu()->ci_uvm;
2305 struct uvm_pmr_cache_item *upci;
2306 int s;
2307
2308 s = splvm();
2309 upci = &upc->upc_magz[upc->upc_actv];
2310 if (upci->upci_npages >= UVM_PMR_CACHEMAGSZ) {
2311 unsigned int prev;
2312
2313 prev = (upc->upc_actv == 0) ? 1 : 0;
2314 upci = &upc->upc_magz[prev];
2315 if (upci->upci_npages > 0)
2316 uvm_pmr_cache_free(upci);
2317
2318 /* Swap magazines */
2319 upc->upc_actv = prev;
2320 KASSERT(upci->upci_npages == 0);
2321 }
2322
2323 upci->upci_pages[upci->upci_npages] = pg;
2324 upci->upci_npages++;
2325 atomic_inc_int(&uvmexp.percpucaches);
2326 splx(s);
2327 }
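/*
 * Minimal usage sketch for the per-CPU cache (illustrative only): both
 * functions raise the IPL themselves, so no spl dance is needed in the
 * caller.  Whether a NULL return is possible depends on the flags that
 * uvm_pmr_getone() is handed on a cache miss (an assumption here).
 */
#if 0
	struct vm_page *pg;

	pg = uvm_pmr_cache_get(UVM_PLA_ZERO);	/* zeroed on demand */
	if (pg == NULL)
		return ENOMEM;

	/* ... use the page ... */

	uvm_pmr_cache_put(pg);		/* back into this CPU's magazine */
#endif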
2328
2329 void
2330 uvm_pmr_cache_drain(void)
2331 {
2332 struct uvm_pmr_cache *upc = &curcpu()->ci_uvm;
2333 int s;
2334
2335 s = splvm();
2336 uvm_pmr_cache_free(&upc->upc_magz[0]);
2337 uvm_pmr_cache_free(&upc->upc_magz[1]);
2338 splx(s);
2339 }
2340
2341 #else /* !(MULTIPROCESSOR && __HAVE_UVM_PERCPU) */
2342
2343 struct vm_page *
2344 uvm_pmr_cache_get(int flags)
2345 {
2346 return uvm_pmr_getone(flags);
2347 }
2348
2349 void
2350 uvm_pmr_cache_put(struct vm_page *pg)
2351 {
2352 uvm_pmr_freepages(pg, 1);
2353 }
2354
2355 void
2356 uvm_pmr_cache_drain(void)
2357 {
2358 }
2359 #endif
2360