1 /* $OpenBSD: uvm_pmemrange.c,v 1.76 2024/11/08 15:54:33 mpi Exp $ */
2
3 /*
4 * Copyright (c) 2024 Martin Pieuchot <mpi@openbsd.org>
5 * Copyright (c) 2009, 2010 Ariane van der Steldt <ariane@stack.nl>
6 *
7 * Permission to use, copy, modify, and distribute this software for any
8 * purpose with or without fee is hereby granted, provided that the above
9 * copyright notice and this permission notice appear in all copies.
10 *
11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18 */
19
20 #include <sys/param.h>
21 #include <sys/systm.h>
22 #include <uvm/uvm.h>
23 #include <sys/malloc.h>
24 #include <sys/kernel.h>
25 #include <sys/proc.h>
26 #include <sys/mount.h>
27
28 /*
29 * 2 trees: addr tree and size tree.
30 *
31 * The allocator keeps chunks of free pages (called a range).
32 * Two pages are part of the same range if:
33 * - all pages in between are part of that range,
34 * - they are of the same memory type (zeroed or non-zeroed),
35 * - they are part of the same pmemrange.
36 * A pmemrange is a range of memory which is part of the same vm_physseg
37 * and has a use-count.
38 *
39 * addr tree is vm_page[0].objt
40 * size tree is vm_page[1].objt
41 *
42 * The size tree is not used for memory ranges of 1 page, instead,
43 * single queue is vm_page[0].pageq
44 *
45 * vm_page[0].fpgsz describes the length of a free range. Two adjacent ranges
46 * are joined, unless:
47 * - they have pages in between them which are not free
48 * - they belong to different memtypes (zeroed vs dirty memory)
49 * - they are in different pmemrange areas (ISA vs non-ISA memory for instance)
50 * - they are not a continuation of the same array
51 * The latter issue is caused by vm_physseg ordering and splitting from the
52 * MD initialization machinery. The MD code is dependent on freelists and
53 * happens to split ISA memory from non-ISA memory.
54 * (Note: freelists die die die!)
55 *
56 * uvm_page_init guarantees that every vm_physseg contains an array of
57 * struct vm_page. Also, uvm_page_physload allocates an array of struct
58 * vm_page. This code depends on that array. The array may break across
59 * vm_physseg boundaries.
60 */
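/*
 * Illustration (editor's note, not from the original source): a free range
 * of three pages starting at pg keeps its length in pg[0].fpgsz (= 3), is
 * linked into the address tree through pg[0].objt and into the size tree
 * through pg[1].objt.  A single-page range instead sits on the single[]
 * queue via pg[0].pageq and never enters the size tree.
 */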
61
62 /*
63 * Validate the flags of the page. (Used in asserts.)
64 * Any free page must have the PQ_FREE flag set.
65 * Free pages may be zeroed.
66 * Pmap flags are left untouched.
67 *
68 * The PQ_FREE flag is not checked here: by not checking, we can easily use
69 * this check in pages which are freed.
70 */
71 #define VALID_FLAGS(pg_flags) \
72 (((pg_flags) & ~(PQ_FREE|PG_ZERO|PG_PMAPMASK)) == 0x0)
73
74 /* Tree comparators. */
75 int uvm_pmemrange_addr_cmp(const struct uvm_pmemrange *,
76 const struct uvm_pmemrange *);
77 int uvm_pmemrange_use_cmp(struct uvm_pmemrange *, struct uvm_pmemrange *);
78 int uvm_pmr_pg_to_memtype(struct vm_page *);
79
80 #ifdef DDB
81 void uvm_pmr_print(void);
82 #endif
83
84 static inline int
85 in_pagedaemon(int allowsyncer)
86 {
87 if (curcpu()->ci_idepth > 0)
88 return 0;
89 if (curproc == uvm.pagedaemon_proc)
90 return 1;
91 /* XXX why is the syncer allowed to use the pagedaemon's reserve? */
92 if (allowsyncer && (curproc == syncerproc))
93 return 1;
94 return 0;
95 }
96
97 /*
98 * Memory types. The page flags are used to derive what the current memory
99 * type of a page is.
100 */
101 int
102 uvm_pmr_pg_to_memtype(struct vm_page *pg)
103 {
104 if (pg->pg_flags & PG_ZERO)
105 return UVM_PMR_MEMTYPE_ZERO;
106 /* Default: dirty memory. */
107 return UVM_PMR_MEMTYPE_DIRTY;
108 }
109
110 /* Trees. */
111 RBT_GENERATE(uvm_pmr_addr, vm_page, objt, uvm_pmr_addr_cmp);
112 RBT_GENERATE(uvm_pmr_size, vm_page, objt, uvm_pmr_size_cmp);
113 RBT_GENERATE(uvm_pmemrange_addr, uvm_pmemrange, pmr_addr,
114 uvm_pmemrange_addr_cmp);
115
116 /* Validation. */
117 #ifdef DEBUG
118 void uvm_pmr_assertvalid(struct uvm_pmemrange *pmr);
119 #else
120 #define uvm_pmr_assertvalid(pmr) do {} while (0)
121 #endif
122
123 psize_t uvm_pmr_get1page(psize_t, int, struct pglist *,
124 paddr_t, paddr_t, int);
125
126 struct uvm_pmemrange *uvm_pmr_allocpmr(void);
127 struct vm_page *uvm_pmr_nfindsz(struct uvm_pmemrange *, psize_t, int);
128 struct vm_page *uvm_pmr_nextsz(struct uvm_pmemrange *,
129 struct vm_page *, int);
130 void uvm_pmr_pnaddr(struct uvm_pmemrange *pmr,
131 struct vm_page *pg, struct vm_page **pg_prev,
132 struct vm_page **pg_next);
133 struct vm_page *uvm_pmr_findnextsegment(struct uvm_pmemrange *,
134 struct vm_page *, paddr_t);
135 struct vm_page *uvm_pmr_findprevsegment(struct uvm_pmemrange *,
136 struct vm_page *, paddr_t);
137 psize_t uvm_pmr_remove_1strange(struct pglist *, paddr_t,
138 struct vm_page **, int);
139 psize_t uvm_pmr_remove_1strange_reverse(struct pglist *,
140 paddr_t *);
141 void uvm_pmr_split(paddr_t);
142 struct uvm_pmemrange *uvm_pmemrange_find(paddr_t);
143 struct uvm_pmemrange *uvm_pmemrange_use_insert(struct uvm_pmemrange_use *,
144 struct uvm_pmemrange *);
145 psize_t pow2divide(psize_t, psize_t);
146 struct vm_page *uvm_pmr_rootupdate(struct uvm_pmemrange *,
147 struct vm_page *, paddr_t, paddr_t, int);
148
149 /*
150 * Computes num/denom and rounds it up to the next power-of-2.
151 *
152 * This is a division function which calculates an approximation of
153 * num/denom, with result =~ num/denom. It is meant to be fast and doesn't
154 * have to be accurate.
155 *
156 * Providing too large a value makes the allocator slightly faster, at the
157 * risk of hitting the failure case more often. Providing too small a value
158 * makes the allocator a bit slower, but less likely to hit a failure case.
159 */
160 psize_t
161 pow2divide(psize_t num, psize_t denom)
162 {
163 int rshift;
164
165 for (rshift = 0; num > denom; rshift++, denom <<= 1)
166 ;
167 return (paddr_t)1 << rshift;
168 }
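/*
 * Worked example (editor's note): pow2divide(10, 3) doubles denom
 * 3 -> 6 -> 12, stopping after two shifts, and returns 1 << 2 = 4,
 * i.e. a power-of-2 overestimate of 10/3.
 */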
169
170 /*
171 * Predicate: lhs is a subrange of rhs.
172 *
173 * If rhs_low == 0: don't care about lower bound.
174 * If rhs_high == 0: don't care about upper bound.
175 */
176 #define PMR_IS_SUBRANGE_OF(lhs_low, lhs_high, rhs_low, rhs_high) \
177 (((rhs_low) == 0 || (lhs_low) >= (rhs_low)) && \
178 ((rhs_high) == 0 || (lhs_high) <= (rhs_high)))
179
180 /*
181 * Predicate: lhs intersects with rhs.
182 *
183 * If rhs_low == 0: don't care about lower bound.
184 * If rhs_high == 0: don't care about upper bound.
185 * Ranges don't intersect if they don't have any page in common; array
186 * semantics mean that < instead of <= should be used here.
187 */
188 #define PMR_INTERSECTS_WITH(lhs_low, lhs_high, rhs_low, rhs_high) \
189 (((rhs_low) == 0 || (rhs_low) < (lhs_high)) && \
190 ((rhs_high) == 0 || (lhs_low) < (rhs_high)))
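/*
 * Example (editor's note): with array semantics, [10, 20) and [20, 30)
 * merely touch, so PMR_INTERSECTS_WITH(10, 20, 20, 30) is false, while
 * PMR_INTERSECTS_WITH(10, 20, 15, 30) is true because pages 15..19 are
 * shared.  A 0 bound always passes, matching the "don't care" rule above.
 */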
191
192 /*
193 * Align to power-of-2 alignment.
194 */
195 #define PMR_ALIGN(pgno, align) \
196 (((pgno) + ((align) - 1)) & ~((align) - 1))
197 #define PMR_ALIGN_DOWN(pgno, align) \
198 ((pgno) & ~((align) - 1))
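/*
 * Example (editor's note): PMR_ALIGN(5, 4) rounds page number 5 up to 8,
 * while PMR_ALIGN_DOWN(5, 4) rounds it down to 4.  Both require align to
 * be a power of two.
 */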
199
200
201 /*
202 * Comparator: sort by address ascending.
203 */
204 int
205 uvm_pmemrange_addr_cmp(const struct uvm_pmemrange *lhs,
206 const struct uvm_pmemrange *rhs)
207 {
208 return lhs->low < rhs->low ? -1 : lhs->low > rhs->low;
209 }
210
211 /*
212 * Comparator: sort by use ascending.
213 *
214 * The higher the use value of a range, the more devices need memory in
215 * this range. Therefore allocate from the range with the lowest use first.
216 */
217 int
218 uvm_pmemrange_use_cmp(struct uvm_pmemrange *lhs, struct uvm_pmemrange *rhs)
219 {
220 int result;
221
222 result = lhs->use < rhs->use ? -1 : lhs->use > rhs->use;
223 if (result == 0)
224 result = uvm_pmemrange_addr_cmp(lhs, rhs);
225 return result;
226 }
227
228 int
229 uvm_pmr_addr_cmp(const struct vm_page *lhs, const struct vm_page *rhs)
230 {
231 paddr_t lhs_addr, rhs_addr;
232
233 lhs_addr = VM_PAGE_TO_PHYS(lhs);
234 rhs_addr = VM_PAGE_TO_PHYS(rhs);
235
236 return (lhs_addr < rhs_addr ? -1 : lhs_addr > rhs_addr);
237 }
238
239 int
240 uvm_pmr_size_cmp(const struct vm_page *lhs, const struct vm_page *rhs)
241 {
242 psize_t lhs_size, rhs_size;
243 int cmp;
244
245 /* Using second tree, so we receive pg[1] instead of pg[0]. */
246 lhs_size = (lhs - 1)->fpgsz;
247 rhs_size = (rhs - 1)->fpgsz;
248
249 cmp = (lhs_size < rhs_size ? -1 : lhs_size > rhs_size);
250 if (cmp == 0)
251 cmp = uvm_pmr_addr_cmp(lhs - 1, rhs - 1);
252 return cmp;
253 }
254
255 /*
256 * Find the first range of free pages that is at least sz pages long.
257 */
258 struct vm_page *
259 uvm_pmr_nfindsz(struct uvm_pmemrange *pmr, psize_t sz, int mti)
260 {
261 struct vm_page *node, *best;
262
263 KASSERT(sz >= 1);
264
265 if (sz == 1 && !TAILQ_EMPTY(&pmr->single[mti]))
266 return TAILQ_FIRST(&pmr->single[mti]);
267
268 node = RBT_ROOT(uvm_pmr_size, &pmr->size[mti]);
269 best = NULL;
270 while (node != NULL) {
271 if ((node - 1)->fpgsz >= sz) {
272 best = (node - 1);
273 node = RBT_LEFT(uvm_objtree, node);
274 } else
275 node = RBT_RIGHT(uvm_objtree, node);
276 }
277 return best;
278 }
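/*
 * Example (editor's note): with free ranges of 2, 5 and 8 pages of memtype
 * mti in pmr, uvm_pmr_nfindsz(pmr, 4, mti) descends the size tree and
 * returns the 5-page range, the smallest one that is at least 4 pages long.
 */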
279
280 /*
281 * Finds the next range. The next range has a size >= pg->fpgsz.
282 * Returns NULL if no more ranges are available.
283 */
284 struct vm_page *
285 uvm_pmr_nextsz(struct uvm_pmemrange *pmr, struct vm_page *pg, int mt)
286 {
287 struct vm_page *npg;
288
289 KASSERT(pmr != NULL && pg != NULL);
290 if (pg->fpgsz == 1) {
291 if (TAILQ_NEXT(pg, pageq) != NULL)
292 return TAILQ_NEXT(pg, pageq);
293 else
294 npg = RBT_MIN(uvm_pmr_size, &pmr->size[mt]);
295 } else
296 npg = RBT_NEXT(uvm_pmr_size, pg + 1);
297
298 return npg == NULL ? NULL : npg - 1;
299 }
300
301 /*
302 * Finds the previous and next ranges relative to the (uninserted) pg range.
303 *
304 * *pg_prev == NULL if no previous range that can join with pg is
305 * available.
306 * *pg_next == NULL if no next range that can join with pg is
307 * available.
308 */
309 void
310 uvm_pmr_pnaddr(struct uvm_pmemrange *pmr, struct vm_page *pg,
311 struct vm_page **pg_prev, struct vm_page **pg_next)
312 {
313 KASSERT(pg_prev != NULL && pg_next != NULL);
314
315 *pg_next = RBT_NFIND(uvm_pmr_addr, &pmr->addr, pg);
316 if (*pg_next == NULL)
317 *pg_prev = RBT_MAX(uvm_pmr_addr, &pmr->addr);
318 else
319 *pg_prev = RBT_PREV(uvm_pmr_addr, *pg_next);
320
321 KDASSERT(*pg_next == NULL ||
322 VM_PAGE_TO_PHYS(*pg_next) > VM_PAGE_TO_PHYS(pg));
323 KDASSERT(*pg_prev == NULL ||
324 VM_PAGE_TO_PHYS(*pg_prev) < VM_PAGE_TO_PHYS(pg));
325
326 /* Reset if not contig. */
327 if (*pg_prev != NULL &&
328 (atop(VM_PAGE_TO_PHYS(*pg_prev)) + (*pg_prev)->fpgsz
329 != atop(VM_PAGE_TO_PHYS(pg)) ||
330 *pg_prev + (*pg_prev)->fpgsz != pg || /* Array broke. */
331 uvm_pmr_pg_to_memtype(*pg_prev) != uvm_pmr_pg_to_memtype(pg)))
332 *pg_prev = NULL;
333 if (*pg_next != NULL &&
334 (atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz
335 != atop(VM_PAGE_TO_PHYS(*pg_next)) ||
336 pg + pg->fpgsz != *pg_next || /* Array broke. */
337 uvm_pmr_pg_to_memtype(*pg_next) != uvm_pmr_pg_to_memtype(pg)))
338 *pg_next = NULL;
339 return;
340 }
341
342 /*
343 * Remove a range from the address tree.
344 * Address tree maintains pmr counters.
345 */
346 void
347 uvm_pmr_remove_addr(struct uvm_pmemrange *pmr, struct vm_page *pg)
348 {
349 KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg);
350 KDASSERT(pg->pg_flags & PQ_FREE);
351 RBT_REMOVE(uvm_pmr_addr, &pmr->addr, pg);
352
353 pmr->nsegs--;
354 }
355 /*
356 * Remove a range from the size tree.
357 */
358 void
359 uvm_pmr_remove_size(struct uvm_pmemrange *pmr, struct vm_page *pg)
360 {
361 int memtype;
362 #ifdef DEBUG
363 struct vm_page *i;
364 #endif
365
366 KDASSERT(pg->fpgsz >= 1);
367 KDASSERT(pg->pg_flags & PQ_FREE);
368 memtype = uvm_pmr_pg_to_memtype(pg);
369
370 if (pg->fpgsz == 1) {
371 #ifdef DEBUG
372 TAILQ_FOREACH(i, &pmr->single[memtype], pageq) {
373 if (i == pg)
374 break;
375 }
376 KDASSERT(i == pg);
377 #endif
378 TAILQ_REMOVE(&pmr->single[memtype], pg, pageq);
379 } else {
380 KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[memtype],
381 pg + 1) == pg + 1);
382 RBT_REMOVE(uvm_pmr_size, &pmr->size[memtype], pg + 1);
383 }
384 }
385 /* Remove from both trees. */
386 void
387 uvm_pmr_remove(struct uvm_pmemrange *pmr, struct vm_page *pg)
388 {
389 uvm_pmr_assertvalid(pmr);
390 uvm_pmr_remove_size(pmr, pg);
391 uvm_pmr_remove_addr(pmr, pg);
392 uvm_pmr_assertvalid(pmr);
393 }
394
395 /*
396 * Insert the range described in pg.
397 * Returns the range thus created (which may be joined with the previous and
398 * next ranges).
399 * If no_join, the caller guarantees that the range cannot possibly join
400 * with adjacent ranges.
401 */
402 struct vm_page *
403 uvm_pmr_insert_addr(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join)
404 {
405 struct vm_page *prev, *next;
406
407 #ifdef DEBUG
408 struct vm_page *i;
409 int mt;
410 #endif
411
412 KDASSERT(pg->pg_flags & PQ_FREE);
413 KDASSERT(pg->fpgsz >= 1);
414
415 #ifdef DEBUG
416 for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
417 TAILQ_FOREACH(i, &pmr->single[mt], pageq)
418 KDASSERT(i != pg);
419 if (pg->fpgsz > 1) {
420 KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[mt],
421 pg + 1) == NULL);
422 }
423 KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == NULL);
424 }
425 #endif
426
427 if (!no_join) {
428 uvm_pmr_pnaddr(pmr, pg, &prev, &next);
429 if (next != NULL) {
430 uvm_pmr_remove_size(pmr, next);
431 uvm_pmr_remove_addr(pmr, next);
432 pg->fpgsz += next->fpgsz;
433 next->fpgsz = 0;
434 }
435 if (prev != NULL) {
436 uvm_pmr_remove_size(pmr, prev);
437 prev->fpgsz += pg->fpgsz;
438 pg->fpgsz = 0;
439 return prev;
440 }
441 }
442
443 RBT_INSERT(uvm_pmr_addr, &pmr->addr, pg);
444
445 pmr->nsegs++;
446
447 return pg;
448 }
449 /*
450 * Insert the range described in pg into the size tree, or onto the
451 * single-page queue if it is only one page long.
452 * Unlike uvm_pmr_insert_addr, nothing is returned.
453 * Page must already be in the address tree.
454 */
455 void
456 uvm_pmr_insert_size(struct uvm_pmemrange *pmr, struct vm_page *pg)
457 {
458 int memtype;
459 #ifdef DEBUG
460 struct vm_page *i;
461 int mti;
462 #endif
463
464 KDASSERT(pg->fpgsz >= 1);
465 KDASSERT(pg->pg_flags & PQ_FREE);
466
467 memtype = uvm_pmr_pg_to_memtype(pg);
468 #ifdef DEBUG
469 for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) {
470 TAILQ_FOREACH(i, &pmr->single[mti], pageq)
471 KDASSERT(i != pg);
472 if (pg->fpgsz > 1) {
473 KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[mti],
474 pg + 1) == NULL);
475 }
476 KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg);
477 }
478 for (i = pg; i < pg + pg->fpgsz; i++)
479 KASSERT(uvm_pmr_pg_to_memtype(i) == memtype);
480 #endif
481
482 if (pg->fpgsz == 1)
483 TAILQ_INSERT_TAIL(&pmr->single[memtype], pg, pageq);
484 else
485 RBT_INSERT(uvm_pmr_size, &pmr->size[memtype], pg + 1);
486 }
487 /* Insert in both trees. */
488 struct vm_page *
489 uvm_pmr_insert(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join)
490 {
491 uvm_pmr_assertvalid(pmr);
492 pg = uvm_pmr_insert_addr(pmr, pg, no_join);
493 uvm_pmr_insert_size(pmr, pg);
494 uvm_pmr_assertvalid(pmr);
495 return pg;
496 }
497
498 /*
499 * Find the last page that is part of this segment.
500 * => pg: the range at which to start the search.
501 * => boundary: the page number boundary specification (0 = no boundary).
502 * => pmr: the pmemrange of the page.
503 *
504 * This function returns the page one before the next range, so if you want
505 * the next range, you need to run TAILQ_NEXT(result, pageq) after calling.
506 * The reason is that this way, the length of the segment is easily
507 * calculated using: atop(result) - atop(pg) + 1.
508 * Hence this function also never returns NULL.
509 */
510 struct vm_page *
511 uvm_pmr_findnextsegment(struct uvm_pmemrange *pmr,
512 struct vm_page *pg, paddr_t boundary)
513 {
514 paddr_t first_boundary;
515 struct vm_page *next;
516 struct vm_page *prev;
517
518 KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)) &&
519 pmr->high > atop(VM_PAGE_TO_PHYS(pg)));
520 if (boundary != 0) {
521 first_boundary =
522 PMR_ALIGN(atop(VM_PAGE_TO_PHYS(pg)) + 1, boundary);
523 } else
524 first_boundary = 0;
525
526 /*
527 * Increase next until it hits the first page of the next segment.
528 *
529 * While loop checks the following:
530 * - next != NULL we have not reached the end of pgl
531 * - boundary == 0 || next < first_boundary
532 * we do not cross a boundary
533 * - atop(prev) + 1 == atop(next)
534 * still in the same segment
535 * - low <= last
536 * - high > last still in the same memory range
537 * - memtype is equal allocator is unable to view different memtypes
538 * as part of the same segment
539 * - prev + 1 == next no array breakage occurs
540 */
541 prev = pg;
542 next = TAILQ_NEXT(prev, pageq);
543 while (next != NULL &&
544 (boundary == 0 || atop(VM_PAGE_TO_PHYS(next)) < first_boundary) &&
545 atop(VM_PAGE_TO_PHYS(prev)) + 1 == atop(VM_PAGE_TO_PHYS(next)) &&
546 pmr->low <= atop(VM_PAGE_TO_PHYS(next)) &&
547 pmr->high > atop(VM_PAGE_TO_PHYS(next)) &&
548 uvm_pmr_pg_to_memtype(prev) == uvm_pmr_pg_to_memtype(next) &&
549 prev + 1 == next) {
550 prev = next;
551 next = TAILQ_NEXT(prev, pageq);
552 }
553
554 /*
555 * End of this segment.
556 */
557 return prev;
558 }
559
560 /*
561 * Find the first page that is part of this segment.
562 * => pg: the range at which to start the search.
563 * => boundary: the page number boundary specification (0 = no boundary).
564 * => pmr: the pmemrange of the page.
565 *
566 * This function returns the page one after the previous range, so if you want
567 * the previous range, you need to run TAILQ_NEXT(result, pageq) after calling.
568 * The reason is that this way, the length of the segment is easily
569 * calculated using: atop(pg) - atop(result) + 1.
570 * Hence this function also never returns NULL.
571 */
572 struct vm_page *
573 uvm_pmr_findprevsegment(struct uvm_pmemrange *pmr,
574 struct vm_page *pg, paddr_t boundary)
575 {
576 paddr_t first_boundary;
577 struct vm_page *next;
578 struct vm_page *prev;
579
580 KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)) &&
581 pmr->high > atop(VM_PAGE_TO_PHYS(pg)));
582 if (boundary != 0) {
583 first_boundary =
584 PMR_ALIGN_DOWN(atop(VM_PAGE_TO_PHYS(pg)), boundary);
585 } else
586 first_boundary = 0;
587
588 /*
589 * Increase next until it hits the first page of the previous segment.
590 *
591 * While loop checks the following:
592 * - next != NULL we have not reached the end of pgl
593 * - boundary == 0 || next >= first_boundary
594 * we do not cross a boundary
595 * - atop(prev) - 1 == atop(next)
596 * still in the same segment
597 * - low <= last
598 * - high > last still in the same memory range
599 * - memtype is equal allocator is unable to view different memtypes
600 * as part of the same segment
601 * - prev - 1 == next no array breakage occurs
602 */
603 prev = pg;
604 next = TAILQ_NEXT(prev, pageq);
605 while (next != NULL &&
606 (boundary == 0 || atop(VM_PAGE_TO_PHYS(next)) >= first_boundary) &&
607 atop(VM_PAGE_TO_PHYS(prev)) - 1 == atop(VM_PAGE_TO_PHYS(next)) &&
608 pmr->low <= atop(VM_PAGE_TO_PHYS(next)) &&
609 pmr->high > atop(VM_PAGE_TO_PHYS(next)) &&
610 uvm_pmr_pg_to_memtype(prev) == uvm_pmr_pg_to_memtype(next) &&
611 prev - 1 == next) {
612 prev = next;
613 next = TAILQ_NEXT(prev, pageq);
614 }
615
616 /*
617 * Start of this segment.
618 */
619 return prev;
620 }
621
622 /*
623 * Remove the first segment of contiguous pages from pgl.
624 * A segment ends if it crosses boundary (unless boundary = 0) or
625 * if it would enter a different uvm_pmemrange.
626 *
627 * Work: the page range that the caller is currently working with.
628 * May be null.
629 *
630 * If is_desperate is non-zero, the smallest segment is erased. Otherwise,
631 * the first segment is erased (which, if called by uvm_pmr_getpages(),
632 * probably is the smallest or very close to it).
633 */
634 psize_t
635 uvm_pmr_remove_1strange(struct pglist *pgl, paddr_t boundary,
636 struct vm_page **work, int is_desperate)
637 {
638 struct vm_page *start, *end, *iter, *iter_end, *inserted, *lowest;
639 psize_t count;
640 struct uvm_pmemrange *pmr, *pmr_iter;
641
642 KASSERT(!TAILQ_EMPTY(pgl));
643
644 /*
645 * Initialize to first page.
646 * Unless desperate scan finds a better candidate, this is what'll be
647 * erased.
648 */
649 start = TAILQ_FIRST(pgl);
650 pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(start)));
651 end = uvm_pmr_findnextsegment(pmr, start, boundary);
652
653 /*
654 * If we are desperate, we _really_ want to get rid of the smallest
655 * element (rather than a close match to the smallest element).
656 */
657 if (is_desperate) {
658 /* Linear search for smallest segment. */
659 pmr_iter = pmr;
660 for (iter = TAILQ_NEXT(end, pageq);
661 iter != NULL && start != end;
662 iter = TAILQ_NEXT(iter_end, pageq)) {
663 /*
664 * Only update pmr if it doesn't match current
665 * iteration.
666 */
667 if (pmr->low > atop(VM_PAGE_TO_PHYS(iter)) ||
668 pmr->high <= atop(VM_PAGE_TO_PHYS(iter))) {
669 pmr_iter = uvm_pmemrange_find(atop(
670 VM_PAGE_TO_PHYS(iter)));
671 }
672
673 iter_end = uvm_pmr_findnextsegment(pmr_iter, iter,
674 boundary);
675
676 /*
677 * Current iteration is smaller than best match so
678 * far; update.
679 */
680 if (VM_PAGE_TO_PHYS(iter_end) - VM_PAGE_TO_PHYS(iter) <
681 VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) {
682 start = iter;
683 end = iter_end;
684 pmr = pmr_iter;
685 }
686 }
687 }
688
689 /*
690 * Calculate count and end of the list.
691 */
692 count = atop(VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) + 1;
693 lowest = start;
694 end = TAILQ_NEXT(end, pageq);
695
696 /*
697 * Actually remove the range of pages.
698 *
699 * Sadly, this cannot be done using pointer iteration:
700 * vm_physseg is not guaranteed to be sorted on address, hence
701 * uvm_page_init() may not have initialized its array sorted by
702 * page number.
703 */
704 for (iter = start; iter != end; iter = iter_end) {
705 iter_end = TAILQ_NEXT(iter, pageq);
706 TAILQ_REMOVE(pgl, iter, pageq);
707 }
708
709 lowest->fpgsz = count;
710 inserted = uvm_pmr_insert(pmr, lowest, 0);
711
712 /*
713 * If the caller was working on a range and this function modified
714 * that range, update the pointer.
715 */
716 if (work != NULL && *work != NULL &&
717 atop(VM_PAGE_TO_PHYS(inserted)) <= atop(VM_PAGE_TO_PHYS(*work)) &&
718 atop(VM_PAGE_TO_PHYS(inserted)) + inserted->fpgsz >
719 atop(VM_PAGE_TO_PHYS(*work)))
720 *work = inserted;
721 return count;
722 }
723
724 /*
725 * Remove the first segment of contiguous pages from a pgl
726 * with the list elements in reverse order of physaddr.
727 *
728 * A segment ends if it would enter a different uvm_pmemrange.
729 *
730 * Stores starting physical address of the segment in pstart.
731 */
732 psize_t
733 uvm_pmr_remove_1strange_reverse(struct pglist *pgl, paddr_t *pstart)
734 {
735 struct vm_page *start, *end, *iter, *iter_end, *lowest;
736 psize_t count;
737 struct uvm_pmemrange *pmr;
738
739 KASSERT(!TAILQ_EMPTY(pgl));
740
741 start = TAILQ_FIRST(pgl);
742 pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(start)));
743 end = uvm_pmr_findprevsegment(pmr, start, 0);
744
745 KASSERT(end <= start);
746
747 /*
748 * Calculate count and end of the list.
749 */
750 count = atop(VM_PAGE_TO_PHYS(start) - VM_PAGE_TO_PHYS(end)) + 1;
751 lowest = end;
752 end = TAILQ_NEXT(end, pageq);
753
754 /*
755 * Actually remove the range of pages.
756 *
757 * Sadly, this cannot be done using pointer iteration:
758 * vm_physseg is not guaranteed to be sorted on address, hence
759 * uvm_page_init() may not have initialized its array sorted by
760 * page number.
761 */
762 for (iter = start; iter != end; iter = iter_end) {
763 iter_end = TAILQ_NEXT(iter, pageq);
764 TAILQ_REMOVE(pgl, iter, pageq);
765 }
766
767 lowest->fpgsz = count;
768 (void) uvm_pmr_insert(pmr, lowest, 0);
769
770 *pstart = VM_PAGE_TO_PHYS(lowest);
771 return count;
772 }
773
774 /*
775 * Extract a number of pages from a segment of free pages.
776 * Called by uvm_pmr_getpages.
777 *
778 * Returns the segment that was created from pages left over at the tail
779 * of the removed set of pages, or NULL if no pages were left at the tail.
780 */
781 struct vm_page *
782 uvm_pmr_extract_range(struct uvm_pmemrange *pmr, struct vm_page *pg,
783 paddr_t start, paddr_t end, struct pglist *result)
784 {
785 struct vm_page *after, *pg_i;
786 psize_t before_sz, after_sz;
787 #ifdef DEBUG
788 psize_t i;
789 #endif
790
791 KDASSERT(end > start);
792 KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)));
793 KDASSERT(pmr->high >= atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz);
794 KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) <= start);
795 KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz >= end);
796
797 before_sz = start - atop(VM_PAGE_TO_PHYS(pg));
798 after_sz = atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz - end;
799 KDASSERT(before_sz + after_sz + (end - start) == pg->fpgsz);
800 uvm_pmr_assertvalid(pmr);
801
802 uvm_pmr_remove_size(pmr, pg);
803 if (before_sz == 0)
804 uvm_pmr_remove_addr(pmr, pg);
805 after = pg + before_sz + (end - start);
806
807 /* Add selected pages to result. */
808 for (pg_i = pg + before_sz; pg_i != after; pg_i++) {
809 KASSERT(pg_i->pg_flags & PQ_FREE);
810 pg_i->fpgsz = 0;
811 TAILQ_INSERT_TAIL(result, pg_i, pageq);
812 }
813
814 /* Before handling. */
815 if (before_sz > 0) {
816 pg->fpgsz = before_sz;
817 uvm_pmr_insert_size(pmr, pg);
818 }
819
820 /* After handling. */
821 if (after_sz > 0) {
822 #ifdef DEBUG
823 for (i = 0; i < after_sz; i++) {
824 KASSERT(!uvm_pmr_isfree(after + i));
825 }
826 #endif
827 KDASSERT(atop(VM_PAGE_TO_PHYS(after)) == end);
828 after->fpgsz = after_sz;
829 after = uvm_pmr_insert_addr(pmr, after, 1);
830 uvm_pmr_insert_size(pmr, after);
831 }
832
833 uvm_pmr_assertvalid(pmr);
834 return (after_sz > 0 ? after : NULL);
835 }
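/*
 * Worked example (editor's note): for a 10-page free range starting at page
 * number 100 and a request for [103, 107), before_sz = 3 and after_sz = 3.
 * Pages 103-106 are queued on result, the first 3 pages stay in place as a
 * smaller free range, and the trailing 3 pages at 107 are re-inserted and
 * returned as the leftover segment.
 */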
836
837 /*
838 * Acquire a number of pages.
839 *
840 * count: the number of pages returned
841 * start: lowest page number
842 * end: highest page number +1
843 * (start = end = 0: no limitation)
844 * align: power-of-2 alignment constraint (align = 1: no alignment)
845 * boundary: power-of-2 boundary (boundary = 0: no boundary)
846 * maxseg: maximum number of segments to return
847 * flags: UVM_PLA_* flags
848 * result: returned pages storage (uses pageq)
849 */
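/*
 * Usage sketch (editor's note, hypothetical call): to grab 4 pages from
 * anywhere as one contiguous segment, sleeping until memory is available:
 *
 *	struct pglist pgl;
 *
 *	TAILQ_INIT(&pgl);
 *	error = uvm_pmr_getpages(4, 0, 0, 1, 0, 1, UVM_PLA_WAITOK, &pgl);
 *
 * Compare uvm_pmr_getone() below, which issues the single-page variant of
 * the same call.
 */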
850 int
851 uvm_pmr_getpages(psize_t count, paddr_t start, paddr_t end, paddr_t align,
852 paddr_t boundary, int maxseg, int flags, struct pglist *result)
853 {
854 struct uvm_pmemrange *pmr; /* Iterate memory ranges. */
855 struct vm_page *found, *f_next; /* Iterate chunks. */
856 psize_t fcount; /* Current found pages. */
857 int fnsegs; /* Current segment counter. */
858 int try, start_try;
859 psize_t search[3];
860 paddr_t fstart, fend; /* Pages to be taken from found. */
861 int memtype; /* Requested memtype. */
862 int memtype_init; /* Best memtype. */
863 int desperate; /* True if allocation failed. */
864 #ifdef DIAGNOSTIC
865 struct vm_page *diag_prev; /* Used during validation. */
866 #endif /* DIAGNOSTIC */
867
868 /*
869 * Validate arguments.
870 */
871 KASSERT(count > 0);
872 KASSERT(start == 0 || end == 0 || start < end);
873 KASSERT(align >= 1);
874 KASSERT(powerof2(align));
875 KASSERT(maxseg > 0);
876 KASSERT(boundary == 0 || powerof2(boundary));
877 KASSERT(boundary == 0 || maxseg * boundary >= count);
878 KASSERT(TAILQ_EMPTY(result));
879 KASSERT(!(flags & UVM_PLA_WAITOK) ^ !(flags & UVM_PLA_NOWAIT));
880
881 /*
882 * TRYCONTIG is a noop if you only want a single segment.
883 * Remove it if that's the case: otherwise it'll deny the fast
884 * allocation.
885 */
886 if (maxseg == 1 || count == 1)
887 flags &= ~UVM_PLA_TRYCONTIG;
888
889 /*
890 * Configure search.
891 *
892 * search[0] is one segment, only used in UVM_PLA_TRYCONTIG case.
893 * search[1] is multiple segments, chosen to fulfill the search in
894 * approximately even-sized segments.
895 * This is a good trade-off between slightly reduced allocation speed
896 * and less fragmentation.
897 * search[2] is the worst case, in which all segments are evaluated.
898 * This provides the least fragmentation, but makes the search
899 * possibly longer (although in the case it is selected, that no
900 * longer matters much).
901 *
902 * The exception is when maxseg == 1: since we can only fulfill that
903 * with one segment of size pages, only a single search type has to
904 * be attempted.
905 */
906 if (maxseg == 1 || count == 1) {
907 start_try = 2;
908 search[2] = count;
909 } else if (maxseg >= count && (flags & UVM_PLA_TRYCONTIG) == 0) {
910 start_try = 2;
911 search[2] = 1;
912 } else {
913 start_try = 0;
914 search[0] = count;
915 search[1] = pow2divide(count, maxseg);
916 search[2] = 1;
917 if ((flags & UVM_PLA_TRYCONTIG) == 0)
918 start_try = 1;
919 if (search[1] >= search[0]) {
920 search[1] = search[0];
921 start_try = 1;
922 }
923 if (search[2] >= search[start_try]) {
924 start_try = 2;
925 }
926 }
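	/*
	 * Worked example (editor's note): count = 13, maxseg = 4 and no
	 * UVM_PLA_TRYCONTIG gives search[] = { 13, 4, 1 } (pow2divide(13, 4)
	 * == 4) and start_try = 1, so the scan first looks for ranges of at
	 * least 4 pages and only falls back to single pages afterwards.
	 */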
927
928 /*
929 * Memory type: if zeroed memory is requested, traverse the zero set.
930 * Otherwise, traverse the dirty set.
931 *
932 * The memtype iterator is reinitialized to memtype_init on entrance
933 * of a pmemrange.
934 */
935 if (flags & UVM_PLA_ZERO)
936 memtype_init = UVM_PMR_MEMTYPE_ZERO;
937 else
938 memtype_init = UVM_PMR_MEMTYPE_DIRTY;
939
940 /*
941 * Initially, we're not desperate.
942 *
943 * Note that if we return from a sleep, we are still desperate.
944 * Chances are that memory pressure is still high, so resetting
945 * seems over-optimistic to me.
946 */
947 desperate = 0;
948
949 uvm_lock_fpageq();
950 retry: /* Return point after sleeping. */
951 /*
952 * check to see if we need to generate some free pages by waking
953 * the pagedaemon.
954 */
955 if ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freemin ||
956 ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freetarg &&
957 (uvmexp.inactive + BUFPAGES_INACT) < uvmexp.inactarg))
958 wakeup(&uvm.pagedaemon);
959
960 /*
961 * fail if any of these conditions is true:
962 * [1] there really are no free pages, or
963 * [2] only kernel "reserved" pages remain and
964 * the UVM_PLA_USERESERVE flag wasn't used.
965 * [3] only pagedaemon "reserved" pages remain and
966 * the requestor isn't the pagedaemon nor the syncer.
967 */
968 if ((uvmexp.free <= (uvmexp.reserve_kernel + count)) &&
969 !(flags & UVM_PLA_USERESERVE)) {
970 uvm_unlock_fpageq();
971 return ENOMEM;
972 }
973
974 if ((uvmexp.free <= (uvmexp.reserve_pagedaemon + count)) &&
975 !in_pagedaemon(1)) {
976 uvm_unlock_fpageq();
977 if (flags & UVM_PLA_WAITOK) {
978 uvm_wait("uvm_pmr_getpages");
979 uvm_lock_fpageq();
980 goto retry;
981 }
982 return ENOMEM;
983 }
984
985 fcount = 0;
986 fnsegs = 0;
987
988 retry_desperate:
989 /*
990 * If we just want any page(s), go for the really fast option.
991 */
992 if (count <= maxseg && align == 1 && boundary == 0 &&
993 (flags & UVM_PLA_TRYCONTIG) == 0) {
994 fcount += uvm_pmr_get1page(count - fcount, memtype_init,
995 result, start, end, 0);
996
997 /*
998 * If we found sufficient pages, go to the success exit code.
999 *
1000 * Otherwise, go immediately to fail, since we collected
1001 * all we could anyway.
1002 */
1003 if (fcount == count)
1004 goto out;
1005 else
1006 goto fail;
1007 }
1008
1009 /*
1010 * The heart of the contig case.
1011 *
1012 * The code actually looks like this:
1013 *
1014 * foreach (struct pmemrange) {
1015 * foreach (memtype) {
1016 * foreach(try) {
1017 * foreach (free range of memtype in pmemrange,
1018 * starting at search[try]) {
1019 * while (range has space left)
1020 * take from range
1021 * }
1022 * }
1023 * }
1024 *
1025 * if next pmemrange has higher usecount than current:
1026 * enter desperate case (which will drain the pmemranges
1027 * until empty prior to moving to the next one)
1028 * }
1029 *
1030 * When desperate is activated, try always starts at the highest
1031 * value. The memtype loop is using a goto ReScanMemtype.
1032 * The try loop is using a goto ReScan.
1033 * The 'range has space left' loop uses label DrainFound.
1034 *
1035 * Writing them all as loops would take up a lot of screen space in
1036 * the form of indentation and some parts are easier to express
1037 * using the labels.
1038 */
1039
1040 TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
1041 /* Empty range. */
1042 if (pmr->nsegs == 0)
1043 continue;
1044
1045 /* Outside requested range. */
1046 if (!PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end))
1047 continue;
1048
1049 memtype = memtype_init;
1050
1051 rescan_memtype: /* Return point at memtype++. */
1052 try = start_try;
1053
1054 rescan: /* Return point at try++. */
1055 for (found = uvm_pmr_nfindsz(pmr, search[try], memtype);
1056 found != NULL;
1057 found = f_next) {
1058 f_next = uvm_pmr_nextsz(pmr, found, memtype);
1059
1060 fstart = atop(VM_PAGE_TO_PHYS(found));
1061 if (start != 0)
1062 fstart = MAX(start, fstart);
1063 drain_found:
1064 /*
1065 * Throw away the first segment if fnsegs == maxseg
1066 *
1067 * Note that f_next is still valid after this call,
1068 * since we only allocated from entries before f_next.
1069 * We don't revisit the entries we already extracted
1070 * from unless we entered the desperate case.
1071 */
1072 if (fnsegs == maxseg) {
1073 fnsegs--;
1074 fcount -=
1075 uvm_pmr_remove_1strange(result, boundary,
1076 &found, desperate);
1077 }
1078
1079 fstart = PMR_ALIGN(fstart, align);
1080 fend = atop(VM_PAGE_TO_PHYS(found)) + found->fpgsz;
1081 if (end != 0)
1082 fend = MIN(end, fend);
1083 if (boundary != 0) {
1084 fend =
1085 MIN(fend, PMR_ALIGN(fstart + 1, boundary));
1086 }
1087 if (fstart >= fend)
1088 continue;
1089 if (fend - fstart > count - fcount)
1090 fend = fstart + (count - fcount);
1091
1092 fcount += fend - fstart;
1093 fnsegs++;
1094 found = uvm_pmr_extract_range(pmr, found,
1095 fstart, fend, result);
1096
1097 if (fcount == count)
1098 goto out;
1099
1100 /*
1101 * If there's still space left in found, try to
1102 * fully drain it prior to continuing.
1103 */
1104 if (found != NULL) {
1105 fstart = fend;
1106 goto drain_found;
1107 }
1108 }
1109
1110 /* Try a smaller search now. */
1111 if (++try < nitems(search))
1112 goto rescan;
1113
1114 /*
1115 * Exhaust all memory types prior to going to the next memory
1116 * segment.
1117 * This means that zero-vs-dirty are eaten prior to moving
1118 * to a pmemrange with a higher use-count.
1119 *
1120 * Code is basically a difficult way of writing:
1121 * memtype = memtype_init;
1122 * do {
1123 * ...;
1124 * memtype += 1;
1125 * memtype %= MEMTYPE_MAX;
1126 * } while (memtype != memtype_init);
1127 */
1128 memtype += 1;
1129 if (memtype == UVM_PMR_MEMTYPE_MAX)
1130 memtype = 0;
1131 if (memtype != memtype_init)
1132 goto rescan_memtype;
1133
1134 /*
1135 * If not desperate, enter desperate case prior to eating all
1136 * the good stuff in the next range.
1137 */
1138 if (!desperate && TAILQ_NEXT(pmr, pmr_use) != NULL &&
1139 TAILQ_NEXT(pmr, pmr_use)->use != pmr->use)
1140 break;
1141 }
1142
1143 /*
1144 * Not enough memory of the requested type available. Fall back to
1145 * less good memory that we'll clean up better later.
1146 *
1147 * This algorithm is not very smart though, it just starts scanning
1148 * a different typed range, but the nicer ranges of the previous
1149 * iteration may fall out. Hence there is a small chance of a false
1150 * negative.
1151 *
1152 * When desperate: scan all sizes starting at the smallest
1153 * (start_try = 1) and do not consider UVM_PLA_TRYCONTIG (which may
1154 * allow us to hit the fast path now).
1155 *
1156 * Also, because we will revisit entries we scanned before, we need
1157 * to reset the page queue, or we may end up releasing entries in
1158 * such a way as to invalidate f_next.
1159 */
1160 if (!desperate) {
1161 desperate = 1;
1162 start_try = nitems(search) - 1;
1163 flags &= ~UVM_PLA_TRYCONTIG;
1164
1165 while (!TAILQ_EMPTY(result))
1166 uvm_pmr_remove_1strange(result, 0, NULL, 0);
1167 fnsegs = 0;
1168 fcount = 0;
1169 goto retry_desperate;
1170 }
1171
1172 fail:
1173 /* Allocation failed. */
1174 /* XXX: claim from memory reserve here */
1175
1176 while (!TAILQ_EMPTY(result))
1177 uvm_pmr_remove_1strange(result, 0, NULL, 0);
1178
1179 if (flags & UVM_PLA_WAITOK) {
1180 if (uvm_wait_pla(ptoa(start), ptoa(end) - 1, ptoa(count),
1181 flags & UVM_PLA_FAILOK) == 0)
1182 goto retry;
1183 KASSERT(flags & UVM_PLA_FAILOK);
1184 } else if (!(flags & UVM_PLA_NOWAKE)) {
1185 struct uvm_pmalloc *pma = &nowait_pma;
1186
1187 if (!(nowait_pma.pm_flags & UVM_PMA_LINKED)) {
1188 nowait_pma.pm_flags = UVM_PMA_LINKED;
1189 TAILQ_INSERT_TAIL(&uvm.pmr_control.allocs, pma, pmq);
1190 wakeup(&uvm.pagedaemon);
1191 }
1192 }
1193 uvm_unlock_fpageq();
1194
1195 return ENOMEM;
1196
1197 out:
1198 /* Allocation successful. */
1199 uvmexp.free -= fcount;
1200
1201 uvm_unlock_fpageq();
1202
1203 /* Update statistics and zero pages if UVM_PLA_ZERO. */
1204 #ifdef DIAGNOSTIC
1205 fnsegs = 0;
1206 fcount = 0;
1207 diag_prev = NULL;
1208 #endif /* DIAGNOSTIC */
1209 TAILQ_FOREACH(found, result, pageq) {
1210 atomic_clearbits_int(&found->pg_flags, PG_PMAPMASK);
1211
1212 if (found->pg_flags & PG_ZERO) {
1213 uvm_lock_fpageq();
1214 uvmexp.zeropages--;
1215 if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
1216 wakeup(&uvmexp.zeropages);
1217 uvm_unlock_fpageq();
1218 }
1219 if (flags & UVM_PLA_ZERO) {
1220 if (found->pg_flags & PG_ZERO)
1221 uvmexp.pga_zerohit++;
1222 else {
1223 uvmexp.pga_zeromiss++;
1224 uvm_pagezero(found);
1225 }
1226 }
1227 atomic_clearbits_int(&found->pg_flags, PG_ZERO|PQ_FREE);
1228
1229 found->uobject = NULL;
1230 found->uanon = NULL;
1231 found->pg_version++;
1232
1233 /*
1234 * Validate that the page matches the range criteria.
1235 */
1236 KDASSERT(start == 0 || atop(VM_PAGE_TO_PHYS(found)) >= start);
1237 KDASSERT(end == 0 || atop(VM_PAGE_TO_PHYS(found)) < end);
1238
1239 #ifdef DIAGNOSTIC
1240 /*
1241 * Update fcount (# found pages) and
1242 * fnsegs (# found segments) counters.
1243 */
1244 if (diag_prev == NULL ||
1245 /* new segment if it contains a hole */
1246 atop(VM_PAGE_TO_PHYS(diag_prev)) + 1 !=
1247 atop(VM_PAGE_TO_PHYS(found)) ||
1248 /* new segment if it crosses boundary */
1249 (atop(VM_PAGE_TO_PHYS(diag_prev)) & ~(boundary - 1)) !=
1250 (atop(VM_PAGE_TO_PHYS(found)) & ~(boundary - 1)))
1251 fnsegs++;
1252 fcount++;
1253
1254 diag_prev = found;
1255 #endif /* DIAGNOSTIC */
1256 }
1257
1258 #ifdef DIAGNOSTIC
1259 /*
1260 * Panic on algorithm failure.
1261 */
1262 if (fcount != count || fnsegs > maxseg) {
1263 panic("pmemrange allocation error: "
1264 "allocated %ld pages in %d segments, "
1265 "but request was %ld pages in %d segments",
1266 fcount, fnsegs, count, maxseg);
1267 }
1268 #endif /* DIAGNOSTIC */
1269
1270 return 0;
1271 }
1272
1273 /*
1274 * Acquire a single page.
1275 *
1276 * flags: UVM_PLA_* flags
1277 * result: returned page.
1278 */
1279 struct vm_page *
1280 uvm_pmr_getone(int flags)
1281 {
1282 struct vm_page *pg;
1283 struct pglist pgl;
1284
1285 TAILQ_INIT(&pgl);
1286 if (uvm_pmr_getpages(1, 0, 0, 1, 0, 1, flags, &pgl) != 0)
1287 return NULL;
1288
1289 pg = TAILQ_FIRST(&pgl);
1290 KASSERT(pg != NULL && TAILQ_NEXT(pg, pageq) == NULL);
1291
1292 return pg;
1293 }
1294
1295 /*
1296 * Free a number of contig pages (invoked by uvm_page_init).
1297 */
1298 void
1299 uvm_pmr_freepages(struct vm_page *pg, psize_t count)
1300 {
1301 struct uvm_pmemrange *pmr;
1302 psize_t i, pmr_count;
1303 struct vm_page *firstpg = pg;
1304
1305 for (i = 0; i < count; i++) {
1306 KASSERT(atop(VM_PAGE_TO_PHYS(&pg[i])) ==
1307 atop(VM_PAGE_TO_PHYS(pg)) + i);
1308
1309 if (!((pg[i].pg_flags & PQ_FREE) == 0 &&
1310 VALID_FLAGS(pg[i].pg_flags))) {
1311 printf("Flags: 0x%x, will panic now.\n",
1312 pg[i].pg_flags);
1313 }
1314 KASSERT((pg[i].pg_flags & PQ_FREE) == 0 &&
1315 VALID_FLAGS(pg[i].pg_flags));
1316 atomic_setbits_int(&pg[i].pg_flags, PQ_FREE);
1317 atomic_clearbits_int(&pg[i].pg_flags, PG_ZERO);
1318 }
1319
1320 uvm_lock_fpageq();
1321
1322 for (i = count; i > 0; i -= pmr_count) {
1323 pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg)));
1324 KASSERT(pmr != NULL);
1325
1326 pmr_count = MIN(i, pmr->high - atop(VM_PAGE_TO_PHYS(pg)));
1327 pg->fpgsz = pmr_count;
1328 uvm_pmr_insert(pmr, pg, 0);
1329
1330 uvmexp.free += pmr_count;
1331 pg += pmr_count;
1332 }
1333 wakeup(&uvmexp.free);
1334 if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
1335 wakeup(&uvmexp.zeropages);
1336
1337 uvm_wakeup_pla(VM_PAGE_TO_PHYS(firstpg), ptoa(count));
1338
1339 uvm_unlock_fpageq();
1340 }
1341
1342 /*
1343 * Free all pages in the queue.
1344 */
1345 void
1346 uvm_pmr_freepageq(struct pglist *pgl)
1347 {
1348 struct vm_page *pg;
1349 paddr_t pstart;
1350 psize_t plen;
1351
1352 TAILQ_FOREACH(pg, pgl, pageq) {
1353 if (!((pg->pg_flags & PQ_FREE) == 0 &&
1354 VALID_FLAGS(pg->pg_flags))) {
1355 printf("Flags: 0x%x, will panic now.\n",
1356 pg->pg_flags);
1357 }
1358 KASSERT((pg->pg_flags & PQ_FREE) == 0 &&
1359 VALID_FLAGS(pg->pg_flags));
1360 atomic_setbits_int(&pg->pg_flags, PQ_FREE);
1361 atomic_clearbits_int(&pg->pg_flags, PG_ZERO);
1362 }
1363
1364 uvm_lock_fpageq();
1365 while (!TAILQ_EMPTY(pgl)) {
1366 pg = TAILQ_FIRST(pgl);
1367 if (pg == TAILQ_NEXT(pg, pageq) + 1) {
1368 /*
1369 * If pg is one behind the position of the
1370 * next page in the list in the page array,
1371 * try going backwards instead of forward.
1372 */
1373 plen = uvm_pmr_remove_1strange_reverse(pgl, &pstart);
1374 } else {
1375 pstart = VM_PAGE_TO_PHYS(TAILQ_FIRST(pgl));
1376 plen = uvm_pmr_remove_1strange(pgl, 0, NULL, 0);
1377 }
1378 uvmexp.free += plen;
1379
1380 uvm_wakeup_pla(pstart, ptoa(plen));
1381 }
1382 wakeup(&uvmexp.free);
1383 if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
1384 wakeup(&uvmexp.zeropages);
1385 uvm_unlock_fpageq();
1386
1387 return;
1388 }
1389
1390 /*
1391 * Store a pmemrange in the list.
1392 *
1393 * The list is sorted by use.
1394 */
1395 struct uvm_pmemrange *
1396 uvm_pmemrange_use_insert(struct uvm_pmemrange_use *useq,
1397 struct uvm_pmemrange *pmr)
1398 {
1399 struct uvm_pmemrange *iter;
1400 int cmp = 1;
1401
1402 TAILQ_FOREACH(iter, useq, pmr_use) {
1403 cmp = uvm_pmemrange_use_cmp(pmr, iter);
1404 if (cmp == 0)
1405 return iter;
1406 if (cmp == -1)
1407 break;
1408 }
1409
1410 if (iter == NULL)
1411 TAILQ_INSERT_TAIL(useq, pmr, pmr_use);
1412 else
1413 TAILQ_INSERT_BEFORE(iter, pmr, pmr_use);
1414 return NULL;
1415 }
1416
1417 #ifdef DEBUG
1418 /*
1419 * Validation of the whole pmemrange.
1420 * Called with fpageq locked.
1421 */
1422 void
1423 uvm_pmr_assertvalid(struct uvm_pmemrange *pmr)
1424 {
1425 struct vm_page *prev, *next, *i, *xref;
1426 int lcv, mti;
1427
1428 /* Empty range */
1429 if (pmr->nsegs == 0)
1430 return;
1431
1432 /* Validate address tree. */
1433 RBT_FOREACH(i, uvm_pmr_addr, &pmr->addr) {
1434 /* Validate the range. */
1435 KASSERT(i->fpgsz > 0);
1436 KASSERT(atop(VM_PAGE_TO_PHYS(i)) >= pmr->low);
1437 KASSERT(atop(VM_PAGE_TO_PHYS(i)) + i->fpgsz
1438 <= pmr->high);
1439
1440 /* Validate each page in this range. */
1441 for (lcv = 0; lcv < i->fpgsz; lcv++) {
1442 /*
1443 * Only the first page has a size specification.
1444 * Rest is size 0.
1445 */
1446 KASSERT(lcv == 0 || i[lcv].fpgsz == 0);
1447 /*
1448 * Flag check.
1449 */
1450 KASSERT(VALID_FLAGS(i[lcv].pg_flags) &&
1451 (i[lcv].pg_flags & PQ_FREE) == PQ_FREE);
1452 /*
1453 * Free pages are:
1454 * - not wired
1455 * - have no vm_anon
1456 * - have no uvm_object
1457 */
1458 KASSERT(i[lcv].wire_count == 0);
1459 KASSERT(i[lcv].uanon == (void*)0xdeadbeef ||
1460 i[lcv].uanon == NULL);
1461 KASSERT(i[lcv].uobject == (void*)0xdeadbeef ||
1462 i[lcv].uobject == NULL);
1463 /*
1464 * Pages in a single range always have the same
1465 * memtype.
1466 */
1467 KASSERT(uvm_pmr_pg_to_memtype(&i[0]) ==
1468 uvm_pmr_pg_to_memtype(&i[lcv]));
1469 }
1470
1471 /* Check that it shouldn't be joined with its predecessor. */
1472 prev = RBT_PREV(uvm_pmr_addr, i);
1473 if (prev != NULL) {
1474 KASSERT(uvm_pmr_pg_to_memtype(i) !=
1475 uvm_pmr_pg_to_memtype(prev) ||
1476 atop(VM_PAGE_TO_PHYS(i)) >
1477 atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz ||
1478 prev + prev->fpgsz != i);
1479 }
1480
1481 /* Assert i is in the size tree as well. */
1482 if (i->fpgsz == 1) {
1483 TAILQ_FOREACH(xref,
1484 &pmr->single[uvm_pmr_pg_to_memtype(i)], pageq) {
1485 if (xref == i)
1486 break;
1487 }
1488 KASSERT(xref == i);
1489 } else {
1490 KASSERT(RBT_FIND(uvm_pmr_size,
1491 &pmr->size[uvm_pmr_pg_to_memtype(i)], i + 1) ==
1492 i + 1);
1493 }
1494 }
1495
1496 /* Validate size tree. */
1497 for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) {
1498 for (i = uvm_pmr_nfindsz(pmr, 1, mti); i != NULL; i = next) {
1499 next = uvm_pmr_nextsz(pmr, i, mti);
1500 if (next != NULL) {
1501 KASSERT(i->fpgsz <=
1502 next->fpgsz);
1503 }
1504
1505 /* Assert i is in the addr tree as well. */
1506 KASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, i) == i);
1507
1508 /* Assert i is of the correct memory type. */
1509 KASSERT(uvm_pmr_pg_to_memtype(i) == mti);
1510 }
1511 }
1512
1513 /* Validate nsegs statistic. */
1514 lcv = 0;
1515 RBT_FOREACH(i, uvm_pmr_addr, &pmr->addr)
1516 lcv++;
1517 KASSERT(pmr->nsegs == lcv);
1518 }
1519 #endif /* DEBUG */
1520
1521 /*
1522 * Split pmr at split point pageno.
1523 * Called with fpageq unlocked.
1524 *
1525 * Split is only applied if a pmemrange spans pageno.
1526 */
1527 void
1528 uvm_pmr_split(paddr_t pageno)
1529 {
1530 struct uvm_pmemrange *pmr, *drain;
1531 struct vm_page *rebuild, *prev, *next;
1532 psize_t prev_sz;
1533
1534 uvm_lock_fpageq();
1535 pmr = uvm_pmemrange_find(pageno);
1536 if (pmr == NULL || !(pmr->low < pageno)) {
1537 /* No split required. */
1538 uvm_unlock_fpageq();
1539 return;
1540 }
1541
1542 KASSERT(pmr->low < pageno);
1543 KASSERT(pmr->high > pageno);
1544
1545 /*
1546 * uvm_pmr_allocpmr() calls into malloc() which in turn calls into
1547 * uvm_kmemalloc which calls into pmemrange, making the locking
1548 * a bit hard, so we just race!
1549 */
1550 uvm_unlock_fpageq();
1551 drain = uvm_pmr_allocpmr();
1552 uvm_lock_fpageq();
1553 pmr = uvm_pmemrange_find(pageno);
1554 if (pmr == NULL || !(pmr->low < pageno)) {
1555 /*
1556 * We lost the race since someone else ran this or a related
1557 * function, however this should be triggered very rarely so
1558 * we just leak the pmr.
1559 */
1560 printf("uvm_pmr_split: lost one pmr\n");
1561 uvm_unlock_fpageq();
1562 return;
1563 }
1564
1565 drain->low = pageno;
1566 drain->high = pmr->high;
1567 drain->use = pmr->use;
1568
1569 uvm_pmr_assertvalid(pmr);
1570 uvm_pmr_assertvalid(drain);
1571 KASSERT(drain->nsegs == 0);
1572
1573 RBT_FOREACH(rebuild, uvm_pmr_addr, &pmr->addr) {
1574 if (atop(VM_PAGE_TO_PHYS(rebuild)) >= pageno)
1575 break;
1576 }
1577 if (rebuild == NULL)
1578 prev = RBT_MAX(uvm_pmr_addr, &pmr->addr);
1579 else
1580 prev = RBT_PREV(uvm_pmr_addr, rebuild);
1581 KASSERT(prev == NULL || atop(VM_PAGE_TO_PHYS(prev)) < pageno);
1582
1583 /*
1584 * Handle free chunk that spans the split point.
1585 */
1586 if (prev != NULL &&
1587 atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz > pageno) {
1588 psize_t before, after;
1589
1590 KASSERT(atop(VM_PAGE_TO_PHYS(prev)) < pageno);
1591
1592 uvm_pmr_remove(pmr, prev);
1593 prev_sz = prev->fpgsz;
1594 before = pageno - atop(VM_PAGE_TO_PHYS(prev));
1595 after = atop(VM_PAGE_TO_PHYS(prev)) + prev_sz - pageno;
1596
1597 KASSERT(before > 0);
1598 KASSERT(after > 0);
1599
1600 prev->fpgsz = before;
1601 uvm_pmr_insert(pmr, prev, 1);
1602 (prev + before)->fpgsz = after;
1603 uvm_pmr_insert(drain, prev + before, 1);
1604 }
1605
1606 /* Move free chunks that no longer fall in the range. */
1607 for (; rebuild != NULL; rebuild = next) {
1608 next = RBT_NEXT(uvm_pmr_addr, rebuild);
1609
1610 uvm_pmr_remove(pmr, rebuild);
1611 uvm_pmr_insert(drain, rebuild, 1);
1612 }
1613
1614 pmr->high = pageno;
1615 uvm_pmr_assertvalid(pmr);
1616 uvm_pmr_assertvalid(drain);
1617
1618 RBT_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, drain);
1619 uvm_pmemrange_use_insert(&uvm.pmr_control.use, drain);
1620 uvm_unlock_fpageq();
1621 }
1622
1623 /*
1624 * Increase the usage counter for the given range of memory.
1625 *
1626 * The higher the use count of a given range of memory, the harder the
1627 * allocator tries not to allocate from it.
1628 *
1629 * Addresses here are in paddr_t, not page-numbers.
1630 * The lowest and highest allowed address are specified.
1631 */
1632 void
1633 uvm_pmr_use_inc(paddr_t low, paddr_t high)
1634 {
1635 struct uvm_pmemrange *pmr;
1636 paddr_t sz;
1637
1638 /* pmr uses page numbers, translate low and high. */
1639 high++;
1640 high = atop(trunc_page(high));
1641 low = atop(round_page(low));
1642 uvm_pmr_split(low);
1643 uvm_pmr_split(high);
1644
1645 sz = 0;
1646 uvm_lock_fpageq();
1647 /* Increase use count on segments in range. */
1648 RBT_FOREACH(pmr, uvm_pmemrange_addr, &uvm.pmr_control.addr) {
1649 if (PMR_IS_SUBRANGE_OF(pmr->low, pmr->high, low, high)) {
1650 TAILQ_REMOVE(&uvm.pmr_control.use, pmr, pmr_use);
1651 pmr->use++;
1652 sz += pmr->high - pmr->low;
1653 uvm_pmemrange_use_insert(&uvm.pmr_control.use, pmr);
1654 }
1655 uvm_pmr_assertvalid(pmr);
1656 }
1657 uvm_unlock_fpageq();
1658
1659 KASSERT(sz >= high - low);
1660 }
1661
1662 /*
1663 * Allocate a pmemrange.
1664 *
1665 * If called from uvm_page_init, the uvm_pageboot_alloc is used.
1666 * If called after uvm_init, malloc is used.
1667 * (And if called in between, you're dead.)
1668 */
1669 struct uvm_pmemrange *
1670 uvm_pmr_allocpmr(void)
1671 {
1672 struct uvm_pmemrange *nw;
1673 int i;
1674
1675 /* We're only ever hitting the !uvm.page_init_done case for now. */
1676 if (!uvm.page_init_done) {
1677 nw = (struct uvm_pmemrange *)
1678 uvm_pageboot_alloc(sizeof(struct uvm_pmemrange));
1679 } else {
1680 nw = malloc(sizeof(struct uvm_pmemrange),
1681 M_VMMAP, M_NOWAIT);
1682 }
1683 KASSERT(nw != NULL);
1684 memset(nw, 0, sizeof(struct uvm_pmemrange));
1685 RBT_INIT(uvm_pmr_addr, &nw->addr);
1686 for (i = 0; i < UVM_PMR_MEMTYPE_MAX; i++) {
1687 RBT_INIT(uvm_pmr_size, &nw->size[i]);
1688 TAILQ_INIT(&nw->single[i]);
1689 }
1690 return nw;
1691 }
1692
1693 /*
1694 * Initialization of pmr.
1695 * Called by uvm_page_init.
1696 *
1697 * Sets up pmemranges.
1698 */
1699 void
1700 uvm_pmr_init(void)
1701 {
1702 struct uvm_pmemrange *new_pmr;
1703 int i;
1704
1705 TAILQ_INIT(&uvm.pmr_control.use);
1706 RBT_INIT(uvm_pmemrange_addr, &uvm.pmr_control.addr);
1707 TAILQ_INIT(&uvm.pmr_control.allocs);
1708
1709 /* By default, one range for the entire address space. */
1710 new_pmr = uvm_pmr_allocpmr();
1711 new_pmr->low = 0;
1712 new_pmr->high = atop((paddr_t)-1) + 1;
1713
1714 RBT_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, new_pmr);
1715 uvm_pmemrange_use_insert(&uvm.pmr_control.use, new_pmr);
1716
1717 for (i = 0; uvm_md_constraints[i] != NULL; i++) {
1718 uvm_pmr_use_inc(uvm_md_constraints[i]->ucr_low,
1719 uvm_md_constraints[i]->ucr_high);
1720 }
1721 }
1722
1723 /*
1724 * Find the pmemrange that contains the given page number.
1725 *
1726 * (Manually traverses the binary tree, because that is cheaper on stack
1727 * usage.)
1728 */
1729 struct uvm_pmemrange *
1730 uvm_pmemrange_find(paddr_t pageno)
1731 {
1732 struct uvm_pmemrange *pmr;
1733
1734 pmr = RBT_ROOT(uvm_pmemrange_addr, &uvm.pmr_control.addr);
1735 while (pmr != NULL) {
1736 if (pmr->low > pageno)
1737 pmr = RBT_LEFT(uvm_pmemrange_addr, pmr);
1738 else if (pmr->high <= pageno)
1739 pmr = RBT_RIGHT(uvm_pmemrange_addr, pmr);
1740 else
1741 break;
1742 }
1743
1744 return pmr;
1745 }
1746
1747 #if defined(DDB) || defined(DEBUG)
1748 /*
1749 * Return true if the given page is in any of the free lists.
1750 * Used by uvm_page_printit.
1751 * This function is safe, even if the page is not on the freeq.
1752 * Note: does not apply locking, only called from ddb.
1753 */
1754 int
1755 uvm_pmr_isfree(struct vm_page *pg)
1756 {
1757 struct vm_page *r;
1758 struct uvm_pmemrange *pmr;
1759
1760 pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg)));
1761 if (pmr == NULL)
1762 return 0;
1763 r = RBT_NFIND(uvm_pmr_addr, &pmr->addr, pg);
1764 if (r == NULL)
1765 r = RBT_MAX(uvm_pmr_addr, &pmr->addr);
1766 else if (r != pg)
1767 r = RBT_PREV(uvm_pmr_addr, r);
1768 if (r == NULL)
1769 return 0; /* Empty tree. */
1770
1771 KDASSERT(atop(VM_PAGE_TO_PHYS(r)) <= atop(VM_PAGE_TO_PHYS(pg)));
1772 return atop(VM_PAGE_TO_PHYS(r)) + r->fpgsz >
1773 atop(VM_PAGE_TO_PHYS(pg));
1774 }
1775 #endif /* DEBUG */
1776
1777 /*
1778 * Given a root of a tree, find a range which intersects start, end and
1779 * is of the same memtype.
1780 *
1781 * Page must be in the address tree.
1782 */
1783 struct vm_page*
1784 uvm_pmr_rootupdate(struct uvm_pmemrange *pmr, struct vm_page *init_root,
1785 paddr_t start, paddr_t end, int memtype)
1786 {
1787 int direction;
1788 struct vm_page *root;
1789 struct vm_page *high, *high_next;
1790 struct vm_page *low, *low_next;
1791
1792 KDASSERT(pmr != NULL && init_root != NULL);
1793 root = init_root;
1794
1795 /* Which direction to use for searching. */
1796 if (start != 0 && atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz <= start)
1797 direction = 1;
1798 else if (end != 0 && atop(VM_PAGE_TO_PHYS(root)) >= end)
1799 direction = -1;
1800 else /* nothing to do */
1801 return root;
1802
1803 /* First, update root to fall within the chosen range. */
1804 while (root && !PMR_INTERSECTS_WITH(
1805 atop(VM_PAGE_TO_PHYS(root)),
1806 atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz,
1807 start, end)) {
1808 if (direction == 1)
1809 root = RBT_RIGHT(uvm_objtree, root);
1810 else
1811 root = RBT_LEFT(uvm_objtree, root);
1812 }
1813 if (root == NULL || uvm_pmr_pg_to_memtype(root) == memtype)
1814 return root;
1815
1816 /*
1817 * Root is valid, but of the wrong memtype.
1818 *
1819 * Try to find a range that has the given memtype in the subtree
1820 * (memtype mismatches are costly, either because the conversion
1821 * is expensive, or a later allocation will need to do the opposite
1822 * conversion, which will be expensive).
1823 *
1824 *
1825 * First, simply increase address until we hit something we can use.
1826 * Cache the upper page, so we can page-walk later.
1827 */
1828 high = root;
1829 high_next = RBT_RIGHT(uvm_objtree, high);
1830 while (high_next != NULL && PMR_INTERSECTS_WITH(
1831 atop(VM_PAGE_TO_PHYS(high_next)),
1832 atop(VM_PAGE_TO_PHYS(high_next)) + high_next->fpgsz,
1833 start, end)) {
1834 high = high_next;
1835 if (uvm_pmr_pg_to_memtype(high) == memtype)
1836 return high;
1837 high_next = RBT_RIGHT(uvm_objtree, high);
1838 }
1839
1840 /*
1841 * Second, decrease the address until we hit something we can use.
1842 * Cache the lower page, so we can page-walk later.
1843 */
1844 low = root;
1845 low_next = RBT_LEFT(uvm_objtree, low);
1846 while (low_next != NULL && PMR_INTERSECTS_WITH(
1847 atop(VM_PAGE_TO_PHYS(low_next)),
1848 atop(VM_PAGE_TO_PHYS(low_next)) + low_next->fpgsz,
1849 start, end)) {
1850 low = low_next;
1851 if (uvm_pmr_pg_to_memtype(low) == memtype)
1852 return low;
1853 low_next = RBT_LEFT(uvm_objtree, low);
1854 }
1855
1856 if (low == high)
1857 return NULL;
1858
1859 /* No hits. Walk the address tree until we find something usable. */
1860 for (low = RBT_NEXT(uvm_pmr_addr, low);
1861 low != high;
1862 low = RBT_NEXT(uvm_pmr_addr, low)) {
1863 KDASSERT(PMR_IS_SUBRANGE_OF(atop(VM_PAGE_TO_PHYS(low)),
1864 atop(VM_PAGE_TO_PHYS(low)) + low->fpgsz,
1865 start, end));
1866 if (uvm_pmr_pg_to_memtype(low) == memtype)
1867 return low;
1868 }
1869
1870 /* Nothing found. */
1871 return NULL;
1872 }
1873
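/*
 * Illustrative use (hypothetical caller): to look for a zeroed range
 * overlapping [start, end), start from the address-tree root:
 *
 *	pg = RBT_ROOT(uvm_pmr_addr, &pmr->addr);
 *	if (pg != NULL)
 *		pg = uvm_pmr_rootupdate(pmr, pg, start, end,
 *		    UVM_PMR_MEMTYPE_ZERO);
 *
 * This is essentially how uvm_pmr_get1page() below performs its
 * address-guided fallback search.
 */
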
1874 /*
1875 * Allocate any page, the fastest way. Page number constraints only.
1876 */
1877 psize_t
1878 uvm_pmr_get1page(psize_t count, int memtype_init, struct pglist *result,
1879 paddr_t start, paddr_t end, int memtype_only)
1880 {
1881 struct uvm_pmemrange *pmr;
1882 struct vm_page *found, *splitpg;
1883 psize_t fcount;
1884 int memtype;
1885
1886 fcount = 0;
1887 TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
1888 /* We're done. */
1889 if (fcount == count)
1890 break;
1891
1892 /* Outside requested range. */
1893 if (!(start == 0 && end == 0) &&
1894 !PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end))
1895 continue;
1896
1897 /* Range is empty. */
1898 if (pmr->nsegs == 0)
1899 continue;
1900
1901 /* Loop over all memtypes, starting at memtype_init. */
1902 memtype = memtype_init;
1903 while (fcount != count) {
1904 found = TAILQ_FIRST(&pmr->single[memtype]);
1905 /*
1906 * If found is outside the range, walk the list
1907 * until we find something that intersects with
1908 * boundaries.
1909 */
1910 while (found && !PMR_INTERSECTS_WITH(
1911 atop(VM_PAGE_TO_PHYS(found)),
1912 atop(VM_PAGE_TO_PHYS(found)) + 1,
1913 start, end))
1914 found = TAILQ_NEXT(found, pageq);
1915
1916 if (found == NULL) {
1917 /*
1918 * Check if the size tree contains a range
1919 * that intersects with the boundaries. As the
1920 * allocation is for any page, try the smallest
1921 * range so that large ranges are preserved for
1922 * more constrained cases. Only one entry is
1923 * checked here, to avoid a brute-force search.
1924 *
1925 * Note that a size tree gives pg[1] instead of
1926 * pg[0].
1927 */
1928 found = RBT_MIN(uvm_pmr_size,
1929 &pmr->size[memtype]);
1930 if (found != NULL) {
1931 found--;
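/*
 * The size tree returned pg[1] of the range; step back to
 * pg[0], which carries fpgsz and the addr-tree linkage.
 */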
1932 if (!PMR_INTERSECTS_WITH(
1933 atop(VM_PAGE_TO_PHYS(found)),
1934 atop(VM_PAGE_TO_PHYS(found)) +
1935 found->fpgsz, start, end))
1936 found = NULL;
1937 }
1938 }
1939 if (found == NULL) {
1940 /*
1941 * Try address-guided search to meet the page
1942 * number constraints.
1943 */
1944 found = RBT_ROOT(uvm_pmr_addr, &pmr->addr);
1945 if (found != NULL) {
1946 found = uvm_pmr_rootupdate(pmr, found,
1947 start, end, memtype);
1948 }
1949 }
1950 if (found != NULL) {
1951 uvm_pmr_assertvalid(pmr);
1952 uvm_pmr_remove_size(pmr, found);
1953
1954 /*
1955 * If the page intersects the end, then it'll
1956 * need splitting.
1957 *
1958 * Note that we don't need to split if the page
1959 * intersects start: the drain function will
1960 * simply stop on hitting start.
1961 */
1962 if (end != 0 && atop(VM_PAGE_TO_PHYS(found)) +
1963 found->fpgsz > end) {
1964 psize_t splitsz =
1965 atop(VM_PAGE_TO_PHYS(found)) +
1966 found->fpgsz - end;
1967
1968 uvm_pmr_remove_addr(pmr, found);
1969 uvm_pmr_assertvalid(pmr);
1970 found->fpgsz -= splitsz;
1971 splitpg = found + found->fpgsz;
1972 splitpg->fpgsz = splitsz;
1973 uvm_pmr_insert(pmr, splitpg, 1);
1974
1975 /*
1976 * At this point, splitpg and found
1977 * actually should be joined.
1978 * But we explicitly disable that,
1979 * because we will start subtracting
1980 * from found.
1981 */
1982 KASSERT(start == 0 ||
1983 atop(VM_PAGE_TO_PHYS(found)) +
1984 found->fpgsz > start);
1985 uvm_pmr_insert_addr(pmr, found, 1);
1986 }
1987
1988 /*
1989 * Fetch pages from the end.
1990 * If the range is larger than the requested
1991 * number of pages, this saves us an addr-tree
1992 * update.
1993 *
1994 * Since we take from the end and insert at the head,
1995 * the pages of a range stay contiguous and in order.
1996 */
1997 while (found->fpgsz > 0 && fcount < count &&
1998 (start == 0 ||
1999 atop(VM_PAGE_TO_PHYS(found)) +
2000 found->fpgsz > start)) {
2001 found->fpgsz--;
2002 fcount++;
2003 TAILQ_INSERT_HEAD(result,
2004 &found[found->fpgsz], pageq);
2005 }
2006 if (found->fpgsz > 0) {
2007 uvm_pmr_insert_size(pmr, found);
2008 KDASSERT(fcount == count);
2009 uvm_pmr_assertvalid(pmr);
2010 return fcount;
2011 }
2012
2013 /*
2014 * Delayed addr-tree removal.
2015 */
2016 uvm_pmr_remove_addr(pmr, found);
2017 uvm_pmr_assertvalid(pmr);
2018 } else {
2019 if (memtype_only)
2020 break;
2021 /*
2022 * Skip to the next memtype.
2023 */
2024 memtype += 1;
2025 if (memtype == UVM_PMR_MEMTYPE_MAX)
2026 memtype = 0;
2027 if (memtype == memtype_init)
2028 break;
2029 }
2030 }
2031 }
2032
2033 /*
2034 * Search finished.
2035 *
2036 * Ran out of ranges before enough pages were gathered, or we hit the
2037 * case where found->fpgsz == count - fcount, in which case the
2038 * early return above (the found->fpgsz > 0 check) didn't trigger.
2039 *
2040 * On failure, caller will free the pages.
2041 */
2042 return fcount;
2043 }
2044
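/*
 * Example (taken from the page-zeroing thread below): fetch up to 16
 * dirty pages, without address constraints and without falling back to
 * other memtypes:
 *
 *	count = uvm_pmr_get1page(16, UVM_PMR_MEMTYPE_DIRTY, &pgl, 0, 0, 1);
 *
 * The return value is the number of pages actually put on the result
 * list, which may be less than requested.
 */
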
2045 #ifdef DDB
2046 /*
2047 * Print information about pmemrange.
2048 * Does not do locking (so either call it from DDB or acquire fpageq lock
2049 * before invoking).
2050 */
2051 void
2052 uvm_pmr_print(void)
2053 {
2054 struct uvm_pmemrange *pmr;
2055 struct vm_page *pg;
2056 psize_t size[UVM_PMR_MEMTYPE_MAX];
2057 psize_t free;
2058 int useq_len;
2059 int mt;
2060
2061 printf("Ranges, use queue:\n");
2062 useq_len = 0;
2063 TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
2064 useq_len++;
2065 free = 0;
2066 for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
2067 pg = RBT_MAX(uvm_pmr_size, &pmr->size[mt]);
2068 if (pg != NULL)
2069 pg--;
2070 else
2071 pg = TAILQ_FIRST(&pmr->single[mt]);
2072 size[mt] = (pg == NULL ? 0 : pg->fpgsz);
2073
2074 RBT_FOREACH(pg, uvm_pmr_addr, &pmr->addr)
2075 free += pg->fpgsz;
2076 }
2077
2078 printf("* [0x%lx-0x%lx] use=%d nsegs=%lu",
2079 (unsigned long)pmr->low, (unsigned long)pmr->high,
2080 pmr->use, (unsigned long)pmr->nsegs);
2081 for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
2082 printf(" maxsegsz[%d]=0x%lx", mt,
2083 (unsigned long)size[mt]);
2084 }
2085 printf(" free=0x%lx\n", (unsigned long)free);
2086 }
2087 printf("#ranges = %d\n", useq_len);
2088 }
2089 #endif
2090
2091 /*
2092 * uvm_wait_pla: wait (sleep) for the page daemon to free some pages
2093 * in a specific physmem area.
2094 *
2095 * Returns ENOMEM if failok is set and the pagedaemon failed to free
2096 * any pages; if not failok, the request is retried until it succeeds.
2097 *
2098 * Must be called with fpageq locked.
2099 */
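/*
 * Hypothetical usage sketch: an allocator needing `size' bytes of free
 * memory within the physical address range [low, high] could, with the
 * fpageq lock held, do
 *
 *	if (uvm_wait_pla(low, high, size, failok) == ENOMEM)
 *		goto fail;
 *
 * and then retry its allocation, much like the waiting path in
 * uvm_pmr_getpages().
 */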
2100 int
2101 uvm_wait_pla(paddr_t low, paddr_t high, paddr_t size, int failok)
2102 {
2103 struct uvm_pmalloc pma;
2104 const char *wmsg = "pmrwait";
2105
2106 KASSERT(curcpu()->ci_idepth == 0);
2107
2108 if (curproc == uvm.pagedaemon_proc) {
2109 /*
2110 * This is not that uncommon when the pagedaemon is trying
2111 * to flush out a large mmapped file. VOP_WRITE will circle
2112 * back through the buffer cache and try to get more memory.
2113 * The pagedaemon starts by calling bufbackoff, but we can
2114 * easily use up that reserve in a single scan iteration.
2115 */
2116 uvm_unlock_fpageq();
2117 if (bufbackoff(NULL, atop(size)) >= atop(size)) {
2118 uvm_lock_fpageq();
2119 return 0;
2120 }
2121 uvm_lock_fpageq();
2122
2123 /*
2124 * XXX detect pagedaemon deadlock - see comment in
2125 * uvm_wait(), as this is exactly the same issue.
2126 */
2127 printf("pagedaemon: wait_pla deadlock detected!\n");
2128 msleep_nsec(&uvmexp.free, &uvm.fpageqlock, PVM, wmsg,
2129 MSEC_TO_NSEC(125));
2130 #if defined(DEBUG)
2131 /* DEBUG: panic so we can debug it */
2132 panic("wait_pla pagedaemon deadlock");
2133 #endif
2134 return 0;
2135 }
2136
2137 for (;;) {
2138 pma.pm_constraint.ucr_low = low;
2139 pma.pm_constraint.ucr_high = high;
2140 pma.pm_size = size;
2141 pma.pm_flags = UVM_PMA_LINKED;
2142 TAILQ_INSERT_TAIL(&uvm.pmr_control.allocs, &pma, pmq);
2143
2144 wakeup(&uvm.pagedaemon); /* wake the daemon! */
2145 while (pma.pm_flags & (UVM_PMA_LINKED | UVM_PMA_BUSY))
2146 msleep_nsec(&pma, &uvm.fpageqlock, PVM, wmsg, INFSLP);
2147
2148 if (!(pma.pm_flags & UVM_PMA_FREED) &&
2149 pma.pm_flags & UVM_PMA_FAIL) {
2150 if (failok)
2151 return ENOMEM;
2152 printf("uvm_wait: failed to free %ld pages between "
2153 "0x%lx-0x%lx\n", atop(size), low, high);
2154 } else
2155 return 0;
2156 }
2157 /* UNREACHABLE */
2158 }
2159
2160 /*
2161 * Wake up uvm_pmalloc sleepers.
2162 */
2163 void
2164 uvm_wakeup_pla(paddr_t low, psize_t len)
2165 {
2166 struct uvm_pmalloc *pma, *pma_next;
2167 paddr_t high;
2168
2169 high = low + len;
2170
2171 /* Wake specific allocations waiting for this memory. */
2172 for (pma = TAILQ_FIRST(&uvm.pmr_control.allocs); pma != NULL;
2173 pma = pma_next) {
2174 pma_next = TAILQ_NEXT(pma, pmq);
2175
2176 if (low < pma->pm_constraint.ucr_high &&
2177 high > pma->pm_constraint.ucr_low) {
2178 pma->pm_flags |= UVM_PMA_FREED;
2179 if (!(pma->pm_flags & UVM_PMA_BUSY)) {
2180 pma->pm_flags &= ~UVM_PMA_LINKED;
2181 TAILQ_REMOVE(&uvm.pmr_control.allocs, pma,
2182 pmq);
2183 wakeup(pma);
2184 }
2185 }
2186 }
2187 }
2188
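/*
 * Page-zeroing kernel thread: runs at the lowest possible priority and,
 * whenever uvmexp.zeropages is below UVM_PAGEZERO_TARGET, pulls batches
 * of dirty free pages off the allocator, zeroes them and puts them back
 * on the zeroed free lists.
 */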
2189 void
2190 uvm_pagezero_thread(void *arg)
2191 {
2192 struct pglist pgl;
2193 struct vm_page *pg;
2194 int count;
2195
2196 /* Run at the lowest possible priority. */
2197 curproc->p_p->ps_nice = NZERO + PRIO_MAX;
2198
2199 KERNEL_UNLOCK();
2200
2201 TAILQ_INIT(&pgl);
2202 for (;;) {
2203 uvm_lock_fpageq();
2204 while (uvmexp.zeropages >= UVM_PAGEZERO_TARGET ||
2205 (count = uvm_pmr_get1page(16, UVM_PMR_MEMTYPE_DIRTY,
2206 &pgl, 0, 0, 1)) == 0) {
2207 msleep_nsec(&uvmexp.zeropages, &uvm.fpageqlock,
2208 MAXPRI, "pgzero", INFSLP);
2209 }
2210 uvm_unlock_fpageq();
2211
2212 TAILQ_FOREACH(pg, &pgl, pageq) {
2213 uvm_pagezero(pg);
2214 atomic_setbits_int(&pg->pg_flags, PG_ZERO);
2215 }
2216
2217 uvm_lock_fpageq();
2218 while (!TAILQ_EMPTY(&pgl))
2219 uvm_pmr_remove_1strange(&pgl, 0, NULL, 0);
2220 uvmexp.zeropages += count;
2221 uvm_unlock_fpageq();
2222
2223 yield();
2224 }
2225 }
2226
2227 #if defined(MULTIPROCESSOR) && defined(__HAVE_UVM_PERCPU)
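/*
 * Per-CPU page cache: each CPU keeps two magazines of up to
 * UVM_PMR_CACHEMAGSZ pages and allocates/frees from the active one,
 * swapping magazines when it runs empty or full, so that single-page
 * allocations usually avoid the global free-page queue.
 *
 * uvm_pmr_cache_alloc() refills an empty magazine from the global
 * allocator; it returns 0 on success and -1 if uvm_pmr_getpages()
 * fails (it is called with UVM_PLA_NOWAIT, so it never sleeps).
 */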
2228 int
2229 uvm_pmr_cache_alloc(struct uvm_pmr_cache_item *upci)
2230 {
2231 struct vm_page *pg;
2232 struct pglist pgl;
2233 int flags = UVM_PLA_NOWAIT|UVM_PLA_NOWAKE;
2234 int npages = UVM_PMR_CACHEMAGSZ;
2235
2236 splassert(IPL_VM);
2237 KASSERT(upci->upci_npages == 0);
2238
2239 TAILQ_INIT(&pgl);
2240 if (uvm_pmr_getpages(npages, 0, 0, 1, 0, npages, flags, &pgl))
2241 return -1;
2242
2243 while ((pg = TAILQ_FIRST(&pgl)) != NULL) {
2244 TAILQ_REMOVE(&pgl, pg, pageq);
2245 upci->upci_pages[upci->upci_npages] = pg;
2246 upci->upci_npages++;
2247 }
2248 atomic_add_int(&uvmexp.percpucaches, npages);
2249
2250 return 0;
2251 }
2252
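/*
 * uvm_pmr_cache_get: take one page out of the current CPU's cache.
 * On an empty active magazine, switch to the other one, refilling it if
 * needed; if even that fails, fall back to uvm_pmr_getone().  The page
 * is zeroed here when UVM_PLA_ZERO is requested.
 */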
2253 struct vm_page *
2254 uvm_pmr_cache_get(int flags)
2255 {
2256 struct uvm_pmr_cache *upc = &curcpu()->ci_uvm;
2257 struct uvm_pmr_cache_item *upci;
2258 struct vm_page *pg;
2259 int s;
2260
2261 s = splvm();
2262 upci = &upc->upc_magz[upc->upc_actv];
2263 if (upci->upci_npages == 0) {
2264 unsigned int prev;
2265
2266 prev = (upc->upc_actv == 0) ? 1 : 0;
2267 upci = &upc->upc_magz[prev];
2268 if (upci->upci_npages == 0) {
2269 atomic_inc_int(&uvmexp.pcpmiss);
2270 if (uvm_pmr_cache_alloc(upci)) {
2271 splx(s);
2272 return uvm_pmr_getone(flags);
2273 }
2274 }
2275 /* Swap magazines */
2276 upc->upc_actv = prev;
2277 } else {
2278 atomic_inc_int(&uvmexp.pcphit);
2279 }
2280
2281 atomic_dec_int(&uvmexp.percpucaches);
2282 upci->upci_npages--;
2283 pg = upci->upci_pages[upci->upci_npages];
2284 splx(s);
2285
2286 if (flags & UVM_PLA_ZERO)
2287 uvm_pagezero(pg);
2288
2289 return pg;
2290 }
2291
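/*
 * uvm_pmr_cache_free: give all pages of a magazine back to the global
 * allocator.  Returns the number of pages released.
 */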
2292 unsigned int
2293 uvm_pmr_cache_free(struct uvm_pmr_cache_item *upci)
2294 {
2295 struct pglist pgl;
2296 unsigned int i;
2297
2298 splassert(IPL_VM);
2299
2300 TAILQ_INIT(&pgl);
2301 for (i = 0; i < upci->upci_npages; i++)
2302 TAILQ_INSERT_TAIL(&pgl, upci->upci_pages[i], pageq);
2303
2304 uvm_pmr_freepageq(&pgl);
2305
2306 atomic_sub_int(&uvmexp.percpucaches, upci->upci_npages);
2307 upci->upci_npages = 0;
2308 memset(upci->upci_pages, 0, sizeof(upci->upci_pages));
2309
2310 return i;
2311 }
2312
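/*
 * uvm_pmr_cache_put: return a freed page to the current CPU's cache.
 * Pages from constrained ranges (use > 0) bypass the cache and go
 * straight back to the allocator; if the active magazine is full, the
 * other one is flushed and the magazines are swapped.
 */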
2313 void
2314 uvm_pmr_cache_put(struct vm_page *pg)
2315 {
2316 struct uvm_pmr_cache *upc = &curcpu()->ci_uvm;
2317 struct uvm_pmr_cache_item *upci;
2318 struct uvm_pmemrange *pmr;
2319 int s;
2320
2321 /*
2322 * Always give back pages from constrained (low memory) ranges to the
2323 * allocator, so as not to accelerate their exhaustion.
2324 */
2325 pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg)));
2326 if (pmr->use > 0) {
2327 uvm_pmr_freepages(pg, 1);
2328 return;
2329 }
2330
2331 s = splvm();
2332 upci = &upc->upc_magz[upc->upc_actv];
2333 if (upci->upci_npages >= UVM_PMR_CACHEMAGSZ) {
2334 unsigned int prev;
2335
2336 prev = (upc->upc_actv == 0) ? 1 : 0;
2337 upci = &upc->upc_magz[prev];
2338 if (upci->upci_npages > 0)
2339 uvm_pmr_cache_free(upci);
2340
2341 /* Swap magazines */
2342 upc->upc_actv = prev;
2343 KASSERT(upci->upci_npages == 0);
2344 }
2345
2346 upci->upci_pages[upci->upci_npages] = pg;
2347 upci->upci_npages++;
2348 atomic_inc_int(&uvmexp.percpucaches);
2349 splx(s);
2350 }
2351
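/*
 * uvm_pmr_cache_drain: flush both magazines of the current CPU back to
 * the global allocator.  Returns the number of pages freed.
 */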
2352 unsigned int
2353 uvm_pmr_cache_drain(void)
2354 {
2355 struct uvm_pmr_cache *upc = &curcpu()->ci_uvm;
2356 unsigned int freed = 0;
2357 int s;
2358
2359 s = splvm();
2360 freed += uvm_pmr_cache_free(&upc->upc_magz[0]);
2361 freed += uvm_pmr_cache_free(&upc->upc_magz[1]);
2362 splx(s);
2363
2364 return freed;
2365 }
2366
2367 #else /* !(MULTIPROCESSOR && __HAVE_UVM_PERCPU) */
2368
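/*
 * Without a per-CPU cache these are thin wrappers around the global
 * allocator.
 */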
2369 struct vm_page *
2370 uvm_pmr_cache_get(int flags)
2371 {
2372 return uvm_pmr_getone(flags);
2373 }
2374
2375 void
2376 uvm_pmr_cache_put(struct vm_page *pg)
2377 {
2378 uvm_pmr_freepages(pg, 1);
2379 }
2380
2381 unsigned int
2382 uvm_pmr_cache_drain(void)
2383 {
2384 return 0;
2385 }
2386 #endif /* MULTIPROCESSOR && __HAVE_UVM_PERCPU */
2387