/*	$OpenBSD: uvm_pmemrange.c,v 1.50 2016/01/29 11:50:40 tb Exp $	*/

/*
 * Copyright (c) 2009, 2010 Ariane van der Steldt <ariane@stack.nl>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <uvm/uvm.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/mount.h>

/*
 * 2 trees: addr tree and size tree.
 *
 * The allocator keeps chunks of free pages (called a range).
 * Two pages are part of the same range if:
 * - all pages in between are part of that range,
 * - they are of the same memory type (zeroed or non-zeroed),
 * - they are part of the same pmemrange.
 * A pmemrange is a range of memory which is part of the same vm_physseg
 * and has a use-count.
 *
 * addr tree is vm_page[0].objt
 * size tree is vm_page[1].objt
 *
 * The size tree is not used for memory ranges of 1 page, instead,
 * single queue is vm_page[0].pageq
 *
 * vm_page[0].fpgsz describes the length of a free range. Two adjacent ranges
 * are joined, unless:
 * - they have pages in between them which are not free
 * - they belong to different memtypes (zeroed vs dirty memory)
 * - they are in different pmemrange areas (ISA vs non-ISA memory for instance)
 * - they are not a continuation of the same array
 * The latter issue is caused by vm_physseg ordering and splitting from the
 * MD initialization machinery. The MD code is dependent on freelists and
 * happens to split ISA memory from non-ISA memory.
 * (Note: freelists die die die!)
 *
 * uvm_page_init guarantees that every vm_physseg contains an array of
 * struct vm_page. Also, uvm_page_physload allocates an array of struct
 * vm_page. This code depends on that array. The array may break across
 * vm_physseg boundaries.
 */

/*
 * Validate the flags of the page. (Used in asserts.)
 * Any free page must have the PQ_FREE flag set.
 * Free pages may be zeroed.
 * Pmap flags are left untouched.
 *
 * The PQ_FREE flag is not checked here: by not checking, we can easily use
 * this check in pages which are freed.
 */
#define VALID_FLAGS(pg_flags)						\
	(((pg_flags) & ~(PQ_FREE|PG_ZERO|PG_PMAPMASK)) == 0x0)

/* Tree comparators. */
int	uvm_pmemrange_addr_cmp(struct uvm_pmemrange *, struct uvm_pmemrange *);
int	uvm_pmemrange_use_cmp(struct uvm_pmemrange *, struct uvm_pmemrange *);
int	uvm_pmr_addr_cmp(struct vm_page *, struct vm_page *);
int	uvm_pmr_size_cmp(struct vm_page *, struct vm_page *);
int	uvm_pmr_pg_to_memtype(struct vm_page *);

#ifdef DDB
void	uvm_pmr_print(void);
#endif

/*
 * Memory types. The page flags are used to derive what the current memory
 * type of a page is.
 */
int
uvm_pmr_pg_to_memtype(struct vm_page *pg)
{
	if (pg->pg_flags & PG_ZERO)
		return UVM_PMR_MEMTYPE_ZERO;
	/* Default: dirty memory. */
	return UVM_PMR_MEMTYPE_DIRTY;
}

/* Trees. */
RB_GENERATE(uvm_pmr_addr, vm_page, objt, uvm_pmr_addr_cmp);
RB_GENERATE(uvm_pmr_size, vm_page, objt, uvm_pmr_size_cmp);
RB_GENERATE(uvm_pmemrange_addr, uvm_pmemrange, pmr_addr,
    uvm_pmemrange_addr_cmp);

/* Validation. */
#ifdef DEBUG
void	uvm_pmr_assertvalid(struct uvm_pmemrange *pmr);
#else
#define uvm_pmr_assertvalid(pmr)	do {} while (0)
#endif

psize_t			 uvm_pmr_get1page(psize_t, int, struct pglist *,
			    paddr_t, paddr_t, int);

struct uvm_pmemrange	*uvm_pmr_allocpmr(void);
struct vm_page		*uvm_pmr_nfindsz(struct uvm_pmemrange *, psize_t, int);
struct vm_page		*uvm_pmr_nextsz(struct uvm_pmemrange *,
			    struct vm_page *, int);
void			 uvm_pmr_pnaddr(struct uvm_pmemrange *pmr,
			    struct vm_page *pg, struct vm_page **pg_prev,
			    struct vm_page **pg_next);
struct vm_page		*uvm_pmr_findnextsegment(struct uvm_pmemrange *,
			    struct vm_page *, paddr_t);
psize_t			 uvm_pmr_remove_1strange(struct pglist *, paddr_t,
			    struct vm_page **, int);
void			 uvm_pmr_split(paddr_t);
struct uvm_pmemrange	*uvm_pmemrange_find(paddr_t);
struct uvm_pmemrange	*uvm_pmemrange_use_insert(struct uvm_pmemrange_use *,
			    struct uvm_pmemrange *);
psize_t			 pow2divide(psize_t, psize_t);
struct vm_page		*uvm_pmr_rootupdate(struct uvm_pmemrange *,
			    struct vm_page *, paddr_t, paddr_t, int);

/*
 * Computes num/denom and rounds it up to the next power-of-2.
 *
 * This is a division function which calculates an approximation of
 * num/denom, with result =~ num/denom. It is meant to be fast and doesn't
 * have to be accurate.
 *
 * Providing too large a value makes the allocator slightly faster, at the
 * risk of hitting the failure case more often. Providing too small a value
 * makes the allocator a bit slower, but less likely to hit a failure case.
 */
psize_t
pow2divide(psize_t num, psize_t denom)
{
	int rshift;

	for (rshift = 0; num > denom; rshift++, denom <<= 1)
		;
	return (paddr_t)1 << rshift;
}
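
/*
 * Worked example (illustrative, not part of the original source):
 *
 *	pow2divide(10, 3):
 *	  rshift = 0, denom = 3  -> 10 > 3,   keep shifting
 *	  rshift = 1, denom = 6  -> 10 > 6,   keep shifting
 *	  rshift = 2, denom = 12 -> 10 <= 12, stop
 *	  result: 1 << 2 == 4    (10/3 =~ 3.33, next power-of-2 is 4)
 *
 * Note that pow2divide(4, 4) == 1: the loop never runs when num <= denom.
 */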

/*
 * Predicate: lhs is a subrange of rhs.
 *
 * If rhs_low == 0: don't care about lower bound.
 * If rhs_high == 0: don't care about upper bound.
 */
#define PMR_IS_SUBRANGE_OF(lhs_low, lhs_high, rhs_low, rhs_high)	\
	(((rhs_low) == 0 || (lhs_low) >= (rhs_low)) &&			\
	((rhs_high) == 0 || (lhs_high) <= (rhs_high)))

/*
 * Predicate: lhs intersects with rhs.
 *
 * If rhs_low == 0: don't care about lower bound.
 * If rhs_high == 0: don't care about upper bound.
 * Ranges don't intersect if they don't have any page in common, array
 * semantics mean that < instead of <= should be used here.
 */
#define PMR_INTERSECTS_WITH(lhs_low, lhs_high, rhs_low, rhs_high)	\
	(((rhs_low) == 0 || (rhs_low) < (lhs_high)) &&			\
	((rhs_high) == 0 || (lhs_low) < (rhs_high)))

/*
 * Align to power-of-2 alignment.
 */
#define PMR_ALIGN(pgno, align)						\
	(((pgno) + ((align) - 1)) & ~((align) - 1))
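
/*
 * Worked examples (illustrative, not part of the original source); all
 * ranges are half-open page-number intervals [low, high):
 *
 *	PMR_IS_SUBRANGE_OF(5, 10, 4, 16)    == 1   [5,10) fits in [4,16)
 *	PMR_INTERSECTS_WITH(10, 20, 20, 30) == 0   adjacent, no common page
 *	PMR_INTERSECTS_WITH(10, 21, 20, 30) == 1   page 20 is shared
 *	PMR_ALIGN(5, 4) == 8, PMR_ALIGN(8, 4) == 8
 */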

/*
 * Comparator: sort by address ascending.
 */
int
uvm_pmemrange_addr_cmp(struct uvm_pmemrange *lhs, struct uvm_pmemrange *rhs)
{
	return lhs->low < rhs->low ? -1 : lhs->low > rhs->low;
}

/*
 * Comparator: sort by use ascending.
 *
 * The higher the use value of a range, the more devices need memory in
 * this range. Therefore allocate from the range with the lowest use first.
 */
int
uvm_pmemrange_use_cmp(struct uvm_pmemrange *lhs, struct uvm_pmemrange *rhs)
{
	int result;

	result = lhs->use < rhs->use ? -1 : lhs->use > rhs->use;
	if (result == 0)
		result = uvm_pmemrange_addr_cmp(lhs, rhs);
	return result;
}

int
uvm_pmr_addr_cmp(struct vm_page *lhs, struct vm_page *rhs)
{
	paddr_t lhs_addr, rhs_addr;

	lhs_addr = VM_PAGE_TO_PHYS(lhs);
	rhs_addr = VM_PAGE_TO_PHYS(rhs);

	return (lhs_addr < rhs_addr ? -1 : lhs_addr > rhs_addr);
}

int
uvm_pmr_size_cmp(struct vm_page *lhs, struct vm_page *rhs)
{
	psize_t lhs_size, rhs_size;
	int cmp;

	/* Using second tree, so we receive pg[1] instead of pg[0]. */
	lhs_size = (lhs - 1)->fpgsz;
	rhs_size = (rhs - 1)->fpgsz;

	cmp = (lhs_size < rhs_size ? -1 : lhs_size > rhs_size);
	if (cmp == 0)
		cmp = uvm_pmr_addr_cmp(lhs - 1, rhs - 1);
	return cmp;
}
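
/*
 * Illustration of the pg[1] convention (not part of the original source):
 * a free range of 4 pages starting at pg is linked into the size tree
 * through the objt field of its *second* page, so the head page's objt
 * stays available for the addr tree:
 *
 *	pg[0]: fpgsz = 4, objt used by the addr tree
 *	pg[1]: objt used by the size tree; the comparator steps back one
 *	       page ((lhs - 1)->fpgsz) to read the size stored in pg[0].
 */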

/*
 * Find the first range of free pages that is at least sz pages long.
 */
struct vm_page *
uvm_pmr_nfindsz(struct uvm_pmemrange *pmr, psize_t sz, int mti)
{
	struct vm_page *node, *best;

	KASSERT(sz >= 1);

	if (sz == 1 && !TAILQ_EMPTY(&pmr->single[mti]))
		return TAILQ_FIRST(&pmr->single[mti]);

	node = RB_ROOT(&pmr->size[mti]);
	best = NULL;
	while (node != NULL) {
		if ((node - 1)->fpgsz >= sz) {
			best = (node - 1);
			node = RB_LEFT(node, objt);
		} else
			node = RB_RIGHT(node, objt);
	}
	return best;
}

/*
 * Finds the next range. The next range has a size >= pg->fpgsz.
 * Returns NULL if no more ranges are available.
 */
struct vm_page *
uvm_pmr_nextsz(struct uvm_pmemrange *pmr, struct vm_page *pg, int mt)
{
	struct vm_page *npg;

	KASSERT(pmr != NULL && pg != NULL);
	if (pg->fpgsz == 1) {
		if (TAILQ_NEXT(pg, pageq) != NULL)
			return TAILQ_NEXT(pg, pageq);
		else
			npg = RB_MIN(uvm_pmr_size, &pmr->size[mt]);
	} else
		npg = RB_NEXT(uvm_pmr_size, &pmr->size[mt], pg + 1);

	return npg == NULL ? NULL : npg - 1;
}

/*
 * Finds the previous and next ranges relative to the (uninserted) pg range.
 *
 * *pg_prev == NULL if no previous range that can join with pg is available.
 * *pg_next == NULL if no next range that can join with pg is available.
 */
void
uvm_pmr_pnaddr(struct uvm_pmemrange *pmr, struct vm_page *pg,
    struct vm_page **pg_prev, struct vm_page **pg_next)
{
	KASSERT(pg_prev != NULL && pg_next != NULL);

	*pg_next = RB_NFIND(uvm_pmr_addr, &pmr->addr, pg);
	if (*pg_next == NULL)
		*pg_prev = RB_MAX(uvm_pmr_addr, &pmr->addr);
	else
		*pg_prev = RB_PREV(uvm_pmr_addr, &pmr->addr, *pg_next);

	KDASSERT(*pg_next == NULL ||
	    VM_PAGE_TO_PHYS(*pg_next) > VM_PAGE_TO_PHYS(pg));
	KDASSERT(*pg_prev == NULL ||
	    VM_PAGE_TO_PHYS(*pg_prev) < VM_PAGE_TO_PHYS(pg));

	/* Reset if not contig. */
	if (*pg_prev != NULL &&
	    (atop(VM_PAGE_TO_PHYS(*pg_prev)) + (*pg_prev)->fpgsz
	    != atop(VM_PAGE_TO_PHYS(pg)) ||
	    *pg_prev + (*pg_prev)->fpgsz != pg ||	/* Array broke. */
	    uvm_pmr_pg_to_memtype(*pg_prev) != uvm_pmr_pg_to_memtype(pg)))
		*pg_prev = NULL;
	if (*pg_next != NULL &&
	    (atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz
	    != atop(VM_PAGE_TO_PHYS(*pg_next)) ||
	    pg + pg->fpgsz != *pg_next ||	/* Array broke. */
	    uvm_pmr_pg_to_memtype(*pg_next) != uvm_pmr_pg_to_memtype(pg)))
		*pg_next = NULL;
	return;
}
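
/*
 * Join example (illustrative, not part of the original source): when
 * freeing a 4-page range pg at pfn 104, a candidate predecessor at
 * pfn 100 with fpgsz == 4 is kept only if all three conditions hold:
 *
 *	atop(phys(prev)) + 4 == 104	physically contiguous
 *	prev + 4 == pg			same struct vm_page array
 *	memtype(prev) == memtype(pg)	both zeroed or both dirty
 *
 * Otherwise *pg_prev is reset to NULL and no join happens on that side.
 */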

/*
 * Remove a range from the address tree.
 * Address tree maintains pmr counters.
 */
void
uvm_pmr_remove_addr(struct uvm_pmemrange *pmr, struct vm_page *pg)
{
	KDASSERT(RB_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg);
	KDASSERT(pg->pg_flags & PQ_FREE);
	RB_REMOVE(uvm_pmr_addr, &pmr->addr, pg);

	pmr->nsegs--;
}
/*
 * Remove a range from the size tree.
 */
void
uvm_pmr_remove_size(struct uvm_pmemrange *pmr, struct vm_page *pg)
{
	int memtype;
#ifdef DEBUG
	struct vm_page *i;
#endif

	KDASSERT(pg->fpgsz >= 1);
	KDASSERT(pg->pg_flags & PQ_FREE);
	memtype = uvm_pmr_pg_to_memtype(pg);

	if (pg->fpgsz == 1) {
#ifdef DEBUG
		TAILQ_FOREACH(i, &pmr->single[memtype], pageq) {
			if (i == pg)
				break;
		}
		KDASSERT(i == pg);
#endif
		TAILQ_REMOVE(&pmr->single[memtype], pg, pageq);
	} else {
		KDASSERT(RB_FIND(uvm_pmr_size, &pmr->size[memtype],
		    pg + 1) == pg + 1);
		RB_REMOVE(uvm_pmr_size, &pmr->size[memtype], pg + 1);
	}
}
/* Remove from both trees. */
void
uvm_pmr_remove(struct uvm_pmemrange *pmr, struct vm_page *pg)
{
	uvm_pmr_assertvalid(pmr);
	uvm_pmr_remove_size(pmr, pg);
	uvm_pmr_remove_addr(pmr, pg);
	uvm_pmr_assertvalid(pmr);
}

/*
 * Insert the range described in pg.
 * Returns the range thus created (which may be joined with the previous and
 * next ranges).
 * If no_join, the caller guarantees that the range cannot possibly join
 * with adjacent ranges.
 */
struct vm_page *
uvm_pmr_insert_addr(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join)
{
	struct vm_page *prev, *next;

#ifdef DEBUG
	struct vm_page *i;
	int mt;
#endif

	KDASSERT(pg->pg_flags & PQ_FREE);
	KDASSERT(pg->fpgsz >= 1);

#ifdef DEBUG
	for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
		TAILQ_FOREACH(i, &pmr->single[mt], pageq)
			KDASSERT(i != pg);
		if (pg->fpgsz > 1) {
			KDASSERT(RB_FIND(uvm_pmr_size, &pmr->size[mt],
			    pg + 1) == NULL);
		}
		KDASSERT(RB_FIND(uvm_pmr_addr, &pmr->addr, pg) == NULL);
	}
#endif

	if (!no_join) {
		uvm_pmr_pnaddr(pmr, pg, &prev, &next);
		if (next != NULL) {
			uvm_pmr_remove_size(pmr, next);
			uvm_pmr_remove_addr(pmr, next);
			pg->fpgsz += next->fpgsz;
			next->fpgsz = 0;
		}
		if (prev != NULL) {
			uvm_pmr_remove_size(pmr, prev);
			prev->fpgsz += pg->fpgsz;
			pg->fpgsz = 0;
			return prev;
		}
	}

	RB_INSERT(uvm_pmr_addr, &pmr->addr, pg);

	pmr->nsegs++;

	return pg;
}
/*
 * Insert the range described in pg into the size tree or, for a
 * single-page range, the single-page queue.
 * Page must already be in the address tree.
 */
void
uvm_pmr_insert_size(struct uvm_pmemrange *pmr, struct vm_page *pg)
{
	int memtype;
#ifdef DEBUG
	struct vm_page *i;
	int mti;
#endif

	KDASSERT(pg->fpgsz >= 1);
	KDASSERT(pg->pg_flags & PQ_FREE);

	memtype = uvm_pmr_pg_to_memtype(pg);
#ifdef DEBUG
	for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) {
		TAILQ_FOREACH(i, &pmr->single[mti], pageq)
			KDASSERT(i != pg);
		if (pg->fpgsz > 1) {
			KDASSERT(RB_FIND(uvm_pmr_size, &pmr->size[mti],
			    pg + 1) == NULL);
		}
		KDASSERT(RB_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg);
	}
	for (i = pg; i < pg + pg->fpgsz; i++)
		KASSERT(uvm_pmr_pg_to_memtype(i) == memtype);
#endif

	if (pg->fpgsz == 1)
		TAILQ_INSERT_TAIL(&pmr->single[memtype], pg, pageq);
	else
		RB_INSERT(uvm_pmr_size, &pmr->size[memtype], pg + 1);
}
/* Insert in both trees. */
struct vm_page *
uvm_pmr_insert(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join)
{
	uvm_pmr_assertvalid(pmr);
	pg = uvm_pmr_insert_addr(pmr, pg, no_join);
	uvm_pmr_insert_size(pmr, pg);
	uvm_pmr_assertvalid(pmr);
	return pg;
}
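
/*
 * Coalescing example (illustrative, not part of the original source):
 * with free ranges [100..103] and [108..115] already in the trees,
 * inserting the range [104..107] with no_join == 0 proceeds as follows:
 *
 *	next = [108..115]: removed, its 8 pages absorbed, so pg covers
 *	    [104..115] and next->fpgsz becomes 0
 *	prev = [100..103]: absorbs pg, so prev covers [100..115],
 *	    pg->fpgsz becomes 0 and prev is returned
 *
 * Only the head page of the resulting range keeps a non-zero fpgsz.
 */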

/*
 * Find the last page that is part of this segment.
 * => pg: the range at which to start the search.
 * => boundary: the page number boundary specification (0 = no boundary).
 * => pmr: the pmemrange of the page.
 *
 * This function returns the page one before the next range, so if you want
 * the next range, run TAILQ_NEXT(result, pageq) after calling.
 * The reason is that this way, the length of the segment is easily
 * calculated using: atop(result) - atop(pg) + 1.
 * Hence this function also never returns NULL.
 */
struct vm_page *
uvm_pmr_findnextsegment(struct uvm_pmemrange *pmr,
    struct vm_page *pg, paddr_t boundary)
{
	paddr_t	first_boundary;
	struct	vm_page *next;
	struct	vm_page *prev;

	KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)) &&
	    pmr->high > atop(VM_PAGE_TO_PHYS(pg)));
	if (boundary != 0) {
		first_boundary =
		    PMR_ALIGN(atop(VM_PAGE_TO_PHYS(pg)) + 1, boundary);
	} else
		first_boundary = 0;

	/*
	 * Increase next until it hits the first page of the next segment.
	 *
	 * While loop checks the following:
	 * - next != NULL	we have not reached the end of pgl
	 * - boundary == 0 || next < first_boundary
	 *			we do not cross a boundary
	 * - atop(prev) + 1 == atop(next)
	 *			still in the same segment
	 * - low <= last
	 * - high > last	still in the same memory range
	 * - memtype is equal	allocator is unable to view different memtypes
	 *			as part of the same segment
	 * - prev + 1 == next	no array breakage occurs
	 */
	prev = pg;
	next = TAILQ_NEXT(prev, pageq);
	while (next != NULL &&
	    (boundary == 0 || atop(VM_PAGE_TO_PHYS(next)) < first_boundary) &&
	    atop(VM_PAGE_TO_PHYS(prev)) + 1 == atop(VM_PAGE_TO_PHYS(next)) &&
	    pmr->low <= atop(VM_PAGE_TO_PHYS(next)) &&
	    pmr->high > atop(VM_PAGE_TO_PHYS(next)) &&
	    uvm_pmr_pg_to_memtype(prev) == uvm_pmr_pg_to_memtype(next) &&
	    prev + 1 == next) {
		prev = next;
		next = TAILQ_NEXT(prev, pageq);
	}

	/*
	 * End of this segment.
	 */
	return prev;
}
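
/*
 * Boundary example (illustrative, not part of the original source): with
 * boundary == 16 and pg at pfn 30, first_boundary = PMR_ALIGN(31, 16) = 32.
 * The walk may include pfn 31 but stops before pfn 32, so a physically
 * contiguous run 30..40 is reported as the segment 30..31; the remainder
 * is picked up as a separate segment on a later call.
 */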

/*
 * Remove the first segment of contiguous pages from pgl.
 * A segment ends if it crosses boundary (unless boundary = 0) or
 * if it would enter a different uvm_pmemrange.
 *
 * Work: the page range that the caller is currently working with.
 * May be NULL.
 *
 * If is_desperate is non-zero, the smallest segment is erased. Otherwise,
 * the first segment is erased (which, if called by uvm_pmr_getpages(),
 * probably is the smallest or very close to it).
 */
psize_t
uvm_pmr_remove_1strange(struct pglist *pgl, paddr_t boundary,
    struct vm_page **work, int is_desperate)
{
	struct vm_page *start, *end, *iter, *iter_end, *inserted;
	psize_t count;
	struct uvm_pmemrange *pmr, *pmr_iter;

	KASSERT(!TAILQ_EMPTY(pgl));

	/*
	 * Initialize to first page.
	 * Unless desperate scan finds a better candidate, this is what'll be
	 * erased.
	 */
	start = TAILQ_FIRST(pgl);
	pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(start)));
	end = uvm_pmr_findnextsegment(pmr, start, boundary);

	/*
	 * If we are desperate, we _really_ want to get rid of the smallest
	 * element (rather than a close match to the smallest element).
	 */
	if (is_desperate) {
		/* Linear search for smallest segment. */
		pmr_iter = pmr;
		for (iter = TAILQ_NEXT(end, pageq);
		    iter != NULL && start != end;
		    iter = TAILQ_NEXT(iter_end, pageq)) {
			/*
			 * Only update pmr if it doesn't match current
			 * iteration.
			 */
			if (pmr->low > atop(VM_PAGE_TO_PHYS(iter)) ||
			    pmr->high <= atop(VM_PAGE_TO_PHYS(iter))) {
				pmr_iter = uvm_pmemrange_find(atop(
				    VM_PAGE_TO_PHYS(iter)));
			}

			iter_end = uvm_pmr_findnextsegment(pmr_iter, iter,
			    boundary);

			/*
			 * Current iteration is smaller than best match so
			 * far; update.
			 */
			if (VM_PAGE_TO_PHYS(iter_end) - VM_PAGE_TO_PHYS(iter) <
			    VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) {
				start = iter;
				end = iter_end;
				pmr = pmr_iter;
			}
		}
	}

	/*
	 * Calculate count and end of the list.
	 */
	count = atop(VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) + 1;
	end = TAILQ_NEXT(end, pageq);

	/*
	 * Actually remove the range of pages.
	 *
	 * Sadly, this cannot be done using pointer iteration:
	 * vm_physseg is not guaranteed to be sorted on address, hence
	 * uvm_page_init() may not have initialized its array sorted by
	 * page number.
	 */
	for (iter = start; iter != end; iter = iter_end) {
		iter_end = TAILQ_NEXT(iter, pageq);
		TAILQ_REMOVE(pgl, iter, pageq);
	}

	start->fpgsz = count;
	inserted = uvm_pmr_insert(pmr, start, 0);

	/*
	 * If the caller was working on a range and this function modified
	 * that range, update the pointer.
	 */
	if (work != NULL && *work != NULL &&
	    atop(VM_PAGE_TO_PHYS(inserted)) <= atop(VM_PAGE_TO_PHYS(*work)) &&
	    atop(VM_PAGE_TO_PHYS(inserted)) + inserted->fpgsz >
	    atop(VM_PAGE_TO_PHYS(*work)))
		*work = inserted;
	return count;
}

/*
 * Extract a number of pages from a segment of free pages.
 * Called by uvm_pmr_getpages.
 *
 * Returns the segment that was created from pages left over at the tail
 * of the removed set of pages, or NULL if no pages were left at the tail.
 */
struct vm_page *
uvm_pmr_extract_range(struct uvm_pmemrange *pmr, struct vm_page *pg,
    paddr_t start, paddr_t end, struct pglist *result)
{
	struct vm_page *after, *pg_i;
	psize_t before_sz, after_sz;
#ifdef DEBUG
	psize_t i;
#endif

	KDASSERT(end > start);
	KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)));
	KDASSERT(pmr->high >= atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz);
	KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) <= start);
	KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz >= end);

	before_sz = start - atop(VM_PAGE_TO_PHYS(pg));
	after_sz = atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz - end;
	KDASSERT(before_sz + after_sz + (end - start) == pg->fpgsz);
	uvm_pmr_assertvalid(pmr);

	uvm_pmr_remove_size(pmr, pg);
	if (before_sz == 0)
		uvm_pmr_remove_addr(pmr, pg);
	after = pg + before_sz + (end - start);

	/* Add selected pages to result. */
	for (pg_i = pg + before_sz; pg_i != after; pg_i++) {
		KASSERT(pg_i->pg_flags & PQ_FREE);
		pg_i->fpgsz = 0;
		TAILQ_INSERT_TAIL(result, pg_i, pageq);
	}

	/* Before handling. */
	if (before_sz > 0) {
		pg->fpgsz = before_sz;
		uvm_pmr_insert_size(pmr, pg);
	}

	/* After handling. */
	if (after_sz > 0) {
#ifdef DEBUG
		for (i = 0; i < after_sz; i++) {
			KASSERT(!uvm_pmr_isfree(after + i));
		}
#endif
		KDASSERT(atop(VM_PAGE_TO_PHYS(after)) == end);
		after->fpgsz = after_sz;
		after = uvm_pmr_insert_addr(pmr, after, 1);
		uvm_pmr_insert_size(pmr, after);
	}

	uvm_pmr_assertvalid(pmr);
	return (after_sz > 0 ? after : NULL);
}
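
/*
 * Extraction layout (illustrative, not part of the original source): for
 * a free range pg of 10 pages at pfn 100 and an extraction of [103, 107):
 *
 *	pfn:	100........102 103........106 107........109
 *		[ before, 3   ][ extracted, 4][ after, 3    ]
 *
 * The "before" part keeps pg as its head (fpgsz = 3), the extracted pages
 * are appended to result with fpgsz = 0, and the "after" part is inserted
 * as a new range headed by pg + 7 and returned.
 */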

/*
 * Acquire a number of pages.
 *
 * count:	the number of pages returned
 * start:	lowest page number
 * end:		highest page number +1
 *		(start = end = 0: no limitation)
 * align:	power-of-2 alignment constraint (align = 1: no alignment)
 * boundary:	power-of-2 boundary (boundary = 0: no boundary)
 * maxseg:	maximum number of segments to return
 * flags:	UVM_PLA_* flags
 * result:	returned pages storage (uses pageq)
 */
int
uvm_pmr_getpages(psize_t count, paddr_t start, paddr_t end, paddr_t align,
    paddr_t boundary, int maxseg, int flags, struct pglist *result)
{
	struct	uvm_pmemrange *pmr;	/* Iterate memory ranges. */
	struct	vm_page *found, *f_next; /* Iterate chunks. */
	psize_t	fcount;			/* Current found pages. */
	int	fnsegs;			/* Current segment counter. */
	int	try, start_try;
	psize_t	search[3];
	paddr_t	fstart, fend;		/* Pages to be taken from found. */
	int	memtype;		/* Requested memtype. */
	int	memtype_init;		/* Best memtype. */
	int	desperate;		/* True if allocation failed. */
#ifdef DIAGNOSTIC
	struct	vm_page *diag_prev;	/* Used during validation. */
#endif /* DIAGNOSTIC */

	/*
	 * Validate arguments.
	 */
	KASSERT(count > 0);
	KASSERT(start == 0 || end == 0 || start < end);
	KASSERT(align >= 1);
	KASSERT(powerof2(align));
	KASSERT(maxseg > 0);
	KASSERT(boundary == 0 || powerof2(boundary));
	KASSERT(boundary == 0 || maxseg * boundary >= count);
	KASSERT(TAILQ_EMPTY(result));

	/*
	 * TRYCONTIG is a noop if you only want a single segment.
	 * Remove it if that's the case: otherwise it'll deny the fast
	 * allocation.
	 */
	if (maxseg == 1 || count == 1)
		flags &= ~UVM_PLA_TRYCONTIG;

	/*
	 * Configure search.
	 *
	 * search[0] is one segment, only used in UVM_PLA_TRYCONTIG case.
	 * search[1] is multiple segments, chosen to fulfill the search in
	 * approximately even-sized segments.
	 * This is a good trade-off between slightly reduced allocation speed
	 * and less fragmentation.
	 * search[2] is the worst case, in which all segments are evaluated.
	 * This provides the least fragmentation, but makes the search
	 * possibly longer (although when this case is selected, search time
	 * matters less anyway).
	 *
	 * The exception is when maxseg == 1: since we can only fulfill that
	 * with one segment of count pages, only a single search type has to
	 * be attempted.
	 */
	if (maxseg == 1 || count == 1) {
		start_try = 2;
		search[2] = count;
	} else if (maxseg >= count && (flags & UVM_PLA_TRYCONTIG) == 0) {
		start_try = 2;
		search[2] = 1;
	} else {
		start_try = 0;
		search[0] = count;
		search[1] = pow2divide(count, maxseg);
		search[2] = 1;
		if ((flags & UVM_PLA_TRYCONTIG) == 0)
			start_try = 1;
		if (search[1] >= search[0]) {
			search[1] = search[0];
			start_try = 1;
		}
		if (search[2] >= search[start_try]) {
			start_try = 2;
		}
	}
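
	/*
	 * Worked example (illustrative, not part of the original source):
	 * count = 16, maxseg = 5, UVM_PLA_TRYCONTIG set:
	 *
	 *	search[0] = 16, search[1] = pow2divide(16, 5) = 4,
	 *	search[2] = 1, start_try = 0
	 *
	 * so one contiguous 16-page chunk is tried first, then ~4-page
	 * chunks, then any free chunk at all. Without UVM_PLA_TRYCONTIG,
	 * the same request starts at try 1.
	 */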

	/*
	 * Memory type: if zeroed memory is requested, traverse the zero set.
	 * Otherwise, traverse the dirty set.
	 *
	 * The memtype iterator is reinitialized to memtype_init on entrance
	 * of a pmemrange.
	 */
	if (flags & UVM_PLA_ZERO)
		memtype_init = UVM_PMR_MEMTYPE_ZERO;
	else
		memtype_init = UVM_PMR_MEMTYPE_DIRTY;

	/*
	 * Initially, we're not desperate.
	 *
	 * Note that if we return from a sleep, we are still desperate.
	 * Chances are that memory pressure is still high, so resetting
	 * seems over-optimistic to me.
	 */
	desperate = 0;

	uvm_lock_fpageq();

retry:		/* Return point after sleeping. */
	fcount = 0;
	fnsegs = 0;

retry_desperate:
	/*
	 * If we just want any page(s), go for the really fast option.
	 */
	if (count <= maxseg && align == 1 && boundary == 0 &&
	    (flags & UVM_PLA_TRYCONTIG) == 0) {
		fcount += uvm_pmr_get1page(count - fcount, memtype_init,
		    result, start, end, 0);

		/*
		 * If we found sufficient pages, go to the success exit code.
		 *
		 * Otherwise, go immediately to fail, since we collected
		 * all we could anyway.
		 */
		if (fcount == count)
			goto out;
		else
			goto fail;
	}

	/*
	 * The heart of the contig case.
	 *
	 * The code actually looks like this:
	 *
	 * foreach (struct pmemrange) {
	 *	foreach (memtype) {
	 *		foreach(try) {
	 *			foreach (free range of memtype in pmemrange,
	 *			    starting at search[try]) {
	 *				while (range has space left)
	 *					take from range
	 *			}
	 *		}
	 *	}
	 *
	 *	if next pmemrange has higher usecount than current:
	 *		enter desperate case (which will drain the pmemranges
	 *		until empty prior to moving to the next one)
	 * }
	 *
	 * When desperate is activated, try always starts at the highest
	 * value. The memtype loop is using a goto ReScanMemtype.
	 * The try loop is using a goto ReScan.
	 * The 'range has space left' loop uses label DrainFound.
	 *
	 * Writing them all as loops would take up a lot of screen space in
	 * the form of indentation and some parts are easier to express
	 * using the labels.
	 */

	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
		/* Empty range. */
		if (pmr->nsegs == 0)
			continue;

		/* Outside requested range. */
		if (!PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end))
			continue;

		memtype = memtype_init;

rescan_memtype:	/* Return point at memtype++. */
		try = start_try;

rescan:		/* Return point at try++. */
		for (found = uvm_pmr_nfindsz(pmr, search[try], memtype);
		    found != NULL;
		    found = f_next) {
			f_next = uvm_pmr_nextsz(pmr, found, memtype);

			fstart = atop(VM_PAGE_TO_PHYS(found));
			if (start != 0)
				fstart = MAX(start, fstart);
drain_found:
			/*
			 * Throw away the first segment if fnsegs == maxseg
			 *
			 * Note that f_next is still valid after this call,
			 * since we only allocated from entries before f_next.
			 * We don't revisit the entries we already extracted
			 * from unless we entered the desperate case.
			 */
			if (fnsegs == maxseg) {
				fnsegs--;
				fcount -=
				    uvm_pmr_remove_1strange(result, boundary,
				    &found, desperate);
			}

			fstart = PMR_ALIGN(fstart, align);
			fend = atop(VM_PAGE_TO_PHYS(found)) + found->fpgsz;
			if (end != 0)
				fend = MIN(end, fend);
			if (boundary != 0) {
				fend =
				    MIN(fend, PMR_ALIGN(fstart + 1, boundary));
			}
			if (fstart >= fend)
				continue;
			if (fend - fstart > count - fcount)
				fend = fstart + (count - fcount);

			fcount += fend - fstart;
			fnsegs++;
			found = uvm_pmr_extract_range(pmr, found,
			    fstart, fend, result);

			if (fcount == count)
				goto out;

			/*
			 * If there's still space left in found, try to
			 * fully drain it prior to continuing.
			 */
			if (found != NULL) {
				fstart = fend;
				goto drain_found;
			}
		}

		/* Try a smaller search now. */
		if (++try < nitems(search))
			goto rescan;

		/*
		 * Exhaust all memory types prior to going to the next memory
		 * segment.
		 * This means that zero-vs-dirty are eaten prior to moving
		 * to a pmemrange with a higher use-count.
		 *
		 * Code is basically a difficult way of writing:
		 * memtype = memtype_init;
		 * do {
		 *	...;
		 *	memtype += 1;
		 *	memtype %= MEMTYPE_MAX;
		 * } while (memtype != memtype_init);
		 */
		memtype += 1;
		if (memtype == UVM_PMR_MEMTYPE_MAX)
			memtype = 0;
		if (memtype != memtype_init)
			goto rescan_memtype;

		/*
		 * If not desperate, enter desperate case prior to eating all
		 * the good stuff in the next range.
		 */
		if (!desperate && TAILQ_NEXT(pmr, pmr_use) != NULL &&
		    TAILQ_NEXT(pmr, pmr_use)->use != pmr->use)
			break;
	}

	/*
	 * Not enough memory of the requested type available. Fall back to
	 * less good memory that we'll clean up better later.
	 *
	 * This algorithm is not very smart though, it just starts scanning
	 * a different typed range, but the nicer ranges of the previous
	 * iteration may fall out. Hence there is a small chance of a false
	 * negative.
	 *
	 * When desperate: scan all sizes starting at the smallest
	 * (start_try = 1) and do not consider UVM_PLA_TRYCONTIG (which may
	 * allow us to hit the fast path now).
	 *
	 * Also, because we will revisit entries we scanned before, we need
	 * to reset the page queue, or we may end up releasing entries in
	 * such a way as to invalidate f_next.
	 */
	if (!desperate) {
		desperate = 1;
		start_try = nitems(search) - 1;
		flags &= ~UVM_PLA_TRYCONTIG;

		while (!TAILQ_EMPTY(result))
			uvm_pmr_remove_1strange(result, 0, NULL, 0);
		fnsegs = 0;
		fcount = 0;
		goto retry_desperate;
	}

fail:
	/* Allocation failed. */
	/* XXX: claim from memory reserve here */

	while (!TAILQ_EMPTY(result))
		uvm_pmr_remove_1strange(result, 0, NULL, 0);

	if (flags & UVM_PLA_WAITOK) {
		if (uvm_wait_pla(ptoa(start), ptoa(end) - 1, ptoa(count),
		    flags & UVM_PLA_FAILOK) == 0)
			goto retry;
		KASSERT(flags & UVM_PLA_FAILOK);
	} else
		wakeup(&uvm.pagedaemon);
	uvm_unlock_fpageq();

	return ENOMEM;

out:
	/* Allocation successful. */
	uvmexp.free -= fcount;

	uvm_unlock_fpageq();

	/* Update statistics and zero pages if UVM_PLA_ZERO. */
#ifdef DIAGNOSTIC
	fnsegs = 0;
	fcount = 0;
	diag_prev = NULL;
#endif /* DIAGNOSTIC */
	TAILQ_FOREACH(found, result, pageq) {
		atomic_clearbits_int(&found->pg_flags, PG_PMAPMASK);

		if (found->pg_flags & PG_ZERO) {
			uvm_lock_fpageq();
			uvmexp.zeropages--;
			if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
				wakeup(&uvmexp.zeropages);
			uvm_unlock_fpageq();
		}
		if (flags & UVM_PLA_ZERO) {
			if (found->pg_flags & PG_ZERO)
				uvmexp.pga_zerohit++;
			else {
				uvmexp.pga_zeromiss++;
				uvm_pagezero(found);
			}
		}
		atomic_clearbits_int(&found->pg_flags, PG_ZERO|PQ_FREE);

		found->uobject = NULL;
		found->uanon = NULL;
		found->pg_version++;

		/*
		 * Validate that the page matches range criterion.
		 */
		KDASSERT(start == 0 || atop(VM_PAGE_TO_PHYS(found)) >= start);
		KDASSERT(end == 0 || atop(VM_PAGE_TO_PHYS(found)) < end);

#ifdef DIAGNOSTIC
		/*
		 * Update fcount (# found pages) and
		 * fnsegs (# found segments) counters.
		 */
		if (diag_prev == NULL ||
		    /* new segment if it contains a hole */
		    atop(VM_PAGE_TO_PHYS(diag_prev)) + 1 !=
		    atop(VM_PAGE_TO_PHYS(found)) ||
		    /* new segment if it crosses boundary */
		    (atop(VM_PAGE_TO_PHYS(diag_prev)) & ~(boundary - 1)) !=
		    (atop(VM_PAGE_TO_PHYS(found)) & ~(boundary - 1)))
			fnsegs++;
		fcount++;

		diag_prev = found;
#endif /* DIAGNOSTIC */
	}

#ifdef DIAGNOSTIC
	/*
	 * Panic on algorithm failure.
	 */
	if (fcount != count || fnsegs > maxseg) {
		panic("pmemrange allocation error: "
		    "allocated %ld pages in %d segments, "
		    "but request was %ld pages in %d segments",
		    fcount, fnsegs, count, maxseg);
	}
#endif /* DIAGNOSTIC */

	return 0;
}
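
/*
 * Usage sketch (illustrative, not part of the original source): allocate
 * four zeroed pages from the first 16 MB of physical memory as a single
 * segment, sleeping for the pagedaemon if needed:
 *
 *	struct pglist pgl;
 *
 *	TAILQ_INIT(&pgl);
 *	if (uvm_pmr_getpages(4, 0, atop(16 * 1024 * 1024), 1, 0, 1,
 *	    UVM_PLA_ZERO | UVM_PLA_WAITOK, &pgl) == 0) {
 *		... pages are linked on pgl via pageq ...
 *	}
 */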

/*
 * Free a number of contig pages (invoked by uvm_page_init).
 */
void
uvm_pmr_freepages(struct vm_page *pg, psize_t count)
{
	struct uvm_pmemrange *pmr;
	psize_t i, pmr_count;
	struct vm_page *firstpg = pg;

	for (i = 0; i < count; i++) {
		KASSERT(atop(VM_PAGE_TO_PHYS(&pg[i])) ==
		    atop(VM_PAGE_TO_PHYS(pg)) + i);

		if (!((pg[i].pg_flags & PQ_FREE) == 0 &&
		    VALID_FLAGS(pg[i].pg_flags))) {
			printf("Flags: 0x%x, will panic now.\n",
			    pg[i].pg_flags);
		}
		KASSERT((pg[i].pg_flags & PQ_FREE) == 0 &&
		    VALID_FLAGS(pg[i].pg_flags));
		atomic_setbits_int(&pg[i].pg_flags, PQ_FREE);
		atomic_clearbits_int(&pg[i].pg_flags, PG_ZERO);
	}

	uvm_lock_fpageq();

	for (i = count; i > 0; i -= pmr_count) {
		pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg)));
		KASSERT(pmr != NULL);

		pmr_count = MIN(i, pmr->high - atop(VM_PAGE_TO_PHYS(pg)));
		pg->fpgsz = pmr_count;
		uvm_pmr_insert(pmr, pg, 0);

		uvmexp.free += pmr_count;
		pg += pmr_count;
	}
	wakeup(&uvmexp.free);
	if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
		wakeup(&uvmexp.zeropages);

	uvm_wakeup_pla(VM_PAGE_TO_PHYS(firstpg), ptoa(count));

	uvm_unlock_fpageq();
}

/*
 * Free all pages in the queue.
 */
void
uvm_pmr_freepageq(struct pglist *pgl)
{
	struct vm_page *pg;
	paddr_t pstart;
	psize_t plen;

	TAILQ_FOREACH(pg, pgl, pageq) {
		if (!((pg->pg_flags & PQ_FREE) == 0 &&
		    VALID_FLAGS(pg->pg_flags))) {
			printf("Flags: 0x%x, will panic now.\n",
			    pg->pg_flags);
		}
		KASSERT((pg->pg_flags & PQ_FREE) == 0 &&
		    VALID_FLAGS(pg->pg_flags));
		atomic_setbits_int(&pg->pg_flags, PQ_FREE);
		atomic_clearbits_int(&pg->pg_flags, PG_ZERO);
	}

	uvm_lock_fpageq();
	while (!TAILQ_EMPTY(pgl)) {
		pstart = VM_PAGE_TO_PHYS(TAILQ_FIRST(pgl));
		plen = uvm_pmr_remove_1strange(pgl, 0, NULL, 0);
		uvmexp.free += plen;

		uvm_wakeup_pla(pstart, ptoa(plen));
	}
	wakeup(&uvmexp.free);
	if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
		wakeup(&uvmexp.zeropages);
	uvm_unlock_fpageq();

	return;
}

/*
 * Store a pmemrange in the list.
 *
 * The list is sorted by use.
 */
struct uvm_pmemrange *
uvm_pmemrange_use_insert(struct uvm_pmemrange_use *useq,
    struct uvm_pmemrange *pmr)
{
	struct uvm_pmemrange *iter;
	int cmp = 1;

	TAILQ_FOREACH(iter, useq, pmr_use) {
		cmp = uvm_pmemrange_use_cmp(pmr, iter);
		if (cmp == 0)
			return iter;
		if (cmp == -1)
			break;
	}

	if (iter == NULL)
		TAILQ_INSERT_TAIL(useq, pmr, pmr_use);
	else
		TAILQ_INSERT_BEFORE(iter, pmr, pmr_use);
	return NULL;
}

#ifdef DEBUG
/*
 * Validation of the whole pmemrange.
 * Called with fpageq locked.
 */
void
uvm_pmr_assertvalid(struct uvm_pmemrange *pmr)
{
	struct vm_page *prev, *next, *i, *xref;
	int lcv, mti;

	/* Empty range */
	if (pmr->nsegs == 0)
		return;

	/* Validate address tree. */
	RB_FOREACH(i, uvm_pmr_addr, &pmr->addr) {
		/* Validate the range. */
		KASSERT(i->fpgsz > 0);
		KASSERT(atop(VM_PAGE_TO_PHYS(i)) >= pmr->low);
		KASSERT(atop(VM_PAGE_TO_PHYS(i)) + i->fpgsz
		    <= pmr->high);

		/* Validate each page in this range. */
		for (lcv = 0; lcv < i->fpgsz; lcv++) {
			/*
			 * Only the first page has a size specification.
			 * Rest is size 0.
			 */
			KASSERT(lcv == 0 || i[lcv].fpgsz == 0);
			/*
			 * Flag check.
			 */
			KASSERT(VALID_FLAGS(i[lcv].pg_flags) &&
			    (i[lcv].pg_flags & PQ_FREE) == PQ_FREE);
			/*
			 * Free pages are:
			 * - not wired
			 * - have no vm_anon
			 * - have no uvm_object
			 */
			KASSERT(i[lcv].wire_count == 0);
			KASSERT(i[lcv].uanon == (void*)0xdeadbeef ||
			    i[lcv].uanon == NULL);
			KASSERT(i[lcv].uobject == (void*)0xdeadbeef ||
			    i[lcv].uobject == NULL);
			/*
			 * Pages in a single range always have the same
			 * memtype.
			 */
			KASSERT(uvm_pmr_pg_to_memtype(&i[0]) ==
			    uvm_pmr_pg_to_memtype(&i[lcv]));
		}

		/* Check that it shouldn't be joined with its predecessor. */
		prev = RB_PREV(uvm_pmr_addr, &pmr->addr, i);
		if (prev != NULL) {
			KASSERT(uvm_pmr_pg_to_memtype(i) !=
			    uvm_pmr_pg_to_memtype(prev) ||
			    atop(VM_PAGE_TO_PHYS(i)) >
			    atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz ||
			    prev + prev->fpgsz != i);
		}

		/* Assert i is in the size tree as well. */
		if (i->fpgsz == 1) {
			TAILQ_FOREACH(xref,
			    &pmr->single[uvm_pmr_pg_to_memtype(i)], pageq) {
				if (xref == i)
					break;
			}
			KASSERT(xref == i);
		} else {
			KASSERT(RB_FIND(uvm_pmr_size,
			    &pmr->size[uvm_pmr_pg_to_memtype(i)], i + 1) ==
			    i + 1);
		}
	}

	/* Validate size tree. */
	for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) {
		for (i = uvm_pmr_nfindsz(pmr, 1, mti); i != NULL; i = next) {
			next = uvm_pmr_nextsz(pmr, i, mti);
			if (next != NULL) {
				KASSERT(i->fpgsz <=
				    next->fpgsz);
			}

			/* Assert i is in the addr tree as well. */
			KASSERT(RB_FIND(uvm_pmr_addr, &pmr->addr, i) == i);

			/* Assert i is of the correct memory type. */
			KASSERT(uvm_pmr_pg_to_memtype(i) == mti);
		}
	}

	/* Validate nsegs statistic. */
	lcv = 0;
	RB_FOREACH(i, uvm_pmr_addr, &pmr->addr)
		lcv++;
	KASSERT(pmr->nsegs == lcv);
}
#endif /* DEBUG */

/*
 * Split pmr at split point pageno.
 * Called with fpageq unlocked.
 *
 * Split is only applied if a pmemrange spans pageno.
 */
void
uvm_pmr_split(paddr_t pageno)
{
	struct uvm_pmemrange *pmr, *drain;
	struct vm_page *rebuild, *prev, *next;
	psize_t prev_sz;

	uvm_lock_fpageq();
	pmr = uvm_pmemrange_find(pageno);
	if (pmr == NULL || !(pmr->low < pageno)) {
		/* No split required. */
		uvm_unlock_fpageq();
		return;
	}

	KASSERT(pmr->low < pageno);
	KASSERT(pmr->high > pageno);

	/*
	 * uvm_pmr_allocpmr() calls into malloc() which in turn calls into
	 * uvm_kmemalloc which calls into pmemrange, making the locking
	 * a bit hard, so we just race!
	 */
	uvm_unlock_fpageq();
	drain = uvm_pmr_allocpmr();
	uvm_lock_fpageq();
	pmr = uvm_pmemrange_find(pageno);
	if (pmr == NULL || !(pmr->low < pageno)) {
		/*
		 * We lost the race: someone else ran this or a related
		 * function. However, this should be triggered very rarely,
		 * so we just leak the pmr.
		 */
		printf("uvm_pmr_split: lost one pmr\n");
		uvm_unlock_fpageq();
		return;
	}

	drain->low = pageno;
	drain->high = pmr->high;
	drain->use = pmr->use;

	uvm_pmr_assertvalid(pmr);
	uvm_pmr_assertvalid(drain);
	KASSERT(drain->nsegs == 0);

	RB_FOREACH(rebuild, uvm_pmr_addr, &pmr->addr) {
		if (atop(VM_PAGE_TO_PHYS(rebuild)) >= pageno)
			break;
	}
	if (rebuild == NULL)
		prev = RB_MAX(uvm_pmr_addr, &pmr->addr);
	else
		prev = RB_PREV(uvm_pmr_addr, &pmr->addr, rebuild);
	KASSERT(prev == NULL || atop(VM_PAGE_TO_PHYS(prev)) < pageno);

	/*
	 * Handle free chunk that spans the split point.
	 */
	if (prev != NULL &&
	    atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz > pageno) {
		psize_t before, after;

		KASSERT(atop(VM_PAGE_TO_PHYS(prev)) < pageno);

		uvm_pmr_remove(pmr, prev);
		prev_sz = prev->fpgsz;
		before = pageno - atop(VM_PAGE_TO_PHYS(prev));
		after = atop(VM_PAGE_TO_PHYS(prev)) + prev_sz - pageno;

		KASSERT(before > 0);
		KASSERT(after > 0);

		prev->fpgsz = before;
		uvm_pmr_insert(pmr, prev, 1);
		(prev + before)->fpgsz = after;
		uvm_pmr_insert(drain, prev + before, 1);
	}

	/* Move free chunks that no longer fall in the range. */
	for (; rebuild != NULL; rebuild = next) {
		next = RB_NEXT(uvm_pmr_addr, &pmr->addr, rebuild);

		uvm_pmr_remove(pmr, rebuild);
		uvm_pmr_insert(drain, rebuild, 1);
	}

	pmr->high = pageno;
	uvm_pmr_assertvalid(pmr);
	uvm_pmr_assertvalid(drain);

	RB_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, drain);
	uvm_pmemrange_use_insert(&uvm.pmr_control.use, drain);
	uvm_unlock_fpageq();
}
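
/*
 * Split example (illustrative, not part of the original source): splitting
 * a pmemrange [0, 256) at pageno 100, with a free chunk of 20 pages at
 * pfn 90 spanning the split point:
 *
 *	before = 100 - 90 = 10	pages [90..99]   stay in pmr   [0, 100)
 *	after  = 110 - 100 = 10	pages [100..109] move to drain [100, 256)
 *
 * Chunks entirely above pageno are moved to drain unchanged.
 */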

/*
 * Increase the usage counter for the given range of memory.
 *
 * The more usage counters a given range of memory has, the harder the
 * allocator will try not to allocate from it.
 *
 * Addresses here are in paddr_t, not page-numbers.
 * The lowest and highest allowed address are specified.
 */
void
uvm_pmr_use_inc(paddr_t low, paddr_t high)
{
	struct uvm_pmemrange *pmr;
	paddr_t sz;

	/* pmr uses page numbers, translate low and high. */
	high++;
	high = atop(trunc_page(high));
	low = atop(round_page(low));
	uvm_pmr_split(low);
	uvm_pmr_split(high);

	sz = 0;
	uvm_lock_fpageq();
	/* Increase use count on segments in range. */
	RB_FOREACH(pmr, uvm_pmemrange_addr, &uvm.pmr_control.addr) {
		if (PMR_IS_SUBRANGE_OF(pmr->low, pmr->high, low, high)) {
			TAILQ_REMOVE(&uvm.pmr_control.use, pmr, pmr_use);
			pmr->use++;
			sz += pmr->high - pmr->low;
			uvm_pmemrange_use_insert(&uvm.pmr_control.use, pmr);
		}
		uvm_pmr_assertvalid(pmr);
	}
	uvm_unlock_fpageq();

	KASSERT(sz >= high - low);
}

/*
 * Allocate a pmemrange.
 *
 * If called from uvm_page_init, the uvm_pageboot_alloc is used.
 * If called after uvm_init, malloc is used.
 * (And if called in between, you're dead.)
 */
struct uvm_pmemrange *
uvm_pmr_allocpmr(void)
{
	struct uvm_pmemrange *nw;
	int i;

	/* We're only ever hitting the !uvm.page_init_done case for now. */
	if (!uvm.page_init_done) {
		nw = (struct uvm_pmemrange *)
		    uvm_pageboot_alloc(sizeof(struct uvm_pmemrange));
	} else {
		nw = malloc(sizeof(struct uvm_pmemrange),
		    M_VMMAP, M_NOWAIT);
	}
	KASSERT(nw != NULL);
	memset(nw, 0, sizeof(struct uvm_pmemrange));
	RB_INIT(&nw->addr);
	for (i = 0; i < UVM_PMR_MEMTYPE_MAX; i++) {
		RB_INIT(&nw->size[i]);
		TAILQ_INIT(&nw->single[i]);
	}
	return nw;
}
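
/*
 * Translation example (illustrative, not part of the original source;
 * assumes PAGE_SIZE == 4096): uvm_pmr_use_inc(0x1000, 0x3fff) computes
 *
 *	high = atop(trunc_page(0x4000)) = 4	(exclusive)
 *	low  = atop(round_page(0x1000)) = 1	(inclusive)
 *
 * so the use count rises for every pmemrange inside pages [1, 4), after
 * splitting at pages 1 and 4 to make the boundaries exact.
 */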

/*
 * Initialization of pmr.
 * Called by uvm_page_init.
 *
 * Sets up pmemranges.
 */
void
uvm_pmr_init(void)
{
	struct uvm_pmemrange *new_pmr;
	int i;

	TAILQ_INIT(&uvm.pmr_control.use);
	RB_INIT(&uvm.pmr_control.addr);
	TAILQ_INIT(&uvm.pmr_control.allocs);

	/* By default, one range for the entire address space. */
	new_pmr = uvm_pmr_allocpmr();
	new_pmr->low = 0;
	new_pmr->high = atop((paddr_t)-1) + 1;

	RB_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, new_pmr);
	uvm_pmemrange_use_insert(&uvm.pmr_control.use, new_pmr);

	for (i = 0; uvm_md_constraints[i] != NULL; i++) {
		uvm_pmr_use_inc(uvm_md_constraints[i]->ucr_low,
		    uvm_md_constraints[i]->ucr_high);
	}
}

/*
 * Find the pmemrange that contains the given page number.
 *
 * (Manually traverses the binary tree, because that is cheaper on stack
 * usage.)
 */
struct uvm_pmemrange *
uvm_pmemrange_find(paddr_t pageno)
{
	struct uvm_pmemrange *pmr;

	pmr = RB_ROOT(&uvm.pmr_control.addr);
	while (pmr != NULL) {
		if (pmr->low > pageno)
			pmr = RB_LEFT(pmr, pmr_addr);
		else if (pmr->high <= pageno)
			pmr = RB_RIGHT(pmr, pmr_addr);
		else
			break;
	}

	return pmr;
}

#if defined(DDB) || defined(DEBUG)
/*
 * Return true if the given page is in any of the free lists.
 * Used by uvm_page_printit.
 * This function is safe, even if the page is not on the freeq.
 * Note: does not apply locking, only called from ddb.
 */
int
uvm_pmr_isfree(struct vm_page *pg)
{
	struct vm_page *r;
	struct uvm_pmemrange *pmr;

	pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg)));
	if (pmr == NULL)
		return 0;
	r = RB_NFIND(uvm_pmr_addr, &pmr->addr, pg);
	if (r == NULL)
		r = RB_MAX(uvm_pmr_addr, &pmr->addr);
	else if (r != pg)
		r = RB_PREV(uvm_pmr_addr, &pmr->addr, r);
	if (r == NULL)
		return 0; /* Empty tree. */

	KDASSERT(atop(VM_PAGE_TO_PHYS(r)) <= atop(VM_PAGE_TO_PHYS(pg)));
	return atop(VM_PAGE_TO_PHYS(r)) + r->fpgsz >
	    atop(VM_PAGE_TO_PHYS(pg));
}
#endif /* DDB || DEBUG */

/*
 * Given a root of a tree, find a range which intersects start, end and
 * is of the same memtype.
 *
 * Page must be in the address tree.
 */
struct vm_page*
uvm_pmr_rootupdate(struct uvm_pmemrange *pmr, struct vm_page *init_root,
    paddr_t start, paddr_t end, int memtype)
{
	int	direction;
	struct	vm_page *root;
	struct	vm_page *high, *high_next;
	struct	vm_page *low, *low_next;

	KDASSERT(pmr != NULL && init_root != NULL);
	root = init_root;

	/* Which direction to use for searching. */
	if (start != 0 && atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz <= start)
		direction = 1;
	else if (end != 0 && atop(VM_PAGE_TO_PHYS(root)) >= end)
		direction = -1;
	else /* nothing to do */
		return root;

	/* First, update root to fall within the chosen range. */
	while (root && !PMR_INTERSECTS_WITH(
	    atop(VM_PAGE_TO_PHYS(root)),
	    atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz,
	    start, end)) {
		if (direction == 1)
			root = RB_RIGHT(root, objt);
		else
			root = RB_LEFT(root, objt);
	}
	if (root == NULL || uvm_pmr_pg_to_memtype(root) == memtype)
		return root;

	/*
	 * Root is valid, but of the wrong memtype.
	 *
	 * Try to find a range that has the given memtype in the subtree
	 * (memtype mismatches are costly, either because the conversion
	 * is expensive, or a later allocation will need to do the opposite
	 * conversion, which will be expensive).
	 *
	 * First, simply increase address until we hit something we can use.
	 * Cache the upper page, so we can page-walk later.
	 */
	high = root;
	high_next = RB_RIGHT(high, objt);
	while (high_next != NULL && PMR_INTERSECTS_WITH(
	    atop(VM_PAGE_TO_PHYS(high_next)),
	    atop(VM_PAGE_TO_PHYS(high_next)) + high_next->fpgsz,
	    start, end)) {
		high = high_next;
		if (uvm_pmr_pg_to_memtype(high) == memtype)
			return high;
		high_next = RB_RIGHT(high, objt);
	}

	/*
	 * Second, decrease the address until we hit something we can use.
	 * Cache the lower page, so we can page-walk later.
	 */
	low = root;
	low_next = RB_LEFT(low, objt);
	while (low_next != NULL && PMR_INTERSECTS_WITH(
	    atop(VM_PAGE_TO_PHYS(low_next)),
	    atop(VM_PAGE_TO_PHYS(low_next)) + low_next->fpgsz,
	    start, end)) {
		low = low_next;
		if (uvm_pmr_pg_to_memtype(low) == memtype)
			return low;
		low_next = RB_LEFT(low, objt);
	}

	if (low == high)
		return NULL;

	/* No hits. Walk the address tree until we find something usable. */
	for (low = RB_NEXT(uvm_pmr_addr, &pmr->addr, low);
	    low != high;
	    low = RB_NEXT(uvm_pmr_addr, &pmr->addr, low)) {
		KDASSERT(PMR_IS_SUBRANGE_OF(atop(VM_PAGE_TO_PHYS(low)),
		    atop(VM_PAGE_TO_PHYS(low)) + low->fpgsz,
		    start, end));
		if (uvm_pmr_pg_to_memtype(low) == memtype)
			return low;
	}

	/* Nothing found. */
	return NULL;
}

/*
 * Allocate any page, the fastest way. Page number constraints only.
 */
psize_t
uvm_pmr_get1page(psize_t count, int memtype_init, struct pglist *result,
    paddr_t start, paddr_t end, int memtype_only)
{
	struct	uvm_pmemrange *pmr;
	struct	vm_page *found, *splitpg;
	psize_t	fcount;
	int	memtype;

	fcount = 0;
	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
		/* We're done. */
		if (fcount == count)
			break;

		/* Outside requested range. */
		if (!(start == 0 && end == 0) &&
		    !PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end))
			continue;

		/* Range is empty. */
		if (pmr->nsegs == 0)
			continue;

		/* Loop over all memtypes, starting at memtype_init. */
		memtype = memtype_init;
		while (fcount != count) {
			found = TAILQ_FIRST(&pmr->single[memtype]);
			/*
			 * If found is outside the range, walk the list
			 * until we find something that intersects with
			 * boundaries.
			 */
			while (found && !PMR_INTERSECTS_WITH(
			    atop(VM_PAGE_TO_PHYS(found)),
			    atop(VM_PAGE_TO_PHYS(found)) + 1,
			    start, end))
				found = TAILQ_NEXT(found, pageq);

			if (found == NULL) {
				/*
				 * Check if the size tree contains a range
				 * that intersects with the boundaries. As the
				 * allocation is for any page, try the smallest
				 * range so that large ranges are preserved for
				 * more constrained cases. Only one entry is
				 * checked here, to avoid a brute-force search.
				 *
				 * Note that a size tree gives pg[1] instead of
				 * pg[0].
				 */
				found = RB_MIN(uvm_pmr_size,
				    &pmr->size[memtype]);
				if (found != NULL) {
					found--;
					if (!PMR_INTERSECTS_WITH(
					    atop(VM_PAGE_TO_PHYS(found)),
					    atop(VM_PAGE_TO_PHYS(found)) +
					    found->fpgsz, start, end))
						found = NULL;
				}
			}
			if (found == NULL) {
				/*
				 * Try address-guided search to meet the page
				 * number constraints.
				 */
				found = RB_ROOT(&pmr->addr);
				if (found != NULL) {
					found = uvm_pmr_rootupdate(pmr, found,
					    start, end, memtype);
				}
			}
			if (found != NULL) {
				uvm_pmr_assertvalid(pmr);
				uvm_pmr_remove_size(pmr, found);

				/*
				 * If the page intersects the end, then it'll
				 * need splitting.
				 *
				 * Note that we don't need to split if the page
				 * intersects start: the drain function will
				 * simply stop on hitting start.
				 */
				if (end != 0 && atop(VM_PAGE_TO_PHYS(found)) +
				    found->fpgsz > end) {
					psize_t splitsz =
					    atop(VM_PAGE_TO_PHYS(found)) +
					    found->fpgsz - end;

					uvm_pmr_remove_addr(pmr, found);
					uvm_pmr_assertvalid(pmr);
					found->fpgsz -= splitsz;
					splitpg = found + found->fpgsz;
					splitpg->fpgsz = splitsz;
					uvm_pmr_insert(pmr, splitpg, 1);

					/*
					 * At this point, splitpg and found
					 * actually should be joined.
					 * But we explicitly disable that,
					 * because we will start subtracting
					 * from found.
					 */
					KASSERT(start == 0 ||
					    atop(VM_PAGE_TO_PHYS(found)) +
					    found->fpgsz > start);
					uvm_pmr_insert_addr(pmr, found, 1);
				}

				/*
				 * Fetch pages from the end.
				 * If the range is larger than the requested
				 * number of pages, this saves us an addr-tree
				 * update.
				 *
				 * Since we take from the end and insert at
				 * the head, any ranges are preserved.
				 */
				while (found->fpgsz > 0 && fcount < count &&
				    (start == 0 ||
				    atop(VM_PAGE_TO_PHYS(found)) +
				    found->fpgsz > start)) {
					found->fpgsz--;
					fcount++;
					TAILQ_INSERT_HEAD(result,
					    &found[found->fpgsz], pageq);
				}
				if (found->fpgsz > 0) {
					uvm_pmr_insert_size(pmr, found);
					KDASSERT(fcount == count);
					uvm_pmr_assertvalid(pmr);
					return fcount;
				}

				/*
				 * Delayed addr-tree removal.
				 */
				uvm_pmr_remove_addr(pmr, found);
				uvm_pmr_assertvalid(pmr);
			} else {
				if (memtype_only)
					break;
				/*
				 * Skip to the next memtype.
				 */
				memtype += 1;
				if (memtype == UVM_PMR_MEMTYPE_MAX)
					memtype = 0;
				if (memtype == memtype_init)
					break;
			}
		}
	}

	/*
	 * Search finished.
	 *
	 * Ran out of ranges before enough pages were gathered, or we hit the
	 * case where found->fpgsz == count - fcount, in which case the
	 * above exit condition didn't trigger.
	 *
	 * On failure, caller will free the pages.
	 */
	return fcount;
}
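
/*
 * Tail-fetch example (illustrative, not part of the original source):
 * with a free range of 8 pages at pfn 100 (fpgsz == 8) and 3 pages still
 * needed, the fetch loop in uvm_pmr_get1page peels pages off the tail:
 *
 *	fpgsz 8 -> 7, insert pfn 107 at head of result
 *	fpgsz 7 -> 6, insert pfn 106 at head of result
 *	fpgsz 6 -> 5, insert pfn 105 at head of result
 *
 * The head page (and thus the addr tree entry) is untouched, and result
 * ends up ordered 105, 106, 107.
 */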

#ifdef DDB
/*
 * Print information about pmemrange.
 * Does not do locking (so either call it from DDB or acquire fpageq lock
 * before invoking).
 */
void
uvm_pmr_print(void)
{
	struct	uvm_pmemrange *pmr;
	struct	vm_page *pg;
	psize_t	size[UVM_PMR_MEMTYPE_MAX];
	psize_t	free;
	int	useq_len;
	int	mt;

	printf("Ranges, use queue:\n");
	useq_len = 0;
	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
		useq_len++;
		free = 0;
		for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
			pg = RB_MAX(uvm_pmr_size, &pmr->size[mt]);
			if (pg != NULL)
				pg--;
			else
				pg = TAILQ_FIRST(&pmr->single[mt]);
			size[mt] = (pg == NULL ? 0 : pg->fpgsz);

			RB_FOREACH(pg, uvm_pmr_addr, &pmr->addr)
				free += pg->fpgsz;
		}

		printf("* [0x%lx-0x%lx] use=%d nsegs=%ld",
		    (unsigned long)pmr->low, (unsigned long)pmr->high,
		    pmr->use, (unsigned long)pmr->nsegs);
		for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
			printf(" maxsegsz[%d]=0x%lx", mt,
			    (unsigned long)size[mt]);
		}
		printf(" free=0x%lx\n", (unsigned long)free);
	}
	printf("#ranges = %d\n", useq_len);
}
#endif

/*
 * uvm_wait_pla: wait (sleep) for the page daemon to free some pages
 * in a specific physmem area.
 *
 * Returns ENOMEM if the pagedaemon failed to free any pages.
 * If not failok, failure will lead to panic.
 *
 * Must be called with fpageq locked.
 */
int
uvm_wait_pla(paddr_t low, paddr_t high, paddr_t size, int failok)
{
	struct uvm_pmalloc pma;
	const char *wmsg = "pmrwait";

	if (curproc == uvm.pagedaemon_proc) {
		/*
		 * This is not that uncommon when the pagedaemon is trying
		 * to flush out a large mmapped file. VOP_WRITE will circle
		 * back through the buffer cache and try to get more memory.
		 * The pagedaemon starts by calling bufbackoff, but we can
		 * easily use up that reserve in a single scan iteration.
		 */
		uvm_unlock_fpageq();
		if (bufbackoff(NULL, atop(size)) == 0) {
			uvm_lock_fpageq();
			return 0;
		}
		uvm_lock_fpageq();

		/*
		 * XXX detect pagedaemon deadlock - see comment in
		 * uvm_wait(), as this is exactly the same issue.
		 */
		printf("pagedaemon: wait_pla deadlock detected!\n");
		msleep(&uvmexp.free, &uvm.fpageqlock, PVM, wmsg, hz >> 3);
#if defined(DEBUG)
		/* DEBUG: panic so we can debug it */
		panic("wait_pla pagedaemon deadlock");
#endif
		return 0;
	}

	for (;;) {
		pma.pm_constraint.ucr_low = low;
		pma.pm_constraint.ucr_high = high;
		pma.pm_size = size;
		pma.pm_flags = UVM_PMA_LINKED;
		TAILQ_INSERT_TAIL(&uvm.pmr_control.allocs, &pma, pmq);

		wakeup(&uvm.pagedaemon);	/* wake the daemon! */
		while (pma.pm_flags & (UVM_PMA_LINKED | UVM_PMA_BUSY))
			msleep(&pma, &uvm.fpageqlock, PVM, wmsg, 0);

		if (!(pma.pm_flags & UVM_PMA_FREED) &&
		    pma.pm_flags & UVM_PMA_FAIL) {
			if (failok)
				return ENOMEM;
			printf("uvm_wait: failed to free %ld pages between "
			    "0x%lx-0x%lx\n", atop(size), low, high);
		} else
			return 0;
	}
	/* UNREACHABLE */
}

/*
 * Wake up uvm_pmalloc sleepers.
 */
void
uvm_wakeup_pla(paddr_t low, psize_t len)
{
	struct uvm_pmalloc *pma, *pma_next;
	paddr_t high;

	high = low + len;

	/* Wake specific allocations waiting for this memory. */
	for (pma = TAILQ_FIRST(&uvm.pmr_control.allocs); pma != NULL;
	    pma = pma_next) {
		pma_next = TAILQ_NEXT(pma, pmq);

		if (low < pma->pm_constraint.ucr_high &&
		    high > pma->pm_constraint.ucr_low) {
			pma->pm_flags |= UVM_PMA_FREED;
			if (!(pma->pm_flags & UVM_PMA_BUSY)) {
				pma->pm_flags &= ~UVM_PMA_LINKED;
				TAILQ_REMOVE(&uvm.pmr_control.allocs, pma,
				    pmq);
				wakeup(pma);
			}
		}
	}
}

void
uvm_pagezero_thread(void *arg)
{
	struct pglist pgl;
	struct vm_page *pg;
	int count;

	/* Run at the lowest possible priority. */
	curproc->p_p->ps_nice = NZERO + PRIO_MAX;

	KERNEL_UNLOCK();

	TAILQ_INIT(&pgl);
	for (;;) {
		uvm_lock_fpageq();
		while (uvmexp.zeropages >= UVM_PAGEZERO_TARGET ||
		    (count = uvm_pmr_get1page(16, UVM_PMR_MEMTYPE_DIRTY,
		     &pgl, 0, 0, 1)) == 0) {
			msleep(&uvmexp.zeropages, &uvm.fpageqlock, MAXPRI,
			    "pgzero", 0);
		}
		uvm_unlock_fpageq();

		TAILQ_FOREACH(pg, &pgl, pageq) {
			uvm_pagezero(pg);
			atomic_setbits_int(&pg->pg_flags, PG_ZERO);
		}

		uvm_lock_fpageq();
		while (!TAILQ_EMPTY(&pgl))
			uvm_pmr_remove_1strange(&pgl, 0, NULL, 0);
		uvmexp.zeropages += count;
		uvm_unlock_fpageq();

		yield();
	}
}