1 /* 2 * (MPSAFE) 3 * 4 * Copyright (c) 1997, 1998 John S. Dyson 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice immediately at the beginning of the file, without modification, 12 * this list of conditions, and the following disclaimer. 13 * 2. Absolutely no warranty of function or purpose is made by the author 14 * John S. Dyson. 15 * 16 * $FreeBSD: src/sys/vm/vm_zone.c,v 1.30.2.6 2002/10/10 19:50:16 dillon Exp $ 17 * $DragonFly: src/sys/vm/vm_zone.c,v 1.28 2008/01/23 17:35:48 nth Exp $ 18 */ 19 20 #include <sys/param.h> 21 #include <sys/queue.h> 22 #include <sys/systm.h> 23 #include <sys/kernel.h> 24 #include <sys/lock.h> 25 #include <sys/malloc.h> 26 #include <sys/sysctl.h> 27 #include <sys/vmmeter.h> 28 29 #include <vm/vm.h> 30 #include <vm/vm_object.h> 31 #include <vm/vm_page.h> 32 #include <vm/vm_map.h> 33 #include <vm/vm_kern.h> 34 #include <vm/vm_extern.h> 35 #include <vm/vm_zone.h> 36 37 #include <sys/spinlock2.h> 38 #include <sys/mplock2.h> 39 40 static MALLOC_DEFINE(M_ZONE, "ZONE", "Zone header"); 41 42 #define ZONE_ERROR_INVALID 0 43 #define ZONE_ERROR_NOTFREE 1 44 #define ZONE_ERROR_ALREADYFREE 2 45 46 #define ZONE_ROUNDING 32 47 48 #define ZENTRY_FREE 0x12342378 49 50 static void *zget(vm_zone_t z); 51 52 /* 53 * Return an item from the specified zone. This function is non-blocking for 54 * ZONE_INTERRUPT zones. 55 * 56 * No requirements. 57 */ 58 void * 59 zalloc(vm_zone_t z) 60 { 61 void *item; 62 63 #ifdef INVARIANTS 64 if (z == NULL) 65 zerror(ZONE_ERROR_INVALID); 66 #endif 67 spin_lock_wr(&z->zlock); 68 if (z->zfreecnt > z->zfreemin) { 69 item = z->zitems; 70 #ifdef INVARIANTS 71 KASSERT(item != NULL, ("zitems unexpectedly NULL")); 72 if (((void **) item)[1] != (void *) ZENTRY_FREE) 73 zerror(ZONE_ERROR_NOTFREE); 74 ((void **) item)[1] = 0; 75 #endif 76 z->zitems = ((void **) item)[0]; 77 z->zfreecnt--; 78 z->znalloc++; 79 spin_unlock_wr(&z->zlock); 80 } else { 81 spin_unlock_wr(&z->zlock); 82 item = zget(z); 83 /* 84 * PANICFAIL allows the caller to assume that the zalloc() 85 * will always succeed. If it doesn't, we panic here. 86 */ 87 if (item == NULL && (z->zflags & ZONE_PANICFAIL)) 88 panic("zalloc(%s) failed", z->zname); 89 } 90 return item; 91 } 92 93 /* 94 * Free an item to the specified zone. 95 * 96 * No requirements. 97 */ 98 void 99 zfree(vm_zone_t z, void *item) 100 { 101 spin_lock_wr(&z->zlock); 102 ((void **) item)[0] = z->zitems; 103 #ifdef INVARIANTS 104 if (((void **) item)[1] == (void *) ZENTRY_FREE) 105 zerror(ZONE_ERROR_ALREADYFREE); 106 ((void **) item)[1] = (void *) ZENTRY_FREE; 107 #endif 108 z->zitems = item; 109 z->zfreecnt++; 110 spin_unlock_wr(&z->zlock); 111 } 112 113 /* 114 * This file comprises a very simple zone allocator. This is used 115 * in lieu of the malloc allocator, where needed or more optimal. 116 * 117 * Note that the initial implementation of this had coloring, and 118 * absolutely no improvement (actually perf degradation) occurred. 119 * 120 * Note also that the zones are type stable. The only restriction is 121 * that the first two longwords of a data structure can be changed 122 * between allocations. Any data that must be stable between allocations 123 * must reside in areas after the first two longwords. 124 * 125 * zinitna, zinit, zbootinit are the initialization routines. 126 * zalloc, zfree, are the allocation/free routines. 127 */ 128 129 LIST_HEAD(zlist, vm_zone) zlist = LIST_HEAD_INITIALIZER(zlist); 130 static int sysctl_vm_zone(SYSCTL_HANDLER_ARGS); 131 static int zone_kmem_pages, zone_kern_pages, zone_kmem_kvaspace; 132 133 /* 134 * Create a zone, but don't allocate the zone structure. If the 135 * zone had been previously created by the zone boot code, initialize 136 * various parts of the zone code. 137 * 138 * If waits are not allowed during allocation (e.g. during interrupt 139 * code), a-priori allocate the kernel virtual space, and allocate 140 * only pages when needed. 141 * 142 * Arguments: 143 * z pointer to zone structure. 144 * obj pointer to VM object (opt). 145 * name name of zone. 146 * size size of zone entries. 147 * nentries number of zone entries allocated (only ZONE_INTERRUPT.) 148 * flags ZONE_INTERRUPT -- items can be allocated at interrupt time. 149 * zalloc number of pages allocated when memory is needed. 150 * 151 * Note that when using ZONE_INTERRUPT, the size of the zone is limited 152 * by the nentries argument. The size of the memory allocatable is 153 * unlimited if ZONE_INTERRUPT is not set. 154 * 155 * No requirements. 156 */ 157 int 158 zinitna(vm_zone_t z, vm_object_t obj, char *name, int size, 159 int nentries, int flags, int zalloc) 160 { 161 int totsize; 162 163 /* 164 * Only zones created with zinit() are destroyable. 165 */ 166 if (z->zflags & ZONE_DESTROYABLE) 167 panic("zinitna: can't create destroyable zone"); 168 169 /* 170 * NOTE: We can only adjust zsize if we previously did not 171 * use zbootinit(). 172 */ 173 if ((z->zflags & ZONE_BOOT) == 0) { 174 z->zsize = (size + ZONE_ROUNDING - 1) & ~(ZONE_ROUNDING - 1); 175 spin_init(&z->zlock); 176 z->zfreecnt = 0; 177 z->ztotal = 0; 178 z->zmax = 0; 179 z->zname = name; 180 z->znalloc = 0; 181 z->zitems = NULL; 182 183 lwkt_gettoken(&vm_token); 184 LIST_INSERT_HEAD(&zlist, z, zlink); 185 lwkt_reltoken(&vm_token); 186 } 187 188 z->zkmvec = NULL; 189 z->zkmcur = z->zkmmax = 0; 190 z->zflags |= flags; 191 192 /* 193 * If we cannot wait, allocate KVA space up front, and we will fill 194 * in pages as needed. This is particularly required when creating 195 * an allocation space for map entries in kernel_map, because we 196 * do not want to go into a recursion deadlock with 197 * vm_map_entry_reserve(). 198 */ 199 if (z->zflags & ZONE_INTERRUPT) { 200 totsize = round_page(z->zsize * nentries); 201 zone_kmem_kvaspace += totsize; 202 203 z->zkva = kmem_alloc_pageable(&kernel_map, totsize); 204 if (z->zkva == 0) { 205 LIST_REMOVE(z, zlink); 206 return 0; 207 } 208 209 z->zpagemax = totsize / PAGE_SIZE; 210 if (obj == NULL) { 211 z->zobj = vm_object_allocate(OBJT_DEFAULT, z->zpagemax); 212 } else { 213 z->zobj = obj; 214 _vm_object_allocate(OBJT_DEFAULT, z->zpagemax, obj); 215 } 216 z->zallocflag = VM_ALLOC_SYSTEM | VM_ALLOC_INTERRUPT; 217 z->zmax += nentries; 218 } else { 219 z->zallocflag = VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM; 220 z->zmax = 0; 221 } 222 223 224 if (z->zsize > PAGE_SIZE) 225 z->zfreemin = 1; 226 else 227 z->zfreemin = PAGE_SIZE / z->zsize; 228 229 z->zpagecount = 0; 230 if (zalloc) 231 z->zalloc = zalloc; 232 else 233 z->zalloc = 1; 234 235 /* 236 * Populate the interrrupt zone at creation time rather than 237 * on first allocation, as this is a potentially long operation. 238 */ 239 if (z->zflags & ZONE_INTERRUPT) { 240 void *buf; 241 242 buf = zget(z); 243 zfree(z, buf); 244 } 245 246 return 1; 247 } 248 249 /* 250 * Subroutine same as zinitna, except zone data structure is allocated 251 * automatically by malloc. This routine should normally be used, except 252 * in certain tricky startup conditions in the VM system -- then 253 * zbootinit and zinitna can be used. Zinit is the standard zone 254 * initialization call. 255 * 256 * No requirements. 257 */ 258 vm_zone_t 259 zinit(char *name, int size, int nentries, int flags, int zalloc) 260 { 261 vm_zone_t z; 262 263 z = (vm_zone_t) kmalloc(sizeof (struct vm_zone), M_ZONE, M_NOWAIT); 264 if (z == NULL) 265 return NULL; 266 267 z->zflags = 0; 268 if (zinitna(z, NULL, name, size, nentries, 269 flags & ~ZONE_DESTROYABLE, zalloc) == 0) { 270 kfree(z, M_ZONE); 271 return NULL; 272 } 273 274 if (flags & ZONE_DESTROYABLE) 275 z->zflags |= ZONE_DESTROYABLE; 276 277 return z; 278 } 279 280 /* 281 * Initialize a zone before the system is fully up. This routine should 282 * only be called before full VM startup. 283 * 284 * Called from the low level boot code only. 285 */ 286 void 287 zbootinit(vm_zone_t z, char *name, int size, void *item, int nitems) 288 { 289 int i; 290 291 z->zname = name; 292 z->zsize = size; 293 z->zpagemax = 0; 294 z->zobj = NULL; 295 z->zflags = ZONE_BOOT; 296 z->zfreemin = 0; 297 z->zallocflag = 0; 298 z->zpagecount = 0; 299 z->zalloc = 0; 300 z->znalloc = 0; 301 spin_init(&z->zlock); 302 303 bzero(item, nitems * z->zsize); 304 z->zitems = NULL; 305 for (i = 0; i < nitems; i++) { 306 ((void **) item)[0] = z->zitems; 307 #ifdef INVARIANTS 308 ((void **) item)[1] = (void *) ZENTRY_FREE; 309 #endif 310 z->zitems = item; 311 item = (uint8_t *)item + z->zsize; 312 } 313 z->zfreecnt = nitems; 314 z->zmax = nitems; 315 z->ztotal = nitems; 316 317 lwkt_gettoken(&vm_token); 318 LIST_INSERT_HEAD(&zlist, z, zlink); 319 lwkt_reltoken(&vm_token); 320 } 321 322 /* 323 * Release all resources owned by zone created with zinit(). 324 * 325 * No requirements. 326 */ 327 void 328 zdestroy(vm_zone_t z) 329 { 330 int i; 331 332 if (z == NULL) 333 panic("zdestroy: null zone"); 334 if ((z->zflags & ZONE_DESTROYABLE) == 0) 335 panic("zdestroy: undestroyable zone"); 336 337 lwkt_gettoken(&vm_token); 338 LIST_REMOVE(z, zlink); 339 lwkt_reltoken(&vm_token); 340 341 /* 342 * Release virtual mappings, physical memory and update sysctl stats. 343 */ 344 if (z->zflags & ZONE_INTERRUPT) { 345 /* 346 * Pages mapped via pmap_kenter() must be removed from the 347 * kernel_pmap() before calling kmem_free() to avoid issues 348 * with kernel_pmap.pm_stats.resident_count. 349 */ 350 pmap_qremove(z->zkva, z->zpagemax); 351 352 /* 353 * Free the mapping. 354 */ 355 kmem_free(&kernel_map, z->zkva, z->zpagemax*PAGE_SIZE); 356 atomic_subtract_int(&zone_kmem_kvaspace, z->zpagemax*PAGE_SIZE); 357 358 /* 359 * Free the backing object and physical pages. 360 */ 361 vm_object_deallocate(z->zobj); 362 atomic_subtract_int(&zone_kmem_pages, z->zpagecount); 363 } else { 364 for (i=0; i < z->zkmcur; i++) { 365 kmem_free(&kernel_map, z->zkmvec[i], 366 z->zalloc*PAGE_SIZE); 367 atomic_subtract_int(&zone_kern_pages, z->zalloc); 368 } 369 if (z->zkmvec != NULL) 370 kfree(z->zkmvec, M_ZONE); 371 } 372 373 spin_uninit(&z->zlock); 374 kfree(z, M_ZONE); 375 } 376 377 378 /* 379 * void *zalloc(vm_zone_t zone) -- 380 * Returns an item from a specified zone. May not be called from a 381 * FAST interrupt or IPI function. 382 * 383 * void zfree(vm_zone_t zone, void *item) -- 384 * Frees an item back to a specified zone. May not be called from a 385 * FAST interrupt or IPI function. 386 */ 387 388 /* 389 * Internal zone routine. Not to be called from external (non vm_zone) code. 390 * 391 * No requirements. 392 */ 393 static void * 394 zget(vm_zone_t z) 395 { 396 int i; 397 vm_page_t m; 398 int nitems, nbytes; 399 int savezpc; 400 void *item; 401 402 if (z == NULL) 403 panic("zget: null zone"); 404 405 if (z->zflags & ZONE_INTERRUPT) { 406 /* 407 * Interrupt zones do not mess with the kernel_map, they 408 * simply populate an existing mapping. 409 */ 410 get_mplock(); 411 lwkt_gettoken(&vm_token); 412 savezpc = z->zpagecount; 413 nbytes = z->zpagecount * PAGE_SIZE; 414 nbytes -= nbytes % z->zsize; 415 item = (char *) z->zkva + nbytes; 416 for (i = 0; ((i < z->zalloc) && (z->zpagecount < z->zpagemax)); 417 i++) { 418 vm_offset_t zkva; 419 420 m = vm_page_alloc(z->zobj, z->zpagecount, 421 z->zallocflag); 422 /* note: z might be modified due to blocking */ 423 if (m == NULL) 424 break; 425 426 /* 427 * Unbusy page so it can freed in zdestroy(). Make 428 * sure it is not on any queue and so can not be 429 * recycled under our feet. 430 */ 431 KKASSERT(m->queue == PQ_NONE); 432 vm_page_flag_clear(m, PG_BUSY); 433 434 zkva = z->zkva + z->zpagecount * PAGE_SIZE; 435 pmap_kenter(zkva, VM_PAGE_TO_PHYS(m)); /* YYY */ 436 bzero((void *)zkva, PAGE_SIZE); 437 KKASSERT(savezpc == z->zpagecount); 438 ++savezpc; 439 z->zpagecount++; 440 zone_kmem_pages++; 441 vmstats.v_wire_count++; 442 } 443 nitems = ((z->zpagecount * PAGE_SIZE) - nbytes) / z->zsize; 444 lwkt_reltoken(&vm_token); 445 rel_mplock(); 446 } else if (z->zflags & ZONE_SPECIAL) { 447 /* 448 * The special zone is the one used for vm_map_entry_t's. 449 * We have to avoid an infinite recursion in 450 * vm_map_entry_reserve() by using vm_map_entry_kreserve() 451 * instead. The map entries are pre-reserved by the kernel 452 * by vm_map_entry_reserve_cpu_init(). 453 */ 454 nbytes = z->zalloc * PAGE_SIZE; 455 456 item = (void *)kmem_alloc3(&kernel_map, nbytes, KM_KRESERVE); 457 458 /* note: z might be modified due to blocking */ 459 if (item != NULL) { 460 zone_kern_pages += z->zalloc; /* not MP-safe XXX */ 461 bzero(item, nbytes); 462 } else { 463 nbytes = 0; 464 } 465 nitems = nbytes / z->zsize; 466 } else { 467 /* 468 * Otherwise allocate KVA from the kernel_map. 469 */ 470 nbytes = z->zalloc * PAGE_SIZE; 471 472 item = (void *)kmem_alloc3(&kernel_map, nbytes, 0); 473 474 /* note: z might be modified due to blocking */ 475 if (item != NULL) { 476 zone_kern_pages += z->zalloc; /* not MP-safe XXX */ 477 bzero(item, nbytes); 478 479 if (z->zflags & ZONE_DESTROYABLE) { 480 if (z->zkmcur == z->zkmmax) { 481 z->zkmmax = 482 z->zkmmax==0 ? 1 : z->zkmmax*2; 483 z->zkmvec = krealloc(z->zkmvec, 484 z->zkmmax * sizeof(z->zkmvec[0]), 485 M_ZONE, M_WAITOK); 486 } 487 z->zkmvec[z->zkmcur++] = (vm_offset_t)item; 488 } 489 } else { 490 nbytes = 0; 491 } 492 nitems = nbytes / z->zsize; 493 } 494 495 spin_lock_wr(&z->zlock); 496 z->ztotal += nitems; 497 /* 498 * Save one for immediate allocation 499 */ 500 if (nitems != 0) { 501 nitems -= 1; 502 for (i = 0; i < nitems; i++) { 503 ((void **) item)[0] = z->zitems; 504 #ifdef INVARIANTS 505 ((void **) item)[1] = (void *) ZENTRY_FREE; 506 #endif 507 z->zitems = item; 508 item = (uint8_t *)item + z->zsize; 509 } 510 z->zfreecnt += nitems; 511 z->znalloc++; 512 } else if (z->zfreecnt > 0) { 513 item = z->zitems; 514 z->zitems = ((void **) item)[0]; 515 #ifdef INVARIANTS 516 if (((void **) item)[1] != (void *) ZENTRY_FREE) 517 zerror(ZONE_ERROR_NOTFREE); 518 ((void **) item)[1] = 0; 519 #endif 520 z->zfreecnt--; 521 z->znalloc++; 522 } else { 523 item = NULL; 524 } 525 spin_unlock_wr(&z->zlock); 526 527 /* 528 * A special zone may have used a kernel-reserved vm_map_entry. If 529 * so we have to be sure to recover our reserve so we don't run out. 530 * We will panic if we run out. 531 */ 532 if (z->zflags & ZONE_SPECIAL) 533 vm_map_entry_reserve(0); 534 535 return item; 536 } 537 538 /* 539 * No requirements. 540 */ 541 static int 542 sysctl_vm_zone(SYSCTL_HANDLER_ARGS) 543 { 544 int error=0; 545 vm_zone_t curzone; 546 char tmpbuf[128]; 547 char tmpname[14]; 548 549 ksnprintf(tmpbuf, sizeof(tmpbuf), 550 "\nITEM SIZE LIMIT USED FREE REQUESTS\n"); 551 error = SYSCTL_OUT(req, tmpbuf, strlen(tmpbuf)); 552 if (error) 553 return (error); 554 555 lwkt_gettoken(&vm_token); 556 LIST_FOREACH(curzone, &zlist, zlink) { 557 int i; 558 int len; 559 int offset; 560 561 len = strlen(curzone->zname); 562 if (len >= (sizeof(tmpname) - 1)) 563 len = (sizeof(tmpname) - 1); 564 for(i = 0; i < sizeof(tmpname) - 1; i++) 565 tmpname[i] = ' '; 566 tmpname[i] = 0; 567 memcpy(tmpname, curzone->zname, len); 568 tmpname[len] = ':'; 569 offset = 0; 570 if (curzone == LIST_FIRST(&zlist)) { 571 offset = 1; 572 tmpbuf[0] = '\n'; 573 } 574 575 ksnprintf(tmpbuf + offset, sizeof(tmpbuf) - offset, 576 "%s %6.6u, %8.8u, %6.6u, %6.6u, %8.8u\n", 577 tmpname, curzone->zsize, curzone->zmax, 578 (curzone->ztotal - curzone->zfreecnt), 579 curzone->zfreecnt, curzone->znalloc); 580 581 len = strlen((char *)tmpbuf); 582 if (LIST_NEXT(curzone, zlink) == NULL) 583 tmpbuf[len - 1] = 0; 584 585 error = SYSCTL_OUT(req, tmpbuf, len); 586 587 if (error) 588 break; 589 } 590 lwkt_reltoken(&vm_token); 591 return (error); 592 } 593 594 #if defined(INVARIANTS) 595 596 /* 597 * Debugging only. 598 */ 599 void 600 zerror(int error) 601 { 602 char *msg; 603 604 switch (error) { 605 case ZONE_ERROR_INVALID: 606 msg = "zone: invalid zone"; 607 break; 608 case ZONE_ERROR_NOTFREE: 609 msg = "zone: entry not free"; 610 break; 611 case ZONE_ERROR_ALREADYFREE: 612 msg = "zone: freeing free entry"; 613 break; 614 default: 615 msg = "zone: invalid error"; 616 break; 617 } 618 panic(msg); 619 } 620 #endif 621 622 SYSCTL_OID(_vm, OID_AUTO, zone, CTLTYPE_STRING|CTLFLAG_RD, \ 623 NULL, 0, sysctl_vm_zone, "A", "Zone Info"); 624 625 SYSCTL_INT(_vm, OID_AUTO, zone_kmem_pages, 626 CTLFLAG_RD, &zone_kmem_pages, 0, "Number of interrupt safe pages allocated by zone"); 627 SYSCTL_INT(_vm, OID_AUTO, zone_kmem_kvaspace, 628 CTLFLAG_RD, &zone_kmem_kvaspace, 0, "KVA space allocated by zone"); 629 SYSCTL_INT(_vm, OID_AUTO, zone_kern_pages, 630 CTLFLAG_RD, &zone_kern_pages, 0, "Number of non-interrupt safe pages allocated by zone"); 631