1 /* 2 * (MPSAFE) 3 * 4 * Copyright (c) 1997, 1998 John S. Dyson 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice immediately at the beginning of the file, without modification, 12 * this list of conditions, and the following disclaimer. 13 * 2. Absolutely no warranty of function or purpose is made by the author 14 * John S. Dyson. 15 * 16 * $FreeBSD: src/sys/vm/vm_zone.c,v 1.30.2.6 2002/10/10 19:50:16 dillon Exp $ 17 * $DragonFly: src/sys/vm/vm_zone.c,v 1.28 2008/01/23 17:35:48 nth Exp $ 18 */ 19 20 #include <sys/param.h> 21 #include <sys/queue.h> 22 #include <sys/systm.h> 23 #include <sys/kernel.h> 24 #include <sys/lock.h> 25 #include <sys/malloc.h> 26 #include <sys/sysctl.h> 27 #include <sys/vmmeter.h> 28 29 #include <vm/vm.h> 30 #include <vm/vm_object.h> 31 #include <vm/vm_page.h> 32 #include <vm/vm_map.h> 33 #include <vm/vm_kern.h> 34 #include <vm/vm_extern.h> 35 #include <vm/vm_zone.h> 36 37 #include <sys/spinlock2.h> 38 39 static MALLOC_DEFINE(M_ZONE, "ZONE", "Zone header"); 40 41 #define ZONE_ERROR_INVALID 0 42 #define ZONE_ERROR_NOTFREE 1 43 #define ZONE_ERROR_ALREADYFREE 2 44 45 #define ZONE_ROUNDING 32 46 47 #define ZENTRY_FREE 0x12342378 48 49 static void *zget(vm_zone_t z); 50 51 /* 52 * Return an item from the specified zone. This function is non-blocking for 53 * ZONE_INTERRUPT zones. 54 * 55 * No requirements. 56 */ 57 void * 58 zalloc(vm_zone_t z) 59 { 60 void *item; 61 62 #ifdef INVARIANTS 63 if (z == NULL) 64 zerror(ZONE_ERROR_INVALID); 65 #endif 66 spin_lock(&z->zlock); 67 if (z->zfreecnt > z->zfreemin) { 68 item = z->zitems; 69 #ifdef INVARIANTS 70 KASSERT(item != NULL, ("zitems unexpectedly NULL")); 71 if (((void **) item)[1] != (void *) ZENTRY_FREE) 72 zerror(ZONE_ERROR_NOTFREE); 73 ((void **) item)[1] = 0; 74 #endif 75 z->zitems = ((void **) item)[0]; 76 z->zfreecnt--; 77 z->znalloc++; 78 spin_unlock(&z->zlock); 79 } else { 80 spin_unlock(&z->zlock); 81 item = zget(z); 82 /* 83 * PANICFAIL allows the caller to assume that the zalloc() 84 * will always succeed. If it doesn't, we panic here. 85 */ 86 if (item == NULL && (z->zflags & ZONE_PANICFAIL)) 87 panic("zalloc(%s) failed", z->zname); 88 } 89 return item; 90 } 91 92 /* 93 * Free an item to the specified zone. 94 * 95 * No requirements. 96 */ 97 void 98 zfree(vm_zone_t z, void *item) 99 { 100 spin_lock(&z->zlock); 101 ((void **) item)[0] = z->zitems; 102 #ifdef INVARIANTS 103 if (((void **) item)[1] == (void *) ZENTRY_FREE) 104 zerror(ZONE_ERROR_ALREADYFREE); 105 ((void **) item)[1] = (void *) ZENTRY_FREE; 106 #endif 107 z->zitems = item; 108 z->zfreecnt++; 109 spin_unlock(&z->zlock); 110 } 111 112 /* 113 * This file comprises a very simple zone allocator. This is used 114 * in lieu of the malloc allocator, where needed or more optimal. 115 * 116 * Note that the initial implementation of this had coloring, and 117 * absolutely no improvement (actually perf degradation) occurred. 118 * 119 * Note also that the zones are type stable. The only restriction is 120 * that the first two longwords of a data structure can be changed 121 * between allocations. Any data that must be stable between allocations 122 * must reside in areas after the first two longwords. 123 * 124 * zinitna, zinit, zbootinit are the initialization routines. 125 * zalloc, zfree, are the allocation/free routines. 126 */ 127 128 LIST_HEAD(zlist, vm_zone) zlist = LIST_HEAD_INITIALIZER(zlist); 129 static int sysctl_vm_zone(SYSCTL_HANDLER_ARGS); 130 static int zone_kmem_pages, zone_kern_pages, zone_kmem_kvaspace; 131 132 /* 133 * Create a zone, but don't allocate the zone structure. If the 134 * zone had been previously created by the zone boot code, initialize 135 * various parts of the zone code. 136 * 137 * If waits are not allowed during allocation (e.g. during interrupt 138 * code), a-priori allocate the kernel virtual space, and allocate 139 * only pages when needed. 140 * 141 * Arguments: 142 * z pointer to zone structure. 143 * obj pointer to VM object (opt). 144 * name name of zone. 145 * size size of zone entries. 146 * nentries number of zone entries allocated (only ZONE_INTERRUPT.) 147 * flags ZONE_INTERRUPT -- items can be allocated at interrupt time. 148 * zalloc number of pages allocated when memory is needed. 149 * 150 * Note that when using ZONE_INTERRUPT, the size of the zone is limited 151 * by the nentries argument. The size of the memory allocatable is 152 * unlimited if ZONE_INTERRUPT is not set. 153 * 154 * No requirements. 155 */ 156 int 157 zinitna(vm_zone_t z, vm_object_t obj, char *name, int size, 158 int nentries, int flags, int zalloc) 159 { 160 int totsize; 161 162 /* 163 * Only zones created with zinit() are destroyable. 164 */ 165 if (z->zflags & ZONE_DESTROYABLE) 166 panic("zinitna: can't create destroyable zone"); 167 168 /* 169 * NOTE: We can only adjust zsize if we previously did not 170 * use zbootinit(). 171 */ 172 if ((z->zflags & ZONE_BOOT) == 0) { 173 z->zsize = (size + ZONE_ROUNDING - 1) & ~(ZONE_ROUNDING - 1); 174 spin_init(&z->zlock); 175 z->zfreecnt = 0; 176 z->ztotal = 0; 177 z->zmax = 0; 178 z->zname = name; 179 z->znalloc = 0; 180 z->zitems = NULL; 181 182 lwkt_gettoken(&vm_token); 183 LIST_INSERT_HEAD(&zlist, z, zlink); 184 lwkt_reltoken(&vm_token); 185 } 186 187 z->zkmvec = NULL; 188 z->zkmcur = z->zkmmax = 0; 189 z->zflags |= flags; 190 191 /* 192 * If we cannot wait, allocate KVA space up front, and we will fill 193 * in pages as needed. This is particularly required when creating 194 * an allocation space for map entries in kernel_map, because we 195 * do not want to go into a recursion deadlock with 196 * vm_map_entry_reserve(). 197 */ 198 if (z->zflags & ZONE_INTERRUPT) { 199 totsize = round_page(z->zsize * nentries); 200 zone_kmem_kvaspace += totsize; 201 202 z->zkva = kmem_alloc_pageable(&kernel_map, totsize); 203 if (z->zkva == 0) { 204 LIST_REMOVE(z, zlink); 205 return 0; 206 } 207 208 z->zpagemax = totsize / PAGE_SIZE; 209 if (obj == NULL) { 210 z->zobj = vm_object_allocate(OBJT_DEFAULT, z->zpagemax); 211 } else { 212 z->zobj = obj; 213 _vm_object_allocate(OBJT_DEFAULT, z->zpagemax, obj); 214 } 215 z->zallocflag = VM_ALLOC_SYSTEM | VM_ALLOC_INTERRUPT; 216 z->zmax += nentries; 217 } else { 218 z->zallocflag = VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM; 219 z->zmax = 0; 220 } 221 222 223 if (z->zsize > PAGE_SIZE) 224 z->zfreemin = 1; 225 else 226 z->zfreemin = PAGE_SIZE / z->zsize; 227 228 z->zpagecount = 0; 229 if (zalloc) 230 z->zalloc = zalloc; 231 else 232 z->zalloc = 1; 233 234 /* 235 * Populate the interrrupt zone at creation time rather than 236 * on first allocation, as this is a potentially long operation. 237 */ 238 if (z->zflags & ZONE_INTERRUPT) { 239 void *buf; 240 241 buf = zget(z); 242 zfree(z, buf); 243 } 244 245 return 1; 246 } 247 248 /* 249 * Subroutine same as zinitna, except zone data structure is allocated 250 * automatically by malloc. This routine should normally be used, except 251 * in certain tricky startup conditions in the VM system -- then 252 * zbootinit and zinitna can be used. Zinit is the standard zone 253 * initialization call. 254 * 255 * No requirements. 256 */ 257 vm_zone_t 258 zinit(char *name, int size, int nentries, int flags, int zalloc) 259 { 260 vm_zone_t z; 261 262 z = (vm_zone_t) kmalloc(sizeof (struct vm_zone), M_ZONE, M_NOWAIT); 263 if (z == NULL) 264 return NULL; 265 266 z->zflags = 0; 267 if (zinitna(z, NULL, name, size, nentries, 268 flags & ~ZONE_DESTROYABLE, zalloc) == 0) { 269 kfree(z, M_ZONE); 270 return NULL; 271 } 272 273 if (flags & ZONE_DESTROYABLE) 274 z->zflags |= ZONE_DESTROYABLE; 275 276 return z; 277 } 278 279 /* 280 * Initialize a zone before the system is fully up. This routine should 281 * only be called before full VM startup. 282 * 283 * Called from the low level boot code only. 284 */ 285 void 286 zbootinit(vm_zone_t z, char *name, int size, void *item, int nitems) 287 { 288 int i; 289 290 z->zname = name; 291 z->zsize = size; 292 z->zpagemax = 0; 293 z->zobj = NULL; 294 z->zflags = ZONE_BOOT; 295 z->zfreemin = 0; 296 z->zallocflag = 0; 297 z->zpagecount = 0; 298 z->zalloc = 0; 299 z->znalloc = 0; 300 spin_init(&z->zlock); 301 302 bzero(item, nitems * z->zsize); 303 z->zitems = NULL; 304 for (i = 0; i < nitems; i++) { 305 ((void **) item)[0] = z->zitems; 306 #ifdef INVARIANTS 307 ((void **) item)[1] = (void *) ZENTRY_FREE; 308 #endif 309 z->zitems = item; 310 item = (uint8_t *)item + z->zsize; 311 } 312 z->zfreecnt = nitems; 313 z->zmax = nitems; 314 z->ztotal = nitems; 315 316 lwkt_gettoken(&vm_token); 317 LIST_INSERT_HEAD(&zlist, z, zlink); 318 lwkt_reltoken(&vm_token); 319 } 320 321 /* 322 * Release all resources owned by zone created with zinit(). 323 * 324 * No requirements. 325 */ 326 void 327 zdestroy(vm_zone_t z) 328 { 329 int i; 330 331 if (z == NULL) 332 panic("zdestroy: null zone"); 333 if ((z->zflags & ZONE_DESTROYABLE) == 0) 334 panic("zdestroy: undestroyable zone"); 335 336 lwkt_gettoken(&vm_token); 337 LIST_REMOVE(z, zlink); 338 lwkt_reltoken(&vm_token); 339 340 /* 341 * Release virtual mappings, physical memory and update sysctl stats. 342 */ 343 if (z->zflags & ZONE_INTERRUPT) { 344 /* 345 * Pages mapped via pmap_kenter() must be removed from the 346 * kernel_pmap() before calling kmem_free() to avoid issues 347 * with kernel_pmap.pm_stats.resident_count. 348 */ 349 pmap_qremove(z->zkva, z->zpagemax); 350 351 /* 352 * Free the mapping. 353 */ 354 kmem_free(&kernel_map, z->zkva, z->zpagemax*PAGE_SIZE); 355 atomic_subtract_int(&zone_kmem_kvaspace, z->zpagemax*PAGE_SIZE); 356 357 /* 358 * Free the backing object and physical pages. 359 */ 360 vm_object_deallocate(z->zobj); 361 atomic_subtract_int(&zone_kmem_pages, z->zpagecount); 362 } else { 363 for (i=0; i < z->zkmcur; i++) { 364 kmem_free(&kernel_map, z->zkmvec[i], 365 z->zalloc*PAGE_SIZE); 366 atomic_subtract_int(&zone_kern_pages, z->zalloc); 367 } 368 if (z->zkmvec != NULL) 369 kfree(z->zkmvec, M_ZONE); 370 } 371 372 spin_uninit(&z->zlock); 373 kfree(z, M_ZONE); 374 } 375 376 377 /* 378 * void *zalloc(vm_zone_t zone) -- 379 * Returns an item from a specified zone. May not be called from a 380 * FAST interrupt or IPI function. 381 * 382 * void zfree(vm_zone_t zone, void *item) -- 383 * Frees an item back to a specified zone. May not be called from a 384 * FAST interrupt or IPI function. 385 */ 386 387 /* 388 * Internal zone routine. Not to be called from external (non vm_zone) code. 389 * 390 * No requirements. 391 */ 392 static void * 393 zget(vm_zone_t z) 394 { 395 int i; 396 vm_page_t m; 397 int nitems, nbytes; 398 int savezpc; 399 void *item; 400 401 if (z == NULL) 402 panic("zget: null zone"); 403 404 if (z->zflags & ZONE_INTERRUPT) { 405 /* 406 * Interrupt zones do not mess with the kernel_map, they 407 * simply populate an existing mapping. 408 */ 409 lwkt_gettoken(&vm_token); 410 vm_object_hold(z->zobj); 411 savezpc = z->zpagecount; 412 nbytes = z->zpagecount * PAGE_SIZE; 413 nbytes -= nbytes % z->zsize; 414 item = (char *) z->zkva + nbytes; 415 for (i = 0; ((i < z->zalloc) && (z->zpagecount < z->zpagemax)); 416 i++) { 417 vm_offset_t zkva; 418 419 m = vm_page_alloc(z->zobj, z->zpagecount, 420 z->zallocflag); 421 /* note: z might be modified due to blocking */ 422 if (m == NULL) 423 break; 424 425 /* 426 * Unbusy page so it can freed in zdestroy(). Make 427 * sure it is not on any queue and so can not be 428 * recycled under our feet. 429 */ 430 KKASSERT(m->queue == PQ_NONE); 431 vm_page_flag_clear(m, PG_BUSY); 432 433 zkva = z->zkva + z->zpagecount * PAGE_SIZE; 434 pmap_kenter(zkva, VM_PAGE_TO_PHYS(m)); /* YYY */ 435 bzero((void *)zkva, PAGE_SIZE); 436 KKASSERT(savezpc == z->zpagecount); 437 ++savezpc; 438 z->zpagecount++; 439 zone_kmem_pages++; 440 vmstats.v_wire_count++; 441 } 442 nitems = ((z->zpagecount * PAGE_SIZE) - nbytes) / z->zsize; 443 vm_object_drop(z->zobj); 444 lwkt_reltoken(&vm_token); 445 } else if (z->zflags & ZONE_SPECIAL) { 446 /* 447 * The special zone is the one used for vm_map_entry_t's. 448 * We have to avoid an infinite recursion in 449 * vm_map_entry_reserve() by using vm_map_entry_kreserve() 450 * instead. The map entries are pre-reserved by the kernel 451 * by vm_map_entry_reserve_cpu_init(). 452 */ 453 nbytes = z->zalloc * PAGE_SIZE; 454 455 item = (void *)kmem_alloc3(&kernel_map, nbytes, KM_KRESERVE); 456 457 /* note: z might be modified due to blocking */ 458 if (item != NULL) { 459 zone_kern_pages += z->zalloc; /* not MP-safe XXX */ 460 bzero(item, nbytes); 461 } else { 462 nbytes = 0; 463 } 464 nitems = nbytes / z->zsize; 465 } else { 466 /* 467 * Otherwise allocate KVA from the kernel_map. 468 */ 469 nbytes = z->zalloc * PAGE_SIZE; 470 471 item = (void *)kmem_alloc3(&kernel_map, nbytes, 0); 472 473 /* note: z might be modified due to blocking */ 474 if (item != NULL) { 475 zone_kern_pages += z->zalloc; /* not MP-safe XXX */ 476 bzero(item, nbytes); 477 478 if (z->zflags & ZONE_DESTROYABLE) { 479 if (z->zkmcur == z->zkmmax) { 480 z->zkmmax = 481 z->zkmmax==0 ? 1 : z->zkmmax*2; 482 z->zkmvec = krealloc(z->zkmvec, 483 z->zkmmax * sizeof(z->zkmvec[0]), 484 M_ZONE, M_WAITOK); 485 } 486 z->zkmvec[z->zkmcur++] = (vm_offset_t)item; 487 } 488 } else { 489 nbytes = 0; 490 } 491 nitems = nbytes / z->zsize; 492 } 493 494 spin_lock(&z->zlock); 495 z->ztotal += nitems; 496 /* 497 * Save one for immediate allocation 498 */ 499 if (nitems != 0) { 500 nitems -= 1; 501 for (i = 0; i < nitems; i++) { 502 ((void **) item)[0] = z->zitems; 503 #ifdef INVARIANTS 504 ((void **) item)[1] = (void *) ZENTRY_FREE; 505 #endif 506 z->zitems = item; 507 item = (uint8_t *)item + z->zsize; 508 } 509 z->zfreecnt += nitems; 510 z->znalloc++; 511 } else if (z->zfreecnt > 0) { 512 item = z->zitems; 513 z->zitems = ((void **) item)[0]; 514 #ifdef INVARIANTS 515 if (((void **) item)[1] != (void *) ZENTRY_FREE) 516 zerror(ZONE_ERROR_NOTFREE); 517 ((void **) item)[1] = 0; 518 #endif 519 z->zfreecnt--; 520 z->znalloc++; 521 } else { 522 item = NULL; 523 } 524 spin_unlock(&z->zlock); 525 526 /* 527 * A special zone may have used a kernel-reserved vm_map_entry. If 528 * so we have to be sure to recover our reserve so we don't run out. 529 * We will panic if we run out. 530 */ 531 if (z->zflags & ZONE_SPECIAL) 532 vm_map_entry_reserve(0); 533 534 return item; 535 } 536 537 /* 538 * No requirements. 539 */ 540 static int 541 sysctl_vm_zone(SYSCTL_HANDLER_ARGS) 542 { 543 int error=0; 544 vm_zone_t curzone; 545 char tmpbuf[128]; 546 char tmpname[14]; 547 548 ksnprintf(tmpbuf, sizeof(tmpbuf), 549 "\nITEM SIZE LIMIT USED FREE REQUESTS\n"); 550 error = SYSCTL_OUT(req, tmpbuf, strlen(tmpbuf)); 551 if (error) 552 return (error); 553 554 lwkt_gettoken(&vm_token); 555 LIST_FOREACH(curzone, &zlist, zlink) { 556 int i; 557 int len; 558 int offset; 559 560 len = strlen(curzone->zname); 561 if (len >= (sizeof(tmpname) - 1)) 562 len = (sizeof(tmpname) - 1); 563 for(i = 0; i < sizeof(tmpname) - 1; i++) 564 tmpname[i] = ' '; 565 tmpname[i] = 0; 566 memcpy(tmpname, curzone->zname, len); 567 tmpname[len] = ':'; 568 offset = 0; 569 if (curzone == LIST_FIRST(&zlist)) { 570 offset = 1; 571 tmpbuf[0] = '\n'; 572 } 573 574 ksnprintf(tmpbuf + offset, sizeof(tmpbuf) - offset, 575 "%s %6.6u, %8.8u, %6.6u, %6.6u, %8.8u\n", 576 tmpname, curzone->zsize, curzone->zmax, 577 (curzone->ztotal - curzone->zfreecnt), 578 curzone->zfreecnt, curzone->znalloc); 579 580 len = strlen((char *)tmpbuf); 581 if (LIST_NEXT(curzone, zlink) == NULL) 582 tmpbuf[len - 1] = 0; 583 584 error = SYSCTL_OUT(req, tmpbuf, len); 585 586 if (error) 587 break; 588 } 589 lwkt_reltoken(&vm_token); 590 return (error); 591 } 592 593 #if defined(INVARIANTS) 594 595 /* 596 * Debugging only. 597 */ 598 void 599 zerror(int error) 600 { 601 char *msg; 602 603 switch (error) { 604 case ZONE_ERROR_INVALID: 605 msg = "zone: invalid zone"; 606 break; 607 case ZONE_ERROR_NOTFREE: 608 msg = "zone: entry not free"; 609 break; 610 case ZONE_ERROR_ALREADYFREE: 611 msg = "zone: freeing free entry"; 612 break; 613 default: 614 msg = "zone: invalid error"; 615 break; 616 } 617 panic(msg); 618 } 619 #endif 620 621 SYSCTL_OID(_vm, OID_AUTO, zone, CTLTYPE_STRING|CTLFLAG_RD, \ 622 NULL, 0, sysctl_vm_zone, "A", "Zone Info"); 623 624 SYSCTL_INT(_vm, OID_AUTO, zone_kmem_pages, 625 CTLFLAG_RD, &zone_kmem_pages, 0, "Number of interrupt safe pages allocated by zone"); 626 SYSCTL_INT(_vm, OID_AUTO, zone_kmem_kvaspace, 627 CTLFLAG_RD, &zone_kmem_kvaspace, 0, "KVA space allocated by zone"); 628 SYSCTL_INT(_vm, OID_AUTO, zone_kern_pages, 629 CTLFLAG_RD, &zone_kern_pages, 0, "Number of non-interrupt safe pages allocated by zone"); 630