/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2006 Peter Wemm
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_pmap.h"
#include "opt_watchdog.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/cons.h>
#include <sys/kernel.h>
#include <sys/kerneldump.h>
#include <sys/msgbuf.h>
#include <sys/sysctl.h>
#include <sys/watchdog.h>
#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
#include <vm/vm_dumpset.h>
#include <vm/pmap.h>
#include <machine/atomic.h>
#include <machine/elf.h>
#include <machine/md_var.h>
#include <machine/minidump.h>
#include <machine/vmparam.h>

CTASSERT(sizeof(struct kerneldumpheader) == 512);

static struct kerneldumpheader kdh;

/* Handle chunked writes. */
static size_t fragsz;
static void *dump_va;
static size_t progress, dumpsize, wdog_next;

static int dump_retry_count = 5;
SYSCTL_INT(_machdep, OID_AUTO, dump_retry_count, CTLFLAG_RWTUN,
    &dump_retry_count, 0,
    "Number of times dump has to retry before bailing out");

static int
blk_flush(struct dumperinfo *di)
{
	int error;

	if (fragsz == 0)
		return (0);

	error = dump_append(di, dump_va, fragsz);
	fragsz = 0;
	return (error);
}

/* Pat the watchdog approximately every 128MB of the dump. */
#define	WDOG_DUMP_INTERVAL	(128 * 1024 * 1024)

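/*
 * Write a run of pages to the dump device.  Exactly one of ptr (a kernel
 * virtual address) and pa (a physical address) may be supplied; physical
 * pages are staged through temporary mappings and flushed to the device
 * in chunks of at most maxdumpsz bytes.  The watchdog is patted and the
 * console is polled for a user abort (CTRL-C) as the dump progresses.
 */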
static int
blk_write(struct dumperinfo *di, char *ptr, vm_paddr_t pa, size_t sz)
{
	size_t len;
	int error, i, c;
	u_int maxdumpsz;

	maxdumpsz = min(di->maxiosize, MAXDUMPPGS * PAGE_SIZE);
	if (maxdumpsz == 0)	/* seatbelt */
		maxdumpsz = PAGE_SIZE;
	error = 0;
	if ((sz % PAGE_SIZE) != 0) {
		printf("size not page aligned\n");
		return (EINVAL);
	}
	if (ptr != NULL && pa != 0) {
		printf("cant have both va and pa!\n");
		return (EINVAL);
	}
	if ((((uintptr_t)pa) % PAGE_SIZE) != 0) {
		printf("address not page aligned %p\n", ptr);
		return (EINVAL);
	}
	if (ptr != NULL) {
		/* If we're doing a virtual dump, flush any pre-existing pa pages */
		error = blk_flush(di);
		if (error)
			return (error);
	}
	while (sz) {
		len = maxdumpsz - fragsz;
		if (len > sz)
			len = sz;
		progress -= len;

		dumpsys_pb_progress(len);
		if (progress <= wdog_next) {
			wdog_kern_pat(WD_LASTVAL);
			if (wdog_next > WDOG_DUMP_INTERVAL)
				wdog_next -= WDOG_DUMP_INTERVAL;
			else
				wdog_next = 0;
		}

		if (ptr) {
			error = dump_append(di, ptr, len);
			if (error)
				return (error);
			ptr += len;
			sz -= len;
		} else {
			for (i = 0; i < len; i += PAGE_SIZE)
				dump_va = pmap_kenter_temporary(pa + i,
				    (i + fragsz) >> PAGE_SHIFT);
			fragsz += len;
			pa += len;
			sz -= len;
			if (fragsz == maxdumpsz) {
				error = blk_flush(di);
				if (error)
					return (error);
			}
		}

		/* Check for user abort. */
		c = cncheckc();
		if (c == 0x03)
			return (ECANCELED);
		if (c != -1)
			printf(" (CTRL-C to abort) ");
	}

	return (0);
}

/* A fake page table page, to avoid having to handle both 4K and 2M pages */
static pd_entry_t fakepd[NPDEPG];

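/*
 * Write the minidump.  The dump consists of, in order: a page holding the
 * minidump header, the kernel message buffer, the dump_avail array, the
 * page dump bitmap, one page of page-table entries per 1GB of dumped KVA,
 * and finally every physical page marked in the dump bitmap.
 */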
int
cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state)
{
	uint32_t pmapsize;
	vm_offset_t va, kva_end;
	int error;
	uint64_t *pml4, *pdp, *pd, *pt, pa;
	uint64_t pdpe, pde, pte;
	int ii, j, k, n;
	int retry_count;
	struct minidumphdr mdhdr;
	struct msgbuf *mbp;

	retry_count = 0;
retry:
	retry_count++;

	/* Snapshot the KVA upper bound in case it grows. */
	kva_end = MAX(KERNBASE + nkpt * NBPDR, kernel_vm_end);

	/*
	 * Walk the kernel page table pages, setting the active entries in the
	 * dump bitmap.
	 *
	 * NB: for a live dump, we may be racing with updates to the page
	 * tables, so care must be taken to read each entry only once.
	 */
	pmapsize = 0;
	for (va = VM_MIN_KERNEL_ADDRESS; va < kva_end; ) {
		/*
		 * We always write a page, even if it is zero. Each
		 * page written corresponds to 1GB of space
		 */
		pmapsize += PAGE_SIZE;
		ii = pmap_pml4e_index(va);
		pml4 = (uint64_t *)PHYS_TO_DMAP(KPML4phys) + ii;
		pdp = (uint64_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
		pdpe = atomic_load_64(&pdp[pmap_pdpe_index(va)]);
		if ((pdpe & PG_V) == 0) {
			va += NBPDP;
			continue;
		}

		/*
		 * 1GB page is represented as 512 2MB pages in a dump.
		 */
		if ((pdpe & PG_PS) != 0) {
			va += NBPDP;
			pa = pdpe & PG_PS_FRAME;
			for (n = 0; n < NPDEPG * NPTEPG; n++) {
				if (vm_phys_is_dumpable(pa))
					vm_page_dump_add(state->dump_bitset,
					    pa);
				pa += PAGE_SIZE;
			}
			continue;
		}

		pd = (uint64_t *)PHYS_TO_DMAP(pdpe & PG_FRAME);
		for (n = 0; n < NPDEPG; n++, va += NBPDR) {
			pde = atomic_load_64(&pd[pmap_pde_index(va)]);

			if ((pde & PG_V) == 0)
				continue;

			if ((pde & PG_PS) != 0) {
				/* This is an entire 2M page. */
				pa = pde & PG_PS_FRAME;
				for (k = 0; k < NPTEPG; k++) {
					if (vm_phys_is_dumpable(pa))
						vm_page_dump_add(
						    state->dump_bitset, pa);
					pa += PAGE_SIZE;
				}
				continue;
			}

			pa = pde & PG_FRAME;
			/* set bit for this PTE page */
			if (vm_phys_is_dumpable(pa))
				vm_page_dump_add(state->dump_bitset, pa);
			/* and for each valid page in this 2MB block */
			pt = (uint64_t *)PHYS_TO_DMAP(pde & PG_FRAME);
			for (k = 0; k < NPTEPG; k++) {
				pte = atomic_load_64(&pt[k]);
				if ((pte & PG_V) == 0)
					continue;
				pa = pte & PG_FRAME;
				if (PHYS_IN_DMAP(pa) && vm_phys_is_dumpable(pa))
					vm_page_dump_add(state->dump_bitset,
					    pa);
			}
		}
	}

	/* Calculate dump size. */
	mbp = state->msgbufp;
	dumpsize = pmapsize;
	dumpsize += round_page(mbp->msg_size);
	dumpsize += round_page(sizeof(dump_avail));
	dumpsize += round_page(BITSET_SIZE(vm_page_dump_pages));
	VM_PAGE_DUMP_FOREACH(state->dump_bitset, pa) {
		/* Clear out undumpable pages now if needed */
		if (PHYS_IN_DMAP(pa) && vm_phys_is_dumpable(pa)) {
			dumpsize += PAGE_SIZE;
		} else {
			vm_page_dump_drop(state->dump_bitset, pa);
		}
	}
	dumpsize += PAGE_SIZE;

	wdog_next = progress = dumpsize;
	dumpsys_pb_init(dumpsize);

	/* Initialize mdhdr */
	bzero(&mdhdr, sizeof(mdhdr));
	strcpy(mdhdr.magic, MINIDUMP_MAGIC);
	mdhdr.version = MINIDUMP_VERSION;
	mdhdr.msgbufsize = mbp->msg_size;
	mdhdr.bitmapsize = round_page(BITSET_SIZE(vm_page_dump_pages));
	mdhdr.pmapsize = pmapsize;
	mdhdr.kernbase = VM_MIN_KERNEL_ADDRESS;
	mdhdr.dmapbase = DMAP_MIN_ADDRESS;
	mdhdr.dmapend = DMAP_MAX_ADDRESS;
	mdhdr.dumpavailsize = round_page(sizeof(dump_avail));

	dump_init_header(di, &kdh, KERNELDUMPMAGIC, KERNELDUMP_AMD64_VERSION,
	    dumpsize);

	error = dump_start(di, &kdh);
	if (error != 0)
		goto fail;

	printf("Dumping %llu out of %ju MB:", (long long)dumpsize >> 20,
	    ptoa((uintmax_t)physmem) / 1048576);

	/* Dump my header */
	bzero(&fakepd, sizeof(fakepd));
	bcopy(&mdhdr, &fakepd, sizeof(mdhdr));
	error = blk_write(di, (char *)&fakepd, 0, PAGE_SIZE);
	if (error)
		goto fail;

	/* Dump msgbuf up front */
	error = blk_write(di, mbp->msg_ptr, 0, round_page(mbp->msg_size));
	if (error)
		goto fail;

	/* Dump dump_avail */
	_Static_assert(sizeof(dump_avail) <= sizeof(fakepd),
	    "Large dump_avail not handled");
	bzero(&fakepd, sizeof(fakepd));
	memcpy(fakepd, dump_avail, sizeof(dump_avail));
	error = blk_write(di, (char *)fakepd, 0, PAGE_SIZE);
	if (error)
		goto fail;

	/* Dump bitmap */
	error = blk_write(di, (char *)state->dump_bitset, 0,
	    round_page(BITSET_SIZE(vm_page_dump_pages)));
	if (error)
		goto fail;

	/* Dump kernel page directory pages */
	bzero(fakepd, sizeof(fakepd));
	for (va = VM_MIN_KERNEL_ADDRESS; va < kva_end; va += NBPDP) {
		ii = pmap_pml4e_index(va);
		pml4 = (uint64_t *)PHYS_TO_DMAP(KPML4phys) + ii;
		pdp = (uint64_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
		pdpe = atomic_load_64(&pdp[pmap_pdpe_index(va)]);

		/* We always write a page, even if it is zero */
		if ((pdpe & PG_V) == 0) {
			error = blk_write(di, (char *)&fakepd, 0, PAGE_SIZE);
			if (error)
				goto fail;
			/* flush, in case we reuse fakepd in the same block */
			error = blk_flush(di);
			if (error)
				goto fail;
			continue;
		}

		/* 1GB page is represented as 512 2MB pages in a dump */
		if ((pdpe & PG_PS) != 0) {
			/* PDPE and PDP have identical layout in this case */
			fakepd[0] = pdpe;
			for (j = 1; j < NPDEPG; j++)
				fakepd[j] = fakepd[j - 1] + NBPDR;
			error = blk_write(di, (char *)&fakepd, 0, PAGE_SIZE);
			if (error)
				goto fail;
			/* flush, in case we reuse fakepd in the same block */
			error = blk_flush(di);
			if (error)
				goto fail;
			bzero(fakepd, sizeof(fakepd));
			continue;
		}

		pa = pdpe & PG_FRAME;
		if (PHYS_IN_DMAP(pa) && vm_phys_is_dumpable(pa)) {
			pd = (uint64_t *)PHYS_TO_DMAP(pa);
			error = blk_write(di, (char *)pd, 0, PAGE_SIZE);
		} else {
			/* Malformed pa, write the zeroed fakepd. */
			error = blk_write(di, (char *)&fakepd, 0, PAGE_SIZE);
		}
		if (error)
			goto fail;
		error = blk_flush(di);
		if (error)
			goto fail;
	}

	/* Dump memory chunks */
	VM_PAGE_DUMP_FOREACH(state->dump_bitset, pa) {
		error = blk_write(di, 0, pa, PAGE_SIZE);
		if (error)
			goto fail;
	}

	error = blk_flush(di);
	if (error)
		goto fail;

	error = dump_finish(di, &kdh);
	if (error != 0)
		goto fail;

	printf("\nDump complete\n");
	return (0);

fail:
	if (error < 0)
		error = -error;

	printf("\n");
	if (error == ENOSPC) {
		printf("Dump map grown while dumping. ");
		if (retry_count < dump_retry_count) {
			printf("Retrying...\n");
			goto retry;
		}
		printf("Dump failed.\n");
	} else if (error == ECANCELED)
		printf("Dump aborted\n");
	else if (error == E2BIG) {
		printf("Dump failed. Partition too small (about %lluMB were "
		    "needed this time).\n", (long long)dumpsize >> 20);
	} else
		printf("** DUMP FAILED (ERROR %d) **\n", error);
	return (error);
}