/*-
 * Copyright (c) 2006 Peter Wemm
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * $FreeBSD: src/sys/amd64/amd64/minidump_machdep.c,v 1.10 2009/05/29 21:27:12 jamie Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/cons.h>
#include <sys/device.h>
#include <sys/globaldata.h>
#include <sys/kernel.h>
#include <sys/kerneldump.h>
#include <sys/msgbuf.h>
#include <sys/kbio.h>
#include <vm/vm.h>
#include <vm/vm_kern.h>
#include <vm/pmap.h>
#include <machine/atomic.h>
#include <machine/elf.h>
#include <machine/globaldata.h>
#include <machine/md_var.h>
#include <machine/vmparam.h>
#include <machine/minidump.h>

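/*
 * The dump leader and trailer are each written as a single raw header;
 * the structure must be exactly one 512-byte device sector.
 */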
CTASSERT(sizeof(struct kerneldumpheader) == 512);

/*
 * Don't touch the first SIZEOF_METADATA bytes on the dump device. This
 * is to protect us from metadata and to protect metadata from us.
 */
#define	SIZEOF_METADATA		(64*1024)

#define	MD_ALIGN(x)	(((off_t)(x) + PAGE_MASK) & ~PAGE_MASK)
#define	DEV_ALIGN(x)	roundup2((off_t)(x), DEV_BSIZE)

uint64_t *vm_page_dump;
vm_offset_t vm_page_dump_size;

static struct kerneldumpheader kdh;
static off_t dumplo;

/* Handle chunked writes. */
static size_t fragsz;
static void *dump_va;
static size_t counter, progress;

CTASSERT(sizeof(*vm_page_dump) == 8);

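/*
 * Return non-zero if the physical page at pa falls inside one of the
 * dump_avail[] ranges.  The array is terminated by an all-zero entry.
 */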
static int
is_dumpable(vm_paddr_t pa)
{
	int i;

	for (i = 0; dump_avail[i].phys_beg || dump_avail[i].phys_end; ++i) {
		if (pa >= dump_avail[i].phys_beg && pa < dump_avail[i].phys_end)
			return (1);
	}
	return (0);
}

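/*
 * Convert a page count to megabytes, rounding up (256 4KB pages per MB).
 */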
#define PG2MB(pgs) (((pgs) + (1 << 8) - 1) >> 8)

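/*
 * Write out the fragment accumulated in the temporary mapping area
 * and reset the fragment state.
 */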
static int
blk_flush(struct dumperinfo *di)
{
	int error;

	if (fragsz == 0)
		return (0);

	error = dev_ddump(di->priv, dump_va, 0, dumplo, fragsz);
	dumplo += fragsz;
	fragsz = 0;
	return (error);
}

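/*
 * Write a page-aligned chunk to the dump device.  The caller supplies
 * either a virtual address (ptr) or a physical address (pa), never
 * both.  Physical pages are staged through temporary mappings and
 * flushed in chunks of at most max_iosize.
 */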
static int
blk_write(struct dumperinfo *di, char *ptr, vm_paddr_t pa, size_t sz)
{
	size_t len;
	int error, i, c;
	int max_iosize;

	error = 0;
	if ((sz & PAGE_MASK)) {
		kprintf("size not page aligned\n");
		return (EINVAL);
	}
	if (ptr != NULL && pa != 0) {
		kprintf("can't have both va and pa!\n");
		return (EINVAL);
	}
	if (pa != 0 && (((uintptr_t)pa) & PAGE_MASK) != 0) {
		kprintf("address not page aligned\n");
		return (EINVAL);
	}
	if (ptr != NULL) {
		/*
		 * If we're doing a virtual dump, flush any
		 * pre-existing pa pages
		 */
		error = blk_flush(di);
		if (error)
			return (error);
	}
	max_iosize = min(MAXPHYS, di->maxiosize);
	while (sz) {
		len = max_iosize - fragsz;
		if (len > sz)
			len = sz;
		counter += len;
		progress -= len;
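		/*
		 * Print the remaining megabytes roughly once per 16MB
		 * written, as a progress indicator.
		 */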
		if (counter >> 24) {
			kprintf(" %ld", PG2MB(progress >> PAGE_SHIFT));
			counter &= (1<<24) - 1;
		}
		if (ptr) {
			error = dev_ddump(di->priv, ptr, 0, dumplo, len);
			if (error)
				return (error);
			dumplo += len;
			ptr += len;
			sz -= len;
		} else {
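			/*
			 * Accumulate physical pages in the crash dump map.
			 * pmap_kenter_temporary() is assumed to return the
			 * base of the temporary mapping area, so dump_va
			 * always points at the start of the fragment that
			 * blk_flush() will write.
			 */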
			for (i = 0; i < len; i += PAGE_SIZE) {
				dump_va = pmap_kenter_temporary(pa + i,
						(i + fragsz) >> PAGE_SHIFT);
			}
			smp_invltlb();
			fragsz += len;
			pa += len;
			sz -= len;
			if (fragsz == max_iosize) {
				error = blk_flush(di);
				if (error)
					return (error);
			}
		}
	}

	/* Check for user abort. */
	c = cncheckc();
	if (c == 0x03)
		return (ECANCELED);
	if (c != -1 && c != NOKEY)
		kprintf(" (CTRL-C to abort) ");

	return (0);
}

/* A fake page table page, to avoid having to handle both 4K and 2M pages */
static pt_entry_t fakept[NPTEPG];

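/*
 * Write a minidump to the dump device.  The on-device layout produced
 * below is: the kernel dump leader, one page containing mdhdr, the
 * message buffer, the page bitmap, a 2MB fake PDP array covering the
 * entire virtual address space, the marked physical pages, and finally
 * the trailing kernel dump header.
 */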
void
minidumpsys(struct dumperinfo *di)
{
	uint64_t dumpsize;
	uint64_t ptesize;
	vm_offset_t va;
	vm_offset_t kern_end;
	int error;
	uint64_t bits;
	uint64_t *pdp, *pd, *pt, pa;
	int i, j, k, bit;
	int kpdp, klo, khi;
	int lpdp = -1;
	long lpdpttl = 0;
	struct minidumphdr2 mdhdr;
	struct mdglobaldata *md;

	cnpoll(TRUE);
	counter = 0;

	/*
	 * The minidump page table format is an array of PD entries (1GB
	 * pte's), representing the entire user and kernel virtual address
	 * space (256TB).
	 *
	 * However, we will only dump the KVM portion of this space.  And we
	 * only copy the PDP pages for direct access; the PD and PT pages
	 * will be included in the dump as part of the physical map.
	 */
	ptesize = NPML4EPG * NPDPEPG * 8;

	/*
	 * Walk page table pages, set bits in vm_page_dump.
	 *
	 * NOTE: kernel_vm_end can actually be below KERNBASE.
	 *	 Just use KvaEnd.  Also note that loops which go
	 *	 all the way to the end of the address space might
	 *	 overflow the loop variable.
	 */
	md = (struct mdglobaldata *)globaldata_find(0);

	kern_end = KvaEnd;
	if (kern_end < (vm_offset_t)&(md[ncpus]))
		kern_end = (vm_offset_t)&(md[ncpus]);

	pdp = (uint64_t *)PHYS_TO_DMAP(KPDPphys);
	for (va = VM_MIN_KERNEL_ADDRESS; va < kern_end; va += NBPDR) {
		/*
		 * va can wrap below VM_MIN_KERNEL_ADDRESS when the NBPDR
		 * increment overflows at the top of the address space;
		 * treat that as the end of the walk.
		 */
		if (va < VM_MIN_KERNEL_ADDRESS)
			break;

		/*
		 * KPDPphys[] is relative to VM_MIN_KERNEL_ADDRESS.  It
		 * contains NKPML4E PDP pages (so we can get to all kernel
		 * PD entries from this array).  Each PDP entry maps one
		 * PD page, representing 1GB of KVA; the PDP array has been
		 * initialized for the entire kernel address space, and its
		 * PD entries are included in the dump.
		 */
		i = ((va - VM_MIN_KERNEL_ADDRESS) >> PDPSHIFT) &
		    (NPML4EPG * NPDPEPG - 1);
		if (i != lpdp) {
			lpdp = i;
			lpdpttl = 0;
		}
		if ((pdp[i] & kernel_pmap.pmap_bits[PG_V_IDX]) == 0)
			continue;

		/*
		 * Add the PD page from the PDP to the dump
		 */
		dump_add_page(pdp[i] & PG_FRAME);
		lpdpttl += PAGE_SIZE;

		pd = (uint64_t *)PHYS_TO_DMAP(pdp[i] & PG_FRAME);
		j = ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
		if ((pd[j] & (kernel_pmap.pmap_bits[PG_PS_IDX] |
			      kernel_pmap.pmap_bits[PG_V_IDX])) ==
		    (kernel_pmap.pmap_bits[PG_PS_IDX] |
		     kernel_pmap.pmap_bits[PG_V_IDX])) {
			/* This is an entire 2M page. */
			lpdpttl += PAGE_SIZE * NPTEPG;
			pa = pd[j] & PG_PS_FRAME;
			for (k = 0; k < NPTEPG; k++) {
				if (is_dumpable(pa))
					dump_add_page(pa);
				pa += PAGE_SIZE;
			}
		} else if ((pd[j] & kernel_pmap.pmap_bits[PG_V_IDX]) ==
			   kernel_pmap.pmap_bits[PG_V_IDX]) {
			/*
			 * Add the PT page from the PD to the dump (it is
			 * no longer included in the ptemap).
			 */
			dump_add_page(pd[j] & PG_FRAME);
			lpdpttl += PAGE_SIZE;

			/* Set bit for each valid page in this 2MB block. */
			pt = (uint64_t *)PHYS_TO_DMAP(pd[j] & PG_FRAME);
			for (k = 0; k < NPTEPG; k++) {
				if ((pt[k] & kernel_pmap.pmap_bits[PG_V_IDX]) ==
				    kernel_pmap.pmap_bits[PG_V_IDX]) {
					pa = pt[k] & PG_FRAME;
					lpdpttl += PAGE_SIZE;
					if (is_dumpable(pa))
						dump_add_page(pa);
				}
			}
		} else {
			/* Nothing; we're going to dump a null page. */
		}
	}

	/* Calculate dump size. */
	dumpsize = ptesize;
	dumpsize += round_page(msgbufp->msg_size);
	dumpsize += round_page(vm_page_dump_size);

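	/*
	 * Walk the bitmap and total up the dumpable pages.  Each 64-bit
	 * word covers 64 pages; bsfq() finds the lowest set bit, so the
	 * page frame is i * 64 + bit.  Pages that turn out to be
	 * undumpable are dropped from the bitmap here.  One extra page
	 * is reserved for the minidump header itself.
	 */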
	for (i = 0; i < vm_page_dump_size / sizeof(*vm_page_dump); i++) {
		bits = vm_page_dump[i];
		while (bits) {
			bit = bsfq(bits);
			pa = (((uint64_t)i * sizeof(*vm_page_dump) * NBBY) +
			      bit) * PAGE_SIZE;
			/* Clear out undumpable pages now if needed. */
			if (is_dumpable(pa)) {
				dumpsize += PAGE_SIZE;
			} else {
				dump_drop_page(pa);
			}
			bits &= ~(1ul << bit);
		}
	}
	dumpsize += PAGE_SIZE;

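	/*
	 * The dump is laid out at the very end of the dump device, with
	 * the leader and trailer headers bracketing the data; the first
	 * SIZEOF_METADATA bytes of the device are never touched.
	 */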
	/* Determine dump offset on device. */
	if (di->mediasize < SIZEOF_METADATA + dumpsize + sizeof(kdh) * 2) {
		error = ENOSPC;
		goto fail;
	}
	dumplo = di->mediaoffset + di->mediasize - dumpsize;
	dumplo -= sizeof(kdh) * 2;
	progress = dumpsize;

	/* Initialize mdhdr */
	bzero(&mdhdr, sizeof(mdhdr));
	strcpy(mdhdr.magic, MINIDUMP2_MAGIC);
	mdhdr.version = MINIDUMP2_VERSION;
	mdhdr.msgbufsize = msgbufp->msg_size;
	mdhdr.bitmapsize = vm_page_dump_size;
	mdhdr.ptesize = ptesize;
	mdhdr.kernbase = VM_MIN_KERNEL_ADDRESS;
	mdhdr.dmapbase = DMAP_MIN_ADDRESS;
	mdhdr.dmapend = DMAP_MAX_ADDRESS;

	mkdumpheader(&kdh, KERNELDUMPMAGIC, KERNELDUMP_AMD64_VERSION,
		     dumpsize, di->blocksize);

	kprintf("Physical memory: %jd MB\n", (intmax_t)ptoa(physmem) / 1048576);
	kprintf("Dumping %jd MB:", (intmax_t)dumpsize >> 20);

	/* Dump leader */
	error = dev_ddump(di->priv, &kdh, 0, dumplo, sizeof(kdh));
	if (error)
		goto fail;
	dumplo += sizeof(kdh);

	/* Dump my header */
	bzero(fakept, sizeof(fakept));
	bcopy(&mdhdr, fakept, sizeof(mdhdr));
	error = blk_write(di, (char *)fakept, 0, PAGE_SIZE);
	if (error)
		goto fail;

	/* Dump msgbuf up front */
	error = blk_write(di, (char *)msgbufp->msg_ptr, 0,
			  round_page(msgbufp->msg_size));
	if (error)
		goto fail;

	/* Dump bitmap */
	error = blk_write(di, (char *)vm_page_dump, 0,
			  round_page(vm_page_dump_size));
	if (error)
		goto fail;

	/*
	 * Dump a full PDP array for the entire KVM space, user and kernel.
	 * This is 512*512 PD entries, each representing 1GB
	 * (512*512*8 bytes = 2MB).
	 *
	 * The minidump only dumps PD entries related to KVA space.  Also
	 * note that pdp[] (aka KPDPphys[]) only covers VM_MIN_KERNEL_ADDRESS
	 * to VM_MAX_KERNEL_ADDRESS.
	 *
	 * The actual KPDPphys[] array covers a KVA space starting at
	 * KPDPPHYS_KVA.
	 *
	 * By dumping a PDP[] array of PDs representing the entire virtual
	 * address space we can expand what we dump in the future.
	 */
	pdp = (uint64_t *)PHYS_TO_DMAP(KPDPphys);
	kpdp = (int)(KPDPPHYS_KVA >> PDPSHIFT) &
	       (NPML4EPG * NPDPEPG - 1);
	klo = (int)(VM_MIN_KERNEL_ADDRESS >> PDPSHIFT) &
	      (NPML4EPG * NPDPEPG - 1);
	khi = (int)(VM_MAX_KERNEL_ADDRESS >> PDPSHIFT) &
	      (NPML4EPG * NPDPEPG - 1);

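	/*
	 * Entries outside the kernel range [klo, khi] are zeroed.
	 * In-range entries come from pdp[], which is indexed relative
	 * to kpdp.  Each 512-entry page of the fake PDP is written out
	 * as soon as it fills.
	 */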
	for (i = 0; i < NPML4EPG * NPDPEPG; ++i) {
		if (i < klo || i > khi) {
			fakept[i & (NPDPEPG - 1)] = 0;
		} else {
			fakept[i & (NPDPEPG - 1)] = pdp[i - kpdp];
		}
		if ((i & (NPDPEPG - 1)) == (NPDPEPG - 1)) {
			error = blk_write(di, (char *)fakept, 0, PAGE_SIZE);
			if (error)
				goto fail;
			error = blk_flush(di);
			if (error)
				goto fail;
		}
	}

	/* Dump memory chunks */
	/* XXX cluster it up and use blk_dump() */
	for (i = 0; i < vm_page_dump_size / sizeof(*vm_page_dump); i++) {
		bits = vm_page_dump[i];
		while (bits) {
			bit = bsfq(bits);
			pa = (((uint64_t)i * sizeof(*vm_page_dump) * NBBY) +
			      bit) * PAGE_SIZE;
			error = blk_write(di, 0, pa, PAGE_SIZE);
			if (error)
				goto fail;
			bits &= ~(1ul << bit);
		}
	}

	error = blk_flush(di);
	if (error)
		goto fail;

	/* Dump trailer */
	error = dev_ddump(di->priv, &kdh, 0, dumplo, sizeof(kdh));
	if (error)
		goto fail;
	dumplo += sizeof(kdh);

	/* Signal completion, signoff and exit stage left. */
	dev_ddump(di->priv, NULL, 0, 0, 0);
	kprintf("\nDump complete\n");
	cnpoll(FALSE);
	return;

 fail:
	cnpoll(FALSE);
	if (error < 0)
		error = -error;

	if (error == ECANCELED)
		kprintf("\nDump aborted\n");
	else if (error == ENOSPC)
		kprintf("\nDump failed. Partition too small.\n");
	else
		kprintf("\n** DUMP FAILED (ERROR %d) **\n", error);
}

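/*
 * Set or clear a page's bit in the dump bitmap.  Each 64-bit word
 * covers 64 page frames; for example, pa 0x12345000 is page frame
 * 0x12345, which lands in word 0x12345 >> 6 = 0x48d at bit
 * 0x12345 & 63 = 5.
 */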
void
dump_add_page(vm_paddr_t pa)
{
	int idx, bit;

	pa >>= PAGE_SHIFT;
	idx = pa >> 6;		/* 2^6 = 64 */
	bit = pa & 63;
	atomic_set_long(&vm_page_dump[idx], 1ul << bit);
}

void
dump_drop_page(vm_paddr_t pa)
{
	int idx, bit;

	pa >>= PAGE_SHIFT;
	idx = pa >> 6;		/* 2^6 = 64 */
	bit = pa & 63;
	atomic_clear_long(&vm_page_dump[idx], 1ul << bit);
}