From c0345a84aa23b03722ff306af87132e645f730c1 Mon Sep 17 00:00:00 2001
From: Peter Wemm <peter@FreeBSD.org>
Date: Fri, 21 Apr 2006 04:24:50 +0000
Subject: Introduce minidumps.  Full physical memory crash dumps are still
 available via the debug.minidump sysctl and tunable.

Traditional dumps store all physical memory.  This was once a good thing
when machines had a maximum of 64M of ram and 1GB of kvm.  These days,
machines often have many gigabytes of ram and a smaller amount of kvm.
libkvm+kgdb don't have a way to access physical ram that is not mapped
into kvm at the time of the crash dump, so the extra ram being dumped
is mostly wasted.

Minidumps invert the process.  Instead of dumping physical memory in
in order to guarantee that all of kvm's backing is dumped, minidumps
instead dump only memory that is actively mapped into kvm.

amd64 has a direct map region that things like UMA use.  Obviously we
cannot dump all of the direct map region because that is effectively
an old style all-physical-memory dump.  Instead, introduce a bitmap
and two helper routines (dump_add_page(pa) and dump_drop_page(pa)) that
allow certain critical direct map pages to be included in the dump.
uma_machdep.c's allocator is the intended consumer.

Dumps are a custom format.  At the very beginning of the file is a header,
then a copy of the message buffer, then the bitmap of pages present in
the dump, then the final level of the kvm page table trees (2MB mappings
are expanded into a 4K page mappings), then the sparse physical pages
according to the bitmap.  libkvm can now conveniently access the kvm
page table entries.

Booting my test 8GB machine, forcing it into ddb and forcing a dump
leads to a 48MB minidump.  While this is a best case, I expect minidumps
to be in the 100MB-500MB range.  Obviously, never larger than physical
memory of course.

minidumps are on by default.  It would want be necessary to turn them off
if it was necessary to debug corrupt kernel page table management as that
would mess up minidumps as well.

Both minidumps and regular dumps are supported on the same machine.
---
 sys/amd64/amd64/minidump_machdep.c | 420 +++++++++++++++++++++++++++++++++++++
 1 file changed, 420 insertions(+)
 create mode 100644 sys/amd64/amd64/minidump_machdep.c

(limited to 'sys/amd64/amd64/minidump_machdep.c')

diff --git a/sys/amd64/amd64/minidump_machdep.c b/sys/amd64/amd64/minidump_machdep.c
new file mode 100644
index 000000000000..1f7af43044ee
--- /dev/null
+++ b/sys/amd64/amd64/minidump_machdep.c
@@ -0,0 +1,420 @@
+/*-
+ * Copyright (c) 2006 Peter Wemm
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/cons.h>
+#include <sys/kernel.h>
+#include <sys/kerneldump.h>
+#include <sys/msgbuf.h>
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <machine/atomic.h>
+#include <machine/elf.h>
+#include <machine/md_var.h>
+#include <machine/vmparam.h>
+#include <machine/minidump.h>
+
+CTASSERT(sizeof(struct kerneldumpheader) == 512);
+
+/*
+ * Don't touch the first SIZEOF_METADATA bytes on the dump device. This
+ * is to protect us from metadata and to protect metadata from us.
+ */
+#define	SIZEOF_METADATA		(64*1024)
+
+#define	MD_ALIGN(x)	(((off_t)(x) + PAGE_MASK) & ~PAGE_MASK)
+#define	DEV_ALIGN(x)	(((off_t)(x) + (DEV_BSIZE-1)) & ~(DEV_BSIZE-1))
+
+extern uint64_t KPDPphys;
+
+uint64_t *vm_page_dump;
+int vm_page_dump_size;
+
+static struct kerneldumpheader kdh;
+static off_t dumplo;
+
+/* Handle chunked writes. */
+static size_t fragsz;
+static void *dump_va;
+static size_t counter, progress;
+
+CTASSERT(sizeof(*vm_page_dump) == 8);
+
+static int
+is_dumpable(vm_paddr_t pa)
+{
+	int i;
+
+	for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) {
+		if (pa >= dump_avail[i] && pa < dump_avail[i + 1])
+			return (1);
+	}
+	return (0);
+}
+
+/* XXX should be MI */
+static void
+mkdumpheader(struct kerneldumpheader *kdh, uint32_t archver, uint64_t dumplen,
+    uint32_t blksz)
+{
+
+	bzero(kdh, sizeof(*kdh));
+	strncpy(kdh->magic, KERNELDUMPMAGIC, sizeof(kdh->magic));
+	strncpy(kdh->architecture, MACHINE_ARCH, sizeof(kdh->architecture));
+	kdh->version = htod32(KERNELDUMPVERSION);
+	kdh->architectureversion = htod32(archver);
+	kdh->dumplength = htod64(dumplen);
+	kdh->dumptime = htod64(time_second);
+	kdh->blocksize = htod32(blksz);
+	strncpy(kdh->hostname, hostname, sizeof(kdh->hostname));
+	strncpy(kdh->versionstring, version, sizeof(kdh->versionstring));
+	if (panicstr != NULL)
+		strncpy(kdh->panicstring, panicstr, sizeof(kdh->panicstring));
+	kdh->parity = kerneldump_parity(kdh);
+}
+
+#define PG2MB(pgs) (((pgs) + (1 << 8) - 1) >> 8)
+
+static int
+blk_flush(struct dumperinfo *di)
+{
+	int error;
+
+	if (fragsz == 0)
+		return (0);
+
+	error = di->dumper(di->priv, dump_va, 0, dumplo, fragsz);
+	dumplo += fragsz;
+	fragsz = 0;
+	return (error);
+}
+
+static int
+blk_write(struct dumperinfo *di, char *ptr, vm_paddr_t pa, size_t sz)
+{
+	size_t len;
+	int error, i, c;
+
+	error = 0;
+	if ((sz % PAGE_SIZE) != 0) {
+		printf("size not page aligned\n");
+		return (EINVAL);
+	}
+	if (ptr != NULL && pa != 0) {
+		printf("cant have both va and pa!\n");
+		return (EINVAL);
+	}
+	if (pa != 0 && (((uintptr_t)ptr) % PAGE_SIZE) != 0) {
+		printf("address not page aligned\n");
+		return (EINVAL);
+	}
+	if (ptr != NULL) {
+		/* If we're doing a virtual dump, flush any pre-existing pa pages */
+		error = blk_flush(di);
+		if (error)
+			return (error);
+	}
+	while (sz) {
+		len = (MAXDUMPPGS * PAGE_SIZE) - fragsz;
+		if (len > sz)
+			len = sz;
+		counter += len;
+		progress -= len;
+		if (counter >> 24) {
+			printf(" %ld", PG2MB(progress >> PAGE_SHIFT));
+			counter &= (1<<24) - 1;
+		}
+		if (ptr) {
+			error = di->dumper(di->priv, ptr, 0, dumplo, len);
+			if (error)
+				return (error);
+			dumplo += len;
+			ptr += len;
+			sz -= len;
+		} else {
+			for (i = 0; i < len; i += PAGE_SIZE)
+				dump_va = pmap_kenter_temporary(pa + i, (i + fragsz) >> PAGE_SHIFT);
+			fragsz += len;
+			pa += len;
+			sz -= len;
+			if (fragsz == (MAXDUMPPGS * PAGE_SIZE)) {
+				error = blk_flush(di);
+				if (error)
+					return (error);
+			}
+		}
+
+		/* Check for user abort. */
+		c = cncheckc();
+		if (c == 0x03)
+			return (ECANCELED);
+		if (c != -1)
+			printf(" (CTRL-C to abort) ");
+	}
+
+	return (0);
+}
+
+/* A fake page table page, to avoid having to handle both 4K and 2M pages */
+static pt_entry_t fakept[NPTEPG];
+
+void
+minidumpsys(struct dumperinfo *di)
+{
+	uint64_t dumpsize;
+	uint32_t ptesize;
+	vm_offset_t va;
+	int error;
+	uint64_t bits;
+	uint64_t *pdp, *pd, *pt, pa;
+	int i, j, k, bit;
+	struct minidumphdr mdhdr;
+
+	counter = 0;
+	/* Walk page table pages, set bits in vm_page_dump */
+	ptesize = 0;
+	pdp = (uint64_t *)PHYS_TO_DMAP(KPDPphys);
+	for (va = KERNBASE; va < kernel_vm_end; va += NBPDR) {
+		i = (va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1);
+		/*
+		 * We always write a page, even if it is zero. Each
+		 * page written corresponds to 2MB of space
+		 */
+		ptesize += PAGE_SIZE;
+		if ((pdp[i] & PG_V) == 0)
+			continue;
+		pd = (uint64_t *)PHYS_TO_DMAP(pdp[i] & PG_FRAME);
+		j = ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
+		if ((pd[j] & (PG_PS | PG_V)) == (PG_PS | PG_V))  {
+			/* This is an entire 2M page. */
+			pa = pd[j] & PG_FRAME & ~PDRMASK;
+			for (k = 0; k < NPTEPG; k++) {
+				if (is_dumpable(pa))
+					dump_add_page(pa);
+				pa += PAGE_SIZE;
+			}
+			continue;
+		}
+		if ((pd[j] & PG_V) == PG_V) {
+			/* set bit for each valid page in this 2MB block */
+			pt = (uint64_t *)PHYS_TO_DMAP(pd[j] & PG_FRAME);
+			for (k = 0; k < NPTEPG; k++) {
+				if ((pt[k] & PG_V) == PG_V) {
+					pa = pt[k] & PG_FRAME;
+					if (is_dumpable(pa))
+						dump_add_page(pa);
+				}
+			}
+		} else {
+			/* nothing, we're going to dump a null page */
+		}
+	}
+
+	/* Calculate dump size. */
+	dumpsize = ptesize;
+	dumpsize += round_page(msgbufp->msg_size);
+	dumpsize += round_page(vm_page_dump_size);
+	for (i = 0; i < vm_page_dump_size / sizeof(*vm_page_dump); i++) {
+		bits = vm_page_dump[i];
+		while (bits) {
+			bit = bsfq(bits);
+			pa = (((uint64_t)i * sizeof(*vm_page_dump) * NBBY) + bit) * PAGE_SIZE;
+			/* Clear out undumpable pages now if needed */
+			if (is_dumpable(pa)) {
+				dumpsize += PAGE_SIZE;
+			} else {
+				dump_drop_page(pa);
+			}
+			bits &= ~(1ul << bit);
+		}
+	}
+	dumpsize += PAGE_SIZE;
+
+	/* Determine dump offset on device. */
+	if (di->mediasize < SIZEOF_METADATA + dumpsize + sizeof(kdh) * 2) {
+		error = ENOSPC;
+		goto fail;
+	}
+	dumplo = di->mediaoffset + di->mediasize - dumpsize;
+	dumplo -= sizeof(kdh) * 2;
+	progress = dumpsize;
+
+	/* Initialize mdhdr */
+	bzero(&mdhdr, sizeof(mdhdr));
+	strcpy(mdhdr.magic, MINIDUMP_MAGIC);
+	mdhdr.version = MINIDUMP_VERSION;
+	mdhdr.msgbufsize = msgbufp->msg_size;
+	mdhdr.bitmapsize = vm_page_dump_size;
+	mdhdr.ptesize = ptesize;
+	mdhdr.kernbase = KERNBASE;
+	mdhdr.dmapbase = DMAP_MIN_ADDRESS;
+	mdhdr.dmapend = DMAP_MAX_ADDRESS;
+
+	mkdumpheader(&kdh, KERNELDUMP_AMD64_VERSION, dumpsize, di->blocksize);
+
+	printf("Physical memory: %ju MB\n", ptoa((uintmax_t)physmem) / 1048576);
+	printf("Dumping %llu MB:", (long long)dumpsize >> 20);
+
+	/* Dump leader */
+	error = di->dumper(di->priv, &kdh, 0, dumplo, sizeof(kdh));
+	if (error)
+		goto fail;
+	dumplo += sizeof(kdh);
+
+	/* Dump my header */
+	bzero(&fakept, sizeof(fakept));
+	bcopy(&mdhdr, &fakept, sizeof(mdhdr));
+	error = blk_write(di, (char *)&fakept, 0, PAGE_SIZE);
+	if (error)
+		goto fail;
+
+	/* Dump msgbuf up front */
+	error = blk_write(di, (char *)msgbufp->msg_ptr, 0, round_page(msgbufp->msg_size));
+	if (error)
+		goto fail;
+
+	/* Dump bitmap */
+	error = blk_write(di, (char *)vm_page_dump, 0, round_page(vm_page_dump_size));
+	if (error)
+		goto fail;
+
+	/* Dump kernel page table pages */
+	pdp = (uint64_t *)PHYS_TO_DMAP(KPDPphys);
+	for (va = KERNBASE; va < kernel_vm_end; va += NBPDR) {
+		i = (va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1);
+		/* We always write a page, even if it is zero */
+		if ((pdp[i] & PG_V) == 0) {
+			bzero(fakept, sizeof(fakept));
+			error = blk_write(di, (char *)&fakept, 0, PAGE_SIZE);
+			if (error)
+				goto fail;
+			/* flush, in case we reuse fakept in the same block */
+			error = blk_flush(di);
+			if (error)
+				goto fail;
+			continue;
+		}
+		pd = (uint64_t *)PHYS_TO_DMAP(pdp[i] & PG_FRAME);
+		j = ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
+		if ((pd[j] & (PG_PS | PG_V)) == (PG_PS | PG_V))  {
+			/* This is a single 2M block. Generate a fake PTP */
+			pa = pd[j] & PG_FRAME & ~PDRMASK;
+			for (k = 0; k < NPTEPG; k++) {
+				fakept[k] = (pa + (k * PAGE_SIZE)) | PG_V | PG_RW | PG_A | PG_M;
+			}
+			error = blk_write(di, (char *)&fakept, 0, PAGE_SIZE);
+			if (error)
+				goto fail;
+			/* flush, in case we reuse fakept in the same block */
+			error = blk_flush(di);
+			if (error)
+				goto fail;
+			continue;
+		}
+		if ((pd[j] & PG_V) == PG_V) {
+			pt = (uint64_t *)PHYS_TO_DMAP(pd[j] & PG_FRAME);
+			error = blk_write(di, (char *)pt, 0, PAGE_SIZE);
+			if (error)
+				goto fail;
+		} else {
+			bzero(fakept, sizeof(fakept));
+			error = blk_write(di, (char *)&fakept, 0, PAGE_SIZE);
+			if (error)
+				goto fail;
+			/* flush, in case we reuse fakept in the same block */
+			error = blk_flush(di);
+			if (error)
+				goto fail;
+		}
+	}
+
+	/* Dump memory chunks */
+	/* XXX cluster it up and use blk_dump() */
+	for (i = 0; i < vm_page_dump_size / sizeof(*vm_page_dump); i++) {
+		bits = vm_page_dump[i];
+		while (bits) {
+			bit = bsfq(bits);
+			pa = (((uint64_t)i * sizeof(*vm_page_dump) * NBBY) + bit) * PAGE_SIZE;
+			error = blk_write(di, 0, pa, PAGE_SIZE);
+			if (error)
+				goto fail;
+			bits &= ~(1ul << bit);
+		}
+	}
+
+	error = blk_flush(di);
+	if (error)
+		goto fail;
+
+	/* Dump trailer */
+	error = di->dumper(di->priv, &kdh, 0, dumplo, sizeof(kdh));
+	if (error)
+		goto fail;
+	dumplo += sizeof(kdh);
+
+	/* Signal completion, signoff and exit stage left. */
+	di->dumper(di->priv, NULL, 0, 0, 0);
+	printf("\nDump complete\n");
+	return;
+
+ fail:
+	if (error < 0)
+		error = -error;
+
+	if (error == ECANCELED)
+		printf("\nDump aborted\n");
+	else if (error == ENOSPC)
+		printf("\nDump failed. Partition too small.\n");
+	else
+		printf("\n** DUMP FAILED (ERROR %d) **\n", error);
+}
+
+void
+dump_add_page(vm_paddr_t pa)
+{
+	int idx, bit;
+
+	pa >>= PAGE_SHIFT;
+	idx = pa >> 6;		/* 2^6 = 64 */
+	bit = pa & 63;
+	atomic_set_long(&vm_page_dump[idx], 1ul << bit);
+}
+
+void
+dump_drop_page(vm_paddr_t pa)
+{
+	int idx, bit;
+
+	pa >>= PAGE_SHIFT;
+	idx = pa >> 6;		/* 2^6 = 64 */
+	bit = pa & 63;
+	atomic_clear_long(&vm_page_dump[idx], 1ul << bit);
+}
-- 
cgit v1.2.3