diff options
author | Tony Luck <tony.luck@intel.com> | 2014-03-11 09:34:13 -0700 |
---|---|---|
committer | Tony Luck <tony.luck@intel.com> | 2014-03-11 09:34:13 -0700 |
commit | de00752da7c655798bc87d0a4f52af236829c55d (patch) | |
tree | 46744640573f05dcfe842f0f31624bd2f1260705 | |
download | ras-tools-de00752da7c655798bc87d0a4f52af236829c55d.tar.gz |
Add example recovery application
This is pretty trivial - just shows how to setup a SIGBUS handler
for recoverable machine checks. Injection of the actual error is
handled externally (e.g. using EINJ).
Signed-off-by: Tony Luck <tony.luck@intel.com>
-rw-r--r-- | mca-recover.c | 146 |
1 files changed, 146 insertions, 0 deletions
diff --git a/mca-recover.c b/mca-recover.c new file mode 100644 index 0000000..e4276a1 --- /dev/null +++ b/mca-recover.c @@ -0,0 +1,146 @@ +/* + * Copyright (C) 2013 Intel Corporation + * Authors: Tony Luck + * + * This software may be redistributed and/or modified under the terms of + * the GNU General Public License ("GPL") version 2 only as published by the + * Free Software Foundation. + */ + +/* + * Set up to get zapped by a machine check (injected elsewhere) + * recovery function reports physical address of new page - so + * we can inject to that and repeat over and over. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <time.h> +#include <fcntl.h> +#include <signal.h> +#include <sys/mman.h> + +static int pagesize; + +/* + * get information about address from /proc/{pid}/pagemap + * Assumes target address is mapped as 4K (not hugepage) + */ +unsigned long long vtop(unsigned long long addr) +{ + unsigned long long pinfo; + long offset = addr / pagesize * (sizeof pinfo); + int fd; + char pagemapname[64]; + + sprintf(pagemapname, "/proc/%d/pagemap", getpid()); + fd = open(pagemapname, O_RDONLY); + if (fd == -1) { + perror(pagemapname); + exit(1); + } + if (pread(fd, &pinfo, sizeof pinfo, offset) != sizeof pinfo) { + perror(pagemapname); + exit(1); + } + close(fd); + if ((pinfo & (1ull << 63)) == 0) { + printf("page not present\n"); + exit(1); + } + return ((pinfo & 0x007fffffffffffffull) << 12) + (addr & (pagesize - 1)); +} + +int checksum(unsigned char *addr) +{ + int i, sum; + + sum = 0; + for (i = 0; i < pagesize; i++) + sum += addr[i]; + return sum; +} + +/* + * Older glibc headers don't have the si_addr_lsb field in the siginfo_t + * structure ... ugly hack to get it + */ +struct morebits { + void *addr; + short lsb; +}; + +char *buf; +unsigned long long phys; + +/* + * "Recover" from the error by allocating a new page and mapping + * it at the same virtual address as the page we lost. Fill with + * the same (trivial) contents. + */ +void recover(int sig, siginfo_t *si, void *v) +{ + struct morebits *m = (struct morebits *)&si->si_addr; + char *newbuf; + + printf("recover: sig=%d si=%p v=%p\n", sig, si, v); + printf("Platform memory error at 0x%p\n", si->si_addr); + printf("addr = %p lsb=%d\n", m->addr, m->lsb); + + newbuf = mmap(buf, pagesize, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + if (newbuf == MAP_FAILED) { + fprintf(stderr, "Can't get a single page of memory!\n"); + exit(1); + } + if (newbuf != buf) { + fprintf(stderr, "Could not allocate at original virtual address\n"); + exit(1); + } + buf = newbuf; + memset(buf, '*', pagesize); + phys = vtop((unsigned long long)buf); + + printf("Recovery allocated new page at physical %llx\n", phys); +} + +struct sigaction recover_act = { + .sa_sigaction = recover, + .sa_flags = SA_SIGINFO, +}; + +int main(int argc, char **argv) +{ + int i, sum, rightsum; + int iflag = 0; + int tflag = 0; + time_t now; + + pagesize = getpagesize(); + + buf = mmap(NULL, pagesize, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + + if (buf == MAP_FAILED) { + fprintf(stderr, "Can't get a single page of memory!\n"); + return 1; + } + memset(buf, '*', pagesize); + rightsum = checksum(buf); + phys = vtop((unsigned long long)buf); + + printf("vtop(%llx) = %llx\n", (unsigned long long)buf, phys); + fflush(stdout); + + sigaction(SIGBUS, &recover_act, NULL); + + while (1) { + sum = checksum(buf); + if (sum != rightsum) { + printf("Ooops. Saw bad checksum %d\n", sum); + break; + } + } + + return 1; +} |