about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Tony Luck <tony.luck@intel.com> 2015-12-08 16:08:38 -0800
committer Andi Kleen <ak@linux.intel.com> 2016-01-04 11:12:28 -0800
commit 32252e9c37e97ea5083d90d2cf194bb85a4a0cda (patch)
tree 7bdbf84dafe63bd271d63c49f6adf2d3de26f35e
parent c83713fd96181fc6e486f05732f5b93f8a000e39 (diff)
download mcelog-32252e9c37e97ea5083d90d2cf194bb85a4a0cda.tar.gz
Add support to decode MSCOD values for Broadwell-{de,ep,ex} v129

Intel software developer manual version 057, released in December 2015, has
details of the MSCOD values in PCU, QPI and IMC banks on Broadwell server
processors. See volume 3, sections 16.7, 16.8.

[AK: Add missing prototype]

Signed-off-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
-rw-r--r-- Makefile 3
-rw-r--r-- broadwell_de.c 104
-rw-r--r-- broadwell_de.h 2
-rw-r--r-- broadwell_epex.c 149
-rw-r--r-- broadwell_epex.h 1
-rw-r--r-- intel.c 7
-rw-r--r-- intel.h 2
-rw-r--r-- mcelog.c 7
-rw-r--r-- mcelog.h 2
-rw-r--r-- p4.c 8
10 files changed, 283 insertions, 2 deletions
diff --git a/Makefile b/Makefile
index bab79b8..8b2d0ff 100644
--- a/Makefile
+++ b/Makefile
@@ -36,7 +36,8 @@ OBJ := p4.o k8.o mcelog.o dmi.o tsc.o core2.o bitfield.o intel.o \
nehalem.o dunnington.o tulsa.o config.o memutil.o msg.o \
eventloop.o leaky-bucket.o memdb.o server.o trigger.o \
client.o cache.o sysfs.o yellow.o page.o rbtree.o \
- xeon75xx.o sandy-bridge.o ivy-bridge.o haswell.o msr.o bus.o \
+ xeon75xx.o sandy-bridge.o ivy-bridge.o haswell.o \
+ broadwell_de.o broadwell_epex.o msr.o bus.o \
unknown.o
DISKDB_OBJ := diskdb.o dimm.o db.o
CLEAN := mcelog dmi tsc dbquery .depend .depend.X dbquery.o ${DISKDB_OBJ} \
diff --git a/broadwell_de.c b/broadwell_de.c
new file mode 100644
index 0000000..f5c19a0
--- /dev/null
+++ b/broadwell_de.c
@@ -0,0 +1,104 @@
+/* Copyright (C) 2015 Intel Corporation
+ Decode Intel Broadwell D specific machine check errors.
+
+ mcelog is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; version
+ 2.
+
+ mcelog is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should find a copy of v2 of the GNU General Public License somewhere
+ on your Linux system; if not, write to the Free Software Foundation,
+ Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+ Author: Tony Luck
+*/
+
+#include "mcelog.h"
+#include "bitfield.h"
+#include "broadwell_de.h"
+#include "memdb.h"
+
+/*
+ * PCU error-code names for Broadwell-DE, indexed by code value.
+ * See IA32 SDM Vol3B Table 16-24.
+ *
+ * Index values with no designated initializer are left as NULL entries.
+ * The table is consumed through the pcu_mc4 field descriptor.
+ */
+
+static char *pcu_1[] = {
+ [0x00] = "No Error",
+ [0x09] = "MC_MESSAGE_CHANNEL_TIMEOUT",
+ [0x13] = "MC_DMI_TRAINING_TIMEOUT",
+ [0x15] = "MC_DMI_CPU_RESET_ACK_TIMEOUT",
+ [0x1E] = "MC_VR_ICC_MAX_LT_FUSED_ICC_MAX",
+ [0x25] = "MC_SVID_COMMAN_TIMEOUT",
+ [0x26] = "MCA_PKGC_DIRECT_WAKE_RING_TIMEOUT",
+ [0x29] = "MC_VR_VOUT_MAC_LT_FUSED_SVID",
+ [0x2B] = "MC_PKGC_WATCHDOG_HANG_CBZ_DOWN",
+ [0x2C] = "MC_PKGC_WATCHDOG_HANG_CBZ_UP",
+ [0x44] = "MC_CRITICAL_VR_FAILED",
+ [0x46] = "MC_VID_RAMP_DOWN_FAILED",
+ [0x49] = "MC_SVID_WRITE_REG_VOUT_MAX_FAILED",
+ [0x4B] = "MC_BOOT_VID_TIMEOUT_DRAM_0",
+ [0x4F] = "MC_SVID_COMMAND_ERROR",
+ [0x52] = "MC_FIVR_CATAS_OVERVOL_FAULT",
+ [0x53] = "MC_FIVR_CATAS_OVERCUR_FAULT",
+ [0x57] = "MC_SVID_PKGC_REQUEST_FAILED",
+ [0x58] = "MC_SVID_IMON_REQUEST_FAILED",
+ [0x59] = "MC_SVID_ALERT_REQUEST_FAILED",
+ [0x62] = "MC_INVALID_PKGS_RSP_QPI",
+ [0x64] = "MC_INVALID_PKG_STATE_CONFIG",
+ [0x67] = "MC_HA_IMC_RW_BLOCK_ACK_TIMEOUT",
+ [0x6A] = "MC_MSGCH_PMREQ_CMP_TIMEOUT",
+ [0x72] = "MC_WATCHDOG_TIMEOUT_PKGS_MASTER",
+ [0x81] = "MC_RECOVERABLE_DIE_THERMAL_TOO_HOT"
+};
+
+/*
+ * Field descriptors handed to decode_bitfield() for bank 4: one field
+ * whose values are named by pcu_1, followed by an empty terminator entry.
+ * NOTE(review): the 24 is presumably the field's starting bit in the
+ * status value - confirm against the FIELD macro in bitfield.h.
+ */
+static struct field pcu_mc4[] = {
+ FIELD(24, pcu_1),
+ {}
+};
+
+/*
+ * Memory controller error flags for Broadwell-DE, decoded from the status
+ * value of banks 9 and 10. See IA32 SDM Vol3B Table 16-18.
+ *
+ * Each entry pairs a number with the message for that flag; the list ends
+ * with an empty terminator entry.
+ * NOTE(review): the number is presumably the status bit position - confirm
+ * against the SBITFIELD macro in bitfield.h.
+ */
+
+static struct field memctrl_mc9[] = {
+ SBITFIELD(16, "Address parity error"),
+ SBITFIELD(17, "HA Wrt buffer Data parity error"),
+ SBITFIELD(18, "HA Wrt byte enable parity error"),
+ SBITFIELD(19, "Corrected patrol scrub error"),
+ SBITFIELD(20, "Uncorrected patrol scrub error"),
+ SBITFIELD(21, "Corrected spare error"),
+ SBITFIELD(22, "Uncorrected spare error"),
+ SBITFIELD(23, "Corrected memory read error"),
+ SBITFIELD(24, "iMC, WDB, parity errors"),
+ {}
+};
+
+/*
+ * Print the Broadwell-DE specific decode of one machine check bank.
+ *
+ * cputype: CPU type of the reporting processor (not used here)
+ * bank:    machine check bank number; only 4, 9 and 10 produce output
+ * status:  status value of that bank
+ * misc:    misc value of that bank (not used here)
+ */
+void bdw_de_decode_model(int cputype, int bank, u64 status, u64 misc)
+{
+	u64 code;
+
+	/* Memory controller banks only need the flag table. */
+	if (bank == 9 || bank == 10) {
+		Wprintf("MemCtrl: ");
+		decode_bitfield(status, memctrl_mc9);
+		return;
+	}
+
+	if (bank != 4)
+		return;
+
+	Wprintf("PCU: ");
+
+	/* Low 16 status bits, compared with bit 12 masked off. */
+	code = EXTRACT(status, 0, 15) & ~(1ull << 12);
+	if (code == 0x402 || code == 0x403)
+		Wprintf("Internal errors ");
+	else if (code == 0x406)
+		Wprintf("Intel TXT errors ");
+	else if (code == 0x407)
+		Wprintf("Other UBOX Internal errors ");
+
+	if (EXTRACT(status, 16, 19) & 3)
+		Wprintf("PCU internal error ");
+	if (EXTRACT(status, 20, 23) & 4)
+		Wprintf("Ubox error ");
+
+	/* Finally name the code held in the pcu_mc4 field. */
+	decode_bitfield(status, pcu_mc4);
+}
diff --git a/broadwell_de.h b/broadwell_de.h
new file mode 100644
index 0000000..489f4fc
--- /dev/null
+++ b/broadwell_de.h
@@ -0,0 +1,2 @@
+/*
+ * Model-specific machine check decoders for Broadwell-DE. Each takes the
+ * CPU type, the bank number, and that bank's status and misc values.
+ *
+ * NOTE(review): broadwell_de.c in this patch defines only
+ * bdw_de_decode_model(); no definition of bdw_d_decode_model() is visible
+ * here. It looks like a leftover prototype - confirm, and drop it if
+ * nothing defines or calls it.
+ */
+void bdw_d_decode_model(int cputype, int bank, u64 status, u64 misc);
+void bdw_de_decode_model(int cputype, int bank, u64 status, u64 misc);
diff --git a/broadwell_epex.c b/broadwell_epex.c
new file mode 100644
index 0000000..576be1c
--- /dev/null
+++ b/broadwell_epex.c
@@ -0,0 +1,149 @@
+/* Copyright (C) 2015 Intel Corporation
+ Decode Intel Broadwell specific machine check errors.
+
+ mcelog is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; version
+ 2.
+
+ mcelog is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should find a copy of v2 of the GNU General Public License somewhere
+ on your Linux system; if not, write to the Free Software Foundation,
+ Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+ Author: Tony Luck
+*/
+
+#include "mcelog.h"
+#include "bitfield.h"
+#include "broadwell_epex.h"
+#include "memdb.h"
+
+/*
+ * PCU error-code names for Broadwell EP/EX, indexed by code value.
+ * See IA32 SDM Vol3B Table 16-20.
+ *
+ * Index values with no designated initializer are left as NULL entries.
+ * The table is consumed through the pcu_mc4 field descriptor.
+ */
+
+static char *pcu_1[] = {
+ [0x00] = "No Error",
+ [0x09] = "MC_MESSAGE_CHANNEL_TIMEOUT",
+ [0x0D] = "MC_IMC_FORCE_SR_S3_TIMEOUT",
+ [0x0E] = "MC_CPD_UNCPD_SD_TIMEOUT",
+ [0x13] = "MC_DMI_TRAINING_TIMEOUT",
+ [0x15] = "MC_DMI_CPU_RESET_ACK_TIMEOUT",
+ [0x1E] = "MC_VR_ICC_MAX_LT_FUSED_ICC_MAX",
+ [0x25] = "MC_SVID_COMMAN_TIMEOUT",
+ [0x29] = "MC_VR_VOUT_MAC_LT_FUSED_SVID",
+ [0x2B] = "MC_PKGC_WATCHDOG_HANG_CBZ_DOWN",
+ [0x2C] = "MC_PKGC_WATCHDOG_HANG_CBZ_UP",
+ [0x39] = "MC_PKGC_WATCHDOG_HANG_C3_UP_SF",
+ [0x44] = "MC_CRITICAL_VR_FAILED",
+ [0x45] = "MC_ICC_MAX_NOTSUPPORTED",
+ [0x46] = "MC_VID_RAMP_DOWN_FAILED",
+ [0x47] = "MC_EXCL_MODE_NO_PMREQ_CMP",
+ [0x48] = "MC_SVID_READ_REG_ICC_MAX_FAILED",
+ [0x49] = "MC_SVID_WRITE_REG_VOUT_MAX_FAILED",
+ [0x4B] = "MC_BOOT_VID_TIMEOUT_DRAM_0",
+ [0x4C] = "MC_BOOT_VID_TIMEOUT_DRAM_1",
+ [0x4D] = "MC_BOOT_VID_TIMEOUT_DRAM_2",
+ [0x4E] = "MC_BOOT_VID_TIMEOUT_DRAM_3",
+ [0x4F] = "MC_SVID_COMMAND_ERROR",
+ [0x52] = "MC_FIVR_CATAS_OVERVOL_FAULT",
+ [0x53] = "MC_FIVR_CATAS_OVERCUR_FAULT",
+ [0x57] = "MC_SVID_PKGC_REQUEST_FAILED",
+ [0x58] = "MC_SVID_IMON_REQUEST_FAILED",
+ [0x59] = "MC_SVID_ALERT_REQUEST_FAILED",
+ [0x60] = "MC_INVALID_PKGS_REQ_PCH",
+ [0x61] = "MC_INVALID_PKGS_REQ_QPI",
+ [0x62] = "MC_INVALID_PKGS_RSP_QPI",
+ [0x63] = "MC_INVALID_PKGS_RSP_PCH",
+ [0x64] = "MC_INVALID_PKG_STATE_CONFIG",
+ [0x67] = "MC_HA_IMC_RW_BLOCK_ACK_TIMEOUT",
+ [0x68] = "MC_IMC_RW_SMBUS_TIMEOUT",
+ [0x69] = "MC_HA_FAILSTS_CHANGE_DETECTED",
+ [0x6A] = "MC_MSGCH_PMREQ_CMP_TIMEOUT",
+ [0x70] = "MC_WATCHDOG_TIMEOUT_PKGC_SLAVE",
+ [0x71] = "MC_WATCHDOG_TIMEOUT_PKGC_MASTER",
+ [0x72] = "MC_WATCHDOG_TIMEOUT_PKGS_MASTER",
+ [0x7C] = "MC_BIOS_RST_CPL_INVALID_SEQ",
+ [0x7D] = "MC_MORE_THAN_ONE_TXT_AGENT",
+ [0x81] = "MC_RECOVERABLE_DIE_THERMAL_TOO_HOT"
+};
+
+/*
+ * Field descriptors handed to decode_bitfield() for bank 4: one field
+ * whose values are named by pcu_1, followed by an empty terminator entry.
+ * NOTE(review): the 24 is presumably the field's starting bit in the
+ * status value - confirm against the FIELD macro in bitfield.h.
+ */
+static struct field pcu_mc4[] = {
+ FIELD(24, pcu_1),
+ {}
+};
+
+/*
+ * QPI error-code names for Broadwell EP/EX, indexed by code value.
+ * See IA32 SDM Vol3B Table 16-21.
+ *
+ * Index values with no designated initializer are left as NULL entries.
+ * The table is consumed through the qpi_mc field descriptor.
+ */
+
+static char *qpi[] = {
+	[0x02] = "Intel QPI physical layer detected drift buffer alarm",
+	[0x03] = "Intel QPI physical layer detected latency buffer rollover",
+	[0x10] = "Intel QPI link layer detected control error from R3QPI",
+	[0x11] = "Rx entered LLR abort state on CRC error",
+	[0x12] = "Unsupported or undefined packet",
+	[0x13] = "Intel QPI link layer control error",
+	[0x15] = "RBT used un-initialized value",
+	[0x20] = "Intel QPI physical layer detected a QPI in-band reset but aborted initialization",
+	[0x21] = "Link failover data self healing",
+	[0x22] = "Phy detected in-band reset (no width change)",
+	[0x23] = "Link failover clock failover",
+	[0x30] = "Rx detected CRC error - successful LLR after Phy re-init",
+	/* Message spelling corrected: "wihout" -> "without". */
+	[0x31] = "Rx detected CRC error - successful LLR without Phy re-init",
+};
+
+/*
+ * Field descriptors handed to decode_bitfield() for the QPI banks: one
+ * field whose values are named by qpi, followed by an empty terminator.
+ * NOTE(review): the 16 is presumably the field's starting bit in the
+ * status value - confirm against the FIELD macro in bitfield.h.
+ */
+static struct field qpi_mc[] = {
+ FIELD(16, qpi),
+ {}
+};
+
+/*
+ * Memory controller error flags for Broadwell EP/EX, decoded from the
+ * status value of banks 9 through 16. See IA32 SDM Vol3B Table 16-26.
+ *
+ * Each entry pairs a number with the message for that flag; the list ends
+ * with an empty terminator entry. There is no entry numbered 23.
+ * NOTE(review): the number is presumably the status bit position - confirm
+ * against the SBITFIELD macro in bitfield.h.
+ */
+
+static struct field memctrl_mc9[] = {
+ SBITFIELD(16, "DDR3 address parity error"),
+ SBITFIELD(17, "Uncorrected HA write data error"),
+ SBITFIELD(18, "Uncorrected HA data byte enable error"),
+ SBITFIELD(19, "Corrected patrol scrub error"),
+ SBITFIELD(20, "Uncorrected patrol scrub error"),
+ SBITFIELD(21, "Corrected spare error"),
+ SBITFIELD(22, "Uncorrected spare error"),
+ SBITFIELD(24, "iMC write data buffer parity error"),
+ SBITFIELD(25, "DDR4 command address parity error"),
+ {}
+};
+
+/*
+ * Print the Broadwell EP/EX specific decode of one machine check bank.
+ *
+ * cputype: CPU type of the reporting processor (not used here)
+ * bank:    machine check bank number; 4 is decoded as PCU, 5/20/21 as QPI
+ *          and 9..16 as memory controller, anything else prints nothing
+ * status:  status value of that bank
+ * misc:    misc value of that bank (not used here)
+ */
+void bdw_epex_decode_model(int cputype, int bank, u64 status, u64 misc)
+{
+	u64 code;
+
+	if (bank == 4) {
+		Wprintf("PCU: ");
+
+		/* Low 16 status bits, compared with bit 12 masked off. */
+		code = EXTRACT(status, 0, 15) & ~(1ull << 12);
+		if (code == 0x402 || code == 0x403)
+			Wprintf("Internal errors ");
+		else if (code == 0x406)
+			Wprintf("Intel TXT errors ");
+		else if (code == 0x407)
+			Wprintf("Other UBOX Internal errors ");
+
+		if (EXTRACT(status, 16, 19))
+			Wprintf("PCU internal error ");
+		decode_bitfield(status, pcu_mc4);
+	} else if (bank == 5 || bank == 20 || bank == 21) {
+		Wprintf("QPI: ");
+		decode_bitfield(status, qpi_mc);
+	} else if (bank >= 9 && bank <= 16) {
+		Wprintf("MemCtrl: ");
+		decode_bitfield(status, memctrl_mc9);
+	}
+}
diff --git a/broadwell_epex.h b/broadwell_epex.h
new file mode 100644
index 0000000..8ed3356
--- /dev/null
+++ b/broadwell_epex.h
@@ -0,0 +1 @@
+/*
+ * Model-specific machine check decoder for Broadwell EP/EX. Takes the CPU
+ * type, the bank number, and that bank's status and misc values.
+ */
+void bdw_epex_decode_model(int cputype, int bank, u64 status, u64 misc);
diff --git a/intel.c b/intel.c
index 1167c1c..ffff405 100644
--- a/intel.c
+++ b/intel.c
@@ -35,6 +35,7 @@ void intel_cpu_init(enum cputype cpu)
cpu == CPU_SANDY_BRIDGE || cpu == CPU_SANDY_BRIDGE_EP ||
cpu == CPU_IVY_BRIDGE || cpu == CPU_IVY_BRIDGE_EPEX ||
cpu == CPU_HASWELL || cpu == CPU_HASWELL_EPEX || cpu == CPU_BROADWELL ||
+ cpu == CPU_BROADWELL_DE || cpu == CPU_BROADWELL_EPEX ||
cpu == CPU_KNIGHTS_LANDING || cpu == CPU_SKYLAKE)
memory_error_support = 1;
}
@@ -73,8 +74,12 @@ enum cputype select_intel_cputype(int family, int model)
return CPU_HASWELL;
else if (model == 0x3f)
return CPU_HASWELL_EPEX;
- else if (model == 0x3d || model == 0x4f || model == 0x56)
+ else if (model == 0x3d)
return CPU_BROADWELL;
+ else if (model == 0x4f)
+ return CPU_BROADWELL_EPEX;
+ else if (model == 0x56)
+ return CPU_BROADWELL_DE;
else if (model == 0x57)
return CPU_KNIGHTS_LANDING;
else if (model == 0x1c || model == 0x26 || model == 0x27 ||
diff --git a/intel.h b/intel.h
index 574dbb6..26781fc 100644
--- a/intel.h
+++ b/intel.h
@@ -21,6 +21,8 @@ extern int memory_error_support;
case CPU_HASWELL: \
case CPU_HASWELL_EPEX: \
case CPU_BROADWELL: \
+ case CPU_BROADWELL_DE: \
+ case CPU_BROADWELL_EPEX: \
case CPU_KNIGHTS_LANDING: \
case CPU_SKYLAKE
diff --git a/mcelog.c b/mcelog.c
index e96f199..2c9ff37 100644
--- a/mcelog.c
+++ b/mcelog.c
@@ -233,6 +233,8 @@ static char *cputype_name[] = {
[CPU_HASWELL] = "Haswell", /* Fill in better name */
[CPU_HASWELL_EPEX] = "Intel Xeon v3 (Haswell) EP/EX",
[CPU_BROADWELL] = "Broadwell",
+ [CPU_BROADWELL_DE] = "Intel Xeon (Broadwell) D family",
+ [CPU_BROADWELL_EPEX] = "Intel Xeon v4 (Broadwell) EP/EX",
[CPU_KNIGHTS_LANDING] = "Knights Landing",
[CPU_ATOM] = "ATOM",
[CPU_SKYLAKE] = "Skylake",
@@ -275,9 +277,13 @@ static struct config_choice cpu_choices[] = {
{ "haswell-ep", CPU_HASWELL_EPEX },
{ "haswell-ex", CPU_HASWELL_EPEX },
{ "broadwell", CPU_BROADWELL },
+ { "broadwell-d", CPU_BROADWELL_DE },
+ { "broadwell-ep", CPU_BROADWELL_EPEX },
+ { "broadwell-ex", CPU_BROADWELL_EPEX },
{ "knightslanding", CPU_KNIGHTS_LANDING },
{ "xeon-v2", CPU_IVY_BRIDGE_EPEX },
{ "xeon-v3", CPU_HASWELL_EPEX },
+ { "xeon-v4", CPU_BROADWELL_EPEX },
{ "atom", CPU_ATOM },
{ "skylake", CPU_SKYLAKE },
{ NULL }
@@ -442,6 +448,7 @@ static void dump_mce(struct mce *m, unsigned recordlen)
}
if (cputype != CPU_SANDY_BRIDGE_EP && cputype != CPU_IVY_BRIDGE_EPEX &&
cputype != CPU_HASWELL_EPEX && cputype != CPU_BROADWELL &&
+ cputype != CPU_BROADWELL_DE && cputype != CPU_BROADWELL_EPEX &&
cputype != CPU_KNIGHTS_LANDING && cputype != CPU_SKYLAKE)
resolveaddr(m->addr);
if (!ascii_mode && ismemerr && (m->status & MCI_STATUS_ADDRV)) {
diff --git a/mcelog.h b/mcelog.h
index 74ab3b3..237a5c6 100644
--- a/mcelog.h
+++ b/mcelog.h
@@ -124,6 +124,8 @@ enum cputype {
CPU_HASWELL,
CPU_HASWELL_EPEX,
CPU_BROADWELL,
+ CPU_BROADWELL_DE,
+ CPU_BROADWELL_EPEX,
CPU_KNIGHTS_LANDING,
CPU_ATOM,
CPU_SKYLAKE,
diff --git a/p4.c b/p4.c
index 083c8e4..fac1b75 100644
--- a/p4.c
+++ b/p4.c
@@ -36,6 +36,8 @@
#include "sandy-bridge.h"
#include "ivy-bridge.h"
#include "haswell.h"
+#include "broadwell_de.h"
+#include "broadwell_epex.h"
/* decode mce for P4/Xeon and Core2 family */
@@ -416,6 +418,12 @@ void decode_intel_mc(struct mce *log, int cputype, int *ismemerr, unsigned size)
case CPU_HASWELL_EPEX:
hsw_decode_model(cputype, log->bank, log->status, log->misc);
break;
+ case CPU_BROADWELL_DE:
+ bdw_de_decode_model(cputype, log->bank, log->status, log->misc);
+ break;
+ case CPU_BROADWELL_EPEX:
+ bdw_epex_decode_model(cputype, log->bank, log->status, log->misc);
+ break;
}
}