From 3f3174996be6b4312c38f54d5969f5d5b75fec9e Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 22 Jan 2024 22:13:59 -0600 Subject: RAS: Introduce AMD Address Translation Library AMD Zen-based systems report memory errors through Machine Check banks representing Unified Memory Controllers (UMCs). The address value reported for DRAM ECC errors is a "normalized address" that is relative to the UMC. This normalized address must be converted to a system physical address to be usable by the OS. Support for this address translation was introduced to the MCA subsystem with Zen1 systems. The code was later moved to the AMD64 EDAC module, since this was the only user of the code at the time. However, there are uses for this translation outside of EDAC. The system physical address can be used in MCA for preemptive page offlining as done in some MCA notifier functions. Also, this translation is needed as the basis of similar functionality needed for some CXL configurations on AMD systems. Introduce a common address translation library that can be used for multiple subsystems including MCA, EDAC, and CXL. Include support for UMC normalized to system physical address translation for current CPU systems. The Data Fabric Indirect register access offsets and one of the register fields were changed. Default to the current offsets and register field definition. And fallback to the older values if running on a "legacy" system. Provide built-in code to facilitate the loading and unloading of the library module without affecting other modules or built-in code. 
Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240123041401.79812-2-yazen.ghannam@amd.com --- MAINTAINERS | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'MAINTAINERS') diff --git a/MAINTAINERS b/MAINTAINERS index 8d1052fa6a692..25537a37338e5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -897,6 +897,12 @@ Q: https://patchwork.kernel.org/project/linux-rdma/list/ F: drivers/infiniband/hw/efa/ F: include/uapi/rdma/efa-abi.h +AMD ADDRESS TRANSLATION LIBRARY (ATL) +M: Yazen Ghannam +L: linux-edac@vger.kernel.org +S: Supported +F: drivers/ras/amd/atl/* + AMD AXI W1 DRIVER M: Kris Chaplin R: Thomas Delev -- cgit 1.2.3-korg From 1289c431641f8beacc47db506210154dcea2492a Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 22 Jan 2024 22:14:01 -0600 Subject: Documentation: RAS: Add index and address translation section There are a lot of RAS topic to document, and there are a lot of details for each topic. Prep for this by adding an index for the RAS directory. This will provide a top-level document and table of contents. It also provides the option to build the RAS directory individually using "make SPHINXDIRS=". Start a section on address translation. This will be expanded with details for future translation methods and how they're used in the kernel. Move the error decoding topic to its own section. Links to other error decoding kernel docs will be added. 
Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240123041401.79812-4-yazen.ghannam@amd.com --- Documentation/RAS/address-translation.rst | 24 ++++++++++++++++++++++++ Documentation/RAS/error-decoding.rst | 21 +++++++++++++++++++++ Documentation/RAS/index.rst | 14 ++++++++++++++ Documentation/RAS/ras.rst | 26 -------------------------- Documentation/index.rst | 2 +- MAINTAINERS | 1 + 6 files changed, 61 insertions(+), 27 deletions(-) create mode 100644 Documentation/RAS/address-translation.rst create mode 100644 Documentation/RAS/error-decoding.rst create mode 100644 Documentation/RAS/index.rst delete mode 100644 Documentation/RAS/ras.rst (limited to 'MAINTAINERS') diff --git a/Documentation/RAS/address-translation.rst b/Documentation/RAS/address-translation.rst new file mode 100644 index 0000000000000..f0ca17b43cd3d --- /dev/null +++ b/Documentation/RAS/address-translation.rst @@ -0,0 +1,24 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Address translation +=================== + +x86 AMD +------- + +Zen-based AMD systems include a Data Fabric that manages the layout of +physical memory. Devices attached to the Fabric, like memory controllers, +I/O, etc., may not have a complete view of the system physical memory map. +These devices may provide a "normalized", i.e. device physical, address +when reporting memory errors. Normalized addresses must be translated to +a system physical address for the kernel to action on the memory. + +AMD Address Translation Library (CONFIG_AMD_ATL) provides translation for +this case. + +Glossary of acronyms used in address translation for Zen-based systems + +* CCM = Cache Coherent Moderator +* COD = Cluster-on-Die +* COH_ST = Coherent Station +* DF = Data Fabric diff --git a/Documentation/RAS/error-decoding.rst b/Documentation/RAS/error-decoding.rst new file mode 100644 index 0000000000000..26a72f3fe5de8 --- /dev/null +++ b/Documentation/RAS/error-decoding.rst @@ -0,0 +1,21 @@ +.. 
SPDX-License-Identifier: GPL-2.0 + +Error decoding +============== + +x86 +--- + +Error decoding on AMD systems should be done using the rasdaemon tool: +https://github.com/mchehab/rasdaemon/ + +While the daemon is running, it would automatically log and decode +errors. If not, one can still decode such errors by supplying the +hardware information from the error:: + + $ rasdaemon -p --status --ipid --smca + +Also, the user can pass particular family and model to decode the error +string:: + + $ rasdaemon -p --status --ipid --smca --family --model --bank diff --git a/Documentation/RAS/index.rst b/Documentation/RAS/index.rst new file mode 100644 index 0000000000000..2794c1816e906 --- /dev/null +++ b/Documentation/RAS/index.rst @@ -0,0 +1,14 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=========================================================== +Reliability, Availability and Serviceability (RAS) features +=========================================================== + +This documents different aspects of the RAS functionality present in the +kernel. + +.. toctree:: + :maxdepth: 2 + + error-decoding + address-translation diff --git a/Documentation/RAS/ras.rst b/Documentation/RAS/ras.rst deleted file mode 100644 index 2556b397cd271..0000000000000 --- a/Documentation/RAS/ras.rst +++ /dev/null @@ -1,26 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -Reliability, Availability and Serviceability features -===================================================== - -This documents different aspects of the RAS functionality present in the -kernel. - -Error decoding ---------------- - -* x86 - -Error decoding on AMD systems should be done using the rasdaemon tool: -https://github.com/mchehab/rasdaemon/ - -While the daemon is running, it would automatically log and decode -errors. 
If not, one can still decode such errors by supplying the -hardware information from the error:: - - $ rasdaemon -p --status --ipid --smca - -Also, the user can pass particular family and model to decode the error -string:: - - $ rasdaemon -p --status --ipid --smca --family --model --bank diff --git a/Documentation/index.rst b/Documentation/index.rst index 36e61783437c1..07f2aa07f0fa0 100644 --- a/Documentation/index.rst +++ b/Documentation/index.rst @@ -113,7 +113,7 @@ to ReStructured Text format, or are simply too old. :maxdepth: 1 staging/index - RAS/ras + RAS/index Translations diff --git a/MAINTAINERS b/MAINTAINERS index 25537a37338e5..5b945fd5a3b91 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18359,6 +18359,7 @@ M: Tony Luck M: Borislav Petkov L: linux-edac@vger.kernel.org S: Maintained +F: Documentation/RAS/ F: Documentation/admin-guide/ras.rst F: drivers/ras/ F: include/linux/ras.h -- cgit 1.2.3-korg From 0e4fd816b08e85484e4dbe06e91466c85273f8e0 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Wed, 24 Jan 2024 13:37:52 +0100 Subject: Documentation: Move RAS section to admin-guide This is where this stuff should be. 
Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/87a5pes8jy.fsf@meer.lwn.net --- Documentation/RAS/address-translation.rst | 24 - Documentation/RAS/error-decoding.rst | 21 - Documentation/RAS/index.rst | 14 - .../admin-guide/RAS/address-translation.rst | 24 + Documentation/admin-guide/RAS/error-decoding.rst | 21 + Documentation/admin-guide/RAS/index.rst | 7 + Documentation/admin-guide/RAS/main.rst | 1223 ++++++++++++++++++++ Documentation/admin-guide/index.rst | 2 +- Documentation/admin-guide/ras.rst | 1219 ------------------- Documentation/index.rst | 1 - MAINTAINERS | 4 +- 11 files changed, 1277 insertions(+), 1283 deletions(-) delete mode 100644 Documentation/RAS/address-translation.rst delete mode 100644 Documentation/RAS/error-decoding.rst delete mode 100644 Documentation/RAS/index.rst create mode 100644 Documentation/admin-guide/RAS/address-translation.rst create mode 100644 Documentation/admin-guide/RAS/error-decoding.rst create mode 100644 Documentation/admin-guide/RAS/index.rst create mode 100644 Documentation/admin-guide/RAS/main.rst delete mode 100644 Documentation/admin-guide/ras.rst (limited to 'MAINTAINERS') diff --git a/Documentation/RAS/address-translation.rst b/Documentation/RAS/address-translation.rst deleted file mode 100644 index f0ca17b43cd3d..0000000000000 --- a/Documentation/RAS/address-translation.rst +++ /dev/null @@ -1,24 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -Address translation -=================== - -x86 AMD -------- - -Zen-based AMD systems include a Data Fabric that manages the layout of -physical memory. Devices attached to the Fabric, like memory controllers, -I/O, etc., may not have a complete view of the system physical memory map. -These devices may provide a "normalized", i.e. device physical, address -when reporting memory errors. Normalized addresses must be translated to -a system physical address for the kernel to action on the memory. 
- -AMD Address Translation Library (CONFIG_AMD_ATL) provides translation for -this case. - -Glossary of acronyms used in address translation for Zen-based systems - -* CCM = Cache Coherent Moderator -* COD = Cluster-on-Die -* COH_ST = Coherent Station -* DF = Data Fabric diff --git a/Documentation/RAS/error-decoding.rst b/Documentation/RAS/error-decoding.rst deleted file mode 100644 index 26a72f3fe5de8..0000000000000 --- a/Documentation/RAS/error-decoding.rst +++ /dev/null @@ -1,21 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -Error decoding -============== - -x86 ---- - -Error decoding on AMD systems should be done using the rasdaemon tool: -https://github.com/mchehab/rasdaemon/ - -While the daemon is running, it would automatically log and decode -errors. If not, one can still decode such errors by supplying the -hardware information from the error:: - - $ rasdaemon -p --status --ipid --smca - -Also, the user can pass particular family and model to decode the error -string:: - - $ rasdaemon -p --status --ipid --smca --family --model --bank diff --git a/Documentation/RAS/index.rst b/Documentation/RAS/index.rst deleted file mode 100644 index 2794c1816e906..0000000000000 --- a/Documentation/RAS/index.rst +++ /dev/null @@ -1,14 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -=========================================================== -Reliability, Availability and Serviceability (RAS) features -=========================================================== - -This documents different aspects of the RAS functionality present in the -kernel. - -.. toctree:: - :maxdepth: 2 - - error-decoding - address-translation diff --git a/Documentation/admin-guide/RAS/address-translation.rst b/Documentation/admin-guide/RAS/address-translation.rst new file mode 100644 index 0000000000000..f0ca17b43cd3d --- /dev/null +++ b/Documentation/admin-guide/RAS/address-translation.rst @@ -0,0 +1,24 @@ +.. 
SPDX-License-Identifier: GPL-2.0 + +Address translation +=================== + +x86 AMD +------- + +Zen-based AMD systems include a Data Fabric that manages the layout of +physical memory. Devices attached to the Fabric, like memory controllers, +I/O, etc., may not have a complete view of the system physical memory map. +These devices may provide a "normalized", i.e. device physical, address +when reporting memory errors. Normalized addresses must be translated to +a system physical address for the kernel to action on the memory. + +AMD Address Translation Library (CONFIG_AMD_ATL) provides translation for +this case. + +Glossary of acronyms used in address translation for Zen-based systems + +* CCM = Cache Coherent Moderator +* COD = Cluster-on-Die +* COH_ST = Coherent Station +* DF = Data Fabric diff --git a/Documentation/admin-guide/RAS/error-decoding.rst b/Documentation/admin-guide/RAS/error-decoding.rst new file mode 100644 index 0000000000000..26a72f3fe5de8 --- /dev/null +++ b/Documentation/admin-guide/RAS/error-decoding.rst @@ -0,0 +1,21 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Error decoding +============== + +x86 +--- + +Error decoding on AMD systems should be done using the rasdaemon tool: +https://github.com/mchehab/rasdaemon/ + +While the daemon is running, it would automatically log and decode +errors. If not, one can still decode such errors by supplying the +hardware information from the error:: + + $ rasdaemon -p --status --ipid --smca + +Also, the user can pass particular family and model to decode the error +string:: + + $ rasdaemon -p --status --ipid --smca --family --model --bank diff --git a/Documentation/admin-guide/RAS/index.rst b/Documentation/admin-guide/RAS/index.rst new file mode 100644 index 0000000000000..f4087040a7c05 --- /dev/null +++ b/Documentation/admin-guide/RAS/index.rst @@ -0,0 +1,7 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. 
toctree:: + :maxdepth: 2 + + main + error-decoding + address-translation diff --git a/Documentation/admin-guide/RAS/main.rst b/Documentation/admin-guide/RAS/main.rst new file mode 100644 index 0000000000000..7ac1d4ccc5099 --- /dev/null +++ b/Documentation/admin-guide/RAS/main.rst @@ -0,0 +1,1223 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: + +================================================== +Reliability, Availability and Serviceability (RAS) +================================================== + +This documents different aspects of the RAS functionality present in the +kernel. + +RAS concepts +************ + +Reliability, Availability and Serviceability (RAS) is a concept used on +servers meant to measure their robustness. + +Reliability + is the probability that a system will produce correct outputs. + + * Generally measured as Mean Time Between Failures (MTBF) + * Enhanced by features that help to avoid, detect and repair hardware faults + +Availability + is the probability that a system is operational at a given time + + * Generally measured as a percentage of downtime per a period of time + * Often uses mechanisms to detect and correct hardware faults in + runtime; + +Serviceability (or maintainability) + is the simplicity and speed with which a system can be repaired or + maintained + + * Generally measured on Mean Time Between Repair (MTBR) + +Improving RAS +------------- + +In order to reduce systems downtime, a system should be capable of detecting +hardware errors, and, when possible correcting them in runtime. It should +also provide mechanisms to detect hardware degradation, in order to warn +the system administrator to take the action of replacing a component before +it causes data loss or system downtime. 
+ +Among the monitoring measures, the most usual ones include: + +* CPU – detect errors at instruction execution and at L1/L2/L3 caches; +* Memory – add error correction logic (ECC) to detect and correct errors; +* I/O – add CRC checksums for transferred data; +* Storage – RAID, journal file systems, checksums, + Self-Monitoring, Analysis and Reporting Technology (SMART). + +By monitoring the number of occurrences of error detections, it is possible +to identify if the probability of hardware errors is increasing, and, in such +a case, perform preventive maintenance to replace a degraded component while +those errors are correctable. + +Types of errors +--------------- + +Most mechanisms used on modern systems use technologies like Hamming +Codes that allow error correction when the number of errors on a bit packet +is below a threshold. If the number of errors is above, those mechanisms +can indicate with a high degree of confidence that an error happened, but +they can't correct it. + +Also, sometimes an error occurs on a component that is not in use. For +example, a part of the memory that is not currently allocated. + +That defines some categories of errors: + +* **Correctable Error (CE)** - the error detection mechanism detected and + corrected the error. Such errors are usually not fatal, although some + Kernel mechanisms allow the system administrator to consider them as fatal. + +* **Uncorrected Error (UE)** - the number of errors exceeded the error + correction threshold, and the system was unable to auto-correct. + +* **Fatal Error** - when a UE happens on a critical component of the + system (for example, a piece of the Kernel got corrupted by a UE), the + only reliable way to avoid data corruption is to hang or reboot the machine.
+ +* **Non-fatal Error** - when a UE happens on an unused component, + like a CPU in power down state or an unused memory bank, the system may + still run, eventually replacing the affected hardware with a hot spare, + if available. + + Also, when an error happens on a userspace process, it is also possible to + kill that process and let userspace restart it. + +The mechanism for handling non-fatal errors is usually complex and may +require the help of some userspace application, in order to apply the +policy desired by the system administrator. + +Identifying a bad hardware component +------------------------------------ + +Just detecting a hardware flaw is usually not enough, as the system needs +to pinpoint the minimal replaceable unit (MRU) that should be exchanged +to make the hardware reliable again. + +So, it requires not only error logging facilities, but also mechanisms that +will translate the error message to the silkscreen or component label for +the MRU. + +Typically, it is very complex for memory, as modern CPUs interlace memory +from different memory modules, in order to provide better performance. The +DMI BIOS usually has a list of memory module labels, which can be obtained +using the ``dmidecode`` tool. For example, on a desktop machine, it shows:: + + Memory Device + Total Width: 64 bits + Data Width: 64 bits + Size: 16384 MB + Form Factor: SODIMM + Set: None + Locator: ChannelA-DIMM0 + Bank Locator: BANK 0 + Type: DDR4 + Type Detail: Synchronous + Speed: 2133 MHz + Rank: 2 + Configured Clock Speed: 2133 MHz + +In the above example, a DDR4 SO-DIMM memory module is located at the +system's memory labeled as "BANK 0", as given by the *bank locator* field. +Please notice that, on such a system, the *total width* is equal to the +*data width*. It means that this memory module doesn't have error +detection/correction mechanisms. + +Unfortunately, not all systems use the same field to specify the memory +bank.
On this example, from an older server, ``dmidecode`` shows:: + + Memory Device + Array Handle: 0x1000 + Error Information Handle: Not Provided + Total Width: 72 bits + Data Width: 64 bits + Size: 8192 MB + Form Factor: DIMM + Set: 1 + Locator: DIMM_A1 + Bank Locator: Not Specified + Type: DDR3 + Type Detail: Synchronous Registered (Buffered) + Speed: 1600 MHz + Rank: 2 + Configured Clock Speed: 1600 MHz + +There, the DDR3 RDIMM memory module is located at the system's memory labeled +as "DIMM_A1", as given by the *locator* field. Please notice that this +memory module has 64 bits of *data width* and 72 bits of *total width*. So, +it has 8 extra bits to be used by error detection and correction mechanisms. +Such kind of memory is called Error-correcting code memory (ECC memory). + +To make things even worse, it is not uncommon that systems with different +labels on their system's board to use exactly the same BIOS, meaning that +the labels provided by the BIOS won't match the real ones. + +ECC memory +---------- + +As mentioned in the previous section, ECC memory has extra bits to be +used for error correction. In the above example, a memory module has +64 bits of *data width*, and 72 bits of *total width*. The extra 8 +bits which are used for the error detection and correction mechanisms +are referred to as the *syndrome*\ [#f1]_\ [#f2]_. + +So, when the cpu requests the memory controller to write a word with +*data width*, the memory controller calculates the *syndrome* in real time, +using Hamming code, or some other error correction code, like SECDED+, +producing a code with *total width* size. Such code is then written +on the memory modules. + +At read, the *total width* bits code is converted back, using the same +ECC code used on write, producing a word with *data width* and a *syndrome*. +The word with *data width* is sent to the CPU, even when errors happen. 
+ +The memory controller also looks at the *syndrome* in order to check if +there was an error, and if the ECC code was able to fix that error. +If the error was corrected, a Corrected Error (CE) happened. If not, an +Uncorrected Error (UE) happened. + +The information about the CE/UE errors is stored in some special registers +in the memory controller and can be accessed by reading such registers, +either by BIOS, by some special CPUs or by the Linux EDAC driver. On x86 64 +bit CPUs, such errors can also be retrieved via the Machine Check +Architecture (MCA)\ [#f3]_. + +.. [#f1] Please notice that several memory controllers allow operation in a + mode called "Lock-Step", where it groups two memory modules together, + doing 128-bit reads/writes. That gives 16 bits for error correction, which + significantly improves the error correction mechanism, at the expense + that, when an error happens, there's no way to know which memory module is + to blame. So, it has to blame both memory modules. + +.. [#f2] Some memory controllers also allow using memory in mirror mode. + In this mode, the same data is written to two memory modules. At read, + the system checks both memory modules, in order to check if both provide + identical data. In such a configuration, when an error happens, there's no + way to know which memory module is to blame. So, it has to blame both + memory modules (or 4 memory modules, if the system is also in Lock-step + mode). + +.. [#f3] For more details about the Machine Check Architecture (MCA), + please read Documentation/arch/x86/x86_64/machinecheck.rst in the Kernel tree. + +EDAC - Error Detection And Correction +************************************* + +.. note:: + + "bluesmoke" was the name for this device driver subsystem when it + was "out-of-tree" and maintained at http://bluesmoke.sourceforge.net. + That site is mostly archaic now and can be used only for historical + purposes.
+ + When the subsystem was pushed upstream for the first time, on + Kernel 2.6.16, it was renamed to ``EDAC``. + +Purpose +------- + +The ``edac`` kernel module's goal is to detect and report hardware errors +that occur within the computer system running under Linux. + +Memory +------ + +Memory Correctable Errors (CE) and Uncorrectable Errors (UE) are the +primary errors being harvested. These types of errors are harvested by +the ``edac_mc`` device. + +Detecting CE events, then harvesting those events and reporting them, +**can**, but need not necessarily, be a predictor of future UE events. With +CE events only, the system can and will continue to operate as no data +has been damaged yet. + +However, preventive maintenance and proactive part replacement of memory +modules exhibiting CEs can reduce the likelihood of the dreaded UE events +and system panics. + +Other hardware elements +----------------------- + +A new feature for EDAC, the ``edac_device`` class of device, was added in +the 2.6.23 version of the kernel. + +This new device type allows for non-memory types of ECC hardware detectors +to have their states harvested and presented to userspace via the sysfs +interface. + +Some architectures have ECC detectors for L1, L2 and L3 caches, +along with DMA engines, fabric switches, main data path switches, +interconnections, and various other hardware data paths. If the hardware +reports it, then an edac_device device probably can be constructed to +harvest and present that to userspace. + + +PCI bus scanning +---------------- + +In addition, PCI devices are scanned for PCI Bus Parity and SERR Errors +in order to determine if errors are occurring during data transfers. + +The presence of PCI Parity errors must be examined with a grain of salt. +There are several add-in adapters that do **not** follow the PCI specification +with regard to Parity generation and reporting.
The specification says +the vendor should tie the parity status bits to 0 if they do not intend +to generate parity. Some vendors do not do this, and thus the parity bit +can "float" giving false positives. + +There is a PCI device attribute located in sysfs that is checked by +the EDAC PCI scanning code. If that attribute is set, PCI parity/error +scanning is skipped for that device. The attribute is:: + + broken_parity_status + +and is located in ``/sys/devices/pci/0000:XX:YY.Z`` directories for +PCI devices. + + +Versioning +---------- + +EDAC is composed of a "core" module (``edac_core.ko``) and several Memory +Controller (MC) driver modules. On a given system, the CORE is loaded +and one MC driver will be loaded. Both the CORE and the MC driver (or +``edac_device`` driver) have individual versions that reflect current +release level of their respective modules. + +Thus, to "report" on what version a system is running, one must report +both the CORE's and the MC driver's versions. + + +Loading +------- + +If ``edac`` was statically linked with the kernel then no loading +is necessary. If ``edac`` was built as modules then simply modprobe +the ``edac`` pieces that you need. You should be able to modprobe +hardware-specific modules and have the dependencies load the necessary +core modules. + +Example:: + + $ modprobe amd76x_edac + +loads both the ``amd76x_edac.ko`` memory controller module and the +``edac_mc.ko`` core module. + + +Sysfs interface +--------------- + +EDAC presents a ``sysfs`` interface for control and reporting purposes. It +lives in the /sys/devices/system/edac directory. + +Within this directory there currently reside 2 components: + + ======= ============================== + mc memory controller(s) system + pci PCI control and status system + ======= ============================== + + + +Memory Controller (mc) Model +---------------------------- + +Each ``mc`` device controls a set of memory modules [#f4]_. 
These modules +are laid out in a Chip-Select Row (``csrowX``) and Channel table (``chX``). +There can be multiple csrows and multiple channels. + +.. [#f4] Nowadays, the term DIMM (Dual In-line Memory Module) is widely + used to refer to a memory module, although there are other memory + packaging alternatives, like SO-DIMM, SIMM, etc. The UEFI + specification (Version 2.7) defines a memory module in the Common + Platform Error Record (CPER) section to be an SMBIOS Memory Device + (Type 17). Along this document, and inside the EDAC subsystem, the term + "dimm" is used for all memory modules, even when they use a + different kind of packaging. + +Memory controllers allow for several csrows, with 8 csrows being a +typical value. Yet, the actual number of csrows depends on the layout of +a given motherboard, memory controller and memory module characteristics. + +Dual channels allow for dual data length (e. g. 128 bits, on 64 bit systems) +data transfers to/from the CPU from/to memory. Some newer chipsets allow +for more than 2 channels, like Fully Buffered DIMMs (FB-DIMMs) memory +controllers. 
The following example will assume 2 channels: + + +------------+-----------------------+ + | CS Rows | Channels | + +------------+-----------+-----------+ + | | ``ch0`` | ``ch1`` | + +============+===========+===========+ + | |**DIMM_A0**|**DIMM_B0**| + +------------+-----------+-----------+ + | ``csrow0`` | rank0 | rank0 | + +------------+-----------+-----------+ + | ``csrow1`` | rank1 | rank1 | + +------------+-----------+-----------+ + | |**DIMM_A1**|**DIMM_B1**| + +------------+-----------+-----------+ + | ``csrow2`` | rank0 | rank0 | + +------------+-----------+-----------+ + | ``csrow3`` | rank1 | rank1 | + +------------+-----------+-----------+ + +In the above example, there are 4 physical slots on the motherboard +for memory DIMMs: + + +---------+---------+ + | DIMM_A0 | DIMM_B0 | + +---------+---------+ + | DIMM_A1 | DIMM_B1 | + +---------+---------+ + +Labels for these slots are usually silk-screened on the motherboard. +Slots labeled ``A`` are channel 0 in this example. Slots labeled ``B`` are +channel 1. Notice that there are two csrows possible on a physical DIMM. +These csrows are allocated their csrow assignment based on the slot into +which the memory DIMM is placed. Thus, when 1 DIMM is placed in each +Channel, the csrows cross both DIMMs. + +Memory DIMMs come single or dual "ranked". A rank is a populated csrow. +In the example above 2 dual ranked DIMMs are similarly placed. Thus, +both csrow0 and csrow1 are populated. On the other hand, when 2 single +ranked DIMMs are placed in slots DIMM_A0 and DIMM_B0, then they will +have just one csrow (csrow0) and csrow1 will be empty. The pattern +repeats itself for csrow2 and csrow3. Also note that some memory +controllers don't have any logic to identify the memory module, see +``rankX`` directories below. + +The representation of the above is reflected in the directory +tree in EDAC's sysfs interface. 
Starting in directory +``/sys/devices/system/edac/mc``, each memory controller will be +represented by its own ``mcX`` directory, where ``X`` is the +index of the MC:: + + ..../edac/mc/ + | + |->mc0 + |->mc1 + |->mc2 + .... + +Under each ``mcX`` directory each ``csrowX`` is again represented by a +``csrowX``, where ``X`` is the csrow index:: + + .../mc/mc0/ + | + |->csrow0 + |->csrow2 + |->csrow3 + .... + +Notice that there is no csrow1, which indicates that csrow0 is composed +of a single ranked DIMMs. This should also apply in both Channels, in +order to have dual-channel mode be operational. Since both csrow2 and +csrow3 are populated, this indicates a dual ranked set of DIMMs for +channels 0 and 1. + +Within each of the ``mcX`` and ``csrowX`` directories are several EDAC +control and attribute files. + +``mcX`` directories +------------------- + +In ``mcX`` directories are EDAC control and attribute files for +this ``X`` instance of the memory controllers. + +For a description of the sysfs API, please see: + + Documentation/ABI/testing/sysfs-devices-edac + + +``dimmX`` or ``rankX`` directories +---------------------------------- + +The recommended way to use the EDAC subsystem is to look at the information +provided by the ``dimmX`` or ``rankX`` directories [#f5]_. 
+ +A typical EDAC system has the following structure under +``/sys/devices/system/edac/``\ [#f6]_:: + + /sys/devices/system/edac/ + ├── mc + │   ├── mc0 + │   │   ├── ce_count + │   │   ├── ce_noinfo_count + │   │   ├── dimm0 + │   │   │   ├── dimm_ce_count + │   │   │   ├── dimm_dev_type + │   │   │   ├── dimm_edac_mode + │   │   │   ├── dimm_label + │   │   │   ├── dimm_location + │   │   │   ├── dimm_mem_type + │   │   │   ├── dimm_ue_count + │   │   │   ├── size + │   │   │   └── uevent + │   │   ├── max_location + │   │   ├── mc_name + │   │   ├── reset_counters + │   │   ├── seconds_since_reset + │   │   ├── size_mb + │   │   ├── ue_count + │   │   ├── ue_noinfo_count + │   │   └── uevent + │   ├── mc1 + │   │   ├── ce_count + │   │   ├── ce_noinfo_count + │   │   ├── dimm0 + │   │   │   ├── dimm_ce_count + │   │   │   ├── dimm_dev_type + │   │   │   ├── dimm_edac_mode + │   │   │   ├── dimm_label + │   │   │   ├── dimm_location + │   │   │   ├── dimm_mem_type + │   │   │   ├── dimm_ue_count + │   │   │   ├── size + │   │   │   └── uevent + │   │   ├── max_location + │   │   ├── mc_name + │   │   ├── reset_counters + │   │   ├── seconds_since_reset + │   │   ├── size_mb + │   │   ├── ue_count + │   │   ├── ue_noinfo_count + │   │   └── uevent + │   └── uevent + └── uevent + +In the ``dimmX`` directories are EDAC control and attribute files for +this ``X`` memory module: + +- ``size`` - Total memory managed by this csrow attribute file + + This attribute file displays, in count of megabytes, the memory + that this csrow contains. + +- ``dimm_ue_count`` - Uncorrectable Errors count attribute file + + This attribute file displays the total count of uncorrectable + errors that have occurred on this DIMM. If panic_on_ue is set + this counter will not have a chance to increment, since EDAC + will panic the system. 
+ +- ``dimm_ce_count`` - Correctable Errors count attribute file + + This attribute file displays the total count of correctable + errors that have occurred on this DIMM. This count is very + important to examine. CEs provide early indications that a + DIMM is beginning to fail. This count field should be + monitored for non-zero values and report such information + to the system administrator. + +- ``dimm_dev_type`` - Device type attribute file + + This attribute file will display what type of DRAM device is + being utilized on this DIMM. + Examples: + + - x1 + - x2 + - x4 + - x8 + +- ``dimm_edac_mode`` - EDAC Mode of operation attribute file + + This attribute file will display what type of Error detection + and correction is being utilized. + +- ``dimm_label`` - memory module label control file + + This control file allows this DIMM to have a label assigned + to it. With this label in the module, when errors occur + the output can provide the DIMM label in the system log. + This becomes vital for panic events to isolate the + cause of the UE event. + + DIMM Labels must be assigned after booting, with information + that correctly identifies the physical slot with its + silk screen label. This information is currently very + motherboard specific and determination of this information + must occur in userland at this time. + +- ``dimm_location`` - location of the memory module + + The location can have up to 3 levels, and describe how the + memory controller identifies the location of a memory module. + Depending on the type of memory and memory controller, it + can be: + + - *csrow* and *channel* - used when the memory controller + doesn't identify a single DIMM - e. g. in ``rankX`` dir; + - *branch*, *channel*, *slot* - typically used on FB-DIMM memory + controllers; + - *channel*, *slot* - used on Nehalem and newer Intel drivers. 
+ +- ``dimm_mem_type`` - Memory Type attribute file + + This attribute file will display what type of memory is currently + on this csrow. Normally, either buffered or unbuffered memory. + Examples: + + - Registered-DDR + - Unbuffered-DDR + +.. [#f5] On some systems, the memory controller doesn't have any logic + to identify the memory module. On such systems, the directory is called ``rankX`` and works on a similar way as the ``csrowX`` directories. + On modern Intel memory controllers, the memory controller identifies the + memory modules directly. On such systems, the directory is called ``dimmX``. + +.. [#f6] There are also some ``power`` directories and ``subsystem`` + symlinks inside the sysfs mapping that are automatically created by + the sysfs subsystem. Currently, they serve no purpose. + +``csrowX`` directories +---------------------- + +When CONFIG_EDAC_LEGACY_SYSFS is enabled, sysfs will contain the ``csrowX`` +directories. As this API doesn't work properly for Rambus, FB-DIMMs and +modern Intel Memory Controllers, this is being deprecated in favor of +``dimmX`` directories. + +In the ``csrowX`` directories are EDAC control and attribute files for +this ``X`` instance of csrow: + + +- ``ue_count`` - Total Uncorrectable Errors count attribute file + + This attribute file displays the total count of uncorrectable + errors that have occurred on this csrow. If panic_on_ue is set + this counter will not have a chance to increment, since EDAC + will panic the system. + + +- ``ce_count`` - Total Correctable Errors count attribute file + + This attribute file displays the total count of correctable + errors that have occurred on this csrow. This count is very + important to examine. CEs provide early indications that a + DIMM is beginning to fail. This count field should be + monitored for non-zero values and report such information + to the system administrator. 
+ + +- ``size_mb`` - Total memory managed by this csrow attribute file + + This attribute file displays, in count of megabytes, the memory + that this csrow contains. + + +- ``mem_type`` - Memory Type attribute file + + This attribute file will display what type of memory is currently + on this csrow. Normally, either buffered or unbuffered memory. + Examples: + + - Registered-DDR + - Unbuffered-DDR + + +- ``edac_mode`` - EDAC Mode of operation attribute file + + This attribute file will display what type of Error detection + and correction is being utilized. + + +- ``dev_type`` - Device type attribute file + + This attribute file will display what type of DRAM device is + being utilized on this DIMM. + Examples: + + - x1 + - x2 + - x4 + - x8 + + +- ``ch0_ce_count`` - Channel 0 CE Count attribute file + + This attribute file will display the count of CEs on this + DIMM located in channel 0. + + +- ``ch0_ue_count`` - Channel 0 UE Count attribute file + + This attribute file will display the count of UEs on this + DIMM located in channel 0. + + +- ``ch0_dimm_label`` - Channel 0 DIMM Label control file + + + This control file allows this DIMM to have a label assigned + to it. With this label in the module, when errors occur + the output can provide the DIMM label in the system log. + This becomes vital for panic events to isolate the + cause of the UE event. + + DIMM Labels must be assigned after booting, with information + that correctly identifies the physical slot with its + silk screen label. This information is currently very + motherboard specific and determination of this information + must occur in userland at this time. + + +- ``ch1_ce_count`` - Channel 1 CE Count attribute file + + + This attribute file will display the count of CEs on this + DIMM located in channel 1. + + +- ``ch1_ue_count`` - Channel 1 UE Count attribute file + + + This attribute file will display the count of UEs on this + DIMM located in channel 1. 
+ + +- ``ch1_dimm_label`` - Channel 1 DIMM Label control file + + This control file allows this DIMM to have a label assigned + to it. With this label in the module, when errors occur + the output can provide the DIMM label in the system log. + This becomes vital for panic events to isolate the + cause of the UE event. + + DIMM Labels must be assigned after booting, with information + that correctly identifies the physical slot with its + silk screen label. This information is currently very + motherboard specific and determination of this information + must occur in userland at this time. + + +System Logging +-------------- + +If logging for UEs and CEs is enabled, then system logs will contain +information indicating that errors have been detected:: + + EDAC MC0: CE page 0x283, offset 0xce0, grain 8, syndrome 0x6ec3, row 0, channel 1 "DIMM_B1": amd76x_edac + EDAC MC0: CE page 0x1e5, offset 0xfb0, grain 8, syndrome 0xb741, row 0, channel 1 "DIMM_B1": amd76x_edac + + +The structure of the message is: + + +---------------------------------------+-------------+ + | Content | Example | + +=======================================+=============+ + | The memory controller | MC0 | + +---------------------------------------+-------------+ + | Error type | CE | + +---------------------------------------+-------------+ + | Memory page | 0x283 | + +---------------------------------------+-------------+ + | Offset in the page | 0xce0 | + +---------------------------------------+-------------+ + | The byte granularity | grain 8 | + | or resolution of the error | | + +---------------------------------------+-------------+ + | The error syndrome | 0x6ec3 | + +---------------------------------------+-------------+ + | Memory row | row 0 | + +---------------------------------------+-------------+ + | Memory channel | channel 1 | + +---------------------------------------+-------------+ + | DIMM label, if set prior | DIMM_B1 | + +---------------------------------------+-------------+ 
+ | And then an optional, driver-specific | | + | message that may have additional | | + | information. | | + +---------------------------------------+-------------+ + +Both UEs and CEs with no info will lack all but memory controller, error +type, a notice of "no info" and then an optional, driver-specific error +message. + + +PCI Bus Parity Detection +------------------------ + +On Header Type 00 devices, the primary status is looked at for any +parity error regardless of whether parity is enabled on the device or +not. (The spec indicates parity is generated in some cases). On Header +Type 01 bridges, the secondary status register is also looked at to see +if parity occurred on the bus on the other side of the bridge. + + +Sysfs configuration +------------------- + +Under ``/sys/devices/system/edac/pci`` are control and attribute files as +follows: + + +- ``check_pci_parity`` - Enable/Disable PCI Parity checking control file + + This control file enables or disables the PCI Bus Parity scanning + operation. Writing a 1 to this file enables the scanning. Writing + a 0 to this file disables the scanning. + + Enable:: + + echo "1" >/sys/devices/system/edac/pci/check_pci_parity + + Disable:: + + echo "0" >/sys/devices/system/edac/pci/check_pci_parity + + +- ``pci_parity_count`` - Parity Count + + This attribute file will display the number of parity errors that + have been detected. + + +Module parameters +----------------- + +- ``edac_mc_panic_on_ue`` - Panic on UE control file + + An uncorrectable error will cause a machine panic. This is usually + desirable. It is a bad idea to continue when an uncorrectable error + occurs - it is indeterminate what was uncorrected and the operating + system context might be so mangled that continuing will lead to further + corruption. If the kernel has MCE configured, then EDAC will never + notice the UE. 
+ + LOAD TIME:: + + module/kernel parameter: edac_mc_panic_on_ue=[0|1] + + RUN TIME:: + + echo "1" > /sys/module/edac_core/parameters/edac_mc_panic_on_ue + + +- ``edac_mc_log_ue`` - Log UE control file + + + Generate kernel messages describing uncorrectable errors. These errors + are reported through the system message log system. UE statistics + will be accumulated even when UE logging is disabled. + + LOAD TIME:: + + module/kernel parameter: edac_mc_log_ue=[0|1] + + RUN TIME:: + + echo "1" > /sys/module/edac_core/parameters/edac_mc_log_ue + + +- ``edac_mc_log_ce`` - Log CE control file + + + Generate kernel messages describing correctable errors. These + errors are reported through the system message log system. + CE statistics will be accumulated even when CE logging is disabled. + + LOAD TIME:: + + module/kernel parameter: edac_mc_log_ce=[0|1] + + RUN TIME:: + + echo "1" > /sys/module/edac_core/parameters/edac_mc_log_ce + + +- ``edac_mc_poll_msec`` - Polling period control file + + + The time period, in milliseconds, for polling for error information. + Too small a value wastes resources. Too large a value might delay + necessary handling of errors and might lose valuable information for + locating the error. 1000 milliseconds (once each second) is the current + default. Systems which require all the bandwidth they can get, may + increase this. + + LOAD TIME:: + + module/kernel parameter: edac_mc_poll_msec=[0|1] + + RUN TIME:: + + echo "1000" > /sys/module/edac_core/parameters/edac_mc_poll_msec + + +- ``panic_on_pci_parity`` - Panic on PCI PARITY Error + + + This control file enables or disables panicking when a parity + error has been detected. 
+ + + module/kernel parameter:: + + edac_panic_on_pci_pe=[0|1] + + Enable:: + + echo "1" > /sys/module/edac_core/parameters/edac_panic_on_pci_pe + + Disable:: + + echo "0" > /sys/module/edac_core/parameters/edac_panic_on_pci_pe + + + +EDAC device type +---------------- + +In the header file, edac_pci.h, there is a series of edac_device structures +and APIs for the EDAC_DEVICE. + +User space access to an edac_device is through the sysfs interface. + +At the location ``/sys/devices/system/edac`` (sysfs) new edac_device devices +will appear. + +There is a three level tree beneath the above ``edac`` directory. For example, +the ``test_device_edac`` device (found at the http://bluesmoke.sourceforge.net +website) installs itself as:: + + /sys/devices/system/edac/test-instance + +in this directory are various controls, a symlink and one or more ``instance`` +directories. + +The standard default controls are: + + ============== ======================================================= + log_ce boolean to log CE events + log_ue boolean to log UE events + panic_on_ue boolean to ``panic`` the system if an UE is encountered + (default off, can be set true via startup script) + poll_msec time period between POLL cycles for events + ============== ======================================================= + +The test_device_edac device adds at least one of its own custom controls: + + ============== ================================================== + test_bits which in the current test driver does nothing but + show how it is installed. A ported driver can + add one or more such controls and/or attributes + for specific uses. + One out-of-tree driver uses controls here to allow + for ERROR INJECTION operations to hardware + injection registers + ============== ================================================== + +The symlink points to the 'struct dev' that is registered for this edac_device. + +Instances +--------- + +One or more instance directories are present. 
For the ``test_device_edac`` +case: + + +----------------+ + | test-instance0 | + +----------------+ + + +In this directory there are two default counter attributes, which are totals of +counter in deeper subdirectories. + + ============== ==================================== + ce_count total of CE events of subdirectories + ue_count total of UE events of subdirectories + ============== ==================================== + +Blocks +------ + +At the lowest directory level is the ``block`` directory. There can be 0, 1 +or more blocks specified in each instance: + + +-------------+ + | test-block0 | + +-------------+ + +In this directory the default attributes are: + + ============== ================================================ + ce_count which is counter of CE events for this ``block`` + of hardware being monitored + ue_count which is counter of UE events for this ``block`` + of hardware being monitored + ============== ================================================ + + +The ``test_device_edac`` device adds 4 attributes and 1 control: + + ================== ==================================================== + test-block-bits-0 for every POLL cycle this counter + is incremented + test-block-bits-1 every 10 cycles, this counter is bumped once, + and test-block-bits-0 is set to 0 + test-block-bits-2 every 100 cycles, this counter is bumped once, + and test-block-bits-1 is set to 0 + test-block-bits-3 every 1000 cycles, this counter is bumped once, + and test-block-bits-2 is set to 0 + ================== ==================================================== + + + ================== ==================================================== + reset-counters writing ANY thing to this control will + reset all the above counters. + ================== ==================================================== + + +Use of the ``test_device_edac`` driver should enable any others to create their own +unique drivers for their hardware systems. 
+ +The ``test_device_edac`` sample driver is located at the +http://bluesmoke.sourceforge.net project site for EDAC. + + +Usage of EDAC APIs on Nehalem and newer Intel CPUs +-------------------------------------------------- + +On older Intel architectures, the memory controller was part of the North +Bridge chipset. Nehalem, Sandy Bridge, Ivy Bridge, Haswell, Sky Lake and +newer Intel architectures integrated an enhanced version of the memory +controller (MC) inside the CPUs. + +This chapter will cover the differences of the enhanced memory controllers +found on newer Intel CPUs, such as ``i7core_edac``, ``sb_edac`` and +``sbx_edac`` drivers. + +.. note:: + + The Xeon E7 processor families use a separate chip for the memory + controller, called Intel Scalable Memory Buffer. This section doesn't + apply for such families. + +1) There is one Memory Controller per Quick Path Interconnect + (QPI). At the driver, the term "socket" means one QPI. This is + associated with a physical CPU socket. + + Each MC has 3 physical read channels, 3 physical write channels and + 3 logic channels. The driver currently sees it as just 3 channels. + Each channel can have up to 3 DIMMs. + + The minimum known unit is the DIMM. There is no information about csrows. + As the EDAC API treats the csrow as the minimum unit, the driver sequentially + maps channel/DIMM into different csrows. 
+ + For example, supposing the following layout:: + + Ch0 phy rd0, wr0 (0x063f4031): 2 ranks, UDIMMs + dimm 0 1024 Mb offset: 0, bank: 8, rank: 1, row: 0x4000, col: 0x400 + dimm 1 1024 Mb offset: 4, bank: 8, rank: 1, row: 0x4000, col: 0x400 + Ch1 phy rd1, wr1 (0x063f4031): 2 ranks, UDIMMs + dimm 0 1024 Mb offset: 0, bank: 8, rank: 1, row: 0x4000, col: 0x400 + Ch2 phy rd3, wr3 (0x063f4031): 2 ranks, UDIMMs + dimm 0 1024 Mb offset: 0, bank: 8, rank: 1, row: 0x4000, col: 0x400 + + The driver will map it as:: + + csrow0: channel 0, dimm0 + csrow1: channel 0, dimm1 + csrow2: channel 1, dimm0 + csrow3: channel 2, dimm0 + + exports one DIMM per csrow. + + Each QPI is exported as a different memory controller. + +2) The MC has the ability to inject errors to test drivers. The drivers + implement this functionality via some error injection nodes: + + For injecting a memory error, there are some sysfs nodes, under + ``/sys/devices/system/edac/mc/mc?/``: + + - ``inject_addrmatch/*``: + Controls the error injection mask register. It is possible to specify + several characteristics of the address to match an error code:: + + dimm = the affected dimm. Numbers are relative to a channel; + rank = the memory rank; + channel = the channel that will generate an error; + bank = the affected bank; + page = the page address; + column (or col) = the address column. + + each of the above values can be set to "any" to match any valid value. + + At driver init, all values are set to any. 
+ + For example, to generate an error at rank 1 of dimm 2, for any channel, + any bank, any page, any column:: + + echo 2 >/sys/devices/system/edac/mc/mc0/inject_addrmatch/dimm + echo 1 >/sys/devices/system/edac/mc/mc0/inject_addrmatch/rank + + To return to the default behaviour of matching any, you can do:: + + echo any >/sys/devices/system/edac/mc/mc0/inject_addrmatch/dimm + echo any >/sys/devices/system/edac/mc/mc0/inject_addrmatch/rank + + - ``inject_eccmask``: + specifies what bits will have troubles, + + - ``inject_section``: + specifies what ECC cache section will get the error:: + + 3 for both + 2 for the highest + 1 for the lowest + + - ``inject_type``: + specifies the type of error, being a combination of the following bits:: + + bit 0 - repeat + bit 1 - ecc + bit 2 - parity + + - ``inject_enable``: + starts the error generation when something different than 0 is written. + + All inject vars can be read. root permission is needed for write. + + Datasheet states that the error will only be generated after a write on an + address that matches inject_addrmatch. It seems, however, that reading will + also produce an error. + + For example, the following code will generate an error for any write access + at socket 0, on any DIMM/address on channel 2:: + + echo 2 >/sys/devices/system/edac/mc/mc0/inject_addrmatch/channel + echo 2 >/sys/devices/system/edac/mc/mc0/inject_type + echo 64 >/sys/devices/system/edac/mc/mc0/inject_eccmask + echo 3 >/sys/devices/system/edac/mc/mc0/inject_section + echo 1 >/sys/devices/system/edac/mc/mc0/inject_enable + dd if=/dev/mem of=/dev/null seek=16k bs=4k count=1 >& /dev/null + + For socket 1, it is needed to replace "mc0" by "mc1" at the above + commands. 
+ + The generated error message will look like:: + + EDAC MC0: UE row 0, channel-a= 0 channel-b= 0 labels "-": NON_FATAL (addr = 0x0075b980, socket=0, Dimm=0, Channel=2, syndrome=0x00000040, count=1, Err=8c0000400001009f:4000080482 (read error: read ECC error)) + +3) Corrected Error memory register counters + + Those newer MCs have some registers to count memory errors. The driver + uses those registers to report Corrected Errors on devices with Registered + DIMMs. + + However, those counters don't work with Unregistered DIMM. As the chipset + offers some counters that also work with UDIMMs (but with a worse level of + granularity than the default ones), the driver exposes those registers for + UDIMM memories. + + They can be read by looking at the contents of ``all_channel_counts/``:: + + $ for i in /sys/devices/system/edac/mc/mc0/all_channel_counts/*; do echo $i; cat $i; done + /sys/devices/system/edac/mc/mc0/all_channel_counts/udimm0 + 0 + /sys/devices/system/edac/mc/mc0/all_channel_counts/udimm1 + 0 + /sys/devices/system/edac/mc/mc0/all_channel_counts/udimm2 + 0 + + What happens here is that errors on different csrows, but at the same + dimm number will increment the same counter. + So, in this memory mapping:: + + csrow0: channel 0, dimm0 + csrow1: channel 0, dimm1 + csrow2: channel 1, dimm0 + csrow3: channel 2, dimm0 + + The hardware will increment udimm0 for an error at the first dimm at either + csrow0, csrow2 or csrow3; + + The hardware will increment udimm1 for an error at the second dimm at either + csrow0, csrow2 or csrow3; + + The hardware will increment udimm2 for an error at the third dimm at either + csrow0, csrow2 or csrow3; + +4) Standard error counters + + The standard error counters are generated when an mcelog error is received + by the driver. Since, with UDIMM, this is counted by software, it is + possible that some errors could be lost. 
With RDIMM's, they display the + contents of the registers + +Reference documents used on ``amd64_edac`` +------------------------------------------ + +``amd64_edac`` module is based on the following documents +(available from http://support.amd.com/en-us/search/tech-docs): + +1. :Title: BIOS and Kernel Developer's Guide for AMD Athlon 64 and AMD + Opteron Processors + :AMD publication #: 26094 + :Revision: 3.26 + :Link: http://support.amd.com/TechDocs/26094.PDF + +2. :Title: BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh + Processors + :AMD publication #: 32559 + :Revision: 3.00 + :Issue Date: May 2006 + :Link: http://support.amd.com/TechDocs/32559.pdf + +3. :Title: BIOS and Kernel Developer's Guide (BKDG) For AMD Family 10h + Processors + :AMD publication #: 31116 + :Revision: 3.00 + :Issue Date: September 07, 2007 + :Link: http://support.amd.com/TechDocs/31116.pdf + +4. :Title: BIOS and Kernel Developer's Guide (BKDG) for AMD Family 15h + Models 30h-3Fh Processors + :AMD publication #: 49125 + :Revision: 3.06 + :Issue Date: 2/12/2015 (latest release) + :Link: http://support.amd.com/TechDocs/49125_15h_Models_30h-3Fh_BKDG.pdf + +5. :Title: BIOS and Kernel Developer's Guide (BKDG) for AMD Family 15h + Models 60h-6Fh Processors + :AMD publication #: 50742 + :Revision: 3.01 + :Issue Date: 7/23/2015 (latest release) + :Link: http://support.amd.com/TechDocs/50742_15h_Models_60h-6Fh_BKDG.pdf + +6. 
:Title: BIOS and Kernel Developer's Guide (BKDG) for AMD Family 16h + Models 00h-0Fh Processors + :AMD publication #: 48751 + :Revision: 3.03 + :Issue Date: 2/23/2015 (latest release) + :Link: http://support.amd.com/TechDocs/48751_16h_bkdg.pdf + +Credits +======= + +* Written by Doug Thompson + + - 7 Dec 2005 + - 17 Jul 2007 Updated + +* |copy| Mauro Carvalho Chehab + + - 05 Aug 2009 Nehalem interface + - 26 Oct 2016 Converted to ReST and cleanups at the Nehalem section + +* EDAC authors/maintainers: + + - Doug Thompson, Dave Jiang, Dave Peterson et al, + - Mauro Carvalho Chehab + - Borislav Petkov + - original author: Thayne Harbaugh diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst index fb40a1f6f79e1..dfc06fab94322 100644 --- a/Documentation/admin-guide/index.rst +++ b/Documentation/admin-guide/index.rst @@ -122,7 +122,7 @@ configure specific aspects of kernel behavior to your liking. pmf pnp rapidio - ras + RAS/index rtc serial-console svga diff --git a/Documentation/admin-guide/ras.rst b/Documentation/admin-guide/ras.rst deleted file mode 100644 index 8e03751d126d0..0000000000000 --- a/Documentation/admin-guide/ras.rst +++ /dev/null @@ -1,1219 +0,0 @@ -.. include:: - -============================================ -Reliability, Availability and Serviceability -============================================ - -RAS concepts -************ - -Reliability, Availability and Serviceability (RAS) is a concept used on -servers meant to measure their robustness. - -Reliability - is the probability that a system will produce correct outputs. 
- - * Generally measured as Mean Time Between Failures (MTBF) - * Enhanced by features that help to avoid, detect and repair hardware faults - -Availability - is the probability that a system is operational at a given time - - * Generally measured as a percentage of downtime per a period of time - * Often uses mechanisms to detect and correct hardware faults in - runtime; - -Serviceability (or maintainability) - is the simplicity and speed with which a system can be repaired or - maintained - - * Generally measured on Mean Time Between Repair (MTBR) - -Improving RAS -------------- - -In order to reduce systems downtime, a system should be capable of detecting -hardware errors, and, when possible correcting them in runtime. It should -also provide mechanisms to detect hardware degradation, in order to warn -the system administrator to take the action of replacing a component before -it causes data loss or system downtime. - -Among the monitoring measures, the most usual ones include: - -* CPU – detect errors at instruction execution and at L1/L2/L3 caches; -* Memory – add error correction logic (ECC) to detect and correct errors; -* I/O – add CRC checksums for transferred data; -* Storage – RAID, journal file systems, checksums, - Self-Monitoring, Analysis and Reporting Technology (SMART). - -By monitoring the number of occurrences of error detections, it is possible -to identify if the probability of hardware errors is increasing, and, on such -case, do a preventive maintenance to replace a degraded component while -those errors are correctable. - -Types of errors ---------------- - -Most mechanisms used on modern systems use technologies like Hamming -Codes that allow error correction when the number of errors on a bit packet -is below a threshold. If the number of errors is above, those mechanisms -can indicate with a high degree of confidence that an error happened, but -they can't correct. - -Also, sometimes an error occur on a component that it is not used. 
For -example, a part of the memory that it is not currently allocated. - -That defines some categories of errors: - -* **Correctable Error (CE)** - the error detection mechanism detected and - corrected the error. Such errors are usually not fatal, although some - Kernel mechanisms allow the system administrator to consider them as fatal. - -* **Uncorrected Error (UE)** - the amount of errors happened above the error - correction threshold, and the system was unable to auto-correct. - -* **Fatal Error** - when an UE error happens on a critical component of the - system (for example, a piece of the Kernel got corrupted by an UE), the - only reliable way to avoid data corruption is to hang or reboot the machine. - -* **Non-fatal Error** - when an UE error happens on an unused component, - like a CPU in power down state or an unused memory bank, the system may - still run, eventually replacing the affected hardware by a hot spare, - if available. - - Also, when an error happens on a userspace process, it is also possible to - kill such process and let userspace restart it. - -The mechanism for handling non-fatal errors is usually complex and may -require the help of some userspace application, in order to apply the -policy desired by the system administrator. - -Identifying a bad hardware component ------------------------------------- - -Just detecting a hardware flaw is usually not enough, as the system needs -to pinpoint to the minimal replaceable unit (MRU) that should be exchanged -to make the hardware reliable again. - -So, it requires not only error logging facilities, but also mechanisms that -will translate the error message to the silkscreen or component label for -the MRU. - -Typically, it is very complex for memory, as modern CPUs interlace memory -from different memory modules, in order to provide a better performance. The -DMI BIOS usually have a list of memory module labels, with can be obtained -using the ``dmidecode`` tool. 
For example, on a desktop machine, it shows:: - - Memory Device - Total Width: 64 bits - Data Width: 64 bits - Size: 16384 MB - Form Factor: SODIMM - Set: None - Locator: ChannelA-DIMM0 - Bank Locator: BANK 0 - Type: DDR4 - Type Detail: Synchronous - Speed: 2133 MHz - Rank: 2 - Configured Clock Speed: 2133 MHz - -On the above example, a DDR4 SO-DIMM memory module is located at the -system's memory labeled as "BANK 0", as given by the *bank locator* field. -Please notice that, on such system, the *total width* is equal to the -*data width*. It means that such memory module doesn't have error -detection/correction mechanisms. - -Unfortunately, not all systems use the same field to specify the memory -bank. On this example, from an older server, ``dmidecode`` shows:: - - Memory Device - Array Handle: 0x1000 - Error Information Handle: Not Provided - Total Width: 72 bits - Data Width: 64 bits - Size: 8192 MB - Form Factor: DIMM - Set: 1 - Locator: DIMM_A1 - Bank Locator: Not Specified - Type: DDR3 - Type Detail: Synchronous Registered (Buffered) - Speed: 1600 MHz - Rank: 2 - Configured Clock Speed: 1600 MHz - -There, the DDR3 RDIMM memory module is located at the system's memory labeled -as "DIMM_A1", as given by the *locator* field. Please notice that this -memory module has 64 bits of *data width* and 72 bits of *total width*. So, -it has 8 extra bits to be used by error detection and correction mechanisms. -Such kind of memory is called Error-correcting code memory (ECC memory). - -To make things even worse, it is not uncommon that systems with different -labels on their system's board to use exactly the same BIOS, meaning that -the labels provided by the BIOS won't match the real ones. - -ECC memory ----------- - -As mentioned in the previous section, ECC memory has extra bits to be -used for error correction. In the above example, a memory module has -64 bits of *data width*, and 72 bits of *total width*. 
The extra 8 -bits which are used for the error detection and correction mechanisms -are referred to as the *syndrome*\ [#f1]_\ [#f2]_. - -So, when the cpu requests the memory controller to write a word with -*data width*, the memory controller calculates the *syndrome* in real time, -using Hamming code, or some other error correction code, like SECDED+, -producing a code with *total width* size. Such code is then written -on the memory modules. - -At read, the *total width* bits code is converted back, using the same -ECC code used on write, producing a word with *data width* and a *syndrome*. -The word with *data width* is sent to the CPU, even when errors happen. - -The memory controller also looks at the *syndrome* in order to check if -there was an error, and if the ECC code was able to fix such error. -If the error was corrected, a Corrected Error (CE) happened. If not, an -Uncorrected Error (UE) happened. - -The information about the CE/UE errors is stored on some special registers -at the memory controller and can be accessed by reading such registers, -either by BIOS, by some special CPUs or by Linux EDAC driver. On x86 64 -bit CPUs, such errors can also be retrieved via the Machine Check -Architecture (MCA)\ [#f3]_. - -.. [#f1] Please notice that several memory controllers allow operation on a - mode called "Lock-Step", where it groups two memory modules together, - doing 128-bit reads/writes. That gives 16 bits for error correction, with - significantly improves the error correction mechanism, at the expense - that, when an error happens, there's no way to know what memory module is - to blame. So, it has to blame both memory modules. - -.. [#f2] Some memory controllers also allow using memory in mirror mode. - On such mode, the same data is written to two memory modules. At read, - the system checks both memory modules, in order to check if both provide - identical data. 
On such configuration, when an error happens, there's no - way to know what memory module is to blame. So, it has to blame both - memory modules (or 4 memory modules, if the system is also on Lock-step - mode). - -.. [#f3] For more details about the Machine Check Architecture (MCA), - please read Documentation/arch/x86/x86_64/machinecheck.rst at the Kernel tree. - -EDAC - Error Detection And Correction -************************************* - -.. note:: - - "bluesmoke" was the name for this device driver subsystem when it - was "out-of-tree" and maintained at http://bluesmoke.sourceforge.net. - That site is mostly archaic now and can be used only for historical - purposes. - - When the subsystem was pushed upstream for the first time, on - Kernel 2.6.16, it was renamed to ``EDAC``. - -Purpose -------- - -The ``edac`` kernel module's goal is to detect and report hardware errors -that occur within the computer system running under linux. - -Memory ------- - -Memory Correctable Errors (CE) and Uncorrectable Errors (UE) are the -primary errors being harvested. These types of errors are harvested by -the ``edac_mc`` device. - -Detecting CE events, then harvesting those events and reporting them, -**can** but must not necessarily be a predictor of future UE events. With -CE events only, the system can and will continue to operate as no data -has been damaged yet. - -However, preventive maintenance and proactive part replacement of memory -modules exhibiting CEs can reduce the likelihood of the dreaded UE events -and system panics. - -Other hardware elements ------------------------ - -A new feature for EDAC, the ``edac_device`` class of device, was added in -the 2.6.23 version of the kernel. - -This new device type allows for non-memory type of ECC hardware detectors -to have their states harvested and presented to userspace via the sysfs -interface. 
- -Some architectures have ECC detectors for L1, L2 and L3 caches, -along with DMA engines, fabric switches, main data path switches, -interconnections, and various other hardware data paths. If the hardware -reports it, then a edac_device device probably can be constructed to -harvest and present that to userspace. - - -PCI bus scanning ----------------- - -In addition, PCI devices are scanned for PCI Bus Parity and SERR Errors -in order to determine if errors are occurring during data transfers. - -The presence of PCI Parity errors must be examined with a grain of salt. -There are several add-in adapters that do **not** follow the PCI specification -with regards to Parity generation and reporting. The specification says -the vendor should tie the parity status bits to 0 if they do not intend -to generate parity. Some vendors do not do this, and thus the parity bit -can "float" giving false positives. - -There is a PCI device attribute located in sysfs that is checked by -the EDAC PCI scanning code. If that attribute is set, PCI parity/error -scanning is skipped for that device. The attribute is:: - - broken_parity_status - -and is located in ``/sys/devices/pci/0000:XX:YY.Z`` directories for -PCI devices. - - -Versioning ----------- - -EDAC is composed of a "core" module (``edac_core.ko``) and several Memory -Controller (MC) driver modules. On a given system, the CORE is loaded -and one MC driver will be loaded. Both the CORE and the MC driver (or -``edac_device`` driver) have individual versions that reflect current -release level of their respective modules. - -Thus, to "report" on what version a system is running, one must report -both the CORE's and the MC driver's versions. - - -Loading -------- - -If ``edac`` was statically linked with the kernel then no loading -is necessary. If ``edac`` was built as modules then simply modprobe -the ``edac`` pieces that you need. 
You should be able to modprobe -hardware-specific modules and have the dependencies load the necessary -core modules. - -Example:: - - $ modprobe amd76x_edac - -loads both the ``amd76x_edac.ko`` memory controller module and the -``edac_mc.ko`` core module. - - -Sysfs interface ---------------- - -EDAC presents a ``sysfs`` interface for control and reporting purposes. It -lives in the /sys/devices/system/edac directory. - -Within this directory there currently reside 2 components: - - ======= ============================== - mc memory controller(s) system - pci PCI control and status system - ======= ============================== - - - -Memory Controller (mc) Model ----------------------------- - -Each ``mc`` device controls a set of memory modules [#f4]_. These modules -are laid out in a Chip-Select Row (``csrowX``) and Channel table (``chX``). -There can be multiple csrows and multiple channels. - -.. [#f4] Nowadays, the term DIMM (Dual In-line Memory Module) is widely - used to refer to a memory module, although there are other memory - packaging alternatives, like SO-DIMM, SIMM, etc. The UEFI - specification (Version 2.7) defines a memory module in the Common - Platform Error Record (CPER) section to be an SMBIOS Memory Device - (Type 17). Along this document, and inside the EDAC subsystem, the term - "dimm" is used for all memory modules, even when they use a - different kind of packaging. - -Memory controllers allow for several csrows, with 8 csrows being a -typical value. Yet, the actual number of csrows depends on the layout of -a given motherboard, memory controller and memory module characteristics. - -Dual channels allow for dual data length (e. g. 128 bits, on 64 bit systems) -data transfers to/from the CPU from/to memory. Some newer chipsets allow -for more than 2 channels, like Fully Buffered DIMMs (FB-DIMMs) memory -controllers. 
The following example will assume 2 channels: - - +------------+-----------------------+ - | CS Rows | Channels | - +------------+-----------+-----------+ - | | ``ch0`` | ``ch1`` | - +============+===========+===========+ - | |**DIMM_A0**|**DIMM_B0**| - +------------+-----------+-----------+ - | ``csrow0`` | rank0 | rank0 | - +------------+-----------+-----------+ - | ``csrow1`` | rank1 | rank1 | - +------------+-----------+-----------+ - | |**DIMM_A1**|**DIMM_B1**| - +------------+-----------+-----------+ - | ``csrow2`` | rank0 | rank0 | - +------------+-----------+-----------+ - | ``csrow3`` | rank1 | rank1 | - +------------+-----------+-----------+ - -In the above example, there are 4 physical slots on the motherboard -for memory DIMMs: - - +---------+---------+ - | DIMM_A0 | DIMM_B0 | - +---------+---------+ - | DIMM_A1 | DIMM_B1 | - +---------+---------+ - -Labels for these slots are usually silk-screened on the motherboard. -Slots labeled ``A`` are channel 0 in this example. Slots labeled ``B`` are -channel 1. Notice that there are two csrows possible on a physical DIMM. -These csrows are allocated their csrow assignment based on the slot into -which the memory DIMM is placed. Thus, when 1 DIMM is placed in each -Channel, the csrows cross both DIMMs. - -Memory DIMMs come single or dual "ranked". A rank is a populated csrow. -In the example above 2 dual ranked DIMMs are similarly placed. Thus, -both csrow0 and csrow1 are populated. On the other hand, when 2 single -ranked DIMMs are placed in slots DIMM_A0 and DIMM_B0, then they will -have just one csrow (csrow0) and csrow1 will be empty. The pattern -repeats itself for csrow2 and csrow3. Also note that some memory -controllers don't have any logic to identify the memory module, see -``rankX`` directories below. - -The representation of the above is reflected in the directory -tree in EDAC's sysfs interface. 
Starting in directory -``/sys/devices/system/edac/mc``, each memory controller will be -represented by its own ``mcX`` directory, where ``X`` is the -index of the MC:: - - ..../edac/mc/ - | - |->mc0 - |->mc1 - |->mc2 - .... - -Under each ``mcX`` directory each ``csrowX`` is again represented by a -``csrowX``, where ``X`` is the csrow index:: - - .../mc/mc0/ - | - |->csrow0 - |->csrow2 - |->csrow3 - .... - -Notice that there is no csrow1, which indicates that csrow0 is composed -of a single ranked DIMMs. This should also apply in both Channels, in -order to have dual-channel mode be operational. Since both csrow2 and -csrow3 are populated, this indicates a dual ranked set of DIMMs for -channels 0 and 1. - -Within each of the ``mcX`` and ``csrowX`` directories are several EDAC -control and attribute files. - -``mcX`` directories -------------------- - -In ``mcX`` directories are EDAC control and attribute files for -this ``X`` instance of the memory controllers. - -For a description of the sysfs API, please see: - - Documentation/ABI/testing/sysfs-devices-edac - - -``dimmX`` or ``rankX`` directories ----------------------------------- - -The recommended way to use the EDAC subsystem is to look at the information -provided by the ``dimmX`` or ``rankX`` directories [#f5]_. 
- -A typical EDAC system has the following structure under -``/sys/devices/system/edac/``\ [#f6]_:: - - /sys/devices/system/edac/ - ├── mc - │   ├── mc0 - │   │   ├── ce_count - │   │   ├── ce_noinfo_count - │   │   ├── dimm0 - │   │   │   ├── dimm_ce_count - │   │   │   ├── dimm_dev_type - │   │   │   ├── dimm_edac_mode - │   │   │   ├── dimm_label - │   │   │   ├── dimm_location - │   │   │   ├── dimm_mem_type - │   │   │   ├── dimm_ue_count - │   │   │   ├── size - │   │   │   └── uevent - │   │   ├── max_location - │   │   ├── mc_name - │   │   ├── reset_counters - │   │   ├── seconds_since_reset - │   │   ├── size_mb - │   │   ├── ue_count - │   │   ├── ue_noinfo_count - │   │   └── uevent - │   ├── mc1 - │   │   ├── ce_count - │   │   ├── ce_noinfo_count - │   │   ├── dimm0 - │   │   │   ├── dimm_ce_count - │   │   │   ├── dimm_dev_type - │   │   │   ├── dimm_edac_mode - │   │   │   ├── dimm_label - │   │   │   ├── dimm_location - │   │   │   ├── dimm_mem_type - │   │   │   ├── dimm_ue_count - │   │   │   ├── size - │   │   │   └── uevent - │   │   ├── max_location - │   │   ├── mc_name - │   │   ├── reset_counters - │   │   ├── seconds_since_reset - │   │   ├── size_mb - │   │   ├── ue_count - │   │   ├── ue_noinfo_count - │   │   └── uevent - │   └── uevent - └── uevent - -In the ``dimmX`` directories are EDAC control and attribute files for -this ``X`` memory module: - -- ``size`` - Total memory managed by this csrow attribute file - - This attribute file displays, in count of megabytes, the memory - that this csrow contains. - -- ``dimm_ue_count`` - Uncorrectable Errors count attribute file - - This attribute file displays the total count of uncorrectable - errors that have occurred on this DIMM. If panic_on_ue is set - this counter will not have a chance to increment, since EDAC - will panic the system. 
- -- ``dimm_ce_count`` - Correctable Errors count attribute file - - This attribute file displays the total count of correctable - errors that have occurred on this DIMM. This count is very - important to examine. CEs provide early indications that a - DIMM is beginning to fail. This count field should be - monitored for non-zero values and report such information - to the system administrator. - -- ``dimm_dev_type`` - Device type attribute file - - This attribute file will display what type of DRAM device is - being utilized on this DIMM. - Examples: - - - x1 - - x2 - - x4 - - x8 - -- ``dimm_edac_mode`` - EDAC Mode of operation attribute file - - This attribute file will display what type of Error detection - and correction is being utilized. - -- ``dimm_label`` - memory module label control file - - This control file allows this DIMM to have a label assigned - to it. With this label in the module, when errors occur - the output can provide the DIMM label in the system log. - This becomes vital for panic events to isolate the - cause of the UE event. - - DIMM Labels must be assigned after booting, with information - that correctly identifies the physical slot with its - silk screen label. This information is currently very - motherboard specific and determination of this information - must occur in userland at this time. - -- ``dimm_location`` - location of the memory module - - The location can have up to 3 levels, and describe how the - memory controller identifies the location of a memory module. - Depending on the type of memory and memory controller, it - can be: - - - *csrow* and *channel* - used when the memory controller - doesn't identify a single DIMM - e. g. in ``rankX`` dir; - - *branch*, *channel*, *slot* - typically used on FB-DIMM memory - controllers; - - *channel*, *slot* - used on Nehalem and newer Intel drivers. 
- -- ``dimm_mem_type`` - Memory Type attribute file - - This attribute file will display what type of memory is currently - on this csrow. Normally, either buffered or unbuffered memory. - Examples: - - - Registered-DDR - - Unbuffered-DDR - -.. [#f5] On some systems, the memory controller doesn't have any logic - to identify the memory module. On such systems, the directory is called ``rankX`` and works on a similar way as the ``csrowX`` directories. - On modern Intel memory controllers, the memory controller identifies the - memory modules directly. On such systems, the directory is called ``dimmX``. - -.. [#f6] There are also some ``power`` directories and ``subsystem`` - symlinks inside the sysfs mapping that are automatically created by - the sysfs subsystem. Currently, they serve no purpose. - -``csrowX`` directories ----------------------- - -When CONFIG_EDAC_LEGACY_SYSFS is enabled, sysfs will contain the ``csrowX`` -directories. As this API doesn't work properly for Rambus, FB-DIMMs and -modern Intel Memory Controllers, this is being deprecated in favor of -``dimmX`` directories. - -In the ``csrowX`` directories are EDAC control and attribute files for -this ``X`` instance of csrow: - - -- ``ue_count`` - Total Uncorrectable Errors count attribute file - - This attribute file displays the total count of uncorrectable - errors that have occurred on this csrow. If panic_on_ue is set - this counter will not have a chance to increment, since EDAC - will panic the system. - - -- ``ce_count`` - Total Correctable Errors count attribute file - - This attribute file displays the total count of correctable - errors that have occurred on this csrow. This count is very - important to examine. CEs provide early indications that a - DIMM is beginning to fail. This count field should be - monitored for non-zero values and report such information - to the system administrator. 
- - -- ``size_mb`` - Total memory managed by this csrow attribute file - - This attribute file displays, in count of megabytes, the memory - that this csrow contains. - - -- ``mem_type`` - Memory Type attribute file - - This attribute file will display what type of memory is currently - on this csrow. Normally, either buffered or unbuffered memory. - Examples: - - - Registered-DDR - - Unbuffered-DDR - - -- ``edac_mode`` - EDAC Mode of operation attribute file - - This attribute file will display what type of Error detection - and correction is being utilized. - - -- ``dev_type`` - Device type attribute file - - This attribute file will display what type of DRAM device is - being utilized on this DIMM. - Examples: - - - x1 - - x2 - - x4 - - x8 - - -- ``ch0_ce_count`` - Channel 0 CE Count attribute file - - This attribute file will display the count of CEs on this - DIMM located in channel 0. - - -- ``ch0_ue_count`` - Channel 0 UE Count attribute file - - This attribute file will display the count of UEs on this - DIMM located in channel 0. - - -- ``ch0_dimm_label`` - Channel 0 DIMM Label control file - - - This control file allows this DIMM to have a label assigned - to it. With this label in the module, when errors occur - the output can provide the DIMM label in the system log. - This becomes vital for panic events to isolate the - cause of the UE event. - - DIMM Labels must be assigned after booting, with information - that correctly identifies the physical slot with its - silk screen label. This information is currently very - motherboard specific and determination of this information - must occur in userland at this time. - - -- ``ch1_ce_count`` - Channel 1 CE Count attribute file - - - This attribute file will display the count of CEs on this - DIMM located in channel 1. - - -- ``ch1_ue_count`` - Channel 1 UE Count attribute file - - - This attribute file will display the count of UEs on this - DIMM located in channel 0. 
- - -- ``ch1_dimm_label`` - Channel 1 DIMM Label control file - - This control file allows this DIMM to have a label assigned - to it. With this label in the module, when errors occur - the output can provide the DIMM label in the system log. - This becomes vital for panic events to isolate the - cause of the UE event. - - DIMM Labels must be assigned after booting, with information - that correctly identifies the physical slot with its - silk screen label. This information is currently very - motherboard specific and determination of this information - must occur in userland at this time. - - -System Logging --------------- - -If logging for UEs and CEs is enabled, then system logs will contain -information indicating that errors have been detected:: - - EDAC MC0: CE page 0x283, offset 0xce0, grain 8, syndrome 0x6ec3, row 0, channel 1 "DIMM_B1": amd76x_edac - EDAC MC0: CE page 0x1e5, offset 0xfb0, grain 8, syndrome 0xb741, row 0, channel 1 "DIMM_B1": amd76x_edac - - -The structure of the message is: - - +---------------------------------------+-------------+ - | Content | Example | - +=======================================+=============+ - | The memory controller | MC0 | - +---------------------------------------+-------------+ - | Error type | CE | - +---------------------------------------+-------------+ - | Memory page | 0x283 | - +---------------------------------------+-------------+ - | Offset in the page | 0xce0 | - +---------------------------------------+-------------+ - | The byte granularity | grain 8 | - | or resolution of the error | | - +---------------------------------------+-------------+ - | The error syndrome | 0xb741 | - +---------------------------------------+-------------+ - | Memory row | row 0 | - +---------------------------------------+-------------+ - | Memory channel | channel 1 | - +---------------------------------------+-------------+ - | DIMM label, if set prior | DIMM B1 | - +---------------------------------------+-------------+ 
- | And then an optional, driver-specific | | - | message that may have additional | | - | information. | | - +---------------------------------------+-------------+ - -Both UEs and CEs with no info will lack all but memory controller, error -type, a notice of "no info" and then an optional, driver-specific error -message. - - -PCI Bus Parity Detection ------------------------- - -On Header Type 00 devices, the primary status is looked at for any -parity error regardless of whether parity is enabled on the device or -not. (The spec indicates parity is generated in some cases). On Header -Type 01 bridges, the secondary status register is also looked at to see -if parity occurred on the bus on the other side of the bridge. - - -Sysfs configuration -------------------- - -Under ``/sys/devices/system/edac/pci`` are control and attribute files as -follows: - - -- ``check_pci_parity`` - Enable/Disable PCI Parity checking control file - - This control file enables or disables the PCI Bus Parity scanning - operation. Writing a 1 to this file enables the scanning. Writing - a 0 to this file disables the scanning. - - Enable:: - - echo "1" >/sys/devices/system/edac/pci/check_pci_parity - - Disable:: - - echo "0" >/sys/devices/system/edac/pci/check_pci_parity - - -- ``pci_parity_count`` - Parity Count - - This attribute file will display the number of parity errors that - have been detected. - - -Module parameters ------------------ - -- ``edac_mc_panic_on_ue`` - Panic on UE control file - - An uncorrectable error will cause a machine panic. This is usually - desirable. It is a bad idea to continue when an uncorrectable error - occurs - it is indeterminate what was uncorrected and the operating - system context might be so mangled that continuing will lead to further - corruption. If the kernel has MCE configured, then EDAC will never - notice the UE. 
- - LOAD TIME:: - - module/kernel parameter: edac_mc_panic_on_ue=[0|1] - - RUN TIME:: - - echo "1" > /sys/module/edac_core/parameters/edac_mc_panic_on_ue - - -- ``edac_mc_log_ue`` - Log UE control file - - - Generate kernel messages describing uncorrectable errors. These errors - are reported through the system message log system. UE statistics - will be accumulated even when UE logging is disabled. - - LOAD TIME:: - - module/kernel parameter: edac_mc_log_ue=[0|1] - - RUN TIME:: - - echo "1" > /sys/module/edac_core/parameters/edac_mc_log_ue - - -- ``edac_mc_log_ce`` - Log CE control file - - - Generate kernel messages describing correctable errors. These - errors are reported through the system message log system. - CE statistics will be accumulated even when CE logging is disabled. - - LOAD TIME:: - - module/kernel parameter: edac_mc_log_ce=[0|1] - - RUN TIME:: - - echo "1" > /sys/module/edac_core/parameters/edac_mc_log_ce - - -- ``edac_mc_poll_msec`` - Polling period control file - - - The time period, in milliseconds, for polling for error information. - Too small a value wastes resources. Too large a value might delay - necessary handling of errors and might loose valuable information for - locating the error. 1000 milliseconds (once each second) is the current - default. Systems which require all the bandwidth they can get, may - increase this. - - LOAD TIME:: - - module/kernel parameter: edac_mc_poll_msec=[0|1] - - RUN TIME:: - - echo "1000" > /sys/module/edac_core/parameters/edac_mc_poll_msec - - -- ``panic_on_pci_parity`` - Panic on PCI PARITY Error - - - This control file enables or disables panicking when a parity - error has been detected. 
- - - module/kernel parameter:: - - edac_panic_on_pci_pe=[0|1] - - Enable:: - - echo "1" > /sys/module/edac_core/parameters/edac_panic_on_pci_pe - - Disable:: - - echo "0" > /sys/module/edac_core/parameters/edac_panic_on_pci_pe - - - -EDAC device type ----------------- - -In the header file, edac_pci.h, there is a series of edac_device structures -and APIs for the EDAC_DEVICE. - -User space access to an edac_device is through the sysfs interface. - -At the location ``/sys/devices/system/edac`` (sysfs) new edac_device devices -will appear. - -There is a three level tree beneath the above ``edac`` directory. For example, -the ``test_device_edac`` device (found at the http://bluesmoke.sourceforget.net -website) installs itself as:: - - /sys/devices/system/edac/test-instance - -in this directory are various controls, a symlink and one or more ``instance`` -directories. - -The standard default controls are: - - ============== ======================================================= - log_ce boolean to log CE events - log_ue boolean to log UE events - panic_on_ue boolean to ``panic`` the system if an UE is encountered - (default off, can be set true via startup script) - poll_msec time period between POLL cycles for events - ============== ======================================================= - -The test_device_edac device adds at least one of its own custom control: - - ============== ================================================== - test_bits which in the current test driver does nothing but - show how it is installed. A ported driver can - add one or more such controls and/or attributes - for specific uses. - One out-of-tree driver uses controls here to allow - for ERROR INJECTION operations to hardware - injection registers - ============== ================================================== - -The symlink points to the 'struct dev' that is registered for this edac_device. - -Instances ---------- - -One or more instance directories are present. 
For the ``test_device_edac`` -case: - - +----------------+ - | test-instance0 | - +----------------+ - - -In this directory there are two default counter attributes, which are totals of -counter in deeper subdirectories. - - ============== ==================================== - ce_count total of CE events of subdirectories - ue_count total of UE events of subdirectories - ============== ==================================== - -Blocks ------- - -At the lowest directory level is the ``block`` directory. There can be 0, 1 -or more blocks specified in each instance: - - +-------------+ - | test-block0 | - +-------------+ - -In this directory the default attributes are: - - ============== ================================================ - ce_count which is counter of CE events for this ``block`` - of hardware being monitored - ue_count which is counter of UE events for this ``block`` - of hardware being monitored - ============== ================================================ - - -The ``test_device_edac`` device adds 4 attributes and 1 control: - - ================== ==================================================== - test-block-bits-0 for every POLL cycle this counter - is incremented - test-block-bits-1 every 10 cycles, this counter is bumped once, - and test-block-bits-0 is set to 0 - test-block-bits-2 every 100 cycles, this counter is bumped once, - and test-block-bits-1 is set to 0 - test-block-bits-3 every 1000 cycles, this counter is bumped once, - and test-block-bits-2 is set to 0 - ================== ==================================================== - - - ================== ==================================================== - reset-counters writing ANY thing to this control will - reset all the above counters. - ================== ==================================================== - - -Use of the ``test_device_edac`` driver should enable any others to create their own -unique drivers for their hardware systems. 
- -The ``test_device_edac`` sample driver is located at the -http://bluesmoke.sourceforge.net project site for EDAC. - - -Usage of EDAC APIs on Nehalem and newer Intel CPUs --------------------------------------------------- - -On older Intel architectures, the memory controller was part of the North -Bridge chipset. Nehalem, Sandy Bridge, Ivy Bridge, Haswell, Sky Lake and -newer Intel architectures integrated an enhanced version of the memory -controller (MC) inside the CPUs. - -This chapter will cover the differences of the enhanced memory controllers -found on newer Intel CPUs, such as ``i7core_edac``, ``sb_edac`` and -``sbx_edac`` drivers. - -.. note:: - - The Xeon E7 processor families use a separate chip for the memory - controller, called Intel Scalable Memory Buffer. This section doesn't - apply for such families. - -1) There is one Memory Controller per Quick Patch Interconnect - (QPI). At the driver, the term "socket" means one QPI. This is - associated with a physical CPU socket. - - Each MC have 3 physical read channels, 3 physical write channels and - 3 logic channels. The driver currently sees it as just 3 channels. - Each channel can have up to 3 DIMMs. - - The minimum known unity is DIMMs. There are no information about csrows. - As EDAC API maps the minimum unity is csrows, the driver sequentially - maps channel/DIMM into different csrows. 
- - For example, supposing the following layout:: - - Ch0 phy rd0, wr0 (0x063f4031): 2 ranks, UDIMMs - dimm 0 1024 Mb offset: 0, bank: 8, rank: 1, row: 0x4000, col: 0x400 - dimm 1 1024 Mb offset: 4, bank: 8, rank: 1, row: 0x4000, col: 0x400 - Ch1 phy rd1, wr1 (0x063f4031): 2 ranks, UDIMMs - dimm 0 1024 Mb offset: 0, bank: 8, rank: 1, row: 0x4000, col: 0x400 - Ch2 phy rd3, wr3 (0x063f4031): 2 ranks, UDIMMs - dimm 0 1024 Mb offset: 0, bank: 8, rank: 1, row: 0x4000, col: 0x400 - - The driver will map it as:: - - csrow0: channel 0, dimm0 - csrow1: channel 0, dimm1 - csrow2: channel 1, dimm0 - csrow3: channel 2, dimm0 - - exports one DIMM per csrow. - - Each QPI is exported as a different memory controller. - -2) The MC has the ability to inject errors to test drivers. The drivers - implement this functionality via some error injection nodes: - - For injecting a memory error, there are some sysfs nodes, under - ``/sys/devices/system/edac/mc/mc?/``: - - - ``inject_addrmatch/*``: - Controls the error injection mask register. It is possible to specify - several characteristics of the address to match an error code:: - - dimm = the affected dimm. Numbers are relative to a channel; - rank = the memory rank; - channel = the channel that will generate an error; - bank = the affected bank; - page = the page address; - column (or col) = the address column. - - each of the above values can be set to "any" to match any valid value. - - At driver init, all values are set to any. 
- - For example, to generate an error at rank 1 of dimm 2, for any channel, - any bank, any page, any column:: - - echo 2 >/sys/devices/system/edac/mc/mc0/inject_addrmatch/dimm - echo 1 >/sys/devices/system/edac/mc/mc0/inject_addrmatch/rank - - To return to the default behaviour of matching any, you can do:: - - echo any >/sys/devices/system/edac/mc/mc0/inject_addrmatch/dimm - echo any >/sys/devices/system/edac/mc/mc0/inject_addrmatch/rank - - - ``inject_eccmask``: - specifies what bits will have troubles, - - - ``inject_section``: - specifies what ECC cache section will get the error:: - - 3 for both - 2 for the highest - 1 for the lowest - - - ``inject_type``: - specifies the type of error, being a combination of the following bits:: - - bit 0 - repeat - bit 1 - ecc - bit 2 - parity - - - ``inject_enable``: - starts the error generation when something different than 0 is written. - - All inject vars can be read. root permission is needed for write. - - Datasheet states that the error will only be generated after a write on an - address that matches inject_addrmatch. It seems, however, that reading will - also produce an error. - - For example, the following code will generate an error for any write access - at socket 0, on any DIMM/address on channel 2:: - - echo 2 >/sys/devices/system/edac/mc/mc0/inject_addrmatch/channel - echo 2 >/sys/devices/system/edac/mc/mc0/inject_type - echo 64 >/sys/devices/system/edac/mc/mc0/inject_eccmask - echo 3 >/sys/devices/system/edac/mc/mc0/inject_section - echo 1 >/sys/devices/system/edac/mc/mc0/inject_enable - dd if=/dev/mem of=/dev/null seek=16k bs=4k count=1 >& /dev/null - - For socket 1, it is needed to replace "mc0" by "mc1" at the above - commands. 
- - The generated error message will look like:: - - EDAC MC0: UE row 0, channel-a= 0 channel-b= 0 labels "-": NON_FATAL (addr = 0x0075b980, socket=0, Dimm=0, Channel=2, syndrome=0x00000040, count=1, Err=8c0000400001009f:4000080482 (read error: read ECC error)) - -3) Corrected Error memory register counters - - Those newer MCs have some registers to count memory errors. The driver - uses those registers to report Corrected Errors on devices with Registered - DIMMs. - - However, those counters don't work with Unregistered DIMM. As the chipset - offers some counters that also work with UDIMMs (but with a worse level of - granularity than the default ones), the driver exposes those registers for - UDIMM memories. - - They can be read by looking at the contents of ``all_channel_counts/``:: - - $ for i in /sys/devices/system/edac/mc/mc0/all_channel_counts/*; do echo $i; cat $i; done - /sys/devices/system/edac/mc/mc0/all_channel_counts/udimm0 - 0 - /sys/devices/system/edac/mc/mc0/all_channel_counts/udimm1 - 0 - /sys/devices/system/edac/mc/mc0/all_channel_counts/udimm2 - 0 - - What happens here is that errors on different csrows, but at the same - dimm number will increment the same counter. - So, in this memory mapping:: - - csrow0: channel 0, dimm0 - csrow1: channel 0, dimm1 - csrow2: channel 1, dimm0 - csrow3: channel 2, dimm0 - - The hardware will increment udimm0 for an error at the first dimm at either - csrow0, csrow2 or csrow3; - - The hardware will increment udimm1 for an error at the second dimm at either - csrow0, csrow2 or csrow3; - - The hardware will increment udimm2 for an error at the third dimm at either - csrow0, csrow2 or csrow3; - -4) Standard error counters - - The standard error counters are generated when an mcelog error is received - by the driver. Since, with UDIMM, this is counted by software, it is - possible that some errors could be lost. 
With RDIMM's, they display the - contents of the registers - -Reference documents used on ``amd64_edac`` ------------------------------------------- - -``amd64_edac`` module is based on the following documents -(available from http://support.amd.com/en-us/search/tech-docs): - -1. :Title: BIOS and Kernel Developer's Guide for AMD Athlon 64 and AMD - Opteron Processors - :AMD publication #: 26094 - :Revision: 3.26 - :Link: http://support.amd.com/TechDocs/26094.PDF - -2. :Title: BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh - Processors - :AMD publication #: 32559 - :Revision: 3.00 - :Issue Date: May 2006 - :Link: http://support.amd.com/TechDocs/32559.pdf - -3. :Title: BIOS and Kernel Developer's Guide (BKDG) For AMD Family 10h - Processors - :AMD publication #: 31116 - :Revision: 3.00 - :Issue Date: September 07, 2007 - :Link: http://support.amd.com/TechDocs/31116.pdf - -4. :Title: BIOS and Kernel Developer's Guide (BKDG) for AMD Family 15h - Models 30h-3Fh Processors - :AMD publication #: 49125 - :Revision: 3.06 - :Issue Date: 2/12/2015 (latest release) - :Link: http://support.amd.com/TechDocs/49125_15h_Models_30h-3Fh_BKDG.pdf - -5. :Title: BIOS and Kernel Developer's Guide (BKDG) for AMD Family 15h - Models 60h-6Fh Processors - :AMD publication #: 50742 - :Revision: 3.01 - :Issue Date: 7/23/2015 (latest release) - :Link: http://support.amd.com/TechDocs/50742_15h_Models_60h-6Fh_BKDG.pdf - -6. 
:Title: BIOS and Kernel Developer's Guide (BKDG) for AMD Family 16h - Models 00h-0Fh Processors - :AMD publication #: 48751 - :Revision: 3.03 - :Issue Date: 2/23/2015 (latest release) - :Link: http://support.amd.com/TechDocs/48751_16h_bkdg.pdf - -Credits -======= - -* Written by Doug Thompson - - - 7 Dec 2005 - - 17 Jul 2007 Updated - -* |copy| Mauro Carvalho Chehab - - - 05 Aug 2009 Nehalem interface - - 26 Oct 2016 Converted to ReST and cleanups at the Nehalem section - -* EDAC authors/maintainers: - - - Doug Thompson, Dave Jiang, Dave Peterson et al, - - Mauro Carvalho Chehab - - Borislav Petkov - - original author: Thayne Harbaugh diff --git a/Documentation/index.rst b/Documentation/index.rst index 07f2aa07f0fa0..9dfdc826618c0 100644 --- a/Documentation/index.rst +++ b/Documentation/index.rst @@ -113,7 +113,6 @@ to ReStructured Text format, or are simply too old. :maxdepth: 1 staging/index - RAS/index Translations diff --git a/MAINTAINERS b/MAINTAINERS index 5b945fd5a3b91..fc5996feba70c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7584,7 +7584,6 @@ R: Robert Richter L: linux-edac@vger.kernel.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras.git edac-for-next -F: Documentation/admin-guide/ras.rst F: Documentation/driver-api/edac.rst F: drivers/edac/ F: include/linux/edac.h @@ -18359,8 +18358,7 @@ M: Tony Luck M: Borislav Petkov L: linux-edac@vger.kernel.org S: Maintained -F: Documentation/RAS/ -F: Documentation/admin-guide/ras.rst +F: Documentation/admin-guide/RAS F: drivers/ras/ F: include/linux/ras.h F: include/ras/ras_event.h -- cgit 1.2.3-korg From 6f15e617cc99323339dc241d19956f0d640c4354 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 13 Feb 2024 21:35:16 -0600 Subject: RAS: Introduce a FRU memory poison manager Memory errors are an expected occurrence on systems with high memory density. Generally, errors within a small number of unique physical locations are acceptable, based on manufacturer and/or admin policy. 
During run time, memory with errors may be retired so it is no longer used by the system. This is done in mm through page poisoning, and the effect will remain until the system is restarted. If a memory location is consistently faulty, then the same run time error handling may occur in the next reboot cycle, leading to terminating jobs due to that already known bad memory. This could be prevented if information from the previous boot was not lost. Some add-in cards with driver-managed memory have on-board persistent storage. Their driver saves memory error information to the persistent storage during run time. The information is then restored after reset, and known bad memory will be retired before the hardware is used. A running log of bad memory locations is kept across multiple resets. A similar solution is desirable for CPUs. However, this solution should leverage industry-standard components as much as possible, rather than a bespoke platform driver. Two components are needed: a record format and a persistent storage interface. Implement a new module to manage the record formats on persistent storage. Use the requirements for an AMD MI300-based system to start. Vendor- and platform-specific details can be abstracted later as needed. [ bp: Massage commit message and code, squash 30-ish more fixes from Yazen and me. 
] Signed-off-by: Yazen Ghannam Co-developed-by: Signed-off-by: Co-developed-by: Signed-off-by: Tested-by: Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240214033516.1344948-3-yazen.ghannam@amd.com --- MAINTAINERS | 6 + drivers/ras/Kconfig | 12 + drivers/ras/Makefile | 1 + drivers/ras/amd/fmpm.c | 812 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 831 insertions(+) create mode 100644 drivers/ras/amd/fmpm.c (limited to 'MAINTAINERS') diff --git a/MAINTAINERS b/MAINTAINERS index fc5996feba70c..76163f09e4e2b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18363,6 +18363,12 @@ F: drivers/ras/ F: include/linux/ras.h F: include/ras/ras_event.h +RAS FRU MEMORY POISON MANAGER (FMPM) +M: Yazen Ghannam +L: linux-edac@vger.kernel.org +S: Maintained +F: drivers/ras/amd/fmpm.c + RC-CORE / LIRC FRAMEWORK M: Sean Young L: linux-media@vger.kernel.org diff --git a/drivers/ras/Kconfig b/drivers/ras/Kconfig index 2e969f59c0cac..fc4f4bb94a4c6 100644 --- a/drivers/ras/Kconfig +++ b/drivers/ras/Kconfig @@ -34,4 +34,16 @@ if RAS source "arch/x86/ras/Kconfig" source "drivers/ras/amd/atl/Kconfig" +config RAS_FMPM + tristate "FRU Memory Poison Manager" + default m + depends on AMD_ATL && ACPI_APEI + help + Support saving and restoring memory error information across reboot + using ACPI ERST as persistent storage. Error information is saved with + the UEFI CPER "FRU Memory Poison" section format. + + Memory will be retired during boot time and run time depending on + platform-specific policies. 
+ endif diff --git a/drivers/ras/Makefile b/drivers/ras/Makefile index 3fac80f580052..11f95d59d3972 100644 --- a/drivers/ras/Makefile +++ b/drivers/ras/Makefile @@ -3,4 +3,5 @@ obj-$(CONFIG_RAS) += ras.o obj-$(CONFIG_DEBUG_FS) += debugfs.o obj-$(CONFIG_RAS_CEC) += cec.o +obj-$(CONFIG_RAS_FMPM) += amd/fmpm.o obj-y += amd/atl/ diff --git a/drivers/ras/amd/fmpm.c b/drivers/ras/amd/fmpm.c new file mode 100644 index 0000000000000..80dd112b720af --- /dev/null +++ b/drivers/ras/amd/fmpm.c @@ -0,0 +1,812 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * FRU (Field-Replaceable Unit) Memory Poison Manager + * + * Copyright (c) 2024, Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Authors: + * Naveen Krishna Chatradhi + * Muralidhara M K + * Yazen Ghannam + * + * Implementation notes, assumptions, and limitations: + * + * - FRU memory poison section and memory poison descriptor definitions are not yet + * included in the UEFI specification. So they are defined here. Afterwards, they + * may be moved to linux/cper.h, if appropriate. + * + * - Platforms based on AMD MI300 systems will be the first to use these structures. + * There are a number of assumptions made here that will need to be generalized + * to support other platforms. + * + * AMD MI300-based platform(s) assumptions: + * - Memory errors are reported through x86 MCA. + * - The entire DRAM row containing a memory error should be retired. + * - There will be (1) FRU memory poison section per CPER. + * - The FRU will be the CPU package (processor socket). + * - The default number of memory poison descriptor entries should be (8). + * - The platform will use ACPI ERST for persistent storage. + * - All FRU records should be saved to persistent storage. Module init will + * fail if any FRU record is not successfully written. + * + * - Boot time memory retirement may occur later than ideal due to dependencies + * on other libraries and drivers. 
This leaves a gap where bad memory may be
+ *   accessed during early boot stages.
+ *
+ * - Enough memory should be pre-allocated for each FRU record to be able to hold
+ *   the expected number of descriptor entries. This, mostly empty, record is
+ *   written to storage during init time. Subsequent writes to the same record
+ *   should allow the Platform to update the stored record in-place. Otherwise,
+ *   if the record is extended, then the Platform may need to perform costly memory
+ *   management operations on the storage. For example, the Platform may spend time
+ *   in Firmware copying and invalidating memory on a relatively slow SPI ROM.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+/*
+ * NOTE(review): the header names below were lost when this patch was
+ * extracted; they are restored from the symbols this file uses (CPER record
+ * types, amd_retire_dram_row(), cpus_read_lock(), erst_*(), x86_match_cpu(),
+ * struct mce). Confirm against the original commit.
+ */
+#include <linux/cper.h>
+#include <linux/ras.h>
+#include <linux/cpu.h>
+
+#include <acpi/apei.h>
+
+#include <asm/cpu_device_id.h>
+#include <asm/mce.h>
+
+/* Sentinel meaning "no CPU found" in the online-CPU lookups below. */
+#define INVALID_CPU			UINT_MAX
+
+/* Validation Bits: flags kept in cper_sec_fru_mem_poison.validation_bits. */
+#define FMP_VALID_ARCH_TYPE		BIT_ULL(0)
+#define FMP_VALID_ARCH			BIT_ULL(1)
+#define FMP_VALID_ID_TYPE		BIT_ULL(2)
+#define FMP_VALID_ID			BIT_ULL(3)
+#define FMP_VALID_LIST_ENTRIES		BIT_ULL(4)
+#define FMP_VALID_LIST			BIT_ULL(5)
+
+/* FRU Architecture Types */
+#define FMP_ARCH_TYPE_X86_CPUID_1_EAX	0
+
+/* FRU ID Types */
+#define FMP_ID_TYPE_X86_PPIN		0
+
+/*
+ * FRU Memory Poison Section
+ *
+ * This structure is part of struct fru_rec, which is handed to erst_write()
+ * and filled by erst_read_record() as raw bytes. Field order, field widths
+ * and __packed therefore define the stored format and must not be changed.
+ */
+struct cper_sec_fru_mem_poison {
+	u32 checksum;
+	u64 validation_bits;
+	u32 fru_arch_type;
+	u64 fru_arch;
+	u32 fru_id_type;
+	u64 fru_id;
+	u32 nr_entries;
+} __packed;
+
+/* FRU Descriptor ID Types */
+#define FPD_HW_ID_TYPE_MCA_IPID		0
+
+/* FRU Descriptor Address Types */
+#define FPD_ADDR_TYPE_MCA_ADDR		0
+
+/*
+ * Memory Poison Descriptor
+ *
+ * One entry per recorded error; stored back to back after the section in
+ * struct fru_rec. Same layout constraints as the section above.
+ */
+struct cper_fru_poison_desc {
+	u64 timestamp;
+	u32 hw_id_type;
+	u64 hw_id;
+	u32 addr_type;
+	u64 addr;
+} __packed;
+
+/* Collection of headers and sections for easy pointer use. */
+struct fru_rec {
+	struct cper_record_header	hdr;
+	struct cper_section_descriptor	sec_desc;
+	struct cper_sec_fru_mem_poison	fmp;
+	struct cper_fru_poison_desc	entries[];
+} __packed;
+
+/*
+ * Pointers to the complete CPER record of each FRU.
+ *
+ * Memory allocation will include padded space for descriptor entries.
+ */
+static struct fru_rec **fru_records;
+
+#define CPER_CREATOR_FMP						\
+	GUID_INIT(0xcd5c2993, 0xf4b2, 0x41b2, 0xb5, 0xd4, 0xf9, 0xc3,	\
+		  0xa0, 0x33, 0x08, 0x75)
+
+#define CPER_SECTION_TYPE_FMP						\
+	GUID_INIT(0x5e4706c1, 0x5356, 0x48c6, 0x93, 0x0b, 0x52, 0xf2,	\
+		  0x12, 0x0a, 0x44, 0x58)
+
+/**
+ * DOC: max_nr_entries (byte)
+ * Maximum number of descriptor entries possible for each FRU.
+ *
+ * Values between '1' and '255' are valid.
+ * No input or '0' will default to FMPM_DEFAULT_MAX_NR_ENTRIES.
+ *
+ * The parameter is read-only once the module is loaded: the record length
+ * and the per-FRU buffers are sized from it a single time during init, and
+ * update_fru_record() uses it as the bound when appending entries. Raising
+ * it at run time would allow writes past the end of those buffers.
+ */
+static u8 max_nr_entries;
+module_param(max_nr_entries, byte, 0444);
+MODULE_PARM_DESC(max_nr_entries,
+		 "Maximum number of memory poison descriptor entries per FRU");
+
+#define FMPM_DEFAULT_MAX_NR_ENTRIES	8
+
+/* Maximum number of FRUs in the system. */
+#define FMPM_MAX_NR_FRU			256
+static unsigned int max_nr_fru;
+
+/* Total length of record including headers and list of descriptor entries. */
+static size_t max_rec_len;
+
+/*
+ * Protect the local records cache in fru_records and prevent concurrent
+ * writes to storage. This is only needed after init once notifier block
+ * registration is done.
+ */
+static DEFINE_MUTEX(fmpm_update_mutex);
+
+/*
+ * Walk every slot of fru_records[], assigning the slot's pointer to @rec.
+ *
+ * The bound is checked before the slot is loaded so that
+ * fru_records[max_nr_fru] is never read. The ", 1" keeps the loop going for
+ * NULL slots as well; free_records() relies on visiting those.
+ */
+#define for_each_fru(i, rec) \
+	for ((i) = 0; (i) < max_nr_fru && ((rec) = fru_records[(i)], 1); (i)++)
+
+/*
+ * Length in bytes of the FRU Memory Poison section (fixed part plus
+ * descriptor entries). In this module section_length also covers the section
+ * descriptor itself (see set_rec_fields()), hence the subtraction.
+ */
+static inline u32 get_fmp_len(struct fru_rec *rec)
+{
+	return rec->sec_desc.section_length - sizeof(struct cper_section_descriptor);
+}
+
+/*
+ * Look up the cached record whose fmp.fru_id equals @fru_id.
+ *
+ * Returns the record, or NULL (with a debug message) if no FRU matches.
+ */
+static struct fru_rec *get_fru_record(u64 fru_id)
+{
+	struct fru_rec *rec;
+	unsigned int i;
+
+	for_each_fru(i, rec) {
+		if (rec->fmp.fru_id == fru_id)
+			return rec;
+	}
+
+	pr_debug("Record not found for FRU 0x%016llx\n", fru_id);
+
+	return NULL;
+}
+
+/*
+ * Sum up all bytes within the FRU Memory Poison Section including the Memory
+ * Poison Descriptor entries.
+ *
+ * Don't include the old checksum here. It's a u32 value, so summing each of its
+ * bytes will give the wrong total.
+ */
+static u32 do_fmp_checksum(struct cper_sec_fru_mem_poison *fmp, u32 len)
+{
+	u32 checksum = 0;
+	u8 *buf, *end;
+
+	/* A section too short to hold the checksum field has nothing to sum. */
+	if (len < sizeof(u32))
+		return 0;
+
+	/*
+	 * Skip old checksum.
+	 *
+	 * @len is measured from the start of the section, so the end pointer
+	 * is based on @fmp rather than on the advanced @buf. Basing it on @buf
+	 * would sum four bytes that lie beyond the section (and beyond the
+	 * record buffer, since the section is the last thing in it).
+	 */
+	buf = (u8 *)fmp + sizeof(u32);
+	end = (u8 *)fmp + len;
+
+	while (buf < end)
+		checksum += *buf++;
+
+	return checksum;
+}
+
+/*
+ * Recompute the section checksum of @rec and write the whole record to ERST.
+ *
+ * Returns 0 on success or the error returned by erst_write().
+ */
+static int update_record_on_storage(struct fru_rec *rec)
+{
+	u32 len, checksum;
+	int ret;
+
+	/* Calculate a new checksum. */
+	len = get_fmp_len(rec);
+
+	/* Get the current total. */
+	checksum = do_fmp_checksum(&rec->fmp, len);
+
+	/* Use the complement value. */
+	rec->fmp.checksum = -checksum;
+
+	pr_debug("Writing to storage\n");
+
+	ret = erst_write(&rec->hdr);
+	if (ret) {
+		pr_warn("Storage update failed for FRU 0x%016llx\n", rec->fmp.fru_id);
+
+		if (ret == -ENOSPC)
+			pr_warn("Not enough space on storage\n");
+	}
+
+	return ret;
+}
+
+/* True when both the entry count and the entry list are flagged valid. */
+static bool rec_has_valid_entries(struct fru_rec *rec)
+{
+	if (!(rec->fmp.validation_bits & FMP_VALID_LIST_ENTRIES))
+		return false;
+
+	if (!(rec->fmp.validation_bits & FMP_VALID_LIST))
+		return false;
+
+	return true;
+}
+
+static bool fpds_equal(struct cper_fru_poison_desc *old, struct cper_fru_poison_desc *new)
+{
+	/*
+	 * Ignore timestamp field.
+	 * The same physical error may be reported multiple times due to stuck bits, etc.
+	 *
+	 * Also, order the checks from most->least likely to fail to shortcut the code.
+	 */
+	if (old->addr != new->addr)
+		return false;
+
+	if (old->hw_id != new->hw_id)
+		return false;
+
+	if (old->addr_type != new->addr_type)
+		return false;
+
+	if (old->hw_id_type != new->hw_id_type)
+		return false;
+
+	return true;
+}
+
+/* True when @rec already holds a descriptor equal to @fpd (timestamp aside). */
+static bool rec_has_fpd(struct fru_rec *rec, struct cper_fru_poison_desc *fpd)
+{
+	unsigned int i;
+
+	for (i = 0; i < rec->fmp.nr_entries; i++) {
+		struct cper_fru_poison_desc *fpd_i = &rec->entries[i];
+
+		if (fpds_equal(fpd_i, fpd)) {
+			pr_debug("Found duplicate record\n");
+			return true;
+		}
+	}
+
+	return false;
+}
+
+/*
+ * Append the error described by @m to @rec, unless it is already recorded or
+ * the record is full, and push the updated record to storage.
+ */
+static void update_fru_record(struct fru_rec *rec, struct mce *m)
+{
+	struct cper_sec_fru_mem_poison *fmp = &rec->fmp;
+	struct cper_fru_poison_desc fpd, *fpd_dest;
+	u32 entry = 0;
+
+	mutex_lock(&fmpm_update_mutex);
+
+	memset(&fpd, 0, sizeof(struct cper_fru_poison_desc));
+
+	fpd.timestamp	= m->time;
+	fpd.hw_id_type	= FPD_HW_ID_TYPE_MCA_IPID;
+	fpd.hw_id	= m->ipid;
+	fpd.addr_type	= FPD_ADDR_TYPE_MCA_ADDR;
+	fpd.addr	= m->addr;
+
+	/* This is the first entry, so just save it. */
+	if (!rec_has_valid_entries(rec))
+		goto save_fpd;
+
+	/* Ignore already recorded errors. */
+	if (rec_has_fpd(rec, &fpd))
+		goto out_unlock;
+
+	if (rec->fmp.nr_entries >= max_nr_entries) {
+		pr_warn("Exceeded number of entries for FRU 0x%016llx\n", rec->fmp.fru_id);
+		goto out_unlock;
+	}
+
+	entry = fmp->nr_entries;
+
+save_fpd:
+	fpd_dest = &rec->entries[entry];
+	memcpy(fpd_dest, &fpd, sizeof(struct cper_fru_poison_desc));
+
+	fmp->nr_entries		 = entry + 1;
+	fmp->validation_bits	|= FMP_VALID_LIST_ENTRIES;
+	fmp->validation_bits	|= FMP_VALID_LIST;
+
+	pr_debug("Updated FRU 0x%016llx entry #%u\n", fmp->fru_id, entry);
+
+	/* Failures are reported by update_record_on_storage() itself. */
+	update_record_on_storage(rec);
+
+out_unlock:
+	mutex_unlock(&fmpm_update_mutex);
+}
+
+/* Package @addr/@id/@cpu into an atl_err and ask the ATL to retire the row. */
+static void retire_dram_row(u64 addr, u64 id, u32 cpu)
+{
+	struct atl_err a_err;
+
+	memset(&a_err, 0, sizeof(struct atl_err));
+
+	a_err.addr = addr;
+	a_err.ipid = id;
+	a_err.cpu  = cpu;
+
+	amd_retire_dram_row(&a_err);
+}
+
+/*
+ * MCE decode-chain callback: retire the DRAM row of every memory error and
+ * record the error in the record of the FRU identified by the MCE's PPIN.
+ */
+static int fru_handle_mem_poison(struct notifier_block *nb, unsigned long val, void *data)
+{
+	struct mce *m = (struct mce *)data;
+	struct fru_rec *rec;
+
+	if (!mce_is_memory_error(m))
+		return NOTIFY_DONE;
+
+	retire_dram_row(m->addr, m->ipid, m->extcpu);
+
+	/*
+	 * An invalid FRU ID should not happen on real errors. But it
+	 * could happen from software error injection, etc.
+	 */
+	rec = get_fru_record(m->ppin);
+	if (!rec)
+		return NOTIFY_DONE;
+
+	update_fru_record(rec, m);
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block fru_mem_poison_nb = {
+	.notifier_call	= fru_handle_mem_poison,
+	.priority	= MCE_PRIO_LOWEST,
+};
+
+/*
+ * Retire the DRAM row of every descriptor entry in @rec whose ID and address
+ * types are the ones this module writes. Entries are skipped when no online
+ * CPU reports the record's FRU ID as its PPIN.
+ */
+static void retire_mem_fmp(struct fru_rec *rec)
+{
+	struct cper_sec_fru_mem_poison *fmp = &rec->fmp;
+	unsigned int i, cpu;
+
+	for (i = 0; i < fmp->nr_entries; i++) {
+		struct cper_fru_poison_desc *fpd = &rec->entries[i];
+		unsigned int err_cpu = INVALID_CPU;
+
+		if (fpd->hw_id_type != FPD_HW_ID_TYPE_MCA_IPID)
+			continue;
+
+		if (fpd->addr_type != FPD_ADDR_TYPE_MCA_ADDR)
+			continue;
+
+		cpus_read_lock();
+		for_each_online_cpu(cpu) {
+			if (topology_ppin(cpu) == fmp->fru_id) {
+				err_cpu = cpu;
+				break;
+			}
+		}
+		cpus_read_unlock();
+
+		if (err_cpu == INVALID_CPU)
+			continue;
+
+		retire_dram_row(fpd->addr, fpd->hw_id, err_cpu);
+	}
+}
+
+/* Retire the memory described by every cached record that has entries. */
+static void retire_mem_records(void)
+{
+	struct fru_rec *rec;
+	unsigned int i;
+
+	for_each_fru(i, rec) {
+		if (!rec_has_valid_entries(rec))
+			continue;
+
+		retire_mem_fmp(rec);
+	}
+}
+
+/* Set the CPER Record Header and CPER Section Descriptor fields. */
+static void set_rec_fields(struct fru_rec *rec)
+{
+	struct cper_section_descriptor	*sec_desc = &rec->sec_desc;
+	struct cper_record_header	*hdr	  = &rec->hdr;
+
+	memcpy(hdr->signature, CPER_SIG_RECORD, CPER_SIG_SIZE);
+	hdr->revision			= CPER_RECORD_REV;
+	hdr->signature_end		= CPER_SIG_END;
+
+	/*
+	 * Currently, it is assumed that there is one FRU Memory Poison
+	 * section per CPER. But this may change for other implementations.
+	 */
+	hdr->section_count		= 1;
+
+	/* The logged errors are recoverable. Otherwise, they'd never make it here. */
+	hdr->error_severity		= CPER_SEV_RECOVERABLE;
+
+	hdr->validation_bits		= 0;
+	hdr->record_length		= max_rec_len;
+	hdr->creator_id			= CPER_CREATOR_FMP;
+	hdr->notification_type		= CPER_NOTIFY_MCE;
+	hdr->record_id			= cper_next_record_id();
+	hdr->flags			= CPER_HW_ERROR_FLAGS_PREVERR;
+
+	sec_desc->section_offset	= sizeof(struct cper_record_header);
+	sec_desc->section_length	= max_rec_len - sizeof(struct cper_record_header);
+	sec_desc->revision		= CPER_SEC_REV;
+	sec_desc->validation_bits	= 0;
+	sec_desc->flags			= CPER_SEC_PRIMARY;
+	sec_desc->section_type		= CPER_SECTION_TYPE_FMP;
+	sec_desc->section_severity	= CPER_SEV_RECOVERABLE;
+}
+
+/*
+ * Create and store a record for every FRU that was not restored from storage
+ * (recognised by a zero hdr.record_length).
+ *
+ * On failure, the records newly written by this call are cleared again and
+ * the error is returned.
+ */
+static int save_new_records(void)
+{
+	/*
+	 * Must start out all-zero: the error path below tests bits for every
+	 * FRU, including those this call never reached.
+	 */
+	DECLARE_BITMAP(new_records, FMPM_MAX_NR_FRU) = { 0 };
+	struct fru_rec *rec;
+	unsigned int i;
+	int ret = 0;
+
+	for_each_fru(i, rec) {
+		if (rec->hdr.record_length)
+			continue;
+
+		set_rec_fields(rec);
+
+		ret = update_record_on_storage(rec);
+		if (ret)
+			goto out_clear;
+
+		set_bit(i, new_records);
+	}
+
+	return ret;
+
+out_clear:
+	for_each_fru(i, rec) {
+		if (!test_bit(i, new_records))
+			continue;
+
+		erst_clear(rec->hdr.record_id);
+	}
+
+	return ret;
+}
+
+/* Check that the record matches expected types for the current system.*/
+static bool fmp_is_usable(struct fru_rec *rec)
+{
+	struct cper_sec_fru_mem_poison *fmp = &rec->fmp;
+	u64 cpuid;
+
+	pr_debug("Validation bits: 0x%016llx\n", fmp->validation_bits);
+
+	if (!(fmp->validation_bits & FMP_VALID_ARCH_TYPE)) {
+		pr_debug("Arch type unknown\n");
+		return false;
+	}
+
+	if (fmp->fru_arch_type != FMP_ARCH_TYPE_X86_CPUID_1_EAX) {
+		pr_debug("Arch type not 'x86 Family/Model/Stepping'\n");
+		return false;
+	}
+
+	if (!(fmp->validation_bits & FMP_VALID_ARCH)) {
+		pr_debug("Arch value unknown\n");
+		return false;
+	}
+
+	cpuid = cpuid_eax(1);
+	if (fmp->fru_arch != cpuid) {
+		pr_debug("Arch value mismatch: record = 0x%016llx, system = 0x%016llx\n",
+			 fmp->fru_arch, cpuid);
+		return false;
+	}
+
+	if (!(fmp->validation_bits & FMP_VALID_ID_TYPE)) {
+		pr_debug("FRU ID type unknown\n");
+		return false;
+	}
+
+	if (fmp->fru_id_type != FMP_ID_TYPE_X86_PPIN) {
+		pr_debug("FRU ID type is not 'x86 PPIN'\n");
+		return false;
+	}
+
+	if (!(fmp->validation_bits & FMP_VALID_ID)) {
+		pr_debug("FRU ID value unknown\n");
+		return false;
+	}
+
+	return true;
+}
+
+/*
+ * Validate a record that was read from storage into a buffer of max_rec_len
+ * bytes: section length, checksum, then the type/ID fields.
+ */
+static bool fmp_is_valid(struct fru_rec *rec)
+{
+	struct cper_sec_fru_mem_poison *fmp = &rec->fmp;
+	u32 checksum, len;
+	size_t max_len;
+
+	len = get_fmp_len(rec);
+	if (len < sizeof(struct cper_sec_fru_mem_poison)) {
+		pr_debug("fmp length is too small\n");
+		return false;
+	}
+
+	/*
+	 * The length comes from storage. It must fit in what is left of the
+	 * buffer after the two headers, otherwise the checksum below would
+	 * walk off the end. This also catches a section_length smaller than
+	 * the descriptor, which wraps around in get_fmp_len().
+	 */
+	max_len = max_rec_len - sizeof(struct cper_record_header) -
+		  sizeof(struct cper_section_descriptor);
+	if (len > max_len) {
+		pr_debug("fmp length is too large\n");
+		return false;
+	}
+
+	/* Checksum must sum to zero for the entire section. */
+	checksum = do_fmp_checksum(fmp, len) + fmp->checksum;
+	if (checksum) {
+		pr_debug("fmp checksum failed: sum = 0x%x\n", checksum);
+		print_hex_dump_debug("fmp record: ", DUMP_PREFIX_NONE, 16, 1, fmp, len, false);
+		return false;
+	}
+
+	if (!fmp_is_usable(rec))
+		return false;
+
+	return true;
+}
+
+/*
+ * Return the cached record that @old (read from storage) should be restored
+ * into, or NULL if @old is invalid or names a FRU that is not present.
+ */
+static struct fru_rec *get_valid_record(struct fru_rec *old)
+{
+	struct fru_rec *new;
+
+	if (!fmp_is_valid(old)) {
+		pr_debug("Ignoring invalid record\n");
+		return NULL;
+	}
+
+	new = get_fru_record(old->fmp.fru_id);
+	if (!new)
+		pr_debug("Ignoring record for absent FRU\n");
+
+	return new;
+}
+
+/*
+ * Fetch saved records from persistent storage.
+ *
+ * For each found record:
+ * - If it was not created by this module, then ignore it.
+ * - If it is valid, then copy its data to the local cache.
+ * - If it is not valid, then erase it.
+ */
+static int get_saved_records(void)
+{
+	struct fru_rec *old, *new;
+	u64 record_id;
+	int ret, pos;
+	ssize_t len;
+
+	/*
+	 * Assume saved records match current max size.
+	 *
+	 * However, this may not be true depending on module parameters.
+	 */
+	old = kmalloc(max_rec_len, GFP_KERNEL);
+	if (!old) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = erst_get_record_id_begin(&pos);
+	if (ret < 0)
+		goto out_end;
+
+	while (!erst_get_record_id_next(&pos, &record_id)) {
+		if (record_id == APEI_ERST_INVALID_RECORD_ID)
+			goto out_end;
+		/*
+		 * Make sure to clear temporary buffer between reads to avoid
+		 * leftover data from records of various sizes.
+		 */
+		memset(old, 0, max_rec_len);
+
+		len = erst_read_record(record_id, &old->hdr, max_rec_len,
+				       sizeof(struct fru_rec), &CPER_CREATOR_FMP);
+		if (len < 0)
+			continue;
+
+		if (len > max_rec_len) {
+			pr_debug("Found record larger than max_rec_len\n");
+			continue;
+		}
+
+		new = get_valid_record(old);
+		if (!new) {
+			/* Nothing to restore into; erase it and move on. */
+			erst_clear(record_id);
+			continue;
+		}
+
+		/* Restore the record */
+		memcpy(new, old, len);
+	}
+
+out_end:
+	erst_get_record_id_end();
+	kfree(old);
+out:
+	return ret;
+}
+
+/* Fill the arch and ID fields of @rec's section from @cpu and flag them valid. */
+static void set_fmp_fields(struct fru_rec *rec, unsigned int cpu)
+{
+	struct cper_sec_fru_mem_poison *fmp = &rec->fmp;
+
+	fmp->fru_arch_type    = FMP_ARCH_TYPE_X86_CPUID_1_EAX;
+	fmp->validation_bits |= FMP_VALID_ARCH_TYPE;
+
+	/* Assume all CPUs in the system have the same value for now. */
+	fmp->fru_arch	      = cpuid_eax(1);
+	fmp->validation_bits |= FMP_VALID_ARCH;
+
+	fmp->fru_id_type      = FMP_ID_TYPE_X86_PPIN;
+	fmp->validation_bits |= FMP_VALID_ID_TYPE;
+
+	fmp->fru_id	      = topology_ppin(cpu);
+	fmp->validation_bits |= FMP_VALID_ID;
+}
+
+/*
+ * Initialize the section of every record from an online CPU whose physical
+ * package ID equals the record's index.
+ *
+ * Returns -ENODEV if some index has no such CPU.
+ */
+static int init_fmps(void)
+{
+	struct fru_rec *rec;
+	unsigned int i, cpu;
+	int ret = 0;
+
+	for_each_fru(i, rec) {
+		unsigned int fru_cpu = INVALID_CPU;
+
+		cpus_read_lock();
+		for_each_online_cpu(cpu) {
+			if (topology_physical_package_id(cpu) == i) {
+				fru_cpu = cpu;
+				break;
+			}
+		}
+		cpus_read_unlock();
+
+		if (fru_cpu == INVALID_CPU) {
+			pr_debug("Failed to find matching CPU for FRU #%u\n", i);
+			ret = -ENODEV;
+			break;
+		}
+
+		set_fmp_fields(rec, fru_cpu);
+	}
+
+	return ret;
+}
+
+/*
+ * Check that the system is supported and derive max_nr_fru, max_nr_entries
+ * and max_rec_len.
+ */
+static int get_system_info(void)
+{
+	/* Only load on MI300A systems for now. */
+	if (!(boot_cpu_data.x86_model >= 0x90 &&
+	      boot_cpu_data.x86_model <= 0x9f))
+		return -ENODEV;
+
+	if (!cpu_feature_enabled(X86_FEATURE_AMD_PPIN)) {
+		pr_debug("PPIN feature not available\n");
+		return -ENODEV;
+	}
+
+	/* Use CPU socket as FRU for MI300 systems. */
+	max_nr_fru = topology_max_packages();
+	if (!max_nr_fru)
+		return -ENODEV;
+
+	if (max_nr_fru > FMPM_MAX_NR_FRU) {
+		pr_warn("Too many FRUs to manage: found: %u, max: %u\n",
+			max_nr_fru, FMPM_MAX_NR_FRU);
+		return -ENODEV;
+	}
+
+	if (!max_nr_entries)
+		max_nr_entries = FMPM_DEFAULT_MAX_NR_ENTRIES;
+
+	max_rec_len  = sizeof(struct fru_rec);
+	max_rec_len += sizeof(struct cper_fru_poison_desc) * max_nr_entries;
+
+	/* max_rec_len is a size_t, hence %zu. */
+	pr_info("max FRUs: %u, max entries: %u, max record length: %zu\n",
+		 max_nr_fru, max_nr_entries, max_rec_len);
+
+	return 0;
+}
+
+/* Free every per-FRU record and the pointer array itself. */
+static void free_records(void)
+{
+	struct fru_rec *rec;
+	unsigned int i;
+
+	for_each_fru(i, rec)
+		kfree(rec);
+
+	kfree(fru_records);
+}
+
+/*
+ * Allocate the pointer array and one zeroed record of max_rec_len bytes per
+ * FRU. On failure everything allocated so far is released.
+ */
+static int allocate_records(void)
+{
+	int i, ret = 0;
+
+	fru_records = kcalloc(max_nr_fru, sizeof(struct fru_rec *), GFP_KERNEL);
+	if (!fru_records) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	for (i = 0; i < max_nr_fru; i++) {
+		fru_records[i] = kzalloc(max_rec_len, GFP_KERNEL);
+		if (!fru_records[i]) {
+			ret = -ENOMEM;
+			goto out_free;
+		}
+	}
+
+	return ret;
+
+out_free:
+	/* Slot i is the one that failed; unwind from the one before it. */
+	while (--i >= 0)
+		kfree(fru_records[i]);
+
+	kfree(fru_records);
+out:
+	return ret;
+}
+
+static const struct x86_cpu_id fmpm_cpuids[] = {
+	X86_MATCH_VENDOR_FAM(AMD, 0x19, NULL),
+	{ }
+};
+MODULE_DEVICE_TABLE(x86cpu, fmpm_cpuids);
+
+/*
+ * Module init: check platform support, build the per-FRU record cache,
+ * restore saved records, store records for FRUs that had none, retire the
+ * memory named by restored records, then start listening for MCEs.
+ */
+static int __init fru_mem_poison_init(void)
+{
+	int ret;
+
+	if (!x86_match_cpu(fmpm_cpuids)) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	if (erst_disable) {
+		pr_debug("ERST not available\n");
+		ret = -ENODEV;
+		goto out;
+	}
+
+	ret = get_system_info();
+	if (ret)
+		goto out;
+
+	ret = allocate_records();
+	if (ret)
+		goto out;
+
+	ret = init_fmps();
+	if (ret)
+		goto out_free;
+
+	ret = get_saved_records();
+	if (ret)
+		goto out_free;
+
+	ret = save_new_records();
+	if (ret)
+		goto out_free;
+
+	retire_mem_records();
+
+	mce_register_decode_chain(&fru_mem_poison_nb);
+
+	pr_info("FRU Memory Poison Manager initialized\n");
+	return 0;
+
+out_free:
+	free_records();
+out:
+	return ret;
+}
+
+/* Module exit: stop receiving MCEs first, then release the record cache. */
+static void __exit fru_mem_poison_exit(void)
+{
+	mce_unregister_decode_chain(&fru_mem_poison_nb);
+	free_records();
+}
+
+module_init(fru_mem_poison_init);
+module_exit(fru_mem_poison_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("FRU Memory Poison Manager");
-- cgit 1.2.3-korg