From 7474b1c82be3780692d537d331f9aa7fc1e5a368 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Fri, 19 Apr 2024 11:34:47 -0700 Subject: bnxt_en: refactor reset close code Introduce bnxt_fw_fatal_close() API which can be used to stop data path and disable device when firmware is in fatal state. Signed-off-by: Vikas Gupta Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 57e61f9631678..c852f87c842f3 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -13037,6 +13037,16 @@ static void bnxt_rx_ring_reset(struct bnxt *bp) bnxt_rtnl_unlock_sp(bp); } +static void bnxt_fw_fatal_close(struct bnxt *bp) +{ + bnxt_tx_disable(bp); + bnxt_disable_napi(bp); + bnxt_disable_int_sync(bp); + bnxt_free_irq(bp); + bnxt_clear_int_mode(bp); + pci_disable_device(bp->pdev); +} + static void bnxt_fw_reset_close(struct bnxt *bp) { bnxt_ulp_stop(bp); @@ -13050,12 +13060,7 @@ static void bnxt_fw_reset_close(struct bnxt *bp) pci_read_config_word(bp->pdev, PCI_SUBSYSTEM_ID, &val); if (val == 0xffff) bp->fw_reset_min_dsecs = 0; - bnxt_tx_disable(bp); - bnxt_disable_napi(bp); - bnxt_disable_int_sync(bp); - bnxt_free_irq(bp); - bnxt_clear_int_mode(bp); - pci_disable_device(bp->pdev); + bnxt_fw_fatal_close(bp); } __bnxt_close_nic(bp, true, false); bnxt_vf_reps_free(bp); -- cgit 1.2.3-korg From a1acdc226baec331512f815d6ac9dd6f8435cc7f Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Fri, 19 Apr 2024 11:34:48 -0700 Subject: bnxt_en: Fix the PCI-AER routines We do not support two simultaneous recoveries so check for reset flag, BNXT_STATE_IN_FW_RESET, and do not proceed with AER further. When the pci channel state is pci_channel_io_frozen, the PCIe link can not be trusted so we disable the traffic immediately and stop BAR access by calling bnxt_fw_fatal_close(). BAR access after AER fatal error can cause an NMI. Fixes: f75d9a0aa967 ("bnxt_en: Re-write PCI BARs after PCI fatal error.") Signed-off-by: Vikas Gupta Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index c852f87c842f3..86c1c30c70d5f 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -15378,6 +15378,7 @@ static pci_ers_result_t bnxt_io_error_detected(struct pci_dev *pdev, { struct net_device *netdev = pci_get_drvdata(pdev); struct bnxt *bp = netdev_priv(netdev); + bool abort = false; netdev_info(netdev, "PCI I/O error detected\n"); @@ -15386,16 +15387,27 @@ static pci_ers_result_t bnxt_io_error_detected(struct pci_dev *pdev, bnxt_ulp_stop(bp); - if (state == pci_channel_io_perm_failure) { + if (test_and_set_bit(BNXT_STATE_IN_FW_RESET, &bp->state)) { + netdev_err(bp->dev, "Firmware reset already in progress\n"); + abort = true; + } + + if (abort || state == pci_channel_io_perm_failure) { rtnl_unlock(); return PCI_ERS_RESULT_DISCONNECT; } - if (state == pci_channel_io_frozen) + /* Link is not reliable anymore if state is pci_channel_io_frozen + * so we disable bus master to prevent any potential bad DMAs before + * freeing kernel memory. + */ + if (state == pci_channel_io_frozen) { set_bit(BNXT_STATE_PCI_CHANNEL_IO_FROZEN, &bp->state); + bnxt_fw_fatal_close(bp); + } if (netif_running(netdev)) - bnxt_close(netdev); + __bnxt_close_nic(bp, true, true); if (pci_is_enabled(pdev)) pci_disable_device(pdev); @@ -15479,6 +15491,7 @@ static pci_ers_result_t bnxt_io_slot_reset(struct pci_dev *pdev) } reset_exit: + clear_bit(BNXT_STATE_IN_FW_RESET, &bp->state); bnxt_clear_reservations(bp, true); rtnl_unlock(); -- cgit 1.2.3-korg From 41e54045b741daf61e03c82d442227af3d12111f Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Fri, 19 Apr 2024 11:34:49 -0700 Subject: bnxt_en: Fix error recovery for 5760X (P7) chips During error recovery, such as AER fatal error slot reset, we call bnxt_try_map_fw_health_reg() to try to get access to the health register to determine the firmware state. Fix bnxt_try_map_fw_health_reg() to recognize the P7 chip correctly and set up the health register. This fixes this type of AER slot reset failure: bnxt_en 0000:04:00.0: AER: PCIe Bus Error: severity=Uncorrectable (Fatal), type=Inaccessible, (Unregistered Agent ID) bnxt_en 0000:04:00.0 enp4s0f0np0: PCI I/O error detected bnxt_en 0000:04:00.0 bnxt_re0: Handle device suspend call bnxt_en 0000:04:00.1 enp4s0f1np1: PCI I/O error detected bnxt_en 0000:04:00.1 bnxt_re1: Handle device suspend call pcieport 0000:00:02.0: AER: Root Port link has been reset (0) bnxt_en 0000:04:00.0 enp4s0f0np0: PCI Slot Reset bnxt_en 0000:04:00.0: enabling device (0000 -> 0002) bnxt_en 0000:04:00.0: Firmware not ready bnxt_en 0000:04:00.1 enp4s0f1np1: PCI Slot Reset bnxt_en 0000:04:00.1: enabling device (0000 -> 0002) bnxt_en 0000:04:00.1: Firmware not ready pcieport 0000:00:02.0: AER: device recovery failed Fixes: a432a45bdba4 ("bnxt_en: Define basic P7 macros") Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 86c1c30c70d5f..ed04a90a4fdde 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -9089,7 +9089,7 @@ static void bnxt_try_map_fw_health_reg(struct bnxt *bp) BNXT_FW_HEALTH_WIN_BASE + BNXT_GRC_REG_CHIP_NUM); } - if (!BNXT_CHIP_P5(bp)) + if (!BNXT_CHIP_P5_PLUS(bp)) return; status_loc = BNXT_GRC_REG_STATUS_P5 | -- cgit 1.2.3-korg