From 0553e753ea9ee724acaf6b3dfc7354702af83567 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 9 Apr 2024 22:08:09 +0300 Subject: net/mlx5: E-switch, store eswitch pointer before registering devlink_param Next patch will move devlink register to be first. Therefore, whenever mlx5 will register a param, the user will be notified. In order to notify the user, devlink is using the get() callback of the param. Hence, resources that are being used by the get() callback must be set before the devlink param is registered. Therefore, store eswitch pointer inside mdev before registering the param. Signed-off-by: Shay Drory Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240409190820.227554-2-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 9 +++------ drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 4 ++++ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index 3047d7015c525..1789800faaeb6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -1868,6 +1868,7 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev) if (err) goto abort; + dev->priv.eswitch = esw; err = esw_offloads_init(esw); if (err) goto reps_err; @@ -1892,11 +1893,6 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev) esw->offloads.encap = DEVLINK_ESWITCH_ENCAP_MODE_BASIC; else esw->offloads.encap = DEVLINK_ESWITCH_ENCAP_MODE_NONE; - if (MLX5_ESWITCH_MANAGER(dev) && - mlx5_esw_vport_match_metadata_supported(esw)) - esw->flags |= MLX5_ESWITCH_VPORT_MATCH_METADATA; - - dev->priv.eswitch = esw; BLOCKING_INIT_NOTIFIER_HEAD(&esw->n_head); esw_info(dev, @@ -1908,6 +1904,7 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev) reps_err: mlx5_esw_vports_cleanup(esw); + dev->priv.eswitch = NULL; abort: if (esw->work_queue) destroy_workqueue(esw->work_queue); @@ -1926,7 +1923,6 @@ void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw) esw_info(esw->dev, "cleanup\n"); - esw->dev->priv.eswitch = NULL; destroy_workqueue(esw->work_queue); WARN_ON(refcount_read(&esw->qos.refcnt)); mutex_destroy(&esw->state_lock); @@ -1937,6 +1933,7 @@ void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw) mutex_destroy(&esw->offloads.encap_tbl_lock); mutex_destroy(&esw->offloads.decap_tbl_lock); esw_offloads_cleanup(esw); + esw->dev->priv.eswitch = NULL; mlx5_esw_vports_cleanup(esw); debugfs_remove_recursive(esw->debugfs_root); devl_params_unregister(priv_to_devlink(esw->dev), mlx5_eswitch_params, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index baaae628b0a0f..e3cce110e52fd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -2476,6 +2476,10 @@ int esw_offloads_init(struct mlx5_eswitch *esw) if (err) return err; + if (MLX5_ESWITCH_MANAGER(esw->dev) && + mlx5_esw_vport_match_metadata_supported(esw)) + esw->flags |= MLX5_ESWITCH_VPORT_MATCH_METADATA; + err = devl_params_register(priv_to_devlink(esw->dev), esw_devlink_params, ARRAY_SIZE(esw_devlink_params)); -- cgit 1.2.3-korg From c6e77aa9dd82bc18a89bf49418f8f7e961cfccc8 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 9 Apr 2024 22:08:10 +0300 Subject: net/mlx5: Register devlink first under devlink lock In case device is having a non fatal FW error during probe, the driver will report the error to user via devlink. This will trigger a WARN_ON, since mlx5 is calling devlink_register() last. In order to avoid the WARN_ON[1], change mlx5 to invoke devl_register() first under devlink lock. [1] WARNING: CPU: 5 PID: 227 at net/devlink/health.c:483 devlink_recover_notify.constprop.0+0xb8/0xc0 CPU: 5 PID: 227 Comm: kworker/u16:3 Not tainted 6.4.0-rc5_for_upstream_min_debug_2023_06_12_12_38 #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Workqueue: mlx5_health0000:08:00.0 mlx5_fw_reporter_err_work [mlx5_core] RIP: 0010:devlink_recover_notify.constprop.0+0xb8/0xc0 Call Trace: ? __warn+0x79/0x120 ? devlink_recover_notify.constprop.0+0xb8/0xc0 ? report_bug+0x17c/0x190 ? handle_bug+0x3c/0x60 ? exc_invalid_op+0x14/0x70 ? asm_exc_invalid_op+0x16/0x20 ? devlink_recover_notify.constprop.0+0xb8/0xc0 devlink_health_report+0x4a/0x1c0 mlx5_fw_reporter_err_work+0xa4/0xd0 [mlx5_core] process_one_work+0x1bb/0x3c0 ? process_one_work+0x3c0/0x3c0 worker_thread+0x4d/0x3c0 ? process_one_work+0x3c0/0x3c0 kthread+0xc6/0xf0 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x1f/0x30 Fixes: cf530217408e ("devlink: Notify users when objects are accessible") Signed-off-by: Shay Drory Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240409190820.227554-3-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 37 ++++++++++++---------- .../ethernet/mellanox/mlx5/core/sf/dev/driver.c | 1 - 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index c2593625c09ad..59806553889e9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1480,6 +1480,14 @@ int mlx5_init_one_devl_locked(struct mlx5_core_dev *dev) if (err) goto err_register; + err = mlx5_crdump_enable(dev); + if (err) + mlx5_core_err(dev, "mlx5_crdump_enable failed with error code %d\n", err); + + err = mlx5_hwmon_dev_register(dev); + if (err) + mlx5_core_err(dev, "mlx5_hwmon_dev_register failed with error code %d\n", err); + mutex_unlock(&dev->intf_state_mutex); return 0; @@ -1505,7 +1513,10 @@ int mlx5_init_one(struct mlx5_core_dev *dev) int err; devl_lock(devlink); + devl_register(devlink); err = mlx5_init_one_devl_locked(dev); + if (err) + devl_unregister(devlink); devl_unlock(devlink); return err; } @@ -1517,6 +1528,8 @@ void mlx5_uninit_one(struct mlx5_core_dev *dev) devl_lock(devlink); mutex_lock(&dev->intf_state_mutex); + mlx5_hwmon_dev_unregister(dev); + mlx5_crdump_disable(dev); mlx5_unregister_device(dev); if (!test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) { @@ -1534,6 +1547,7 @@ void mlx5_uninit_one(struct mlx5_core_dev *dev) mlx5_function_teardown(dev, true); out: mutex_unlock(&dev->intf_state_mutex); + devl_unregister(devlink); devl_unlock(devlink); } @@ -1680,16 +1694,20 @@ int mlx5_init_one_light(struct mlx5_core_dev *dev) } devl_lock(devlink); + devl_register(devlink); + err = mlx5_devlink_params_register(priv_to_devlink(dev)); - devl_unlock(devlink); if (err) { mlx5_core_warn(dev, "mlx5_devlink_param_reg err = %d\n", err); goto query_hca_caps_err; } + devl_unlock(devlink); return 0; query_hca_caps_err: + devl_unregister(devlink); + devl_unlock(devlink); mlx5_function_disable(dev, true); out: dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; @@ -1702,6 +1720,7 @@ void mlx5_uninit_one_light(struct mlx5_core_dev *dev) devl_lock(devlink); mlx5_devlink_params_unregister(priv_to_devlink(dev)); + devl_unregister(devlink); devl_unlock(devlink); if (dev->state != MLX5_DEVICE_STATE_UP) return; @@ -1943,16 +1962,7 @@ static int probe_one(struct pci_dev *pdev, const struct pci_device_id *id) goto err_init_one; } - err = mlx5_crdump_enable(dev); - if (err) - dev_err(&pdev->dev, "mlx5_crdump_enable failed with error code %d\n", err); - - err = mlx5_hwmon_dev_register(dev); - if (err) - mlx5_core_err(dev, "mlx5_hwmon_dev_register failed with error code %d\n", err); - pci_save_state(pdev); - devlink_register(devlink); return 0; err_init_one: @@ -1973,16 +1983,9 @@ static void remove_one(struct pci_dev *pdev) struct devlink *devlink = priv_to_devlink(dev); set_bit(MLX5_BREAK_FW_WAIT, &dev->intf_state); - /* mlx5_drain_fw_reset() and mlx5_drain_health_wq() are using - * devlink notify APIs. - * Hence, we must drain them before unregistering the devlink. - */ mlx5_drain_fw_reset(dev); mlx5_drain_health_wq(dev); - devlink_unregister(devlink); mlx5_sriov_disable(pdev, false); - mlx5_hwmon_dev_unregister(dev); - mlx5_crdump_disable(dev); mlx5_uninit_one(dev); mlx5_pci_close(dev); mlx5_mdev_uninit(dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c index bc863e1f062e6..e3bf8c7e4baa6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c @@ -101,7 +101,6 @@ static void mlx5_sf_dev_remove(struct auxiliary_device *adev) devlink = priv_to_devlink(mdev); set_bit(MLX5_BREAK_FW_WAIT, &mdev->intf_state); mlx5_drain_health_wq(mdev); - devlink_unregister(devlink); if (mlx5_dev_is_lightweight(mdev)) mlx5_uninit_one_light(mdev); else -- cgit 1.2.3-korg From 9f7e8fbb91f8fa29548e2f6ab50c03b628c67ede Mon Sep 17 00:00:00 2001 From: Michael Liang Date: Tue, 9 Apr 2024 22:08:11 +0300 Subject: net/mlx5: offset comp irq index in name by one The mlx5 comp irq name scheme is changed a little bit between commit 3663ad34bc70 ("net/mlx5: Shift control IRQ to the last index") and commit 3354822cde5a ("net/mlx5: Use dynamic msix vectors allocation"). The index in the comp irq name used to start from 0 but now it starts from 1. There is nothing critical here, but it's harmless to change back to the old behavior, a.k.a starting from 0. Fixes: 3354822cde5a ("net/mlx5: Use dynamic msix vectors allocation") Reviewed-by: Mohamed Khalfella Reviewed-by: Yuanyuan Zhong Signed-off-by: Michael Liang Reviewed-by: Shay Drory Signed-off-by: Saeed Mahameed Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240409190820.227554-4-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c index 4dcf995cb1a20..6bac8ad70ba60 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c @@ -19,6 +19,7 @@ #define MLX5_IRQ_CTRL_SF_MAX 8 /* min num of vectors for SFs to be enabled */ #define MLX5_IRQ_VEC_COMP_BASE_SF 2 +#define MLX5_IRQ_VEC_COMP_BASE 1 #define MLX5_EQ_SHARE_IRQ_MAX_COMP (8) #define MLX5_EQ_SHARE_IRQ_MAX_CTRL (UINT_MAX) @@ -246,6 +247,7 @@ static void irq_set_name(struct mlx5_irq_pool *pool, char *name, int vecidx) return; } + vecidx -= MLX5_IRQ_VEC_COMP_BASE; snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_comp%d", vecidx); } @@ -585,7 +587,7 @@ struct mlx5_irq *mlx5_irq_request_vector(struct mlx5_core_dev *dev, u16 cpu, struct mlx5_irq_table *table = mlx5_irq_table_get(dev); struct mlx5_irq_pool *pool = table->pcif_pool; struct irq_affinity_desc af_desc; - int offset = 1; + int offset = MLX5_IRQ_VEC_COMP_BASE; if (!pool->xa_num_irqs.max) offset = 0; -- cgit 1.2.3-korg From 7c6782ad4911cbee874e85630226ed389ff2e453 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Tue, 9 Apr 2024 22:08:12 +0300 Subject: net/mlx5: Properly link new fs rules into the tree Previously, add_rule_fg would only add newly created rules from the handle into the tree when they had a refcount of 1. On the other hand, create_flow_handle tries hard to find and reference already existing identical rules instead of creating new ones. These two behaviors can result in a situation where create_flow_handle 1) creates a new rule and references it, then 2) in a subsequent step during the same handle creation references it again, resulting in a rule with a refcount of 2 that is not linked into the tree, will have a NULL parent and root and will result in a crash when the flow group is deleted because del_sw_hw_rule, invoked on rule deletion, assumes node->parent is != NULL. This happened in the wild, due to another bug related to incorrect handling of duplicate pkt_reformat ids, which lead to the code in create_flow_handle incorrectly referencing a just-added rule in the same flow handle, resulting in the problem described above. Full details are at [1]. This patch changes add_rule_fg to add new rules without parents into the tree, properly initializing them and avoiding the crash. This makes it more consistent with how rules are added to an FTE in create_flow_handle. Fixes: 74491de93712 ("net/mlx5: Add multi dest support") Link: https://lore.kernel.org/netdev/ea5264d6-6b55-4449-a602-214c6f509c1e@163.com/T/#u [1] Signed-off-by: Cosmin Ratiu Reviewed-by: Tariq Toukan Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240409190820.227554-5-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index e6bfa7e4f146c..2a9421342a503 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -1808,8 +1808,9 @@ static struct mlx5_flow_handle *add_rule_fg(struct mlx5_flow_group *fg, } trace_mlx5_fs_set_fte(fte, false); + /* Link newly added rules into the tree. */ for (i = 0; i < handle->num_rules; i++) { - if (refcount_read(&handle->rule[i]->node.refcount) == 1) { + if (!handle->rule[i]->node.parent) { tree_add_node(&handle->rule[i]->node, &fte->node); trace_mlx5_fs_add_rule(handle->rule[i]); } -- cgit 1.2.3-korg From 9eca93f4d5ab03905516a68683674d9c50ff95bd Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Tue, 9 Apr 2024 22:08:13 +0300 Subject: net/mlx5: Correctly compare pkt reformat ids struct mlx5_pkt_reformat contains a naked union of a u32 id and a dr_action pointer which is used when the action is SW-managed (when pkt_reformat.owner is set to MLX5_FLOW_RESOURCE_OWNER_SW). Using id directly in that case is incorrect, as it maps to the least significant 32 bits of the 64-bit pointer in mlx5_fs_dr_action and not to the pkt reformat id allocated in firmware. For the purpose of comparing whether two rules are identical, interpreting the least significant 32 bits of the mlx5_fs_dr_action pointer as an id mostly works... until it breaks horribly and produces the outcome described in [1]. This patch fixes mlx5_flow_dests_cmp to correctly compare ids using mlx5_fs_dr_action_get_pkt_reformat_id for the SW-managed rules. Link: https://lore.kernel.org/netdev/ea5264d6-6b55-4449-a602-214c6f509c1e@163.com/T/#u [1] Fixes: 6a48faeeca10 ("net/mlx5: Add direct rule fs_cmd implementation") Signed-off-by: Cosmin Ratiu Reviewed-by: Mark Bloch Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240409190820.227554-6-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 2a9421342a503..cf085a478e3e4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -1664,6 +1664,16 @@ static int create_auto_flow_group(struct mlx5_flow_table *ft, return err; } +static bool mlx5_pkt_reformat_cmp(struct mlx5_pkt_reformat *p1, + struct mlx5_pkt_reformat *p2) +{ + return p1->owner == p2->owner && + (p1->owner == MLX5_FLOW_RESOURCE_OWNER_FW ? + p1->id == p2->id : + mlx5_fs_dr_action_get_pkt_reformat_id(p1) == + mlx5_fs_dr_action_get_pkt_reformat_id(p2)); +} + static bool mlx5_flow_dests_cmp(struct mlx5_flow_destination *d1, struct mlx5_flow_destination *d2) { @@ -1675,8 +1685,8 @@ static bool mlx5_flow_dests_cmp(struct mlx5_flow_destination *d1, ((d1->vport.flags & MLX5_FLOW_DEST_VPORT_VHCA_ID) ? (d1->vport.vhca_id == d2->vport.vhca_id) : true) && ((d1->vport.flags & MLX5_FLOW_DEST_VPORT_REFORMAT_ID) ? - (d1->vport.pkt_reformat->id == - d2->vport.pkt_reformat->id) : true)) || + mlx5_pkt_reformat_cmp(d1->vport.pkt_reformat, + d2->vport.pkt_reformat) : true)) || (d1->type == MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE && d1->ft == d2->ft) || (d1->type == MLX5_FLOW_DESTINATION_TYPE_TIR && -- cgit 1.2.3-korg From ee3572409f74a838154af74ce1e56e62c17786a8 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Tue, 9 Apr 2024 22:08:14 +0300 Subject: net/mlx5e: RSS, Block changing channels number when RXFH is configured Changing the channels number after configuring the receive flow hash indirection table may affect the RSS table size. The previous configuration may no longer be compatible with the new receive flow hash indirection table. Block changing the channels number when RXFH is configured and changing the channels number requires resizing the RSS table size. Fixes: 74a8dadac17e ("net/mlx5e: Preparations for supporting larger number of channels") Signed-off-by: Carolina Jubran Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240409190820.227554-7-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index cc51ce16df14a..93461b0c5703b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -451,6 +451,23 @@ int mlx5e_ethtool_set_channels(struct mlx5e_priv *priv, mutex_lock(&priv->state_lock); + /* If RXFH is configured, changing the channels number is allowed only if + * it does not require resizing the RSS table. This is because the previous + * configuration may no longer be compatible with the new RSS table. + */ + if (netif_is_rxfh_configured(priv->netdev)) { + int cur_rqt_size = mlx5e_rqt_size(priv->mdev, cur_params->num_channels); + int new_rqt_size = mlx5e_rqt_size(priv->mdev, count); + + if (new_rqt_size != cur_rqt_size) { + err = -EINVAL; + netdev_err(priv->netdev, + "%s: RXFH is configured, block changing channels number that affects RSS table size (new: %d, current: %d)\n", + __func__, new_rqt_size, cur_rqt_size); + goto out; + } + } + /* Don't allow changing the number of channels if HTB offload is active, * because the numeration of the QoS SQs will change, while per-queue * qdiscs are attached. -- cgit 1.2.3-korg From ecb829459a841198e142f72fadab56424ae96519 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Tue, 9 Apr 2024 22:08:15 +0300 Subject: net/mlx5e: Fix mlx5e_priv_init() cleanup flow When mlx5e_priv_init() fails, the cleanup flow calls mlx5e_selq_cleanup which calls mlx5e_selq_apply() that assures that the `priv->state_lock` is held using lockdep_is_held(). Acquire the state_lock in mlx5e_selq_cleanup(). Kernel log: ============================= WARNING: suspicious RCU usage 6.8.0-rc3_net_next_841a9b5 #1 Not tainted ----------------------------- drivers/net/ethernet/mellanox/mlx5/core/en/selq.c:124 suspicious rcu_dereference_protected() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 2 locks held by systemd-modules/293: #0: ffffffffa05067b0 (devices_rwsem){++++}-{3:3}, at: ib_register_client+0x109/0x1b0 [ib_core] #1: ffff8881096c65c0 (&device->client_data_rwsem){++++}-{3:3}, at: add_client_context+0x104/0x1c0 [ib_core] stack backtrace: CPU: 4 PID: 293 Comm: systemd-modules Not tainted 6.8.0-rc3_net_next_841a9b5 #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl+0x8a/0xa0 lockdep_rcu_suspicious+0x154/0x1a0 mlx5e_selq_apply+0x94/0xa0 [mlx5_core] mlx5e_selq_cleanup+0x3a/0x60 [mlx5_core] mlx5e_priv_init+0x2be/0x2f0 [mlx5_core] mlx5_rdma_setup_rn+0x7c/0x1a0 [mlx5_core] rdma_init_netdev+0x4e/0x80 [ib_core] ? mlx5_rdma_netdev_free+0x70/0x70 [mlx5_core] ipoib_intf_init+0x64/0x550 [ib_ipoib] ipoib_intf_alloc+0x4e/0xc0 [ib_ipoib] ipoib_add_one+0xb0/0x360 [ib_ipoib] add_client_context+0x112/0x1c0 [ib_core] ib_register_client+0x166/0x1b0 [ib_core] ? 0xffffffffa0573000 ipoib_init_module+0xeb/0x1a0 [ib_ipoib] do_one_initcall+0x61/0x250 do_init_module+0x8a/0x270 init_module_from_file+0x8b/0xd0 idempotent_init_module+0x17d/0x230 __x64_sys_finit_module+0x61/0xb0 do_syscall_64+0x71/0x140 entry_SYSCALL_64_after_hwframe+0x46/0x4e Fixes: 8bf30be75069 ("net/mlx5e: Introduce select queue parameters") Signed-off-by: Carolina Jubran Reviewed-by: Tariq Toukan Reviewed-by: Dragos Tatulea Signed-off-by: Saeed Mahameed Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240409190820.227554-8-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/selq.c | 2 ++ drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/selq.c b/drivers/net/ethernet/mellanox/mlx5/core/en/selq.c index f675b1926340f..f66bbc8464645 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/selq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/selq.c @@ -57,6 +57,7 @@ int mlx5e_selq_init(struct mlx5e_selq *selq, struct mutex *state_lock) void mlx5e_selq_cleanup(struct mlx5e_selq *selq) { + mutex_lock(selq->state_lock); WARN_ON_ONCE(selq->is_prepared); kvfree(selq->standby); @@ -67,6 +68,7 @@ void mlx5e_selq_cleanup(struct mlx5e_selq *selq) kvfree(selq->standby); selq->standby = NULL; + mutex_unlock(selq->state_lock); } void mlx5e_selq_prepare_params(struct mlx5e_selq *selq, struct mlx5e_params *params) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 91848eae45655..b375ef268671a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -5726,9 +5726,7 @@ void mlx5e_priv_cleanup(struct mlx5e_priv *priv) kfree(priv->tx_rates); kfree(priv->txq2sq); destroy_workqueue(priv->wq); - mutex_lock(&priv->state_lock); mlx5e_selq_cleanup(&priv->selq); - mutex_unlock(&priv->state_lock); free_cpumask_var(priv->scratchpad.cpumask); for (i = 0; i < priv->htb_max_qos_sqs; i++) -- cgit 1.2.3-korg From 2f436f1869771d46e1a9f85738d5a1a7c5653a4e Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Tue, 9 Apr 2024 22:08:16 +0300 Subject: net/mlx5e: HTB, Fix inconsistencies with QoS SQs number When creating a new HTB class while the interface is down, the variable that follows the number of QoS SQs (htb_max_qos_sqs) may not be consistent with the number of HTB classes. Previously, we compared these two values to ensure that the node_qid is lower than the number of QoS SQs, and we allocated stats for that SQ when they are equal. Change the check to compare the node_qid with the current number of leaf nodes and fix the checking conditions to ensure allocation of stats_list and stats for each node. Fixes: 214baf22870c ("net/mlx5e: Support HTB offload") Signed-off-by: Carolina Jubran Reviewed-by: Tariq Toukan Reviewed-by: Dragos Tatulea Signed-off-by: Saeed Mahameed Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240409190820.227554-9-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/qos.c | 33 ++++++++++++------------ 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c index e87e26f2c669c..6743806b84806 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c @@ -83,24 +83,25 @@ int mlx5e_open_qos_sq(struct mlx5e_priv *priv, struct mlx5e_channels *chs, txq_ix = mlx5e_qid_from_qos(chs, node_qid); - WARN_ON(node_qid > priv->htb_max_qos_sqs); - if (node_qid == priv->htb_max_qos_sqs) { - struct mlx5e_sq_stats *stats, **stats_list = NULL; - - if (priv->htb_max_qos_sqs == 0) { - stats_list = kvcalloc(mlx5e_qos_max_leaf_nodes(priv->mdev), - sizeof(*stats_list), - GFP_KERNEL); - if (!stats_list) - return -ENOMEM; - } + WARN_ON(node_qid >= mlx5e_htb_cur_leaf_nodes(priv->htb)); + if (!priv->htb_qos_sq_stats) { + struct mlx5e_sq_stats **stats_list; + + stats_list = kvcalloc(mlx5e_qos_max_leaf_nodes(priv->mdev), + sizeof(*stats_list), GFP_KERNEL); + if (!stats_list) + return -ENOMEM; + + WRITE_ONCE(priv->htb_qos_sq_stats, stats_list); + } + + if (!priv->htb_qos_sq_stats[node_qid]) { + struct mlx5e_sq_stats *stats; + stats = kzalloc(sizeof(*stats), GFP_KERNEL); - if (!stats) { - kvfree(stats_list); + if (!stats) return -ENOMEM; - } - if (stats_list) - WRITE_ONCE(priv->htb_qos_sq_stats, stats_list); + WRITE_ONCE(priv->htb_qos_sq_stats[node_qid], stats); /* Order htb_max_qos_sqs increment after writing the array pointer. * Pairs with smp_load_acquire in en_stats.c. -- cgit 1.2.3-korg From 86b0ca5b118d3a0bae5e5645a13e66f8a4f6c525 Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Tue, 9 Apr 2024 22:08:17 +0300 Subject: net/mlx5e: Do not produce metadata freelist entries in Tx port ts WQE xmit Free Tx port timestamping metadata entries in the NAPI poll context and consume metadata enties in the WQE xmit path. Do not free a Tx port timestamping metadata entry in the WQE xmit path even in the error path to avoid a race between two metadata entry producers. Fixes: 3178308ad4ca ("net/mlx5e: Make tx_port_ts logic resilient to out-of-order CQEs") Signed-off-by: Rahul Rameshbabu Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240409190820.227554-10-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h | 8 +++++++- drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 7 +++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h index 86f1854698b4e..883c044852f1d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h @@ -95,9 +95,15 @@ static inline void mlx5e_ptp_metadata_fifo_push(struct mlx5e_ptp_metadata_fifo * } static inline u8 +mlx5e_ptp_metadata_fifo_peek(struct mlx5e_ptp_metadata_fifo *fifo) +{ + return fifo->data[fifo->mask & fifo->cc]; +} + +static inline void mlx5e_ptp_metadata_fifo_pop(struct mlx5e_ptp_metadata_fifo *fifo) { - return fifo->data[fifo->mask & fifo->cc++]; + fifo->cc++; } static inline void diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index 2fa076b23fbea..e21a3b4128ce8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -398,6 +398,8 @@ mlx5e_txwqe_complete(struct mlx5e_txqsq *sq, struct sk_buff *skb, (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP))) { u8 metadata_index = be32_to_cpu(eseg->flow_table_metadata); + mlx5e_ptp_metadata_fifo_pop(&sq->ptpsq->metadata_freelist); + mlx5e_skb_cb_hwtstamp_init(skb); mlx5e_ptp_metadata_map_put(&sq->ptpsq->metadata_map, skb, metadata_index); @@ -496,9 +498,6 @@ mlx5e_sq_xmit_wqe(struct mlx5e_txqsq *sq, struct sk_buff *skb, err_drop: stats->dropped++; - if (unlikely(sq->ptpsq && (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP))) - mlx5e_ptp_metadata_fifo_push(&sq->ptpsq->metadata_freelist, - be32_to_cpu(eseg->flow_table_metadata)); dev_kfree_skb_any(skb); mlx5e_tx_flush(sq); } @@ -657,7 +656,7 @@ static void mlx5e_cqe_ts_id_eseg(struct mlx5e_ptpsq *ptpsq, struct sk_buff *skb, { if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) eseg->flow_table_metadata = - cpu_to_be32(mlx5e_ptp_metadata_fifo_pop(&ptpsq->metadata_freelist)); + cpu_to_be32(mlx5e_ptp_metadata_fifo_peek(&ptpsq->metadata_freelist)); } static void mlx5e_txwqe_build_eseg(struct mlx5e_priv *priv, struct mlx5e_txqsq *sq, -- cgit 1.2.3-korg From 49e6c9387051716169ff6a6c5ddd4d9f358db2e9 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Tue, 9 Apr 2024 22:08:18 +0300 Subject: net/mlx5e: RSS, Block XOR hash with over 128 channels When supporting more than 128 channels, the RQT size is calculated by multiplying the number of channels by 2 and rounding up to the nearest power of 2. The index of the RQT is derived from the RSS hash calculations. If XOR8 is used as the RSS hash function, there are only 256 possible hash results, and therefore, only 256 indexes can be reached in the RQT. Block setting the RSS hash function to XOR when the number of channels exceeds 128. Fixes: 74a8dadac17e ("net/mlx5e: Preparations for supporting larger number of channels") Signed-off-by: Carolina Jubran Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240409190820.227554-11-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/rqt.c | 7 ++++++ drivers/net/ethernet/mellanox/mlx5/core/en/rqt.h | 1 + .../net/ethernet/mellanox/mlx5/core/en_ethtool.c | 28 ++++++++++++++++++++-- 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.c index bcafb4bf94154..8d9a3b5ec973b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.c @@ -179,6 +179,13 @@ u32 mlx5e_rqt_size(struct mlx5_core_dev *mdev, unsigned int num_channels) return min_t(u32, rqt_size, max_cap_rqt_size); } +#define MLX5E_MAX_RQT_SIZE_ALLOWED_WITH_XOR8_HASH 256 + +unsigned int mlx5e_rqt_max_num_channels_allowed_for_xor8(void) +{ + return MLX5E_MAX_RQT_SIZE_ALLOWED_WITH_XOR8_HASH / MLX5E_UNIFORM_SPREAD_RQT_FACTOR; +} + void mlx5e_rqt_destroy(struct mlx5e_rqt *rqt) { mlx5_core_destroy_rqt(rqt->mdev, rqt->rqtn); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.h b/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.h index e0bc30308c770..2f9e04a8418f1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.h @@ -38,6 +38,7 @@ static inline u32 mlx5e_rqt_get_rqtn(struct mlx5e_rqt *rqt) } u32 mlx5e_rqt_size(struct mlx5_core_dev *mdev, unsigned int num_channels); +unsigned int mlx5e_rqt_max_num_channels_allowed_for_xor8(void); int mlx5e_rqt_redirect_direct(struct mlx5e_rqt *rqt, u32 rqn, u32 *vhca_id); int mlx5e_rqt_redirect_indir(struct mlx5e_rqt *rqt, u32 *rqns, u32 *vhca_ids, unsigned int num_rqns, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 93461b0c5703b..8f101181648c6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -451,6 +451,17 @@ int mlx5e_ethtool_set_channels(struct mlx5e_priv *priv, mutex_lock(&priv->state_lock); + if (mlx5e_rx_res_get_current_hash(priv->rx_res).hfunc == ETH_RSS_HASH_XOR) { + unsigned int xor8_max_channels = mlx5e_rqt_max_num_channels_allowed_for_xor8(); + + if (count > xor8_max_channels) { + err = -EINVAL; + netdev_err(priv->netdev, "%s: Requested number of channels (%d) exceeds the maximum allowed by the XOR8 RSS hfunc (%d)\n", + __func__, count, xor8_max_channels); + goto out; + } + } + /* If RXFH is configured, changing the channels number is allowed only if * it does not require resizing the RSS table. This is because the previous * configuration may no longer be compatible with the new RSS table. @@ -1298,17 +1309,30 @@ int mlx5e_set_rxfh(struct net_device *dev, struct ethtool_rxfh_param *rxfh, struct mlx5e_priv *priv = netdev_priv(dev); u32 *rss_context = &rxfh->rss_context; u8 hfunc = rxfh->hfunc; + unsigned int count; int err; mutex_lock(&priv->state_lock); + + count = priv->channels.params.num_channels; + + if (hfunc == ETH_RSS_HASH_XOR) { + unsigned int xor8_max_channels = mlx5e_rqt_max_num_channels_allowed_for_xor8(); + + if (count > xor8_max_channels) { + err = -EINVAL; + netdev_err(priv->netdev, "%s: Cannot set RSS hash function to XOR, current number of channels (%d) exceeds the maximum allowed for XOR8 RSS hfunc (%d)\n", + __func__, count, xor8_max_channels); + goto unlock; + } + } + if (*rss_context && rxfh->rss_delete) { err = mlx5e_rx_res_rss_destroy(priv->rx_res, *rss_context); goto unlock; } if (*rss_context == ETH_RXFH_CONTEXT_ALLOC) { - unsigned int count = priv->channels.params.num_channels; - err = mlx5e_rx_res_rss_init(priv->rx_res, rss_context, count); if (err) goto unlock; -- cgit 1.2.3-korg From 7772dc7460e8ef359f3eee88c3b708cb403e19af Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Tue, 9 Apr 2024 22:08:19 +0300 Subject: net/mlx5: Disallow SRIOV switchdev mode when in multi-PF netdev Adaptations need to be made for the auxiliary device management in the core driver level. Block this combination for now. Fixes: 678eb448055a ("net/mlx5: SD, Implement basic query and instantiation") Signed-off-by: Tariq Toukan Reviewed-by: Dragos Tatulea Reviewed-by: Gal Pressman Link: https://lore.kernel.org/r/20240409190820.227554-12-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index e3cce110e52fd..1f60954c12f72 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -43,6 +43,7 @@ #include "rdma.h" #include "en.h" #include "fs_core.h" +#include "lib/mlx5.h" #include "lib/devcom.h" #include "lib/eq.h" #include "lib/fs_chains.h" @@ -3711,6 +3712,12 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode, if (esw_mode_from_devlink(mode, &mlx5_mode)) return -EINVAL; + if (mode == DEVLINK_ESWITCH_MODE_SWITCHDEV && mlx5_get_sd(esw->dev)) { + NL_SET_ERR_MSG_MOD(extack, + "Can't change E-Switch mode to switchdev when multi-PF netdev (Socket Direct) is configured."); + return -EPERM; + } + mlx5_lag_disable_change(esw->dev); err = mlx5_esw_try_lock(esw); if (err < 0) { -- cgit 1.2.3-korg