From f969eb84ce482331a991079ab7a5c4dc3b7f89bf Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Sun, 7 Apr 2024 14:56:04 +0800 Subject: netfilter: nf_tables: Fix potential data-race in __nft_expr_type_get() nft_unregister_expr() can concurrent with __nft_expr_type_get(), and there is not any protection when iterate over nf_tables_expressions list in __nft_expr_type_get(). Therefore, there is potential data-race of nf_tables_expressions list entry. Use list_for_each_entry_rcu() to iterate over nf_tables_expressions list in __nft_expr_type_get(), and use rcu_read_lock() in the caller nft_expr_type_get() to protect the entire type query process. Fixes: ef1f7df9170d ("netfilter: nf_tables: expression ops overloading") Signed-off-by: Ziyang Xuan Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d89d779467197..53b8c00863ad2 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3060,7 +3060,7 @@ static const struct nft_expr_type *__nft_expr_type_get(u8 family, { const struct nft_expr_type *type, *candidate = NULL; - list_for_each_entry(type, &nf_tables_expressions, list) { + list_for_each_entry_rcu(type, &nf_tables_expressions, list) { if (!nla_strcmp(nla, type->name)) { if (!type->family && !candidate) candidate = type; @@ -3092,9 +3092,13 @@ static const struct nft_expr_type *nft_expr_type_get(struct net *net, if (nla == NULL) return ERR_PTR(-EINVAL); + rcu_read_lock(); type = __nft_expr_type_get(family, nla); - if (type != NULL && try_module_get(type->owner)) + if (type != NULL && try_module_get(type->owner)) { + rcu_read_unlock(); return type; + } + rcu_read_unlock(); lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES -- cgit 1.2.3-korg From d78d867dcea69c328db30df665be5be7d0148484 Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Sun, 7 Apr 2024 14:56:05 +0800 Subject: netfilter: nf_tables: Fix potential data-race in __nft_obj_type_get() nft_unregister_obj() can concurrent with __nft_obj_type_get(), and there is not any protection when iterate over nf_tables_objects list in __nft_obj_type_get(). Therefore, there is potential data-race of nf_tables_objects list entry. Use list_for_each_entry_rcu() to iterate over nf_tables_objects list in __nft_obj_type_get(), and use rcu_read_lock() in the caller nft_obj_type_get() to protect the entire type query process. Fixes: e50092404c1b ("netfilter: nf_tables: add stateful objects") Signed-off-by: Ziyang Xuan Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 53b8c00863ad2..f11d0c0a2c735 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -7611,7 +7611,7 @@ static const struct nft_object_type *__nft_obj_type_get(u32 objtype, u8 family) { const struct nft_object_type *type; - list_for_each_entry(type, &nf_tables_objects, list) { + list_for_each_entry_rcu(type, &nf_tables_objects, list) { if (type->family != NFPROTO_UNSPEC && type->family != family) continue; @@ -7627,9 +7627,13 @@ nft_obj_type_get(struct net *net, u32 objtype, u8 family) { const struct nft_object_type *type; + rcu_read_lock(); type = __nft_obj_type_get(objtype, family); - if (type != NULL && try_module_get(type->owner)) + if (type != NULL && try_module_get(type->owner)) { + rcu_read_unlock(); return type; + } + rcu_read_unlock(); lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES -- cgit 1.2.3-korg From 751de2012eafa4d46d8081056761fa0e9cc8a178 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 9 Apr 2024 11:24:59 +0200 Subject: netfilter: br_netfilter: skip conntrack input hook for promisc packets For historical reasons, when bridge device is in promisc mode, packets that are directed to the taps follow bridge input hook path. This patch adds a workaround to reset conntrack for these packets. Jianbo Liu reports warning splats in their test infrastructure where cloned packets reach the br_netfilter input hook to confirm the conntrack object. Scratch one bit from BR_INPUT_SKB_CB to annotate that this packet has reached the input hook because it is passed up to the bridge device to reach the taps. [ 57.571874] WARNING: CPU: 1 PID: 0 at net/bridge/br_netfilter_hooks.c:616 br_nf_local_in+0x157/0x180 [br_netfilter] [ 57.572749] Modules linked in: xt_MASQUERADE nf_conntrack_netlink nfnetlink iptable_nat xt_addrtype xt_conntrack nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_registry overlay rpcrdma rdma_ucm ib_iser libiscsi scsi_transport_isc si ib_umad rdma_cm ib_ipoib iw_cm ib_cm mlx5_ib ib_uverbs ib_core mlx5ctl mlx5_core [ 57.575158] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 6.8.0+ #19 [ 57.575700] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 [ 57.576662] RIP: 0010:br_nf_local_in+0x157/0x180 [br_netfilter] [ 57.577195] Code: fe ff ff 41 bd 04 00 00 00 be 04 00 00 00 e9 4a ff ff ff be 04 00 00 00 48 89 ef e8 f3 a9 3c e1 66 83 ad b4 00 00 00 04 eb 91 <0f> 0b e9 f1 fe ff ff 0f 0b e9 df fe ff ff 48 89 df e8 b3 53 47 e1 [ 57.578722] RSP: 0018:ffff88885f845a08 EFLAGS: 00010202 [ 57.579207] RAX: 0000000000000002 RBX: ffff88812dfe8000 RCX: 0000000000000000 [ 57.579830] RDX: ffff88885f845a60 RSI: ffff8881022dc300 RDI: 0000000000000000 [ 57.580454] RBP: ffff88885f845a60 R08: 0000000000000001 R09: 0000000000000003 [ 57.581076] R10: 00000000ffff1300 R11: 0000000000000002 R12: 0000000000000000 [ 57.581695] R13: ffff8881047ffe00 R14: ffff888108dbee00 R15: ffff88814519b800 [ 57.582313] FS: 0000000000000000(0000) GS:ffff88885f840000(0000) knlGS:0000000000000000 [ 57.583040] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 57.583564] CR2: 000000c4206aa000 CR3: 0000000103847001 CR4: 0000000000370eb0 [ 57.584194] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 57.584820] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 57.585440] Call Trace: [ 57.585721] [ 57.585976] ? __warn+0x7d/0x130 [ 57.586323] ? br_nf_local_in+0x157/0x180 [br_netfilter] [ 57.586811] ? report_bug+0xf1/0x1c0 [ 57.587177] ? handle_bug+0x3f/0x70 [ 57.587539] ? exc_invalid_op+0x13/0x60 [ 57.587929] ? asm_exc_invalid_op+0x16/0x20 [ 57.588336] ? br_nf_local_in+0x157/0x180 [br_netfilter] [ 57.588825] nf_hook_slow+0x3d/0xd0 [ 57.589188] ? br_handle_vlan+0x4b/0x110 [ 57.589579] br_pass_frame_up+0xfc/0x150 [ 57.589970] ? br_port_flags_change+0x40/0x40 [ 57.590396] br_handle_frame_finish+0x346/0x5e0 [ 57.590837] ? ipt_do_table+0x32e/0x430 [ 57.591221] ? br_handle_local_finish+0x20/0x20 [ 57.591656] br_nf_hook_thresh+0x4b/0xf0 [br_netfilter] [ 57.592286] ? br_handle_local_finish+0x20/0x20 [ 57.592802] br_nf_pre_routing_finish+0x178/0x480 [br_netfilter] [ 57.593348] ? br_handle_local_finish+0x20/0x20 [ 57.593782] ? nf_nat_ipv4_pre_routing+0x25/0x60 [nf_nat] [ 57.594279] br_nf_pre_routing+0x24c/0x550 [br_netfilter] [ 57.594780] ? br_nf_hook_thresh+0xf0/0xf0 [br_netfilter] [ 57.595280] br_handle_frame+0x1f3/0x3d0 [ 57.595676] ? br_handle_local_finish+0x20/0x20 [ 57.596118] ? br_handle_frame_finish+0x5e0/0x5e0 [ 57.596566] __netif_receive_skb_core+0x25b/0xfc0 [ 57.597017] ? __napi_build_skb+0x37/0x40 [ 57.597418] __netif_receive_skb_list_core+0xfb/0x220 Fixes: 62e7151ae3eb ("netfilter: bridge: confirm multicast packets before passing them up the stack") Reported-by: Jianbo Liu Signed-off-by: Pablo Neira Ayuso --- net/bridge/br_input.c | 15 +++++++++++---- net/bridge/br_netfilter_hooks.c | 6 ++++++ net/bridge/br_private.h | 1 + net/bridge/netfilter/nf_conntrack_bridge.c | 14 ++++++++++---- 4 files changed, 28 insertions(+), 8 deletions(-) diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index f21097e734827..ceaa5a89b947f 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -30,7 +30,7 @@ br_netif_receive_skb(struct net *net, struct sock *sk, struct sk_buff *skb) return netif_receive_skb(skb); } -static int br_pass_frame_up(struct sk_buff *skb) +static int br_pass_frame_up(struct sk_buff *skb, bool promisc) { struct net_device *indev, *brdev = BR_INPUT_SKB_CB(skb)->brdev; struct net_bridge *br = netdev_priv(brdev); @@ -65,6 +65,8 @@ static int br_pass_frame_up(struct sk_buff *skb) br_multicast_count(br, NULL, skb, br_multicast_igmp_type(skb), BR_MCAST_DIR_TX); + BR_INPUT_SKB_CB(skb)->promisc = promisc; + return NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, dev_net(indev), NULL, skb, indev, NULL, br_netif_receive_skb); @@ -82,6 +84,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb struct net_bridge_mcast *brmctx; struct net_bridge_vlan *vlan; struct net_bridge *br; + bool promisc; u16 vid = 0; u8 state; @@ -137,7 +140,9 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb if (p->flags & BR_LEARNING) br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, 0); - local_rcv = !!(br->dev->flags & IFF_PROMISC); + promisc = !!(br->dev->flags & IFF_PROMISC); + local_rcv = promisc; + if (is_multicast_ether_addr(eth_hdr(skb)->h_dest)) { /* by definition the broadcast is also a multicast address */ if (is_broadcast_ether_addr(eth_hdr(skb)->h_dest)) { @@ -200,7 +205,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb unsigned long now = jiffies; if (test_bit(BR_FDB_LOCAL, &dst->flags)) - return br_pass_frame_up(skb); + return br_pass_frame_up(skb, false); if (now != dst->used) dst->used = now; @@ -213,7 +218,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb } if (local_rcv) - return br_pass_frame_up(skb); + return br_pass_frame_up(skb, promisc); out: return 0; @@ -386,6 +391,8 @@ static rx_handler_result_t br_handle_frame(struct sk_buff **pskb) goto forward; } + BR_INPUT_SKB_CB(skb)->promisc = false; + /* The else clause should be hit when nf_hook(): * - returns < 0 (drop/error) * - returns = 0 (stolen/nf_queue) diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 35e10c5a766d5..22e35623c148a 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -600,11 +600,17 @@ static unsigned int br_nf_local_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { + bool promisc = BR_INPUT_SKB_CB(skb)->promisc; struct nf_conntrack *nfct = skb_nfct(skb); const struct nf_ct_hook *ct_hook; struct nf_conn *ct; int ret; + if (promisc) { + nf_reset_ct(skb); + return NF_ACCEPT; + } + if (!nfct || skb->pkt_type == PACKET_HOST) return NF_ACCEPT; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 86ea5e6689b5c..d4bedc87b1d8f 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -589,6 +589,7 @@ struct br_input_skb_cb { #endif u8 proxyarp_replied:1; u8 src_port_isolated:1; + u8 promisc:1; #ifdef CONFIG_BRIDGE_VLAN_FILTERING u8 vlan_filtered:1; #endif diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c index 6f877e31709ba..c3c51b9a68265 100644 --- a/net/bridge/netfilter/nf_conntrack_bridge.c +++ b/net/bridge/netfilter/nf_conntrack_bridge.c @@ -294,18 +294,24 @@ static unsigned int nf_ct_bridge_pre(void *priv, struct sk_buff *skb, static unsigned int nf_ct_bridge_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - enum ip_conntrack_info ctinfo; + bool promisc = BR_INPUT_SKB_CB(skb)->promisc; + struct nf_conntrack *nfct = skb_nfct(skb); struct nf_conn *ct; - if (skb->pkt_type == PACKET_HOST) + if (promisc) { + nf_reset_ct(skb); + return NF_ACCEPT; + } + + if (!nfct || skb->pkt_type == PACKET_HOST) return NF_ACCEPT; /* nf_conntrack_confirm() cannot handle concurrent clones, * this happens for broad/multicast frames with e.g. macvlan on top * of the bridge device. */ - ct = nf_ct_get(skb, &ctinfo); - if (!ct || nf_ct_is_confirmed(ct) || nf_ct_is_template(ct)) + ct = container_of(nfct, struct nf_conn, ct_general); + if (nf_ct_is_confirmed(ct) || nf_ct_is_template(ct)) return NF_ACCEPT; /* let inet prerouting call conntrack again */ -- cgit 1.2.3-korg From 29b359cf6d95fd60730533f7f10464e95bd17c73 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 10 Apr 2024 18:50:45 +0200 Subject: netfilter: nft_set_pipapo: walk over current view on netlink dump The generation mask can be updated while netlink dump is in progress. The pipapo set backend walk iterator cannot rely on it to infer what view of the datastructure is to be used. Add notation to specify if user wants to read/update the set. Based on patch from Florian Westphal. Fixes: 2b84e215f874 ("netfilter: nft_set_pipapo: .walk does not deal with generations") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 14 ++++++++++++++ net/netfilter/nf_tables_api.c | 6 ++++++ net/netfilter/nft_set_pipapo.c | 5 +++-- 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index e27c28b612e46..3f1ed467f951f 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -307,9 +307,23 @@ static inline void *nft_elem_priv_cast(const struct nft_elem_priv *priv) return (void *)priv; } + +/** + * enum nft_iter_type - nftables set iterator type + * + * @NFT_ITER_READ: read-only iteration over set elements + * @NFT_ITER_UPDATE: iteration under mutex to update set element state + */ +enum nft_iter_type { + NFT_ITER_UNSPEC, + NFT_ITER_READ, + NFT_ITER_UPDATE, +}; + struct nft_set; struct nft_set_iter { u8 genmask; + enum nft_iter_type type:8; unsigned int count; unsigned int skip; int err; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index f11d0c0a2c735..a7a34db62ea93 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -626,6 +626,7 @@ static void nft_map_deactivate(const struct nft_ctx *ctx, struct nft_set *set) { struct nft_set_iter iter = { .genmask = nft_genmask_next(ctx->net), + .type = NFT_ITER_UPDATE, .fn = nft_mapelem_deactivate, }; @@ -5445,6 +5446,7 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, } iter.genmask = nft_genmask_next(ctx->net); + iter.type = NFT_ITER_UPDATE; iter.skip = 0; iter.count = 0; iter.err = 0; @@ -5518,6 +5520,7 @@ static void nft_map_activate(const struct nft_ctx *ctx, struct nft_set *set) { struct nft_set_iter iter = { .genmask = nft_genmask_next(ctx->net), + .type = NFT_ITER_UPDATE, .fn = nft_mapelem_activate, }; @@ -5892,6 +5895,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) args.skb = skb; args.reset = dump_ctx->reset; args.iter.genmask = nft_genmask_cur(net); + args.iter.type = NFT_ITER_READ; args.iter.skip = cb->args[0]; args.iter.count = 0; args.iter.err = 0; @@ -7376,6 +7380,7 @@ static int nft_set_flush(struct nft_ctx *ctx, struct nft_set *set, u8 genmask) { struct nft_set_iter iter = { .genmask = genmask, + .type = NFT_ITER_UPDATE, .fn = nft_setelem_flush, }; @@ -10879,6 +10884,7 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx, continue; iter.genmask = nft_genmask_next(ctx->net); + iter.type = NFT_ITER_UPDATE; iter.skip = 0; iter.count = 0; iter.err = 0; diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index df8de50902463..11e44e4dfb1f7 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -2115,13 +2115,14 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_iter *iter) { struct nft_pipapo *priv = nft_set_priv(set); - struct net *net = read_pnet(&set->net); const struct nft_pipapo_match *m; const struct nft_pipapo_field *f; unsigned int i, r; + WARN_ON_ONCE(iter->type == NFT_ITER_UNSPEC); + rcu_read_lock(); - if (iter->genmask == nft_genmask_cur(net)) + if (iter->type == NFT_ITER_READ) m = rcu_dereference(priv->match); else m = priv->clone; -- cgit 1.2.3-korg From 3cfc9ec039af60dbd8965ae085b2c2ccdcfbe1cc Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 10 Apr 2024 21:05:13 +0200 Subject: netfilter: nft_set_pipapo: do not free live element Pablo reports a crash with large batches of elements with a back-to-back add/remove pattern. Quoting Pablo: add_elem("00000000") timeout 100 ms ... add_elem("0000000X") timeout 100 ms del_elem("0000000X") <---------------- delete one that was just added ... add_elem("00005000") timeout 100 ms 1) nft_pipapo_remove() removes element 0000000X Then, KASAN shows a splat. Looking at the remove function there is a chance that we will drop a rule that maps to a non-deactivated element. Removal happens in two steps, first we do a lookup for key k and return the to-be-removed element and mark it as inactive in the next generation. Then, in a second step, the element gets removed from the set/map. The _remove function does not work correctly if we have more than one element that share the same key. This can happen if we insert an element into a set when the set already holds an element with same key, but the element mapping to the existing key has timed out or is not active in the next generation. In such case its possible that removal will unmap the wrong element. If this happens, we will leak the non-deactivated element, it becomes unreachable. The element that got deactivated (and will be freed later) will remain reachable in the set data structure, this can result in a crash when such an element is retrieved during lookup (stale pointer). Add a check that the fully matching key does in fact map to the element that we have marked as inactive in the deactivation step. If not, we need to continue searching. Add a bug/warn trap at the end of the function as well, the remove function must not ever be called with an invisible/unreachable/non-existent element. v2: avoid uneeded temporary variable (Stefano) Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges") Reported-by: Pablo Neira Ayuso Reviewed-by: Stefano Brivio Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_pipapo.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 11e44e4dfb1f7..eeaf05ffba953 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -2077,6 +2077,8 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, rules_fx = rules_f0; nft_pipapo_for_each_field(f, i, m) { + bool last = i == m->field_count - 1; + if (!pipapo_match_field(f, start, rules_fx, match_start, match_end)) break; @@ -2089,16 +2091,18 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, match_start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); match_end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); - } - if (i == m->field_count) { - priv->dirty = true; - pipapo_drop(m, rulemap); - return; + if (last && f->mt[rulemap[i].to].e == e) { + priv->dirty = true; + pipapo_drop(m, rulemap); + return; + } } first_rule += rules_f0; } + + WARN_ON_ONCE(1); /* elem_priv not found */ } /** -- cgit 1.2.3-korg From 87b3593bed1868b2d9fe096c01bcdf0ea86cbebf Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 9 Apr 2024 13:47:33 +0200 Subject: netfilter: flowtable: validate pppoe header Ensure there is sufficient room to access the protocol field of the PPPoe header. Validate it once before the flowtable lookup, then use a helper function to access protocol field. Reported-by: syzbot+b6f07e1c07ef40199081@syzkaller.appspotmail.com Fixes: 72efd585f714 ("netfilter: flowtable: add pppoe support") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 12 +++++++++++- net/netfilter/nf_flow_table_inet.c | 3 ++- net/netfilter/nf_flow_table_ip.c | 8 +++++--- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index a763dd327c6ea..9abb7ee40d72f 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -336,7 +336,7 @@ int nf_flow_rule_route_ipv6(struct net *net, struct flow_offload *flow, int nf_flow_table_offload_init(void); void nf_flow_table_offload_exit(void); -static inline __be16 nf_flow_pppoe_proto(const struct sk_buff *skb) +static inline __be16 __nf_flow_pppoe_proto(const struct sk_buff *skb) { __be16 proto; @@ -352,6 +352,16 @@ static inline __be16 nf_flow_pppoe_proto(const struct sk_buff *skb) return 0; } +static inline bool nf_flow_pppoe_proto(struct sk_buff *skb, __be16 *inner_proto) +{ + if (!pskb_may_pull(skb, PPPOE_SES_HLEN)) + return false; + + *inner_proto = __nf_flow_pppoe_proto(skb); + + return true; +} + #define NF_FLOW_TABLE_STAT_INC(net, count) __this_cpu_inc((net)->ft.stat->count) #define NF_FLOW_TABLE_STAT_DEC(net, count) __this_cpu_dec((net)->ft.stat->count) #define NF_FLOW_TABLE_STAT_INC_ATOMIC(net, count) \ diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c index 9505f9d188ff2..6eef15648b7b0 100644 --- a/net/netfilter/nf_flow_table_inet.c +++ b/net/netfilter/nf_flow_table_inet.c @@ -21,7 +21,8 @@ nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb, proto = veth->h_vlan_encapsulated_proto; break; case htons(ETH_P_PPP_SES): - proto = nf_flow_pppoe_proto(skb); + if (!nf_flow_pppoe_proto(skb, &proto)) + return NF_ACCEPT; break; default: proto = skb->protocol; diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index e45fade764096..9e9e105052dae 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -273,10 +273,11 @@ static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb, return NF_STOLEN; } -static bool nf_flow_skb_encap_protocol(const struct sk_buff *skb, __be16 proto, +static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto, u32 *offset) { struct vlan_ethhdr *veth; + __be16 inner_proto; switch (skb->protocol) { case htons(ETH_P_8021Q): @@ -287,7 +288,8 @@ static bool nf_flow_skb_encap_protocol(const struct sk_buff *skb, __be16 proto, } break; case htons(ETH_P_PPP_SES): - if (nf_flow_pppoe_proto(skb) == proto) { + if (nf_flow_pppoe_proto(skb, &inner_proto) && + inner_proto == proto) { *offset += PPPOE_SES_HLEN; return true; } @@ -316,7 +318,7 @@ static void nf_flow_encap_pop(struct sk_buff *skb, skb_reset_network_header(skb); break; case htons(ETH_P_PPP_SES): - skb->protocol = nf_flow_pppoe_proto(skb); + skb->protocol = __nf_flow_pppoe_proto(skb); skb_pull(skb, PPPOE_SES_HLEN); skb_reset_network_header(skb); break; -- cgit 1.2.3-korg From 6db5dc7b351b9569940cd1cf445e237c42cd6d27 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 11 Apr 2024 00:09:00 +0200 Subject: netfilter: flowtable: incorrect pppoe tuple pppoe traffic reaching ingress path does not match the flowtable entry because the pppoe header is expected to be at the network header offset. This bug causes a mismatch in the flow table lookup, so pppoe packets enter the classical forwarding path. Fixes: 72efd585f714 ("netfilter: flowtable: add pppoe support") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_ip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 9e9e105052dae..5383bed3d3e00 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -157,7 +157,7 @@ static void nf_flow_tuple_encap(struct sk_buff *skb, tuple->encap[i].proto = skb->protocol; break; case htons(ETH_P_PPP_SES): - phdr = (struct pppoe_hdr *)skb_mac_header(skb); + phdr = (struct pppoe_hdr *)skb_network_header(skb); tuple->encap[i].id = ntohs(phdr->sid); tuple->encap[i].proto = skb->protocol; break; -- cgit 1.2.3-korg From 283454c8a123072e5c386a5a2b5fc576aa455b6f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 10 Apr 2024 10:10:15 -0700 Subject: af_unix: Call manage_oob() for every skb in unix_stream_read_generic(). When we call recv() for AF_UNIX socket, we first peek one skb and calls manage_oob() to check if the skb is sent with MSG_OOB. However, when we fetch the next (and the following) skb, manage_oob() is not called now, leading a wrong behaviour. Let's say a socket send()s "hello" with MSG_OOB and the peer tries to recv() 5 bytes with MSG_PEEK. Here, we should get only "hell" without 'o', but actually not: >>> from socket import * >>> c1, c2 = socketpair(AF_UNIX, SOCK_STREAM) >>> c1.send(b'hello', MSG_OOB) 5 >>> c2.recv(5, MSG_PEEK) b'hello' The first skb fills 4 bytes, and the next skb is peeked but not properly checked by manage_oob(). Let's move up the again label to call manage_oob() for evry skb. With this patch: >>> from socket import * >>> c1, c2 = socketpair(AF_UNIX, SOCK_STREAM) >>> c1.send(b'hello', MSG_OOB) 5 >>> c2.recv(5, MSG_PEEK) b'hell' Fixes: 314001f0bf92 ("af_unix: Add OOB support") Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240410171016.7621-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index d032eb5fa6df1..f297320438bfd 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2741,6 +2741,7 @@ redo: last = skb = skb_peek(&sk->sk_receive_queue); last_len = last ? last->len : 0; +again: #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (skb) { skb = manage_oob(skb, sk, flags, copied); @@ -2752,7 +2753,6 @@ redo: } } #endif -again: if (skb == NULL) { if (copied >= target) goto unlock; -- cgit 1.2.3-korg From 22dd70eb2c3d754862964377a75abafd3167346b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 10 Apr 2024 10:10:16 -0700 Subject: af_unix: Don't peek OOB data without MSG_OOB. Currently, we can read OOB data without MSG_OOB by using MSG_PEEK when OOB data is sitting on the front row, which is apparently wrong. >>> from socket import * >>> c1, c2 = socketpair(AF_UNIX, SOCK_STREAM) >>> c1.send(b'a', MSG_OOB) 1 >>> c2.recv(1, MSG_PEEK | MSG_DONTWAIT) b'a' If manage_oob() is called when no data has been copied, we only check if the socket enables SO_OOBINLINE or MSG_PEEK is not used. Otherwise, the skb is returned as is. However, here we should return NULL if MSG_PEEK is set and no data has been copied. Also, in such a case, we should not jump to the redo label because we will be caught in the loop and hog the CPU until normal data comes in. Then, we need to handle skb == NULL case with the if-clause below the manage_oob() block. With this patch: >>> from socket import * >>> c1, c2 = socketpair(AF_UNIX, SOCK_STREAM) >>> c1.send(b'a', MSG_OOB) 1 >>> c2.recv(1, MSG_PEEK | MSG_DONTWAIT) Traceback (most recent call last): File "", line 1, in BlockingIOError: [Errno 11] Resource temporarily unavailable Fixes: 314001f0bf92 ("af_unix: Add OOB support") Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240410171016.7621-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index f297320438bfd..9a6ad5974dff5 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2663,7 +2663,9 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, WRITE_ONCE(u->oob_skb, NULL); consume_skb(skb); } - } else if (!(flags & MSG_PEEK)) { + } else if (flags & MSG_PEEK) { + skb = NULL; + } else { skb_unlink(skb, &sk->sk_receive_queue); WRITE_ONCE(u->oob_skb, NULL); if (!WARN_ON_ONCE(skb_unref(skb))) @@ -2745,11 +2747,9 @@ again: #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (skb) { skb = manage_oob(skb, sk, flags, copied); - if (!skb) { + if (!skb && copied) { unix_state_unlock(sk); - if (copied) - break; - goto redo; + break; } } #endif -- cgit 1.2.3-korg From 68aba00483c7c4102429bcdfdece7289a8ab5c8e Mon Sep 17 00:00:00 2001 From: Asbjørn Sloth Tønnesen Date: Thu, 11 Apr 2024 11:13:18 +0000 Subject: net: sparx5: flower: fix fragment flags handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I noticed that only 3 out of the 4 input bits were used, mt.key->flags & FLOW_DIS_IS_FRAGMENT was never checked. In order to avoid a complicated maze, I converted it to use a 16 byte mapping table. As shown in the table below the old heuristics doesn't always do the right thing, ie. when FLOW_DIS_IS_FRAGMENT=1/1 then it used to only match follow-up fragment packets. Here are all the combinations, and their resulting new/old VCAP key/mask filter: /- FLOW_DIS_IS_FRAGMENT (key/mask) | /- FLOW_DIS_FIRST_FRAG (key/mask) | | /-- new VCAP fragment (key/mask) v v v v- old VCAP fragment (key/mask) 0/0 0/0 -/- -/- impossible (due to entry cond. on mask) 0/0 0/1 -/- 0/3 !! invalid (can't match non-fragment + follow-up frag) 0/0 1/0 -/- -/- impossible (key > mask) 0/0 1/1 1/3 1/3 first fragment 0/1 0/0 0/3 3/3 !! not fragmented 0/1 0/1 0/3 3/3 !! not fragmented (+ not first fragment) 0/1 1/0 -/- -/- impossible (key > mask) 0/1 1/1 -/- 1/3 !! invalid (non-fragment and first frag) 1/0 0/0 -/- -/- impossible (key > mask) 1/0 0/1 -/- -/- impossible (key > mask) 1/0 1/0 -/- -/- impossible (key > mask) 1/0 1/1 -/- -/- impossible (key > mask) 1/1 0/0 1/1 3/3 !! some fragment 1/1 0/1 3/3 3/3 follow-up fragment 1/1 1/0 -/- -/- impossible (key > mask) 1/1 1/1 1/3 1/3 first fragment In the datasheet the VCAP fragment values are documented as: 0 = no fragment 1 = initial fragment 2 = suspicious fragment 3 = valid follow-up fragment Result: 3 combinations match the old behavior, 3 combinations have been corrected, 2 combinations are now invalid, and fail, 8 combinations are impossible. It should now be aligned with how FLOW_DIS_IS_FRAGMENT and FLOW_DIS_FIRST_FRAG is set in __skb_flow_dissect() in net/core/flow_dissector.c Since the VCAP fragment values are not a bitfield, we have to ignore the suspicious fragment value, eg. when matching on any kind of fragment with FLOW_DIS_IS_FRAGMENT=1/1. Only compile tested, and logic tested in userspace, as I unfortunately don't have access to this switch chip (yet). Fixes: d6c2964db3fe ("net: microchip: sparx5: Adding more tc flower keys for the IS2 VCAP") Signed-off-by: Asbjørn Sloth Tønnesen Reviewed-by: Steen Hegelund Tested-by: Daniel Machon Reviewed-by: Jacob Keller Link: https://lore.kernel.org/r/20240411111321.114095-1-ast@fiberby.net Signed-off-by: Jakub Kicinski --- .../ethernet/microchip/sparx5/sparx5_tc_flower.c | 61 ++++++++++++++-------- 1 file changed, 40 insertions(+), 21 deletions(-) diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_tc_flower.c b/drivers/net/ethernet/microchip/sparx5/sparx5_tc_flower.c index 523e0c470894f..55f255a3c9db6 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_tc_flower.c +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_tc_flower.c @@ -36,6 +36,27 @@ struct sparx5_tc_flower_template { u16 l3_proto; /* protocol specified in the template */ }; +/* SparX-5 VCAP fragment types: + * 0 = no fragment, 1 = initial fragment, + * 2 = suspicious fragment, 3 = valid follow-up fragment + */ +enum { /* key / mask */ + FRAG_NOT = 0x03, /* 0 / 3 */ + FRAG_SOME = 0x11, /* 1 / 1 */ + FRAG_FIRST = 0x13, /* 1 / 3 */ + FRAG_LATER = 0x33, /* 3 / 3 */ + FRAG_INVAL = 0xff, /* invalid */ +}; + +/* Flower fragment flag to VCAP fragment type mapping */ +static const u8 sparx5_vcap_frag_map[4][4] = { /* is_frag */ + { FRAG_INVAL, FRAG_INVAL, FRAG_INVAL, FRAG_FIRST }, /* 0/0 */ + { FRAG_NOT, FRAG_NOT, FRAG_INVAL, FRAG_INVAL }, /* 0/1 */ + { FRAG_INVAL, FRAG_INVAL, FRAG_INVAL, FRAG_INVAL }, /* 1/0 */ + { FRAG_SOME, FRAG_LATER, FRAG_INVAL, FRAG_FIRST } /* 1/1 */ + /* 0/0 0/1 1/0 1/1 <-- first_frag */ +}; + static int sparx5_tc_flower_es0_tpid(struct vcap_tc_flower_parse_usage *st) { @@ -145,29 +166,27 @@ sparx5_tc_flower_handler_control_usage(struct vcap_tc_flower_parse_usage *st) flow_rule_match_control(st->frule, &mt); if (mt.mask->flags) { - if (mt.mask->flags & FLOW_DIS_FIRST_FRAG) { - if (mt.key->flags & FLOW_DIS_FIRST_FRAG) { - value = 1; /* initial fragment */ - mask = 0x3; - } else { - if (mt.mask->flags & FLOW_DIS_IS_FRAGMENT) { - value = 3; /* follow up fragment */ - mask = 0x3; - } else { - value = 0; /* no fragment */ - mask = 0x3; - } - } - } else { - if (mt.mask->flags & FLOW_DIS_IS_FRAGMENT) { - value = 3; /* follow up fragment */ - mask = 0x3; - } else { - value = 0; /* no fragment */ - mask = 0x3; - } + u8 is_frag_key = !!(mt.key->flags & FLOW_DIS_IS_FRAGMENT); + u8 is_frag_mask = !!(mt.mask->flags & FLOW_DIS_IS_FRAGMENT); + u8 is_frag_idx = (is_frag_key << 1) | is_frag_mask; + + u8 first_frag_key = !!(mt.key->flags & FLOW_DIS_FIRST_FRAG); + u8 first_frag_mask = !!(mt.mask->flags & FLOW_DIS_FIRST_FRAG); + u8 first_frag_idx = (first_frag_key << 1) | first_frag_mask; + + /* Lookup verdict based on the 2 + 2 input bits */ + u8 vdt = sparx5_vcap_frag_map[is_frag_idx][first_frag_idx]; + + if (vdt == FRAG_INVAL) { + NL_SET_ERR_MSG_MOD(st->fco->common.extack, + "Match on invalid fragment flag combination"); + return -EINVAL; } + /* Extract VCAP fragment key and mask from verdict */ + value = (vdt >> 4) & 0x3; + mask = vdt & 0x3; + err = vcap_rule_add_key_u32(st->vrule, VCAP_KF_L3_FRAGMENT_TYPE, value, mask); -- cgit 1.2.3-korg From 37cc10da3a50e6d0cb9808a90b7da9b4868794dd Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Thu, 11 Apr 2024 14:54:39 +0300 Subject: net/mlx5: Lag, restore buckets number to default after hash LAG deactivation The cited patch introduces the concept of buckets in LAG in hash mode. However, the patch doesn't clear the number of buckets in the LAG deactivation. This results in using the wrong number of buckets in case user create a hash mode LAG and afterwards create a non-hash mode LAG. Hence, restore buckets number to default after hash mode LAG deactivation. Fixes: 352899f384d4 ("net/mlx5: Lag, use buckets in hash mode") Signed-off-by: Shay Drory Reviewed-by: Maor Gottlieb Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240411115444.374475-2-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index d14459e5c04fc..69d482f7c5a29 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -703,8 +703,10 @@ int mlx5_deactivate_lag(struct mlx5_lag *ldev) return err; } - if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags)) + if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags)) { mlx5_lag_port_sel_destroy(ldev); + ldev->buckets = 1; + } if (mlx5_lag_has_drop_rule(ldev)) mlx5_lag_drop_rule_cleanup(ldev); -- cgit 1.2.3-korg From aa4ac90d04f4371466000825adb44935ecb5c974 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Thu, 11 Apr 2024 14:54:40 +0300 Subject: net/mlx5: SD, Handle possible devcom ERR_PTR Check if devcom holds an error pointer and return immediately. This fixes Smatch static checker warning: drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c:221 sd_register() error: 'devcom' dereferencing possible ERR_PTR() Enhance mlx5_devcom_register_component() so it stops returning NULL, making it easier for its callers. Fixes: d3d057666090 ("net/mlx5: SD, Implement devcom communication and primary election") Reported-by: Dan Carpenter Link: https://lore.kernel.org/all/f09666c8-e604-41f6-958b-4cc55c73faf9@gmail.com/T/ Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Link: https://lore.kernel.org/r/20240411115444.374475-3-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/main.c | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index b375ef268671a..319930c04093b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -209,8 +209,8 @@ static int mlx5e_devcom_init_mpv(struct mlx5e_priv *priv, u64 *data) *data, mlx5e_devcom_event_mpv, priv); - if (IS_ERR_OR_NULL(priv->devcom)) - return -EOPNOTSUPP; + if (IS_ERR(priv->devcom)) + return PTR_ERR(priv->devcom); if (mlx5_core_is_mp_master(priv->mdev)) { mlx5_devcom_send_event(priv->devcom, MPV_DEVCOM_MASTER_UP, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 1f60954c12f72..844d3e3a65ddf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -3060,7 +3060,7 @@ void mlx5_esw_offloads_devcom_init(struct mlx5_eswitch *esw, u64 key) key, mlx5_esw_offloads_devcom_event, esw); - if (IS_ERR_OR_NULL(esw->devcom)) + if (IS_ERR(esw->devcom)) return; mlx5_devcom_send_event(esw->devcom, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c index e7d59cfa8708e..7b0766c89f4cf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c @@ -220,7 +220,7 @@ mlx5_devcom_register_component(struct mlx5_devcom_dev *devc, struct mlx5_devcom_comp *comp; if (IS_ERR_OR_NULL(devc)) - return NULL; + return ERR_PTR(-EINVAL); mutex_lock(&comp_list_lock); comp = devcom_component_get(devc, id, key, handler); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c index 5b28084e8a03c..dd5d186dc6148 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c @@ -213,8 +213,8 @@ static int sd_register(struct mlx5_core_dev *dev) sd = mlx5_get_sd(dev); devcom = mlx5_devcom_register_component(dev->priv.devc, MLX5_DEVCOM_SD_GROUP, sd->group_id, NULL, dev); - if (!devcom) - return -ENOMEM; + if (IS_ERR(devcom)) + return PTR_ERR(devcom); sd->devcom = devcom; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 59806553889e9..a39c4b25ba280 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -956,7 +956,7 @@ static void mlx5_register_hca_devcom_comp(struct mlx5_core_dev *dev) mlx5_devcom_register_component(dev->priv.devc, MLX5_DEVCOM_HCA_PORTS, mlx5_query_nic_system_image_guid(dev), NULL, dev); - if (IS_ERR_OR_NULL(dev->priv.hca_devcom_comp)) + if (IS_ERR(dev->priv.hca_devcom_comp)) mlx5_core_err(dev, "Failed to register devcom HCA component\n"); } -- cgit 1.2.3-korg From bf729988303a27833a86acb561f42b9a3cc12728 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Thu, 11 Apr 2024 14:54:41 +0300 Subject: net/mlx5: Restore mistakenly dropped parts in register devlink flow Code parts from cited commit were mistakenly dropped while rebasing before submission. Add them here. Fixes: c6e77aa9dd82 ("net/mlx5: Register devlink first under devlink lock") Signed-off-by: Shay Drory Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240411115444.374475-4-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 5 ++++- drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c | 1 - 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index a39c4b25ba280..331ce47f51a17 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1699,12 +1699,15 @@ int mlx5_init_one_light(struct mlx5_core_dev *dev) err = mlx5_devlink_params_register(priv_to_devlink(dev)); if (err) { mlx5_core_warn(dev, "mlx5_devlink_param_reg err = %d\n", err); - goto query_hca_caps_err; + goto params_reg_err; } devl_unlock(devlink); return 0; +params_reg_err: + devl_unregister(devlink); + devl_unlock(devlink); query_hca_caps_err: devl_unregister(devlink); devl_unlock(devlink); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c index e3bf8c7e4baa6..7ebe712808275 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c @@ -75,7 +75,6 @@ static int mlx5_sf_dev_probe(struct auxiliary_device *adev, const struct auxilia goto peer_devlink_set_err; } - devlink_register(devlink); return 0; peer_devlink_set_err: -- cgit 1.2.3-korg From 6c685bdb9e1af966ec0278dbd4068ec39ae88c2d Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Thu, 11 Apr 2024 14:54:42 +0300 Subject: net/mlx5e: Use channel mdev reference instead of global mdev instance for coalescing Channels can potentially have independent mdev instances. Do not refer to the global mdev instance in the mlx5e_priv instance for channel FW operations related to coalescing. CQ numbers that would be valid on the channel's mdev instance may not be correctly referenced if using the mlx5e_priv instance. Fixes: 67936e138586 ("net/mlx5e: Let channels be SD-aware") Signed-off-by: Rahul Rameshbabu Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240411115444.374475-5-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 8f101181648c6..67a29826bb570 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -589,12 +589,12 @@ static int mlx5e_get_coalesce(struct net_device *netdev, static void mlx5e_set_priv_channels_tx_coalesce(struct mlx5e_priv *priv, struct ethtool_coalesce *coal) { - struct mlx5_core_dev *mdev = priv->mdev; int tc; int i; for (i = 0; i < priv->channels.num; ++i) { struct mlx5e_channel *c = priv->channels.c[i]; + struct mlx5_core_dev *mdev = c->mdev; for (tc = 0; tc < c->num_tc; tc++) { mlx5_core_modify_cq_moderation(mdev, @@ -608,11 +608,11 @@ mlx5e_set_priv_channels_tx_coalesce(struct mlx5e_priv *priv, struct ethtool_coal static void mlx5e_set_priv_channels_rx_coalesce(struct mlx5e_priv *priv, struct ethtool_coalesce *coal) { - struct mlx5_core_dev *mdev = priv->mdev; int i; for (i = 0; i < priv->channels.num; ++i) { struct mlx5e_channel *c = priv->channels.c[i]; + struct mlx5_core_dev *mdev = c->mdev; mlx5_core_modify_cq_moderation(mdev, &c->rq.cq.mcq, coal->rx_coalesce_usecs, -- cgit 1.2.3-korg From fdce06bda7e56b2a34c53ea58bad7af2fae96da1 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Thu, 11 Apr 2024 14:54:43 +0300 Subject: net/mlx5e: Acquire RTNL lock before RQs/SQs activation/deactivation netif_queue_set_napi asserts whether RTNL lock is held if the netdev is initialized. Acquire the RTNL lock before activating or deactivating RQs/SQs if the lock has not been held before in the flow. Fixes: f25e7b82635f ("net/mlx5e: link NAPI instances to queues and IRQs") Cc: Joe Damato Signed-off-by: Carolina Jubran Reviewed-by: Rahul Rameshbabu Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240411115444.374475-6-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c index 0ab9db3195302..22918b2ef7f12 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c @@ -108,7 +108,10 @@ static int mlx5e_tx_reporter_err_cqe_recover(void *ctx) mlx5e_reset_txqsq_cc_pc(sq); sq->stats->recover++; clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state); + rtnl_lock(); mlx5e_activate_txqsq(sq); + rtnl_unlock(); + if (sq->channel) mlx5e_trigger_napi_icosq(sq->channel); else @@ -179,12 +182,16 @@ static int mlx5e_tx_reporter_ptpsq_unhealthy_recover(void *ctx) carrier_ok = netif_carrier_ok(netdev); netif_carrier_off(netdev); + rtnl_lock(); mlx5e_deactivate_priv_channels(priv); + rtnl_unlock(); mlx5e_ptp_close(chs->ptp); err = mlx5e_ptp_open(priv, &chs->params, chs->c[0]->lag_port, &chs->ptp); + rtnl_lock(); mlx5e_activate_priv_channels(priv); + rtnl_unlock(); /* return carrier back if needed */ if (carrier_ok) -- cgit 1.2.3-korg From fef965764cf562f28afb997b626fc7c3cec99693 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Thu, 11 Apr 2024 14:54:44 +0300 Subject: net/mlx5e: Prevent deadlock while disabling aRFS When disabling aRFS under the `priv->state_lock`, any scheduled aRFS works are canceled using the `cancel_work_sync` function, which waits for the work to end if it has already started. However, while waiting for the work handler, the handler will try to acquire the `state_lock` which is already acquired. The worker acquires the lock to delete the rules if the state is down, which is not the worker's responsibility since disabling aRFS deletes the rules. Add an aRFS state variable, which indicates whether the aRFS is enabled and prevent adding rules when the aRFS is disabled. Kernel log: ====================================================== WARNING: possible circular locking dependency detected 6.7.0-rc4_net_next_mlx5_5483eb2 #1 Tainted: G I ------------------------------------------------------ ethtool/386089 is trying to acquire lock: ffff88810f21ce68 ((work_completion)(&rule->arfs_work)){+.+.}-{0:0}, at: __flush_work+0x74/0x4e0 but task is already holding lock: ffff8884a1808cc0 (&priv->state_lock){+.+.}-{3:3}, at: mlx5e_ethtool_set_channels+0x53/0x200 [mlx5_core] which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&priv->state_lock){+.+.}-{3:3}: __mutex_lock+0x80/0xc90 arfs_handle_work+0x4b/0x3b0 [mlx5_core] process_one_work+0x1dc/0x4a0 worker_thread+0x1bf/0x3c0 kthread+0xd7/0x100 ret_from_fork+0x2d/0x50 ret_from_fork_asm+0x11/0x20 -> #0 ((work_completion)(&rule->arfs_work)){+.+.}-{0:0}: __lock_acquire+0x17b4/0x2c80 lock_acquire+0xd0/0x2b0 __flush_work+0x7a/0x4e0 __cancel_work_timer+0x131/0x1c0 arfs_del_rules+0x143/0x1e0 [mlx5_core] mlx5e_arfs_disable+0x1b/0x30 [mlx5_core] mlx5e_ethtool_set_channels+0xcb/0x200 [mlx5_core] ethnl_set_channels+0x28f/0x3b0 ethnl_default_set_doit+0xec/0x240 genl_family_rcv_msg_doit+0xd0/0x120 genl_rcv_msg+0x188/0x2c0 netlink_rcv_skb+0x54/0x100 genl_rcv+0x24/0x40 netlink_unicast+0x1a1/0x270 netlink_sendmsg+0x214/0x460 __sock_sendmsg+0x38/0x60 __sys_sendto+0x113/0x170 __x64_sys_sendto+0x20/0x30 do_syscall_64+0x40/0xe0 entry_SYSCALL_64_after_hwframe+0x46/0x4e other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&priv->state_lock); lock((work_completion)(&rule->arfs_work)); lock(&priv->state_lock); lock((work_completion)(&rule->arfs_work)); *** DEADLOCK *** 3 locks held by ethtool/386089: #0: ffffffff82ea7210 (cb_lock){++++}-{3:3}, at: genl_rcv+0x15/0x40 #1: ffffffff82e94c88 (rtnl_mutex){+.+.}-{3:3}, at: ethnl_default_set_doit+0xd3/0x240 #2: ffff8884a1808cc0 (&priv->state_lock){+.+.}-{3:3}, at: mlx5e_ethtool_set_channels+0x53/0x200 [mlx5_core] stack backtrace: CPU: 15 PID: 386089 Comm: ethtool Tainted: G I 6.7.0-rc4_net_next_mlx5_5483eb2 #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl+0x60/0xa0 check_noncircular+0x144/0x160 __lock_acquire+0x17b4/0x2c80 lock_acquire+0xd0/0x2b0 ? __flush_work+0x74/0x4e0 ? save_trace+0x3e/0x360 ? __flush_work+0x74/0x4e0 __flush_work+0x7a/0x4e0 ? __flush_work+0x74/0x4e0 ? __lock_acquire+0xa78/0x2c80 ? lock_acquire+0xd0/0x2b0 ? mark_held_locks+0x49/0x70 __cancel_work_timer+0x131/0x1c0 ? mark_held_locks+0x49/0x70 arfs_del_rules+0x143/0x1e0 [mlx5_core] mlx5e_arfs_disable+0x1b/0x30 [mlx5_core] mlx5e_ethtool_set_channels+0xcb/0x200 [mlx5_core] ethnl_set_channels+0x28f/0x3b0 ethnl_default_set_doit+0xec/0x240 genl_family_rcv_msg_doit+0xd0/0x120 genl_rcv_msg+0x188/0x2c0 ? ethnl_ops_begin+0xb0/0xb0 ? genl_family_rcv_msg_dumpit+0xf0/0xf0 netlink_rcv_skb+0x54/0x100 genl_rcv+0x24/0x40 netlink_unicast+0x1a1/0x270 netlink_sendmsg+0x214/0x460 __sock_sendmsg+0x38/0x60 __sys_sendto+0x113/0x170 ? do_user_addr_fault+0x53f/0x8f0 __x64_sys_sendto+0x20/0x30 do_syscall_64+0x40/0xe0 entry_SYSCALL_64_after_hwframe+0x46/0x4e Fixes: 45bf454ae884 ("net/mlx5e: Enabling aRFS mechanism") Signed-off-by: Carolina Jubran Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240411115444.374475-7-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c | 27 ++++++++++++++--------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c index c7f542d0b8f08..93cf23278d93c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c @@ -46,6 +46,10 @@ struct arfs_table { struct hlist_head rules_hash[ARFS_HASH_SIZE]; }; +enum { + MLX5E_ARFS_STATE_ENABLED, +}; + enum arfs_type { ARFS_IPV4_TCP, ARFS_IPV6_TCP, @@ -60,6 +64,7 @@ struct mlx5e_arfs_tables { spinlock_t arfs_lock; int last_filter_id; struct workqueue_struct *wq; + unsigned long state; }; struct arfs_tuple { @@ -170,6 +175,8 @@ int mlx5e_arfs_enable(struct mlx5e_flow_steering *fs) return err; } } + set_bit(MLX5E_ARFS_STATE_ENABLED, &arfs->state); + return 0; } @@ -455,6 +462,8 @@ static void arfs_del_rules(struct mlx5e_flow_steering *fs) int i; int j; + clear_bit(MLX5E_ARFS_STATE_ENABLED, &arfs->state); + spin_lock_bh(&arfs->arfs_lock); mlx5e_for_each_arfs_rule(rule, htmp, arfs->arfs_tables, i, j) { hlist_del_init(&rule->hlist); @@ -627,17 +636,8 @@ static void arfs_handle_work(struct work_struct *work) struct mlx5_flow_handle *rule; arfs = mlx5e_fs_get_arfs(priv->fs); - mutex_lock(&priv->state_lock); - if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) { - spin_lock_bh(&arfs->arfs_lock); - hlist_del(&arfs_rule->hlist); - spin_unlock_bh(&arfs->arfs_lock); - - mutex_unlock(&priv->state_lock); - kfree(arfs_rule); - goto out; - } - mutex_unlock(&priv->state_lock); + if (!test_bit(MLX5E_ARFS_STATE_ENABLED, &arfs->state)) + return; if (!arfs_rule->rule) { rule = arfs_add_rule(priv, arfs_rule); @@ -753,6 +753,11 @@ int mlx5e_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb, return -EPROTONOSUPPORT; spin_lock_bh(&arfs->arfs_lock); + if (!test_bit(MLX5E_ARFS_STATE_ENABLED, &arfs->state)) { + spin_unlock_bh(&arfs->arfs_lock); + return -EPERM; + } + arfs_rule = arfs_find_rule(arfs_t, &fk); if (arfs_rule) { if (arfs_rule->rxq == rxq_index || work_busy(&arfs_rule->arfs_work)) { -- cgit 1.2.3-korg From 1382e3b6a3500c245e5278c66d210c02926f804f Mon Sep 17 00:00:00 2001 From: Yuri Benditovich Date: Thu, 11 Apr 2024 08:11:24 +0300 Subject: net: change maximum number of UDP segments to 128 The commit fc8b2a619469 ("net: more strict VIRTIO_NET_HDR_GSO_UDP_L4 validation") adds check of potential number of UDP segments vs UDP_MAX_SEGMENTS in linux/virtio_net.h. After this change certification test of USO guest-to-guest transmit on Windows driver for virtio-net device fails, for example with packet size of ~64K and mss of 536 bytes. In general the USO should not be more restrictive than TSO. Indeed, in case of unreasonably small mss a lot of segments can cause queue overflow and packet loss on the destination. Limit of 128 segments is good for any practical purpose, with minimal meaningful mss of 536 the maximal UDP packet will be divided to ~120 segments. The number of segments for UDP packets is validated vs UDP_MAX_SEGMENTS also in udp.c (v4,v6), this does not affect quest-to-guest path but does affect packets sent to host, for example. It is important to mention that UDP_MAX_SEGMENTS is kernel-only define and not available to user mode socket applications. In order to request MSS smaller than MTU the applications just uses setsockopt with SOL_UDP and UDP_SEGMENT and there is no limitations on socket API level. Fixes: fc8b2a619469 ("net: more strict VIRTIO_NET_HDR_GSO_UDP_L4 validation") Signed-off-by: Yuri Benditovich Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/udp.h | 2 +- tools/testing/selftests/net/udpgso.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/udp.h b/include/linux/udp.h index 17539d0896661..e398e1dbd2d36 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -108,7 +108,7 @@ struct udp_sock { #define udp_assign_bit(nr, sk, val) \ assign_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags, val) -#define UDP_MAX_SEGMENTS (1 << 6UL) +#define UDP_MAX_SEGMENTS (1 << 7UL) #define udp_sk(ptr) container_of_const(ptr, struct udp_sock, inet.sk) diff --git a/tools/testing/selftests/net/udpgso.c b/tools/testing/selftests/net/udpgso.c index 1d975bf52af33..85b3baa3f7f34 100644 --- a/tools/testing/selftests/net/udpgso.c +++ b/tools/testing/selftests/net/udpgso.c @@ -34,7 +34,7 @@ #endif #ifndef UDP_MAX_SEGMENTS -#define UDP_MAX_SEGMENTS (1 << 6UL) +#define UDP_MAX_SEGMENTS (1 << 7UL) #endif #define CONST_MTU_TEST 1500 -- cgit 1.2.3-korg From 460b0d33cf10eee33de651381d3170ef13241650 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Apr 2024 11:02:02 -0700 Subject: inet: bring NLM_DONE out to a separate recv() again Commit under Fixes optimized the number of recv() calls needed during RTM_GETROUTE dumps, but we got multiple reports of applications hanging on recv() calls. Applications expect that a route dump will be terminated with a recv() reading an individual NLM_DONE message. Coalescing NLM_DONE is perfectly legal in netlink, but even tho reporters fixed the code in respective projects, chances are it will take time for those applications to get updated. So revert to old behavior (for now)? Old kernel (5.19): $ ./cli.py --dbg-small-recv 4096 --spec netlink/specs/rt_route.yaml \ --dump getroute --json '{"rtm-family": 2}' Recv: read 692 bytes, 11 messages nl_len = 68 (52) nl_flags = 0x22 nl_type = 24 ... nl_len = 60 (44) nl_flags = 0x22 nl_type = 24 Recv: read 20 bytes, 1 messages nl_len = 20 (4) nl_flags = 0x2 nl_type = 3 Before (6.9-rc2): $ ./cli.py --dbg-small-recv 4096 --spec netlink/specs/rt_route.yaml \ --dump getroute --json '{"rtm-family": 2}' Recv: read 712 bytes, 12 messages nl_len = 68 (52) nl_flags = 0x22 nl_type = 24 ... nl_len = 60 (44) nl_flags = 0x22 nl_type = 24 nl_len = 20 (4) nl_flags = 0x2 nl_type = 3 After: $ ./cli.py --dbg-small-recv 4096 --spec netlink/specs/rt_route.yaml \ --dump getroute --json '{"rtm-family": 2}' Recv: read 692 bytes, 11 messages nl_len = 68 (52) nl_flags = 0x22 nl_type = 24 ... nl_len = 60 (44) nl_flags = 0x22 nl_type = 24 Recv: read 20 bytes, 1 messages nl_len = 20 (4) nl_flags = 0x2 nl_type = 3 Reported-by: Stefano Brivio Link: https://lore.kernel.org/all/20240315124808.033ff58d@elisabeth Reported-by: Ilya Maximets Link: https://lore.kernel.org/all/02b50aae-f0e9-47a4-8365-a977a85975d3@ovn.org Fixes: 4ce5dc9316de ("inet: switch inet_dump_fib() to RCU protection") Signed-off-by: Jakub Kicinski Tested-by: Ilya Maximets Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 48741352a88a7..c484b1c0fc00a 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1050,6 +1050,11 @@ next: e++; } } + + /* Don't let NLM_DONE coalesce into a message, even if it could. + * Some user space expects NLM_DONE in a separate recv(). + */ + err = skb->len; out: cb->args[1] = e; -- cgit 1.2.3-korg From 75ce9506ee3dc66648a7d74ab3b0acfa364d6d43 Mon Sep 17 00:00:00 2001 From: Asbjørn Sloth Tønnesen Date: Fri, 12 Apr 2024 12:02:56 +0000 Subject: octeontx2-pf: fix FLOW_DIS_IS_FRAGMENT implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Upon reviewing the flower control flags handling in this driver, I notice that the key wasn't being used, only the mask. Ie. `tc flower ... ip_flags nofrag` was hardware offloaded as `... ip_flags frag`. Only compile tested, no access to HW. Fixes: c672e3727989 ("octeontx2-pf: Add support to filter packet based on IP fragment") Signed-off-by: Asbjørn Sloth Tønnesen Reviewed-by: Jacob Keller Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c index 87bdb93cb066e..f4655a8c0705d 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c @@ -689,6 +689,7 @@ static int otx2_tc_prepare_flow(struct otx2_nic *nic, struct otx2_tc_flow *node, if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_CONTROL)) { struct flow_match_control match; + u32 val; flow_rule_match_control(rule, &match); if (match.mask->flags & FLOW_DIS_FIRST_FRAG) { @@ -697,12 +698,14 @@ static int otx2_tc_prepare_flow(struct otx2_nic *nic, struct otx2_tc_flow *node, } if (match.mask->flags & FLOW_DIS_IS_FRAGMENT) { + val = match.key->flags & FLOW_DIS_IS_FRAGMENT; if (ntohs(flow_spec->etype) == ETH_P_IP) { - flow_spec->ip_flag = IPV4_FLAG_MORE; + flow_spec->ip_flag = val ? IPV4_FLAG_MORE : 0; flow_mask->ip_flag = IPV4_FLAG_MORE; req->features |= BIT_ULL(NPC_IPFRAG_IPV4); } else if (ntohs(flow_spec->etype) == ETH_P_IPV6) { - flow_spec->next_header = IPPROTO_FRAGMENT; + flow_spec->next_header = val ? + IPPROTO_FRAGMENT : 0; flow_mask->next_header = 0xff; req->features |= BIT_ULL(NPC_IPFRAG_IPV6); } else { -- cgit 1.2.3-korg From 4225dfa4535f219b03ae14147d9c6e7e82ec8df4 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Sat, 13 Apr 2024 02:42:52 +0100 Subject: selftests/tcp_ao: Make RST tests less flaky Currently, "active reset" cases are flaky, because select() is called for 3 sockets, while only 2 are expected to receive RST. The idea of the third socket was to get into request_sock_queue, but the test mistakenly attempted to connect() after the listener socket was shut down. Repair this test, it's important to check the different kernel code-paths for signing RST TCP-AO segments. Fixes: c6df7b2361d7 ("selftests/net: Add TCP-AO RST test") Reported-by: Jakub Kicinski Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/tcp_ao/rst.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/net/tcp_ao/rst.c b/tools/testing/selftests/net/tcp_ao/rst.c index 7df8b8700e39e..a2fe88d35ac06 100644 --- a/tools/testing/selftests/net/tcp_ao/rst.c +++ b/tools/testing/selftests/net/tcp_ao/rst.c @@ -256,8 +256,6 @@ static int test_wait_fds(int sk[], size_t nr, bool is_writable[], static void test_client_active_rst(unsigned int port) { - /* one in queue, another accept()ed */ - unsigned int wait_for = backlog + 2; int i, sk[3], err; bool is_writable[ARRAY_SIZE(sk)] = {false}; unsigned int last = ARRAY_SIZE(sk) - 1; @@ -275,16 +273,20 @@ static void test_client_active_rst(unsigned int port) for (i = 0; i < last; i++) { err = _test_connect_socket(sk[i], this_ip_dest, port, (i == 0) ? TEST_TIMEOUT_SEC : -1); - if (err < 0) test_error("failed to connect()"); } - synchronize_threads(); /* 2: connection accept()ed, another queued */ - err = test_wait_fds(sk, last, is_writable, wait_for, TEST_TIMEOUT_SEC); + synchronize_threads(); /* 2: two connections: one accept()ed, another queued */ + err = test_wait_fds(sk, last, is_writable, last, TEST_TIMEOUT_SEC); if (err < 0) test_error("test_wait_fds(): %d", err); + /* async connect() with third sk to get into request_sock_queue */ + err = _test_connect_socket(sk[last], this_ip_dest, port, -1); + if (err < 0) + test_error("failed to connect()"); + synchronize_threads(); /* 3: close listen socket */ if (test_client_verify(sk[0], packet_sz, quota / packet_sz, TEST_TIMEOUT_SEC)) test_fail("Failed to send data on connected socket"); @@ -292,13 +294,14 @@ static void test_client_active_rst(unsigned int port) test_ok("Verified established tcp connection"); synchronize_threads(); /* 4: finishing up */ - err = _test_connect_socket(sk[last], this_ip_dest, port, -1); - if (err < 0) - test_error("failed to connect()"); synchronize_threads(); /* 5: closed active sk */ - err = test_wait_fds(sk, ARRAY_SIZE(sk), NULL, - wait_for, TEST_TIMEOUT_SEC); + /* + * Wait for 2 connections: one accepted, another in the accept queue, + * the one in request_sock_queue won't get fully established, so + * doesn't receive an active RST, see inet_csk_listen_stop(). + */ + err = test_wait_fds(sk, last, NULL, last, TEST_TIMEOUT_SEC); if (err < 0) test_error("select(): %d", err); -- cgit 1.2.3-korg From b089b3bead532419cdcbd8e4e0a3e23c49d11573 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Sat, 13 Apr 2024 02:42:53 +0100 Subject: selftests/tcp_ao: Zero-init tcp_ao_info_opt The structure is on the stack and has to be zero-initialized as the kernel checks for: > if (in.reserved != 0 || in.reserved2 != 0) > return -EINVAL; Fixes: b26660531cf6 ("selftests/net: Add test for TCP-AO add setsockopt() command") Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/tcp_ao/setsockopt-closed.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c b/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c index 452de131fa3a9..517930f9721bd 100644 --- a/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c +++ b/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c @@ -21,7 +21,7 @@ static void make_listen(int sk) static void test_vefify_ao_info(int sk, struct tcp_ao_info_opt *info, const char *tst) { - struct tcp_ao_info_opt tmp; + struct tcp_ao_info_opt tmp = {}; socklen_t len = sizeof(tmp); if (getsockopt(sk, IPPROTO_TCP, TCP_AO_INFO, &tmp, &len)) -- cgit 1.2.3-korg From beb78cd1329d039d73487ca05633d1b92e1ab2ea Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Sat, 13 Apr 2024 02:42:54 +0100 Subject: selftests/tcp_ao: Fix fscanf() call for format-security MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On my new laptop with packages from nixos-unstable, gcc 12.3.0 produces: > lib/proc.c: In function ‘netstat_read_type’: > lib/proc.c:89:9: error: format not a string literal and no format arguments [-Werror=format-security] > 89 | if (fscanf(fnetstat, type->header_name) == EOF) > | ^~ > cc1: some warnings being treated as errors Here the selftests lib parses header name, while expectes non-space word ending with a column. Fixes: cfbab37b3da0 ("selftests/net: Add TCP-AO library") Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Reported-by: Muhammad Usama Anjum Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/tcp_ao/lib/proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/tcp_ao/lib/proc.c b/tools/testing/selftests/net/tcp_ao/lib/proc.c index 2fb6dd8adba69..8b984fa042869 100644 --- a/tools/testing/selftests/net/tcp_ao/lib/proc.c +++ b/tools/testing/selftests/net/tcp_ao/lib/proc.c @@ -86,7 +86,7 @@ static void netstat_read_type(FILE *fnetstat, struct netstat **dest, char *line) pos = strchr(line, ' ') + 1; - if (fscanf(fnetstat, type->header_name) == EOF) + if (fscanf(fnetstat, "%[^ :]", type->header_name) == EOF) test_error("fscanf(%s)", type->header_name); if (fread(&tmp, 1, 1, fnetstat) != 1 || tmp != ':') test_error("Unexpected netstat format (%c)", tmp); -- cgit 1.2.3-korg From b476c93654d748c13624f7c7d0ba191c56a8092e Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Sat, 13 Apr 2024 02:42:55 +0100 Subject: selftests/tcp_ao: Printing fixes to confirm with format-security MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On my new laptop with packages from nixos-unstable, gcc 12.3.0 produces > lib/setup.c: In function ‘__test_msg’: > lib/setup.c:20:9: error: format not a string literal and no format arguments [-Werror=format-security] > 20 | ksft_print_msg(buf); > | ^~~~~~~~~~~~~~ > lib/setup.c: In function ‘__test_ok’: > lib/setup.c:26:9: error: format not a string literal and no format arguments [-Werror=format-security] > 26 | ksft_test_result_pass(buf); > | ^~~~~~~~~~~~~~~~~~~~~ > lib/setup.c: In function ‘__test_fail’: > lib/setup.c:32:9: error: format not a string literal and no format arguments [-Werror=format-security] > 32 | ksft_test_result_fail(buf); > | ^~~~~~~~~~~~~~~~~~~~~ > lib/setup.c: In function ‘__test_xfail’: > lib/setup.c:38:9: error: format not a string literal and no format arguments [-Werror=format-security] > 38 | ksft_test_result_xfail(buf); > | ^~~~~~~~~~~~~~~~~~~~~~ > lib/setup.c: In function ‘__test_error’: > lib/setup.c:44:9: error: format not a string literal and no format arguments [-Werror=format-security] > 44 | ksft_test_result_error(buf); > | ^~~~~~~~~~~~~~~~~~~~~~ > lib/setup.c: In function ‘__test_skip’: > lib/setup.c:50:9: error: format not a string literal and no format arguments [-Werror=format-security] > 50 | ksft_test_result_skip(buf); > | ^~~~~~~~~~~~~~~~~~~~~ > cc1: some warnings being treated as errors As the buffer was already pre-printed into, print it as a string rather than a format-string. Fixes: cfbab37b3da0 ("selftests/net: Add TCP-AO library") Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Reported-by: Muhammad Usama Anjum Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/tcp_ao/lib/setup.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/net/tcp_ao/lib/setup.c b/tools/testing/selftests/net/tcp_ao/lib/setup.c index 92276f916f2f3..e408b9243b2c5 100644 --- a/tools/testing/selftests/net/tcp_ao/lib/setup.c +++ b/tools/testing/selftests/net/tcp_ao/lib/setup.c @@ -17,37 +17,37 @@ static pthread_mutex_t ksft_print_lock = PTHREAD_MUTEX_INITIALIZER; void __test_msg(const char *buf) { pthread_mutex_lock(&ksft_print_lock); - ksft_print_msg(buf); + ksft_print_msg("%s", buf); pthread_mutex_unlock(&ksft_print_lock); } void __test_ok(const char *buf) { pthread_mutex_lock(&ksft_print_lock); - ksft_test_result_pass(buf); + ksft_test_result_pass("%s", buf); pthread_mutex_unlock(&ksft_print_lock); } void __test_fail(const char *buf) { pthread_mutex_lock(&ksft_print_lock); - ksft_test_result_fail(buf); + ksft_test_result_fail("%s", buf); pthread_mutex_unlock(&ksft_print_lock); } void __test_xfail(const char *buf) { pthread_mutex_lock(&ksft_print_lock); - ksft_test_result_xfail(buf); + ksft_test_result_xfail("%s", buf); pthread_mutex_unlock(&ksft_print_lock); } void __test_error(const char *buf) { pthread_mutex_lock(&ksft_print_lock); - ksft_test_result_error(buf); + ksft_test_result_error("%s", buf); pthread_mutex_unlock(&ksft_print_lock); } void __test_skip(const char *buf) { pthread_mutex_lock(&ksft_print_lock); - ksft_test_result_skip(buf); + ksft_test_result_skip("%s", buf); pthread_mutex_unlock(&ksft_print_lock); } -- cgit 1.2.3-korg From 0ebd96f5da4410c0cb8fc75e44f1009530b2f90b Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Fri, 12 Apr 2024 21:03:14 +0300 Subject: net: stmmac: Apply half-duplex-less constraint for DW QoS Eth only There are three DW MAC IP-cores which can have the multiple Tx/Rx queues enabled: DW GMAC v3.7+ with AV feature, DW QoS Eth v4.x/v5.x, DW XGMAC/XLGMAC Based on the respective HW databooks, only the DW QoS Eth IP-core doesn't support the half-duplex link mode in case if more than one queues enabled: "In multiple queue/channel configurations, for half-duplex operation, enable only the Q0/CH0 on Tx and Rx. For single queue/channel in full-duplex operation, any queue/channel can be enabled." The rest of the IP-cores don't have such constraint. Thus in order to have the constraint applied for the DW QoS Eth MACs only, let's move the it' implementation to the respective MAC-capabilities getter and make sure the getter is called in the queues re-init procedure. Fixes: b6cfffa7ad92 ("stmmac: fix DMA channel hang in half-duplex mode") Signed-off-by: Serge Semin Reviewed-by: Romain Gantois Signed-off-by: Paolo Abeni --- drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c | 7 +++++++ drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 19 +++---------------- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c index cef25efbdff99..ec6a13e644b36 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c @@ -71,6 +71,13 @@ static void dwmac4_core_init(struct mac_device_info *hw, static void dwmac4_phylink_get_caps(struct stmmac_priv *priv) { priv->phylink_config.mac_capabilities |= MAC_2500FD; + + if (priv->plat->tx_queues_to_use > 1) + priv->phylink_config.mac_capabilities &= + ~(MAC_10HD | MAC_100HD | MAC_1000HD); + else + priv->phylink_config.mac_capabilities |= + (MAC_10HD | MAC_100HD | MAC_1000HD); } static void dwmac4_rx_queue_enable(struct mac_device_info *hw, diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 24cd80490d19c..dd58c21b53eec 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1198,17 +1198,6 @@ static int stmmac_init_phy(struct net_device *dev) return ret; } -static void stmmac_set_half_duplex(struct stmmac_priv *priv) -{ - /* Half-Duplex can only work with single tx queue */ - if (priv->plat->tx_queues_to_use > 1) - priv->phylink_config.mac_capabilities &= - ~(MAC_10HD | MAC_100HD | MAC_1000HD); - else - priv->phylink_config.mac_capabilities |= - (MAC_10HD | MAC_100HD | MAC_1000HD); -} - static int stmmac_phy_setup(struct stmmac_priv *priv) { struct stmmac_mdio_bus_data *mdio_bus_data; @@ -1237,10 +1226,7 @@ static int stmmac_phy_setup(struct stmmac_priv *priv) priv->phylink_config.supported_interfaces); priv->phylink_config.mac_capabilities = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | - MAC_10FD | MAC_100FD | - MAC_1000FD; - - stmmac_set_half_duplex(priv); + MAC_10 | MAC_100 | MAC_1000; /* Get the MAC specific capabilities */ stmmac_mac_phylink_get_caps(priv); @@ -7355,7 +7341,8 @@ int stmmac_reinit_queues(struct net_device *dev, u32 rx_cnt, u32 tx_cnt) priv->rss.table[i] = ethtool_rxfh_indir_default(i, rx_cnt); - stmmac_set_half_duplex(priv); + stmmac_mac_phylink_get_caps(priv); + stmmac_napi_add(dev); if (netif_running(dev)) -- cgit 1.2.3-korg From 59c3d6ca6cbded6c6599e975b42a9d6a27fcbaf2 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Fri, 12 Apr 2024 21:03:15 +0300 Subject: net: stmmac: Fix max-speed being ignored on queue re-init It's possible to have the maximum link speed being artificially limited on the platform-specific basis. It's done either by setting up the plat_stmmacenet_data::max_speed field or by specifying the "max-speed" DT-property. In such cases it's required that any specific MAC-capabilities re-initializations would take the limit into account. In particular the link speed capabilities may change during the number of active Tx/Rx queues re-initialization. But the currently implemented procedure doesn't take the speed limit into account. Fix that by calling phylink_limit_mac_speed() in the stmmac_reinit_queues() method if the speed limitation was required in the same way as it's done in the stmmac_phy_setup() function. Fixes: 95201f36f395 ("net: stmmac: update MAC capabilities when tx queues are updated") Signed-off-by: Serge Semin Reviewed-by: Romain Gantois Signed-off-by: Paolo Abeni --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index dd58c21b53eec..b8a1f02398ee9 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -7328,6 +7328,7 @@ int stmmac_reinit_queues(struct net_device *dev, u32 rx_cnt, u32 tx_cnt) { struct stmmac_priv *priv = netdev_priv(dev); int ret = 0, i; + int max_speed; if (netif_running(dev)) stmmac_release(dev); @@ -7343,6 +7344,10 @@ int stmmac_reinit_queues(struct net_device *dev, u32 rx_cnt, u32 tx_cnt) stmmac_mac_phylink_get_caps(priv); + max_speed = priv->plat->max_speed; + if (max_speed) + phylink_limit_mac_speed(&priv->phylink_config, max_speed); + stmmac_napi_add(dev); if (netif_running(dev)) -- cgit 1.2.3-korg From 9cb54af214a7cdc91577ec083e5569f2ce2c86d8 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Fri, 12 Apr 2024 21:03:16 +0300 Subject: net: stmmac: Fix IP-cores specific MAC capabilities Here is the list of the MAC capabilities specific to the particular DW MAC IP-cores currently supported by the driver: DW MAC100: MAC_ASYM_PAUSE | MAC_SYM_PAUSE | MAC_10 | MAC_100 DW GMAC: MAC_ASYM_PAUSE | MAC_SYM_PAUSE | MAC_10 | MAC_100 | MAC_1000 Allwinner sun8i MAC: MAC_ASYM_PAUSE | MAC_SYM_PAUSE | MAC_10 | MAC_100 | MAC_1000 DW QoS Eth: MAC_ASYM_PAUSE | MAC_SYM_PAUSE | MAC_10 | MAC_100 | MAC_1000 | MAC_2500FD if there is more than 1 active Tx/Rx queues: MAC_ASYM_PAUSE | MAC_SYM_PAUSE | MAC_10FD | MAC_100FD | MAC_1000FD | MAC_2500FD DW XGMAC: MAC_ASYM_PAUSE | MAC_SYM_PAUSE | MAC_1000FD | MAC_2500FD | MAC_5000FD | MAC_10000FD DW XLGMAC: MAC_ASYM_PAUSE | MAC_SYM_PAUSE | MAC_1000FD | MAC_2500FD | MAC_5000FD | MAC_10000FD | MAC_25000FD | MAC_40000FD | MAC_50000FD | MAC_100000FD As you can see there are only two common capabilities: MAC_ASYM_PAUSE | MAC_SYM_PAUSE. Meanwhile what is currently implemented defines 10/100/1000 link speeds for all IP-cores, which is definitely incorrect for DW MAC100, DW XGMAC and DW XLGMAC devices. Seeing the flow-control is implemented as a callback for each MAC IP-core (see dwmac100_flow_ctrl(), dwmac1000_flow_ctrl(), sun8i_dwmac_flow_ctrl(), etc) and since the MAC-specific setup() method is supposed to be called for each available DW MAC-based device, the capabilities initialization can be freely moved to these setup() functions, thus correctly setting up the MAC-capabilities for each IP-core (including the Allwinner Sun8i). A new stmmac_link::caps field was specifically introduced for that so to have all link-specific info preserved in a single structure. Note the suggested change fixes three earlier commits at a time. The commit 5b0d7d7da64b ("net: stmmac: Add the missing speeds that XGMAC supports") permitted the 10-100 link speeds and 1G half-duplex mode for DW XGMAC IP-core even though it doesn't support them. The commit df7699c70c1b ("net: stmmac: Do not cut down 1G modes") incorrectly added the MAC1000 capability to the DW MAC100 IP-core. Similarly to the DW XGMAC the commit 8a880936e902 ("net: stmmac: Add XLGMII support") incorrectly permitted the 10-100 link speeds and 1G half-duplex mode for DW XLGMAC IP-core. Fixes: 5b0d7d7da64b ("net: stmmac: Add the missing speeds that XGMAC supports") Fixes: df7699c70c1b ("net: stmmac: Do not cut down 1G modes") Fixes: 8a880936e902 ("net: stmmac: Add XLGMII support") Suggested-by: Russell King (Oracle) Signed-off-by: Serge Semin Reviewed-by: Romain Gantois Signed-off-by: Paolo Abeni --- drivers/net/ethernet/stmicro/stmmac/common.h | 1 + drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c | 2 ++ drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c | 2 ++ drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c | 2 ++ drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c | 10 ++++------ drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c | 18 ++++++++---------- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 7 ++++--- 7 files changed, 23 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h index a6fefe675ef15..3b7d4ac1e7be0 100644 --- a/drivers/net/ethernet/stmicro/stmmac/common.h +++ b/drivers/net/ethernet/stmicro/stmmac/common.h @@ -553,6 +553,7 @@ extern const struct stmmac_hwtimestamp stmmac_ptp; extern const struct stmmac_mode_ops dwmac4_ring_mode_ops; struct mac_link { + u32 caps; u32 speed_mask; u32 speed10; u32 speed100; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c index b21d99faa2d04..e1b761dcfa1dd 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c @@ -1096,6 +1096,8 @@ static struct mac_device_info *sun8i_dwmac_setup(void *ppriv) priv->dev->priv_flags |= IFF_UNICAST_FLT; + mac->link.caps = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | + MAC_10 | MAC_100 | MAC_1000; /* The loopback bit seems to be re-set when link change * Simply mask it each time * Speed 10/100/1000 are set in BIT(2)/BIT(3) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c index 3927609abc441..8555299443f4e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c @@ -539,6 +539,8 @@ int dwmac1000_setup(struct stmmac_priv *priv) if (mac->multicast_filter_bins) mac->mcast_bits_log2 = ilog2(mac->multicast_filter_bins); + mac->link.caps = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | + MAC_10 | MAC_100 | MAC_1000; mac->link.duplex = GMAC_CONTROL_DM; mac->link.speed10 = GMAC_CONTROL_PS; mac->link.speed100 = GMAC_CONTROL_PS | GMAC_CONTROL_FES; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c index a6e8d7bd95886..7667d103cd0eb 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c @@ -175,6 +175,8 @@ int dwmac100_setup(struct stmmac_priv *priv) dev_info(priv->device, "\tDWMAC100\n"); mac->pcsr = priv->ioaddr; + mac->link.caps = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | + MAC_10 | MAC_100; mac->link.duplex = MAC_CONTROL_F; mac->link.speed10 = 0; mac->link.speed100 = 0; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c index ec6a13e644b36..a38226d7cc6a9 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c @@ -70,14 +70,10 @@ static void dwmac4_core_init(struct mac_device_info *hw, static void dwmac4_phylink_get_caps(struct stmmac_priv *priv) { - priv->phylink_config.mac_capabilities |= MAC_2500FD; - if (priv->plat->tx_queues_to_use > 1) - priv->phylink_config.mac_capabilities &= - ~(MAC_10HD | MAC_100HD | MAC_1000HD); + priv->hw->link.caps &= ~(MAC_10HD | MAC_100HD | MAC_1000HD); else - priv->phylink_config.mac_capabilities |= - (MAC_10HD | MAC_100HD | MAC_1000HD); + priv->hw->link.caps |= (MAC_10HD | MAC_100HD | MAC_1000HD); } static void dwmac4_rx_queue_enable(struct mac_device_info *hw, @@ -1385,6 +1381,8 @@ int dwmac4_setup(struct stmmac_priv *priv) if (mac->multicast_filter_bins) mac->mcast_bits_log2 = ilog2(mac->multicast_filter_bins); + mac->link.caps = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | + MAC_10 | MAC_100 | MAC_1000 | MAC_2500FD; mac->link.duplex = GMAC_CONFIG_DM; mac->link.speed10 = GMAC_CONFIG_PS; mac->link.speed100 = GMAC_CONFIG_FES | GMAC_CONFIG_PS; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c index e841e312077ef..f8e7775bb6336 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c @@ -47,14 +47,6 @@ static void dwxgmac2_core_init(struct mac_device_info *hw, writel(XGMAC_INT_DEFAULT_EN, ioaddr + XGMAC_INT_EN); } -static void xgmac_phylink_get_caps(struct stmmac_priv *priv) -{ - priv->phylink_config.mac_capabilities |= MAC_2500FD | MAC_5000FD | - MAC_10000FD | MAC_25000FD | - MAC_40000FD | MAC_50000FD | - MAC_100000FD; -} - static void dwxgmac2_set_mac(void __iomem *ioaddr, bool enable) { u32 tx = readl(ioaddr + XGMAC_TX_CONFIG); @@ -1540,7 +1532,6 @@ static void dwxgmac3_fpe_configure(void __iomem *ioaddr, struct stmmac_fpe_cfg * const struct stmmac_ops dwxgmac210_ops = { .core_init = dwxgmac2_core_init, - .phylink_get_caps = xgmac_phylink_get_caps, .set_mac = dwxgmac2_set_mac, .rx_ipc = dwxgmac2_rx_ipc, .rx_queue_enable = dwxgmac2_rx_queue_enable, @@ -1601,7 +1592,6 @@ static void dwxlgmac2_rx_queue_enable(struct mac_device_info *hw, u8 mode, const struct stmmac_ops dwxlgmac2_ops = { .core_init = dwxgmac2_core_init, - .phylink_get_caps = xgmac_phylink_get_caps, .set_mac = dwxgmac2_set_mac, .rx_ipc = dwxgmac2_rx_ipc, .rx_queue_enable = dwxlgmac2_rx_queue_enable, @@ -1661,6 +1651,9 @@ int dwxgmac2_setup(struct stmmac_priv *priv) if (mac->multicast_filter_bins) mac->mcast_bits_log2 = ilog2(mac->multicast_filter_bins); + mac->link.caps = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | + MAC_1000FD | MAC_2500FD | MAC_5000FD | + MAC_10000FD; mac->link.duplex = 0; mac->link.speed10 = XGMAC_CONFIG_SS_10_MII; mac->link.speed100 = XGMAC_CONFIG_SS_100_MII; @@ -1698,6 +1691,11 @@ int dwxlgmac2_setup(struct stmmac_priv *priv) if (mac->multicast_filter_bins) mac->mcast_bits_log2 = ilog2(mac->multicast_filter_bins); + mac->link.caps = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | + MAC_1000FD | MAC_2500FD | MAC_5000FD | + MAC_10000FD | MAC_25000FD | + MAC_40000FD | MAC_50000FD | + MAC_100000FD; mac->link.duplex = 0; mac->link.speed1000 = XLGMAC_CONFIG_SS_1000; mac->link.speed2500 = XLGMAC_CONFIG_SS_2500; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index b8a1f02398ee9..7c6fb14b55550 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1225,12 +1225,11 @@ static int stmmac_phy_setup(struct stmmac_priv *priv) xpcs_get_interfaces(priv->hw->xpcs, priv->phylink_config.supported_interfaces); - priv->phylink_config.mac_capabilities = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | - MAC_10 | MAC_100 | MAC_1000; - /* Get the MAC specific capabilities */ stmmac_mac_phylink_get_caps(priv); + priv->phylink_config.mac_capabilities = priv->hw->link.caps; + max_speed = priv->plat->max_speed; if (max_speed) phylink_limit_mac_speed(&priv->phylink_config, max_speed); @@ -7344,6 +7343,8 @@ int stmmac_reinit_queues(struct net_device *dev, u32 rx_cnt, u32 tx_cnt) stmmac_mac_phylink_get_caps(priv); + priv->phylink_config.mac_capabilities = priv->hw->link.caps; + max_speed = priv->plat->max_speed; if (max_speed) phylink_limit_mac_speed(&priv->phylink_config, max_speed); -- cgit 1.2.3-korg From 428051600cb4e5a61d81aba3f8009b6c4f5e7582 Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Fri, 15 Mar 2024 12:08:20 +0100 Subject: ice: tc: check src_vsi in case of traffic from VF In case of traffic going from the VF (so ingress for port representor) source VSI should be consider during packet classification. It is needed for hardware to not match packets from different ports with filters added on other port. It is only for "from VF" traffic, because other traffic direction doesn't have source VSI. Set correct ::src_vsi in rule_info to pass it to the hardware filter. For example this rule should drop only ipv4 packets from eth10, not from the others VF PRs. It is needed to check source VSI in this case. $tc filter add dev eth10 ingress protocol ip flower skip_sw action drop Fixes: 0d08a441fb1a ("ice: ndo_setup_tc implementation for PF") Reviewed-by: Jedrzej Jagielski Reviewed-by: Sridhar Samudrala Signed-off-by: Michal Swiatkowski Reviewed-by: Simon Horman Tested-by: Sujai Buvaneswaran Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_tc_lib.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/ethernet/intel/ice/ice_tc_lib.c b/drivers/net/ethernet/intel/ice/ice_tc_lib.c index b890410a2bc0b..49ed5fd7db107 100644 --- a/drivers/net/ethernet/intel/ice/ice_tc_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_tc_lib.c @@ -28,6 +28,8 @@ ice_tc_count_lkups(u32 flags, struct ice_tc_flower_lyr_2_4_hdrs *headers, * - ICE_TC_FLWR_FIELD_VLAN_TPID (present if specified) * - Tunnel flag (present if tunnel) */ + if (fltr->direction == ICE_ESWITCH_FLTR_EGRESS) + lkups_cnt++; if (flags & ICE_TC_FLWR_FIELD_TENANT_ID) lkups_cnt++; @@ -363,6 +365,11 @@ ice_tc_fill_rules(struct ice_hw *hw, u32 flags, /* Always add direction metadata */ ice_rule_add_direction_metadata(&list[ICE_TC_METADATA_LKUP_IDX]); + if (tc_fltr->direction == ICE_ESWITCH_FLTR_EGRESS) { + ice_rule_add_src_vsi_metadata(&list[i]); + i++; + } + rule_info->tun_type = ice_sw_type_from_tunnel(tc_fltr->tunnel_type); if (tc_fltr->tunnel_type != TNL_LAST) { i = ice_tc_fill_tunnel_outer(flags, tc_fltr, list, i); @@ -820,6 +827,7 @@ ice_eswitch_add_tc_fltr(struct ice_vsi *vsi, struct ice_tc_flower_fltr *fltr) /* specify the cookie as filter_rule_id */ rule_info.fltr_rule_id = fltr->cookie; + rule_info.src_vsi = vsi->idx; ret = ice_add_adv_rule(hw, list, lkups_cnt, &rule_info, &rule_added); if (ret == -EEXIST) { -- cgit 1.2.3-korg From 73278715725a8347032acf233082ca4eb31e6a56 Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Fri, 15 Mar 2024 12:08:21 +0100 Subject: ice: tc: allow zero flags in parsing tc flower The check for flags is done to not pass empty lookups to adding switch rule functions. Since metadata is always added to lookups there is no need to check against the flag. It is also fixing the problem with such rule: $ tc filter add dev gtp_dev ingress protocol ip prio 0 flower \ enc_dst_port 2123 action drop Switch block in case of GTP can't parse the destination port, because it should always be set to GTP specific value. The same with ethertype. The result is that there is no other matching criteria than GTP tunnel. In this case flags is 0, rule can't be added only because of defensive check against flags. Fixes: 9a225f81f540 ("ice: Support GTP-U and GTP-C offload in switchdev") Reviewed-by: Wojciech Drewek Signed-off-by: Michal Swiatkowski Reviewed-by: Simon Horman Tested-by: Sujai Buvaneswaran Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_tc_lib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_tc_lib.c b/drivers/net/ethernet/intel/ice/ice_tc_lib.c index 49ed5fd7db107..bcbcfc67e5606 100644 --- a/drivers/net/ethernet/intel/ice/ice_tc_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_tc_lib.c @@ -779,7 +779,7 @@ ice_eswitch_add_tc_fltr(struct ice_vsi *vsi, struct ice_tc_flower_fltr *fltr) int ret; int i; - if (!flags || (flags & ICE_TC_FLWR_FIELD_ENC_SRC_L4_PORT)) { + if (flags & ICE_TC_FLWR_FIELD_ENC_SRC_L4_PORT) { NL_SET_ERR_MSG_MOD(fltr->extack, "Unsupported encap field(s)"); return -EOPNOTSUPP; } -- cgit 1.2.3-korg From 2cca35f5dd78b9f8297c879c5db5ab137c5d86c3 Mon Sep 17 00:00:00 2001 From: Marcin Szycik Date: Tue, 9 Apr 2024 17:45:44 +0200 Subject: ice: Fix checking for unsupported keys on non-tunnel device Add missing FLOW_DISSECTOR_KEY_ENC_* checks to TC flower filter parsing. Without these checks, it would be possible to add filters with tunnel options on non-tunnel devices. enc_* options are only valid for tunnel devices. Example: devlink dev eswitch set $PF1_PCI mode switchdev echo 1 > /sys/class/net/$PF1/device/sriov_numvfs tc qdisc add dev $VF1_PR ingress ethtool -K $PF1 hw-tc-offload on tc filter add dev $VF1_PR ingress flower enc_ttl 12 skip_sw action drop Fixes: 9e300987d4a8 ("ice: VXLAN and Geneve TC support") Reviewed-by: Michal Swiatkowski Signed-off-by: Marcin Szycik Reviewed-by: Jacob Keller Tested-by: Sujai Buvaneswaran Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_tc_lib.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_tc_lib.c b/drivers/net/ethernet/intel/ice/ice_tc_lib.c index bcbcfc67e5606..688ccb0615ab9 100644 --- a/drivers/net/ethernet/intel/ice/ice_tc_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_tc_lib.c @@ -1489,7 +1489,10 @@ ice_parse_cls_flower(struct net_device *filter_dev, struct ice_vsi *vsi, (BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) | BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) | BIT_ULL(FLOW_DISSECTOR_KEY_ENC_KEYID) | - BIT_ULL(FLOW_DISSECTOR_KEY_ENC_PORTS))) { + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_PORTS) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IP) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_OPTS) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_CONTROL))) { NL_SET_ERR_MSG_MOD(fltr->extack, "Tunnel key used, but device isn't a tunnel"); return -EOPNOTSUPP; } else { -- cgit 1.2.3-korg From f8bbc07ac535593139c875ffa19af924b1084540 Mon Sep 17 00:00:00 2001 From: Lei Chen Date: Sun, 14 Apr 2024 22:02:46 -0400 Subject: tun: limit printing rate when illegal packet received by tun dev vhost_worker will call tun call backs to receive packets. If too many illegal packets arrives, tun_do_read will keep dumping packet contents. When console is enabled, it will costs much more cpu time to dump packet and soft lockup will be detected. net_ratelimit mechanism can be used to limit the dumping rate. PID: 33036 TASK: ffff949da6f20000 CPU: 23 COMMAND: "vhost-32980" #0 [fffffe00003fce50] crash_nmi_callback at ffffffff89249253 #1 [fffffe00003fce58] nmi_handle at ffffffff89225fa3 #2 [fffffe00003fceb0] default_do_nmi at ffffffff8922642e #3 [fffffe00003fced0] do_nmi at ffffffff8922660d #4 [fffffe00003fcef0] end_repeat_nmi at ffffffff89c01663 [exception RIP: io_serial_in+20] RIP: ffffffff89792594 RSP: ffffa655314979e8 RFLAGS: 00000002 RAX: ffffffff89792500 RBX: ffffffff8af428a0 RCX: 0000000000000000 RDX: 00000000000003fd RSI: 0000000000000005 RDI: ffffffff8af428a0 RBP: 0000000000002710 R8: 0000000000000004 R9: 000000000000000f R10: 0000000000000000 R11: ffffffff8acbf64f R12: 0000000000000020 R13: ffffffff8acbf698 R14: 0000000000000058 R15: 0000000000000000 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #5 [ffffa655314979e8] io_serial_in at ffffffff89792594 #6 [ffffa655314979e8] wait_for_xmitr at ffffffff89793470 #7 [ffffa65531497a08] serial8250_console_putchar at ffffffff897934f6 #8 [ffffa65531497a20] uart_console_write at ffffffff8978b605 #9 [ffffa65531497a48] serial8250_console_write at ffffffff89796558 #10 [ffffa65531497ac8] console_unlock at ffffffff89316124 #11 [ffffa65531497b10] vprintk_emit at ffffffff89317c07 #12 [ffffa65531497b68] printk at ffffffff89318306 #13 [ffffa65531497bc8] print_hex_dump at ffffffff89650765 #14 [ffffa65531497ca8] tun_do_read at ffffffffc0b06c27 [tun] #15 [ffffa65531497d38] tun_recvmsg at ffffffffc0b06e34 [tun] #16 [ffffa65531497d68] handle_rx at ffffffffc0c5d682 [vhost_net] #17 [ffffa65531497ed0] vhost_worker at ffffffffc0c644dc [vhost] #18 [ffffa65531497f10] kthread at ffffffff892d2e72 #19 [ffffa65531497f50] ret_from_fork at ffffffff89c0022f Fixes: ef3db4a59542 ("tun: avoid BUG, dump packet on GSO errors") Signed-off-by: Lei Chen Reviewed-by: Willem de Bruijn Acked-by: Jason Wang Reviewed-by: Eric Dumazet Acked-by: Michael S. Tsirkin Link: https://lore.kernel.org/r/20240415020247.2207781-1-lei.chen@smartx.com Signed-off-by: Jakub Kicinski --- drivers/net/tun.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 0b3f21cba552f..92da8c03d960c 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -2125,14 +2125,16 @@ static ssize_t tun_put_user(struct tun_struct *tun, tun_is_little_endian(tun), true, vlan_hlen)) { struct skb_shared_info *sinfo = skb_shinfo(skb); - pr_err("unexpected GSO type: " - "0x%x, gso_size %d, hdr_len %d\n", - sinfo->gso_type, tun16_to_cpu(tun, gso.gso_size), - tun16_to_cpu(tun, gso.hdr_len)); - print_hex_dump(KERN_ERR, "tun: ", - DUMP_PREFIX_NONE, - 16, 1, skb->head, - min((int)tun16_to_cpu(tun, gso.hdr_len), 64), true); + + if (net_ratelimit()) { + netdev_err(tun->dev, "unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n", + sinfo->gso_type, tun16_to_cpu(tun, gso.gso_size), + tun16_to_cpu(tun, gso.hdr_len)); + print_hex_dump(KERN_ERR, "tun: ", + DUMP_PREFIX_NONE, + 16, 1, skb->head, + min((int)tun16_to_cpu(tun, gso.hdr_len), 64), true); + } WARN_ON_ONCE(1); return -EINVAL; } -- cgit 1.2.3-korg From d59cf049c8378677053703e724808836f180888e Mon Sep 17 00:00:00 2001 From: Arınç ÜNAL Date: Sat, 13 Apr 2024 16:01:39 +0300 Subject: net: dsa: mt7530: fix mirroring frames received on local port MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This switch intellectual property provides a bit on the ARL global control register which controls allowing mirroring frames which are received on the local port (monitor port). This bit is unset after reset. This ability must be enabled to fully support the port mirroring feature on this switch intellectual property. Therefore, this patch fixes the traffic not being reflected on a port, which would be configured like below: tc qdisc add dev swp0 clsact tc filter add dev swp0 ingress matchall skip_sw \ action mirred egress mirror dev swp0 As a side note, this configuration provides the hairpinning feature for a single port. Fixes: 37feab6076aa ("net: dsa: mt7530: add support for port mirroring") Signed-off-by: Arınç ÜNAL Signed-off-by: David S. Miller --- drivers/net/dsa/mt7530.c | 6 ++++++ drivers/net/dsa/mt7530.h | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index c0d0bce0b5942..b84e1845fa028 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -2480,6 +2480,9 @@ mt7530_setup(struct dsa_switch *ds) PVC_EG_TAG(MT7530_VLAN_EG_CONSISTENT)); } + /* Allow mirroring frames received on the local port (monitor port). */ + mt7530_set(priv, MT753X_AGC, LOCAL_EN); + /* Setup VLAN ID 0 for VLAN-unaware bridges */ ret = mt7530_setup_vlan0(priv); if (ret) @@ -2591,6 +2594,9 @@ mt7531_setup_common(struct dsa_switch *ds) PVC_EG_TAG(MT7530_VLAN_EG_CONSISTENT)); } + /* Allow mirroring frames received on the local port (monitor port). */ + mt7530_set(priv, MT753X_AGC, LOCAL_EN); + /* Flush the FDB table */ ret = mt7530_fdb_cmd(priv, MT7530_FDB_FLUSH, NULL); if (ret < 0) diff --git a/drivers/net/dsa/mt7530.h b/drivers/net/dsa/mt7530.h index 585db03c05487..a08053390b285 100644 --- a/drivers/net/dsa/mt7530.h +++ b/drivers/net/dsa/mt7530.h @@ -32,6 +32,10 @@ enum mt753x_id { #define SYSC_REG_RSTCTRL 0x34 #define RESET_MCM BIT(2) +/* Register for ARL global control */ +#define MT753X_AGC 0xc +#define LOCAL_EN BIT(7) + /* Registers to mac forward control for unknown frames */ #define MT7530_MFC 0x10 #define BC_FFP(x) (((x) & 0xff) << 24) -- cgit 1.2.3-korg From 2c606d138518cc69f09c35929abc414a99e3a28f Mon Sep 17 00:00:00 2001 From: Arınç ÜNAL Date: Sat, 13 Apr 2024 16:01:40 +0300 Subject: net: dsa: mt7530: fix port mirroring for MT7988 SoC switch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "MT7988A Wi-Fi 7 Generation Router Platform: Datasheet (Open Version) v0.1" document shows bits 16 to 18 as the MIRROR_PORT field of the CPU forward control register. Currently, the MT7530 DSA subdriver configures bits 0 to 2 of the CPU forward control register which breaks the port mirroring feature for the MT7988 SoC switch. Fix this by using the MT7531_MIRROR_PORT_GET() and MT7531_MIRROR_PORT_SET() macros which utilise the correct bits. Fixes: 110c18bfed41 ("net: dsa: mt7530: introduce driver for MT7988 built-in switch") Signed-off-by: Arınç ÜNAL Acked-by: Daniel Golle Signed-off-by: David S. Miller --- drivers/net/dsa/mt7530.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index b84e1845fa028..8090390edaf9d 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -1883,14 +1883,16 @@ mt7530_port_vlan_del(struct dsa_switch *ds, int port, static int mt753x_mirror_port_get(unsigned int id, u32 val) { - return (id == ID_MT7531) ? MT7531_MIRROR_PORT_GET(val) : - MIRROR_PORT(val); + return (id == ID_MT7531 || id == ID_MT7988) ? + MT7531_MIRROR_PORT_GET(val) : + MIRROR_PORT(val); } static int mt753x_mirror_port_set(unsigned int id, u32 val) { - return (id == ID_MT7531) ? MT7531_MIRROR_PORT_SET(val) : - MIRROR_PORT(val); + return (id == ID_MT7531 || id == ID_MT7988) ? + MT7531_MIRROR_PORT_SET(val) : + MIRROR_PORT(val); } static int mt753x_port_mirror_add(struct dsa_switch *ds, int port, -- cgit 1.2.3-korg From 83781384a96b95e2b6403d3c8a002b2c89031770 Mon Sep 17 00:00:00 2001 From: Gerd Bayer Date: Mon, 15 Apr 2024 15:15:07 +0200 Subject: s390/ism: Properly fix receive message buffer allocation Since [1], dma_alloc_coherent() does not accept requests for GFP_COMP anymore, even on archs that may be able to fulfill this. Functionality that relied on the receive buffer being a compound page broke at that point: The SMC-D protocol, that utilizes the ism device driver, passes receive buffers to the splice processor in a struct splice_pipe_desc with a single entry list of struct pages. As the buffer is no longer a compound page, the splice processor now rejects requests to handle more than a page worth of data. Replace dma_alloc_coherent() and allocate a buffer with folio_alloc and create a DMA map for it with dma_map_page(). Since only receive buffers on ISM devices use DMA, qualify the mapping as FROM_DEVICE. Since ISM devices are available on arch s390, only, and on that arch all DMA is coherent, there is no need to introduce and export some kind of dma_sync_to_cpu() method to be called by the SMC-D protocol layer. Analogously, replace dma_free_coherent by a two step dma_unmap_page, then folio_put to free the receive buffer. [1] https://lore.kernel.org/all/20221113163535.884299-1-hch@lst.de/ Fixes: c08004eede4b ("s390/ism: don't pass bogus GFP_ flags to dma_alloc_coherent") Signed-off-by: Gerd Bayer Signed-off-by: David S. Miller --- drivers/s390/net/ism_drv.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index 2c8e964425dc3..43778b088ffac 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -292,13 +292,16 @@ out: static void ism_free_dmb(struct ism_dev *ism, struct ism_dmb *dmb) { clear_bit(dmb->sba_idx, ism->sba_bitmap); - dma_free_coherent(&ism->pdev->dev, dmb->dmb_len, - dmb->cpu_addr, dmb->dma_addr); + dma_unmap_page(&ism->pdev->dev, dmb->dma_addr, dmb->dmb_len, + DMA_FROM_DEVICE); + folio_put(virt_to_folio(dmb->cpu_addr)); } static int ism_alloc_dmb(struct ism_dev *ism, struct ism_dmb *dmb) { + struct folio *folio; unsigned long bit; + int rc; if (PAGE_ALIGN(dmb->dmb_len) > dma_get_max_seg_size(&ism->pdev->dev)) return -EINVAL; @@ -315,14 +318,30 @@ static int ism_alloc_dmb(struct ism_dev *ism, struct ism_dmb *dmb) test_and_set_bit(dmb->sba_idx, ism->sba_bitmap)) return -EINVAL; - dmb->cpu_addr = dma_alloc_coherent(&ism->pdev->dev, dmb->dmb_len, - &dmb->dma_addr, - GFP_KERNEL | __GFP_NOWARN | - __GFP_NOMEMALLOC | __GFP_NORETRY); - if (!dmb->cpu_addr) - clear_bit(dmb->sba_idx, ism->sba_bitmap); + folio = folio_alloc(GFP_KERNEL | __GFP_NOWARN | __GFP_NOMEMALLOC | + __GFP_NORETRY, get_order(dmb->dmb_len)); - return dmb->cpu_addr ? 0 : -ENOMEM; + if (!folio) { + rc = -ENOMEM; + goto out_bit; + } + + dmb->cpu_addr = folio_address(folio); + dmb->dma_addr = dma_map_page(&ism->pdev->dev, + virt_to_page(dmb->cpu_addr), 0, + dmb->dmb_len, DMA_FROM_DEVICE); + if (dma_mapping_error(&ism->pdev->dev, dmb->dma_addr)) { + rc = -ENOMEM; + goto out_free; + } + + return 0; + +out_free: + kfree(dmb->cpu_addr); +out_bit: + clear_bit(dmb->sba_idx, ism->sba_bitmap); + return rc; } int ism_register_dmb(struct ism_dev *ism, struct ism_dmb *dmb, -- cgit 1.2.3-korg From efefd4f00c967d00ad7abe092554ffbb70c1a793 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 17 Apr 2024 17:43:01 +0200 Subject: netfilter: nf_tables: missing iterator type in lookup walk Add missing decorator type to lookup expression and tighten WARN_ON_ONCE check in pipapo to spot earlier that this is unset. Fixes: 29b359cf6d95 ("netfilter: nft_set_pipapo: walk over current view on netlink dump") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_lookup.c | 1 + net/netfilter/nft_set_pipapo.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index a0055f510e31e..b314ca728a291 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -216,6 +216,7 @@ static int nft_lookup_validate(const struct nft_ctx *ctx, return 0; iter.genmask = nft_genmask_next(ctx->net); + iter.type = NFT_ITER_UPDATE; iter.skip = 0; iter.count = 0; iter.err = 0; diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index eeaf05ffba953..0f903d18bbea0 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -2123,7 +2123,8 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_pipapo_field *f; unsigned int i, r; - WARN_ON_ONCE(iter->type == NFT_ITER_UNSPEC); + WARN_ON_ONCE(iter->type != NFT_ITER_READ && + iter->type != NFT_ITER_UPDATE); rcu_read_lock(); if (iter->type == NFT_ITER_READ) -- cgit 1.2.3-korg From e79b47a8615d42c68aaeb68971593333667382ed Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 17 Apr 2024 17:43:11 +0200 Subject: netfilter: nf_tables: restore set elements when delete set fails From abort path, nft_mapelem_activate() needs to restore refcounters to the original state. Currently, it uses the set->ops->walk() to iterate over these set elements. The existing set iterator skips inactive elements in the next generation, this does not work from the abort path to restore the original state since it has to skip active elements instead (not inactive ones). This patch moves the check for inactive elements to the set iterator callback, then it reverses the logic for the .activate case which needs to skip active elements. Toggle next generation bit for elements when delete set command is invoked and call nft_clear() from .activate (abort) path to restore the next generation bit. The splat below shows an object in mappings memleak: [43929.457523] ------------[ cut here ]------------ [43929.457532] WARNING: CPU: 0 PID: 1139 at include/net/netfilter/nf_tables.h:1237 nft_setelem_data_deactivate+0xe4/0xf0 [nf_tables] [...] [43929.458014] RIP: 0010:nft_setelem_data_deactivate+0xe4/0xf0 [nf_tables] [43929.458076] Code: 83 f8 01 77 ab 49 8d 7c 24 08 e8 37 5e d0 de 49 8b 6c 24 08 48 8d 7d 50 e8 e9 5c d0 de 8b 45 50 8d 50 ff 89 55 50 85 c0 75 86 <0f> 0b eb 82 0f 0b eb b3 0f 1f 40 00 90 90 90 90 90 90 90 90 90 90 [43929.458081] RSP: 0018:ffff888140f9f4b0 EFLAGS: 00010246 [43929.458086] RAX: 0000000000000000 RBX: ffff8881434f5288 RCX: dffffc0000000000 [43929.458090] RDX: 00000000ffffffff RSI: ffffffffa26d28a7 RDI: ffff88810ecc9550 [43929.458093] RBP: ffff88810ecc9500 R08: 0000000000000001 R09: ffffed10281f3e8f [43929.458096] R10: 0000000000000003 R11: ffff0000ffff0000 R12: ffff8881434f52a0 [43929.458100] R13: ffff888140f9f5f4 R14: ffff888151c7a800 R15: 0000000000000002 [43929.458103] FS: 00007f0c687c4740(0000) GS:ffff888390800000(0000) knlGS:0000000000000000 [43929.458107] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [43929.458111] CR2: 00007f58dbe5b008 CR3: 0000000123602005 CR4: 00000000001706f0 [43929.458114] Call Trace: [43929.458118] [43929.458121] ? __warn+0x9f/0x1a0 [43929.458127] ? nft_setelem_data_deactivate+0xe4/0xf0 [nf_tables] [43929.458188] ? report_bug+0x1b1/0x1e0 [43929.458196] ? handle_bug+0x3c/0x70 [43929.458200] ? exc_invalid_op+0x17/0x40 [43929.458211] ? nft_setelem_data_deactivate+0xd7/0xf0 [nf_tables] [43929.458271] ? nft_setelem_data_deactivate+0xe4/0xf0 [nf_tables] [43929.458332] nft_mapelem_deactivate+0x24/0x30 [nf_tables] [43929.458392] nft_rhash_walk+0xdd/0x180 [nf_tables] [43929.458453] ? __pfx_nft_rhash_walk+0x10/0x10 [nf_tables] [43929.458512] ? rb_insert_color+0x2e/0x280 [43929.458520] nft_map_deactivate+0xdc/0x1e0 [nf_tables] [43929.458582] ? __pfx_nft_map_deactivate+0x10/0x10 [nf_tables] [43929.458642] ? __pfx_nft_mapelem_deactivate+0x10/0x10 [nf_tables] [43929.458701] ? __rcu_read_unlock+0x46/0x70 [43929.458709] nft_delset+0xff/0x110 [nf_tables] [43929.458769] nft_flush_table+0x16f/0x460 [nf_tables] [43929.458830] nf_tables_deltable+0x501/0x580 [nf_tables] Fixes: 628bd3e49cba ("netfilter: nf_tables: drop map element references from preparation phase") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 44 ++++++++++++++++++++++++++++++++++++++---- net/netfilter/nft_set_bitmap.c | 4 +--- net/netfilter/nft_set_hash.c | 8 ++------ net/netfilter/nft_set_pipapo.c | 5 +---- net/netfilter/nft_set_rbtree.c | 4 +--- 5 files changed, 45 insertions(+), 20 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a7a34db62ea93..d0c09f899e801 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -594,6 +594,12 @@ static int nft_mapelem_deactivate(const struct nft_ctx *ctx, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv) { + struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); + + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + + nft_set_elem_change_active(ctx->net, set, ext); nft_setelem_data_deactivate(ctx->net, set, elem_priv); return 0; @@ -617,6 +623,7 @@ static void nft_map_catchall_deactivate(const struct nft_ctx *ctx, if (!nft_set_elem_active(ext, genmask)) continue; + nft_set_elem_change_active(ctx->net, set, ext); nft_setelem_data_deactivate(ctx->net, set, catchall->elem); break; } @@ -3880,6 +3887,9 @@ int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_data *data; int err; + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END) return 0; @@ -3903,17 +3913,20 @@ int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set) { - u8 genmask = nft_genmask_next(ctx->net); + struct nft_set_iter dummy_iter = { + .genmask = nft_genmask_next(ctx->net), + }; struct nft_set_elem_catchall *catchall; + struct nft_set_ext *ext; int ret = 0; list_for_each_entry_rcu(catchall, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); - if (!nft_set_elem_active(ext, genmask)) + if (!nft_set_elem_active(ext, dummy_iter.genmask)) continue; - ret = nft_setelem_validate(ctx, set, NULL, catchall->elem); + ret = nft_setelem_validate(ctx, set, &dummy_iter, catchall->elem); if (ret < 0) return ret; } @@ -5402,6 +5415,11 @@ static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv) { + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); + + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + return nft_setelem_data_validate(ctx, set, elem_priv); } @@ -5494,6 +5512,13 @@ static int nft_mapelem_activate(const struct nft_ctx *ctx, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv) { + struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); + + /* called from abort path, reverse check to undo changes. */ + if (nft_set_elem_active(ext, iter->genmask)) + return 0; + + nft_clear(ctx->net, ext); nft_setelem_data_activate(ctx->net, set, elem_priv); return 0; @@ -5511,6 +5536,7 @@ static void nft_map_catchall_activate(const struct nft_ctx *ctx, if (!nft_set_elem_active(ext, genmask)) continue; + nft_clear(ctx->net, ext); nft_setelem_data_activate(ctx->net, set, catchall->elem); break; } @@ -5785,6 +5811,9 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx, const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); struct nft_set_dump_args *args; + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + if (nft_set_elem_expired(ext) || nft_set_elem_is_dead(ext)) return 0; @@ -6635,7 +6664,7 @@ static void nft_setelem_activate(struct net *net, struct nft_set *set, struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); if (nft_setelem_is_catchall(set, elem_priv)) { - nft_set_elem_change_active(net, set, ext); + nft_clear(net, ext); } else { set->ops->activate(net, set, elem_priv); } @@ -7317,8 +7346,12 @@ static int nft_setelem_flush(const struct nft_ctx *ctx, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv) { + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); struct nft_trans *trans; + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + trans = nft_trans_alloc_gfp(ctx, NFT_MSG_DELSETELEM, sizeof(struct nft_trans_elem), GFP_ATOMIC); if (!trans) @@ -10800,6 +10833,9 @@ static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx, { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END) return 0; diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 32df7a16835da..1caa04619dc6d 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -172,7 +172,7 @@ static void nft_bitmap_activate(const struct net *net, nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off); /* Enter 11 state. */ priv->bitmap[idx] |= (genmask << off); - nft_set_elem_change_active(net, set, &be->ext); + nft_clear(net, &be->ext); } static void nft_bitmap_flush(const struct net *net, @@ -222,8 +222,6 @@ static void nft_bitmap_walk(const struct nft_ctx *ctx, list_for_each_entry_rcu(be, &priv->list, head) { if (iter->count < iter->skip) goto cont; - if (!nft_set_elem_active(&be->ext, iter->genmask)) - goto cont; iter->err = iter->fn(ctx, set, iter, &be->priv); diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 6968a3b342367..daa56dda737ae 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -199,7 +199,7 @@ static void nft_rhash_activate(const struct net *net, const struct nft_set *set, { struct nft_rhash_elem *he = nft_elem_priv_cast(elem_priv); - nft_set_elem_change_active(net, set, &he->ext); + nft_clear(net, &he->ext); } static void nft_rhash_flush(const struct net *net, @@ -286,8 +286,6 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, if (iter->count < iter->skip) goto cont; - if (!nft_set_elem_active(&he->ext, iter->genmask)) - goto cont; iter->err = iter->fn(ctx, set, iter, &he->priv); if (iter->err < 0) @@ -599,7 +597,7 @@ static void nft_hash_activate(const struct net *net, const struct nft_set *set, { struct nft_hash_elem *he = nft_elem_priv_cast(elem_priv); - nft_set_elem_change_active(net, set, &he->ext); + nft_clear(net, &he->ext); } static void nft_hash_flush(const struct net *net, @@ -652,8 +650,6 @@ static void nft_hash_walk(const struct nft_ctx *ctx, struct nft_set *set, hlist_for_each_entry_rcu(he, &priv->table[i], node) { if (iter->count < iter->skip) goto cont; - if (!nft_set_elem_active(&he->ext, iter->genmask)) - goto cont; iter->err = iter->fn(ctx, set, iter, &he->priv); if (iter->err < 0) diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 0f903d18bbea0..187138afac45d 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1847,7 +1847,7 @@ static void nft_pipapo_activate(const struct net *net, { struct nft_pipapo_elem *e = nft_elem_priv_cast(elem_priv); - nft_set_elem_change_active(net, set, &e->ext); + nft_clear(net, &e->ext); } /** @@ -2149,9 +2149,6 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, e = f->mt[r].e; - if (!nft_set_elem_active(&e->ext, iter->genmask)) - goto cont; - iter->err = iter->fn(ctx, set, iter, &e->priv); if (iter->err < 0) goto out; diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 9944fe479e536..b7ea21327549b 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -532,7 +532,7 @@ static void nft_rbtree_activate(const struct net *net, { struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv); - nft_set_elem_change_active(net, set, &rbe->ext); + nft_clear(net, &rbe->ext); } static void nft_rbtree_flush(const struct net *net, @@ -600,8 +600,6 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, if (iter->count < iter->skip) goto cont; - if (!nft_set_elem_active(&rbe->ext, iter->genmask)) - goto cont; iter->err = iter->fn(ctx, set, iter, &rbe->priv); if (iter->err < 0) { -- cgit 1.2.3-korg From 86a1471d7cde792941109b93b558b5dc078b9ee9 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 17 Apr 2024 17:43:21 +0200 Subject: netfilter: nf_tables: fix memleak in map from abort path The delete set command does not rely on the transaction object for element removal, therefore, a combination of delete element + delete set from the abort path could result in restoring twice the refcount of the mapping. Check for inactive element in the next generation for the delete element command in the abort path, skip restoring state if next generation bit has been already cleared. This is similar to the activate logic using the set walk iterator. [ 6170.286929] ------------[ cut here ]------------ [ 6170.286939] WARNING: CPU: 6 PID: 790302 at net/netfilter/nf_tables_api.c:2086 nf_tables_chain_destroy+0x1f7/0x220 [nf_tables] [ 6170.287071] Modules linked in: [...] [ 6170.287633] CPU: 6 PID: 790302 Comm: kworker/6:2 Not tainted 6.9.0-rc3+ #365 [ 6170.287768] RIP: 0010:nf_tables_chain_destroy+0x1f7/0x220 [nf_tables] [ 6170.287886] Code: df 48 8d 7d 58 e8 69 2e 3b df 48 8b 7d 58 e8 80 1b 37 df 48 8d 7d 68 e8 57 2e 3b df 48 8b 7d 68 e8 6e 1b 37 df 48 89 ef eb c4 <0f> 0b 48 83 c4 08 5b 5d 41 5c 41 5d 41 5e 41 5f c3 cc cc cc cc 0f [ 6170.287895] RSP: 0018:ffff888134b8fd08 EFLAGS: 00010202 [ 6170.287904] RAX: 0000000000000001 RBX: ffff888125bffb28 RCX: dffffc0000000000 [ 6170.287912] RDX: 0000000000000003 RSI: ffffffffa20298ab RDI: ffff88811ebe4750 [ 6170.287919] RBP: ffff88811ebe4700 R08: ffff88838e812650 R09: fffffbfff0623a55 [ 6170.287926] R10: ffffffff8311d2af R11: 0000000000000001 R12: ffff888125bffb10 [ 6170.287933] R13: ffff888125bffb10 R14: dead000000000122 R15: dead000000000100 [ 6170.287940] FS: 0000000000000000(0000) GS:ffff888390b00000(0000) knlGS:0000000000000000 [ 6170.287948] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 6170.287955] CR2: 00007fd31fc00710 CR3: 0000000133f60004 CR4: 00000000001706f0 [ 6170.287962] Call Trace: [ 6170.287967] [ 6170.287973] ? __warn+0x9f/0x1a0 [ 6170.287986] ? nf_tables_chain_destroy+0x1f7/0x220 [nf_tables] [ 6170.288092] ? report_bug+0x1b1/0x1e0 [ 6170.287986] ? nf_tables_chain_destroy+0x1f7/0x220 [nf_tables] [ 6170.288092] ? report_bug+0x1b1/0x1e0 [ 6170.288104] ? handle_bug+0x3c/0x70 [ 6170.288112] ? exc_invalid_op+0x17/0x40 [ 6170.288120] ? asm_exc_invalid_op+0x1a/0x20 [ 6170.288132] ? nf_tables_chain_destroy+0x2b/0x220 [nf_tables] [ 6170.288243] ? nf_tables_chain_destroy+0x1f7/0x220 [nf_tables] [ 6170.288366] ? nf_tables_chain_destroy+0x2b/0x220 [nf_tables] [ 6170.288483] nf_tables_trans_destroy_work+0x588/0x590 [nf_tables] Fixes: 591054469b3e ("netfilter: nf_tables: revisit chain/object refcounting from elements") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d0c09f899e801..167074283ea91 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -7223,6 +7223,16 @@ void nft_data_hold(const struct nft_data *data, enum nft_data_types type) } } +static int nft_setelem_active_next(const struct net *net, + const struct nft_set *set, + struct nft_elem_priv *elem_priv) +{ + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); + u8 genmask = nft_genmask_next(net); + + return nft_set_elem_active(ext, genmask); +} + static void nft_setelem_data_activate(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv) @@ -10644,8 +10654,10 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) case NFT_MSG_DESTROYSETELEM: te = (struct nft_trans_elem *)trans->data; - nft_setelem_data_activate(net, te->set, te->elem_priv); - nft_setelem_activate(net, te->set, te->elem_priv); + if (!nft_setelem_active_next(net, te->set, te->elem_priv)) { + nft_setelem_data_activate(net, te->set, te->elem_priv); + nft_setelem_activate(net, te->set, te->elem_priv); + } if (!nft_setelem_is_catchall(te->set, te->elem_priv)) te->set->ndeact--; -- cgit 1.2.3-korg From 0f022d32c3eca477fbf79a205243a6123ed0fe11 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Apr 2024 18:07:28 -0300 Subject: net/sched: Fix mirred deadlock on device recursion When the mirred action is used on a classful egress qdisc and a packet is mirrored or redirected to self we hit a qdisc lock deadlock. See trace below. [..... other info removed for brevity....] [ 82.890906] [ 82.890906] ============================================ [ 82.890906] WARNING: possible recursive locking detected [ 82.890906] 6.8.0-05205-g77fadd89fe2d-dirty #213 Tainted: G W [ 82.890906] -------------------------------------------- [ 82.890906] ping/418 is trying to acquire lock: [ 82.890906] ffff888006994110 (&sch->q.lock){+.-.}-{3:3}, at: __dev_queue_xmit+0x1778/0x3550 [ 82.890906] [ 82.890906] but task is already holding lock: [ 82.890906] ffff888006994110 (&sch->q.lock){+.-.}-{3:3}, at: __dev_queue_xmit+0x1778/0x3550 [ 82.890906] [ 82.890906] other info that might help us debug this: [ 82.890906] Possible unsafe locking scenario: [ 82.890906] [ 82.890906] CPU0 [ 82.890906] ---- [ 82.890906] lock(&sch->q.lock); [ 82.890906] lock(&sch->q.lock); [ 82.890906] [ 82.890906] *** DEADLOCK *** [ 82.890906] [..... other info removed for brevity....] Example setup (eth0->eth0) to recreate tc qdisc add dev eth0 root handle 1: htb default 30 tc filter add dev eth0 handle 1: protocol ip prio 2 matchall \ action mirred egress redirect dev eth0 Another example(eth0->eth1->eth0) to recreate tc qdisc add dev eth0 root handle 1: htb default 30 tc filter add dev eth0 handle 1: protocol ip prio 2 matchall \ action mirred egress redirect dev eth1 tc qdisc add dev eth1 root handle 1: htb default 30 tc filter add dev eth1 handle 1: protocol ip prio 2 matchall \ action mirred egress redirect dev eth0 We fix this by adding an owner field (CPU id) to struct Qdisc set after root qdisc is entered. When the softirq enters it a second time, if the qdisc owner is the same CPU, the packet is dropped to break the loop. Reported-by: Mingshuai Ren Closes: https://lore.kernel.org/netdev/20240314111713.5979-1-renmingshuai@huawei.com/ Fixes: 3bcb846ca4cf ("net: get rid of spin_trylock() in net_tx_action()") Fixes: e578d9c02587 ("net: sched: use counter to break reclassify loops") Signed-off-by: Eric Dumazet Reviewed-by: Victor Nogueira Reviewed-by: Pedro Tammela Tested-by: Jamal Hadi Salim Acked-by: Jamal Hadi Salim Link: https://lore.kernel.org/r/20240415210728.36949-1-victor@mojatatu.com Signed-off-by: Jakub Kicinski --- include/net/sch_generic.h | 1 + net/core/dev.c | 6 ++++++ net/sched/sch_generic.c | 1 + 3 files changed, 8 insertions(+) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index cefe0c4bdae34..41ca14e81d55f 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -117,6 +117,7 @@ struct Qdisc { struct qdisc_skb_head q; struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; + int owner; unsigned long state; unsigned long state2; /* must be written under qdisc spinlock */ struct Qdisc *next_sched; diff --git a/net/core/dev.c b/net/core/dev.c index 984ff8b9d0e1a..331848eca7d31 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3775,6 +3775,10 @@ no_lock_out: return rc; } + if (unlikely(READ_ONCE(q->owner) == smp_processor_id())) { + kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP); + return NET_XMIT_DROP; + } /* * Heuristic to force contended enqueues to serialize on a * separate lock before trying to get qdisc main lock. @@ -3814,7 +3818,9 @@ no_lock_out: qdisc_run_end(q); rc = NET_XMIT_SUCCESS; } else { + WRITE_ONCE(q->owner, smp_processor_id()); rc = dev_qdisc_enqueue(skb, q, &to_free, txq); + WRITE_ONCE(q->owner, -1); if (qdisc_run_begin(q)) { if (unlikely(contended)) { spin_unlock(&q->busylock); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index ff53364937775..4a2c763e2d116 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -974,6 +974,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, sch->enqueue = ops->enqueue; sch->dequeue = ops->dequeue; sch->dev_queue = dev_queue; + sch->owner = -1; netdev_hold(dev, &sch->dev_tracker, GFP_KERNEL); refcount_set(&sch->refcnt, 1); -- cgit 1.2.3-korg From caed8eba221533123192d39cc947f45cbb1e1db5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Apr 2024 08:10:48 -0700 Subject: selftests: kselftest_harness: fix Clang warning about zero-length format Apparently it's more legal to pass the format as NULL, than it is to use an empty string. Clang complains about empty formats: ./../kselftest_harness.h:1207:30: warning: format string is empty [-Wformat-zero-length] 1207 | diagnostic ? "%s" : "", diagnostic); | ^~ 1 warning generated. Reported-by: Sean Christopherson Link: https://lore.kernel.org/all/20240409224256.1581292-1-seanjc@google.com Fixes: 378193eff339 ("selftests: kselftest_harness: let PASS / FAIL provide diagnostic") Tested-by: Sean Christopherson Reviewed-by: Muhammad Usama Anjum Link: https://lore.kernel.org/r/20240416151048.1682352-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/kselftest.h | 10 ++++++---- tools/testing/selftests/kselftest_harness.h | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index 541bf192e30e6..4eca3fd1292cf 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -288,15 +288,17 @@ void ksft_test_result_code(int exit_code, const char *test_name, } /* Docs seem to call for double space if directive is absent */ - if (!directive[0] && msg[0]) + if (!directive[0] && msg) directive = " # "; - va_start(args, msg); printf("%s %u %s%s", tap_code, ksft_test_num(), test_name, directive); errno = saved_errno; - vprintf(msg, args); + if (msg) { + va_start(args, msg); + vprintf(msg, args); + va_end(args); + } printf("\n"); - va_end(args); } static inline int ksft_exit_pass(void) diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 4fd735e48ee7e..adb15cae79abc 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -1202,7 +1202,7 @@ void __run_test(struct __fixture_metadata *f, diagnostic = "unknown"; ksft_test_result_code(t->exit_code, test_name, - diagnostic ? "%s" : "", diagnostic); + diagnostic ? "%s" : NULL, diagnostic); } static int test_harness_run(int argc, char **argv) -- cgit 1.2.3-korg From d362046021ea122309da8c8e0b6850c792ca97b5 Mon Sep 17 00:00:00 2001 From: Vanillan Wang Date: Tue, 16 Apr 2024 20:07:13 +0800 Subject: net:usb:qmi_wwan: support Rolling modules Update the qmi_wwan driver support for the Rolling LTE modules. - VID:PID 33f8:0104, RW101-GL for laptop debug M.2 cards(with RMNET interface for /Linux/Chrome OS) 0x0104: RMNET, diag, at, pipe Here are the outputs of usb-devices: T: Bus=04 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 2 Spd=5000 MxCh= 0 D: Ver= 3.20 Cls=00(>ifc ) Sub=00 Prot=00 MxPS= 9 #Cfgs= 1 P: Vendor=33f8 ProdID=0104 Rev=05.04 S: Manufacturer=Rolling Wireless S.a.r.l. S: Product=Rolling Module S: SerialNumber=ba2eb033 C: #Ifs= 6 Cfg#= 1 Atr=a0 MxPwr=896mA I: If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=83(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=84(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=85(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=40 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=86(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=87(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=50 Driver=qmi_wwan E: Ad=0f(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=88(I) Atr=03(Int.) MxPS= 8 Ivl=32ms E: Ad=8e(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=usbfs E: Ad=05(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=89(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms Signed-off-by: Vanillan Wang Link: https://lore.kernel.org/r/20240416120713.24777-1-vanillanwang@163.com Signed-off-by: Jakub Kicinski --- drivers/net/usb/qmi_wwan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index e2e181378f412..edc34402e787f 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1431,6 +1431,7 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x2692, 0x9025, 4)}, /* Cellient MPL200 (rebranded Qualcomm 05c6:9025) */ {QMI_QUIRK_SET_DTR(0x1546, 0x1312, 4)}, /* u-blox LARA-R6 01B */ {QMI_QUIRK_SET_DTR(0x1546, 0x1342, 4)}, /* u-blox LARA-L6 */ + {QMI_QUIRK_SET_DTR(0x33f8, 0x0104, 4)}, /* Rolling RW101 RMNET */ /* 4. Gobi 1000 devices */ {QMI_GOBI1K_DEVICE(0x05c6, 0x9212)}, /* Acer Gobi Modem Device */ -- cgit 1.2.3-korg From 94667949ec3bbb2218c46ad0a0e7274c8832e494 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Tue, 16 Apr 2024 10:23:29 +0200 Subject: net: ethernet: mtk_eth_soc: fix WED + wifi reset The WLAN + WED reset sequence relies on being able to receive interrupts from the card, in order to synchronize individual steps with the firmware. When WED is stopped, leave interrupts running and rely on the driver turning off unwanted ones. WED DMA also needs to be disabled before resetting. Fixes: f78cd9c783e0 ("net: ethernet: mtk_wed: update mtk_wed_stop") Signed-off-by: Felix Fietkau Link: https://lore.kernel.org/r/20240416082330.82564-1-nbd@nbd.name Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mediatek/mtk_wed.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_wed.c b/drivers/net/ethernet/mediatek/mtk_wed.c index c895e265ae0eb..61334a71058c7 100644 --- a/drivers/net/ethernet/mediatek/mtk_wed.c +++ b/drivers/net/ethernet/mediatek/mtk_wed.c @@ -1074,13 +1074,13 @@ mtk_wed_dma_disable(struct mtk_wed_device *dev) static void mtk_wed_stop(struct mtk_wed_device *dev) { + mtk_wed_dma_disable(dev); mtk_wed_set_ext_int(dev, false); wed_w32(dev, MTK_WED_WPDMA_INT_TRIGGER, 0); wed_w32(dev, MTK_WED_WDMA_INT_TRIGGER, 0); wdma_w32(dev, MTK_WDMA_INT_MASK, 0); wdma_w32(dev, MTK_WDMA_INT_GRP2, 0); - wed_w32(dev, MTK_WED_WPDMA_INT_MASK, 0); if (!mtk_wed_get_rx_capa(dev)) return; @@ -1093,7 +1093,6 @@ static void mtk_wed_deinit(struct mtk_wed_device *dev) { mtk_wed_stop(dev); - mtk_wed_dma_disable(dev); wed_clr(dev, MTK_WED_CTRL, MTK_WED_CTRL_WDMA_INT_AGENT_EN | @@ -2605,9 +2604,6 @@ mtk_wed_irq_get(struct mtk_wed_device *dev, u32 mask) static void mtk_wed_irq_set_mask(struct mtk_wed_device *dev, u32 mask) { - if (!dev->running) - return; - mtk_wed_set_ext_int(dev, !!mask); wed_w32(dev, MTK_WED_INT_MASK, mask); } -- cgit 1.2.3-korg From def52db470df28d6f43cacbd21137f03b9502073 Mon Sep 17 00:00:00 2001 From: Paul Barker Date: Tue, 16 Apr 2024 13:02:51 +0100 Subject: net: ravb: Count packets instead of descriptors in R-Car RX path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The units of "work done" in the RX path should be packets instead of descriptors. Descriptors which are used by the hardware to record error conditions or are empty in the case of a DMA mapping error should not count towards our RX work budget. Also make the limit variable unsigned as it can never be negative. Fixes: c156633f1353 ("Renesas Ethernet AVB driver proper") Signed-off-by: Paul Barker Reviewed-by: Sergey Shtylyov Reviewed-by: Niklas Söderlund Signed-off-by: Paolo Abeni --- drivers/net/ethernet/renesas/ravb_main.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index ba01c8cc3c906..7d231d011bffa 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -892,29 +892,24 @@ static bool ravb_rx_rcar(struct net_device *ndev, int *quota, int q) struct ravb_private *priv = netdev_priv(ndev); const struct ravb_hw_info *info = priv->info; int entry = priv->cur_rx[q] % priv->num_rx_ring[q]; - int boguscnt = (priv->dirty_rx[q] + priv->num_rx_ring[q]) - - priv->cur_rx[q]; struct net_device_stats *stats = &priv->stats[q]; struct ravb_ex_rx_desc *desc; + unsigned int limit, i; struct sk_buff *skb; dma_addr_t dma_addr; struct timespec64 ts; + int rx_packets = 0; u8 desc_status; u16 pkt_len; - int limit; - boguscnt = min(boguscnt, *quota); - limit = boguscnt; + limit = priv->dirty_rx[q] + priv->num_rx_ring[q] - priv->cur_rx[q]; desc = &priv->rx_ring[q].ex_desc[entry]; - while (desc->die_dt != DT_FEMPTY) { + for (i = 0; i < limit && rx_packets < *quota && desc->die_dt != DT_FEMPTY; i++) { /* Descriptor type must be checked before all other reads */ dma_rmb(); desc_status = desc->msc; pkt_len = le16_to_cpu(desc->ds_cc) & RX_DS; - if (--boguscnt < 0) - break; - /* We use 0-byte descriptors to mark the DMA mapping errors */ if (!pkt_len) continue; @@ -960,7 +955,7 @@ static bool ravb_rx_rcar(struct net_device *ndev, int *quota, int q) if (ndev->features & NETIF_F_RXCSUM) ravb_rx_csum(skb); napi_gro_receive(&priv->napi[q], skb); - stats->rx_packets++; + rx_packets++; stats->rx_bytes += pkt_len; } @@ -995,9 +990,9 @@ static bool ravb_rx_rcar(struct net_device *ndev, int *quota, int q) desc->die_dt = DT_FEMPTY; } - *quota -= limit - (++boguscnt); - - return boguscnt <= 0; + stats->rx_packets += rx_packets; + *quota -= rx_packets; + return *quota == 0; } /* Packet receive function for Ethernet AVB */ -- cgit 1.2.3-korg From a892493a343494bd6bab9d098593932077ff3c43 Mon Sep 17 00:00:00 2001 From: Paul Barker Date: Tue, 16 Apr 2024 13:02:52 +0100 Subject: net: ravb: Allow RX loop to move past DMA mapping errors The RX loops in ravb_rx_gbeth() and ravb_rx_rcar() skip to the next loop iteration if a zero-length descriptor is seen (indicating a DMA mapping error). However, the current RX descriptor index `priv->cur_rx[q]` was incremented at the end of the loop and so would not be incremented when we skip to the next loop iteration. This would cause the loop to keep seeing the same zero-length descriptor instead of moving on to the next descriptor. As the loop counter `i` still increments, the loop would eventually terminate so there is no risk of being stuck here forever - but we should still fix this to avoid wasting cycles. To fix this, the RX descriptor index is incremented at the top of the loop, in the for statement itself. The assignments of `entry` and `desc` are brought into the loop to avoid the need for duplication. Fixes: d8b48911fd24 ("ravb: fix ring memory allocation") Signed-off-by: Paul Barker Reviewed-by: Sergey Shtylyov Signed-off-by: Paolo Abeni --- drivers/net/ethernet/renesas/ravb_main.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index 7d231d011bffa..3b870926af14e 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -775,12 +775,15 @@ static bool ravb_rx_gbeth(struct net_device *ndev, int *quota, int q) int limit; int i; - entry = priv->cur_rx[q] % priv->num_rx_ring[q]; limit = priv->dirty_rx[q] + priv->num_rx_ring[q] - priv->cur_rx[q]; stats = &priv->stats[q]; - desc = &priv->rx_ring[q].desc[entry]; - for (i = 0; i < limit && rx_packets < *quota && desc->die_dt != DT_FEMPTY; i++) { + for (i = 0; i < limit; i++, priv->cur_rx[q]++) { + entry = priv->cur_rx[q] % priv->num_rx_ring[q]; + desc = &priv->rx_ring[q].desc[entry]; + if (rx_packets == *quota || desc->die_dt == DT_FEMPTY) + break; + /* Descriptor type must be checked before all other reads */ dma_rmb(); desc_status = desc->msc; @@ -848,9 +851,6 @@ static bool ravb_rx_gbeth(struct net_device *ndev, int *quota, int q) break; } } - - entry = (++priv->cur_rx[q]) % priv->num_rx_ring[q]; - desc = &priv->rx_ring[q].desc[entry]; } /* Refill the RX ring buffers. */ @@ -891,7 +891,6 @@ static bool ravb_rx_rcar(struct net_device *ndev, int *quota, int q) { struct ravb_private *priv = netdev_priv(ndev); const struct ravb_hw_info *info = priv->info; - int entry = priv->cur_rx[q] % priv->num_rx_ring[q]; struct net_device_stats *stats = &priv->stats[q]; struct ravb_ex_rx_desc *desc; unsigned int limit, i; @@ -901,10 +900,15 @@ static bool ravb_rx_rcar(struct net_device *ndev, int *quota, int q) int rx_packets = 0; u8 desc_status; u16 pkt_len; + int entry; limit = priv->dirty_rx[q] + priv->num_rx_ring[q] - priv->cur_rx[q]; - desc = &priv->rx_ring[q].ex_desc[entry]; - for (i = 0; i < limit && rx_packets < *quota && desc->die_dt != DT_FEMPTY; i++) { + for (i = 0; i < limit; i++, priv->cur_rx[q]++) { + entry = priv->cur_rx[q] % priv->num_rx_ring[q]; + desc = &priv->rx_ring[q].ex_desc[entry]; + if (rx_packets == *quota || desc->die_dt == DT_FEMPTY) + break; + /* Descriptor type must be checked before all other reads */ dma_rmb(); desc_status = desc->msc; @@ -958,9 +962,6 @@ static bool ravb_rx_rcar(struct net_device *ndev, int *quota, int q) rx_packets++; stats->rx_bytes += pkt_len; } - - entry = (++priv->cur_rx[q]) % priv->num_rx_ring[q]; - desc = &priv->rx_ring[q].ex_desc[entry]; } /* Refill the RX ring buffers. */ -- cgit 1.2.3-korg From c7c449502b51c5b5de79f97a42be750b28f6ecee Mon Sep 17 00:00:00 2001 From: Paul Barker Date: Tue, 16 Apr 2024 13:02:53 +0100 Subject: net: ravb: Fix GbEth jumbo packet RX checksum handling Sending a 7kB ping packet to the RZ/G2L in v6.9-rc2 causes the following backtrace: WARNING: CPU: 0 PID: 0 at include/linux/skbuff.h:3127 skb_trim+0x30/0x38 Hardware name: Renesas SMARC EVK based on r9a07g044l2 (DT) pc : skb_trim+0x30/0x38 lr : ravb_rx_csum_gbeth+0x40/0x90 Call trace: skb_trim+0x30/0x38 ravb_rx_gbeth+0x56c/0x5cc ravb_poll+0xa0/0x204 __napi_poll+0x38/0x17c This is caused by ravb_rx_gbeth() calling ravb_rx_csum_gbeth() with the wrong skb for a packet which spans multiple descriptors. To fix this, use the correct skb. Fixes: c2da9408579d ("ravb: Add Rx checksum offload support for GbEth") Signed-off-by: Paul Barker Reviewed-by: Sergey Shtylyov Signed-off-by: Paolo Abeni --- drivers/net/ethernet/renesas/ravb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index 3b870926af14e..6969cdeeb67ab 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -843,7 +843,7 @@ static bool ravb_rx_gbeth(struct net_device *ndev, int *quota, int q) priv->rx_1st_skb->protocol = eth_type_trans(priv->rx_1st_skb, ndev); if (ndev->features & NETIF_F_RXCSUM) - ravb_rx_csum_gbeth(skb); + ravb_rx_csum_gbeth(priv->rx_1st_skb); napi_gro_receive(&priv->napi[q], priv->rx_1st_skb); rx_packets++; -- cgit 1.2.3-korg From 2e36c9fbc476f95a1b19e3fa0a2cdf408475ff56 Mon Sep 17 00:00:00 2001 From: Paul Barker Date: Tue, 16 Apr 2024 13:02:54 +0100 Subject: net: ravb: Fix RX byte accounting for jumbo packets The RX byte accounting for jumbo packets was changed to fix a potential use-after-free bug. However, that fix used the wrong variable and so only accounted for the number of bytes in the final descriptor, not the number of bytes in the whole packet. To fix this, we can simply update our stats with the correct number of bytes before calling napi_gro_receive(). Also rename pkt_len to desc_len in ravb_rx_gbeth() to avoid any future confusion. The variable name pkt_len is correct in ravb_rx_rcar() as that function does not handle packets spanning multiple descriptors. Fixes: 5a5a3e564de6 ("ravb: Fix potential use-after-free in ravb_rx_gbeth()") Signed-off-by: Paul Barker Reviewed-by: Sergey Shtylyov Signed-off-by: Paolo Abeni --- drivers/net/ethernet/renesas/ravb_main.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index 6969cdeeb67ab..fcb756d77681c 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -769,7 +769,7 @@ static bool ravb_rx_gbeth(struct net_device *ndev, int *quota, int q) dma_addr_t dma_addr; int rx_packets = 0; u8 desc_status; - u16 pkt_len; + u16 desc_len; u8 die_dt; int entry; int limit; @@ -787,10 +787,10 @@ static bool ravb_rx_gbeth(struct net_device *ndev, int *quota, int q) /* Descriptor type must be checked before all other reads */ dma_rmb(); desc_status = desc->msc; - pkt_len = le16_to_cpu(desc->ds_cc) & RX_DS; + desc_len = le16_to_cpu(desc->ds_cc) & RX_DS; /* We use 0-byte descriptors to mark the DMA mapping errors */ - if (!pkt_len) + if (!desc_len) continue; if (desc_status & MSC_MC) @@ -811,25 +811,25 @@ static bool ravb_rx_gbeth(struct net_device *ndev, int *quota, int q) switch (die_dt) { case DT_FSINGLE: skb = ravb_get_skb_gbeth(ndev, entry, desc); - skb_put(skb, pkt_len); + skb_put(skb, desc_len); skb->protocol = eth_type_trans(skb, ndev); if (ndev->features & NETIF_F_RXCSUM) ravb_rx_csum_gbeth(skb); napi_gro_receive(&priv->napi[q], skb); rx_packets++; - stats->rx_bytes += pkt_len; + stats->rx_bytes += desc_len; break; case DT_FSTART: priv->rx_1st_skb = ravb_get_skb_gbeth(ndev, entry, desc); - skb_put(priv->rx_1st_skb, pkt_len); + skb_put(priv->rx_1st_skb, desc_len); break; case DT_FMID: skb = ravb_get_skb_gbeth(ndev, entry, desc); skb_copy_to_linear_data_offset(priv->rx_1st_skb, priv->rx_1st_skb->len, skb->data, - pkt_len); - skb_put(priv->rx_1st_skb, pkt_len); + desc_len); + skb_put(priv->rx_1st_skb, desc_len); dev_kfree_skb(skb); break; case DT_FEND: @@ -837,17 +837,17 @@ static bool ravb_rx_gbeth(struct net_device *ndev, int *quota, int q) skb_copy_to_linear_data_offset(priv->rx_1st_skb, priv->rx_1st_skb->len, skb->data, - pkt_len); - skb_put(priv->rx_1st_skb, pkt_len); + desc_len); + skb_put(priv->rx_1st_skb, desc_len); dev_kfree_skb(skb); priv->rx_1st_skb->protocol = eth_type_trans(priv->rx_1st_skb, ndev); if (ndev->features & NETIF_F_RXCSUM) ravb_rx_csum_gbeth(priv->rx_1st_skb); + stats->rx_bytes += priv->rx_1st_skb->len; napi_gro_receive(&priv->napi[q], priv->rx_1st_skb); rx_packets++; - stats->rx_bytes += pkt_len; break; } } -- cgit 1.2.3-korg From 56f78615bcb1c3ba58a5d9911bad3d9185cf141b Mon Sep 17 00:00:00 2001 From: Jose Ignacio Tornos Martinez Date: Wed, 17 Apr 2024 10:55:13 +0200 Subject: net: usb: ax88179_178a: avoid writing the mac address before first reading After the commit d2689b6a86b9 ("net: usb: ax88179_178a: avoid two consecutive device resets"), reset operation, in which the default mac address from the device is read, is not executed from bind operation and the random address, that is pregenerated just in case, is direclty written the first time in the device, so the default one from the device is not even read. This writing is not dangerous because is volatile and the default mac address is not missed. In order to avoid this and keep the simplification to have only one reset and reduce the delays, restore the reset from bind operation and remove the reset that is commanded from open operation. The behavior is the same but everything is ready for usbnet_probe. Tested with ASIX AX88179 USB Gigabit Ethernet devices. Restore the old behavior for the rest of possible devices because I don't have the hardware to test. cc: stable@vger.kernel.org # 6.6+ Fixes: d2689b6a86b9 ("net: usb: ax88179_178a: avoid two consecutive device resets") Reported-by: Jarkko Palviainen Signed-off-by: Jose Ignacio Tornos Martinez Link: https://lore.kernel.org/r/20240417085524.219532-1-jtornosm@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/usb/ax88179_178a.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/usb/ax88179_178a.c b/drivers/net/usb/ax88179_178a.c index a9c418890a1ca..752f821a19901 100644 --- a/drivers/net/usb/ax88179_178a.c +++ b/drivers/net/usb/ax88179_178a.c @@ -1317,6 +1317,8 @@ static int ax88179_bind(struct usbnet *dev, struct usb_interface *intf) netif_set_tso_max_size(dev->net, 16384); + ax88179_reset(dev); + return 0; } @@ -1695,7 +1697,6 @@ static const struct driver_info ax88179_info = { .unbind = ax88179_unbind, .status = ax88179_status, .link_reset = ax88179_link_reset, - .reset = ax88179_reset, .stop = ax88179_stop, .flags = FLAG_ETHER | FLAG_FRAMING_AX, .rx_fixup = ax88179_rx_fixup, @@ -1708,7 +1709,6 @@ static const struct driver_info ax88178a_info = { .unbind = ax88179_unbind, .status = ax88179_status, .link_reset = ax88179_link_reset, - .reset = ax88179_reset, .stop = ax88179_stop, .flags = FLAG_ETHER | FLAG_FRAMING_AX, .rx_fixup = ax88179_rx_fixup, -- cgit 1.2.3-korg From c24cd679b075b0e953ea167b0aa2b2d59e4eba7f Mon Sep 17 00:00:00 2001 From: Siddharth Vadapalli Date: Wed, 17 Apr 2024 15:24:25 +0530 Subject: net: ethernet: ti: am65-cpsw-nuss: cleanup DMA Channels before using them The TX and RX DMA Channels used by the driver to exchange data with CPSW are not guaranteed to be in a clean state during driver initialization. The Bootloader could have used the same DMA Channels without cleaning them up in the event of failure. Thus, reset and disable the DMA Channels to ensure that they are in a clean state before using them. Fixes: 93a76530316a ("net: ethernet: ti: introduce am65x/j721e gigabit eth subsystem driver") Reported-by: Schuyler Patton Signed-off-by: Siddharth Vadapalli Reviewed-by: Roger Quadros Link: https://lore.kernel.org/r/20240417095425.2253876-1-s-vadapalli@ti.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 2939a21ca74f3..1d00e21808c1c 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -2793,6 +2793,8 @@ static void am65_cpsw_unregister_devlink(struct am65_cpsw_common *common) static int am65_cpsw_nuss_register_ndevs(struct am65_cpsw_common *common) { + struct am65_cpsw_rx_chn *rx_chan = &common->rx_chns; + struct am65_cpsw_tx_chn *tx_chan = common->tx_chns; struct device *dev = common->dev; struct am65_cpsw_port *port; int ret = 0, i; @@ -2805,6 +2807,22 @@ static int am65_cpsw_nuss_register_ndevs(struct am65_cpsw_common *common) if (ret) return ret; + /* The DMA Channels are not guaranteed to be in a clean state. + * Reset and disable them to ensure that they are back to the + * clean state and ready to be used. + */ + for (i = 0; i < common->tx_ch_num; i++) { + k3_udma_glue_reset_tx_chn(tx_chan[i].tx_chn, &tx_chan[i], + am65_cpsw_nuss_tx_cleanup); + k3_udma_glue_disable_tx_chn(tx_chan[i].tx_chn); + } + + for (i = 0; i < AM65_CPSW_MAX_RX_FLOWS; i++) + k3_udma_glue_reset_rx_chn(rx_chan->rx_chn, i, rx_chan, + am65_cpsw_nuss_rx_cleanup, !!i); + + k3_udma_glue_disable_rx_chn(rx_chan->rx_chn); + ret = am65_cpsw_nuss_register_devlink(common); if (ret) return ret; -- cgit 1.2.3-korg