From 0e07e25b481aa021e4b48085ecb8a049e9614510 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 30 Mar 2021 16:24:11 +0200 Subject: netfilter: flowtable: fix NAT IPv6 offload mangling Fix out-of-bound access in the address array. Fixes: 5c27d8d76ce8 ("netfilter: nf_flow_table_offload: add IPv6 support") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_offload.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 2a6993fa40d78d..1c5460e7bce875 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -305,12 +305,12 @@ static void flow_offload_ipv6_mangle(struct nf_flow_rule *flow_rule, const __be32 *addr, const __be32 *mask) { struct flow_action_entry *entry; - int i; + int i, j; - for (i = 0; i < sizeof(struct in6_addr) / sizeof(u32); i += sizeof(u32)) { + for (i = 0, j = 0; i < sizeof(struct in6_addr) / sizeof(u32); i += sizeof(u32), j++) { entry = flow_action_entry_next(flow_rule); flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP6, - offset + i, &addr[i], mask); + offset + i, &addr[j], mask); } } -- cgit 1.2.3-korg From fbea31808ca124dd73ff6bb1e67c9af4607c3e32 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 31 Mar 2021 01:04:45 +0200 Subject: netfilter: conntrack: do not print icmpv6 as unknown via /proc /proc/net/nf_conntrack shows icmpv6 as unknown. Fixes: 09ec82f5af99 ("netfilter: conntrack: remove protocol name from l4proto struct") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_standalone.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 0ee702d374b028..c6c0cb46566456 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -266,6 +266,7 @@ static const char* l4proto_name(u16 proto) case IPPROTO_GRE: return "gre"; case IPPROTO_SCTP: return "sctp"; case IPPROTO_UDPLITE: return "udplite"; + case IPPROTO_ICMPV6: return "icmpv6"; } return "unknown"; -- cgit 1.2.3-korg From afd0be7299533bb2e2b09104399d8a467ecbd2c5 Mon Sep 17 00:00:00 2001 From: Ciara Loftus Date: Thu, 8 Apr 2021 05:20:09 +0000 Subject: libbpf: Fix potential NULL pointer dereference Wait until after the UMEM is checked for null to dereference it. Fixes: 43f1bc1efff1 ("libbpf: Restore umem state after socket create failure") Signed-off-by: Ciara Loftus Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210408052009.7844-1-ciara.loftus@intel.com --- tools/lib/bpf/xsk.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c index d24b5cc720ec67..007fe5d5943861 100644 --- a/tools/lib/bpf/xsk.c +++ b/tools/lib/bpf/xsk.c @@ -852,18 +852,19 @@ int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, struct xsk_ring_cons *comp, const struct xsk_socket_config *usr_config) { + bool unmap, rx_setup_done = false, tx_setup_done = false; void *rx_map = NULL, *tx_map = NULL; struct sockaddr_xdp sxdp = {}; struct xdp_mmap_offsets off; struct xsk_socket *xsk; struct xsk_ctx *ctx; int err, ifindex; - bool unmap = umem->fill_save != fill; - bool rx_setup_done = false, tx_setup_done = false; if (!umem || !xsk_ptr || !(rx || tx)) return -EFAULT; + unmap = umem->fill_save != fill; + xsk = calloc(1, sizeof(*xsk)); if (!xsk) return -ENOMEM; -- cgit 1.2.3-korg From b895bdf5d643b6feb7c60856326dd4feb6981560 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 9 Apr 2021 08:49:39 -0700 Subject: netfilter: nft_limit: avoid possible divide error in nft_limit_init div_u64() divides u64 by u32. nft_limit_init() wants to divide u64 by u64, use the appropriate math function (div64_u64) divide error: 0000 [#1] PREEMPT SMP KASAN CPU: 1 PID: 8390 Comm: syz-executor188 Not tainted 5.12.0-rc4-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:div_u64_rem include/linux/math64.h:28 [inline] RIP: 0010:div_u64 include/linux/math64.h:127 [inline] RIP: 0010:nft_limit_init+0x2a2/0x5e0 net/netfilter/nft_limit.c:85 Code: ef 4c 01 eb 41 0f 92 c7 48 89 de e8 38 a5 22 fa 4d 85 ff 0f 85 97 02 00 00 e8 ea 9e 22 fa 4c 0f af f3 45 89 ed 31 d2 4c 89 f0 <49> f7 f5 49 89 c6 e8 d3 9e 22 fa 48 8d 7d 48 48 b8 00 00 00 00 00 RSP: 0018:ffffc90009447198 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000200000000000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffffffff875152e6 RDI: 0000000000000003 RBP: ffff888020f80908 R08: 0000200000000000 R09: 0000000000000000 R10: ffffffff875152d8 R11: 0000000000000000 R12: ffffc90009447270 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 FS: 000000000097a300(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000200001c4 CR3: 0000000026a52000 CR4: 00000000001506e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: nf_tables_newexpr net/netfilter/nf_tables_api.c:2675 [inline] nft_expr_init+0x145/0x2d0 net/netfilter/nf_tables_api.c:2713 nft_set_elem_expr_alloc+0x27/0x280 net/netfilter/nf_tables_api.c:5160 nf_tables_newset+0x1997/0x3150 net/netfilter/nf_tables_api.c:4321 nfnetlink_rcv_batch+0x85a/0x21b0 net/netfilter/nfnetlink.c:456 nfnetlink_rcv_skb_batch net/netfilter/nfnetlink.c:580 [inline] nfnetlink_rcv+0x3af/0x420 net/netfilter/nfnetlink.c:598 netlink_unicast_kernel net/netlink/af_netlink.c:1312 [inline] netlink_unicast+0x533/0x7d0 net/netlink/af_netlink.c:1338 netlink_sendmsg+0x856/0xd90 net/netlink/af_netlink.c:1927 sock_sendmsg_nosec net/socket.c:654 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:674 ____sys_sendmsg+0x6e8/0x810 net/socket.c:2350 ___sys_sendmsg+0xf3/0x170 net/socket.c:2404 __sys_sendmsg+0xe5/0x1b0 net/socket.c:2433 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xae Fixes: c26844eda9d4 ("netfilter: nf_tables: Fix nft limit burst handling") Fixes: 3e0f64b7dd31 ("netfilter: nft_limit: fix packet ratelimiting") Signed-off-by: Eric Dumazet Diagnosed-by: Luigi Rizzo Reported-by: syzbot Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_limit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c index 0e2c315c3b5ed5..82ec27bdf94120 100644 --- a/net/netfilter/nft_limit.c +++ b/net/netfilter/nft_limit.c @@ -76,13 +76,13 @@ static int nft_limit_init(struct nft_limit *limit, return -EOVERFLOW; if (pkts) { - tokens = div_u64(limit->nsecs, limit->rate) * limit->burst; + tokens = div64_u64(limit->nsecs, limit->rate) * limit->burst; } else { /* The token bucket size limits the number of tokens can be * accumulated. tokens_max specifies the bucket size. * tokens_max = unit * (rate + burst) / rate. */ - tokens = div_u64(limit->nsecs * (limit->rate + limit->burst), + tokens = div64_u64(limit->nsecs * (limit->rate + limit->burst), limit->rate); } -- cgit 1.2.3-korg From 7ee3c61dcd28bf6e290e06ad382f13511dc790e9 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 7 Apr 2021 21:43:39 +0200 Subject: netfilter: bridge: add pre_exit hooks for ebtable unregistration Just like ip/ip6/arptables, the hooks have to be removed, then synchronize_rcu() has to be called to make sure no more packets are being processed before the ruleset data is released. Place the hook unregistration in the pre_exit hook, then call the new ebtables pre_exit function from there. Years ago, when first netns support got added for netfilter+ebtables, this used an older (now removed) netfilter hook unregister API, that did a unconditional synchronize_rcu(). Now that all is done with call_rcu, ebtable_{filter,nat,broute} pernet exit handlers may free the ebtable ruleset while packets are still in flight. This can only happens on module removal, not during netns exit. The new function expects the table name, not the table struct. This is because upcoming patch set (targeting -next) will remove all net->xt.{nat,filter,broute}_table instances, this makes it necessary to avoid external references to those member variables. The existing APIs will be converted, so follow the upcoming scheme of passing name + hook type instead. Fixes: aee12a0a3727e ("ebtables: remove nf_hook_register usage") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_bridge/ebtables.h | 5 +++-- net/bridge/netfilter/ebtable_broute.c | 8 +++++++- net/bridge/netfilter/ebtable_filter.c | 8 +++++++- net/bridge/netfilter/ebtable_nat.c | 8 +++++++- net/bridge/netfilter/ebtables.c | 30 +++++++++++++++++++++++++++--- 5 files changed, 51 insertions(+), 8 deletions(-) diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h index 2f5c4e6ecd8a4c..3a956145a25cb6 100644 --- a/include/linux/netfilter_bridge/ebtables.h +++ b/include/linux/netfilter_bridge/ebtables.h @@ -110,8 +110,9 @@ extern int ebt_register_table(struct net *net, const struct ebt_table *table, const struct nf_hook_ops *ops, struct ebt_table **res); -extern void ebt_unregister_table(struct net *net, struct ebt_table *table, - const struct nf_hook_ops *); +extern void ebt_unregister_table(struct net *net, struct ebt_table *table); +void ebt_unregister_table_pre_exit(struct net *net, const char *tablename, + const struct nf_hook_ops *ops); extern unsigned int ebt_do_table(struct sk_buff *skb, const struct nf_hook_state *state, struct ebt_table *table); diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c index 66e7af16549436..32bc2821027f37 100644 --- a/net/bridge/netfilter/ebtable_broute.c +++ b/net/bridge/netfilter/ebtable_broute.c @@ -105,14 +105,20 @@ static int __net_init broute_net_init(struct net *net) &net->xt.broute_table); } +static void __net_exit broute_net_pre_exit(struct net *net) +{ + ebt_unregister_table_pre_exit(net, "broute", &ebt_ops_broute); +} + static void __net_exit broute_net_exit(struct net *net) { - ebt_unregister_table(net, net->xt.broute_table, &ebt_ops_broute); + ebt_unregister_table(net, net->xt.broute_table); } static struct pernet_operations broute_net_ops = { .init = broute_net_init, .exit = broute_net_exit, + .pre_exit = broute_net_pre_exit, }; static int __init ebtable_broute_init(void) diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c index 78cb9b21022d0a..bcf982e12f16b9 100644 --- a/net/bridge/netfilter/ebtable_filter.c +++ b/net/bridge/netfilter/ebtable_filter.c @@ -99,14 +99,20 @@ static int __net_init frame_filter_net_init(struct net *net) &net->xt.frame_filter); } +static void __net_exit frame_filter_net_pre_exit(struct net *net) +{ + ebt_unregister_table_pre_exit(net, "filter", ebt_ops_filter); +} + static void __net_exit frame_filter_net_exit(struct net *net) { - ebt_unregister_table(net, net->xt.frame_filter, ebt_ops_filter); + ebt_unregister_table(net, net->xt.frame_filter); } static struct pernet_operations frame_filter_net_ops = { .init = frame_filter_net_init, .exit = frame_filter_net_exit, + .pre_exit = frame_filter_net_pre_exit, }; static int __init ebtable_filter_init(void) diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c index 0888936ef8537e..0d092773f81618 100644 --- a/net/bridge/netfilter/ebtable_nat.c +++ b/net/bridge/netfilter/ebtable_nat.c @@ -99,14 +99,20 @@ static int __net_init frame_nat_net_init(struct net *net) &net->xt.frame_nat); } +static void __net_exit frame_nat_net_pre_exit(struct net *net) +{ + ebt_unregister_table_pre_exit(net, "nat", ebt_ops_nat); +} + static void __net_exit frame_nat_net_exit(struct net *net) { - ebt_unregister_table(net, net->xt.frame_nat, ebt_ops_nat); + ebt_unregister_table(net, net->xt.frame_nat); } static struct pernet_operations frame_nat_net_ops = { .init = frame_nat_net_init, .exit = frame_nat_net_exit, + .pre_exit = frame_nat_net_pre_exit, }; static int __init ebtable_nat_init(void) diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index ebe33b60efd6b0..d481ff24a15016 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1232,10 +1232,34 @@ out: return ret; } -void ebt_unregister_table(struct net *net, struct ebt_table *table, - const struct nf_hook_ops *ops) +static struct ebt_table *__ebt_find_table(struct net *net, const char *name) +{ + struct ebt_table *t; + + mutex_lock(&ebt_mutex); + + list_for_each_entry(t, &net->xt.tables[NFPROTO_BRIDGE], list) { + if (strcmp(t->name, name) == 0) { + mutex_unlock(&ebt_mutex); + return t; + } + } + + mutex_unlock(&ebt_mutex); + return NULL; +} + +void ebt_unregister_table_pre_exit(struct net *net, const char *name, const struct nf_hook_ops *ops) +{ + struct ebt_table *table = __ebt_find_table(net, name); + + if (table) + nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); +} +EXPORT_SYMBOL(ebt_unregister_table_pre_exit); + +void ebt_unregister_table(struct net *net, struct ebt_table *table) { - nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); __ebt_unregister_table(net, table); } -- cgit 1.2.3-korg From d163a925ebbc6eb5b562b0f1d72c7e817aa75c40 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 7 Apr 2021 21:43:40 +0200 Subject: netfilter: arp_tables: add pre_exit hook for table unregister Same problem that also existed in iptables/ip(6)tables, when arptable_filter is removed there is no longer a wait period before the table/ruleset is free'd. Unregister the hook in pre_exit, then remove the table in the exit function. This used to work correctly because the old nf_hook_unregister API did unconditional synchronize_net. The per-net hook unregister function uses call_rcu instead. Fixes: b9e69e127397 ("netfilter: xtables: don't hook tables by default") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_arp/arp_tables.h | 5 +++-- net/ipv4/netfilter/arp_tables.c | 9 +++++++-- net/ipv4/netfilter/arptable_filter.c | 10 +++++++++- 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/include/linux/netfilter_arp/arp_tables.h b/include/linux/netfilter_arp/arp_tables.h index 7d3537c40ec95a..26a13294318cf7 100644 --- a/include/linux/netfilter_arp/arp_tables.h +++ b/include/linux/netfilter_arp/arp_tables.h @@ -52,8 +52,9 @@ extern void *arpt_alloc_initial_table(const struct xt_table *); int arpt_register_table(struct net *net, const struct xt_table *table, const struct arpt_replace *repl, const struct nf_hook_ops *ops, struct xt_table **res); -void arpt_unregister_table(struct net *net, struct xt_table *table, - const struct nf_hook_ops *ops); +void arpt_unregister_table(struct net *net, struct xt_table *table); +void arpt_unregister_table_pre_exit(struct net *net, struct xt_table *table, + const struct nf_hook_ops *ops); extern unsigned int arpt_do_table(struct sk_buff *skb, const struct nf_hook_state *state, struct xt_table *table); diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index d1e04d2b5170ec..6c26533480dd1a 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1539,10 +1539,15 @@ out_free: return ret; } -void arpt_unregister_table(struct net *net, struct xt_table *table, - const struct nf_hook_ops *ops) +void arpt_unregister_table_pre_exit(struct net *net, struct xt_table *table, + const struct nf_hook_ops *ops) { nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); +} +EXPORT_SYMBOL(arpt_unregister_table_pre_exit); + +void arpt_unregister_table(struct net *net, struct xt_table *table) +{ __arpt_unregister_table(net, table); } diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index c216b9ad3bb24d..6c300ba5634e2e 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -56,16 +56,24 @@ static int __net_init arptable_filter_table_init(struct net *net) return err; } +static void __net_exit arptable_filter_net_pre_exit(struct net *net) +{ + if (net->ipv4.arptable_filter) + arpt_unregister_table_pre_exit(net, net->ipv4.arptable_filter, + arpfilter_ops); +} + static void __net_exit arptable_filter_net_exit(struct net *net) { if (!net->ipv4.arptable_filter) return; - arpt_unregister_table(net, net->ipv4.arptable_filter, arpfilter_ops); + arpt_unregister_table(net, net->ipv4.arptable_filter); net->ipv4.arptable_filter = NULL; } static struct pernet_operations arptable_filter_net_ops = { .exit = arptable_filter_net_exit, + .pre_exit = arptable_filter_net_pre_exit, }; static int __init arptable_filter_init(void) -- cgit 1.2.3-korg From 4af2178ac605faf32ebe638f7ac17d841d40ea9b Mon Sep 17 00:00:00 2001 From: Joakim Zhang Date: Fri, 9 Apr 2021 17:11:45 +0800 Subject: MAINTAINERS: update maintainer entry for freescale fec driver Update maintainer entry for freescale fec driver. Suggested-by: Heiner Kallweit Signed-off-by: Joakim Zhang Signed-off-by: David S. Miller --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index ccd9228350cfcb..163264c282eb82 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7089,7 +7089,7 @@ S: Maintained F: drivers/i2c/busses/i2c-cpm.c FREESCALE IMX / MXC FEC DRIVER -M: Fugang Duan +M: Joakim Zhang L: netdev@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/net/fsl-fec.txt -- cgit 1.2.3-korg From 31457db3750c0b0ed229d836f2609fdb8a5b790e Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 11 Apr 2021 11:02:08 +0200 Subject: net: davicom: Fix regulator not turned off on failed probe When the probe fails, we must disable the regulator that was previously enabled. This patch is a follow-up to commit ac88c531a5b3 ("net: davicom: Fix regulator not turned off on failed probe") which missed one case. Fixes: 7994fe55a4a2 ("dm9000: Add regulator and reset support to dm9000") Signed-off-by: Christophe JAILLET Signed-off-by: David S. Miller --- drivers/net/ethernet/davicom/dm9000.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/davicom/dm9000.c b/drivers/net/ethernet/davicom/dm9000.c index 252adfa5d837ba..8a9096aa85cdfe 100644 --- a/drivers/net/ethernet/davicom/dm9000.c +++ b/drivers/net/ethernet/davicom/dm9000.c @@ -1471,8 +1471,10 @@ dm9000_probe(struct platform_device *pdev) /* Init network device */ ndev = alloc_etherdev(sizeof(struct board_info)); - if (!ndev) - return -ENOMEM; + if (!ndev) { + ret = -ENOMEM; + goto out_regulator_disable; + } SET_NETDEV_DEV(ndev, &pdev->dev); -- cgit 1.2.3-korg From 6628ddfec7580882f11fdc5c194a8ea781fdadfa Mon Sep 17 00:00:00 2001 From: Phillip Potter Date: Sun, 11 Apr 2021 12:28:24 +0100 Subject: net: geneve: check skb is large enough for IPv4/IPv6 header Check within geneve_xmit_skb/geneve6_xmit_skb that sk_buff structure is large enough to include IPv4 or IPv6 header, and reject if not. The geneve_xmit_skb portion and overall idea was contributed by Eric Dumazet. Fixes a KMSAN-found uninit-value bug reported by syzbot at: https://syzkaller.appspot.com/bug?id=abe95dc3e3e9667fc23b8d81f29ecad95c6f106f Suggested-by: Eric Dumazet Reported-by: syzbot+2e406a9ac75bb71d4b7a@syzkaller.appspotmail.com Signed-off-by: Phillip Potter Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/geneve.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index d5b1e48e0c0902..42f31c6818462e 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -891,6 +891,9 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, __be16 sport; int err; + if (!pskb_network_may_pull(skb, sizeof(struct iphdr))) + return -EINVAL; + sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true); rt = geneve_get_v4_rt(skb, dev, gs4, &fl4, info, geneve->cfg.info.key.tp_dst, sport); @@ -985,6 +988,9 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, __be16 sport; int err; + if (!pskb_network_may_pull(skb, sizeof(struct ipv6hdr))) + return -EINVAL; + sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true); dst = geneve_get_v6_dst(skb, dev, gs6, &fl6, info, geneve->cfg.info.key.tp_dst, sport); -- cgit 1.2.3-korg From 1fe976d308acb6374c899a4ee8025a0a016e453e Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Mon, 12 Apr 2021 18:57:39 +0200 Subject: net: phy: marvell: fix detection of PHY on Topaz switches MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit fee2d546414d ("net: phy: marvell: mv88e6390 temperature sensor reading"), Linux reports the temperature of Topaz hwmon as constant -75°C. This is because switches from the Topaz family (88E6141 / 88E6341) have the address of the temperature sensor register different from Peridot. This address is instead compatible with 88E1510 PHYs, as was used for Topaz before the above mentioned commit. Create a new mapping table between switch family and PHY ID for families which don't have a model number. And define PHY IDs for Topaz and Peridot families. Create a new PHY ID and a new PHY driver for Topaz's internal PHY. The only difference from Peridot's PHY driver is the HWMON probing method. Prior this change Topaz's internal PHY is detected by kernel as: PHY [...] driver [Marvell 88E6390] (irq=63) And afterwards as: PHY [...] driver [Marvell 88E6341 Family] (irq=63) Signed-off-by: Pali Rohár BugLink: https://github.com/globalscaletechnologies/linux/issues/1 Fixes: fee2d546414d ("net: phy: marvell: mv88e6390 temperature sensor reading") Reviewed-by: Marek Behún Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/dsa/mv88e6xxx/chip.c | 30 +++++++++++++----------------- drivers/net/phy/marvell.c | 32 +++++++++++++++++++++++++++++--- include/linux/marvell_phy.h | 5 +++-- 3 files changed, 45 insertions(+), 22 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 903d619e08ed1e..e08bf937714009 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -3026,10 +3026,17 @@ out_resources: return err; } +/* prod_id for switch families which do not have a PHY model number */ +static const u16 family_prod_id_table[] = { + [MV88E6XXX_FAMILY_6341] = MV88E6XXX_PORT_SWITCH_ID_PROD_6341, + [MV88E6XXX_FAMILY_6390] = MV88E6XXX_PORT_SWITCH_ID_PROD_6390, +}; + static int mv88e6xxx_mdio_read(struct mii_bus *bus, int phy, int reg) { struct mv88e6xxx_mdio_bus *mdio_bus = bus->priv; struct mv88e6xxx_chip *chip = mdio_bus->chip; + u16 prod_id; u16 val; int err; @@ -3040,23 +3047,12 @@ static int mv88e6xxx_mdio_read(struct mii_bus *bus, int phy, int reg) err = chip->info->ops->phy_read(chip, bus, phy, reg, &val); mv88e6xxx_reg_unlock(chip); - if (reg == MII_PHYSID2) { - /* Some internal PHYs don't have a model number. */ - if (chip->info->family != MV88E6XXX_FAMILY_6165) - /* Then there is the 6165 family. It gets is - * PHYs correct. But it can also have two - * SERDES interfaces in the PHY address - * space. And these don't have a model - * number. But they are not PHYs, so we don't - * want to give them something a PHY driver - * will recognise. - * - * Use the mv88e6390 family model number - * instead, for anything which really could be - * a PHY, - */ - if (!(val & 0x3f0)) - val |= MV88E6XXX_PORT_SWITCH_ID_PROD_6390 >> 4; + /* Some internal PHYs don't have a model number. */ + if (reg == MII_PHYSID2 && !(val & 0x3f0) && + chip->info->family < ARRAY_SIZE(family_prod_id_table)) { + prod_id = family_prod_id_table[chip->info->family]; + if (prod_id) + val |= prod_id >> 4; } return err ? err : val; diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c index e26a5d663f8a76..8018ddf7f31622 100644 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@ -3021,9 +3021,34 @@ static struct phy_driver marvell_drivers[] = { .get_stats = marvell_get_stats, }, { - .phy_id = MARVELL_PHY_ID_88E6390, + .phy_id = MARVELL_PHY_ID_88E6341_FAMILY, .phy_id_mask = MARVELL_PHY_ID_MASK, - .name = "Marvell 88E6390", + .name = "Marvell 88E6341 Family", + /* PHY_GBIT_FEATURES */ + .flags = PHY_POLL_CABLE_TEST, + .probe = m88e1510_probe, + .config_init = marvell_config_init, + .config_aneg = m88e6390_config_aneg, + .read_status = marvell_read_status, + .config_intr = marvell_config_intr, + .handle_interrupt = marvell_handle_interrupt, + .resume = genphy_resume, + .suspend = genphy_suspend, + .read_page = marvell_read_page, + .write_page = marvell_write_page, + .get_sset_count = marvell_get_sset_count, + .get_strings = marvell_get_strings, + .get_stats = marvell_get_stats, + .get_tunable = m88e1540_get_tunable, + .set_tunable = m88e1540_set_tunable, + .cable_test_start = marvell_vct7_cable_test_start, + .cable_test_tdr_start = marvell_vct5_cable_test_tdr_start, + .cable_test_get_status = marvell_vct7_cable_test_get_status, + }, + { + .phy_id = MARVELL_PHY_ID_88E6390_FAMILY, + .phy_id_mask = MARVELL_PHY_ID_MASK, + .name = "Marvell 88E6390 Family", /* PHY_GBIT_FEATURES */ .flags = PHY_POLL_CABLE_TEST, .probe = m88e6390_probe, @@ -3107,7 +3132,8 @@ static struct mdio_device_id __maybe_unused marvell_tbl[] = { { MARVELL_PHY_ID_88E1540, MARVELL_PHY_ID_MASK }, { MARVELL_PHY_ID_88E1545, MARVELL_PHY_ID_MASK }, { MARVELL_PHY_ID_88E3016, MARVELL_PHY_ID_MASK }, - { MARVELL_PHY_ID_88E6390, MARVELL_PHY_ID_MASK }, + { MARVELL_PHY_ID_88E6341_FAMILY, MARVELL_PHY_ID_MASK }, + { MARVELL_PHY_ID_88E6390_FAMILY, MARVELL_PHY_ID_MASK }, { MARVELL_PHY_ID_88E1340S, MARVELL_PHY_ID_MASK }, { MARVELL_PHY_ID_88E1548P, MARVELL_PHY_ID_MASK }, { } diff --git a/include/linux/marvell_phy.h b/include/linux/marvell_phy.h index 52b1610eae68bf..c544b70dfbd267 100644 --- a/include/linux/marvell_phy.h +++ b/include/linux/marvell_phy.h @@ -28,11 +28,12 @@ /* Marvel 88E1111 in Finisar SFP module with modified PHY ID */ #define MARVELL_PHY_ID_88E1111_FINISAR 0x01ff0cc0 -/* The MV88e6390 Ethernet switch contains embedded PHYs. These PHYs do +/* These Ethernet switch families contain embedded PHYs, but they do * not have a model ID. So the switch driver traps reads to the ID2 * register and returns the switch family ID */ -#define MARVELL_PHY_ID_88E6390 0x01410f90 +#define MARVELL_PHY_ID_88E6341_FAMILY 0x01410f41 +#define MARVELL_PHY_ID_88E6390_FAMILY 0x01410f90 #define MARVELL_PHY_FAMILY_ID(id) ((id) >> 4) -- cgit 1.2.3-korg From f33b0e196ed7aa3dc285b26db7768c1db1eb3a41 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 12 Apr 2021 11:47:07 -0700 Subject: ethtool: fix kdoc attr name Add missing 't' in attrtype. Signed-off-by: Jakub Kicinski Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- net/ethtool/netlink.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 6eabd58d81bfed..cde9f3169ae5dc 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -36,9 +36,9 @@ static inline int ethnl_strz_size(const char *s) /** * ethnl_put_strz() - put string attribute with fixed size string - * @skb: skb with the message - * @attrype: attribute type - * @s: ETH_GSTRING_LEN sized string (may not be null terminated) + * @skb: skb with the message + * @attrtype: attribute type + * @s: ETH_GSTRING_LEN sized string (may not be null terminated) * * Puts an attribute with null terminated string from @s into the message. * -- cgit 1.2.3-korg From b29c457a6511435960115c0f548c4360d5f4801d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 7 Apr 2021 21:38:57 +0200 Subject: netfilter: x_tables: fix compat match/target pad out-of-bound write xt_compat_match/target_from_user doesn't check that zeroing the area to start of next rule won't write past end of allocated ruleset blob. Remove this code and zero the entire blob beforehand. Reported-by: syzbot+cfc0247ac173f597aaaa@syzkaller.appspotmail.com Reported-by: Andy Nguyen Fixes: 9fa492cdc160c ("[NETFILTER]: x_tables: simplify compat API") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/arp_tables.c | 2 ++ net/ipv4/netfilter/ip_tables.c | 2 ++ net/ipv6/netfilter/ip6_tables.c | 2 ++ net/netfilter/x_tables.c | 10 ++-------- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 6c26533480dd1a..d6d45d820d79ad 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1193,6 +1193,8 @@ static int translate_compat_table(struct net *net, if (!newinfo) goto out_unlock; + memset(newinfo->entries, 0, size); + newinfo->number = compatr->num_entries; for (i = 0; i < NF_ARP_NUMHOOKS; i++) { newinfo->hook_entry[i] = compatr->hook_entry[i]; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index f15bc21d730164..f77ea0dbe6562b 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1428,6 +1428,8 @@ translate_compat_table(struct net *net, if (!newinfo) goto out_unlock; + memset(newinfo->entries, 0, size); + newinfo->number = compatr->num_entries; for (i = 0; i < NF_INET_NUMHOOKS; i++) { newinfo->hook_entry[i] = compatr->hook_entry[i]; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 2e2119bfcf1373..eb2b5404806c64 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1443,6 +1443,8 @@ translate_compat_table(struct net *net, if (!newinfo) goto out_unlock; + memset(newinfo->entries, 0, size); + newinfo->number = compatr->num_entries; for (i = 0; i < NF_INET_NUMHOOKS; i++) { newinfo->hook_entry[i] = compatr->hook_entry[i]; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 6bd31a7a27fc58..92e9d4ebc5e8d7 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -733,7 +733,7 @@ void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr, { const struct xt_match *match = m->u.kernel.match; struct compat_xt_entry_match *cm = (struct compat_xt_entry_match *)m; - int pad, off = xt_compat_match_offset(match); + int off = xt_compat_match_offset(match); u_int16_t msize = cm->u.user.match_size; char name[sizeof(m->u.user.name)]; @@ -743,9 +743,6 @@ void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr, match->compat_from_user(m->data, cm->data); else memcpy(m->data, cm->data, msize - sizeof(*cm)); - pad = XT_ALIGN(match->matchsize) - match->matchsize; - if (pad > 0) - memset(m->data + match->matchsize, 0, pad); msize += off; m->u.user.match_size = msize; @@ -1116,7 +1113,7 @@ void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr, { const struct xt_target *target = t->u.kernel.target; struct compat_xt_entry_target *ct = (struct compat_xt_entry_target *)t; - int pad, off = xt_compat_target_offset(target); + int off = xt_compat_target_offset(target); u_int16_t tsize = ct->u.user.target_size; char name[sizeof(t->u.user.name)]; @@ -1126,9 +1123,6 @@ void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr, target->compat_from_user(t->data, ct->data); else memcpy(t->data, ct->data, tsize - sizeof(*ct)); - pad = XT_ALIGN(target->targetsize) - target->targetsize; - if (pad > 0) - memset(t->data + target->targetsize, 0, pad); tsize += off; t->u.user.target_size = tsize; -- cgit 1.2.3-korg From 4d8f9065830e526c83199186c5f56a6514f457d2 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 10 Apr 2021 21:29:38 +0200 Subject: netfilter: nftables: clone set element expression template memcpy() breaks when using connlimit in set elements. Use nft_expr_clone() to initialize the connlimit expression list, otherwise connlimit garbage collector crashes when walking on the list head copy. [ 493.064656] Workqueue: events_power_efficient nft_rhash_gc [nf_tables] [ 493.064685] RIP: 0010:find_or_evict+0x5a/0x90 [nf_conncount] [ 493.064694] Code: 2b 43 40 83 f8 01 77 0d 48 c7 c0 f5 ff ff ff 44 39 63 3c 75 df 83 6d 18 01 48 8b 43 08 48 89 de 48 8b 13 48 8b 3d ee 2f 00 00 <48> 89 42 08 48 89 10 48 b8 00 01 00 00 00 00 ad de 48 89 03 48 83 [ 493.064699] RSP: 0018:ffffc90000417dc0 EFLAGS: 00010297 [ 493.064704] RAX: 0000000000000000 RBX: ffff888134f38410 RCX: 0000000000000000 [ 493.064708] RDX: 0000000000000000 RSI: ffff888134f38410 RDI: ffff888100060cc0 [ 493.064711] RBP: ffff88812ce594a8 R08: ffff888134f38438 R09: 00000000ebb9025c [ 493.064714] R10: ffffffff8219f838 R11: 0000000000000017 R12: 0000000000000001 [ 493.064718] R13: ffffffff82146740 R14: ffff888134f38410 R15: 0000000000000000 [ 493.064721] FS: 0000000000000000(0000) GS:ffff88840e440000(0000) knlGS:0000000000000000 [ 493.064725] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 493.064729] CR2: 0000000000000008 CR3: 00000001330aa002 CR4: 00000000001706e0 [ 493.064733] Call Trace: [ 493.064737] nf_conncount_gc_list+0x8f/0x150 [nf_conncount] [ 493.064746] nft_rhash_gc+0x106/0x390 [nf_tables] Reported-by: Laura Garcia Liebana Fixes: 409444522976 ("netfilter: nf_tables: add elements with stateful expressions") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 46 ++++++++++++++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 12 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index f57f1a6ba96f6c..589d2f6978d380 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5295,16 +5295,35 @@ err_expr: return -ENOMEM; } -static void nft_set_elem_expr_setup(const struct nft_set_ext *ext, int i, - struct nft_expr *expr_array[]) +static int nft_set_elem_expr_setup(struct nft_ctx *ctx, + const struct nft_set_ext *ext, + struct nft_expr *expr_array[], + u32 num_exprs) { struct nft_set_elem_expr *elem_expr = nft_set_ext_expr(ext); - struct nft_expr *expr = nft_setelem_expr_at(elem_expr, elem_expr->size); + struct nft_expr *expr; + int i, err; + + for (i = 0; i < num_exprs; i++) { + expr = nft_setelem_expr_at(elem_expr, elem_expr->size); + err = nft_expr_clone(expr, expr_array[i]); + if (err < 0) + goto err_elem_expr_setup; + + elem_expr->size += expr_array[i]->ops->size; + nft_expr_destroy(ctx, expr_array[i]); + expr_array[i] = NULL; + } + + return 0; + +err_elem_expr_setup: + for (; i < num_exprs; i++) { + nft_expr_destroy(ctx, expr_array[i]); + expr_array[i] = NULL; + } - memcpy(expr, expr_array[i], expr_array[i]->ops->size); - elem_expr->size += expr_array[i]->ops->size; - kfree(expr_array[i]); - expr_array[i] = NULL; + return -ENOMEM; } static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, @@ -5556,12 +5575,15 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, *nft_set_ext_obj(ext) = obj; obj->use++; } - for (i = 0; i < num_exprs; i++) - nft_set_elem_expr_setup(ext, i, expr_array); + err = nft_set_elem_expr_setup(ctx, ext, expr_array, num_exprs); + if (err < 0) + goto err_elem_expr; trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set); - if (trans == NULL) - goto err_trans; + if (trans == NULL) { + err = -ENOMEM; + goto err_elem_expr; + } ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK; err = set->ops->insert(ctx->net, set, &elem, &ext2); @@ -5605,7 +5627,7 @@ err_set_full: set->ops->remove(ctx->net, set, &elem); err_element_clash: kfree(trans); -err_trans: +err_elem_expr: if (obj) obj->use--; -- cgit 1.2.3-korg From 31166efb1cee348eb6314e9c0095d84cbeb66b9d Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 8 Mar 2021 12:41:56 -0800 Subject: ixgbe: Fix NULL pointer dereference in ethtool loopback test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ixgbe driver currently generates a NULL pointer dereference when performing the ethtool loopback test. This is due to the fact that there isn't a q_vector associated with the test ring when it is setup as interrupts are not normally added to the test rings. To address this I have added code that will check for a q_vector before returning a napi_id value. If a q_vector is not present it will return a value of 0. Fixes: b02e5a0ebb17 ("xsk: Propagate napi_id to XDP socket Rx path") Signed-off-by: Alexander Duyck Acked-by: Björn Töpel Tested-by: Dave Switzer Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 03d9aad516d4e3..45d2c8f37c01f9 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -6536,6 +6536,13 @@ err_setup_tx: return err; } +static int ixgbe_rx_napi_id(struct ixgbe_ring *rx_ring) +{ + struct ixgbe_q_vector *q_vector = rx_ring->q_vector; + + return q_vector ? q_vector->napi.napi_id : 0; +} + /** * ixgbe_setup_rx_resources - allocate Rx resources (Descriptors) * @adapter: pointer to ixgbe_adapter @@ -6583,7 +6590,7 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter *adapter, /* XDP RX-queue info */ if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, adapter->netdev, - rx_ring->queue_index, rx_ring->q_vector->napi.napi_id) < 0) + rx_ring->queue_index, ixgbe_rx_napi_id(rx_ring)) < 0) goto err; rx_ring->xdp_prog = adapter->xdp_prog; -- cgit 1.2.3-korg From debb9df311582c83fe369baa35fa4b92e8a9c58a Mon Sep 17 00:00:00 2001 From: Yongxin Liu Date: Mon, 22 Mar 2021 15:14:48 +0800 Subject: ixgbe: fix unbalanced device enable/disable in suspend/resume pci_disable_device() called in __ixgbe_shutdown() decreases dev->enable_cnt by 1. pci_enable_device_mem() which increases dev->enable_cnt by 1, was removed from ixgbe_resume() in commit 6f82b2558735 ("ixgbe: use generic power management"). This caused unbalanced increase/decrease. So add pci_enable_device_mem() back. Fix the following call trace. ixgbe 0000:17:00.1: disabling already-disabled device Call Trace: __ixgbe_shutdown+0x10a/0x1e0 [ixgbe] ixgbe_suspend+0x32/0x70 [ixgbe] pci_pm_suspend+0x87/0x160 ? pci_pm_freeze+0xd0/0xd0 dpm_run_callback+0x42/0x170 __device_suspend+0x114/0x460 async_suspend+0x1f/0xa0 async_run_entry_fn+0x3c/0xf0 process_one_work+0x1dd/0x410 worker_thread+0x34/0x3f0 ? cancel_delayed_work+0x90/0x90 kthread+0x14c/0x170 ? kthread_park+0x90/0x90 ret_from_fork+0x1f/0x30 Fixes: 6f82b2558735 ("ixgbe: use generic power management") Signed-off-by: Yongxin Liu Tested-by: Dave Switzer Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 45d2c8f37c01f9..cffb95f8f63266 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -6899,6 +6899,11 @@ static int __maybe_unused ixgbe_resume(struct device *dev_d) adapter->hw.hw_addr = adapter->io_addr; + err = pci_enable_device_mem(pdev); + if (err) { + e_dev_err("Cannot enable PCI device from suspend\n"); + return err; + } smp_mb__before_atomic(); clear_bit(__IXGBE_DISABLED, &adapter->state); pci_set_master(pdev); -- cgit 1.2.3-korg From ef963ae427aa4669905e0a96b3bd9d44dc85db32 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 31 Mar 2021 15:46:28 +0100 Subject: ice: Fix potential infinite loop when using u8 loop counter A for-loop is using a u8 loop counter that is being compared to a u32 cmp_dcbcfg->numapp to check for the end of the loop. If cmp_dcbcfg->numapp is larger than 255 then the counter j will wrap around to zero and hence an infinite loop occurs. Fix this by making counter j the same type as cmp_dcbcfg->numapp. Addresses-Coverity: ("Infinite loop") Fixes: aeac8ce864d9 ("ice: Recognize 860 as iSCSI port in CEE mode") Signed-off-by: Colin Ian King Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_dcb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_dcb.c b/drivers/net/ethernet/intel/ice/ice_dcb.c index 211ac6f907adb3..28e834a128c07b 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb.c +++ b/drivers/net/ethernet/intel/ice/ice_dcb.c @@ -747,8 +747,8 @@ ice_cee_to_dcb_cfg(struct ice_aqc_get_cee_dcb_cfg_resp *cee_cfg, struct ice_port_info *pi) { u32 status, tlv_status = le32_to_cpu(cee_cfg->tlv_status); - u32 ice_aqc_cee_status_mask, ice_aqc_cee_status_shift; - u8 i, j, err, sync, oper, app_index, ice_app_sel_type; + u32 ice_aqc_cee_status_mask, ice_aqc_cee_status_shift, j; + u8 i, err, sync, oper, app_index, ice_app_sel_type; u16 app_prio = le16_to_cpu(cee_cfg->oper_app_prio); u16 ice_aqc_cee_app_mask, ice_aqc_cee_app_shift; struct ice_dcbx_cfg *cmp_dcbcfg, *dcbcfg; -- cgit 1.2.3-korg From 610f8c0fc8d46e0933955ce13af3d64484a4630a Mon Sep 17 00:00:00 2001 From: Hristo Venev Date: Mon, 12 Apr 2021 20:41:16 +0300 Subject: net: sit: Unregister catch-all devices A sit interface created without a local or a remote address is linked into the `sit_net::tunnels_wc` list of its original namespace. When deleting a network namespace, delete the devices that have been moved. The following script triggers a null pointer dereference if devices linked in a deleted `sit_net` remain: for i in `seq 1 30`; do ip netns add ns-test ip netns exec ns-test ip link add dev veth0 type veth peer veth1 ip netns exec ns-test ip link add dev sit$i type sit dev veth0 ip netns exec ns-test ip link set dev sit$i netns $$ ip netns del ns-test done for i in `seq 1 30`; do ip link del dev sit$i done Fixes: 5e6700b3bf98f ("sit: add support of x-netns") Signed-off-by: Hristo Venev Signed-off-by: David S. Miller --- net/ipv6/sit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 63ccd9f2dcccf9..9fdccf0718b59c 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1867,9 +1867,9 @@ static void __net_exit sit_destroy_tunnels(struct net *net, if (dev->rtnl_link_ops == &sit_link_ops) unregister_netdevice_queue(dev, head); - for (prio = 1; prio < 4; prio++) { + for (prio = 0; prio < 4; prio++) { int h; - for (h = 0; h < IP6_SIT_HASH_SIZE; h++) { + for (h = 0; h < (prio ? IP6_SIT_HASH_SIZE : 1); h++) { struct ip_tunnel *t; t = rtnl_dereference(sitn->tunnels[prio][h]); -- cgit 1.2.3-korg From 941ea91e87a6e879ed82dad4949f6234f2702bec Mon Sep 17 00:00:00 2001 From: Hristo Venev Date: Mon, 12 Apr 2021 20:41:17 +0300 Subject: net: ip6_tunnel: Unregister catch-all devices Similarly to the sit case, we need to remove the tunnels with no addresses that have been moved to another network namespace. Fixes: 0bd8762824e73 ("ip6tnl: add x-netns support") Signed-off-by: Hristo Venev Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 3fa0eca5a06f8a..42fe7db6bbb37e 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -2244,6 +2244,16 @@ static void __net_exit ip6_tnl_destroy_tunnels(struct net *net, struct list_head t = rtnl_dereference(t->next); } } + + t = rtnl_dereference(ip6n->tnls_wc[0]); + while (t) { + /* If dev is in the same netns, it has already + * been added to the list by the previous loop. + */ + if (!net_eq(dev_net(t->dev), net)) + unregister_netdevice_queue(t->dev, list); + t = rtnl_dereference(t->next); + } } static int __net_init ip6_tnl_init_net(struct net *net) -- cgit 1.2.3-korg From 97684f0970f6e112926de631fdd98d9693c7e5c1 Mon Sep 17 00:00:00 2001 From: Jonathon Reinhart Date: Tue, 13 Apr 2021 03:08:48 -0400 Subject: net: Make tcp_allowed_congestion_control readonly in non-init netns Currently, tcp_allowed_congestion_control is global and writable; writing to it in any net namespace will leak into all other net namespaces. tcp_available_congestion_control and tcp_allowed_congestion_control are the only sysctls in ipv4_net_table (the per-netns sysctl table) with a NULL data pointer; their handlers (proc_tcp_available_congestion_control and proc_allowed_congestion_control) have no other way of referencing a struct net. Thus, they operate globally. Because ipv4_net_table does not use designated initializers, there is no easy way to fix up this one "bad" table entry. However, the data pointer updating logic shouldn't be applied to NULL pointers anyway, so we instead force these entries to be read-only. These sysctls used to exist in ipv4_table (init-net only), but they were moved to the per-net ipv4_net_table, presumably without realizing that tcp_allowed_congestion_control was writable and thus introduced a leak. Because the intent of that commit was only to know (i.e. read) "which congestion algorithms are available or allowed", this read-only solution should be sufficient. The logic added in recent commit 31c4d2f160eb: ("net: Ensure net namespace isolation of sysctls") does not and cannot check for NULL data pointers, because other table entries (e.g. /proc/sys/net/netfilter/nf_log/) have .data=NULL but use other methods (.extra2) to access the struct net. Fixes: 9cb8e048e5d9 ("net/ipv4/sysctl: show tcp_{allowed, available}_congestion_control in non-initial netns") Signed-off-by: Jonathon Reinhart Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index f55095d3ed1656..60465f077497ed 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1378,9 +1378,19 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) if (!table) goto err_alloc; - /* Update the variables to point into the current struct net */ - for (i = 0; i < ARRAY_SIZE(ipv4_net_table) - 1; i++) - table[i].data += (void *)net - (void *)&init_net; + for (i = 0; i < ARRAY_SIZE(ipv4_net_table) - 1; i++) { + if (table[i].data) { + /* Update the variables to point into + * the current struct net + */ + table[i].data += (void *)net - (void *)&init_net; + } else { + /* Entries without data pointer are global; + * Make them read-only in non-init_net ns + */ + table[i].mode &= ~0222; + } + } } net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table); -- cgit 1.2.3-korg From ca09bf7bb109a37a7ff05f230bb3fa3627e6625f Mon Sep 17 00:00:00 2001 From: Lijun Pan Date: Tue, 13 Apr 2021 03:33:25 -0500 Subject: ibmvnic: correctly use dev_consume/free_skb_irq It is more correct to use dev_kfree_skb_irq when packets are dropped, and to use dev_consume_skb_irq when packets are consumed. Fixes: 0d973388185d ("ibmvnic: Introduce xmit_more support using batched subCRQ hcalls") Suggested-by: Thomas Falcon Signed-off-by: Lijun Pan Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 9c6438d3b3a5be..110a0d0eaabb51 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -3204,9 +3204,6 @@ restart_loop: next = ibmvnic_next_scrq(adapter, scrq); for (i = 0; i < next->tx_comp.num_comps; i++) { - if (next->tx_comp.rcs[i]) - dev_err(dev, "tx error %x\n", - next->tx_comp.rcs[i]); index = be32_to_cpu(next->tx_comp.correlators[i]); if (index & IBMVNIC_TSO_POOL_MASK) { tx_pool = &adapter->tso_pool[pool]; @@ -3220,7 +3217,13 @@ restart_loop: num_entries += txbuff->num_entries; if (txbuff->skb) { total_bytes += txbuff->skb->len; - dev_consume_skb_irq(txbuff->skb); + if (next->tx_comp.rcs[i]) { + dev_err(dev, "tx error %x\n", + next->tx_comp.rcs[i]); + dev_kfree_skb_irq(txbuff->skb); + } else { + dev_consume_skb_irq(txbuff->skb); + } txbuff->skb = NULL; } else { netdev_warn(adapter->netdev, -- cgit 1.2.3-korg From b166a20b07382b8bc1dcee2a448715c9c2c81b5b Mon Sep 17 00:00:00 2001 From: Or Cohen Date: Tue, 13 Apr 2021 21:10:31 +0300 Subject: net/sctp: fix race condition in sctp_destroy_sock If sctp_destroy_sock is called without sock_net(sk)->sctp.addr_wq_lock held and sp->do_auto_asconf is true, then an element is removed from the auto_asconf_splist without any proper locking. This can happen in the following functions: 1. In sctp_accept, if sctp_sock_migrate fails. 2. In inet_create or inet6_create, if there is a bpf program attached to BPF_CGROUP_INET_SOCK_CREATE which denies creation of the sctp socket. The bug is fixed by acquiring addr_wq_lock in sctp_destroy_sock instead of sctp_close. This addresses CVE-2021-23133. Reported-by: Or Cohen Reviewed-by: Xin Long Fixes: 610236587600 ("bpf: Add new cgroup attach type to enable sock modifications") Signed-off-by: Or Cohen Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/socket.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index a710917c5ac73a..b9b3d899a611cc 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1520,11 +1520,9 @@ static void sctp_close(struct sock *sk, long timeout) /* Supposedly, no process has access to the socket, but * the net layers still may. - * Also, sctp_destroy_sock() needs to be called with addr_wq_lock - * held and that should be grabbed before socket lock. */ - spin_lock_bh(&net->sctp.addr_wq_lock); - bh_lock_sock_nested(sk); + local_bh_disable(); + bh_lock_sock(sk); /* Hold the sock, since sk_common_release() will put sock_put() * and we have just a little more cleanup. @@ -1533,7 +1531,7 @@ static void sctp_close(struct sock *sk, long timeout) sk_common_release(sk); bh_unlock_sock(sk); - spin_unlock_bh(&net->sctp.addr_wq_lock); + local_bh_enable(); sock_put(sk); @@ -4993,9 +4991,6 @@ static int sctp_init_sock(struct sock *sk) sk_sockets_allocated_inc(sk); sock_prot_inuse_add(net, sk->sk_prot, 1); - /* Nothing can fail after this block, otherwise - * sctp_destroy_sock() will be called without addr_wq_lock held - */ if (net->sctp.default_auto_asconf) { spin_lock(&sock_net(sk)->sctp.addr_wq_lock); list_add_tail(&sp->auto_asconf_list, @@ -5030,7 +5025,9 @@ static void sctp_destroy_sock(struct sock *sk) if (sp->do_auto_asconf) { sp->do_auto_asconf = 0; + spin_lock_bh(&sock_net(sk)->sctp.addr_wq_lock); list_del(&sp->auto_asconf_list); + spin_unlock_bh(&sock_net(sk)->sctp.addr_wq_lock); } sctp_endpoint_free(sp->ep); local_bh_disable(); -- cgit 1.2.3-korg From 38ec4944b593fd90c5ef42aaaa53e66ae5769d04 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 Apr 2021 05:41:35 -0700 Subject: gro: ensure frag0 meets IP header alignment After commit 0f6925b3e8da ("virtio_net: Do not pull payload in skb->head") Guenter Roeck reported one failure in his tests using sh architecture. After much debugging, we have been able to spot silent unaligned accesses in inet_gro_receive() The issue at hand is that upper networking stacks assume their header is word-aligned. Low level drivers are supposed to reserve NET_IP_ALIGN bytes before the Ethernet header to make that happen. This patch hardens skb_gro_reset_offset() to not allow frag0 fast-path if the fragment is not properly aligned. Some arches like x86, arm64 and powerpc do not care and define NET_IP_ALIGN as 0, this extra check will be a NOP for them. Note that if frag0 is not used, GRO will call pskb_may_pull() as many times as needed to pull network and transport headers. Fixes: 0f6925b3e8da ("virtio_net: Do not pull payload in skb->head") Fixes: 78a478d0efd9 ("gro: Inline skb_gro_header and cache frag0 virtual address") Signed-off-by: Eric Dumazet Reported-by: Guenter Roeck Cc: Xuan Zhuo Cc: "Michael S. Tsirkin" Cc: Jason Wang Acked-by: Michael S. Tsirkin Tested-by: Guenter Roeck Signed-off-by: David S. Miller --- net/core/dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index af8c1ea040b936..1f79b9aa9a3f23 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5924,7 +5924,8 @@ static void skb_gro_reset_offset(struct sk_buff *skb) NAPI_GRO_CB(skb)->frag0_len = 0; if (!skb_headlen(skb) && pinfo->nr_frags && - !PageHighMem(skb_frag_page(frag0))) { + !PageHighMem(skb_frag_page(frag0)) && + (!NET_IP_ALIGN || !(skb_frag_off(frag0) & 3))) { NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int, skb_frag_size(frag0), -- cgit 1.2.3-korg From 2afeec08ab5c86ae21952151f726bfe184f6b23d Mon Sep 17 00:00:00 2001 From: Michael Brown Date: Tue, 13 Apr 2021 16:25:12 +0100 Subject: xen-netback: Check for hotplug-status existence before watching The logic in connect() is currently written with the assumption that xenbus_watch_pathfmt() will return an error for a node that does not exist. This assumption is incorrect: xenstore does allow a watch to be registered for a nonexistent node (and will send notifications should the node be subsequently created). As of commit 1f2565780 ("xen-netback: remove 'hotplug-status' once it has served its purpose"), this leads to a failure when a domU transitions into XenbusStateConnected more than once. On the first domU transition into Connected state, the "hotplug-status" node will be deleted by the hotplug_status_changed() callback in dom0. On the second or subsequent domU transition into Connected state, the hotplug_status_changed() callback will therefore never be invoked, and so the backend will remain stuck in InitWait. This failure prevents scenarios such as reloading the xen-netfront module within a domU, or booting a domU via iPXE. There is unfortunately no way for the domU to work around this dom0 bug. Fix by explicitly checking for existence of the "hotplug-status" node, thereby creating the behaviour that was previously assumed to exist. Signed-off-by: Michael Brown Reviewed-by: Paul Durrant Signed-off-by: David S. Miller --- drivers/net/xen-netback/xenbus.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-netback/xenbus.c index a5439c130130f8..d24b7a7993aa05 100644 --- a/drivers/net/xen-netback/xenbus.c +++ b/drivers/net/xen-netback/xenbus.c @@ -824,11 +824,15 @@ static void connect(struct backend_info *be) xenvif_carrier_on(be->vif); unregister_hotplug_status_watch(be); - err = xenbus_watch_pathfmt(dev, &be->hotplug_status_watch, NULL, - hotplug_status_changed, - "%s/%s", dev->nodename, "hotplug-status"); - if (!err) + if (xenbus_exists(XBT_NIL, dev->nodename, "hotplug-status")) { + err = xenbus_watch_pathfmt(dev, &be->hotplug_status_watch, + NULL, hotplug_status_changed, + "%s/%s", dev->nodename, + "hotplug-status"); + if (err) + goto err; be->have_hotplug_status_watch = 1; + } netif_tx_wake_all_queues(be->vif->dev); -- cgit 1.2.3-korg From 16756d3e77ad58cd07e36cbed724aa13ae5a0278 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 13 Apr 2021 20:46:14 -0700 Subject: ethtool: pause: make sure we init driver stats The intention was for pause statistics to not be reported when driver does not have the relevant callback (only report an empty netlink nest). What happens currently we report all 0s instead. Make sure statistics are initialized to "not set" (which is -1) so the dumping code skips them. Fixes: 9a27a33027f2 ("ethtool: add standard pause stats") Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/ethtool/pause.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ethtool/pause.c b/net/ethtool/pause.c index 09998dc5c185fc..d4ac02718b72ad 100644 --- a/net/ethtool/pause.c +++ b/net/ethtool/pause.c @@ -38,16 +38,16 @@ static int pause_prepare_data(const struct ethnl_req_info *req_base, if (!dev->ethtool_ops->get_pauseparam) return -EOPNOTSUPP; + ethtool_stats_init((u64 *)&data->pausestat, + sizeof(data->pausestat) / 8); + ret = ethnl_ops_begin(dev); if (ret < 0) return ret; dev->ethtool_ops->get_pauseparam(dev, &data->pauseparam); if (req_base->flags & ETHTOOL_FLAG_STATS && - dev->ethtool_ops->get_pause_stats) { - ethtool_stats_init((u64 *)&data->pausestat, - sizeof(data->pausestat) / 8); + dev->ethtool_ops->get_pause_stats) dev->ethtool_ops->get_pause_stats(dev, &data->pausestat); - } ethnl_ops_complete(dev); return 0; -- cgit 1.2.3-korg From 453a77894efa4d9b6ef9644d74b9419c47ac427c Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 14 Apr 2021 10:47:10 +0200 Subject: r8169: don't advertise pause in jumbo mode It has been reported [0] that using pause frames in jumbo mode impacts performance. There's no available chip documentation, but vendor drivers r8168 and r8125 don't advertise pause in jumbo mode. So let's do the same, according to Roman it fixes the issue. [0] https://bugzilla.kernel.org/show_bug.cgi?id=212617 Fixes: 9cf9b84cc701 ("r8169: make use of phy_set_asym_pause") Reported-by: Roman Mamedov Tested-by: Roman Mamedov Signed-off-by: Heiner Kallweit Cc: stable@vger.kernel.org Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 581a92fc32924f..1df2c002c9f642 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -2350,6 +2350,13 @@ static void rtl_jumbo_config(struct rtl8169_private *tp) if (pci_is_pcie(tp->pci_dev) && tp->supports_gmii) pcie_set_readrq(tp->pci_dev, readrq); + + /* Chip doesn't support pause in jumbo mode */ + linkmode_mod_bit(ETHTOOL_LINK_MODE_Pause_BIT, + tp->phydev->advertising, !jumbo); + linkmode_mod_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, + tp->phydev->advertising, !jumbo); + phy_start_aneg(tp->phydev); } DECLARE_RTL_COND(rtl_chipcmd_cond) @@ -4630,8 +4637,6 @@ static int r8169_phy_connect(struct rtl8169_private *tp) if (!tp->supports_gmii) phy_set_max_speed(phydev, SPEED_100); - phy_support_asym_pause(phydev); - phy_attached_info(phydev); return 0; -- cgit 1.2.3-korg From 0775ebc4cf8554bdcd2c212669a0868ab68df5c0 Mon Sep 17 00:00:00 2001 From: Lijun Pan Date: Wed, 14 Apr 2021 02:46:14 -0500 Subject: ibmvnic: avoid calling napi_disable() twice __ibmvnic_open calls napi_disable without checking whether NAPI polling has already been disabled or not. This could cause napi_disable being called twice, which could generate deadlock. For example, the first napi_disable will spin until NAPI_STATE_SCHED is cleared by napi_complete_done, then set it again. When napi_disable is called the second time, it will loop infinitely because no dev->poll will be running to clear NAPI_STATE_SCHED. To prevent above scenario from happening, call ibmvnic_napi_disable() which checks if napi is disabled or not before calling napi_disable. Fixes: bfc32f297337 ("ibmvnic: Move resource initialization to its own routine") Suggested-by: Thomas Falcon Signed-off-by: Lijun Pan Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 110a0d0eaabb51..2d27f8aa0d4b4a 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -1149,8 +1149,7 @@ static int __ibmvnic_open(struct net_device *netdev) rc = set_link_state(adapter, IBMVNIC_LOGICAL_LNK_UP); if (rc) { - for (i = 0; i < adapter->req_rx_queues; i++) - napi_disable(&adapter->napi[i]); + ibmvnic_napi_disable(adapter); release_resources(adapter); return rc; } -- cgit 1.2.3-korg From d3a6abccbd272aea7dc2c6f984bb5a2c11278e44 Mon Sep 17 00:00:00 2001 From: Lijun Pan Date: Wed, 14 Apr 2021 02:46:15 -0500 Subject: ibmvnic: remove duplicate napi_schedule call in do_reset function During adapter reset, do_reset/do_hard_reset calls ibmvnic_open(), which will calls napi_schedule if previous state is VNIC_CLOSED (i.e, the reset case, and "ifconfig down" case). So there is no need for do_reset to call napi_schedule again at the end of the function though napi_schedule will neglect the request if napi is already scheduled. Fixes: ed651a10875f ("ibmvnic: Updated reset handling") Signed-off-by: Lijun Pan Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 2d27f8aa0d4b4a..f4bd63216672ef 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -1921,7 +1921,7 @@ static int do_reset(struct ibmvnic_adapter *adapter, u64 old_num_rx_queues, old_num_tx_queues; u64 old_num_rx_slots, old_num_tx_slots; struct net_device *netdev = adapter->netdev; - int i, rc; + int rc; netdev_dbg(adapter->netdev, "[S:%d FOP:%d] Reset reason %d, reset_state %d\n", @@ -2110,10 +2110,6 @@ static int do_reset(struct ibmvnic_adapter *adapter, /* refresh device's multicast list */ ibmvnic_set_multi(netdev); - /* kick napi */ - for (i = 0; i < adapter->req_rx_queues; i++) - napi_schedule(&adapter->napi[i]); - if (adapter->reset_reason == VNIC_RESET_FAILOVER || adapter->reset_reason == VNIC_RESET_MOBILITY) __netdev_notify_peers(netdev); -- cgit 1.2.3-korg From 7c451f3ef676c805a4b77a743a01a5c21a250a73 Mon Sep 17 00:00:00 2001 From: Lijun Pan Date: Wed, 14 Apr 2021 02:46:16 -0500 Subject: ibmvnic: remove duplicate napi_schedule call in open function Remove the unnecessary napi_schedule() call in __ibmvnic_open() since interrupt_rx() calls napi_schedule_prep/__napi_schedule during every receive interrupt. Fixes: ed651a10875f ("ibmvnic: Updated reset handling") Signed-off-by: Lijun Pan Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index f4bd63216672ef..ffb2a91750c7e0 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -1156,11 +1156,6 @@ static int __ibmvnic_open(struct net_device *netdev) netif_tx_start_all_queues(netdev); - if (prev_state == VNIC_CLOSED) { - for (i = 0; i < adapter->req_rx_queues; i++) - napi_schedule(&adapter->napi[i]); - } - adapter->state = VNIC_OPEN; return rc; } -- cgit 1.2.3-korg From 292ecd9f5a94dd29d09fe03b5b669cb20b44f19e Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Wed, 14 Apr 2021 12:00:27 +0200 Subject: doc: move seg6_flowlabel to seg6-sysctl.rst Let's have all seg6 sysctl at the same place. Fixes: a6dc6670cd7e ("ipv6: sr: Add documentation for seg_flowlabel sysctl") Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.rst | 15 --------------- Documentation/networking/seg6-sysctl.rst | 13 +++++++++++++ 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index c7952ac5bd2f1f..3feb5e565b1a1d 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -1849,21 +1849,6 @@ ip6frag_low_thresh - INTEGER ip6frag_time - INTEGER Time in seconds to keep an IPv6 fragment in memory. -IPv6 Segment Routing: - -seg6_flowlabel - INTEGER - Controls the behaviour of computing the flowlabel of outer - IPv6 header in case of SR T.encaps - - == ======================================================= - -1 set flowlabel to zero. - 0 copy flowlabel from Inner packet in case of Inner IPv6 - (Set flowlabel to 0 in case IPv4/L2) - 1 Compute the flowlabel using seg6_make_flowlabel() - == ======================================================= - - Default is 0. - ``conf/default/*``: Change the interface-specific default settings. diff --git a/Documentation/networking/seg6-sysctl.rst b/Documentation/networking/seg6-sysctl.rst index ec73e144503075..07c20e470bafe6 100644 --- a/Documentation/networking/seg6-sysctl.rst +++ b/Documentation/networking/seg6-sysctl.rst @@ -24,3 +24,16 @@ seg6_require_hmac - INTEGER * 1 - Drop SR packets without HMAC, validate SR packets with HMAC Default is 0. + +seg6_flowlabel - INTEGER + Controls the behaviour of computing the flowlabel of outer + IPv6 header in case of SR T.encaps + + == ======================================================= + -1 set flowlabel to zero. + 0 copy flowlabel from Inner packet in case of Inner IPv6 + (Set flowlabel to 0 in case IPv4/L2) + 1 Compute the flowlabel using seg6_make_flowlabel() + == ======================================================= + + Default is 0. -- cgit 1.2.3-korg From 2e1534f395e73152e2051332034bff61a56a8368 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Wed, 14 Apr 2021 12:03:25 +0200 Subject: vrf: fix a comment about loopback device This is a leftover of the below commit. Fixes: 4f04256c983a ("net: vrf: Drop local rtable and rt6_info") Signed-off-by: Nicolas Dichtel Acked-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/vrf.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 6d9130859c55aa..503e2fd7ce5187 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -471,9 +471,8 @@ static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, skb_dst_drop(skb); - /* if dst.dev is loopback or the VRF device again this is locally - * originated traffic destined to a local address. Short circuit - * to Rx path + /* if dst.dev is the VRF device again this is locally originated traffic + * destined to a local address. Short circuit to Rx path. */ if (dst->dev == dev) return vrf_local_xmit(skb, dev, dst); @@ -547,9 +546,8 @@ static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb, skb_dst_drop(skb); - /* if dst.dev is loopback or the VRF device again this is locally - * originated traffic destined to a local address. Short circuit - * to Rx path + /* if dst.dev is the VRF device again this is locally originated traffic + * destined to a local address. Short circuit to Rx path. */ if (rt->dst.dev == vrf_dev) return vrf_local_xmit(skb, vrf_dev, &rt->dst); -- cgit 1.2.3-korg From a714e27ea8bdee2b238748029d31472d0a65b611 Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Wed, 14 Apr 2021 14:20:29 +0300 Subject: net: macb: fix the restore of cmp registers Commit a14d273ba159 ("net: macb: restore cmp registers on resume path") introduces the restore of CMP registers on resume path. In case the IP doesn't support type 2 screeners (zero on DCFG8 register) the struct macb::rx_fs_list::list is not initialized and thus the list_for_each_entry(item, &bp->rx_fs_list.list, list) loop introduced in commit a14d273ba159 ("net: macb: restore cmp registers on resume path") will access an uninitialized list leading to crash. Thus, initialize the struct macb::rx_fs_list::list without taking into account if the IP supports type 2 screeners or not. Fixes: a14d273ba159 ("net: macb: restore cmp registers on resume path") Signed-off-by: Claudiu Beznea Signed-off-by: David S. Miller --- drivers/net/ethernet/cadence/macb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 6e5cf490c01de2..0f6a6cb7e98d77 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -3918,6 +3918,7 @@ static int macb_init(struct platform_device *pdev) reg = gem_readl(bp, DCFG8); bp->max_tuples = min((GEM_BFEXT(SCR2CMP, reg) / 3), GEM_BFEXT(T2SCR, reg)); + INIT_LIST_HEAD(&bp->rx_fs_list.list); if (bp->max_tuples > 0) { /* also needs one ethtype match to check IPv4 */ if (GEM_BFEXT(SCR2ETH, reg) > 0) { @@ -3928,7 +3929,6 @@ static int macb_init(struct platform_device *pdev) /* Filtering is supported in hw but don't enable it in kernel now */ dev->hw_features |= NETIF_F_NTUPLE; /* init Rx flow definitions */ - INIT_LIST_HEAD(&bp->rx_fs_list.list); bp->rx_fs_list.count = 0; spin_lock_init(&bp->rx_fs_lock); } else -- cgit 1.2.3-korg From 416dcc5ce9d2a810477171c62ffa061a98f87367 Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Wed, 14 Apr 2021 19:31:48 +0800 Subject: cavium/liquidio: Fix duplicate argument Fix the following coccicheck warning: ./drivers/net/ethernet/cavium/liquidio/cn66xx_regs.h:413:6-28: duplicated argument to & or | The CN6XXX_INTR_M1UPB0_ERR here is duplicate. Here should be CN6XXX_INTR_M1UNB0_ERR. Signed-off-by: Wan Jiabing Signed-off-by: David S. Miller --- drivers/net/ethernet/cavium/liquidio/cn66xx_regs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cavium/liquidio/cn66xx_regs.h b/drivers/net/ethernet/cavium/liquidio/cn66xx_regs.h index b248966837b4c2..7aad40b2aa7360 100644 --- a/drivers/net/ethernet/cavium/liquidio/cn66xx_regs.h +++ b/drivers/net/ethernet/cavium/liquidio/cn66xx_regs.h @@ -412,7 +412,7 @@ | CN6XXX_INTR_M0UNWI_ERR \ | CN6XXX_INTR_M1UPB0_ERR \ | CN6XXX_INTR_M1UPWI_ERR \ - | CN6XXX_INTR_M1UPB0_ERR \ + | CN6XXX_INTR_M1UNB0_ERR \ | CN6XXX_INTR_M1UNWI_ERR \ | CN6XXX_INTR_INSTR_DB_OF_ERR \ | CN6XXX_INTR_SLIST_DB_OF_ERR \ -- cgit 1.2.3-korg From 00423969d806d7169d16fa6314c570a472ca26c9 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 14 Apr 2021 17:10:07 +0200 Subject: Revert "net: stmmac: re-init rx buffers when mac resume back" This reverts commit 9c63faaa931e443e7abbbee9de0169f1d4710546, which introduces a suspend/resume regression on Jetson TX2 boards that can be reproduced every time. Given that the issue that this was supposed to fix only occurs very sporadically the safest course of action is to revert before v5.12 and then we can have another go at fixing the more rare issue in the next release (and perhaps backport it if necessary). The root cause of the observed problem seems to be that when the system is suspended, some packets are still in transit. When the descriptors for these buffers are cleared on resume, the descriptors become invalid and cause a fatal bus error. Link: https://lore.kernel.org/r/708edb92-a5df-ecc4-3126-5ab36707e275@nvidia.com/ Reported-by: Jonathan Hunter Signed-off-by: Thierry Reding Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 84 +---------------------- 1 file changed, 1 insertion(+), 83 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 208cae344ffa26..4749bd0af16073 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1379,88 +1379,6 @@ static void stmmac_free_tx_buffer(struct stmmac_priv *priv, u32 queue, int i) } } -/** - * stmmac_reinit_rx_buffers - reinit the RX descriptor buffer. - * @priv: driver private structure - * Description: this function is called to re-allocate a receive buffer, perform - * the DMA mapping and init the descriptor. - */ -static void stmmac_reinit_rx_buffers(struct stmmac_priv *priv) -{ - u32 rx_count = priv->plat->rx_queues_to_use; - u32 queue; - int i; - - for (queue = 0; queue < rx_count; queue++) { - struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue]; - - for (i = 0; i < priv->dma_rx_size; i++) { - struct stmmac_rx_buffer *buf = &rx_q->buf_pool[i]; - - if (buf->page) { - page_pool_recycle_direct(rx_q->page_pool, buf->page); - buf->page = NULL; - } - - if (priv->sph && buf->sec_page) { - page_pool_recycle_direct(rx_q->page_pool, buf->sec_page); - buf->sec_page = NULL; - } - } - } - - for (queue = 0; queue < rx_count; queue++) { - struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue]; - - for (i = 0; i < priv->dma_rx_size; i++) { - struct stmmac_rx_buffer *buf = &rx_q->buf_pool[i]; - struct dma_desc *p; - - if (priv->extend_desc) - p = &((rx_q->dma_erx + i)->basic); - else - p = rx_q->dma_rx + i; - - if (!buf->page) { - buf->page = page_pool_dev_alloc_pages(rx_q->page_pool); - if (!buf->page) - goto err_reinit_rx_buffers; - - buf->addr = page_pool_get_dma_addr(buf->page); - } - - if (priv->sph && !buf->sec_page) { - buf->sec_page = page_pool_dev_alloc_pages(rx_q->page_pool); - if (!buf->sec_page) - goto err_reinit_rx_buffers; - - buf->sec_addr = page_pool_get_dma_addr(buf->sec_page); - } - - stmmac_set_desc_addr(priv, p, buf->addr); - if (priv->sph) - stmmac_set_desc_sec_addr(priv, p, buf->sec_addr, true); - else - stmmac_set_desc_sec_addr(priv, p, buf->sec_addr, false); - if (priv->dma_buf_sz == BUF_SIZE_16KiB) - stmmac_init_desc3(priv, p); - } - } - - return; - -err_reinit_rx_buffers: - do { - while (--i >= 0) - stmmac_free_rx_buffer(priv, queue, i); - - if (queue == 0) - break; - - i = priv->dma_rx_size; - } while (queue-- > 0); -} - /** * init_dma_rx_desc_rings - init the RX descriptor rings * @dev: net device structure @@ -5428,7 +5346,7 @@ int stmmac_resume(struct device *dev) mutex_lock(&priv->lock); stmmac_reset_queues_param(priv); - stmmac_reinit_rx_buffers(priv); + stmmac_free_tx_skbufs(priv); stmmac_clear_descriptors(priv); -- cgit 1.2.3-korg From 41bafb31dcd58d834bdffa5db703f94fd2cec727 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Mon, 12 Apr 2021 17:50:08 +0300 Subject: net/mlx5: Fix setting of devlink traps in switchdev mode Prevent setting of devlink traps on the uplink while in switchdev mode. In this mode, it is the SW switch responsibility to handle both packets with a mismatch in destination MAC or VLAN ID. Therefore, there are no flow steering tables to trap undesirable packets and driver crashes upon setting a trap. Fixes: 241dc159391f ("net/mlx5: Notify on trap action by blocking event") Signed-off-by: Aya Levin Reviewed-by: Moshe Shemesh Reviewed-by: Roi Dayan Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/devlink.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c index d7d8a68ef23d7e..d0f9d3cee97d59 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c @@ -246,6 +246,11 @@ static int mlx5_devlink_trap_action_set(struct devlink *devlink, struct mlx5_devlink_trap *dl_trap; int err = 0; + if (is_mdev_switchdev_mode(dev)) { + NL_SET_ERR_MSG_MOD(extack, "Devlink traps can't be set in switchdev mode"); + return -EOPNOTSUPP; + } + dl_trap = mlx5_find_trap_by_id(dev, trap->id); if (!dl_trap) { mlx5_core_err(dev, "Devlink trap: Set action on invalid trap id 0x%x", trap->id); -- cgit 1.2.3-korg From 7a320c9db3e73fb6c4f9a331087df9df18767221 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Sun, 11 Apr 2021 09:33:12 +0300 Subject: net/mlx5e: Fix setting of RS FEC mode Change register setting from bit number to bit mask. Fixes: b5ede32d3329 ("net/mlx5e: Add support for FEC modes based on 50G per lane links") Signed-off-by: Aya Levin Reviewed-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/port.c | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port.c b/drivers/net/ethernet/mellanox/mlx5/core/en/port.c index 308fd279669ece..89510cac46c22c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port.c @@ -387,21 +387,6 @@ enum mlx5e_fec_supported_link_mode { *_policy = MLX5_GET(pplm_reg, _buf, fec_override_admin_##link); \ } while (0) -#define MLX5E_FEC_OVERRIDE_ADMIN_50G_POLICY(buf, policy, write, link) \ - do { \ - unsigned long policy_long; \ - u16 *__policy = &(policy); \ - bool _write = (write); \ - \ - policy_long = *__policy; \ - if (_write && *__policy) \ - *__policy = find_first_bit(&policy_long, \ - sizeof(policy_long) * BITS_PER_BYTE);\ - MLX5E_FEC_OVERRIDE_ADMIN_POLICY(buf, *__policy, _write, link); \ - if (!_write && *__policy) \ - *__policy = 1 << *__policy; \ - } while (0) - /* get/set FEC admin field for a given speed */ static int mlx5e_fec_admin_field(u32 *pplm, u16 *fec_policy, bool write, enum mlx5e_fec_supported_link_mode link_mode) @@ -423,16 +408,16 @@ static int mlx5e_fec_admin_field(u32 *pplm, u16 *fec_policy, bool write, MLX5E_FEC_OVERRIDE_ADMIN_POLICY(pplm, *fec_policy, write, 100g); break; case MLX5E_FEC_SUPPORTED_LINK_MODE_50G_1X: - MLX5E_FEC_OVERRIDE_ADMIN_50G_POLICY(pplm, *fec_policy, write, 50g_1x); + MLX5E_FEC_OVERRIDE_ADMIN_POLICY(pplm, *fec_policy, write, 50g_1x); break; case MLX5E_FEC_SUPPORTED_LINK_MODE_100G_2X: - MLX5E_FEC_OVERRIDE_ADMIN_50G_POLICY(pplm, *fec_policy, write, 100g_2x); + MLX5E_FEC_OVERRIDE_ADMIN_POLICY(pplm, *fec_policy, write, 100g_2x); break; case MLX5E_FEC_SUPPORTED_LINK_MODE_200G_4X: - MLX5E_FEC_OVERRIDE_ADMIN_50G_POLICY(pplm, *fec_policy, write, 200g_4x); + MLX5E_FEC_OVERRIDE_ADMIN_POLICY(pplm, *fec_policy, write, 200g_4x); break; case MLX5E_FEC_SUPPORTED_LINK_MODE_400G_8X: - MLX5E_FEC_OVERRIDE_ADMIN_50G_POLICY(pplm, *fec_policy, write, 400g_8x); + MLX5E_FEC_OVERRIDE_ADMIN_POLICY(pplm, *fec_policy, write, 400g_8x); break; default: return -EINVAL; -- cgit 1.2.3-korg From e3e0f9b279705154b951d579dc3d8b7041710e24 Mon Sep 17 00:00:00 2001 From: wenxu Date: Fri, 9 Apr 2021 13:33:48 +0800 Subject: net/mlx5e: fix ingress_ifindex check in mlx5e_flower_parse_meta In the nft_offload there is the mate flow_dissector with no ingress_ifindex but with ingress_iftype that only be used in the software. So if the mask of ingress_ifindex in meta is 0, this meta check should be bypass. Fixes: 6d65bc64e232 ("net/mlx5e: Add mlx5e_flower_parse_meta support") Signed-off-by: wenxu Acked-by: Pablo Neira Ayuso Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index df2a0af854bbf6..d675107d9ecabe 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -1895,6 +1895,9 @@ static int mlx5e_flower_parse_meta(struct net_device *filter_dev, return 0; flow_rule_match_meta(rule, &match); + if (!match.mask->ingress_ifindex) + return 0; + if (match.mask->ingress_ifindex != 0xFFFFFFFF) { NL_SET_ERR_MSG_MOD(extack, "Unsupported ingress ifindex mask"); return -EOPNOTSUPP; -- cgit 1.2.3-korg From 4e39a072a6a0fc422ba7da5e4336bdc295d70211 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Wed, 14 Apr 2021 10:34:28 +0800 Subject: i40e: fix the panic when running bpf in xdpdrv mode Fix this panic by adding more rules to calculate the value of @rss_size_max which could be used in allocating the queues when bpf is loaded, which, however, could cause the failure and then trigger the NULL pointer of vsi->rx_rings. Prio to this fix, the machine doesn't care about how many cpus are online and then allocates 256 queues on the machine with 32 cpus online actually. Once the load of bpf begins, the log will go like this "failed to get tracking for 256 queues for VSI 0 err -12" and this "setup of MAIN VSI failed". Thus, I attach the key information of the crash-log here. BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 RIP: 0010:i40e_xdp+0xdd/0x1b0 [i40e] Call Trace: [2160294.717292] ? i40e_reconfig_rss_queues+0x170/0x170 [i40e] [2160294.717666] dev_xdp_install+0x4f/0x70 [2160294.718036] dev_change_xdp_fd+0x11f/0x230 [2160294.718380] ? dev_disable_lro+0xe0/0xe0 [2160294.718705] do_setlink+0xac7/0xe70 [2160294.719035] ? __nla_parse+0xed/0x120 [2160294.719365] rtnl_newlink+0x73b/0x860 Fixes: 41c445ff0f48 ("i40e: main driver core") Co-developed-by: Shujin Li Signed-off-by: Shujin Li Signed-off-by: Jason Xing Reviewed-by: Jesse Brandeburg Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/i40e/i40e_main.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 30ad7c08d0fb61..527023ee4c076c 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -12357,6 +12357,7 @@ static int i40e_sw_init(struct i40e_pf *pf) { int err = 0; int size; + u16 pow; /* Set default capability flags */ pf->flags = I40E_FLAG_RX_CSUM_ENABLED | @@ -12375,6 +12376,11 @@ static int i40e_sw_init(struct i40e_pf *pf) pf->rss_table_size = pf->hw.func_caps.rss_table_size; pf->rss_size_max = min_t(int, pf->rss_size_max, pf->hw.func_caps.num_tx_qp); + + /* find the next higher power-of-2 of num cpus */ + pow = roundup_pow_of_two(num_online_cpus()); + pf->rss_size_max = min_t(int, pf->rss_size_max, pow); + if (pf->hw.func_caps.rss) { pf->flags |= I40E_FLAG_RSS_ENABLED; pf->alloc_rss_size = min_t(int, pf->rss_size_max, -- cgit 1.2.3-korg From 1a73e427b824133940c2dd95ebe26b6dce1cbf10 Mon Sep 17 00:00:00 2001 From: Vinay Kumar Yadav Date: Thu, 15 Apr 2021 13:17:45 +0530 Subject: ch_ktls: Fix kernel panic Taking page refcount is not ideal and causes kernel panic sometimes. It's better to take tx_ctx lock for the complete skb transmit, to avoid page cleanup if ACK received in middle. Fixes: 5a4b9fe7fece ("cxgb4/chcr: complete record tx handling") Signed-off-by: Vinay Kumar Yadav Signed-off-by: Rohit Maheshwari Signed-off-by: David S. Miller --- .../chelsio/inline_crypto/ch_ktls/chcr_ktls.c | 24 +++++----------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c index 1115b8f9ea4e39..e39fa09403678e 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c @@ -2010,12 +2010,11 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) * we will send the complete record again. */ + spin_lock_irqsave(&tx_ctx->base.lock, flags); + do { - int i; cxgb4_reclaim_completed_tx(adap, &q->q, true); - /* lock taken */ - spin_lock_irqsave(&tx_ctx->base.lock, flags); /* fetch the tls record */ record = tls_get_record(&tx_ctx->base, tcp_seq, &tx_info->record_no); @@ -2074,11 +2073,11 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) tls_end_offset, skb_offset, 0); - spin_unlock_irqrestore(&tx_ctx->base.lock, flags); if (ret) { /* free the refcount taken earlier */ if (tls_end_offset < data_len) dev_kfree_skb_any(skb); + spin_unlock_irqrestore(&tx_ctx->base.lock, flags); goto out; } @@ -2088,16 +2087,6 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) continue; } - /* increase page reference count of the record, so that there - * won't be any chance of page free in middle if in case stack - * receives ACK and try to delete the record. - */ - for (i = 0; i < record->num_frags; i++) - __skb_frag_ref(&record->frags[i]); - /* lock cleared */ - spin_unlock_irqrestore(&tx_ctx->base.lock, flags); - - /* if a tls record is finishing in this SKB */ if (tls_end_offset <= data_len) { ret = chcr_end_part_handler(tx_info, skb, record, @@ -2122,13 +2111,9 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) data_len = 0; } - /* clear the frag ref count which increased locally before */ - for (i = 0; i < record->num_frags; i++) { - /* clear the frag ref count */ - __skb_frag_unref(&record->frags[i]); - } /* if any failure, come out from the loop. */ if (ret) { + spin_unlock_irqrestore(&tx_ctx->base.lock, flags); if (th->fin) dev_kfree_skb_any(skb); @@ -2143,6 +2128,7 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) } while (data_len > 0); + spin_unlock_irqrestore(&tx_ctx->base.lock, flags); atomic64_inc(&port_stats->ktls_tx_encrypted_packets); atomic64_add(skb_data_len, &port_stats->ktls_tx_encrypted_bytes); -- cgit 1.2.3-korg From bc16efd2430652f894ae34b1de5eccc3bf0d2810 Mon Sep 17 00:00:00 2001 From: Vinay Kumar Yadav Date: Thu, 15 Apr 2021 13:17:46 +0530 Subject: ch_ktls: fix device connection close When sge queue is full and chcr_ktls_xmit_wr_complete() returns failure, skb is not freed if it is not the last tls record in this skb, causes refcount never gets freed and tls_dev_del() never gets called on this connection. Fixes: 5a4b9fe7fece ("cxgb4/chcr: complete record tx handling") Signed-off-by: Vinay Kumar Yadav Signed-off-by: Rohit Maheshwari Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c index e39fa09403678e..a626560f836513 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c @@ -1735,7 +1735,9 @@ static int chcr_end_part_handler(struct chcr_ktls_info *tx_info, struct sge_eth_txq *q, u32 skb_offset, u32 tls_end_offset, bool last_wr) { + bool free_skb_if_tx_fails = false; struct sk_buff *nskb = NULL; + /* check if it is a complete record */ if (tls_end_offset == record->len) { nskb = skb; @@ -1758,6 +1760,8 @@ static int chcr_end_part_handler(struct chcr_ktls_info *tx_info, if (last_wr) dev_kfree_skb_any(skb); + else + free_skb_if_tx_fails = true; last_wr = true; @@ -1769,6 +1773,8 @@ static int chcr_end_part_handler(struct chcr_ktls_info *tx_info, record->num_frags, (last_wr && tcp_push_no_fin), mss)) { + if (free_skb_if_tx_fails) + dev_kfree_skb_any(skb); goto out; } tx_info->prev_seq = record->end_seq; -- cgit 1.2.3-korg From 21d8c25e3f4b9052a471ced8f47b531956eb9963 Mon Sep 17 00:00:00 2001 From: Vinay Kumar Yadav Date: Thu, 15 Apr 2021 13:17:47 +0530 Subject: ch_ktls: tcb close causes tls connection failure HW doesn't need marking TCB closed. This TCB state change sometimes causes problem to the new connection which gets the same tid. Fixes: 34aba2c45024 ("cxgb4/chcr : Register to tls add and del callback") Signed-off-by: Vinay Kumar Yadav Signed-off-by: Rohit Maheshwari Signed-off-by: David S. Miller --- .../chelsio/inline_crypto/ch_ktls/chcr_ktls.c | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c index a626560f836513..8559eec161f0fd 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c @@ -349,18 +349,6 @@ static int chcr_set_tcb_field(struct chcr_ktls_info *tx_info, u16 word, return cxgb4_ofld_send(tx_info->netdev, skb); } -/* - * chcr_ktls_mark_tcb_close: mark tcb state to CLOSE - * @tx_info - driver specific tls info. - * return: NET_TX_OK/NET_XMIT_DROP. - */ -static int chcr_ktls_mark_tcb_close(struct chcr_ktls_info *tx_info) -{ - return chcr_set_tcb_field(tx_info, TCB_T_STATE_W, - TCB_T_STATE_V(TCB_T_STATE_M), - CHCR_TCB_STATE_CLOSED, 1); -} - /* * chcr_ktls_dev_del: call back for tls_dev_del. * Remove the tid and l2t entry and close the connection. @@ -395,8 +383,6 @@ static void chcr_ktls_dev_del(struct net_device *netdev, /* clear tid */ if (tx_info->tid != -1) { - /* clear tcb state and then release tid */ - chcr_ktls_mark_tcb_close(tx_info); cxgb4_remove_tid(&tx_info->adap->tids, tx_info->tx_chan, tx_info->tid, tx_info->ip_family); } @@ -574,7 +560,6 @@ static int chcr_ktls_dev_add(struct net_device *netdev, struct sock *sk, return 0; free_tid: - chcr_ktls_mark_tcb_close(tx_info); #if IS_ENABLED(CONFIG_IPV6) /* clear clip entry */ if (tx_info->ip_family == AF_INET6) @@ -672,10 +657,6 @@ static int chcr_ktls_cpl_act_open_rpl(struct adapter *adap, if (tx_info->pending_close) { spin_unlock(&tx_info->lock); if (!status) { - /* it's a late success, tcb status is established, - * mark it close. - */ - chcr_ktls_mark_tcb_close(tx_info); cxgb4_remove_tid(&tx_info->adap->tids, tx_info->tx_chan, tid, tx_info->ip_family); } -- cgit 1.2.3-korg From e8a4155567b3c903f49cbf89b8017e9cc22c4fe4 Mon Sep 17 00:00:00 2001 From: Vinay Kumar Yadav Date: Thu, 15 Apr 2021 13:17:48 +0530 Subject: ch_ktls: do not send snd_una update to TCB in middle snd_una update should not be done when the same skb is being sent out.chcr_short_record_handler() sends it again even though SND_UNA update is already sent for the skb in chcr_ktls_xmit(), which causes mismatch in un-acked TCP seq number, later causes problem in sending out complete record. Fixes: 429765a149f1 ("chcr: handle partial end part of a record") Signed-off-by: Vinay Kumar Yadav Signed-off-by: Rohit Maheshwari Signed-off-by: David S. Miller --- .../chelsio/inline_crypto/ch_ktls/chcr_ktls.c | 53 ---------------------- 1 file changed, 53 deletions(-) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c index 8559eec161f0fd..a3f5b80888e5a6 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c @@ -1644,54 +1644,6 @@ static void chcr_ktls_copy_record_in_skb(struct sk_buff *nskb, refcount_add(nskb->truesize, &nskb->sk->sk_wmem_alloc); } -/* - * chcr_ktls_update_snd_una: Reset the SEND_UNA. It will be done to avoid - * sending the same segment again. It will discard the segment which is before - * the current tx max. - * @tx_info - driver specific tls info. - * @q - TX queue. - * return: NET_TX_OK/NET_XMIT_DROP. - */ -static int chcr_ktls_update_snd_una(struct chcr_ktls_info *tx_info, - struct sge_eth_txq *q) -{ - struct fw_ulptx_wr *wr; - unsigned int ndesc; - int credits; - void *pos; - u32 len; - - len = sizeof(*wr) + roundup(CHCR_SET_TCB_FIELD_LEN, 16); - ndesc = DIV_ROUND_UP(len, 64); - - credits = chcr_txq_avail(&q->q) - ndesc; - if (unlikely(credits < 0)) { - chcr_eth_txq_stop(q); - return NETDEV_TX_BUSY; - } - - pos = &q->q.desc[q->q.pidx]; - - wr = pos; - /* ULPTX wr */ - wr->op_to_compl = htonl(FW_WR_OP_V(FW_ULPTX_WR)); - wr->cookie = 0; - /* fill len in wr field */ - wr->flowid_len16 = htonl(FW_WR_LEN16_V(DIV_ROUND_UP(len, 16))); - - pos += sizeof(*wr); - - pos = chcr_write_cpl_set_tcb_ulp(tx_info, q, tx_info->tid, pos, - TCB_SND_UNA_RAW_W, - TCB_SND_UNA_RAW_V(TCB_SND_UNA_RAW_M), - TCB_SND_UNA_RAW_V(0), 0); - - chcr_txq_advance(&q->q, ndesc); - cxgb4_ring_tx_db(tx_info->adap, &q->q, ndesc); - - return 0; -} - /* * chcr_end_part_handler: This handler will handle the record which * is complete or if record's end part is received. T6 adapter has a issue that @@ -1892,11 +1844,6 @@ static int chcr_short_record_handler(struct chcr_ktls_info *tx_info, /* reset tcp_seq as per the prior_data_required len */ tcp_seq -= prior_data_len; } - /* reset snd una, so the middle record won't send the already - * sent part. - */ - if (chcr_ktls_update_snd_una(tx_info, q)) - goto out; atomic64_inc(&tx_info->adap->ch_ktls_stats.ktls_tx_middle_pkts); } else { atomic64_inc(&tx_info->adap->ch_ktls_stats.ktls_tx_start_pkts); -- cgit 1.2.3-korg From 9601148392520e2e134936e76788fc2a6371e7be Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 23 Mar 2021 08:32:59 +0100 Subject: bpf: Use correct permission flag for mixed signed bounds arithmetic We forbid adding unknown scalars with mixed signed bounds due to the spectre v1 masking mitigation. Hence this also needs bypass_spec_v1 flag instead of allow_ptr_leaks. Fixes: 2c78ee898d8f ("bpf: Implement CAP_BPF") Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3a738724a380c6..2ede4b85023022 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6085,7 +6085,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst, reg_type_str[ptr_reg->type]); return -EACCES; case PTR_TO_MAP_VALUE: - if (!env->allow_ptr_leaks && !known && (smin_val < 0) != (smax_val < 0)) { + if (!env->env->bypass_spec_v1 && !known && (smin_val < 0) != (smax_val < 0)) { verbose(env, "R%d has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root\n", off_reg == dst_reg ? dst : src); return -EACCES; -- cgit 1.2.3-korg From 6f55b2f2a1178856c19bbce2f71449926e731914 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 22 Mar 2021 15:45:52 +0100 Subject: bpf: Move off_reg into sanitize_ptr_alu Small refactor to drag off_reg into sanitize_ptr_alu(), so we later on can use off_reg for generalizing some of the checks for all pointer types. Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2ede4b85023022..4ee014dadac796 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5929,11 +5929,12 @@ static int sanitize_val_alu(struct bpf_verifier_env *env, static int sanitize_ptr_alu(struct bpf_verifier_env *env, struct bpf_insn *insn, const struct bpf_reg_state *ptr_reg, - struct bpf_reg_state *dst_reg, - bool off_is_neg) + const struct bpf_reg_state *off_reg, + struct bpf_reg_state *dst_reg) { struct bpf_verifier_state *vstate = env->cur_state; struct bpf_insn_aux_data *aux = cur_aux(env); + bool off_is_neg = off_reg->smin_value < 0; bool ptr_is_dst_reg = ptr_reg == dst_reg; u8 opcode = BPF_OP(insn->code); u32 alu_state, alu_limit; @@ -6110,7 +6111,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, switch (opcode) { case BPF_ADD: - ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0); + ret = sanitize_ptr_alu(env, insn, ptr_reg, off_reg, dst_reg); if (ret < 0) { verbose(env, "R%d tried to add from different maps, paths, or prohibited types\n", dst); return ret; @@ -6165,7 +6166,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } break; case BPF_SUB: - ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0); + ret = sanitize_ptr_alu(env, insn, ptr_reg, off_reg, dst_reg); if (ret < 0) { verbose(env, "R%d tried to sub from different maps, paths, or prohibited types\n", dst); return ret; -- cgit 1.2.3-korg From 24c109bb1537c12c02aeed2d51a347b4d6a9b76e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 23 Mar 2021 08:51:02 +0100 Subject: bpf: Ensure off_reg has no mixed signed bounds for all types The mixed signed bounds check really belongs into retrieve_ptr_limit() instead of outside of it in adjust_ptr_min_max_vals(). The reason is that this check is not tied to PTR_TO_MAP_VALUE only, but to all pointer types that we handle in retrieve_ptr_limit() and given errors from the latter propagate back to adjust_ptr_min_max_vals() and lead to rejection of the program, it's a better place to reside to avoid anything slipping through for future types. The reason why we must reject such off_reg is that we otherwise would not be able to derive a mask, see details in 9d7eceede769 ("bpf: restrict unknown scalars of mixed signed bounds for unprivileged"). Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4ee014dadac796..a21d7f1a0ba80b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5857,12 +5857,18 @@ static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env) } static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, - u32 *ptr_limit, u8 opcode, bool off_is_neg) + const struct bpf_reg_state *off_reg, + u32 *ptr_limit, u8 opcode) { + bool off_is_neg = off_reg->smin_value < 0; bool mask_to_left = (opcode == BPF_ADD && off_is_neg) || (opcode == BPF_SUB && !off_is_neg); u32 off, max; + if (!tnum_is_const(off_reg->var_off) && + (off_reg->smin_value < 0) != (off_reg->smax_value < 0)) + return -EACCES; + switch (ptr_reg->type) { case PTR_TO_STACK: /* Offset 0 is out-of-bounds, but acceptable start for the @@ -5956,7 +5962,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, alu_state |= ptr_is_dst_reg ? BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST; - err = retrieve_ptr_limit(ptr_reg, &alu_limit, opcode, off_is_neg); + err = retrieve_ptr_limit(ptr_reg, off_reg, &alu_limit, opcode); if (err < 0) return err; @@ -6036,8 +6042,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value; u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value, umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value; - u32 dst = insn->dst_reg, src = insn->src_reg; u8 opcode = BPF_OP(insn->code); + u32 dst = insn->dst_reg; int ret; dst_reg = ®s[dst]; @@ -6085,13 +6091,6 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, verbose(env, "R%d pointer arithmetic on %s prohibited\n", dst, reg_type_str[ptr_reg->type]); return -EACCES; - case PTR_TO_MAP_VALUE: - if (!env->env->bypass_spec_v1 && !known && (smin_val < 0) != (smax_val < 0)) { - verbose(env, "R%d has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root\n", - off_reg == dst_reg ? dst : src); - return -EACCES; - } - fallthrough; default: break; } -- cgit 1.2.3-korg From b658bbb844e28f1862867f37e8ca11a8e2aa94a3 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 23 Mar 2021 09:04:10 +0100 Subject: bpf: Rework ptr_limit into alu_limit and add common error path Small refactor with no semantic changes in order to consolidate the max ptr_limit boundary check. Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a21d7f1a0ba80b..b8e0171d9591e8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5858,12 +5858,12 @@ static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env) static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, const struct bpf_reg_state *off_reg, - u32 *ptr_limit, u8 opcode) + u32 *alu_limit, u8 opcode) { bool off_is_neg = off_reg->smin_value < 0; bool mask_to_left = (opcode == BPF_ADD && off_is_neg) || (opcode == BPF_SUB && !off_is_neg); - u32 off, max; + u32 off, max = 0, ptr_limit = 0; if (!tnum_is_const(off_reg->var_off) && (off_reg->smin_value < 0) != (off_reg->smax_value < 0)) @@ -5880,22 +5880,27 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, */ off = ptr_reg->off + ptr_reg->var_off.value; if (mask_to_left) - *ptr_limit = MAX_BPF_STACK + off; + ptr_limit = MAX_BPF_STACK + off; else - *ptr_limit = -off - 1; - return *ptr_limit >= max ? -ERANGE : 0; + ptr_limit = -off - 1; + break; case PTR_TO_MAP_VALUE: max = ptr_reg->map_ptr->value_size; if (mask_to_left) { - *ptr_limit = ptr_reg->umax_value + ptr_reg->off; + ptr_limit = ptr_reg->umax_value + ptr_reg->off; } else { off = ptr_reg->smin_value + ptr_reg->off; - *ptr_limit = ptr_reg->map_ptr->value_size - off - 1; + ptr_limit = ptr_reg->map_ptr->value_size - off - 1; } - return *ptr_limit >= max ? -ERANGE : 0; + break; default: return -EINVAL; } + + if (ptr_limit >= max) + return -ERANGE; + *alu_limit = ptr_limit; + return 0; } static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env, -- cgit 1.2.3-korg From a6aaece00a57fa6f22575364b3903dfbccf5345d Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 23 Mar 2021 09:30:01 +0100 Subject: bpf: Improve verifier error messages for users Consolidate all error handling and provide more user-friendly error messages from sanitize_ptr_alu() and sanitize_val_alu(). Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 86 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 63 insertions(+), 23 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b8e0171d9591e8..f378d4ae405fe7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5856,6 +5856,14 @@ static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env) return &env->insn_aux_data[env->insn_idx]; } +enum { + REASON_BOUNDS = -1, + REASON_TYPE = -2, + REASON_PATHS = -3, + REASON_LIMIT = -4, + REASON_STACK = -5, +}; + static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, const struct bpf_reg_state *off_reg, u32 *alu_limit, u8 opcode) @@ -5867,7 +5875,7 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, if (!tnum_is_const(off_reg->var_off) && (off_reg->smin_value < 0) != (off_reg->smax_value < 0)) - return -EACCES; + return REASON_BOUNDS; switch (ptr_reg->type) { case PTR_TO_STACK: @@ -5894,11 +5902,11 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, } break; default: - return -EINVAL; + return REASON_TYPE; } if (ptr_limit >= max) - return -ERANGE; + return REASON_LIMIT; *alu_limit = ptr_limit; return 0; } @@ -5918,7 +5926,7 @@ static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux, if (aux->alu_state && (aux->alu_state != alu_state || aux->alu_limit != alu_limit)) - return -EACCES; + return REASON_PATHS; /* Corresponding fixup done in fixup_bpf_calls(). */ aux->alu_state = alu_state; @@ -5991,7 +5999,46 @@ do_sim: ret = push_stack(env, env->insn_idx + 1, env->insn_idx, true); if (!ptr_is_dst_reg && ret) *dst_reg = tmp; - return !ret ? -EFAULT : 0; + return !ret ? REASON_STACK : 0; +} + +static int sanitize_err(struct bpf_verifier_env *env, + const struct bpf_insn *insn, int reason, + const struct bpf_reg_state *off_reg, + const struct bpf_reg_state *dst_reg) +{ + static const char *err = "pointer arithmetic with it prohibited for !root"; + const char *op = BPF_OP(insn->code) == BPF_ADD ? "add" : "sub"; + u32 dst = insn->dst_reg, src = insn->src_reg; + + switch (reason) { + case REASON_BOUNDS: + verbose(env, "R%d has unknown scalar with mixed signed bounds, %s\n", + off_reg == dst_reg ? dst : src, err); + break; + case REASON_TYPE: + verbose(env, "R%d has pointer with unsupported alu operation, %s\n", + off_reg == dst_reg ? src : dst, err); + break; + case REASON_PATHS: + verbose(env, "R%d tried to %s from different maps, paths or scalars, %s\n", + dst, op, err); + break; + case REASON_LIMIT: + verbose(env, "R%d tried to %s beyond pointer bounds, %s\n", + dst, op, err); + break; + case REASON_STACK: + verbose(env, "R%d could not be pushed for speculative verification, %s\n", + dst, err); + break; + default: + verbose(env, "verifier internal error: unknown reason (%d)\n", + reason); + break; + } + + return -EACCES; } /* check that stack access falls within stack limits and that 'reg' doesn't @@ -6116,10 +6163,9 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, switch (opcode) { case BPF_ADD: ret = sanitize_ptr_alu(env, insn, ptr_reg, off_reg, dst_reg); - if (ret < 0) { - verbose(env, "R%d tried to add from different maps, paths, or prohibited types\n", dst); - return ret; - } + if (ret < 0) + return sanitize_err(env, insn, ret, off_reg, dst_reg); + /* We can take a fixed offset as long as it doesn't overflow * the s32 'off' field */ @@ -6171,10 +6217,9 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, break; case BPF_SUB: ret = sanitize_ptr_alu(env, insn, ptr_reg, off_reg, dst_reg); - if (ret < 0) { - verbose(env, "R%d tried to sub from different maps, paths, or prohibited types\n", dst); - return ret; - } + if (ret < 0) + return sanitize_err(env, insn, ret, off_reg, dst_reg); + if (dst_reg == off_reg) { /* scalar -= pointer. Creates an unknown scalar */ verbose(env, "R%d tried to subtract pointer from scalar\n", @@ -6863,9 +6908,8 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, s32 s32_min_val, s32_max_val; u32 u32_min_val, u32_max_val; u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32; - u32 dst = insn->dst_reg; - int ret; bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64); + int ret; smin_val = src_reg.smin_value; smax_val = src_reg.smax_value; @@ -6924,20 +6968,16 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, switch (opcode) { case BPF_ADD: ret = sanitize_val_alu(env, insn); - if (ret < 0) { - verbose(env, "R%d tried to add from different pointers or scalars\n", dst); - return ret; - } + if (ret < 0) + return sanitize_err(env, insn, ret, NULL, NULL); scalar32_min_max_add(dst_reg, &src_reg); scalar_min_max_add(dst_reg, &src_reg); dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off); break; case BPF_SUB: ret = sanitize_val_alu(env, insn); - if (ret < 0) { - verbose(env, "R%d tried to sub from different pointers or scalars\n", dst); - return ret; - } + if (ret < 0) + return sanitize_err(env, insn, ret, NULL, NULL); scalar32_min_max_sub(dst_reg, &src_reg); scalar_min_max_sub(dst_reg, &src_reg); dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg.var_off); -- cgit 1.2.3-korg From 073815b756c51ba9d8384d924c5d1c03ca3d1ae4 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 23 Mar 2021 15:05:48 +0100 Subject: bpf: Refactor and streamline bounds check into helper Move the bounds check in adjust_ptr_min_max_vals() into a small helper named sanitize_check_bounds() in order to simplify the former a bit. Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 49 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 16 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f378d4ae405fe7..db77e2c670b9ee 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6075,6 +6075,37 @@ static int check_stack_access_for_ptr_arithmetic( return 0; } +static int sanitize_check_bounds(struct bpf_verifier_env *env, + const struct bpf_insn *insn, + const struct bpf_reg_state *dst_reg) +{ + u32 dst = insn->dst_reg; + + /* For unprivileged we require that resulting offset must be in bounds + * in order to be able to sanitize access later on. + */ + if (env->bypass_spec_v1) + return 0; + + switch (dst_reg->type) { + case PTR_TO_STACK: + if (check_stack_access_for_ptr_arithmetic(env, dst, dst_reg, + dst_reg->off + dst_reg->var_off.value)) + return -EACCES; + break; + case PTR_TO_MAP_VALUE: + if (check_map_access(env, dst, dst_reg->off, 1, false)) { + verbose(env, "R%d pointer arithmetic of map value goes out of range, " + "prohibited for !root\n", dst); + return -EACCES; + } + break; + default: + break; + } + + return 0; +} /* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off. * Caller should also handle BPF_MOV case separately. @@ -6300,22 +6331,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, __reg_deduce_bounds(dst_reg); __reg_bound_offset(dst_reg); - /* For unprivileged we require that resulting offset must be in bounds - * in order to be able to sanitize access later on. - */ - if (!env->bypass_spec_v1) { - if (dst_reg->type == PTR_TO_MAP_VALUE && - check_map_access(env, dst, dst_reg->off, 1, false)) { - verbose(env, "R%d pointer arithmetic of map value goes out of range, " - "prohibited for !root\n", dst); - return -EACCES; - } else if (dst_reg->type == PTR_TO_STACK && - check_stack_access_for_ptr_arithmetic( - env, dst, dst_reg, dst_reg->off + - dst_reg->var_off.value)) { - return -EACCES; - } - } + if (sanitize_check_bounds(env, insn, dst_reg) < 0) + return -EACCES; return 0; } -- cgit 1.2.3-korg From f528819334881fd622fdadeddb3f7edaed8b7c9b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 24 Mar 2021 11:25:39 +0100 Subject: bpf: Move sanitize_val_alu out of op switch Add a small sanitize_needed() helper function and move sanitize_val_alu() out of the main opcode switch. In upcoming work, we'll move sanitize_ptr_alu() as well out of its opcode switch so this helps to streamline both. Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index db77e2c670b9ee..e41b6326e3e67c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5945,6 +5945,11 @@ static int sanitize_val_alu(struct bpf_verifier_env *env, return update_alu_sanitation_state(aux, BPF_ALU_NON_POINTER, 0); } +static bool sanitize_needed(u8 opcode) +{ + return opcode == BPF_ADD || opcode == BPF_SUB; +} + static int sanitize_ptr_alu(struct bpf_verifier_env *env, struct bpf_insn *insn, const struct bpf_reg_state *ptr_reg, @@ -6968,6 +6973,12 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, return 0; } + if (sanitize_needed(opcode)) { + ret = sanitize_val_alu(env, insn); + if (ret < 0) + return sanitize_err(env, insn, ret, NULL, NULL); + } + /* Calculate sign/unsigned bounds and tnum for alu32 and alu64 bit ops. * There are two classes of instructions: The first class we track both * alu32 and alu64 sign/unsigned bounds independently this provides the @@ -6984,17 +6995,11 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, */ switch (opcode) { case BPF_ADD: - ret = sanitize_val_alu(env, insn); - if (ret < 0) - return sanitize_err(env, insn, ret, NULL, NULL); scalar32_min_max_add(dst_reg, &src_reg); scalar_min_max_add(dst_reg, &src_reg); dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off); break; case BPF_SUB: - ret = sanitize_val_alu(env, insn); - if (ret < 0) - return sanitize_err(env, insn, ret, NULL, NULL); scalar32_min_max_sub(dst_reg, &src_reg); scalar_min_max_sub(dst_reg, &src_reg); dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg.var_off); -- cgit 1.2.3-korg From 7fedb63a8307dda0ec3b8969a3b233a1dd7ea8e0 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 24 Mar 2021 10:38:26 +0100 Subject: bpf: Tighten speculative pointer arithmetic mask This work tightens the offset mask we use for unprivileged pointer arithmetic in order to mitigate a corner case reported by Piotr and Benedict where in the speculative domain it is possible to advance, for example, the map value pointer by up to value_size-1 out-of-bounds in order to leak kernel memory via side-channel to user space. Before this change, the computed ptr_limit for retrieve_ptr_limit() helper represents largest valid distance when moving pointer to the right or left which is then fed as aux->alu_limit to generate masking instructions against the offset register. After the change, the derived aux->alu_limit represents the largest potential value of the offset register which we mask against which is just a narrower subset of the former limit. For minimal complexity, we call sanitize_ptr_alu() from 2 observation points in adjust_ptr_min_max_vals(), that is, before and after the simulated alu operation. In the first step, we retieve the alu_state and alu_limit before the operation as well as we branch-off a verifier path and push it to the verification stack as we did before which checks the dst_reg under truncation, in other words, when the speculative domain would attempt to move the pointer out-of-bounds. In the second step, we retrieve the new alu_limit and calculate the absolute distance between both. Moreover, we commit the alu_state and final alu_limit via update_alu_sanitation_state() to the env's instruction aux data, and bail out from there if there is a mismatch due to coming from different verification paths with different states. Reported-by: Piotr Krysiuk Reported-by: Benedict Schlueter Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Acked-by: Alexei Starovoitov Tested-by: Benedict Schlueter --- kernel/bpf/verifier.c | 73 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 44 insertions(+), 29 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e41b6326e3e67c..0399ac092b3639 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5871,7 +5871,7 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, bool off_is_neg = off_reg->smin_value < 0; bool mask_to_left = (opcode == BPF_ADD && off_is_neg) || (opcode == BPF_SUB && !off_is_neg); - u32 off, max = 0, ptr_limit = 0; + u32 max = 0, ptr_limit = 0; if (!tnum_is_const(off_reg->var_off) && (off_reg->smin_value < 0) != (off_reg->smax_value < 0)) @@ -5880,26 +5880,18 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, switch (ptr_reg->type) { case PTR_TO_STACK: /* Offset 0 is out-of-bounds, but acceptable start for the - * left direction, see BPF_REG_FP. + * left direction, see BPF_REG_FP. Also, unknown scalar + * offset where we would need to deal with min/max bounds is + * currently prohibited for unprivileged. */ max = MAX_BPF_STACK + mask_to_left; - /* Indirect variable offset stack access is prohibited in - * unprivileged mode so it's not handled here. - */ - off = ptr_reg->off + ptr_reg->var_off.value; - if (mask_to_left) - ptr_limit = MAX_BPF_STACK + off; - else - ptr_limit = -off - 1; + ptr_limit = -(ptr_reg->var_off.value + ptr_reg->off); break; case PTR_TO_MAP_VALUE: max = ptr_reg->map_ptr->value_size; - if (mask_to_left) { - ptr_limit = ptr_reg->umax_value + ptr_reg->off; - } else { - off = ptr_reg->smin_value + ptr_reg->off; - ptr_limit = ptr_reg->map_ptr->value_size - off - 1; - } + ptr_limit = (mask_to_left ? + ptr_reg->smin_value : + ptr_reg->umax_value) + ptr_reg->off; break; default: return REASON_TYPE; @@ -5954,10 +5946,12 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, struct bpf_insn *insn, const struct bpf_reg_state *ptr_reg, const struct bpf_reg_state *off_reg, - struct bpf_reg_state *dst_reg) + struct bpf_reg_state *dst_reg, + struct bpf_insn_aux_data *tmp_aux, + const bool commit_window) { + struct bpf_insn_aux_data *aux = commit_window ? cur_aux(env) : tmp_aux; struct bpf_verifier_state *vstate = env->cur_state; - struct bpf_insn_aux_data *aux = cur_aux(env); bool off_is_neg = off_reg->smin_value < 0; bool ptr_is_dst_reg = ptr_reg == dst_reg; u8 opcode = BPF_OP(insn->code); @@ -5976,18 +5970,33 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, if (vstate->speculative) goto do_sim; - alu_state = off_is_neg ? BPF_ALU_NEG_VALUE : 0; - alu_state |= ptr_is_dst_reg ? - BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST; - err = retrieve_ptr_limit(ptr_reg, off_reg, &alu_limit, opcode); if (err < 0) return err; + if (commit_window) { + /* In commit phase we narrow the masking window based on + * the observed pointer move after the simulated operation. + */ + alu_state = tmp_aux->alu_state; + alu_limit = abs(tmp_aux->alu_limit - alu_limit); + } else { + alu_state = off_is_neg ? BPF_ALU_NEG_VALUE : 0; + alu_state |= ptr_is_dst_reg ? + BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST; + } + err = update_alu_sanitation_state(aux, alu_state, alu_limit); if (err < 0) return err; do_sim: + /* If we're in commit phase, we're done here given we already + * pushed the truncated dst_reg into the speculative verification + * stack. + */ + if (commit_window) + return 0; + /* Simulate and find potential out-of-bounds access under * speculative execution from truncation as a result of * masking when off was not within expected range. If off @@ -6130,6 +6139,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value; u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value, umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value; + struct bpf_insn_aux_data tmp_aux = {}; u8 opcode = BPF_OP(insn->code); u32 dst = insn->dst_reg; int ret; @@ -6196,12 +6206,15 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, /* pointer types do not carry 32-bit bounds at the moment. */ __mark_reg32_unbounded(dst_reg); - switch (opcode) { - case BPF_ADD: - ret = sanitize_ptr_alu(env, insn, ptr_reg, off_reg, dst_reg); + if (sanitize_needed(opcode)) { + ret = sanitize_ptr_alu(env, insn, ptr_reg, off_reg, dst_reg, + &tmp_aux, false); if (ret < 0) return sanitize_err(env, insn, ret, off_reg, dst_reg); + } + switch (opcode) { + case BPF_ADD: /* We can take a fixed offset as long as it doesn't overflow * the s32 'off' field */ @@ -6252,10 +6265,6 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } break; case BPF_SUB: - ret = sanitize_ptr_alu(env, insn, ptr_reg, off_reg, dst_reg); - if (ret < 0) - return sanitize_err(env, insn, ret, off_reg, dst_reg); - if (dst_reg == off_reg) { /* scalar -= pointer. Creates an unknown scalar */ verbose(env, "R%d tried to subtract pointer from scalar\n", @@ -6338,6 +6347,12 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, if (sanitize_check_bounds(env, insn, dst_reg) < 0) return -EACCES; + if (sanitize_needed(opcode)) { + ret = sanitize_ptr_alu(env, insn, dst_reg, off_reg, dst_reg, + &tmp_aux, true); + if (ret < 0) + return sanitize_err(env, insn, ret, off_reg, dst_reg); + } return 0; } -- cgit 1.2.3-korg From d7a5091351756d0ae8e63134313c455624e36a13 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 24 Mar 2021 14:52:31 +0100 Subject: bpf: Update selftests to reflect new error states Update various selftest error messages: * The 'Rx tried to sub from different maps, paths, or prohibited types' is reworked into more specific/differentiated error messages for better guidance. * The change into 'value -4294967168 makes map_value pointer be out of bounds' is due to moving the mixed bounds check into the speculation handling and thus occuring slightly later than above mentioned sanity check. * The change into 'math between map_value pointer and register with unbounded min value' is similarly due to register sanity check coming before the mixed bounds check. * The case of 'map access: known scalar += value_ptr from different maps' now loads fine given masks are the same from the different paths (despite max map value size being different). Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Acked-by: Alexei Starovoitov --- tools/testing/selftests/bpf/verifier/bounds.c | 5 ----- .../selftests/bpf/verifier/bounds_deduction.c | 21 +++++++++++---------- .../selftests/bpf/verifier/bounds_mix_sign_unsign.c | 13 ------------- tools/testing/selftests/bpf/verifier/map_ptr.c | 4 ++-- tools/testing/selftests/bpf/verifier/unpriv.c | 2 +- .../selftests/bpf/verifier/value_ptr_arith.c | 6 ++---- 6 files changed, 16 insertions(+), 35 deletions(-) diff --git a/tools/testing/selftests/bpf/verifier/bounds.c b/tools/testing/selftests/bpf/verifier/bounds.c index 57ed67b8607465..8a1caf46ffbc37 100644 --- a/tools/testing/selftests/bpf/verifier/bounds.c +++ b/tools/testing/selftests/bpf/verifier/bounds.c @@ -261,8 +261,6 @@ }, .fixup_map_hash_8b = { 3 }, /* not actually fully unbounded, but the bound is very high */ - .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root", - .result_unpriv = REJECT, .errstr = "value -4294967168 makes map_value pointer be out of bounds", .result = REJECT, }, @@ -298,9 +296,6 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, - /* not actually fully unbounded, but the bound is very high */ - .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root", - .result_unpriv = REJECT, .errstr = "value -4294967168 makes map_value pointer be out of bounds", .result = REJECT, }, diff --git a/tools/testing/selftests/bpf/verifier/bounds_deduction.c b/tools/testing/selftests/bpf/verifier/bounds_deduction.c index c162498a64fc6a..91869aea6d6414 100644 --- a/tools/testing/selftests/bpf/verifier/bounds_deduction.c +++ b/tools/testing/selftests/bpf/verifier/bounds_deduction.c @@ -6,7 +6,7 @@ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .errstr_unpriv = "R0 tried to sub from different maps, paths, or prohibited types", + .errstr_unpriv = "R1 has pointer with unsupported alu operation", .errstr = "R0 tried to subtract pointer from scalar", .result = REJECT, }, @@ -21,7 +21,7 @@ BPF_ALU64_REG(BPF_SUB, BPF_REG_1, BPF_REG_0), BPF_EXIT_INSN(), }, - .errstr_unpriv = "R1 tried to sub from different maps, paths, or prohibited types", + .errstr_unpriv = "R1 has pointer with unsupported alu operation", .result_unpriv = REJECT, .result = ACCEPT, .retval = 1, @@ -34,22 +34,23 @@ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .errstr_unpriv = "R0 tried to sub from different maps, paths, or prohibited types", + .errstr_unpriv = "R1 has pointer with unsupported alu operation", .errstr = "R0 tried to subtract pointer from scalar", .result = REJECT, }, { "check deducing bounds from const, 4", .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_JMP_IMM(BPF_JSLE, BPF_REG_0, 0, 1), BPF_EXIT_INSN(), BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 1), BPF_EXIT_INSN(), - BPF_ALU64_REG(BPF_SUB, BPF_REG_1, BPF_REG_0), + BPF_ALU64_REG(BPF_SUB, BPF_REG_6, BPF_REG_0), BPF_EXIT_INSN(), }, - .errstr_unpriv = "R1 tried to sub from different maps, paths, or prohibited types", + .errstr_unpriv = "R6 has pointer with unsupported alu operation", .result_unpriv = REJECT, .result = ACCEPT, }, @@ -61,7 +62,7 @@ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .errstr_unpriv = "R0 tried to sub from different maps, paths, or prohibited types", + .errstr_unpriv = "R1 has pointer with unsupported alu operation", .errstr = "R0 tried to subtract pointer from scalar", .result = REJECT, }, @@ -74,7 +75,7 @@ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .errstr_unpriv = "R0 tried to sub from different maps, paths, or prohibited types", + .errstr_unpriv = "R1 has pointer with unsupported alu operation", .errstr = "R0 tried to subtract pointer from scalar", .result = REJECT, }, @@ -88,7 +89,7 @@ offsetof(struct __sk_buff, mark)), BPF_EXIT_INSN(), }, - .errstr_unpriv = "R1 tried to sub from different maps, paths, or prohibited types", + .errstr_unpriv = "R1 has pointer with unsupported alu operation", .errstr = "dereference of modified ctx ptr", .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, @@ -103,7 +104,7 @@ offsetof(struct __sk_buff, mark)), BPF_EXIT_INSN(), }, - .errstr_unpriv = "R1 tried to add from different maps, paths, or prohibited types", + .errstr_unpriv = "R1 has pointer with unsupported alu operation", .errstr = "dereference of modified ctx ptr", .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, @@ -116,7 +117,7 @@ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .errstr_unpriv = "R0 tried to sub from different maps, paths, or prohibited types", + .errstr_unpriv = "R1 has pointer with unsupported alu operation", .errstr = "R0 tried to subtract pointer from scalar", .result = REJECT, }, diff --git a/tools/testing/selftests/bpf/verifier/bounds_mix_sign_unsign.c b/tools/testing/selftests/bpf/verifier/bounds_mix_sign_unsign.c index 9baca7a75c42ae..c2aa6f26738b41 100644 --- a/tools/testing/selftests/bpf/verifier/bounds_mix_sign_unsign.c +++ b/tools/testing/selftests/bpf/verifier/bounds_mix_sign_unsign.c @@ -19,7 +19,6 @@ }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", - .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -43,7 +42,6 @@ }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", - .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -69,7 +67,6 @@ }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", - .errstr_unpriv = "R8 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -94,7 +91,6 @@ }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", - .errstr_unpriv = "R8 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -141,7 +137,6 @@ }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", - .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -210,7 +205,6 @@ }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", - .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -260,7 +254,6 @@ }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", - .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -287,7 +280,6 @@ }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", - .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -313,7 +305,6 @@ }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", - .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -342,7 +333,6 @@ }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", - .errstr_unpriv = "R7 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -372,7 +362,6 @@ }, .fixup_map_hash_8b = { 4 }, .errstr = "unbounded min value", - .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -400,7 +389,5 @@ }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", - .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds", .result = REJECT, - .result_unpriv = REJECT, }, diff --git a/tools/testing/selftests/bpf/verifier/map_ptr.c b/tools/testing/selftests/bpf/verifier/map_ptr.c index 6f610cfddae53b..1f82021429bf23 100644 --- a/tools/testing/selftests/bpf/verifier/map_ptr.c +++ b/tools/testing/selftests/bpf/verifier/map_ptr.c @@ -76,7 +76,7 @@ }, .fixup_map_hash_16b = { 4 }, .result_unpriv = REJECT, - .errstr_unpriv = "R1 tried to add from different maps, paths, or prohibited types", + .errstr_unpriv = "R1 has pointer with unsupported alu operation", .result = ACCEPT, }, { @@ -94,6 +94,6 @@ }, .fixup_map_hash_16b = { 4 }, .result_unpriv = REJECT, - .errstr_unpriv = "R1 tried to add from different maps, paths, or prohibited types", + .errstr_unpriv = "R0 has pointer with unsupported alu operation", .result = ACCEPT, }, diff --git a/tools/testing/selftests/bpf/verifier/unpriv.c b/tools/testing/selftests/bpf/verifier/unpriv.c index 3e32400c4b44b5..bd436df5cc3266 100644 --- a/tools/testing/selftests/bpf/verifier/unpriv.c +++ b/tools/testing/selftests/bpf/verifier/unpriv.c @@ -505,7 +505,7 @@ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, -8), BPF_EXIT_INSN(), }, - .errstr_unpriv = "R1 tried to add from different maps, paths, or prohibited types", + .errstr_unpriv = "R1 stack pointer arithmetic goes out of range", .result_unpriv = REJECT, .result = ACCEPT, }, diff --git a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c index feb91266db39a0..e5913fd3b9030d 100644 --- a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c +++ b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c @@ -21,8 +21,6 @@ .fixup_map_hash_16b = { 5 }, .fixup_map_array_48b = { 8 }, .result = ACCEPT, - .result_unpriv = REJECT, - .errstr_unpriv = "R1 tried to add from different maps", .retval = 1, }, { @@ -122,7 +120,7 @@ .fixup_map_array_48b = { 1 }, .result = ACCEPT, .result_unpriv = REJECT, - .errstr_unpriv = "R2 tried to add from different pointers or scalars", + .errstr_unpriv = "R2 tried to add from different maps, paths or scalars", .retval = 0, }, { @@ -169,7 +167,7 @@ .fixup_map_array_48b = { 1 }, .result = ACCEPT, .result_unpriv = REJECT, - .errstr_unpriv = "R2 tried to add from different maps, paths, or prohibited types", + .errstr_unpriv = "R2 tried to add from different maps, paths or scalars", .retval = 0, }, { -- cgit 1.2.3-korg From 6b389c16378a03fe71f3b1365b593ba41d2dd8ec Mon Sep 17 00:00:00 2001 From: Lijun Pan Date: Thu, 15 Apr 2021 23:18:13 -0500 Subject: MAINTAINERS: update my email Update my email and change myself to Reviewer. Signed-off-by: Lijun Pan Signed-off-by: David S. Miller --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 163264c282eb82..a31f76c41aec26 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8517,9 +8517,9 @@ F: drivers/pci/hotplug/rpaphp* IBM Power SRIOV Virtual NIC Device Driver M: Dany Madden -M: Lijun Pan M: Sukadev Bhattiprolu R: Thomas Falcon +R: Lijun Pan L: netdev@vger.kernel.org S: Supported F: drivers/net/ethernet/ibm/ibmvnic.* -- cgit 1.2.3-korg From f2764bd4f6a8dffaec3e220728385d9756b3c2cb Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 16 Apr 2021 21:29:13 +0200 Subject: netlink: don't call ->netlink_bind with table lock held When I added support to allow generic netlink multicast groups to be restricted to subscribers with CAP_NET_ADMIN I was unaware that a genl_bind implementation already existed in the past. It was reverted due to ABBA deadlock: 1. ->netlink_bind gets called with the table lock held. 2. genetlink bind callback is invoked, it grabs the genl lock. But when a new genl subsystem is (un)registered, these two locks are taken in reverse order. One solution would be to revert again and add a comment in genl referring 1e82a62fec613, "genetlink: remove genl_bind"). This would need a second change in mptcp to not expose the raw token value anymore, e.g. by hashing the token with a secret key so userspace can still associate subflow events with the correct mptcp connection. However, Paolo Abeni reminded me to double-check why the netlink table is locked in the first place. I can't find one. netlink_bind() is already called without this lock when userspace joins a group via NETLINK_ADD_MEMBERSHIP setsockopt. Same holds for the netlink_unbind operation. Digging through the history, commit f773608026ee1 ("netlink: access nlk groups safely in netlink bind and getname") expanded the lock scope. commit 3a20773beeeeade ("net: netlink: cap max groups which will be considered in netlink_bind()") ... removed the nlk->ngroups access that the lock scope extension was all about. Reduce the lock scope again and always call ->netlink_bind without the table lock. The Fixes tag should be vs. the patch mentioned in the link below, but that one got squash-merged into the patch that came earlier in the series. Fixes: 4d54cc32112d8d ("mptcp: avoid lock_fast usage in accept path") Link: https://lore.kernel.org/mptcp/20210213000001.379332-8-mathew.j.martineau@linux.intel.com/T/#u Cc: Cong Wang Cc: Xin Long Cc: Johannes Berg Cc: Sean Tranchetti Cc: Paolo Abeni Cc: Pablo Neira Ayuso Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index dd488938447f97..3a62f97acf39d1 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1019,7 +1019,6 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, return -EINVAL; } - netlink_lock_table(); if (nlk->netlink_bind && groups) { int group; @@ -1031,13 +1030,14 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, if (!err) continue; netlink_undo_bind(group, groups, sk); - goto unlock; + return err; } } /* No need for barriers here as we return to user-space without * using any of the bound attributes. */ + netlink_lock_table(); if (!bound) { err = nladdr->nl_pid ? netlink_insert(sk, nladdr->nl_pid) : -- cgit 1.2.3-korg