aboutsummaryrefslogtreecommitdiffstats
path: root/net/bridge
diff options
context:
space:
mode:
Diffstat (limited to 'net/bridge')
-rw-r--r--net/bridge/br_netfilter_hooks.c96
-rw-r--r--net/bridge/br_switchdev.c84
-rw-r--r--net/bridge/netfilter/nf_conntrack_bridge.c30
3 files changed, 182 insertions, 28 deletions
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index ed17208907578..35e10c5a766d5 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -43,6 +43,10 @@
#include <linux/sysctl.h>
#endif
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack_core.h>
+#endif
+
static unsigned int brnf_net_id __read_mostly;
struct brnf_net {
@@ -553,6 +557,90 @@ static unsigned int br_nf_pre_routing(void *priv,
return NF_STOLEN;
}
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+/* conntracks' nf_confirm logic cannot handle cloned skbs referencing
+ * the same nf_conn entry, which will happen for multicast (broadcast)
+ * Frames on bridges.
+ *
+ * Example:
+ * macvlan0
+ * br0
+ * ethX ethY
+ *
+ * ethX (or Y) receives multicast or broadcast packet containing
+ * an IP packet, not yet in conntrack table.
+ *
+ * 1. skb passes through bridge and fake-ip (br_netfilter)Prerouting.
+ * -> skb->_nfct now references a unconfirmed entry
+ * 2. skb is broad/mcast packet. bridge now passes clones out on each bridge
+ * interface.
+ * 3. skb gets passed up the stack.
+ * 4. In macvlan case, macvlan driver retains clone(s) of the mcast skb
+ * and schedules a work queue to send them out on the lower devices.
+ *
+ * The clone skb->_nfct is not a copy, it is the same entry as the
+ * original skb. The macvlan rx handler then returns RX_HANDLER_PASS.
+ * 5. Normal conntrack hooks (in NF_INET_LOCAL_IN) confirm the orig skb.
+ *
+ * The Macvlan broadcast worker and normal confirm path will race.
+ *
+ * This race will not happen if step 2 already confirmed a clone. In that
+ * case later steps perform skb_clone() with skb->_nfct already confirmed (in
+ * hash table). This works fine.
+ *
+ * But such confirmation won't happen when eb/ip/nftables rules dropped the
+ * packets before they reached the nf_confirm step in postrouting.
+ *
+ * Work around this problem by explicit confirmation of the entry at
+ * LOCAL_IN time, before upper layer has a chance to clone the unconfirmed
+ * entry.
+ *
+ */
+static unsigned int br_nf_local_in(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nf_conntrack *nfct = skb_nfct(skb);
+ const struct nf_ct_hook *ct_hook;
+ struct nf_conn *ct;
+ int ret;
+
+ if (!nfct || skb->pkt_type == PACKET_HOST)
+ return NF_ACCEPT;
+
+ ct = container_of(nfct, struct nf_conn, ct_general);
+ if (likely(nf_ct_is_confirmed(ct)))
+ return NF_ACCEPT;
+
+ WARN_ON_ONCE(skb_shared(skb));
+ WARN_ON_ONCE(refcount_read(&nfct->use) != 1);
+
+ /* We can't call nf_confirm here, it would create a dependency
+ * on nf_conntrack module.
+ */
+ ct_hook = rcu_dereference(nf_ct_hook);
+ if (!ct_hook) {
+ skb->_nfct = 0ul;
+ nf_conntrack_put(nfct);
+ return NF_ACCEPT;
+ }
+
+ nf_bridge_pull_encap_header(skb);
+ ret = ct_hook->confirm(skb);
+ switch (ret & NF_VERDICT_MASK) {
+ case NF_STOLEN:
+ return NF_STOLEN;
+ default:
+ nf_bridge_push_encap_header(skb);
+ break;
+ }
+
+ ct = container_of(nfct, struct nf_conn, ct_general);
+ WARN_ON_ONCE(!nf_ct_is_confirmed(ct));
+
+ return ret;
+}
+#endif
/* PF_BRIDGE/FORWARD *************************************************/
static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
@@ -964,6 +1052,14 @@ static const struct nf_hook_ops br_nf_ops[] = {
.hooknum = NF_BR_PRE_ROUTING,
.priority = NF_BR_PRI_BRNF,
},
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ {
+ .hook = br_nf_local_in,
+ .pf = NFPROTO_BRIDGE,
+ .hooknum = NF_BR_LOCAL_IN,
+ .priority = NF_BR_PRI_LAST,
+ },
+#endif
{
.hook = br_nf_forward,
.pf = NFPROTO_BRIDGE,
diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c
index ee84e783e1dff..7b41ee8740cbb 100644
--- a/net/bridge/br_switchdev.c
+++ b/net/bridge/br_switchdev.c
@@ -595,21 +595,40 @@ br_switchdev_mdb_replay_one(struct notifier_block *nb, struct net_device *dev,
}
static int br_switchdev_mdb_queue_one(struct list_head *mdb_list,
+ struct net_device *dev,
+ unsigned long action,
enum switchdev_obj_id id,
const struct net_bridge_mdb_entry *mp,
struct net_device *orig_dev)
{
- struct switchdev_obj_port_mdb *mdb;
+ struct switchdev_obj_port_mdb mdb = {
+ .obj = {
+ .id = id,
+ .orig_dev = orig_dev,
+ },
+ };
+ struct switchdev_obj_port_mdb *pmdb;
- mdb = kzalloc(sizeof(*mdb), GFP_ATOMIC);
- if (!mdb)
- return -ENOMEM;
+ br_switchdev_mdb_populate(&mdb, mp);
+
+ if (action == SWITCHDEV_PORT_OBJ_ADD &&
+ switchdev_port_obj_act_is_deferred(dev, action, &mdb.obj)) {
+ /* This event is already in the deferred queue of
+ * events, so this replay must be elided, lest the
+ * driver receives duplicate events for it. This can
+ * only happen when replaying additions, since
+ * modifications are always immediately visible in
+ * br->mdb_list, whereas actual event delivery may be
+ * delayed.
+ */
+ return 0;
+ }
- mdb->obj.id = id;
- mdb->obj.orig_dev = orig_dev;
- br_switchdev_mdb_populate(mdb, mp);
- list_add_tail(&mdb->obj.list, mdb_list);
+ pmdb = kmemdup(&mdb, sizeof(mdb), GFP_ATOMIC);
+ if (!pmdb)
+ return -ENOMEM;
+ list_add_tail(&pmdb->obj.list, mdb_list);
return 0;
}
@@ -677,51 +696,50 @@ br_switchdev_mdb_replay(struct net_device *br_dev, struct net_device *dev,
if (!br_opt_get(br, BROPT_MULTICAST_ENABLED))
return 0;
- /* We cannot walk over br->mdb_list protected just by the rtnl_mutex,
- * because the write-side protection is br->multicast_lock. But we
- * need to emulate the [ blocking ] calling context of a regular
- * switchdev event, so since both br->multicast_lock and RCU read side
- * critical sections are atomic, we have no choice but to pick the RCU
- * read side lock, queue up all our events, leave the critical section
- * and notify switchdev from blocking context.
+ if (adding)
+ action = SWITCHDEV_PORT_OBJ_ADD;
+ else
+ action = SWITCHDEV_PORT_OBJ_DEL;
+
+ /* br_switchdev_mdb_queue_one() will take care to not queue a
+ * replay of an event that is already pending in the switchdev
+ * deferred queue. In order to safely determine that, there
+ * must be no new deferred MDB notifications enqueued for the
+ * duration of the MDB scan. Therefore, grab the write-side
+ * lock to avoid racing with any concurrent IGMP/MLD snooping.
*/
- rcu_read_lock();
+ spin_lock_bh(&br->multicast_lock);
- hlist_for_each_entry_rcu(mp, &br->mdb_list, mdb_node) {
+ hlist_for_each_entry(mp, &br->mdb_list, mdb_node) {
struct net_bridge_port_group __rcu * const *pp;
const struct net_bridge_port_group *p;
if (mp->host_joined) {
- err = br_switchdev_mdb_queue_one(&mdb_list,
+ err = br_switchdev_mdb_queue_one(&mdb_list, dev, action,
SWITCHDEV_OBJ_ID_HOST_MDB,
mp, br_dev);
if (err) {
- rcu_read_unlock();
+ spin_unlock_bh(&br->multicast_lock);
goto out_free_mdb;
}
}
- for (pp = &mp->ports; (p = rcu_dereference(*pp)) != NULL;
+ for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL;
pp = &p->next) {
if (p->key.port->dev != dev)
continue;
- err = br_switchdev_mdb_queue_one(&mdb_list,
+ err = br_switchdev_mdb_queue_one(&mdb_list, dev, action,
SWITCHDEV_OBJ_ID_PORT_MDB,
mp, dev);
if (err) {
- rcu_read_unlock();
+ spin_unlock_bh(&br->multicast_lock);
goto out_free_mdb;
}
}
}
- rcu_read_unlock();
-
- if (adding)
- action = SWITCHDEV_PORT_OBJ_ADD;
- else
- action = SWITCHDEV_PORT_OBJ_DEL;
+ spin_unlock_bh(&br->multicast_lock);
list_for_each_entry(obj, &mdb_list, list) {
err = br_switchdev_mdb_replay_one(nb, dev,
@@ -786,6 +804,16 @@ static void nbp_switchdev_unsync_objs(struct net_bridge_port *p,
br_switchdev_mdb_replay(br_dev, dev, ctx, false, blocking_nb, NULL);
br_switchdev_vlan_replay(br_dev, ctx, false, blocking_nb, NULL);
+
+ /* Make sure that the device leaving this bridge has seen all
+ * relevant events before it is disassociated. In the normal
+ * case, when the device is directly attached to the bridge,
+ * this is covered by del_nbp(). If the association was indirect
+ * however, e.g. via a team or bond, and the device is leaving
+ * that intermediate device, then the bridge port remains in
+ * place.
+ */
+ switchdev_deferred_process();
}
/* Let the bridge know that this port is offloaded, so that it can assign a
diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c
index abb090f94ed26..6f877e31709ba 100644
--- a/net/bridge/netfilter/nf_conntrack_bridge.c
+++ b/net/bridge/netfilter/nf_conntrack_bridge.c
@@ -291,6 +291,30 @@ static unsigned int nf_ct_bridge_pre(void *priv, struct sk_buff *skb,
return nf_conntrack_in(skb, &bridge_state);
}
+static unsigned int nf_ct_bridge_in(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+
+ if (skb->pkt_type == PACKET_HOST)
+ return NF_ACCEPT;
+
+ /* nf_conntrack_confirm() cannot handle concurrent clones,
+ * this happens for broad/multicast frames with e.g. macvlan on top
+ * of the bridge device.
+ */
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct || nf_ct_is_confirmed(ct) || nf_ct_is_template(ct))
+ return NF_ACCEPT;
+
+ /* let inet prerouting call conntrack again */
+ skb->_nfct = 0;
+ nf_ct_put(ct);
+
+ return NF_ACCEPT;
+}
+
static void nf_ct_bridge_frag_save(struct sk_buff *skb,
struct nf_bridge_frag_data *data)
{
@@ -386,6 +410,12 @@ static struct nf_hook_ops nf_ct_bridge_hook_ops[] __read_mostly = {
.priority = NF_IP_PRI_CONNTRACK,
},
{
+ .hook = nf_ct_bridge_in,
+ .pf = NFPROTO_BRIDGE,
+ .hooknum = NF_BR_LOCAL_IN,
+ .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
+ },
+ {
.hook = nf_ct_bridge_post,
.pf = NFPROTO_BRIDGE,
.hooknum = NF_BR_POST_ROUTING,