Nikolay Aleksandrov
2017-Aug-16 20:38 UTC
[Bridge] [PATCH 1/6] bridge: learn dst metadata in FDB
On 16/08/17 20:01, David Lamparter wrote:> This implements holding dst metadata information in the bridge layer, > but only for unicast entries in the MAC table. Multicast is still left > to design and implement. > > Signed-off-by: David Lamparter <equinox at diac24.net> > ---Hi David, Sorry but I do not agree with this change, adding a special case for VPLS in the bridge code and hitting the fast path for everyone in a few different places for a feature that the majority will not use does not sound acceptable to me. We've been trying hard to optimize it, trying to avoid additional cache lines, removing tests and keeping special cases to a minimum. I understand that you want to use the fdb tables and avoid duplication, but this is not worth it. There're other similar use cases and they have their own private fdb tables, that way the user can opt out and is much cleaner and separated. As you've noted this is only an RFC so I will not point out every issue, but there seems to be a major problem with br_fdb_update(), note that it runs without any locks except RCU. By the way I've added the bridge maintainers to the CC list. Thanks, Nik> include/net/dst_metadata.h | 19 +++++++++++++------ > net/bridge/br_device.c | 4 ++++ > net/bridge/br_fdb.c | 45 +++++++++++++++++++++++++++++++-------------- > net/bridge/br_input.c | 6 ++++-- > net/bridge/br_private.h | 4 +++- > 5 files changed, 55 insertions(+), 23 deletions(-) > > diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h > index a803129a4849..8858dc441458 100644 > --- a/include/net/dst_metadata.h > +++ b/include/net/dst_metadata.h > @@ -56,16 +56,15 @@ static inline bool skb_valid_dst(const struct sk_buff *skb) > return dst && !(dst->flags & DST_METADATA); > } > > -static inline int skb_metadata_dst_cmp(const struct sk_buff *skb_a, > - const struct sk_buff *skb_b) > +static inline int dst_metadata_cmp(const struct dst_entry *dst_a, > + const struct dst_entry *dst_b) > { > const struct metadata_dst *a, *b; > - > - if (!(skb_a->_skb_refdst | skb_b->_skb_refdst)) > + if (!(dst_a || dst_b)) > return 0; > > - a = (const struct metadata_dst *) skb_dst(skb_a); > - b = (const struct metadata_dst *) skb_dst(skb_b); > + a = (const struct metadata_dst *) dst_a; > + b = (const struct metadata_dst *) dst_b; > > if (!a != !b || a->type != b->type) > return 1; > @@ -83,6 +82,14 @@ static inline int skb_metadata_dst_cmp(const struct sk_buff *skb_a, > } > } > > +static inline int skb_metadata_dst_cmp(const struct sk_buff *skb_a, > + const struct sk_buff *skb_b) > +{ > + if (!(skb_a->_skb_refdst | skb_b->_skb_refdst)) > + return 0; > + return dst_metadata_cmp(skb_dst(skb_a), skb_dst(skb_b)); > +} > + > void metadata_dst_free(struct metadata_dst *); > struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type, > gfp_t flags); > diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c > index 861ae2a165f4..534cacf02f8d 100644 > --- a/net/bridge/br_device.c > +++ b/net/bridge/br_device.c > @@ -53,6 +53,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) > brstats->tx_bytes += skb->len; > u64_stats_update_end(&brstats->syncp); > > + skb_dst_drop(skb); > BR_INPUT_SKB_CB(skb)->brdev = dev; > > skb_reset_mac_header(skb); > @@ -81,6 +82,9 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) > else > br_flood(br, skb, BR_PKT_MULTICAST, false, true); > } else if ((dst = br_fdb_find_rcu(br, dest, vid)) != NULL) { > + struct dst_entry *md_dst = rcu_dereference(dst->md_dst); > + if (md_dst) > + skb_dst_set_noref(skb, md_dst); > br_forward(dst->dst, skb, false, true); > } else { > br_flood(br, skb, BR_PKT_UNICAST, false, true); > diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c > index a79b648aac88..0751fcb89699 100644 > --- a/net/bridge/br_fdb.c > +++ b/net/bridge/br_fdb.c > @@ -25,11 +25,13 @@ > #include <asm/unaligned.h> > #include <linux/if_vlan.h> > #include <net/switchdev.h> > +#include <net/dst_metadata.h> > #include "br_private.h" > > static struct kmem_cache *br_fdb_cache __read_mostly; > static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, > - const unsigned char *addr, u16 vid); > + struct dst_entry *md_dst, const unsigned char *addr, > + u16 vid); > static void fdb_notify(struct net_bridge *br, > const struct net_bridge_fdb_entry *, int); > > @@ -174,6 +176,8 @@ static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f) > if (f->is_static) > fdb_del_hw_addr(br, f->addr.addr); > > + dst_release(rcu_access_pointer(f->md_dst)); > + > hlist_del_init_rcu(&f->hlist); > fdb_notify(br, f, RTM_DELNEIGH); > call_rcu(&f->rcu, fdb_rcu_free); > @@ -260,7 +264,7 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr) > > insert: > /* insert new address, may fail if invalid address or dup. */ > - fdb_insert(br, p, newaddr, 0); > + fdb_insert(br, p, NULL, newaddr, 0); > > if (!vg || !vg->num_vlans) > goto done; > @@ -270,7 +274,7 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr) > * from under us. > */ > list_for_each_entry(v, &vg->vlan_list, vlist) > - fdb_insert(br, p, newaddr, v->vid); > + fdb_insert(br, p, NULL, newaddr, v->vid); > > done: > spin_unlock_bh(&br->hash_lock); > @@ -289,10 +293,11 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) > if (f && f->is_local && !f->dst && !f->added_by_user) > fdb_delete_local(br, NULL, f); > > - fdb_insert(br, NULL, newaddr, 0); > + fdb_insert(br, NULL, NULL, newaddr, 0); > vg = br_vlan_group(br); > if (!vg || !vg->num_vlans) > goto out; > + > /* Now remove and add entries for every VLAN configured on the > * bridge. This function runs under RTNL so the bitmap will not > * change from under us. > @@ -303,7 +308,7 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) > f = br_fdb_find(br, br->dev->dev_addr, v->vid); > if (f && f->is_local && !f->dst && !f->added_by_user) > fdb_delete_local(br, NULL, f); > - fdb_insert(br, NULL, newaddr, v->vid); > + fdb_insert(br, NULL, NULL, newaddr, v->vid); > } > out: > spin_unlock_bh(&br->hash_lock); > @@ -477,6 +482,7 @@ int br_fdb_fillbuf(struct net_bridge *br, void *buf, > > static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head, > struct net_bridge_port *source, > + struct dst_entry *md_dst, > const unsigned char *addr, > __u16 vid, > unsigned char is_local, > @@ -488,6 +494,7 @@ static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head, > if (fdb) { > memcpy(fdb->addr.addr, addr, ETH_ALEN); > fdb->dst = source; > + rcu_assign_pointer(fdb->md_dst, dst_clone(md_dst)); > fdb->vlan_id = vid; > fdb->is_local = is_local; > fdb->is_static = is_static; > @@ -501,7 +508,8 @@ static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head, > } > > static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, > - const unsigned char *addr, u16 vid) > + struct dst_entry *md_dst, const unsigned char *addr, > + u16 vid) > { > struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)]; > struct net_bridge_fdb_entry *fdb; > @@ -521,7 +529,7 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, > fdb_delete(br, fdb); > } > > - fdb = fdb_create(head, source, addr, vid, 1, 1); > + fdb = fdb_create(head, source, md_dst, addr, vid, 1, 1); > if (!fdb) > return -ENOMEM; > > @@ -537,13 +545,14 @@ int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source, > int ret; > > spin_lock_bh(&br->hash_lock); > - ret = fdb_insert(br, source, addr, vid); > + ret = fdb_insert(br, source, NULL, addr, vid); > spin_unlock_bh(&br->hash_lock); > return ret; > } > > void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, > - const unsigned char *addr, u16 vid, bool added_by_user) > + struct dst_entry *md_dst, const unsigned char *addr, > + u16 vid, bool added_by_user) > { > struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)]; > struct net_bridge_fdb_entry *fdb; > @@ -558,6 +567,9 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, > source->state == BR_STATE_FORWARDING)) > return; > > + if (md_dst && !(md_dst->flags & DST_METADATA)) > + md_dst = NULL; > + > fdb = fdb_find_rcu(head, addr, vid); > if (likely(fdb)) { > /* attempt to update an entry for a local interface */ > @@ -567,10 +579,15 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, > source->dev->name, addr, vid); > } else { > unsigned long now = jiffies; > + struct dst_entry *ref_md = rcu_access_pointer(fdb->md_dst); > > /* fastpath: update of existing entry */ > - if (unlikely(source != fdb->dst)) { > + if (unlikely(source != fdb->dst || > + dst_metadata_cmp(md_dst, ref_md))) { > fdb->dst = source; > + dst_release(ref_md); > + rcu_assign_pointer(fdb->md_dst, > + dst_clone(md_dst)); > fdb_modified = true; > /* Take over HW learned entry */ > if (unlikely(fdb->added_by_external_learn)) > @@ -586,7 +603,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, > } else { > spin_lock(&br->hash_lock); > if (likely(!fdb_find_rcu(head, addr, vid))) { > - fdb = fdb_create(head, source, addr, vid, 0, 0); > + fdb = fdb_create(head, source, md_dst, addr, vid, 0, 0); > if (fdb) { > if (unlikely(added_by_user)) > fdb->added_by_user = 1; > @@ -781,7 +798,7 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, > if (!(flags & NLM_F_CREATE)) > return -ENOENT; > > - fdb = fdb_create(head, source, addr, vid, 0, 0); > + fdb = fdb_create(head, source, NULL, addr, vid, 0, 0); > if (!fdb) > return -ENOMEM; > > @@ -844,7 +861,7 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br, > } > local_bh_disable(); > rcu_read_lock(); > - br_fdb_update(br, p, addr, vid, true); > + br_fdb_update(br, p, NULL, addr, vid, true); > rcu_read_unlock(); > local_bh_enable(); > } else if (ndm->ndm_flags & NTF_EXT_LEARNED) { > @@ -1071,7 +1088,7 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, > head = &br->hash[br_mac_hash(addr, vid)]; > fdb = br_fdb_find(br, addr, vid); > if (!fdb) { > - fdb = fdb_create(head, p, addr, vid, 0, 0); > + fdb = fdb_create(head, p, NULL, addr, vid, 0, 0); > if (!fdb) { > err = -ENOMEM; > goto err_unlock; > diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c > index 7637f58c1226..df10c87b2499 100644 > --- a/net/bridge/br_input.c > +++ b/net/bridge/br_input.c > @@ -150,7 +150,8 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb > /* insert into forwarding database after filtering to avoid spoofing */ > br = p->br; > if (p->flags & BR_LEARNING) > - br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, false); > + br_fdb_update(br, p, skb_dst(skb), eth_hdr(skb)->h_source, > + vid, false); > > local_rcv = !!(br->dev->flags & IFF_PROMISC); > dest = eth_hdr(skb)->h_dest; > @@ -230,7 +231,8 @@ static void __br_handle_local_finish(struct sk_buff *skb) > > /* check if vlan is allowed, to avoid spoofing */ > if (p->flags & BR_LEARNING && br_should_learn(p, skb, &vid)) > - br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid, false); > + br_fdb_update(p->br, p, skb_dst(skb), > + eth_hdr(skb)->h_source, vid, false); > } > > /* note: already called with rcu_read_lock */ > diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h > index fd9ee73e0a6d..b73f34ed765f 100644 > --- a/net/bridge/br_private.h > +++ b/net/bridge/br_private.h > @@ -164,6 +164,7 @@ struct net_bridge_vlan_group { > struct net_bridge_fdb_entry { > struct hlist_node hlist; > struct net_bridge_port *dst; > + struct dst_entry __rcu *md_dst; > > mac_addr addr; > __u16 vlan_id; > @@ -524,7 +525,8 @@ int br_fdb_fillbuf(struct net_bridge *br, void *buf, unsigned long count, > int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source, > const unsigned char *addr, u16 vid); > void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, > - const unsigned char *addr, u16 vid, bool added_by_user); > + struct dst_entry *md_dst, const unsigned char *addr, > + u16 vid, bool added_by_user); > > int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], > struct net_device *dev, const unsigned char *addr, u16 vid); >
David Lamparter
2017-Aug-17 11:03 UTC
[Bridge] [PATCH 1/6] bridge: learn dst metadata in FDB
Hi Nikolay, On Wed, Aug 16, 2017 at 11:38:06PM +0300, Nikolay Aleksandrov wrote:> On 16/08/17 20:01, David Lamparter wrote: > > This implements holding dst metadata information in the bridge layer, > > but only for unicast entries in the MAC table. Multicast is still left > > to design and implement. > > Sorry but I do not agree with this change, adding a special case for > VPLS in the bridge codeI don't think this is specific to VPLS at all, though you're right that VPLS is the only user currently.> and hitting the fast path for everyone in a few different places for a > feature that the majority will not use does not sound acceptable to > me. We've been trying hard to optimize it, trying to avoid additional > cache lines, removing tests and keeping special cases to a minimum.skb->dst is on the same cacheline as skb->len. fdb->md_dst is on the same cacheline as fdb->dst. Both will be 0 in a lot of cases, so this should be two null checks on data that is hot in the cache. Are you sure this is an actual problem?> I understand that you want to use the fdb tables and avoid > duplication, but this is not worth it. There're other similar use > cases and they have their own private fdb tables, that way the user > can opt out and is much cleaner and separated.Sure, this can be done. I think it's a noticeable performance penalty to have the entire fdb copied (multiple times for H-VPLS even), but I understand that it's preferable to have the normal cases faster in exchange. As the previous paragraph notes, I still wonder if that hit to the normal case exists though. I will leave this to Amine, he's paid to work on VPLS while I'm doing this for fun ;) There is however another concern I have here. As I noted in my introductory mail, I'm working on the bridge MDB making similar changes. And there's actually strong use cases for this in both VPLS and the 802.11 code (though I'm not sure I can code the latter one up, it's related to rate control and this is seriously complicated - the goal is to select a multicast rate based on the now-known receiving STAs). I really hope you're not suggesting the entire MDB with IPv4 & IPv6 snooping be duplicated into both VPLS and mac80211?> As you've noted this is only an RFC so I will not point out every issue, but there seems > to be a major problem with br_fdb_update(), note that it runs without any locks except RCU.Right, Thanks! ... I only thought about concurrent access, forgetting about concurrent modification... I'll replace it with an xchg I think. (No need for a lock that way) That said, now that I think about it, the skb_dst_set_noref() in the following chunk is probably not safe if the VPLS device has a qdisc on it. I was relying on the fact that br_dev_xmit is holding RCU there, but if the SKB is queued, md_dst might go away in the meantime... ... probably need to change this to dst_hold() + skb_dst_set()... -David> > diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c > > index 861ae2a165f4..534cacf02f8d 100644 > > --- a/net/bridge/br_device.c > > +++ b/net/bridge/br_device.c > > @@ -81,6 +82,9 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) > > else > > br_flood(br, skb, BR_PKT_MULTICAST, false, true); > > } else if ((dst = br_fdb_find_rcu(br, dest, vid)) != NULL) { > > + struct dst_entry *md_dst = rcu_dereference(dst->md_dst); > > + if (md_dst) > > + skb_dst_set_noref(skb, md_dst); > > br_forward(dst->dst, skb, false, true); > > } else { > > br_flood(br, skb, BR_PKT_UNICAST, false, true);> > diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c > > index a79b648aac88..0751fcb89699 100644 > > @@ -567,10 +579,15 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, > > source->dev->name, addr, vid); > > } else { > > unsigned long now = jiffies; > > + struct dst_entry *ref_md = rcu_access_pointer(fdb->md_dst); > > > > /* fastpath: update of existing entry */ > > - if (unlikely(source != fdb->dst)) { > > + if (unlikely(source != fdb->dst || > > + dst_metadata_cmp(md_dst, ref_md))) { > > fdb->dst = source; > > + dst_release(ref_md); > > + rcu_assign_pointer(fdb->md_dst, > > + dst_clone(md_dst)); > > fdb_modified = true; > > /* Take over HW learned entry */ > > if (unlikely(fdb->added_by_external_learn))
David Lamparter
2017-Aug-17 16:16 UTC
[Bridge] [PATCH 1/6] bridge: learn dst metadata in FDB
On Wed, Aug 16, 2017 at 11:38:06PM +0300, Nikolay Aleksandrov wrote:> On 16/08/17 20:01, David Lamparter wrote: > > This implements holding dst metadata information in the bridge layer, > > but only for unicast entries in the MAC table. Multicast is still left > > to design and implement. > > > > Signed-off-by: David Lamparter <equinox at diac24.net> > > --- > > Hi David, > Sorry but I do not agree with this change, adding a special case for VPLSTo prove that this is not a special case for VPLS, I have attached a patch for GRETAP multicast+unicast learning below. It's just 24(!) lines added to get functionality similar to "basic VXLAN" (i.e. multicast with dataplane learning.) -David --- From: David Lamparter <equinox at diac24.net> Date: Thu, 17 Aug 2017 18:11:16 +0200 Subject: [PATCH] gretap: support multicast + unicast learning This enables using an IPv4 multicast destination for gretap and enables learning unicast destinations through the bridge fdb. Signed-off-by: David Lamparter <equinox at diac24.net> --- net/ipv4/ip_gre.c | 27 +++++++++++++++++++++++---- net/ipv4/ip_tunnel.c | 1 + 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 7a7829e839c2..e58f8ccb2c87 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -266,7 +266,8 @@ static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi, skb_pop_mac_header(skb); else skb_reset_mac_header(skb); - if (tunnel->collect_md) { + if (tunnel->collect_md + || ipv4_is_multicast(tunnel->parms.iph.daddr)) { __be16 flags; __be64 tun_id; @@ -379,7 +380,7 @@ static struct rtable *gre_get_rt(struct sk_buff *skb, static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, __be16 proto) { - struct ip_tunnel_info *tun_info; + struct ip_tunnel_info *tun_info, flipped; const struct ip_tunnel_key *key; struct rtable *rt = NULL; struct flowi4 fl; @@ -390,10 +391,22 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, int err; tun_info = skb_tunnel_info(skb); - if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || + if (unlikely(!tun_info || ip_tunnel_info_af(tun_info) != AF_INET)) goto err_free_skb; + if (!(tun_info->mode & IP_TUNNEL_INFO_TX)) { + struct ip_tunnel *tunnel = netdev_priv(dev); + + flipped = *tun_info; + flipped.mode |= IP_TUNNEL_INFO_TX; + flipped.key.u.ipv4.dst = tun_info->key.u.ipv4.src; + flipped.key.u.ipv4.src = tunnel->parms.iph.saddr; + flipped.key.tp_src = tun_info->key.tp_dst; + flipped.key.tp_dst = tun_info->key.tp_src; + tun_info = &flipped; + } + key = &tun_info->key; use_cache = ip_tunnel_dst_cache_usable(skb, tun_info); if (use_cache) @@ -507,8 +520,9 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); + struct ip_tunnel_info *tun_info = skb_tunnel_info(skb); - if (tunnel->collect_md) { + if (tunnel->collect_md || tun_info) { gre_fb_xmit(skb, dev, htons(ETH_P_TEB)); return NETDEV_TX_OK; } @@ -933,6 +947,7 @@ static int gre_tap_init(struct net_device *dev) { __gre_tunnel_init(dev); dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netif_keep_dst(dev); return ip_tunnel_init(dev); } @@ -940,6 +955,10 @@ static int gre_tap_init(struct net_device *dev) static const struct net_device_ops gre_tap_netdev_ops = { .ndo_init = gre_tap_init, .ndo_uninit = ip_tunnel_uninit, +#ifdef CONFIG_NET_IPGRE_BROADCAST + .ndo_open = ipgre_open, + .ndo_stop = ipgre_close, +#endif .ndo_start_xmit = gre_tap_xmit, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 129d1a3616f8..451c11fc9ae5 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -140,6 +140,7 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, hlist_for_each_entry_rcu(t, head, hash_node) { if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) && + (local != t->parms.iph.saddr || !ipv4_is_multicast(t->parms.iph.daddr)) && (local != t->parms.iph.daddr || !ipv4_is_multicast(local))) continue; -- 2.13.0