Nikolay Aleksandrov
2017-Feb-04 17:05 UTC
[Bridge] [PATCH net-next 0/4] bridge: improve cache utilization
Hi all,
This is the first set which begins to deal with the bad bridge cache
access patterns. The first patch rearranges the bridge and port structs
a little so the frequently (and closely) accessed members are in the same
cache line. The second patch then moves the garbage collection to a
workqueue, trying to improve system responsiveness under load (many fdbs),
and more importantly removes the need to check if the matched entry is
expired in __br_fdb_get, which was a major source of false-sharing.
The third patch is a preparation for the final one, which limits writes to
the fdb's "used" and "updated" fields to at most once per jiffy.
If properly configured, i.e. ports bound to CPUs (thus updating "updated"
locally), the bridge's HitM goes from 100% to 0%, but even without
binding we get a win, because previously every lookup that iterated over
the hash chain caused false-sharing due to the first cache line being
used for both the mac/vid and the used/updated fields.

Some results from tests I've run:
(note that these were run in good conditions for the baseline; everything
ran on a single NUMA node and there were only 3 fdbs)

1. baseline
100% Load HitM on the fdbs (between everyone who has done lookups and hit
one of the 3 hash chains of the communicating src/dst fdbs)
Overall 5.06% Load HitM for the bridge, first place in the list

2. patched & ports bound to CPUs
0% Local load HitM, bridge is not even in the c2c report list
Also there's a consistent 3% improvement in netperf tests.

Thanks,
 Nik

Nikolay Aleksandrov (4):
  bridge: modify bridge and port to have often accessed fields in one
    cache line
  bridge: move to workqueue gc
  bridge: move write-heavy fdb members in their own cache line
  bridge: fdb: write to used and updated at most once per jiffy

 net/bridge/br_device.c    |  1 +
 net/bridge/br_fdb.c       | 34 +++++++++++++++++-----------
 net/bridge/br_if.c        |  2 +-
 net/bridge/br_input.c     |  3 ++-
 net/bridge/br_ioctl.c     |  2 +-
 net/bridge/br_netlink.c   |  2 +-
 net/bridge/br_private.h   | 57 +++++++++++++++++++++++------------------------
 net/bridge/br_stp.c       |  2 +-
 net/bridge/br_stp_if.c    |  4 ++--
 net/bridge/br_stp_timer.c |  2 --
 net/bridge/br_sysfs_br.c  |  2 +-
 11 files changed, 59 insertions(+), 52 deletions(-)

-- 
2.1.4
Nikolay Aleksandrov
2017-Feb-04 17:05 UTC
[Bridge] [PATCH net-next 1/4] bridge: modify bridge and port to have often accessed fields in one cache line
Move around net_bridge so the vlan fields are at the beginning, since
they're checked on every packet even if vlan filtering is disabled.
For the port, move flags & vlan group to the beginning so they're in the
same cache line as the port's state (both flags and state are checked on
each packet).

Signed-off-by: Nikolay Aleksandrov <nikolay at cumulusnetworks.com>
---
 net/bridge/br_private.h | 43 ++++++++++++++++++++-----------------------
 1 file changed, 20 insertions(+), 23 deletions(-)

diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 40177df45ba6..ec8560349b6f 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -212,12 +212,16 @@ struct net_bridge_mdb_htable
 	u32 ver;
 };
 
-struct net_bridge_port
-{
+struct net_bridge_port {
 	struct net_bridge *br;
 	struct net_device *dev;
 	struct list_head list;
 
+	unsigned long flags;
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+	struct net_bridge_vlan_group __rcu *vlgrp;
+#endif
+
 	/* STP */
 	u8 priority;
 	u8 state;
@@ -238,8 +242,6 @@ struct net_bridge_port
 	struct kobject kobj;
 	struct rcu_head rcu;
 
-	unsigned long flags;
-
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 	struct bridge_mcast_own_query ip4_own_query;
 #if IS_ENABLED(CONFIG_IPV6)
@@ -259,9 +261,6 @@ struct net_bridge_port
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	struct netpoll *np;
 #endif
-#ifdef CONFIG_BRIDGE_VLAN_FILTERING
-	struct net_bridge_vlan_group __rcu *vlgrp;
-#endif
 #ifdef CONFIG_NET_SWITCHDEV
 	int offload_fwd_mark;
 #endif
@@ -283,14 +282,21 @@ static inline struct net_bridge_port *br_port_get_rtnl(const struct net_device
 		rtnl_dereference(dev->rx_handler_data) : NULL;
 }
 
-struct net_bridge
-{
+struct net_bridge {
 	spinlock_t lock;
+	spinlock_t hash_lock;
 	struct list_head port_list;
 	struct net_device *dev;
-
 	struct pcpu_sw_netstats __percpu *stats;
-	spinlock_t hash_lock;
+	/* These fields are accessed on each packet */
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+	u8 vlan_enabled;
+	u8 vlan_stats_enabled;
+	__be16 vlan_proto;
+	u16 default_pvid;
+	struct net_bridge_vlan_group __rcu *vlgrp;
+#endif
+
 	struct hlist_head hash[BR_HASH_SIZE];
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
 	union {
@@ -308,6 +314,9 @@ struct net_bridge
 	bridge_id designated_root;
 	bridge_id bridge_id;
 	u32 root_path_cost;
+	unsigned char topology_change;
+	unsigned char topology_change_detected;
+	u16 root_port;
 	unsigned long max_age;
 	unsigned long hello_time;
 	unsigned long forward_delay;
@@ -319,7 +328,6 @@ struct net_bridge
 	u8 group_addr[ETH_ALEN];
 	bool group_addr_set;
 
-	u16 root_port;
 	enum {
 		BR_NO_STP, 	/* no spanning tree */
@@ -327,9 +335,6 @@ struct net_bridge
 		BR_USER_STP,	/* new RSTP in userspace */
 	} stp_enabled;
 
-	unsigned char topology_change;
-	unsigned char topology_change_detected;
-
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 	unsigned char multicast_router;
@@ -381,14 +386,6 @@ struct net_bridge
 #ifdef CONFIG_NET_SWITCHDEV
 	int offload_fwd_mark;
 #endif
-
-#ifdef CONFIG_BRIDGE_VLAN_FILTERING
-	struct net_bridge_vlan_group __rcu *vlgrp;
-	u8 vlan_enabled;
-	u8 vlan_stats_enabled;
-	__be16 vlan_proto;
-	u16 default_pvid;
-#endif
 };
 
 struct br_input_skb_cb {
-- 
2.1.4
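A quick way to sanity-check layout work like this outside the kernel is to
assert on field offsets at compile time. The following stand-alone sketch is
illustrative only and is not from the patch: toy_bridge is a hypothetical,
simplified stand-in for net_bridge, and a 64-byte cache line is assumed.

/* Verify that the per-packet "hot" fields fit in the first cache line
 * of a toy struct that mimics the reordered net_bridge layout. */
#include <stdio.h>
#include <stddef.h>

#define CACHE_LINE 64 /* assumption: 64-byte lines, typical on x86_64 */

struct toy_bridge {
	int lock;			/* stand-in for spinlock_t */
	int hash_lock;
	void *port_list_next, *port_list_prev;
	void *dev;
	void *stats;
	/* hot fields checked on every packet, grouped together */
	unsigned char vlan_enabled;
	unsigned char vlan_stats_enabled;
	unsigned short vlan_proto;
	unsigned short default_pvid;
	void *vlgrp;
};

/* Compile-time check: the hot group must end inside line 0. */
_Static_assert(offsetof(struct toy_bridge, vlgrp) + sizeof(void *) <=
	       CACHE_LINE, "hot fields spill into a second cache line");

int main(void)
{
	printf("hot group starts at offset %zu\n",
	       offsetof(struct toy_bridge, vlan_enabled));
	printf("hot group ends at offset %zu\n",
	       offsetof(struct toy_bridge, vlgrp) + sizeof(void *));
	return 0;
}

If a later edit pushes the hot group past the first line, the _Static_assert
fails the build instead of silently costing an extra cache line per packet.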
Nikolay Aleksandrov
2017-Feb-04 17:05 UTC
[Bridge] [PATCH net-next 2/4] bridge: move to workqueue gc
Move the fdb garbage collector to a workqueue which fires at least 10
milliseconds apart and cleans chain by chain, allowing other tasks to run
in the meantime. When having thousands of fdbs the system is much more
responsive. Most importantly, remove the need to check if the matched
entry has expired in __br_fdb_get, which causes false-sharing and is
completely unnecessary if we clean up entries; at worst we'll get 10ms of
traffic for that entry before it gets deleted.

Signed-off-by: Nikolay Aleksandrov <nikolay at cumulusnetworks.com>
---
 net/bridge/br_device.c    |  1 +
 net/bridge/br_fdb.c       | 31 +++++++++++++++++++------------
 net/bridge/br_if.c        |  2 +-
 net/bridge/br_ioctl.c     |  2 +-
 net/bridge/br_netlink.c   |  2 +-
 net/bridge/br_private.h   |  4 ++--
 net/bridge/br_stp.c       |  2 +-
 net/bridge/br_stp_if.c    |  4 ++--
 net/bridge/br_stp_timer.c |  2 --
 net/bridge/br_sysfs_br.c  |  2 +-
 10 files changed, 29 insertions(+), 23 deletions(-)

diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 6c46d1b4cdbb..89b414fd1901 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -413,4 +413,5 @@ void br_dev_setup(struct net_device *dev)
 	br_netfilter_rtable_init(br);
 	br_stp_timer_init(br);
 	br_multicast_init(br);
+	INIT_DELAYED_WORK(&br->gc_work, br_fdb_cleanup);
 }
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index e4a4176171c9..5cbed5c0db88 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -154,7 +154,7 @@ static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f)
 	if (f->added_by_external_learn)
 		fdb_del_external_learn(f);
 
-	hlist_del_rcu(&f->hlist);
+	hlist_del_init_rcu(&f->hlist);
 	fdb_notify(br, f, RTM_DELNEIGH);
 	call_rcu(&f->rcu, fdb_rcu_free);
 }
@@ -290,34 +290,43 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr)
 	spin_unlock_bh(&br->hash_lock);
 }
 
-void br_fdb_cleanup(unsigned long _data)
+void br_fdb_cleanup(struct work_struct *work)
 {
-	struct net_bridge *br = (struct net_bridge *)_data;
+	struct net_bridge *br = container_of(work, struct net_bridge,
+					     gc_work.work);
 	unsigned long delay = hold_time(br);
-	unsigned long next_timer = jiffies + br->ageing_time;
+	unsigned long work_delay = delay;
+	unsigned long now = jiffies;
 	int i;
 
-	spin_lock(&br->hash_lock);
 	for (i = 0; i < BR_HASH_SIZE; i++) {
 		struct net_bridge_fdb_entry *f;
 		struct hlist_node *n;
 
+		if (!br->hash[i].first)
+			continue;
+
+		spin_lock_bh(&br->hash_lock);
 		hlist_for_each_entry_safe(f, n, &br->hash[i], hlist) {
 			unsigned long this_timer;
+
 			if (f->is_static)
 				continue;
 			if (f->added_by_external_learn)
 				continue;
 			this_timer = f->updated + delay;
-			if (time_before_eq(this_timer, jiffies))
+			if (time_after(this_timer, now))
+				work_delay = min(work_delay, this_timer - now);
+			else
 				fdb_delete(br, f);
-			else if (time_before(this_timer, next_timer))
-				next_timer = this_timer;
 		}
+		spin_unlock_bh(&br->hash_lock);
+		cond_resched();
 	}
-	spin_unlock(&br->hash_lock);
 
-	mod_timer(&br->gc_timer, round_jiffies_up(next_timer));
+	/* Cleanup minimum 10 milliseconds apart */
+	work_delay = max_t(unsigned long, work_delay, msecs_to_jiffies(10));
+	mod_delayed_work(system_long_wq, &br->gc_work, work_delay);
 }
 
 /* Completely flush all dynamic entries in forwarding database.*/
@@ -382,8 +391,6 @@ struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br,
 				 &br->hash[br_mac_hash(addr, vid)], hlist) {
 		if (ether_addr_equal(fdb->addr.addr, addr) &&
 		    fdb->vlan_id == vid) {
-			if (unlikely(has_expired(br, fdb)))
-				break;
 			return fdb;
 		}
 	}
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index ed0dd3340084..8ac1770aa222 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -313,7 +313,7 @@ void br_dev_delete(struct net_device *dev, struct list_head *head)
 	br_vlan_flush(br);
 	br_multicast_dev_del(br);
-	del_timer_sync(&br->gc_timer);
+	cancel_delayed_work_sync(&br->gc_work);
 
 	br_sysfs_delbr(br->dev);
 	unregister_netdevice_queue(br->dev, head);
diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c
index da8157c57eb1..7970f8540cbb 100644
--- a/net/bridge/br_ioctl.c
+++ b/net/bridge/br_ioctl.c
@@ -149,7 +149,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 		b.hello_timer_value = br_timer_value(&br->hello_timer);
 		b.tcn_timer_value = br_timer_value(&br->tcn_timer);
 		b.topology_change_timer_value = br_timer_value(&br->topology_change_timer);
-		b.gc_timer_value = br_timer_value(&br->gc_timer);
+		b.gc_timer_value = br_timer_value(&br->gc_work.timer);
 		rcu_read_unlock();
 
 		if (copy_to_user((void __user *)args[1], &b, sizeof(b)))
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index fc5d885dbb22..1cbdc5b96aa7 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -1250,7 +1250,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
 	if (nla_put_u64_64bit(skb, IFLA_BR_TOPOLOGY_CHANGE_TIMER, clockval,
 			      IFLA_BR_PAD))
 		return -EMSGSIZE;
-	clockval = br_timer_value(&br->gc_timer);
+	clockval = br_timer_value(&br->gc_work.timer);
 	if (nla_put_u64_64bit(skb, IFLA_BR_GC_TIMER, clockval, IFLA_BR_PAD))
 		return -EMSGSIZE;
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index ec8560349b6f..47fd64bf5022 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -379,7 +379,7 @@ struct net_bridge {
 	struct timer_list hello_timer;
 	struct timer_list tcn_timer;
 	struct timer_list topology_change_timer;
-	struct timer_list gc_timer;
+	struct delayed_work gc_work;
 	struct kobject *ifobj;
 	u32 auto_cnt;
@@ -502,7 +502,7 @@ void br_fdb_find_delete_local(struct net_bridge *br,
 			      const unsigned char *addr, u16 vid);
 void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr);
 void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr);
-void br_fdb_cleanup(unsigned long arg);
+void br_fdb_cleanup(struct work_struct *work);
 void br_fdb_delete_by_port(struct net_bridge *br,
 			   const struct net_bridge_port *p, u16 vid, int do_all);
 struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br,
diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
index 71fd1a4e63cc..8f56c2d1f1a7 100644
--- a/net/bridge/br_stp.c
+++ b/net/bridge/br_stp.c
@@ -602,7 +602,7 @@ int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time)
 	br->ageing_time = t;
 	spin_unlock_bh(&br->lock);
 
-	mod_timer(&br->gc_timer, jiffies);
+	mod_delayed_work(system_long_wq, &br->gc_work, 0);
 
 	return 0;
 }
diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
index 6c1e21411125..08341d2aa9c9 100644
--- a/net/bridge/br_stp_if.c
+++ b/net/bridge/br_stp_if.c
@@ -57,7 +57,7 @@ void br_stp_enable_bridge(struct net_bridge *br)
 	spin_lock_bh(&br->lock);
 	if (br->stp_enabled == BR_KERNEL_STP)
 		mod_timer(&br->hello_timer, jiffies + br->hello_time);
-	mod_timer(&br->gc_timer, jiffies + HZ/10);
+	mod_delayed_work(system_long_wq, &br->gc_work, HZ / 10);
 
 	br_config_bpdu_generation(br);
 
@@ -88,7 +88,7 @@ void br_stp_disable_bridge(struct net_bridge *br)
 	del_timer_sync(&br->hello_timer);
 	del_timer_sync(&br->topology_change_timer);
 	del_timer_sync(&br->tcn_timer);
-	del_timer_sync(&br->gc_timer);
+	cancel_delayed_work_sync(&br->gc_work);
 }
 
 /* called under bridge lock */
diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c
index 7ddb38e0a06e..c98b3e5c140a 100644
--- a/net/bridge/br_stp_timer.c
+++ b/net/bridge/br_stp_timer.c
@@ -153,8 +153,6 @@ void br_stp_timer_init(struct net_bridge *br)
 	setup_timer(&br->topology_change_timer,
 		    br_topology_change_timer_expired,
 		    (unsigned long) br);
-
-	setup_timer(&br->gc_timer, br_fdb_cleanup, (unsigned long) br);
 }
 
 void br_stp_port_timer_init(struct net_bridge_port *p)
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index a18148213b08..0f4034934d56 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -263,7 +263,7 @@ static ssize_t gc_timer_show(struct device *d, struct device_attribute *attr,
 			     char *buf)
 {
 	struct net_bridge *br = to_bridge(d);
-	return sprintf(buf, "%ld\n", br_timer_value(&br->gc_timer));
+	return sprintf(buf, "%ld\n", br_timer_value(&br->gc_work.timer));
 }
 static DEVICE_ATTR_RO(gc_timer);
-- 
2.1.4
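Stripped of the bridge specifics, the conversion follows a small, reusable
skeleton: INIT_DELAYED_WORK() at setup, a handler that does a bounded pass
and re-arms itself with a clamped delay, and cancel_delayed_work_sync() at
teardown. The minimal out-of-tree module below is a sketch of that skeleton,
not code from the series; the demo_* names are invented and the actual
per-bucket scan is elided.

// SPDX-License-Identifier: GPL-2.0
#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>

static struct delayed_work demo_gc_work;

static void demo_gc_fn(struct work_struct *work)
{
	unsigned long delay = msecs_to_jiffies(1000); /* default interval */

	/* ... scan and expire entries here, taking/dropping the lock per
	 * bucket and calling cond_resched() between buckets ... */

	/* never re-fire faster than 10ms, mirroring the patch */
	delay = max_t(unsigned long, delay, msecs_to_jiffies(10));
	mod_delayed_work(system_long_wq, &demo_gc_work, delay);
}

static int __init demo_init(void)
{
	INIT_DELAYED_WORK(&demo_gc_work, demo_gc_fn);
	mod_delayed_work(system_long_wq, &demo_gc_work, 0); /* kick now */
	return 0;
}

static void __exit demo_exit(void)
{
	/* waits for a running pass and prevents re-arming */
	cancel_delayed_work_sync(&demo_gc_work);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

Because the handler re-arms itself, cancel_delayed_work_sync() on teardown is
what breaks the cycle safely, exactly as br_dev_delete() now does.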
Nikolay Aleksandrov
2017-Feb-04 17:05 UTC
[Bridge] [PATCH net-next 3/4] bridge: move write-heavy fdb members in their own cache line
Fdb's used and updated fields are written to on every packet forward and
packet receive respectively. Thus if we are receiving packets from a
particular fdb, they'll cause false-sharing with everyone who has looked
it up (even if it didn't match, since mac/vid share the cache line!).
The "used" field is even worse, since it is updated on every packet
forward to that fdb; thus the standard config, where X ports use a single
gateway, results in 100% fdb false-sharing. Note that this patch does not
prevent the last scenario, but it makes it better for other bridge
participants which are not using that fdb (and are only doing lookups over
it). The point is that with this move we make sure only the communicating
parties get the false-sharing; a later patch will show how to avoid that
too.

Signed-off-by: Nikolay Aleksandrov <nikolay at cumulusnetworks.com>
---
 net/bridge/br_private.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 47fd64bf5022..1cbbf63a5ef7 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -160,19 +160,21 @@ struct net_bridge_vlan_group {
 	u16 pvid;
 };
 
-struct net_bridge_fdb_entry
-{
+struct net_bridge_fdb_entry {
 	struct hlist_node hlist;
 	struct net_bridge_port *dst;
-	unsigned long updated;
-	unsigned long used;
 	mac_addr addr;
 	__u16 vlan_id;
 	unsigned char is_local:1,
 		      is_static:1,
 		      added_by_user:1,
 		      added_by_external_learn:1;
+
+	/* write-heavy members should not affect lookups */
+	unsigned long updated ____cacheline_aligned_in_smp;
+	unsigned long used;
+
 	struct rcu_head rcu;
 };
-- 
2.1.4
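The penalty being avoided is easy to reproduce in user space: two threads
updating adjacent fields slow each other down even though neither ever
touches the other's variable. The following stand-alone demo is not bridge
code; it assumes 64-byte cache lines and builds with cc -O2 -pthread.

/* Compare two counters on one cache line vs. padded to separate lines. */
#include <pthread.h>
#include <stdio.h>
#include <time.h>

#define ITERS 100000000UL

struct shared {
	volatile unsigned long a;	/* like fdb->updated: written constantly */
	volatile unsigned long b;	/* like fdb->addr: hot for lookups */
} packed_counters;

struct shared_padded {
	volatile unsigned long a __attribute__((aligned(64)));
	volatile unsigned long b __attribute__((aligned(64)));
} padded_counters;

static void *bump(void *p)
{
	volatile unsigned long *c = p;

	for (unsigned long i = 0; i < ITERS; i++)
		(*c)++;
	return NULL;
}

static double run(volatile unsigned long *x, volatile unsigned long *y)
{
	pthread_t t1, t2;
	struct timespec s, e;

	clock_gettime(CLOCK_MONOTONIC, &s);
	pthread_create(&t1, NULL, bump, (void *)x);
	pthread_create(&t2, NULL, bump, (void *)y);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	clock_gettime(CLOCK_MONOTONIC, &e);
	return (e.tv_sec - s.tv_sec) + (e.tv_nsec - s.tv_nsec) / 1e9;
}

int main(void)
{
	printf("same line: %.2fs\n", run(&packed_counters.a, &packed_counters.b));
	printf("separate:  %.2fs\n", run(&padded_counters.a, &padded_counters.b));
	return 0;
}

On typical SMP x86-64 hardware the "same line" case runs markedly slower than
the padded one; that is the cost remote CPUs paid while updated/used shared a
line with mac/vid.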
Nikolay Aleksandrov
2017-Feb-04 17:05 UTC
[Bridge] [PATCH net-next 4/4] bridge: fdb: write to used and updated at most once per jiffy
Writing once per jiffy is enough to limit the bridge's false sharing.
After this change the bridge doesn't show up in the local load HitM stats.

Suggested-by: David S. Miller <davem at davemloft.net>
Signed-off-by: Nikolay Aleksandrov <nikolay at cumulusnetworks.com>
---
 net/bridge/br_fdb.c   | 3 ++-
 net/bridge/br_input.c | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 5cbed5c0db88..5028691fa68a 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -597,7 +597,8 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
 				fdb->dst = source;
 				fdb_modified = true;
 			}
-			fdb->updated = jiffies;
+			if (jiffies != fdb->updated)
+				fdb->updated = jiffies;
 			if (unlikely(added_by_user))
 				fdb->added_by_user = 1;
 			if (unlikely(fdb_modified))
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index fba38d8a1a08..220943f920d2 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -198,7 +198,8 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
 		if (dst->is_local)
 			return br_pass_frame_up(skb);
 
-		dst->used = jiffies;
+		if (jiffies != dst->used)
+			dst->used = jiffies;
 		br_forward(dst->dst, skb, local_rcv, false);
 	} else {
 		if (!mcast_hit)
-- 
2.1.4
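The check-before-write idiom in both hunks generalizes beyond the bridge: a
load can be satisfied from a cache line held in Shared state on every CPU,
while a store forces exclusive ownership and invalidates the other copies,
so skipping stores that would not change the value keeps the line shared.
A user-space analogue of the idiom follows; the names are hypothetical and a
millisecond clock stands in for jiffies.

/* Update a hot shared timestamp at most once per millisecond. */
#include <stdatomic.h>
#include <time.h>

/* Hypothetical stand-in for an fdb entry's "updated" field. */
static _Atomic long last_seen;

/* Millisecond-granularity clock standing in for jiffies. */
static long now_ms(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000 + ts.tv_nsec / 1000000;
}

/* Called on every "packet": the relaxed load usually hits a Shared
 * line and avoids any store; the write happens only when the value
 * actually changes, i.e. at most once per millisecond here. */
static void touch(void)
{
	long now = now_ms();

	if (atomic_load_explicit(&last_seen, memory_order_relaxed) != now)
		atomic_store_explicit(&last_seen, now, memory_order_relaxed);
}

int main(void)
{
	for (int i = 0; i < 10000000; i++)
		touch();
	return 0;
}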
Stephen Hemminger
2017-Feb-04 21:46 UTC
[Bridge] [PATCH net-next 0/4] bridge: improve cache utilization
On Sat, 4 Feb 2017 18:05:05 +0100
Nikolay Aleksandrov <nikolay at cumulusnetworks.com> wrote:

> Hi all,
> This is the first set which begins to deal with the bad bridge cache
> access patterns. The first patch rearranges the bridge and port structs
> a little so the frequently (and closely) accessed members are in the same
> cache line. The second patch then moves the garbage collection to a
> workqueue, trying to improve system responsiveness under load (many fdbs),
> and more importantly removes the need to check if the matched entry is
> expired in __br_fdb_get, which was a major source of false-sharing.
> The third patch is a preparation for the final one, which limits writes to
> the fdb's "used" and "updated" fields to at most once per jiffy.
> If properly configured, i.e. ports bound to CPUs (thus updating "updated"
> locally), the bridge's HitM goes from 100% to 0%, but even without
> binding we get a win, because previously every lookup that iterated over
> the hash chain caused false-sharing due to the first cache line being
> used for both the mac/vid and the used/updated fields.
>
> Some results from tests I've run:
> (note that these were run in good conditions for the baseline; everything
> ran on a single NUMA node and there were only 3 fdbs)
>
> 1. baseline
> 100% Load HitM on the fdbs (between everyone who has done lookups and hit
> one of the 3 hash chains of the communicating src/dst fdbs)
> Overall 5.06% Load HitM for the bridge, first place in the list
>
> 2. patched & ports bound to CPUs
> 0% Local load HitM, bridge is not even in the c2c report list
> Also there's a consistent 3% improvement in netperf tests.

What tool are you using to measure this?

> Thanks,
>  Nik
>
> Nikolay Aleksandrov (4):
>   bridge: modify bridge and port to have often accessed fields in one
>     cache line
>   bridge: move to workqueue gc
>   bridge: move write-heavy fdb members in their own cache line
>   bridge: fdb: write to used and updated at most once per jiffy
>
>  net/bridge/br_device.c    |  1 +
>  net/bridge/br_fdb.c       | 34 +++++++++++++++++-----------
>  net/bridge/br_if.c        |  2 +-
>  net/bridge/br_input.c     |  3 ++-
>  net/bridge/br_ioctl.c     |  2 +-
>  net/bridge/br_netlink.c   |  2 +-
>  net/bridge/br_private.h   | 57 +++++++++++++++++++++++------------------------
>  net/bridge/br_stp.c       |  2 +-
>  net/bridge/br_stp_if.c    |  4 ++--
>  net/bridge/br_stp_timer.c |  2 --
>  net/bridge/br_sysfs_br.c  |  2 +-
>  11 files changed, 59 insertions(+), 52 deletions(-)

Looks good, thanks. I wonder how this impacts smaller workloads.

Reviewed-by: Stephen Hemminger <stephen at networkplumber.org>
David Miller
2017-Feb-07 03:53 UTC
[Bridge] [PATCH net-next 0/4] bridge: improve cache utilization
From: Nikolay Aleksandrov <nikolay at cumulusnetworks.com>
Date: Sat, 4 Feb 2017 18:05:05 +0100

> This is the first set which begins to deal with the bad bridge cache
> access patterns. The first patch rearranges the bridge and port structs
> a little so the frequently (and closely) accessed members are in the same
> cache line. The second patch then moves the garbage collection to a
> workqueue, trying to improve system responsiveness under load (many fdbs),
> and more importantly removes the need to check if the matched entry is
> expired in __br_fdb_get, which was a major source of false-sharing.
> The third patch is a preparation for the final one, which limits writes to
> the fdb's "used" and "updated" fields to at most once per jiffy.
> If properly configured, i.e. ports bound to CPUs (thus updating "updated"
> locally), the bridge's HitM goes from 100% to 0%, but even without
> binding we get a win, because previously every lookup that iterated over
> the hash chain caused false-sharing due to the first cache line being
> used for both the mac/vid and the used/updated fields.
>
> Some results from tests I've run:
> (note that these were run in good conditions for the baseline; everything
> ran on a single NUMA node and there were only 3 fdbs)
>
> 1. baseline
> 100% Load HitM on the fdbs (between everyone who has done lookups and hit
> one of the 3 hash chains of the communicating src/dst fdbs)
> Overall 5.06% Load HitM for the bridge, first place in the list
>
> 2. patched & ports bound to CPUs
> 0% Local load HitM, bridge is not even in the c2c report list
> Also there's a consistent 3% improvement in netperf tests.

Looks great, series applied, thanks!