From 2c828620689650fefb478bac8a283d1ef30eff66 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 8 Jun 2020 16:57:43 +0200 Subject: [PATCH] kernel: backport upstream DSA GRO support Should improve performance Signed-off-by: Felix Fietkau --- ...et-dsa-add-GRO-support-via-gro_cells.patch | 143 ++++++++++++++++++ 1 file changed, 143 insertions(+) create mode 100644 target/linux/generic/backport-5.4/755-v5.8-net-dsa-add-GRO-support-via-gro_cells.patch diff --git a/target/linux/generic/backport-5.4/755-v5.8-net-dsa-add-GRO-support-via-gro_cells.patch b/target/linux/generic/backport-5.4/755-v5.8-net-dsa-add-GRO-support-via-gro_cells.patch new file mode 100644 index 0000000000..6d50a40d61 --- /dev/null +++ b/target/linux/generic/backport-5.4/755-v5.8-net-dsa-add-GRO-support-via-gro_cells.patch @@ -0,0 +1,143 @@ +From: Alexander Lobakin +Date: Tue, 21 Apr 2020 16:41:08 +0300 +Subject: [PATCH] net: dsa: add GRO support via gro_cells + +gro_cells lib is used by different encapsulating netdevices, such as +geneve, macsec, vxlan etc. to speed up decapsulated traffic processing. +CPU tag is a sort of "encapsulation", and we can use the same mechs to +greatly improve overall DSA performance. +skbs are passed to the GRO layer after removing CPU tags, so we don't +need any new packet offload types as it was firstly proposed by me in +the first GRO-over-DSA variant [1]. + +The size of struct gro_cells is sizeof(void *), so hot struct +dsa_slave_priv becomes only 4/8 bytes bigger, and all critical fields +remain in one 32-byte cacheline. +The other positive side effect is that drivers for network devices +that can be shipped as CPU ports of DSA-driven switches can now use +napi_gro_frags() to pass skbs to kernel. Packets built that way are +completely non-linear and are likely being dropped without GRO. + +This was tested on to-be-mainlined-soon Ethernet driver that uses +napi_gro_frags(), and the overall performance was on par with the +variant from [1], sometimes even better due to minimal overhead. +net.core.gro_normal_batch tuning may help to push it to the limit +on particular setups and platforms. + +iperf3 IPoE VLAN NAT TCP forwarding (port1.218 -> port0) setup +on 1.2 GHz MIPS board: + +5.7-rc2 baseline: + +[ID] Interval Transfer Bitrate Retr +[ 5] 0.00-120.01 sec 9.00 GBytes 644 Mbits/sec 413 sender +[ 5] 0.00-120.00 sec 8.99 GBytes 644 Mbits/sec receiver + +Iface RX packets TX packets +eth0 7097731 7097702 +port0 426050 6671829 +port1 6671681 425862 +port1.218 6671677 425851 + +With this patch: + +[ID] Interval Transfer Bitrate Retr +[ 5] 0.00-120.01 sec 12.2 GBytes 870 Mbits/sec 122 sender +[ 5] 0.00-120.00 sec 12.2 GBytes 870 Mbits/sec receiver + +Iface RX packets TX packets +eth0 9474792 9474777 +port0 455200 353288 +port1 9019592 455035 +port1.218 353144 455024 + +v2: + - Add some performance examples in the commit message; + - No functional changes. + +[1] https://lore.kernel.org/netdev/20191230143028.27313-1-alobakin@dlink.ru/ + +Signed-off-by: Alexander Lobakin +Signed-off-by: David S. Miller +--- + +--- a/net/dsa/Kconfig ++++ b/net/dsa/Kconfig +@@ -9,6 +9,7 @@ menuconfig NET_DSA + tristate "Distributed Switch Architecture" + depends on HAVE_NET_DSA + depends on BRIDGE || BRIDGE=n ++ select GRO_CELLS + select NET_SWITCHDEV + select PHYLINK + select NET_DEVLINK +--- a/net/dsa/dsa.c ++++ b/net/dsa/dsa.c +@@ -238,7 +238,7 @@ static int dsa_switch_rcv(struct sk_buff + if (dsa_skb_defer_rx_timestamp(p, skb)) + return 0; + +- netif_receive_skb(skb); ++ gro_cells_receive(&p->gcells, skb); + + return 0; + } +--- a/net/dsa/dsa_priv.h ++++ b/net/dsa/dsa_priv.h +@@ -11,6 +11,7 @@ + #include + #include + #include ++#include + + enum { + DSA_NOTIFIER_AGEING_TIME, +@@ -68,6 +69,8 @@ struct dsa_slave_priv { + + struct pcpu_sw_netstats *stats64; + ++ struct gro_cells gcells; ++ + /* DSA port data, such as switch, port index, etc. */ + struct dsa_port *dp; + +--- a/net/dsa/slave.c ++++ b/net/dsa/slave.c +@@ -1431,6 +1431,11 @@ int dsa_slave_create(struct dsa_port *po + free_netdev(slave_dev); + return -ENOMEM; + } ++ ++ ret = gro_cells_init(&p->gcells, slave_dev); ++ if (ret) ++ goto out_free; ++ + p->dp = port; + INIT_LIST_HEAD(&p->mall_tc_list); + INIT_WORK(&port->xmit_work, dsa_port_xmit_work); +@@ -1443,7 +1448,7 @@ int dsa_slave_create(struct dsa_port *po + ret = dsa_slave_phy_setup(slave_dev); + if (ret) { + netdev_err(master, "error %d setting up slave phy\n", ret); +- goto out_free; ++ goto out_gcells; + } + + dsa_slave_notify(slave_dev, DSA_PORT_REGISTER); +@@ -1462,6 +1467,8 @@ out_phy: + phylink_disconnect_phy(p->dp->pl); + rtnl_unlock(); + phylink_destroy(p->dp->pl); ++out_gcells: ++ gro_cells_destroy(&p->gcells); + out_free: + free_percpu(p->stats64); + free_netdev(slave_dev); +@@ -1482,6 +1489,7 @@ void dsa_slave_destroy(struct net_device + dsa_slave_notify(slave_dev, DSA_PORT_UNREGISTER); + unregister_netdev(slave_dev); + phylink_destroy(dp->pl); ++ gro_cells_destroy(&p->gcells); + free_percpu(p->stats64); + free_netdev(slave_dev); + }