thr3ads.net - Linux Virtualization - [PATCH net-next 2/3] virtio-net: use per-receive queue page frag alloc for mergeable bufs [Dec 2013]

If this information is useful, please help other people find it:
Share via:

Michael Dalton

2013-Dec-17 00:16 UTC

[PATCH net-next 1/3] net: allow > 0 order atomic page alloc in skb_page_frag_refill

skb_page_frag_refill currently permits only order-0 page allocs
unless GFP_WAIT is used. Change skb_page_frag_refill to attempt
higher-order page allocations whether or not GFP_WAIT is used. If
memory cannot be allocated, the allocator will fall back to
successively smaller page allocs (down to order-0 page allocs).

This change brings skb_page_frag_refill in line with the existing
page allocation strategy employed by netdev_alloc_frag, which attempts
higher-order page allocations whether or not GFP_WAIT is set, falling
back to successively lower-order page allocations on failure. Part
of migration of virtio-net to per-receive queue page frag allocators.

Signed-off-by: Michael Dalton <mwdalton at google.com>
---
 net/core/sock.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/net/core/sock.c b/net/core/sock.c
index ab20ed9..7383d23 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1865,9 +1865,7 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio)
 		put_page(pfrag->page);
 	}
 
-	/* We restrict high order allocations to users that can afford to wait */
-	order = (prio & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0;
-
+	order = SKB_FRAG_PAGE_ORDER;
 	do {
 		gfp_t gfp = prio;
 
-- 
1.8.5.1

Michael Dalton

2013-Dec-17 00:16 UTC

head link

[PATCH net-next 2/3] virtio-net: use per-receive queue page frag alloc for mergeable bufs

The virtio-net driver currently uses netdev_alloc_frag() for GFP_ATOMIC
mergeable rx buffer allocations. This commit migrates virtio-net to use
per-receive queue page frags for GFP_ATOMIC allocation. This change unifies
mergeable rx buffer memory allocation, which now will use skb_page_frag_refill()
for both atomic and GFP-WAIT buffer allocations.

To address fragmentation concerns, if after buffer allocation there
is too little space left in the page frag to allocate a subsequent
buffer, the remaining space is added to the current allocated buffer
so that the remaining space can be used to store packet data.

Signed-off-by: Michael Dalton <mwdalton at google.com>
---
 drivers/net/virtio_net.c | 69 ++++++++++++++++++++++++++----------------------
 1 file changed, 38 insertions(+), 31 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index c51a988..d38d130 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -78,6 +78,9 @@ struct receive_queue {
 	/* Chain pages by the private ptr. */
 	struct page *pages;
 
+	/* Page frag for GFP_ATOMIC packet buffer allocation. */
+	struct page_frag atomic_frag;
+
 	/* RX: fragments + linear part + virtio header */
 	struct scatterlist sg[MAX_SKB_FRAGS + 2];
 
@@ -127,9 +130,9 @@ struct virtnet_info {
 	struct mutex config_lock;
 
 	/* Page_frag for GFP_KERNEL packet buffer allocation when we run
-	 * low on memory.
+	 * low on memory. May sleep.
 	 */
-	struct page_frag alloc_frag;
+	struct page_frag sleep_frag;
 
 	/* Does the affinity hint is set for virtqueues? */
 	bool affinity_hint_set;
@@ -336,8 +339,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 	int num_buf = hdr->mhdr.num_buffers;
 	struct page *page = virt_to_head_page(buf);
 	int offset = buf - page_address(page);
-	struct sk_buff *head_skb = page_to_skb(rq, page, offset, len,
-					       MERGE_BUFFER_LEN);
+	int truesize = max_t(int, len, MERGE_BUFFER_LEN);
+	struct sk_buff *head_skb = page_to_skb(rq, page, offset, len, truesize);
 	struct sk_buff *curr_skb = head_skb;
 
 	if (unlikely(!curr_skb))
@@ -353,11 +356,6 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 			dev->stats.rx_length_errors++;
 			goto err_buf;
 		}
-		if (unlikely(len > MERGE_BUFFER_LEN)) {
-			pr_debug("%s: rx error: merge buffer too long\n",
-				 dev->name);
-			len = MERGE_BUFFER_LEN;
-		}
 
 		page = virt_to_head_page(buf);
 		--rq->num;
@@ -376,19 +374,20 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 			head_skb->truesize += nskb->truesize;
 			num_skb_frags = 0;
 		}
+		truesize = max_t(int, len, MERGE_BUFFER_LEN);
 		if (curr_skb != head_skb) {
 			head_skb->data_len += len;
 			head_skb->len += len;
-			head_skb->truesize += MERGE_BUFFER_LEN;
+			head_skb->truesize += truesize;
 		}
 		offset = buf - page_address(page);
 		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
 			put_page(page);
 			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
-					     len, MERGE_BUFFER_LEN);
+					     len, truesize);
 		} else {
 			skb_add_rx_frag(curr_skb, num_skb_frags, page,
-					offset, len, MERGE_BUFFER_LEN);
+					offset, len, truesize);
 		}
 	}
 
@@ -579,24 +578,24 @@ static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp)
 static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
 {
 	struct virtnet_info *vi = rq->vq->vdev->priv;
-	char *buf = NULL;
-	int err;
+	struct page_frag *alloc_frag;
+	char *buf;
+	int err, len, hole;
 
-	if (gfp & __GFP_WAIT) {
-		if (skb_page_frag_refill(MERGE_BUFFER_LEN, &vi->alloc_frag,
-					 gfp)) {
-			buf = (char *)page_address(vi->alloc_frag.page) +
-			      vi->alloc_frag.offset;
-			get_page(vi->alloc_frag.page);
-			vi->alloc_frag.offset += MERGE_BUFFER_LEN;
-		}
-	} else {
-		buf = netdev_alloc_frag(MERGE_BUFFER_LEN);
-	}
-	if (!buf)
+	alloc_frag = (gfp & __GFP_WAIT) ? &vi->sleep_frag : &rq->atomic_frag;
+	if (unlikely(!skb_page_frag_refill(MERGE_BUFFER_LEN, alloc_frag, gfp)))
 		return -ENOMEM;
+	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
+	get_page(alloc_frag->page);
+	len = MERGE_BUFFER_LEN;
+	alloc_frag->offset += len;
+	hole = alloc_frag->size - alloc_frag->offset;
+	if (hole < MERGE_BUFFER_LEN) {
+		len += hole;
+		alloc_frag->offset += hole;
+	}
 
-	sg_init_one(rq->sg, buf, MERGE_BUFFER_LEN);
+	sg_init_one(rq->sg, buf, len);
 	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, buf, gfp);
 	if (err < 0)
 		put_page(virt_to_head_page(buf));
@@ -1377,6 +1376,16 @@ static void free_receive_bufs(struct virtnet_info *vi)
 	}
 }
 
+static void free_receive_page_frags(struct virtnet_info *vi)
+{
+	int i;
+	for (i = 0; i < vi->max_queue_pairs; i++)
+		if (vi->rq[i].atomic_frag.page)
+			put_page(vi->rq[i].atomic_frag.page);
+	if (vi->sleep_frag.page)
+		put_page(vi->sleep_frag.page);
+}
+
 static void free_unused_bufs(struct virtnet_info *vi)
 {
 	void *buf;
@@ -1706,8 +1715,7 @@ free_recv_bufs:
 free_vqs:
 	cancel_delayed_work_sync(&vi->refill);
 	virtnet_del_vqs(vi);
-	if (vi->alloc_frag.page)
-		put_page(vi->alloc_frag.page);
+	free_receive_page_frags(vi);
 free_stats:
 	free_percpu(vi->stats);
 free:
@@ -1741,8 +1749,7 @@ static void virtnet_remove(struct virtio_device *vdev)
 	unregister_netdev(vi->dev);
 
 	remove_vq_common(vi);
-	if (vi->alloc_frag.page)
-		put_page(vi->alloc_frag.page);
+	free_receive_page_frags(vi);
 
 	flush_work(&vi->config_work);
 
-- 
1.8.5.1

Michael Dalton

2013-Dec-17 00:16 UTC

head link

[PATCH net-next 3/3] net: auto-tune mergeable rx buffer size for improved performance

Commit 2613af0ed18a ("virtio_net: migrate mergeable rx buffers to page frag
allocators") changed the mergeable receive buffer size from PAGE_SIZE to
MTU-size, introducing a single-stream regression for benchmarks with large
average packet size. There is no single optimal buffer size for all
workloads.  For workloads with packet size <= MTU bytes, MTU + virtio-net
header-sized buffers are preferred as larger buffers reduce the TCP window
due to SKB truesize. However, single-stream workloads with large average
packet sizes have higher throughput if larger (e.g., PAGE_SIZE) buffers
are used.

This commit auto-tunes the mergeable receiver buffer packet size by
choosing the packet buffer size based on an EWMA of the recent packet
sizes for the receive queue. Packet buffer sizes range from MTU_SIZE +
virtio-net header len to PAGE_SIZE. This improves throughput for
large packet workloads, as any workload with average packet size >= PAGE_SIZE
will use PAGE_SIZE buffers.

These optimizations interact positively with recent commit
ba275241030c ("virtio-net: coalesce rx frags when possible during rx"),
which coalesces adjacent RX SKB fragments in virtio_net. The coalescing
optimizations benefit buffers of any size.

Benchmarks taken from an average of 5 netperf 30-second TCP_STREAM runs
between two QEMU VMs on a single physical machine. Each VM has two VCPUs
with all offloads & vhost enabled. All VMs and vhost threads run in a
single 4 CPU cgroup cpuset, using cgroups to ensure that other processes
in the system will not be scheduled on the benchmark CPUs. Trunk includes
SKB rx frag coalescing.

net-next w/ virtio_net before 2613af0ed18a (PAGE_SIZE bufs): 14642.85Mb/s
net-next (MTU-size bufs):  13170.01Mb/s
net-next + auto-tune: 14555.94Mb/s

Signed-off-by: Michael Dalton <mwdalton at google.com>
---
 drivers/net/virtio_net.c | 63 +++++++++++++++++++++++++++++++++++-------------
 1 file changed, 46 insertions(+), 17 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index d38d130..904af37 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -26,6 +26,7 @@
 #include <linux/if_vlan.h>
 #include <linux/slab.h>
 #include <linux/cpu.h>
+#include <linux/average.h>
 
 static int napi_weight = NAPI_POLL_WEIGHT;
 module_param(napi_weight, int, 0444);
@@ -36,11 +37,15 @@ module_param(gso, bool, 0444);
 
 /* FIXME: MTU in config. */
 #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
-#define MERGE_BUFFER_LEN (ALIGN(GOOD_PACKET_LEN + \
-                                sizeof(struct virtio_net_hdr_mrg_rxbuf), \
-                                L1_CACHE_BYTES))
 #define GOOD_COPY_LEN	128
 
+/* Weight used for the RX packet size EWMA. The average packet size is used to
+ * determine the packet buffer size when refilling RX rings. As the entire RX
+ * ring may be refilled at once, the weight is chosen so that the EWMA will be
+ * insensitive to short-term, transient changes in packet size.
+ */
+#define RECEIVE_AVG_WEIGHT 64
+
 #define VIRTNET_DRIVER_VERSION "1.0.0"
 
 struct virtnet_stats {
@@ -78,6 +83,9 @@ struct receive_queue {
 	/* Chain pages by the private ptr. */
 	struct page *pages;
 
+	/* Average packet length for mergeable receive buffers. */
+	struct ewma mrg_avg_pkt_len;
+
 	/* Page frag for GFP_ATOMIC packet buffer allocation. */
 	struct page_frag atomic_frag;
 
@@ -339,13 +347,11 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 	int num_buf = hdr->mhdr.num_buffers;
 	struct page *page = virt_to_head_page(buf);
 	int offset = buf - page_address(page);
-	int truesize = max_t(int, len, MERGE_BUFFER_LEN);
-	struct sk_buff *head_skb = page_to_skb(rq, page, offset, len, truesize);
+	struct sk_buff *head_skb = page_to_skb(rq, page, offset, len, len);
 	struct sk_buff *curr_skb = head_skb;
 
 	if (unlikely(!curr_skb))
 		goto err_skb;
-
 	while (--num_buf) {
 		int num_skb_frags;
 
@@ -374,23 +380,40 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 			head_skb->truesize += nskb->truesize;
 			num_skb_frags = 0;
 		}
-		truesize = max_t(int, len, MERGE_BUFFER_LEN);
 		if (curr_skb != head_skb) {
 			head_skb->data_len += len;
 			head_skb->len += len;
-			head_skb->truesize += truesize;
+			head_skb->truesize += len;
 		}
 		offset = buf - page_address(page);
 		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
 			put_page(page);
 			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
-					     len, truesize);
+					     len, len);
 		} else {
 			skb_add_rx_frag(curr_skb, num_skb_frags, page,
-					offset, len, truesize);
+					offset, len, len);
 		}
 	}
 
+	/* All frags before the last frag are fully used -- for those frags,
+	 * truesize = len. Use the size of the most recent buffer allocation
+	 * from the last frag's page to estimate the truesize of the last frag.
+	 * EWMA with a weight of 64 makes the size adjustments quite small in
+	 * the frags allocated on one page (even a order-3 one), and truesize
+	 * doesn't need to be 100% accurate.
+	 */
+	if (skb_is_nonlinear(head_skb)) {
+		u32 est_buffer_len = page_private(page);
+		if (est_buffer_len > len) {
+			u32 truesize_delta = est_buffer_len - len;
+
+			curr_skb->truesize += truesize_delta;
+			if (curr_skb != head_skb)
+				head_skb->truesize += truesize_delta;
+		}
+	}
+	ewma_add(&rq->mrg_avg_pkt_len, head_skb->len);
 	return head_skb;
 
 err_skb:
@@ -578,24 +601,29 @@ static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp)
 static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
 {
 	struct virtnet_info *vi = rq->vq->vdev->priv;
+	const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
 	struct page_frag *alloc_frag;
 	char *buf;
-	int err, len, hole;
+	int err, hole;
+	u32 buflen;
 
+	buflen = hdr_len + clamp_t(u32, ewma_read(&rq->mrg_avg_pkt_len),
+				   GOOD_PACKET_LEN, PAGE_SIZE - hdr_len);
+	buflen = ALIGN(buflen, L1_CACHE_BYTES);
 	alloc_frag = (gfp & __GFP_WAIT) ? &vi->sleep_frag : &rq->atomic_frag;
-	if (unlikely(!skb_page_frag_refill(MERGE_BUFFER_LEN, alloc_frag, gfp)))
+	if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, gfp)))
 		return -ENOMEM;
 	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
 	get_page(alloc_frag->page);
-	len = MERGE_BUFFER_LEN;
-	alloc_frag->offset += len;
+	alloc_frag->offset += buflen;
+	set_page_private(alloc_frag->page, buflen);
 	hole = alloc_frag->size - alloc_frag->offset;
-	if (hole < MERGE_BUFFER_LEN) {
-		len += hole;
+	if (hole < buflen) {
+		buflen += hole;
 		alloc_frag->offset += hole;
 	}
 
-	sg_init_one(rq->sg, buf, len);
+	sg_init_one(rq->sg, buf, buflen);
 	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, buf, gfp);
 	if (err < 0)
 		put_page(virt_to_head_page(buf));
@@ -1516,6 +1544,7 @@ static int virtnet_alloc_queues(struct virtnet_info *vi)
 			       napi_weight);
 
 		sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
+		ewma_init(&vi->rq[i].mrg_avg_pkt_len, 1, RECEIVE_AVG_WEIGHT);
 		sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));
 	}
 
-- 
1.8.5.1

David Miller

2013-Dec-19 19:58 UTC

head link

[PATCH net-next 1/3] net: allow > 0 order atomic page alloc in skb_page_frag_refill

Can I get some reviews of this series from virtio folks?

Thanks.

Jason Wang

2013-Dec-23 07:52 UTC

head link

[PATCH net-next 1/3] net: allow > 0 order atomic page alloc in skb_page_frag_refill

On 12/17/2013 08:16 AM, Michael Dalton wrote:
> skb_page_frag_refill currently permits only order-0 page allocs
> unless GFP_WAIT is used. Change skb_page_frag_refill to attempt
> higher-order page allocations whether or not GFP_WAIT is used. If
> memory cannot be allocated, the allocator will fall back to
> successively smaller page allocs (down to order-0 page allocs).
>
> This change brings skb_page_frag_refill in line with the existing
> page allocation strategy employed by netdev_alloc_frag, which attempts
> higher-order page allocations whether or not GFP_WAIT is set, falling
> back to successively lower-order page allocations on failure. Part
> of migration of virtio-net to per-receive queue page frag allocators.
>
> Signed-off-by: Michael Dalton <mwdalton at google.com>
> ---
>  net/core/sock.c | 4 +---
>  1 file changed, 1 insertion(+), 3 deletions(-)
>
> diff --git a/net/core/sock.c b/net/core/sock.c
> index ab20ed9..7383d23 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -1865,9 +1865,7 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio)
>  		put_page(pfrag->page);
>  	}
>  
> -	/* We restrict high order allocations to users that can afford to wait */
> -	order = (prio & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0;
> -
> +	order = SKB_FRAG_PAGE_ORDER;
>  	do {
>  		gfp_t gfp = prio;
>  
The original code seems to try to avoid high-order allocations for
atomic allocation. This patch changes this, and it looks like it will
introduce some extra cost when memory is highly fragmented.

Jason Wang

2013-Dec-23 08:12 UTC

head link

[PATCH net-next 2/3] virtio-net: use per-receive queue page frag alloc for mergeable bufs

On 12/17/2013 08:16 AM, Michael Dalton wrote:
> The virtio-net driver currently uses netdev_alloc_frag() for GFP_ATOMIC
> mergeable rx buffer allocations. This commit migrates virtio-net to use
> per-receive queue page frags for GFP_ATOMIC allocation. This change unifies
> mergeable rx buffer memory allocation, which now will use skb_refill_frag()
> for both atomic and GFP-WAIT buffer allocations.
>
> To address fragmentation concerns, if after buffer allocation there
> is too little space left in the page frag to allocate a subsequent
> buffer, the remaining space is added to the current allocated buffer
> so that the remaining space can be used to store packet data.
>
> Signed-off-by: Michael Dalton <mwdalton at google.com>
> ---
>  drivers/net/virtio_net.c | 69
++++++++++++++++++++++++++----------------------
>  1 file changed, 38 insertions(+), 31 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index c51a988..d38d130 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -78,6 +78,9 @@ struct receive_queue {
>  	/* Chain pages by the private ptr. */
>  	struct page *pages;
>  
> +	/* Page frag for GFP_ATOMIC packet buffer allocation. */
> +	struct page_frag atomic_frag;
> +
>  	/* RX: fragments + linear part + virtio header */
>  	struct scatterlist sg[MAX_SKB_FRAGS + 2];
>  
> @@ -127,9 +130,9 @@ struct virtnet_info {
>  	struct mutex config_lock;
>  
>  	/* Page_frag for GFP_KERNEL packet buffer allocation when we run
> -	 * low on memory.
> +	 * low on memory. May sleep.
>  	 */
> -	struct page_frag alloc_frag;
> +	struct page_frag sleep_frag;
Any reason to use two different page_frags, considering that only
skb_page_frag_refill() is used?
>  
>  	/* Does the affinity hint is set for virtqueues? */
>  	bool affinity_hint_set;
> @@ -336,8 +339,8 @@ static struct sk_buff *receive_mergeable(struct
net_device *dev,
>  	int num_buf = hdr->mhdr.num_buffers;
>  	struct page *page = virt_to_head_page(buf);
>  	int offset = buf - page_address(page);
> -	struct sk_buff *head_skb = page_to_skb(rq, page, offset, len,
> -					       MERGE_BUFFER_LEN);
> +	int truesize = max_t(int, len, MERGE_BUFFER_LEN);
> +	struct sk_buff *head_skb = page_to_skb(rq, page, offset, len, truesize);
>  	struct sk_buff *curr_skb = head_skb;
>  
>  	if (unlikely(!curr_skb))
> @@ -353,11 +356,6 @@ static struct sk_buff *receive_mergeable(struct
net_device *dev,
>  			dev->stats.rx_length_errors++;
>  			goto err_buf;
>  		}
> -		if (unlikely(len > MERGE_BUFFER_LEN)) {
> -			pr_debug("%s: rx error: merge buffer too long\n",
> -				 dev->name);
> -			len = MERGE_BUFFER_LEN;
> -		}
>  
>  		page = virt_to_head_page(buf);
>  		--rq->num;
> @@ -376,19 +374,20 @@ static struct sk_buff *receive_mergeable(struct
net_device *dev,
>  			head_skb->truesize += nskb->truesize;
>  			num_skb_frags = 0;
>  		}
> +		truesize = max_t(int, len, MERGE_BUFFER_LEN);
>  		if (curr_skb != head_skb) {
>  			head_skb->data_len += len;
>  			head_skb->len += len;
> -			head_skb->truesize += MERGE_BUFFER_LEN;
> +			head_skb->truesize += truesize;
>  		}
>  		offset = buf - page_address(page);
>  		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
>  			put_page(page);
>  			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
> -					     len, MERGE_BUFFER_LEN);
> +					     len, truesize);
>  		} else {
>  			skb_add_rx_frag(curr_skb, num_skb_frags, page,
> -					offset, len, MERGE_BUFFER_LEN);
> +					offset, len, truesize);
>  		}
>  	}
>  
> @@ -579,24 +578,24 @@ static int add_recvbuf_big(struct receive_queue *rq,
gfp_t gfp)
>  static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
>  {
>  	struct virtnet_info *vi = rq->vq->vdev->priv;
> -	char *buf = NULL;
> -	int err;
> +	struct page_frag *alloc_frag;
> +	char *buf;
> +	int err, len, hole;
>  
> -	if (gfp & __GFP_WAIT) {
> -		if (skb_page_frag_refill(MERGE_BUFFER_LEN, &vi->alloc_frag,
> -					 gfp)) {
> -			buf = (char *)page_address(vi->alloc_frag.page) +
> -			      vi->alloc_frag.offset;
> -			get_page(vi->alloc_frag.page);
> -			vi->alloc_frag.offset += MERGE_BUFFER_LEN;
> -		}
> -	} else {
> -		buf = netdev_alloc_frag(MERGE_BUFFER_LEN);
> -	}
> -	if (!buf)
> +	alloc_frag = (gfp & __GFP_WAIT) ? &vi->sleep_frag : &rq->atomic_frag;
> +	if (unlikely(!skb_page_frag_refill(MERGE_BUFFER_LEN, alloc_frag, gfp)))
>  		return -ENOMEM;
> +	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> +	get_page(alloc_frag->page);
> +	len = MERGE_BUFFER_LEN;
> +	alloc_frag->offset += len;
> +	hole = alloc_frag->size - alloc_frag->offset;
> +	if (hole < MERGE_BUFFER_LEN) {
> +		len += hole;
> +		alloc_frag->offset += hole;
> +	}
>  
> -	sg_init_one(rq->sg, buf, MERGE_BUFFER_LEN);
> +	sg_init_one(rq->sg, buf, len);
I wonder whether we can use get_a_page() and give_pages() to recycle the
pages like before, which may help performance. We can also do some
optimizations for this in vhost.
>  	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, buf, gfp);
>  	if (err < 0)
>  		put_page(virt_to_head_page(buf));
> @@ -1377,6 +1376,16 @@ static void free_receive_bufs(struct virtnet_info
*vi)
>  	}
>  }
>  
> +static void free_receive_page_frags(struct virtnet_info *vi)
> +{
> +	int i;
> +	for (i = 0; i < vi->max_queue_pairs; i++)
> +		if (vi->rq[i].atomic_frag.page)
> +			put_page(vi->rq[i].atomic_frag.page);
> +	if (vi->sleep_frag.page)
> +		put_page(vi->sleep_frag.page);
> +}
> +
>  static void free_unused_bufs(struct virtnet_info *vi)
>  {
>  	void *buf;
> @@ -1706,8 +1715,7 @@ free_recv_bufs:
>  free_vqs:
>  	cancel_delayed_work_sync(&vi->refill);
>  	virtnet_del_vqs(vi);
> -	if (vi->alloc_frag.page)
> -		put_page(vi->alloc_frag.page);
> +	free_receive_page_frags(vi);
>  free_stats:
>  	free_percpu(vi->stats);
>  free:
> @@ -1741,8 +1749,7 @@ static void virtnet_remove(struct virtio_device
*vdev)
>  	unregister_netdev(vi->dev);
>  
>  	remove_vq_common(vi);
> -	if (vi->alloc_frag.page)
> -		put_page(vi->alloc_frag.page);
> +	free_receive_page_frags(vi);
>  
>  	flush_work(&vi->config_work);
>

Michael S. Tsirkin

2013-Dec-23 12:51 UTC

head link

[PATCH net-next 3/3] net: auto-tune mergeable rx buffer size for improved performance

On Mon, Dec 16, 2013 at 04:16:29PM -0800, Michael Dalton wrote:
> Commit 2613af0ed18a ("virtio_net: migrate mergeable rx buffers to page frag
> allocators") changed the mergeable receive buffer size from PAGE_SIZE to
> MTU-size, introducing a single-stream regression for benchmarks with large
> average packet size. There is no single optimal buffer size for all
> workloads.  For workloads with packet size <= MTU bytes, MTU + virtio-net
> header-sized buffers are preferred as larger buffers reduce the TCP window
> due to SKB truesize. However, single-stream workloads with large average
> packet sizes have higher throughput if larger (e.g., PAGE_SIZE) buffers
> are used.
> 
> This commit auto-tunes the mergeable receiver buffer packet size by
> choosing the packet buffer size based on an EWMA of the recent packet
> sizes for the receive queue. Packet buffer sizes range from MTU_SIZE +
> virtio-net header len to PAGE_SIZE. This improves throughput for
> large packet workloads, as any workload with average packet size >=
> PAGE_SIZE will use PAGE_SIZE buffers.
> 
> These optimizations interact positively with recent commit
> ba275241030c ("virtio-net: coalesce rx frags when possible during rx"),
> which coalesces adjacent RX SKB fragments in virtio_net. The coalescing
> optimizations benefit buffers of any size.
> 
> Benchmarks taken from an average of 5 netperf 30-second TCP_STREAM runs
> between two QEMU VMs on a single physical machine. Each VM has two VCPUs
> with all offloads & vhost enabled. All VMs and vhost threads run in a
> single 4 CPU cgroup cpuset, using cgroups to ensure that other processes
> in the system will not be scheduled on the benchmark CPUs. Trunk includes
> SKB rx frag coalescing.
> 
> net-next w/ virtio_net before 2613af0ed18a (PAGE_SIZE bufs): 14642.85Gb/s
> net-next (MTU-size bufs):  13170.01Gb/s
> net-next + auto-tune: 14555.94Gb/s
> 
> Signed-off-by: Michael Dalton <mwdalton at google.com>
OK so a high level benchmark shows it's worth it,
but how well does the logic work?
I think we should make the buffer size accessible in sysfs
or debugfs, and look at it, otherwise we don't really know.
> ---
>  drivers/net/virtio_net.c | 63
+++++++++++++++++++++++++++++++++++-------------
>  1 file changed, 46 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index d38d130..904af37 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -26,6 +26,7 @@
>  #include <linux/if_vlan.h>
>  #include <linux/slab.h>
>  #include <linux/cpu.h>
> +#include <linux/average.h>
>  
>  static int napi_weight = NAPI_POLL_WEIGHT;
>  module_param(napi_weight, int, 0444);
> @@ -36,11 +37,15 @@ module_param(gso, bool, 0444);
>  
>  /* FIXME: MTU in config. */
>  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
> -#define MERGE_BUFFER_LEN (ALIGN(GOOD_PACKET_LEN + \
> -                                sizeof(struct virtio_net_hdr_mrg_rxbuf), \
> -                                L1_CACHE_BYTES))
>  #define GOOD_COPY_LEN	128
>  
> +/* Weight used for the RX packet size EWMA. The average packet size is used to
> + * determine the packet buffer size when refilling RX rings. As the entire RX
> + * ring may be refilled at once, the weight is chosen so that the EWMA will be
> + * insensitive to short-term, transient changes in packet size.
> + */
> +#define RECEIVE_AVG_WEIGHT 64
> +
>  #define VIRTNET_DRIVER_VERSION "1.0.0"
>  
>  struct virtnet_stats {
> @@ -78,6 +83,9 @@ struct receive_queue {
>  	/* Chain pages by the private ptr. */
>  	struct page *pages;
>  
> +	/* Average packet length for mergeable receive buffers. */
> +	struct ewma mrg_avg_pkt_len;
> +
>  	/* Page frag for GFP_ATOMIC packet buffer allocation. */
>  	struct page_frag atomic_frag;
>  
> @@ -339,13 +347,11 @@ static struct sk_buff *receive_mergeable(struct
net_device *dev,
>  	int num_buf = hdr->mhdr.num_buffers;
>  	struct page *page = virt_to_head_page(buf);
>  	int offset = buf - page_address(page);
> -	int truesize = max_t(int, len, MERGE_BUFFER_LEN);
> -	struct sk_buff *head_skb = page_to_skb(rq, page, offset, len, truesize);
> +	struct sk_buff *head_skb = page_to_skb(rq, page, offset, len, len);
>  	struct sk_buff *curr_skb = head_skb;
>  
>  	if (unlikely(!curr_skb))
>  		goto err_skb;
> -

Don't like this chunk :)
>  	while (--num_buf) {
>  		int num_skb_frags;
>  
> @@ -374,23 +380,40 @@ static struct sk_buff *receive_mergeable(struct
net_device *dev,
>  			head_skb->truesize += nskb->truesize;
>  			num_skb_frags = 0;
>  		}
> -		truesize = max_t(int, len, MERGE_BUFFER_LEN);
>  		if (curr_skb != head_skb) {
>  			head_skb->data_len += len;
>  			head_skb->len += len;
> -			head_skb->truesize += truesize;
> +			head_skb->truesize += len;
>  		}
>  		offset = buf - page_address(page);
>  		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
>  			put_page(page);
>  			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
> -					     len, truesize);
> +					     len, len);
>  		} else {
>  			skb_add_rx_frag(curr_skb, num_skb_frags, page,
> -					offset, len, truesize);
> +					offset, len, len);
>  		}
>  	}
>  
> +	/* All frags before the last frag are fully used -- for those frags,
> +	 * truesize = len. Use the size of the most recent buffer allocation
> +	 * from the last frag's page to estimate the truesize of the last frag.
I don't get the real motivation for this.

We have skbs A,B,C sharing a page, with chunk D being unused.
This randomly charges chunk D to an skb that ended up last
in the page.
Correct?
Why does this make sense?
> +	 * EWMA with a weight of 64 makes the size adjustments quite small in
> +	 * the frags allocated on one page (even a order-3 one), and truesize
> +	 * doesn't need to be 100% accurate.
If the explanation for the above is that we don't care where D is
charged, let's not charge it to any skbs.
> +	 */
> +	if (skb_is_nonlinear(head_skb)) {
> +		u32 est_buffer_len = page_private(page);
> +		if (est_buffer_len > len) {
> +			u32 truesize_delta = est_buffer_len - len;
> +
> +			curr_skb->truesize += truesize_delta;
> +			if (curr_skb != head_skb)
> +				head_skb->truesize += truesize_delta;
> +		}
> +	}
> +	ewma_add(&rq->mrg_avg_pkt_len, head_skb->len);
Why head_skb only? Why not full buffer size that comes from host?
This is simply len.

>  	return head_skb;
>  
>  err_skb:
> @@ -578,24 +601,29 @@ static int add_recvbuf_big(struct receive_queue *rq,
gfp_t gfp)
>  static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
>  {
>  	struct virtnet_info *vi = rq->vq->vdev->priv;
> +	const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
>  	struct page_frag *alloc_frag;
>  	char *buf;
> -	int err, len, hole;
> +	int err, hole;
> +	u32 buflen;
>  
> +	buflen = hdr_len + clamp_t(u32, ewma_read(&rq->mrg_avg_pkt_len),
> +				   GOOD_PACKET_LEN, PAGE_SIZE - hdr_len);
> +	buflen = ALIGN(buflen, L1_CACHE_BYTES);
>  	alloc_frag = (gfp & __GFP_WAIT) ? &vi->sleep_frag : &rq->atomic_frag;
> -	if (unlikely(!skb_page_frag_refill(MERGE_BUFFER_LEN, alloc_frag, gfp)))
> +	if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, gfp)))
>  		return -ENOMEM;
>  	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
>  	get_page(alloc_frag->page);
> -	len = MERGE_BUFFER_LEN;
> -	alloc_frag->offset += len;
> +	alloc_frag->offset += buflen;
> +	set_page_private(alloc_frag->page, buflen);
>  	hole = alloc_frag->size - alloc_frag->offset;
> -	if (hole < MERGE_BUFFER_LEN) {
> -		len += hole;
> +	if (hole < buflen) {
> +		buflen += hole;
>  		alloc_frag->offset += hole;
>  	}
>  
> -	sg_init_one(rq->sg, buf, len);
> +	sg_init_one(rq->sg, buf, buflen);
>  	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, buf, gfp);
>  	if (err < 0)
>  		put_page(virt_to_head_page(buf));
> @@ -1516,6 +1544,7 @@ static int virtnet_alloc_queues(struct virtnet_info
*vi)
>  			       napi_weight);
>  
>  		sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
> +		ewma_init(&vi->rq[i].mrg_avg_pkt_len, 1, RECEIVE_AVG_WEIGHT);
>  		sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));
>  	}
>  
> -- 
> 1.8.5.1

Michael S. Tsirkin

2013-Dec-23 12:53 UTC

head link

[PATCH net-next 1/3] net: allow > 0 order atomic page alloc in skb_page_frag_refill

On Mon, Dec 16, 2013 at 04:16:27PM -0800, Michael Dalton wrote:
> skb_page_frag_refill currently permits only order-0 page allocs
> unless GFP_WAIT is used. Change skb_page_frag_refill to attempt
> higher-order page allocations whether or not GFP_WAIT is used. If
> memory cannot be allocated, the allocator will fall back to
> successively smaller page allocs (down to order-0 page allocs).
> 
> This change brings skb_page_frag_refill in line with the existing
> page allocation strategy employed by netdev_alloc_frag, which attempts
> higher-order page allocations whether or not GFP_WAIT is set, falling
> back to successively lower-order page allocations on failure. Part
> of migration of virtio-net to per-receive queue page frag allocators.
> 
> Signed-off-by: Michael Dalton <mwdalton at google.com>
I don't get how this is related to patch 3/3 exactly.
That one seems to clamp the allocations from ewma to at most
PAGE_SIZE, so how do we get higher-order allocations here?
Could you clarify please?
> ---
>  net/core/sock.c | 4 +---
>  1 file changed, 1 insertion(+), 3 deletions(-)
> 
> diff --git a/net/core/sock.c b/net/core/sock.c
> index ab20ed9..7383d23 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -1865,9 +1865,7 @@ bool skb_page_frag_refill(unsigned int sz, struct
page_frag *pfrag, gfp_t prio)
>  		put_page(pfrag->page);
>  	}
>  
> -	/* We restrict high order allocations to users that can afford to wait */
> -	order = (prio & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0;
> -
> +	order = SKB_FRAG_PAGE_ORDER;
>  	do {
>  		gfp_t gfp = prio;
>  
> -- 
> 1.8.5.1

Michael S. Tsirkin

2013-Dec-23 13:31 UTC

head link

[PATCH net-next 2/3] virtio-net: use per-receive queue page frag alloc for mergeable bufs

On Mon, Dec 16, 2013 at 04:16:28PM -0800, Michael Dalton wrote:
> The virtio-net driver currently uses netdev_alloc_frag() for GFP_ATOMIC
> mergeable rx buffer allocations. This commit migrates virtio-net to use
> per-receive queue page frags for GFP_ATOMIC allocation. This change unifies
> mergeable rx buffer memory allocation, which now will use skb_refill_frag()
> for both atomic and GFP-WAIT buffer allocations.
OK so just to clarify, this is intended as a cleanup
and preparation for 3/3, not as an optimization?
Some notes below.
> 
> To address fragmentation concerns, if after buffer allocation there
> is too little space left in the page frag to allocate a subsequent
> buffer, the remaining space is added to the current allocated buffer
> so that the remaining space can be used to store packet data.
> 
> Signed-off-by: Michael Dalton <mwdalton at google.com>
> ---
>  drivers/net/virtio_net.c | 69
++++++++++++++++++++++++++----------------------
>  1 file changed, 38 insertions(+), 31 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index c51a988..d38d130 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -78,6 +78,9 @@ struct receive_queue {
>  	/* Chain pages by the private ptr. */
>  	struct page *pages;
>  
> +	/* Page frag for GFP_ATOMIC packet buffer allocation. */
> +	struct page_frag atomic_frag;
> +
>  	/* RX: fragments + linear part + virtio header */
>  	struct scatterlist sg[MAX_SKB_FRAGS + 2];
>  
> @@ -127,9 +130,9 @@ struct virtnet_info {
>  	struct mutex config_lock;
>  
>  	/* Page_frag for GFP_KERNEL packet buffer allocation when we run
> -	 * low on memory.
> +	 * low on memory. May sleep.
>  	 */
> -	struct page_frag alloc_frag;
> +	struct page_frag sleep_frag;
>  
>  	/* Does the affinity hint is set for virtqueues? */
>  	bool affinity_hint_set;
> @@ -336,8 +339,8 @@ static struct sk_buff *receive_mergeable(struct
net_device *dev,
>  	int num_buf = hdr->mhdr.num_buffers;
>  	struct page *page = virt_to_head_page(buf);
>  	int offset = buf - page_address(page);
> -	struct sk_buff *head_skb = page_to_skb(rq, page, offset, len,
> -					       MERGE_BUFFER_LEN);
> +	int truesize = max_t(int, len, MERGE_BUFFER_LEN);
> +	struct sk_buff *head_skb = page_to_skb(rq, page, offset, len, truesize);
>  	struct sk_buff *curr_skb = head_skb;
>  
>  	if (unlikely(!curr_skb))
> @@ -353,11 +356,6 @@ static struct sk_buff *receive_mergeable(struct
net_device *dev,
>  			dev->stats.rx_length_errors++;
>  			goto err_buf;
>  		}
> -		if (unlikely(len > MERGE_BUFFER_LEN)) {
> -			pr_debug("%s: rx error: merge buffer too long\n",
> -				 dev->name);
> -			len = MERGE_BUFFER_LEN;
> -		}
>  
>  		page = virt_to_head_page(buf);
>  		--rq->num;
> @@ -376,19 +374,20 @@ static struct sk_buff *receive_mergeable(struct
net_device *dev,
>  			head_skb->truesize += nskb->truesize;
>  			num_skb_frags = 0;
>  		}
> +		truesize = max_t(int, len, MERGE_BUFFER_LEN);
>  		if (curr_skb != head_skb) {
>  			head_skb->data_len += len;
>  			head_skb->len += len;
> -			head_skb->truesize += MERGE_BUFFER_LEN;
> +			head_skb->truesize += truesize;
>  		}
>  		offset = buf - page_address(page);
>  		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
>  			put_page(page);
>  			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
> -					     len, MERGE_BUFFER_LEN);
> +					     len, truesize);
>  		} else {
>  			skb_add_rx_frag(curr_skb, num_skb_frags, page,
> -					offset, len, MERGE_BUFFER_LEN);
> +					offset, len, truesize);
>  		}
>  	}
>  
> @@ -579,24 +578,24 @@ static int add_recvbuf_big(struct receive_queue *rq,
gfp_t gfp)
>  static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
>  {
>  	struct virtnet_info *vi = rq->vq->vdev->priv;
> -	char *buf = NULL;
> -	int err;
> +	struct page_frag *alloc_frag;
> +	char *buf;
> +	int err, len, hole;
>  
> -	if (gfp & __GFP_WAIT) {
> -		if (skb_page_frag_refill(MERGE_BUFFER_LEN, &vi->alloc_frag,
> -					 gfp)) {
> -			buf = (char *)page_address(vi->alloc_frag.page) +
> -			      vi->alloc_frag.offset;
> -			get_page(vi->alloc_frag.page);
> -			vi->alloc_frag.offset += MERGE_BUFFER_LEN;
> -		}
> -	} else {
> -		buf = netdev_alloc_frag(MERGE_BUFFER_LEN);
> -	}
> -	if (!buf)
> +	alloc_frag = (gfp & __GFP_WAIT) ? &vi->sleep_frag :
&rq->atomic_frag;
> +	if (unlikely(!skb_page_frag_refill(MERGE_BUFFER_LEN, alloc_frag, gfp)))
>  		return -ENOMEM;
I note that netdev_alloc_frag sets __GFP_COLD which kind of
makes sense.

I also note that netdev_alloc_frag uses some tricks to
reduce the amount of cache lines dirtied - need to look
at whether they actually apply here.
> +	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> +	get_page(alloc_frag->page);
> +	len = MERGE_BUFFER_LEN;
> +	alloc_frag->offset += len;
> +	hole = alloc_frag->size - alloc_frag->offset;
> +	if (hole < MERGE_BUFFER_LEN) {
> +		len += hole;
> +		alloc_frag->offset += hole;
> +	}
>  
> -	sg_init_one(rq->sg, buf, MERGE_BUFFER_LEN);
> +	sg_init_one(rq->sg, buf, len);
>  	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, buf, gfp);
>  	if (err < 0)
>  		put_page(virt_to_head_page(buf));
> @@ -1377,6 +1376,16 @@ static void free_receive_bufs(struct virtnet_info
*vi)
>  	}
>  }
>  
> +static void free_receive_page_frags(struct virtnet_info *vi)
> +{
> +	int i;
> +	for (i = 0; i < vi->max_queue_pairs; i++)
> +		if (vi->rq[i].atomic_frag.page)
> +			put_page(vi->rq[i].atomic_frag.page);
> +	if (vi->sleep_frag.page)
> +		put_page(vi->sleep_frag.page);
> +}
> +
>  static void free_unused_bufs(struct virtnet_info *vi)
>  {
>  	void *buf;
> @@ -1706,8 +1715,7 @@ free_recv_bufs:
>  free_vqs:
>  	cancel_delayed_work_sync(&vi->refill);
>  	virtnet_del_vqs(vi);
> -	if (vi->alloc_frag.page)
> -		put_page(vi->alloc_frag.page);
> +	free_receive_page_frags(vi);
>  free_stats:
>  	free_percpu(vi->stats);
>  free:
> @@ -1741,8 +1749,7 @@ static void virtnet_remove(struct virtio_device
*vdev)
>  	unregister_netdev(vi->dev);
>  
>  	remove_vq_common(vi);
> -	if (vi->alloc_frag.page)
> -		put_page(vi->alloc_frag.page);
> +	free_receive_page_frags(vi);
>  
>  	flush_work(&vi->config_work);
>  
> -- 
> 1.8.5.1

Michael S. Tsirkin

2013-Dec-23 13:33 UTC

head link

[PATCH net-next 3/3] net: auto-tune mergeable rx buffer size for improved performance

On Mon, Dec 16, 2013 at 04:16:29PM -0800, Michael Dalton wrote:
> Commit 2613af0ed18a ("virtio_net: migrate mergeable rx buffers to page frag
> allocators") changed the mergeable receive buffer size from PAGE_SIZE to
> MTU-size, introducing a single-stream regression for benchmarks with large
> average packet size. There is no single optimal buffer size for all
> workloads.  For workloads with packet size <= MTU bytes, MTU +
virtio-net
> header-sized buffers are preferred as larger buffers reduce the TCP window
> due to SKB truesize. However, single-stream workloads with large average
> packet sizes have higher throughput if larger (e.g., PAGE_SIZE) buffers
> are used.
> 
> This commit auto-tunes the mergeable receiver buffer packet size by
> choosing the packet buffer size based on an EWMA of the recent packet
> sizes for the receive queue. Packet buffer sizes range from MTU_SIZE +
> virtio-net header len to PAGE_SIZE. This improves throughput for
> large packet workloads, as any workload with average packet size >>
PAGE_SIZE will use PAGE_SIZE buffers.
> 
> These optimizations interact positively with recent commit
> ba275241030c ("virtio-net: coalesce rx frags when possible during
rx"),
> which coalesces adjacent RX SKB fragments in virtio_net. The coalescing
> optimizations benefit buffers of any size.
> 
> Benchmarks taken from an average of 5 netperf 30-second TCP_STREAM runs
> between two QEMU VMs on a single physical machine. Each VM has two VCPUs
> with all offloads & vhost enabled. All VMs and vhost threads run in a
> single 4 CPU cgroup cpuset, using cgroups to ensure that other processes
> in the system will not be scheduled on the benchmark CPUs. Trunk includes
> SKB rx frag coalescing.
> 
> net-next w/ virtio_net before 2613af0ed18a (PAGE_SIZE bufs): 14642.85Gb/s
> net-next (MTU-size bufs):  13170.01Gb/s
> net-next + auto-tune: 14555.94Gb/s
Also I guess this 1% difference is in the noise, right?
Could you share data about host CPU utilization during
these runs please?
> Signed-off-by: Michael Dalton <mwdalton at google.com>
> ---
>  drivers/net/virtio_net.c | 63
+++++++++++++++++++++++++++++++++++-------------
>  1 file changed, 46 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index d38d130..904af37 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -26,6 +26,7 @@
>  #include <linux/if_vlan.h>
>  #include <linux/slab.h>
>  #include <linux/cpu.h>
> +#include <linux/average.h>
>  
>  static int napi_weight = NAPI_POLL_WEIGHT;
>  module_param(napi_weight, int, 0444);
> @@ -36,11 +37,15 @@ module_param(gso, bool, 0444);
>  
>  /* FIXME: MTU in config. */
>  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
> -#define MERGE_BUFFER_LEN (ALIGN(GOOD_PACKET_LEN + \
> -                                sizeof(struct virtio_net_hdr_mrg_rxbuf), \
> -                                L1_CACHE_BYTES))
>  #define GOOD_COPY_LEN	128
>  
> +/* Weight used for the RX packet size EWMA. The average packet size is
used to
> + * determine the packet buffer size when refilling RX rings. As the entire
RX
> + * ring may be refilled at once, the weight is chosen so that the EWMA
will be
> + * insensitive to short-term, transient changes in packet size.
> + */
> +#define RECEIVE_AVG_WEIGHT 64
> +
>  #define VIRTNET_DRIVER_VERSION "1.0.0"
>  
>  struct virtnet_stats {
> @@ -78,6 +83,9 @@ struct receive_queue {
>  	/* Chain pages by the private ptr. */
>  	struct page *pages;
>  
> +	/* Average packet length for mergeable receive buffers. */
> +	struct ewma mrg_avg_pkt_len;
> +
>  	/* Page frag for GFP_ATOMIC packet buffer allocation. */
>  	struct page_frag atomic_frag;
>  
> @@ -339,13 +347,11 @@ static struct sk_buff *receive_mergeable(struct
net_device *dev,
>  	int num_buf = hdr->mhdr.num_buffers;
>  	struct page *page = virt_to_head_page(buf);
>  	int offset = buf - page_address(page);
> -	int truesize = max_t(int, len, MERGE_BUFFER_LEN);
> -	struct sk_buff *head_skb = page_to_skb(rq, page, offset, len, truesize);
> +	struct sk_buff *head_skb = page_to_skb(rq, page, offset, len, len);
>  	struct sk_buff *curr_skb = head_skb;
>  
>  	if (unlikely(!curr_skb))
>  		goto err_skb;
> -
>  	while (--num_buf) {
>  		int num_skb_frags;
>  
> @@ -374,23 +380,40 @@ static struct sk_buff *receive_mergeable(struct
net_device *dev,
>  			head_skb->truesize += nskb->truesize;
>  			num_skb_frags = 0;
>  		}
> -		truesize = max_t(int, len, MERGE_BUFFER_LEN);
>  		if (curr_skb != head_skb) {
>  			head_skb->data_len += len;
>  			head_skb->len += len;
> -			head_skb->truesize += truesize;
> +			head_skb->truesize += len;
>  		}
>  		offset = buf - page_address(page);
>  		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
>  			put_page(page);
>  			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
> -					     len, truesize);
> +					     len, len);
>  		} else {
>  			skb_add_rx_frag(curr_skb, num_skb_frags, page,
> -					offset, len, truesize);
> +					offset, len, len);
>  		}
>  	}
>  
> +	/* All frags before the last frag are fully used -- for those frags,
> +	 * truesize = len. Use the size of the most recent buffer allocation
> +	 * from the last frag's page to estimate the truesize of the last
frag.
> +	 * EWMA with a weight of 64 makes the size adjustments quite small in
> +	 * the frags allocated on one page (even a order-3 one), and truesize
> +	 * doesn't need to be 100% accurate.
> +	 */
> +	if (skb_is_nonlinear(head_skb)) {
> +		u32 est_buffer_len = page_private(page);
> +		if (est_buffer_len > len) {
> +			u32 truesize_delta = est_buffer_len - len;
> +
> +			curr_skb->truesize += truesize_delta;
> +			if (curr_skb != head_skb)
> +				head_skb->truesize += truesize_delta;
> +		}
> +	}
> +	ewma_add(&rq->mrg_avg_pkt_len, head_skb->len);
>  	return head_skb;
>  
>  err_skb:
> @@ -578,24 +601,29 @@ static int add_recvbuf_big(struct receive_queue *rq,
gfp_t gfp)
>  static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
>  {
>  	struct virtnet_info *vi = rq->vq->vdev->priv;
> +	const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
>  	struct page_frag *alloc_frag;
>  	char *buf;
> -	int err, len, hole;
> +	int err, hole;
> +	u32 buflen;
>  
> +	buflen = hdr_len + clamp_t(u32, ewma_read(&rq->mrg_avg_pkt_len),
> +				   GOOD_PACKET_LEN, PAGE_SIZE - hdr_len);
> +	buflen = ALIGN(buflen, L1_CACHE_BYTES);
>  	alloc_frag = (gfp & __GFP_WAIT) ? &vi->sleep_frag :
&rq->atomic_frag;
> -	if (unlikely(!skb_page_frag_refill(MERGE_BUFFER_LEN, alloc_frag, gfp)))
> +	if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, gfp)))
>  		return -ENOMEM;
>  	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
>  	get_page(alloc_frag->page);
> -	len = MERGE_BUFFER_LEN;
> -	alloc_frag->offset += len;
> +	alloc_frag->offset += buflen;
> +	set_page_private(alloc_frag->page, buflen);
>  	hole = alloc_frag->size - alloc_frag->offset;
> -	if (hole < MERGE_BUFFER_LEN) {
> -		len += hole;
> +	if (hole < buflen) {
> +		buflen += hole;
>  		alloc_frag->offset += hole;
>  	}
>  
> -	sg_init_one(rq->sg, buf, len);
> +	sg_init_one(rq->sg, buf, buflen);
>  	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, buf, gfp);
>  	if (err < 0)
>  		put_page(virt_to_head_page(buf));
> @@ -1516,6 +1544,7 @@ static int virtnet_alloc_queues(struct virtnet_info
*vi)
>  			       napi_weight);
>  
>  		sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
> +		ewma_init(&vi->rq[i].mrg_avg_pkt_len, 1, RECEIVE_AVG_WEIGHT);
>  		sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));
>  	}
>  
> -- 
> 1.8.5.1

Michael S. Tsirkin

2013-Dec-23 13:35 UTC

head link

[PATCH net-next 1/3] net: allow > 0 order atomic page alloc in skb_page_frag_refill

On Thu, Dec 19, 2013 at 02:58:15PM -0500, David Miller wrote:
>
> Can I get some reviews of this series from virtio folks?
> 
> Thanks.
I think it's a good idea, certainly the best we can do
short term.
Would like a couple of comments addressed before it's applied.

Longer term, one approach to consider is to make the device
look ahead in the VQ until it finds a large
enough buffer.

-- 
MST

Eric Dumazet

2013-Dec-23 17:24 UTC

head link

[PATCH net-next 1/3] net: allow > 0 order atomic page alloc in skb_page_frag_refill

On Mon, 2013-12-23 at 15:52 +0800, Jason Wang wrote:
> On 12/17/2013 08:16 AM, Michael Dalton wrote:
> > skb_page_frag_refill currently permits only order-0 page allocs
> > unless GFP_WAIT is used. Change skb_page_frag_refill to attempt
> > higher-order page allocations whether or not GFP_WAIT is used. If
> > memory cannot be allocated, the allocator will fall back to
> > successively smaller page allocs (down to order-0 page allocs).
> >
> > This change brings skb_page_frag_refill in line with the existing
> > page allocation strategy employed by netdev_alloc_frag, which attempts
> > higher-order page allocations whether or not GFP_WAIT is set, falling
> > back to successively lower-order page allocations on failure. Part
> > of migration of virtio-net to per-receive queue page frag allocators.
> >
> > Signed-off-by: Michael Dalton <mwdalton at google.com>
> > ---
> >  net/core/sock.c | 4 +---
> >  1 file changed, 1 insertion(+), 3 deletions(-)
> >
> > diff --git a/net/core/sock.c b/net/core/sock.c
> > index ab20ed9..7383d23 100644
> > --- a/net/core/sock.c
> > +++ b/net/core/sock.c
> > @@ -1865,9 +1865,7 @@ bool skb_page_frag_refill(unsigned int sz,
struct page_frag *pfrag, gfp_t prio)
> >  		put_page(pfrag->page);
> >  	}
> >  
> > -	/* We restrict high order allocations to users that can afford to
wait */
> > -	order = (prio & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0;
> > -
> > +	order = SKB_FRAG_PAGE_ORDER;
> >  	do {
> >  		gfp_t gfp = prio;
> >  
> 
> The original code seems to try to avoid the high order allocation for
> atomic allocation. This patch changes this, and looks like it will
> introduce some extra cost when the memory is highly fragmented.
No noticeable extra cost that I could measure.

We use the same strategy in RX path nowadays.

Acked-by: Eric Dumazet <edumazet at google.com>

Eric Dumazet

2013-Dec-23 17:30 UTC

head link

[PATCH net-next 1/3] net: allow > 0 order atomic page alloc in skb_page_frag_refill

On Mon, 2013-12-23 at 14:53 +0200, Michael S. Tsirkin wrote:
> On Mon, Dec 16, 2013 at 04:16:27PM -0800, Michael Dalton wrote:
> > skb_page_frag_refill currently permits only order-0 page allocs
> > unless GFP_WAIT is used. Change skb_page_frag_refill to attempt
> > higher-order page allocations whether or not GFP_WAIT is used. If
> > memory cannot be allocated, the allocator will fall back to
> > successively smaller page allocs (down to order-0 page allocs).
> > 
> > This change brings skb_page_frag_refill in line with the existing
> > page allocation strategy employed by netdev_alloc_frag, which attempts
> > higher-order page allocations whether or not GFP_WAIT is set, falling
> > back to successively lower-order page allocations on failure. Part
> > of migration of virtio-net to per-receive queue page frag allocators.
> > 
> > Signed-off-by: Michael Dalton <mwdalton at google.com>
> 
> I don't get how this is related to patch 3/3 exactly.
> That one seems to clamp the allocations from ewma to at most
> > PAGE_SIZE, so how do we get higher-order allocations here?
> Could you clarify please?
If your ewma stabilizes at 2050 bytes per frag, using order-0 page will
waste ~50% of memory.

David Miller

2013-Dec-24 22:46 UTC

head link

[PATCH net-next 1/3] net: allow > 0 order atomic page alloc in skb_page_frag_refill

There is still feedback and/or minor adjustments being asked for wrt.
this series.   These changes have been sitting for more than a week
which is a bit ridiculous.

Please resubmit these changes once everything is resolved to
everyone's satisfaction, thanks.

Jason Wang

2013-Dec-26 07:33 UTC

head link

[PATCH net-next 3/3] net: auto-tune mergeable rx buffer size for improved performance

On 12/17/2013 08:16 AM, Michael Dalton wrote:
> Commit 2613af0ed18a ("virtio_net: migrate mergeable rx buffers to page frag
> allocators") changed the mergeable receive buffer size from PAGE_SIZE to
> MTU-size, introducing a single-stream regression for benchmarks with large
> average packet size. There is no single optimal buffer size for all
> workloads.  For workloads with packet size <= MTU bytes, MTU +
virtio-net
> header-sized buffers are preferred as larger buffers reduce the TCP window
> due to SKB truesize. However, single-stream workloads with large average
> packet sizes have higher throughput if larger (e.g., PAGE_SIZE) buffers
> are used.
>
> This commit auto-tunes the mergeable receiver buffer packet size by
> choosing the packet buffer size based on an EWMA of the recent packet
> sizes for the receive queue. Packet buffer sizes range from MTU_SIZE +
> virtio-net header len to PAGE_SIZE. This improves throughput for
> large packet workloads, as any workload with average packet size >>
PAGE_SIZE will use PAGE_SIZE buffers.
>
> These optimizations interact positively with recent commit
> ba275241030c ("virtio-net: coalesce rx frags when possible during
rx"),
> which coalesces adjacent RX SKB fragments in virtio_net. The coalescing
> optimizations benefit buffers of any size.
>
> Benchmarks taken from an average of 5 netperf 30-second TCP_STREAM runs
> between two QEMU VMs on a single physical machine. Each VM has two VCPUs
> with all offloads & vhost enabled. All VMs and vhost threads run in a
> single 4 CPU cgroup cpuset, using cgroups to ensure that other processes
> in the system will not be scheduled on the benchmark CPUs. Trunk includes
> SKB rx frag coalescing.
>
> net-next w/ virtio_net before 2613af0ed18a (PAGE_SIZE bufs): 14642.85Gb/s
> net-next (MTU-size bufs):  13170.01Gb/s
> net-next + auto-tune: 14555.94Gb/s
>
> Signed-off-by: Michael Dalton <mwdalton at google.com>
The patch looks good to me. I tested this patch with mlx4, and it helps to
increase the rx performance from about 22Gb/s to about 26 Gb/s.

Debabrata Banerjee

2014-Jan-03 00:42 UTC

head link

[PATCH net-next 1/3] net: allow > 0 order atomic page alloc in skb_page_frag_refill

Currently because of how mm behaves (3.10.y) the code even before the
patch is a problem. I believe what may fix it is if instead of just
removing the conditional on __GFP_WAIT, the initial order > 0
allocation should be made GFP_ATOMIC, then fallback to the original
gfp mask for the order-0 allocations.

On systems that have highly fragmented main memory with pressure,
skb_page_frag_refill() causes problems. mm enters significant
compaction cycles on all cpu's which in itself is bad (add
considerable spinlock contention in isolate_migratepages_range() for
several seconds in kernel at 100% cpu), however even without this
happening basically we have large memory reclamation when only
allocations from order-3 were necessary. For example, I might see half
the existing page cache on a system (2GB out of 8GB) reclaimed in a
burst, which effectively means the application has to wait even longer
after this compact/reclaim cycle for those pages to be read back from
disk. This is a significant reduction in useful memory from before
skb_page_frag_refill() existed, as one of our systems could run in
steady state with little free memory and 100% fragmentation. Now I see
10-30x more memory free (read: not utilized). Order > 0 allocations
were happening rarely before, now it happens consistently from this
function.

My suggestion above would avoid mm going through
__alloc_pages_direct_compact() and triggering the bad events above. It
will take me several days to try this experiment.

-Debabrata

On Tue, Dec 24, 2013 at 5:46 PM, David Miller <davem at davemloft.net> wrote:
>
> There is still feedback and/or minor adjustments being asked for wrt.
> this series.   These changes have been sitting for more than a week
> which is a bit ridiculous.
>
> Please resubmit these changes once everything is resolved to
> everyone's satisfaction, thanks.
> --
> To unsubscribe from this list: send the line "unsubscribe netdev"
in
> the body of a message to majordomo at vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

Seemingly Similar Threads

Search for more reasonably related threads

Linux Virtualization - Dec 2013 - [PATCH net-next 2/3] virtio-net: use per-receive queue page frag alloc for mergeable bufs

[PATCH net-next 1/3] net: allow > 0 order atomic page alloc in skb_page_frag_refill

[PATCH net-next 2/3] virtio-net: use per-receive queue page frag alloc for mergeable bufs

[PATCH net-next 3/3] net: auto-tune mergeable rx buffer size for improved performance

[PATCH net-next 1/3] net: allow > 0 order atomic page alloc in skb_page_frag_refill

[PATCH net-next 1/3] net: allow > 0 order atomic page alloc in skb_page_frag_refill

[PATCH net-next 2/3] virtio-net: use per-receive queue page frag alloc for mergeable bufs

[PATCH net-next 3/3] net: auto-tune mergeable rx buffer size for improved performance

[PATCH net-next 1/3] net: allow > 0 order atomic page alloc in skb_page_frag_refill

[PATCH net-next 2/3] virtio-net: use per-receive queue page frag alloc for mergeable bufs

[PATCH net-next 3/3] net: auto-tune mergeable rx buffer size for improved performance

[PATCH net-next 1/3] net: allow > 0 order atomic page alloc in skb_page_frag_refill

[PATCH net-next 1/3] net: allow > 0 order atomic page alloc in skb_page_frag_refill

[PATCH net-next 1/3] net: allow > 0 order atomic page alloc in skb_page_frag_refill

[PATCH net-next 1/3] net: allow > 0 order atomic page alloc in skb_page_frag_refill

[PATCH net-next 3/3] net: auto-tune mergeable rx buffer size for improved performance

[PATCH net-next 1/3] net: allow > 0 order atomic page alloc in skb_page_frag_refill

Seemingly Similar Threads