Wei Liu
2013-Mar-14 18:26 UTC
[PATCH RFC] xen-netback: coalesce frags before copying

This patch tries to coalesce txps when constructing grant copy
structures. It enables netback to deal with the situation when the
frontend's MAX_SKB_FRAGS is larger than the backend's MAX_SKB_FRAGS.

It also defines max_skb_frags, which is an estimation of the maximum
number of frags a guest can send; anything bigger than that is
considered malicious. Now it is set to 20, which should be enough to
accommodate Linux (16 to 19) and possibly Windows (19?).

Signed-off-by: Wei Liu <wei.liu2@citrix.com>
---
 drivers/net/xen-netback/netback.c | 164 +++++++++++++++++++++++++++++--------
 1 file changed, 128 insertions(+), 36 deletions(-)

diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 6e8e51a..e27f81a 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -47,9 +47,20 @@
 #include <asm/xen/hypercall.h>
 #include <asm/xen/page.h>
 
+/*
+ * This is an estimation of the maximum possible frags a SKB might
+ * have, anything larger than this is considered malicious. Typically
+ * Linux has 16 to 19, Windows has 19.
+ */
+#define NETBK_MAX_SKB_FRAGS_DEFAULT 20
+static unsigned int max_skb_frags = NETBK_MAX_SKB_FRAGS_DEFAULT;
+module_param(max_skb_frags, uint, 0444);
+
 struct pending_tx_info {
-	struct xen_netif_tx_request req;
+	struct xen_netif_tx_request req; /* points to the first txp */
 	struct xenvif *vif;
+	unsigned int nr_tx_req; /* how many tx reqs in the chain, at least 1 */
+	unsigned int start_idx; /* starting index in the pending ring */
 };
 typedef unsigned int pending_ring_idx_t;
 
@@ -251,7 +262,7 @@ static int max_required_rx_slots(struct xenvif *vif)
 	int max = DIV_ROUND_UP(vif->dev->mtu, PAGE_SIZE);
 
 	if (vif->can_sg || vif->gso || vif->gso_prefix)
-		max += MAX_SKB_FRAGS + 1; /* extra_info + frags */
+		max += max_skb_frags + 1; /* extra_info + frags */
 
 	return max;
 }
@@ -657,7 +668,7 @@ static void xen_netbk_rx_action(struct xen_netbk *netbk)
 		__skb_queue_tail(&rxq, skb);
 
 		/* Filled the batch queue? */
-		if (count + MAX_SKB_FRAGS >= XEN_NETIF_RX_RING_SIZE)
+		if (count + max_skb_frags >= XEN_NETIF_RX_RING_SIZE)
 			break;
 	}
 
@@ -920,7 +931,7 @@ static int netbk_count_requests(struct xenvif *vif,
 			return -ENODATA;
 		}
 
-		if (unlikely(frags >= MAX_SKB_FRAGS)) {
+		if (unlikely(frags >= max_skb_frags)) {
 			netdev_err(vif->dev, "Too many frags\n");
 			netbk_fatal_tx_err(vif);
 			return -E2BIG;
@@ -968,42 +979,98 @@ static struct gnttab_copy *xen_netbk_get_requests(struct xen_netbk *netbk,
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
 	skb_frag_t *frags = shinfo->frags;
 	u16 pending_idx = *((u16 *)skb->data);
-	int i, start;
+	u16 head_idx = 0;
+	int i, j, start;
+	struct page *page;
+	pending_ring_idx_t index;
+	uint16_t dst_offset;
+	int total_frags_merged;
+	unsigned int nr_frags = shinfo->nr_frags; /* This can be as large as max_skb_frags */
+	struct pending_tx_info *first = NULL;
+	int nr_txp;
 
 	/* Skip first skb fragment if it is on same page as header fragment. */
 	start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
 
-	for (i = start; i < shinfo->nr_frags; i++, txp++) {
-		struct page *page;
-		pending_ring_idx_t index;
+	/* Coalesce */
+	total_frags_merged = 0;
+	for (i = j = start; i < MAX_SKB_FRAGS && j < nr_frags; i++) {
 		struct pending_tx_info *pending_tx_info =
 			netbk->pending_tx_info;
 
-		index = pending_index(netbk->pending_cons++);
-		pending_idx = netbk->pending_ring[index];
-		page = xen_netbk_alloc_page(netbk, pending_idx);
+		page = alloc_page(GFP_KERNEL|__GFP_COLD);
 		if (!page)
 			goto err;
 
-		gop->source.u.ref = txp->gref;
-		gop->source.domid = vif->domid;
-		gop->source.offset = txp->offset;
-
-		gop->dest.u.gmfn = virt_to_mfn(page_address(page));
-		gop->dest.domid = DOMID_SELF;
-		gop->dest.offset = txp->offset;
-
-		gop->len = txp->size;
-		gop->flags = GNTCOPY_source_gref;
+		nr_txp = 0;
+		dst_offset = 0;
+		first = NULL;
+		while (dst_offset < PAGE_SIZE && j < nr_frags) {
+			gop->flags = GNTCOPY_source_gref;
+
+			gop->source.u.ref = txp->gref;
+			gop->source.domid = vif->domid;
+			gop->source.offset = txp->offset;
+
+			gop->dest.domid = DOMID_SELF;
+
+			gop->dest.offset = dst_offset;
+			gop->dest.u.gmfn = virt_to_mfn(page_address(page));
+
+			if (dst_offset + txp->size > PAGE_SIZE) {
+				/* This page can only merge a portion of txp */
+				gop->len = PAGE_SIZE - dst_offset;
+				txp->offset += gop->len;
+				txp->size -= gop->len;
+				dst_offset += gop->len; /* == PAGE_SIZE, will quit loop */
+			} else {
+				/* This txp can be merged in the page */
+				gop->len = txp->size;
+				dst_offset += gop->len;
+
+				index = pending_index(netbk->pending_cons++);
+
+				pending_idx = netbk->pending_ring[index];
+
+				memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp));
+				xenvif_get(vif);
+
+				pending_tx_info[pending_idx].vif = vif;
+
+				/*
+				 * Poison these fields, corrsponding
+				 * fields for head txp will be set to
+				 * correct values after the loop.
+				 */
+				pending_tx_info[pending_idx].nr_tx_req = (u16)(~(u16)0);
+				netbk->mmap_pages[pending_idx] = (void *)(~0UL);
+				pending_tx_info[pending_idx].start_idx = -1;
+
+				if (unlikely(!first)) {
+					first = &pending_tx_info[pending_idx];
+					first->start_idx = index;
+					head_idx = pending_idx;
+				}
+
+				txp++;
+				nr_txp++;
+				j++;
+			}
 
-		gop++;
+			gop++;
+		}
 
-		memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp));
-		xenvif_get(vif);
-		pending_tx_info[pending_idx].vif = vif;
-		frag_set_pending_idx(&frags[i], pending_idx);
+		first->req.offset = 0;
+		first->req.size = dst_offset;
+		first->nr_tx_req = nr_txp;
+		total_frags_merged += nr_txp - 1;
+		set_page_ext(page, netbk, head_idx);
+		netbk->mmap_pages[head_idx] = page;
+		frag_set_pending_idx(&frags[i], head_idx);
 	}
 
+	shinfo->nr_frags -= total_frags_merged;
+
 	return gop;
 err:
 	/* Unwind, freeing all pages and sending error responses. */
@@ -1025,6 +1092,7 @@ static int xen_netbk_tx_check_gop(struct xen_netbk *netbk,
 	struct gnttab_copy *gop = *gopp;
 	u16 pending_idx = *((u16 *)skb->data);
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
+	struct pending_tx_info *tx_info;
 	int nr_frags = shinfo->nr_frags;
 	int i, err, start;
 
@@ -1037,12 +1105,14 @@ static int xen_netbk_tx_check_gop(struct xen_netbk *netbk,
 	start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
 
 	for (i = start; i < nr_frags; i++) {
-		int j, newerr;
+		int j, newerr = 0, n;
 
 		pending_idx = frag_get_pending_idx(&shinfo->frags[i]);
+		tx_info = &netbk->pending_tx_info[pending_idx];
 
 		/* Check error status: if okay then remember grant handle. */
-		newerr = (++gop)->status;
+		for (n = 0; n < tx_info->nr_tx_req; n++)
+			newerr |= (++gop)->status;
 		if (likely(!newerr)) {
 			/* Had a previous error? Invalidate this fragment. */
 			if (unlikely(err))
@@ -1267,11 +1337,11 @@ static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk)
 		struct sk_buff *skb;
 		int ret;
 
-	while (((nr_pending_reqs(netbk) + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
+	while (((nr_pending_reqs(netbk) + max_skb_frags) < MAX_PENDING_REQS) &&
 		!list_empty(&netbk->net_schedule_list)) {
 		struct xenvif *vif;
 		struct xen_netif_tx_request txreq;
-		struct xen_netif_tx_request txfrags[MAX_SKB_FRAGS];
+		struct xen_netif_tx_request txfrags[max_skb_frags];
 		struct page *page;
 		struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX-1];
 		u16 pending_idx;
@@ -1359,7 +1429,7 @@ static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk)
 		pending_idx = netbk->pending_ring[index];
 
 		data_len = (txreq.size > PKT_PROT_LEN &&
-			    ret < MAX_SKB_FRAGS) ?
+			    ret < max_skb_frags) ?
 			PKT_PROT_LEN : txreq.size;
 
 		skb = alloc_skb(data_len + NET_SKB_PAD + NET_IP_ALIGN,
@@ -1409,11 +1479,14 @@ static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk)
 		memcpy(&netbk->pending_tx_info[pending_idx].req,
		       &txreq, sizeof(txreq));
 		netbk->pending_tx_info[pending_idx].vif = vif;
+		netbk->pending_tx_info[pending_idx].start_idx = index;
+		netbk->pending_tx_info[pending_idx].nr_tx_req = 1;
 		*((u16 *)skb->data) = pending_idx;
 
 		__skb_put(skb, data_len);
 
 		skb_shinfo(skb)->nr_frags = ret;
+		/* The first skb fragment will be on the same page as the header fragment. */
 		if (data_len < txreq.size) {
 			skb_shinfo(skb)->nr_frags++;
 			frag_set_pending_idx(&skb_shinfo(skb)->frags[0],
@@ -1540,6 +1613,11 @@ static void xen_netbk_idx_release(struct xen_netbk *netbk, u16 pending_idx,
 	struct xenvif *vif;
 	struct pending_tx_info *pending_tx_info;
 	pending_ring_idx_t index;
+	unsigned int nr = 0;
+	unsigned int i = 0;
+	unsigned int start_idx = 0;
+
+	BUG_ON(netbk->mmap_pages[pending_idx] == (void *)(~0UL));
 
 	/* Already complete? */
 	if (netbk->mmap_pages[pending_idx] == NULL)
@@ -1548,13 +1626,27 @@ static void xen_netbk_idx_release(struct xen_netbk *netbk, u16 pending_idx,
 	pending_tx_info = &netbk->pending_tx_info[pending_idx];
 
 	vif = pending_tx_info->vif;
+	nr = pending_tx_info->nr_tx_req;
+	start_idx = pending_tx_info->start_idx;
 
-	make_tx_response(vif, &pending_tx_info->req, status);
+	BUG_ON(nr == (u16)(~(u16)0));
 
-	index = pending_index(netbk->pending_prod++);
-	netbk->pending_ring[index] = pending_idx;
+	BUG_ON(netbk->pending_ring[pending_index(start_idx)] != pending_idx);
 
-	xenvif_put(vif);
+	for (i = 0; i < nr; i++) {
+		struct xen_netif_tx_request *txp;
+		unsigned int idx = pending_index(start_idx + i);
+		unsigned int info_idx = netbk->pending_ring[idx];
+
+		pending_tx_info = &netbk->pending_tx_info[info_idx];
+		txp = &pending_tx_info->req;
+		make_tx_response(vif, &pending_tx_info->req, status);
+
+		index = pending_index(netbk->pending_prod++);
+		netbk->pending_ring[index] = netbk->pending_ring[info_idx];
+
+		xenvif_put(vif);
+	}
 
 	netbk->mmap_pages[pending_idx]->mapping = 0;
 	put_page(netbk->mmap_pages[pending_idx]);
@@ -1613,7 +1705,7 @@ static inline int rx_work_todo(struct xen_netbk *netbk)
 static inline int tx_work_todo(struct xen_netbk *netbk)
 {
 
-	if (((nr_pending_reqs(netbk) + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
+	if (((nr_pending_reqs(netbk)+max_skb_frags) < MAX_PENDING_REQS) &&
 	    !list_empty(&netbk->net_schedule_list))
		return 1;
 
-- 
1.7.10.4
Too bad I don't know how to make frags > backend_default_value at the
moment. :-(

I only tested it in the normal case, and it works. If you see any
problem, please let me know.


Wei.
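[As an aside, the packing rule the patch's inner loop implements can be
seen in isolation with a small standalone sketch. This is not netback
code: the 4 KiB PAGE_SIZE, the slot sizes, and the main() driver below
are made up for illustration. Slots are appended into a destination
page at dst_offset; a slot that would straddle the page boundary is
split, with the part that fits copied now and the remainder becoming
the first copy op of the next page.]

#include <stdio.h>

#define PAGE_SIZE 4096u /* assumed 4 KiB pages */

/* Toy stand-in for a frontend tx request (txp): offset and size only. */
struct slot { unsigned int offset, size; };

int main(void)
{
	/* Hypothetical slot sizes; the third one straddles a page. */
	struct slot slots[] = { {0, 1000}, {0, 2000}, {0, 3000}, {0, 500} };
	unsigned int nslots = sizeof(slots) / sizeof(slots[0]);
	unsigned int j = 0, page = 0;

	while (j < nslots) {
		unsigned int dst_offset = 0;

		/* One destination page, mirroring the inner while loop. */
		while (dst_offset < PAGE_SIZE && j < nslots) {
			unsigned int len;

			if (dst_offset + slots[j].size > PAGE_SIZE) {
				/* Split: copy what fits; the remainder is
				 * picked up by the next page's first op. */
				len = PAGE_SIZE - dst_offset;
				slots[j].offset += len;
				slots[j].size -= len;
			} else {
				/* Whole remaining slot fits in this page. */
				len = slots[j].size;
				j++; /* the slot is consumed only here */
			}
			printf("gop: page %u dst_off %4u len %4u\n",
			       page, dst_offset, len);
			dst_offset += len;
		}
		page++;
	}
	return 0;
}

[With these numbers the third slot is emitted as a 1096-byte copy at
the end of page 0 and a 1904-byte copy at the start of page 1, which is
the same dst_offset/gop->len arithmetic the patch performs.]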
Jacek Milewicz
2013-Mar-14 18:58 UTC
Re: [PATCH RFC] xen-netback: coalesce frags before copying
> Too bad I don't know how to make frags > backend_default_value at the
> moment. :-(
>
> I only tested it in the normal case, and it works. If you see any
> problem, please let me know.
>

Heavy NFS load triggered it reliably after only a few MBs transferred.
On Thu, 2013-03-14 at 18:58 +0000, Jacek Milewicz wrote:
> > Too bad I don't know how to make frags > backend_default_value at the
> > moment. :-(
> >
> > I only tested it in the normal case, and it works. If you see any
> > problem, please let me know.
> >
> Heavy NFS load triggered it reliably after only a few MBs transferred.
>

So I just tested it with NFS. I saw frags from the frontend go up to 18
while my backend only supports 17. It worked. :-)


Wei.
Ian Campbell
2013-Mar-15 09:23 UTC
Re: [PATCH RFC] xen-netback: coalesce frags before copying
On Thu, 2013-03-14 at 18:26 +0000, Wei Liu wrote:
> This patch tries to coalesce txps when constructing grant copy
> structures. It enables netback to deal with the situation when the
> frontend's MAX_SKB_FRAGS is larger than the backend's MAX_SKB_FRAGS.
>
> It also defines max_skb_frags, which is an estimation of the maximum
> number of frags a guest can send; anything bigger than that is
> considered malicious. Now it is set to 20, which should be enough to
> accommodate Linux (16 to 19) and possibly Windows (19?).
>
> Signed-off-by: Wei Liu <wei.liu2@citrix.com>
> ---
>  drivers/net/xen-netback/netback.c | 164 +++++++++++++++++++++++++++++--------
>  1 file changed, 128 insertions(+), 36 deletions(-)
>
> diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
> index 6e8e51a..e27f81a 100644
> --- a/drivers/net/xen-netback/netback.c
> +++ b/drivers/net/xen-netback/netback.c
> @@ -47,9 +47,20 @@
>  #include <asm/xen/hypercall.h>
>  #include <asm/xen/page.h>
>
> +/*
> + * This is an estimation of the maximum possible frags a SKB might
> + * have, anything larger than this is considered malicious. Typically
> + * Linux has 16 to 19, Windows has 19.

Perhaps rather than this static (although configurable)

> + */
> +#define NETBK_MAX_SKB_FRAGS_DEFAULT 20
> +static unsigned int max_skb_frags = NETBK_MAX_SKB_FRAGS_DEFAULT;
> +module_param(max_skb_frags, uint, 0444);

You could at least make this 0644 so root can up the limit dynamically?

> [...]
Ian Campbell
2013-Mar-15 09:29 UTC
Re: [PATCH RFC] xen-netback: coalesce frags before copying
On Fri, 2013-03-15 at 09:23 +0000, Ian Campbell wrote:
> On Thu, 2013-03-14 at 18:26 +0000, Wei Liu wrote:
> > This patch tries to coalesce txps when constructing grant copy
> > structures. It enables netback to deal with the situation when the
> > frontend's MAX_SKB_FRAGS is larger than the backend's MAX_SKB_FRAGS.
> >
> > It also defines max_skb_frags, which is an estimation of the maximum
> > number of frags a guest can send; anything bigger than that is
> > considered malicious. Now it is set to 20, which should be enough to
> > accommodate Linux (16 to 19) and possibly Windows (19?).
> >
> > Signed-off-by: Wei Liu <wei.liu2@citrix.com>
> > ---
> >  drivers/net/xen-netback/netback.c | 164 +++++++++++++++++++++++++++++--------
> >  1 file changed, 128 insertions(+), 36 deletions(-)
> >
> > diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
> > index 6e8e51a..e27f81a 100644
> > --- a/drivers/net/xen-netback/netback.c
> > +++ b/drivers/net/xen-netback/netback.c
> > @@ -47,9 +47,20 @@
> >  #include <asm/xen/hypercall.h>
> >  #include <asm/xen/page.h>
> >
> > +/*
> > + * This is an estimation of the maximum possible frags a SKB might
> > + * have, anything larger than this is considered malicious. Typically
> > + * Linux has 16 to 19, Windows has 19.
>
> Perhaps rather than this static (although configurable)

Slipped and hit send... proper reply is forthcoming...
Ian Campbell
2013-Mar-15 10:42 UTC
Re: [PATCH RFC] xen-netback: coalesce frags before copying
On Thu, 2013-03-14 at 18:26 +0000, Wei Liu wrote:
> This patch tries to coalesce txps when constructing grant copy
> structures. It enables netback to deal with the situation when the
> frontend's MAX_SKB_FRAGS is larger than the backend's MAX_SKB_FRAGS.
>
> It also defines max_skb_frags, which is an estimation of the maximum
> number of frags a guest can send; anything bigger than that is
> considered malicious. Now it is set to 20, which should be enough to
> accommodate Linux (16 to 19) and possibly Windows (19?).
>
> Signed-off-by: Wei Liu <wei.liu2@citrix.com>
> ---
>  drivers/net/xen-netback/netback.c | 164 +++++++++++++++++++++++++++++--------
>  1 file changed, 128 insertions(+), 36 deletions(-)
>
> diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
> index 6e8e51a..e27f81a 100644
> --- a/drivers/net/xen-netback/netback.c
> +++ b/drivers/net/xen-netback/netback.c
> @@ -47,9 +47,20 @@
>  #include <asm/xen/hypercall.h>
>  #include <asm/xen/page.h>
>
> +/*
> + * This is an estimation of the maximum possible frags a SKB might
> + * have, anything larger than this is considered malicious. Typically
> + * Linux has 16 to 19, Windows has 19.

Perhaps rather than this static (although configurable) limit we could
avoid malicious guests by aborting only if we see a slot with invalid
contents? I think the XSA-39 issue doesn't exist so long as each request
results in a response (which a valid slot does).

Although I'm also sympathetic to the idea that this approach is simpler
and therefore expedient in the short term, and we can consider the more
complicated approach above for the future.

> + */
> +#define NETBK_MAX_SKB_FRAGS_DEFAULT 20
> +static unsigned int max_skb_frags = NETBK_MAX_SKB_FRAGS_DEFAULT;
> +module_param(max_skb_frags, uint, 0444);

You could at least make this 0644 so root can up the limit dynamically?

> @@ -968,42 +979,98 @@ static struct gnttab_copy *xen_netbk_get_requests(struct xen_netbk *netbk,
>  	struct skb_shared_info *shinfo = skb_shinfo(skb);
>  	skb_frag_t *frags = shinfo->frags;
>  	u16 pending_idx = *((u16 *)skb->data);
> -	int i, start;
> +	u16 head_idx = 0;
> +	int i, j, start;
> +	struct page *page;
> +	pending_ring_idx_t index;
> +	uint16_t dst_offset;
> +	int total_frags_merged;
> +	unsigned int nr_frags = shinfo->nr_frags; /* This can be as large as max_skb_frags */
> +	struct pending_tx_info *first = NULL;
> +	int nr_txp;
>
>  	/* Skip first skb fragment if it is on same page as header fragment. */
>  	start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
>
> -	for (i = start; i < shinfo->nr_frags; i++, txp++) {
> -		struct page *page;
> -		pending_ring_idx_t index;
> +	/* Coalesce */
> +	total_frags_merged = 0;
> +	for (i = j = start; i < MAX_SKB_FRAGS && j < nr_frags; i++) {

I can't see any code which causes us to drop the frag if we end up with
i >= MAX_SKB_FRAGS? Is it just too subtle for me?

I wonder if this code might all be a bit clearer if:
        s/nr_frags/nr_slots/
        s/j/slots/
        s/i/frags/
? Or possibly replace i with shinfo->nr_frags (if that works).

I think s/frags/slots/ in max_skb_frags too BTW. Keeping the concept of
skb frags separate from the number of ring slots required to represent
those frags is less confusing here and also helpful when you consider
higher order frags.

>  		struct pending_tx_info *pending_tx_info =
>  			netbk->pending_tx_info;
>
> -		index = pending_index(netbk->pending_cons++);
> -		pending_idx = netbk->pending_ring[index];
> -		page = xen_netbk_alloc_page(netbk, pending_idx);
> +		page = alloc_page(GFP_KERNEL|__GFP_COLD);

Possibly for future enhancement we could use netdev_alloc_frag() and
take advantage of higher order pages here?

>  		if (!page)
>  			goto err;
>
> -		gop->source.u.ref = txp->gref;
> -		gop->source.domid = vif->domid;
> -		gop->source.offset = txp->offset;
> -
> -		gop->dest.u.gmfn = virt_to_mfn(page_address(page));
> -		gop->dest.domid = DOMID_SELF;
> -		gop->dest.offset = txp->offset;
> -
> -		gop->len = txp->size;
> -		gop->flags = GNTCOPY_source_gref;
> +		nr_txp = 0;
> +		dst_offset = 0;
> +		first = NULL;
> +		while (dst_offset < PAGE_SIZE && j < nr_frags) {

I think splitting the handling of a single request slot into a function
would be useful for clarity, e.g. xen_netbk_handle_one_request or
something.

> +			gop->flags = GNTCOPY_source_gref;
> +
> +			gop->source.u.ref = txp->gref;
> +			gop->source.domid = vif->domid;
> +			gop->source.offset = txp->offset;
> +
> +			gop->dest.domid = DOMID_SELF;
> +
> +			gop->dest.offset = dst_offset;
> +			gop->dest.u.gmfn = virt_to_mfn(page_address(page));
> +
> +			if (dst_offset + txp->size > PAGE_SIZE) {
> +				/* This page can only merge a portion of txp */
> +				gop->len = PAGE_SIZE - dst_offset;
> +				txp->offset += gop->len;
> +				txp->size -= gop->len;
> +				dst_offset += gop->len; /* == PAGE_SIZE, will quit loop */

In this case we don't touch pending_cons etc? Is that because when we
hit this case we always go around again and will eventually hit the else
case below, ensuring we do the pending_cons stuff exactly once for the
tail end of the buffer? I think that is OK but it is worthy of a
comment.

What happens if we hit SKB_FRAG_MAX right as we hit this case? Might we
fail to respond to the final request which has been split in this way?

> +			} else {
> +				/* This txp can be merged in the page */
> +				gop->len = txp->size;
> +				dst_offset += gop->len;
> +
> +				index = pending_index(netbk->pending_cons++);
> +
> +				pending_idx = netbk->pending_ring[index];
> +
> +				memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp));
> +				xenvif_get(vif);
> +
> +				pending_tx_info[pending_idx].vif = vif;
> +
> +				/*
> +				 * Poison these fields, corrsponding

"corresponding"

> +				 * fields for head txp will be set to
> +				 * correct values after the loop.
> +				 */
> +				pending_tx_info[pending_idx].nr_tx_req = (u16)(~(u16)0);
> +				netbk->mmap_pages[pending_idx] = (void *)(~0UL);
> +				pending_tx_info[pending_idx].start_idx = -1;
> +
> +				if (unlikely(!first)) {
> +					first = &pending_tx_info[pending_idx];
> +					first->start_idx = index;
> +					head_idx = pending_idx;
> +				}
> +
> +				txp++;
> +				nr_txp++;
> +				j++;
> +			}
>
> -		gop++;
> +			gop++;
> +		}
>
> -		memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp));
> -		xenvif_get(vif);
> -		pending_tx_info[pending_idx].vif = vif;
> -		frag_set_pending_idx(&frags[i], pending_idx);
> +		first->req.offset = 0;
> +		first->req.size = dst_offset;
> +		first->nr_tx_req = nr_txp;
> +		total_frags_merged += nr_txp - 1;
> +		set_page_ext(page, netbk, head_idx);
> +		netbk->mmap_pages[head_idx] = page;
> +		frag_set_pending_idx(&frags[i], head_idx);
>  	}
>
> +	shinfo->nr_frags -= total_frags_merged;

This adjustment seems a bit odd, why not just count them correctly in
the first place?

> +
>  	return gop;
>  err:
>  	/* Unwind, freeing all pages and sending error responses. */
> @@ -1025,6 +1092,7 @@ static int xen_netbk_tx_check_gop(struct xen_netbk *netbk,
>  	struct gnttab_copy *gop = *gopp;
>  	u16 pending_idx = *((u16 *)skb->data);
>  	struct skb_shared_info *shinfo = skb_shinfo(skb);
> +	struct pending_tx_info *tx_info;
>  	int nr_frags = shinfo->nr_frags;
>  	int i, err, start;
>
> @@ -1037,12 +1105,14 @@ static int xen_netbk_tx_check_gop(struct xen_netbk *netbk,
>  	start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
>
>  	for (i = start; i < nr_frags; i++) {
> -		int j, newerr;
> +		int j, newerr = 0, n;
>
>  		pending_idx = frag_get_pending_idx(&shinfo->frags[i]);
> +		tx_info = &netbk->pending_tx_info[pending_idx];
>
>  		/* Check error status: if okay then remember grant handle. */
> -		newerr = (++gop)->status;
> +		for (n = 0; n < tx_info->nr_tx_req; n++)
> +			newerr |= (++gop)->status;

I don't think GNTST_* are or-able in this way... I think for each
request the response should contain the status of the first failed grant
op relating to that slot, or success.

>  		if (likely(!newerr)) {
>  			/* Had a previous error? Invalidate this fragment. */
>  			if (unlikely(err))
> @@ -1267,11 +1337,11 @@ static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk)
>  		struct sk_buff *skb;
>  		int ret;
>
> -	while (((nr_pending_reqs(netbk) + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
> +	while (((nr_pending_reqs(netbk) + max_skb_frags) < MAX_PENDING_REQS) &&
>  		!list_empty(&netbk->net_schedule_list)) {
>  		struct xenvif *vif;
>  		struct xen_netif_tx_request txreq;
> -		struct xen_netif_tx_request txfrags[MAX_SKB_FRAGS];
> +		struct xen_netif_tx_request txfrags[max_skb_frags];

I guess this stack variable puts an implicit limit on max_skb_frags. It
also argues against being able to change it dynamically as I suggested
above (imagine a race between setting up the stack here and checking
against max_skb_frags later).

> @@ -1548,13 +1626,27 @@ static void xen_netbk_idx_release(struct xen_netbk *netbk, u16 pending_idx,
>  	pending_tx_info = &netbk->pending_tx_info[pending_idx];
>
>  	vif = pending_tx_info->vif;
> +	nr = pending_tx_info->nr_tx_req;
> +	start_idx = pending_tx_info->start_idx;
>
> -	make_tx_response(vif, &pending_tx_info->req, status);
> +	BUG_ON(nr == (u16)(~(u16)0));

Is one of those casts not redundant?

Ian.
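[For concreteness, the "first failed grant op" semantics suggested above
could look like the sketch below. This is a standalone toy, not the
actual follow-up patch. It assumes the usual grant-table status
convention (GNTST_okay == 0, failures are distinct negative codes),
which is also why OR-ing statuses is wrong: OR-ing two different
negative codes yields a bit pattern, typically -1, that need not match
the op that actually failed.]

#include <stdio.h>

#define GNTST_okay 0 /* success; real GNTST_* errors are negative */

int main(void)
{
	/* Stand-ins for the per-slot (++gop)->status values making up
	 * one coalesced frag: ok, then a hypothetical -3, then ok. */
	int status[] = { GNTST_okay, -3, GNTST_okay };
	int nr_tx_req = 3, n;
	int newerr = GNTST_okay;

	for (n = 0; n < nr_tx_req; n++) {
		/* First failure wins; later statuses never overwrite it. */
		if (status[n] != GNTST_okay && newerr == GNTST_okay)
			newerr = status[n];
	}

	printf("newerr = %d\n", newerr); /* -3, not an OR-ed blend */
	return 0;
}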
On Fri, 2013-03-15 at 10:42 +0000, Ian Campbell wrote:
> On Thu, 2013-03-14 at 18:26 +0000, Wei Liu wrote:
> > This patch tries to coalesce txps when constructing grant copy
> > structures. It enables netback to deal with the situation when the
> > frontend's MAX_SKB_FRAGS is larger than the backend's MAX_SKB_FRAGS.
> >
> > It also defines max_skb_frags, which is an estimation of the maximum
> > number of frags a guest can send; anything bigger than that is
> > considered malicious. Now it is set to 20, which should be enough to
> > accommodate Linux (16 to 19) and possibly Windows (19?).
> >
> > Signed-off-by: Wei Liu <wei.liu2@citrix.com>
> > ---
> >  drivers/net/xen-netback/netback.c | 164 +++++++++++++++++++++++++++++--------
> >  1 file changed, 128 insertions(+), 36 deletions(-)
> >
> > diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
> > index 6e8e51a..e27f81a 100644
> > --- a/drivers/net/xen-netback/netback.c
> > +++ b/drivers/net/xen-netback/netback.c
> > @@ -47,9 +47,20 @@
> >  #include <asm/xen/hypercall.h>
> >  #include <asm/xen/page.h>
> >
> > +/*
> > + * This is an estimation of the maximum possible frags a SKB might
> > + * have, anything larger than this is considered malicious. Typically
> > + * Linux has 16 to 19, Windows has 19.
>
> Perhaps rather than this static (although configurable) limit we could
> avoid malicious guests by aborting only if we see a slot with invalid
> contents? I think the XSA-39 issue doesn't exist so long as each request
> results in a response (which a valid slot does).
>
> Although I'm also sympathetic to the idea that this approach is simpler
> and therefore expedient in the short term, and we can consider the more
> complicated approach above for the future.
>
> > + */
> > +#define NETBK_MAX_SKB_FRAGS_DEFAULT 20
> > +static unsigned int max_skb_frags = NETBK_MAX_SKB_FRAGS_DEFAULT;
> > +module_param(max_skb_frags, uint, 0444);
>
> You could at least make this 0644 so root can up the limit dynamically?
>

This was done on purpose. Changing this on the fly can cause a race
condition. If we are up to changing this, we need to write more code to
make sure all worker threads are in a quiescent state, which adds a
layer of complexity.

> > @@ -968,42 +979,98 @@ static struct gnttab_copy *xen_netbk_get_requests(struct xen_netbk *netbk,
> >  	struct skb_shared_info *shinfo = skb_shinfo(skb);
> >  	skb_frag_t *frags = shinfo->frags;
> >  	u16 pending_idx = *((u16 *)skb->data);
> > -	int i, start;
> > +	u16 head_idx = 0;
> > +	int i, j, start;
> > +	struct page *page;
> > +	pending_ring_idx_t index;
> > +	uint16_t dst_offset;
> > +	int total_frags_merged;
> > +	unsigned int nr_frags = shinfo->nr_frags; /* This can be as large as max_skb_frags */
> > +	struct pending_tx_info *first = NULL;
> > +	int nr_txp;
> >
> >  	/* Skip first skb fragment if it is on same page as header fragment. */
> >  	start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
> >
> > -	for (i = start; i < shinfo->nr_frags; i++, txp++) {
> > -		struct page *page;
> > -		pending_ring_idx_t index;
> > +	/* Coalesce */
> > +	total_frags_merged = 0;
> > +	for (i = j = start; i < MAX_SKB_FRAGS && j < nr_frags; i++) {
>
> I can't see any code which causes us to drop the frag if we end up with
> i >= MAX_SKB_FRAGS? Is it just too subtle for me?
>

The basic assumption is that it is not possible to reach i >=
MAX_SKB_FRAGS, since the maximum packet size supported in the backend is
64K and we can always accommodate a packet. Any packet larger than 64K
causes a fatal error early on.

I could remove "i < MAX_SKB_FRAGS" if this causes confusion, it is just
a redundant check.

> I wonder if this code might all be a bit clearer if:
>         s/nr_frags/nr_slots/
>         s/j/slots/
>         s/i/frags/
> ? Or possibly replace i with shinfo->nr_frags (if that works).
>
> I think s/frags/slots/ in max_skb_frags too BTW. Keeping the concept of
> skb frags separate from the number of ring slots required to represent
> those frags is less confusing here and also helpful when you consider
> higher order frags.
>

Sure.

> >  		struct pending_tx_info *pending_tx_info =
> >  			netbk->pending_tx_info;
> >
> > -		index = pending_index(netbk->pending_cons++);
> > -		pending_idx = netbk->pending_ring[index];
> > -		page = xen_netbk_alloc_page(netbk, pending_idx);
> > +		page = alloc_page(GFP_KERNEL|__GFP_COLD);
>
> Possibly for future enhancement we could use netdev_alloc_frag() and
> take advantage of higher order pages here?
>

Sure, but I think that deserves a separate patch. This patch is more of
a fix than an improvement.

> >  		if (!page)
> >  			goto err;
> >
> > -		gop->source.u.ref = txp->gref;
> > -		gop->source.domid = vif->domid;
> > -		gop->source.offset = txp->offset;
> > -
> > -		gop->dest.u.gmfn = virt_to_mfn(page_address(page));
> > -		gop->dest.domid = DOMID_SELF;
> > -		gop->dest.offset = txp->offset;
> > -
> > -		gop->len = txp->size;
> > -		gop->flags = GNTCOPY_source_gref;
> > +		nr_txp = 0;
> > +		dst_offset = 0;
> > +		first = NULL;
> > +		while (dst_offset < PAGE_SIZE && j < nr_frags) {
>
> I think splitting the handling of a single request slot into a function
> would be useful for clarity, e.g. xen_netbk_handle_one_request or
> something.
>
> > +			gop->flags = GNTCOPY_source_gref;
> > +
> > +			gop->source.u.ref = txp->gref;
> > +			gop->source.domid = vif->domid;
> > +			gop->source.offset = txp->offset;
> > +
> > +			gop->dest.domid = DOMID_SELF;
> > +
> > +			gop->dest.offset = dst_offset;
> > +			gop->dest.u.gmfn = virt_to_mfn(page_address(page));
> > +
> > +			if (dst_offset + txp->size > PAGE_SIZE) {
> > +				/* This page can only merge a portion of txp */
> > +				gop->len = PAGE_SIZE - dst_offset;
> > +				txp->offset += gop->len;
> > +				txp->size -= gop->len;
> > +				dst_offset += gop->len; /* == PAGE_SIZE, will quit loop */
>
> In this case we don't touch pending_cons etc? Is that because when we
> hit this case we always go around again and will eventually hit the else
> case below, ensuring we do the pending_cons stuff exactly once for the
> tail end of the buffer? I think that is OK but it is worthy of a
> comment.
>

Yes, we cannot touch pending_cons in this case, because this slot needs
to be coped with in the next loop iteration, using another gop.

> What happens if we hit SKB_FRAG_MAX right as we hit this case? Might we
> fail to respond to the final request which has been split in this way?
>

As stated above, we won't hit i >= MAX_SKB_FRAGS.

> > +			} else {
[...]
> > +		}
> >
> > -		memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp));
> > -		xenvif_get(vif);
> > -		pending_tx_info[pending_idx].vif = vif;
> > -		frag_set_pending_idx(&frags[i], pending_idx);
> > +		first->req.offset = 0;
> > +		first->req.size = dst_offset;
> > +		first->nr_tx_req = nr_txp;
> > +		total_frags_merged += nr_txp - 1;
> > +		set_page_ext(page, netbk, head_idx);
> > +		netbk->mmap_pages[head_idx] = page;
> > +		frag_set_pending_idx(&frags[i], head_idx);
> >  	}
> >
> > +	shinfo->nr_frags -= total_frags_merged;
>
> This adjustment seems a bit odd, why not just count them correctly in
> the first place?
>

Sure.

> > +
> >  	return gop;
> >  err:
> >  	/* Unwind, freeing all pages and sending error responses. */
> > @@ -1025,6 +1092,7 @@ static int xen_netbk_tx_check_gop(struct xen_netbk *netbk,
> >  	struct gnttab_copy *gop = *gopp;
> >  	u16 pending_idx = *((u16 *)skb->data);
> >  	struct skb_shared_info *shinfo = skb_shinfo(skb);
> > +	struct pending_tx_info *tx_info;
> >  	int nr_frags = shinfo->nr_frags;
> >  	int i, err, start;
> >
> > @@ -1037,12 +1105,14 @@ static int xen_netbk_tx_check_gop(struct xen_netbk *netbk,
> >  	start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
> >
> >  	for (i = start; i < nr_frags; i++) {
> > -		int j, newerr;
> > +		int j, newerr = 0, n;
> >
> >  		pending_idx = frag_get_pending_idx(&shinfo->frags[i]);
> > +		tx_info = &netbk->pending_tx_info[pending_idx];
> >
> >  		/* Check error status: if okay then remember grant handle. */
> > -		newerr = (++gop)->status;
> > +		for (n = 0; n < tx_info->nr_tx_req; n++)
> > +			newerr |= (++gop)->status;
>
> I don't think GNTST_* are or-able in this way... I think for each
> request the response should contain the status of the first failed grant
> op relating to that slot, or success.
>

Fixed.

> >  		if (likely(!newerr)) {
> >  			/* Had a previous error? Invalidate this fragment. */
> >  			if (unlikely(err))
> > @@ -1267,11 +1337,11 @@ static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk)
> >  		struct sk_buff *skb;
> >  		int ret;
> >
> > -	while (((nr_pending_reqs(netbk) + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
> > +	while (((nr_pending_reqs(netbk) + max_skb_frags) < MAX_PENDING_REQS) &&
> > 		!list_empty(&netbk->net_schedule_list)) {
> >  		struct xenvif *vif;
> >  		struct xen_netif_tx_request txreq;
> > -		struct xen_netif_tx_request txfrags[MAX_SKB_FRAGS];
> > +		struct xen_netif_tx_request txfrags[max_skb_frags];
>
> I guess this stack variable puts an implicit limit on max_skb_frags. It
> also argues against being able to change it dynamically as I suggested
> above (imagine a race between setting up the stack here and checking
> against max_skb_frags later).
>

Yes. See above explanation.


Wei.
Ian Campbell
2013-Mar-15 13:25 UTC
Re: [PATCH RFC] xen-netback: coalesce frags before copying
On Fri, 2013-03-15 at 12:31 +0000, Wei Liu wrote:
> On Fri, 2013-03-15 at 10:42 +0000, Ian Campbell wrote:
> > On Thu, 2013-03-14 at 18:26 +0000, Wei Liu wrote:
> >
> > > + */
> > > +#define NETBK_MAX_SKB_FRAGS_DEFAULT 20
> > > +static unsigned int max_skb_frags = NETBK_MAX_SKB_FRAGS_DEFAULT;
> > > +module_param(max_skb_frags, uint, 0444);
> >
> > You could at least make this 0644 so root can up the limit dynamically?
> >
>
> This was done on purpose. Changing this on the fly can cause a race
> condition. If we are up to changing this, we need to write more code to
> make sure all worker threads are in a quiescent state, which adds a
> layer of complexity.

Right, I figured this out myself further down ;-)

> > > -	for (i = start; i < shinfo->nr_frags; i++, txp++) {
> > > -		struct page *page;
> > > -		pending_ring_idx_t index;
> > > +	/* Coalesce */
> > > +	total_frags_merged = 0;
> > > +	for (i = j = start; i < MAX_SKB_FRAGS && j < nr_frags; i++) {
> >
> > I can't see any code which causes us to drop the frag if we end up
> > with i >= MAX_SKB_FRAGS? Is it just too subtle for me?
> >
>
> The basic assumption is that it is not possible to reach i >=
> MAX_SKB_FRAGS, since the maximum packet size supported in the backend is
> 64K and we can always accommodate a packet. Any packet larger than 64K
> causes a fatal error early on.

Does that early check also account for frontends which lie in the
initial slot->size, or is that otherwise caught later too? i.e. do we do
the right thing when there are more bytes in the additional slots than
the initial header said there would be? I expect we already do, I just
want to be sure this doesn't subtly alter the conditions.

> I could remove "i < MAX_SKB_FRAGS" if this causes confusion, it is just
> a redundant check.

Is it an actual invariant? i.e. could it be BUG_ON?

> > >  		struct pending_tx_info *pending_tx_info =
> > >  			netbk->pending_tx_info;
> > >
> > > -		index = pending_index(netbk->pending_cons++);
> > > -		pending_idx = netbk->pending_ring[index];
> > > -		page = xen_netbk_alloc_page(netbk, pending_idx);
> > > +		page = alloc_page(GFP_KERNEL|__GFP_COLD);
> >
> > Possibly for future enhancement we could use netdev_alloc_frag() and
> > take advantage of higher order pages here?
> >
>
> Sure, but I think that deserves a separate patch. This patch is more of
> a fix than an improvement.

Right, by "future enhancement" I meant a future patch, not a change to
this one.

> > > +			gop->flags = GNTCOPY_source_gref;
> > > +
> > > +			gop->source.u.ref = txp->gref;
> > > +			gop->source.domid = vif->domid;
> > > +			gop->source.offset = txp->offset;
> > > +
> > > +			gop->dest.domid = DOMID_SELF;
> > > +
> > > +			gop->dest.offset = dst_offset;
> > > +			gop->dest.u.gmfn = virt_to_mfn(page_address(page));
> > > +
> > > +			if (dst_offset + txp->size > PAGE_SIZE) {
> > > +				/* This page can only merge a portion of txp */
> > > +				gop->len = PAGE_SIZE - dst_offset;
> > > +				txp->offset += gop->len;
> > > +				txp->size -= gop->len;
> > > +				dst_offset += gop->len; /* == PAGE_SIZE, will quit loop */
> >
> > In this case we don't touch pending_cons etc? Is that because when we
> > hit this case we always go around again and will eventually hit the
> > else case below, ensuring we do the pending_cons stuff exactly once
> > for the tail end of the buffer? I think that is OK but it is worthy of
> > a comment.
> >
>
> Yes, we cannot touch pending_cons in this case, because this slot needs
> to be coped with in the next loop iteration, using another gop.
>
> > What happens if we hit SKB_FRAG_MAX right as we hit this case? Might
> > we fail to respond to the final request which has been split in this
> > way?
> >
>
> As stated above, we won't hit i >= MAX_SKB_FRAGS.

OK, same question for i == MAX_SKB_FRAGS - 1 then ;-) Or whatever
condition would be needed for us to exit here without ever getting to
this else clause.

Ian.
On Fri, 2013-03-15 at 13:25 +0000, Ian Campbell wrote:
> > > > -	for (i = start; i < shinfo->nr_frags; i++, txp++) {
> > > > -		struct page *page;
> > > > -		pending_ring_idx_t index;
> > > > +	/* Coalesce */
> > > > +	total_frags_merged = 0;
> > > > +	for (i = j = start; i < MAX_SKB_FRAGS && j < nr_frags; i++) {
> > >
> > > I can't see any code which causes us to drop the frag if we end up
> > > with i >= MAX_SKB_FRAGS? Is it just too subtle for me?
> > >
> >
> > The basic assumption is that it is not possible to reach i >=
> > MAX_SKB_FRAGS, since the maximum packet size supported in the backend
> > is 64K and we can always accommodate a packet. Any packet larger than
> > 64K causes a fatal error early on.
>
> Does that early check also account for frontends which lie in the
> initial slot->size, or is that otherwise caught later too? i.e. do we do
> the right thing when there are more bytes in the additional slots than
> the initial header said there would be? I expect we already do, I just
> want to be sure this doesn't subtly alter the conditions.
>

We already do that in netbk_count_requests().

> > I could remove "i < MAX_SKB_FRAGS" if this causes confusion, it is
> > just a redundant check.
>
> Is it an actual invariant? i.e. could it be BUG_ON?
>

Yes. I've added BUG_ON(shinfo->nr_frags > MAX_SKB_FRAGS) before
returning to the caller.

> > > > +			gop->flags = GNTCOPY_source_gref;
> > > > +
> > > > +			gop->source.u.ref = txp->gref;
> > > > +			gop->source.domid = vif->domid;
> > > > +			gop->source.offset = txp->offset;
> > > > +
> > > > +			gop->dest.domid = DOMID_SELF;
> > > > +
> > > > +			gop->dest.offset = dst_offset;
> > > > +			gop->dest.u.gmfn = virt_to_mfn(page_address(page));
> > > > +
> > > > +			if (dst_offset + txp->size > PAGE_SIZE) {
> > > > +				/* This page can only merge a portion of txp */
> > > > +				gop->len = PAGE_SIZE - dst_offset;
> > > > +				txp->offset += gop->len;
> > > > +				txp->size -= gop->len;
> > > > +				dst_offset += gop->len; /* == PAGE_SIZE, will quit loop */
> > >
> > > In this case we don't touch pending_cons etc? Is that because when
> > > we hit this case we always go around again and will eventually hit
> > > the else case below, ensuring we do the pending_cons stuff exactly
> > > once for the tail end of the buffer? I think that is OK but it is
> > > worthy of a comment.
> > >
> >
> > Yes, we cannot touch pending_cons in this case, because this slot
> > needs to be coped with in the next loop iteration, using another gop.
> >
> > > What happens if we hit SKB_FRAG_MAX right as we hit this case? Might
> > > we fail to respond to the final request which has been split in this
> > > way?
> > >
> >
> > As stated above, we won't hit i >= MAX_SKB_FRAGS.
>
> OK, same question for i == MAX_SKB_FRAGS - 1 then ;-) Or whatever
> condition would be needed for us to exit here without ever getting to
> this else clause.
>

If we exit the outer for-loop before processing all tx requests for a
packet, that means this packet is larger than 64K. But at this point we
already know that this packet will not be larger than 64K. After
removing the "i < MAX_SKB_FRAGS" check, we can only exit the for-loop
once we finish processing all tx requests for a packet. And the slot
counter j only advances after we finish processing a tx request, in the
'else' clause.


Wei.
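[A back-of-envelope version of that invariant, assuming 4 KiB pages and
the usual Linux definition MAX_SKB_FRAGS = 65536/PAGE_SIZE + 2 = 18;
both are assumptions, not figures quoted from the thread:

  max packet size          = 64 KiB (larger packets hit a fatal error earlier)
  coalesced pages needed  <= 64 KiB / 4 KiB = 16
                             (every page except the last is filled
                             completely before i advances)
  header-page frag         = at most 1 (when start == 1)

  worst case for i         = 16 + 1 = 17 < MAX_SKB_FRAGS = 18

so the outer loop runs out of tx requests before it can run out of frag
slots.]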
On Fri, 2013-03-15 at 10:42 +0000, Ian Campbell wrote:
> >  			goto err;
> >
> > -		gop->source.u.ref = txp->gref;
> > -		gop->source.domid = vif->domid;
> > -		gop->source.offset = txp->offset;
> > -
> > -		gop->dest.u.gmfn = virt_to_mfn(page_address(page));
> > -		gop->dest.domid = DOMID_SELF;
> > -		gop->dest.offset = txp->offset;
> > -
> > -		gop->len = txp->size;
> > -		gop->flags = GNTCOPY_source_gref;
> > +		nr_txp = 0;
> > +		dst_offset = 0;
> > +		first = NULL;
> > +		while (dst_offset < PAGE_SIZE && j < nr_frags) {
>
> I think splitting the handling of a single request slot into a function
> would be useful for clarity, e.g. xen_netbk_handle_one_request or
> something.
>

Regarding this comment, I think it looks better as-is. This loop is
closely coupled to the outer loop.


Wei.