This patch tries to coalesce tx requests when constructing grant copy
structures. It enables netback to deal with the situation when the
frontend's MAX_SKB_FRAGS is larger than the backend's MAX_SKB_FRAGS.

It defines max_skb_slots, which is an estimation of the maximum number
of slots a guest can send; anything bigger than that is considered
malicious. Now it is set to 20, which should be enough to accommodate
Linux (16 to 19).

Also change the variable name from "frags" to "slots" in
netbk_count_requests.

Signed-off-by: Wei Liu <wei.liu2@citrix.com>
---
 drivers/net/xen-netback/netback.c | 220 ++++++++++++++++++++++++++++---------
 1 file changed, 171 insertions(+), 49 deletions(-)

diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 6e8e51a..a634dc5 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -47,11 +47,26 @@
 #include <asm/xen/hypercall.h>
 #include <asm/xen/page.h>
 
+/*
+ * This is an estimation of the maximum possible frags a SKB might
+ * have, anything larger than this is considered malicious. Typically
+ * Linux has 16 to 19.
+ */
+#define MAX_SKB_SLOTS_DEFAULT 20
+static unsigned int max_skb_slots = MAX_SKB_SLOTS_DEFAULT;
+module_param(max_skb_slots, uint, 0444);
+
+typedef unsigned int pending_ring_idx_t;
+#define INVALID_PENDING_RING_IDX (~0U)
+
 struct pending_tx_info {
-        struct xen_netif_tx_request req;
+        struct xen_netif_tx_request req; /* coalesced tx request */
         struct xenvif *vif;
+        pending_ring_idx_t head; /* head != INVALID_PENDING_RING_IDX
+                                  * if it is head of several merged
+                                  * tx reqs
+                                  */
 };
-typedef unsigned int pending_ring_idx_t;
 
 struct netbk_rx_meta {
         int id;
@@ -102,7 +117,11 @@ struct xen_netbk {
         atomic_t netfront_count;
 
         struct pending_tx_info pending_tx_info[MAX_PENDING_REQS];
-        struct gnttab_copy tx_copy_ops[MAX_PENDING_REQS];
+        /* Coalescing tx requests before copying makes number of grant
+         * copy ops greater or equal to number of slots required. In
+         * worst case a tx request consumes 2 gnttab_copy.
+         */
+        struct gnttab_copy tx_copy_ops[2*MAX_PENDING_REQS];
 
         u16 pending_ring[MAX_PENDING_REQS];
 
@@ -251,7 +270,7 @@ static int max_required_rx_slots(struct xenvif *vif)
         int max = DIV_ROUND_UP(vif->dev->mtu, PAGE_SIZE);
 
         if (vif->can_sg || vif->gso || vif->gso_prefix)
-                max += MAX_SKB_FRAGS + 1; /* extra_info + frags */
+                max += max_skb_slots + 1; /* extra_info + frags */
 
         return max;
 }
@@ -657,7 +676,7 @@ static void xen_netbk_rx_action(struct xen_netbk *netbk)
                 __skb_queue_tail(&rxq, skb);
 
                 /* Filled the batch queue? */
-                if (count + MAX_SKB_FRAGS >= XEN_NETIF_RX_RING_SIZE)
+                if (count + max_skb_slots >= XEN_NETIF_RX_RING_SIZE)
                         break;
         }
 
@@ -908,34 +927,34 @@ static int netbk_count_requests(struct xenvif *vif,
                                int work_to_do)
 {
         RING_IDX cons = vif->tx.req_cons;
-        int frags = 0;
+        int slots = 0;
 
         if (!(first->flags & XEN_NETTXF_more_data))
                 return 0;
 
         do {
-                if (frags >= work_to_do) {
-                        netdev_err(vif->dev, "Need more frags\n");
+                if (slots >= work_to_do) {
+                        netdev_err(vif->dev, "Need more slots\n");
                         netbk_fatal_tx_err(vif);
                         return -ENODATA;
                 }
 
-                if (unlikely(frags >= MAX_SKB_FRAGS)) {
-                        netdev_err(vif->dev, "Too many frags\n");
+                if (unlikely(slots >= max_skb_slots)) {
+                        netdev_err(vif->dev, "Too many slots\n");
                         netbk_fatal_tx_err(vif);
                         return -E2BIG;
                 }
 
-                memcpy(txp, RING_GET_REQUEST(&vif->tx, cons + frags),
+                memcpy(txp, RING_GET_REQUEST(&vif->tx, cons + slots),
                        sizeof(*txp));
                 if (txp->size > first->size) {
-                        netdev_err(vif->dev, "Frag is bigger than frame.\n");
+                        netdev_err(vif->dev, "Packet is bigger than frame.\n");
                         netbk_fatal_tx_err(vif);
                         return -EIO;
                 }
 
                 first->size -= txp->size;
-                frags++;
+                slots++;
 
                 if (unlikely((txp->offset + txp->size) > PAGE_SIZE)) {
                         netdev_err(vif->dev, "txp->offset: %x, size: %u\n",
@@ -944,7 +963,7 @@ static int netbk_count_requests(struct xenvif *vif,
                         return -EINVAL;
                 }
         } while ((txp++)->flags & XEN_NETTXF_more_data);
-        return frags;
+        return slots;
 }
 
 static struct page *xen_netbk_alloc_page(struct xen_netbk *netbk,
@@ -968,48 +987,114 @@ static struct gnttab_copy *xen_netbk_get_requests(struct xen_netbk *netbk,
         struct skb_shared_info *shinfo = skb_shinfo(skb);
         skb_frag_t *frags = shinfo->frags;
         u16 pending_idx = *((u16 *)skb->data);
-        int i, start;
+        u16 head_idx = 0;
+        int slot, start;
+        struct page *page;
+        pending_ring_idx_t index, start_idx = 0;
+        uint16_t dst_offset;
+        unsigned int nr_slots;
+        struct pending_tx_info *first = NULL;
+
+        /* At this point shinfo->nr_frags is in fact the number of
+         * slots, which can be as large as max_skb_slots.
+         */
+        nr_slots = shinfo->nr_frags;
 
         /* Skip first skb fragment if it is on same page as header fragment. */
         start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
 
-        for (i = start; i < shinfo->nr_frags; i++, txp++) {
-                struct page *page;
-                pending_ring_idx_t index;
+        /* Coalesce tx requests, at this point the packet passed in
+         * should be <= 64K. Any packets larger than 64K have been
+         * handled in netbk_count_requests().
+         */
+        for (shinfo->nr_frags = slot = start; slot < nr_slots;
+             shinfo->nr_frags++) {
                 struct pending_tx_info *pending_tx_info =
                         netbk->pending_tx_info;
 
-                index = pending_index(netbk->pending_cons++);
-                pending_idx = netbk->pending_ring[index];
-                page = xen_netbk_alloc_page(netbk, pending_idx);
+                page = alloc_page(GFP_KERNEL|__GFP_COLD);
                 if (!page)
                         goto err;
 
-                gop->source.u.ref = txp->gref;
-                gop->source.domid = vif->domid;
-                gop->source.offset = txp->offset;
-
-                gop->dest.u.gmfn = virt_to_mfn(page_address(page));
-                gop->dest.domid = DOMID_SELF;
-                gop->dest.offset = txp->offset;
-
-                gop->len = txp->size;
-                gop->flags = GNTCOPY_source_gref;
+                dst_offset = 0;
+                first = NULL;
+                while (dst_offset < PAGE_SIZE && slot < nr_slots) {
+                        gop->flags = GNTCOPY_source_gref;
+
+                        gop->source.u.ref = txp->gref;
+                        gop->source.domid = vif->domid;
+                        gop->source.offset = txp->offset;
+
+                        gop->dest.domid = DOMID_SELF;
+
+                        gop->dest.offset = dst_offset;
+                        gop->dest.u.gmfn = virt_to_mfn(page_address(page));
+
+                        if (dst_offset + txp->size > PAGE_SIZE) {
+                                /* This page can only merge a portion
+                                 * of tx request. Do not increment any
+                                 * pointer / counter here. The txp
+                                 * will be dealt with in future
+                                 * rounds, eventually hitting the
+                                 * `else` branch.
+                                 */
+                                gop->len = PAGE_SIZE - dst_offset;
+                                txp->offset += gop->len;
+                                txp->size -= gop->len;
+                                dst_offset += gop->len; /* quit loop */
+                        } else {
+                                /* This tx request can be merged in the page */
+                                gop->len = txp->size;
+                                dst_offset += gop->len;
+
+                                index = pending_index(netbk->pending_cons++);
+
+                                pending_idx = netbk->pending_ring[index];
+
+                                memcpy(&pending_tx_info[pending_idx].req, txp,
+                                       sizeof(*txp));
+                                xenvif_get(vif);
+
+                                pending_tx_info[pending_idx].vif = vif;
+
+                                /* Poison these fields, corresponding
+                                 * fields for head tx req will be set
+                                 * to correct values after the loop.
+                                 */
+                                netbk->mmap_pages[pending_idx] = (void *)(~0UL);
+                                pending_tx_info[pending_idx].head =
+                                        INVALID_PENDING_RING_IDX;
+
+                                if (unlikely(!first)) {
+                                        first = &pending_tx_info[pending_idx];
+                                        start_idx = index;
+                                        head_idx = pending_idx;
+                                }
+
+                                txp++;
+                                slot++;
+                        }
 
-                gop++;
+                        gop++;
+                }
 
-                memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp));
-                xenvif_get(vif);
-                pending_tx_info[pending_idx].vif = vif;
-                frag_set_pending_idx(&frags[i], pending_idx);
+                first->req.offset = 0;
+                first->req.size = dst_offset;
+                first->head = start_idx;
+                set_page_ext(page, netbk, head_idx);
+                netbk->mmap_pages[head_idx] = page;
+                frag_set_pending_idx(&frags[shinfo->nr_frags], head_idx);
         }
 
+        BUG_ON(shinfo->nr_frags > MAX_SKB_FRAGS);
+
         return gop;
 err:
         /* Unwind, freeing all pages and sending error responses. */
-        while (i-- > start) {
-                xen_netbk_idx_release(netbk, frag_get_pending_idx(&frags[i]),
-                                      XEN_NETIF_RSP_ERROR);
+        while (shinfo->nr_frags-- > start) {
+                xen_netbk_idx_release(netbk,
+                                frag_get_pending_idx(&frags[shinfo->nr_frags]),
+                                XEN_NETIF_RSP_ERROR);
         }
         /* The head too, if necessary. */
         if (start)
@@ -1025,8 +1110,10 @@ static int xen_netbk_tx_check_gop(struct xen_netbk *netbk,
         struct gnttab_copy *gop = *gopp;
         u16 pending_idx = *((u16 *)skb->data);
         struct skb_shared_info *shinfo = skb_shinfo(skb);
+        struct pending_tx_info *tx_info;
         int nr_frags = shinfo->nr_frags;
         int i, err, start;
+        u16 peek; /* peek into next tx request */
 
         /* Check status of header. */
         err = gop->status;
@@ -1038,11 +1125,21 @@ static int xen_netbk_tx_check_gop(struct xen_netbk *netbk,
 
         for (i = start; i < nr_frags; i++) {
                 int j, newerr;
+                pending_ring_idx_t head;
 
                 pending_idx = frag_get_pending_idx(&shinfo->frags[i]);
+                tx_info = &netbk->pending_tx_info[pending_idx];
+                head = tx_info->head;
 
                 /* Check error status: if okay then remember grant handle. */
-                newerr = (++gop)->status;
+                do {
+                        newerr = (++gop)->status;
+                        if (newerr)
+                                break;
+                        peek = netbk->pending_ring[pending_index(++head)];
+                } while (netbk->pending_tx_info[peek].head
+                         == INVALID_PENDING_RING_IDX);
+
                 if (likely(!newerr)) {
                         /* Had a previous error? Invalidate this fragment. */
                         if (unlikely(err))
@@ -1267,11 +1364,11 @@ static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk)
         struct sk_buff *skb;
         int ret;
 
-        while (((nr_pending_reqs(netbk) + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
+        while (((nr_pending_reqs(netbk) + max_skb_slots) < MAX_PENDING_REQS) &&
                 !list_empty(&netbk->net_schedule_list)) {
                 struct xenvif *vif;
                 struct xen_netif_tx_request txreq;
-                struct xen_netif_tx_request txfrags[MAX_SKB_FRAGS];
+                struct xen_netif_tx_request txfrags[max_skb_slots];
                 struct page *page;
                 struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX-1];
                 u16 pending_idx;
@@ -1359,7 +1456,7 @@ static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk)
                 pending_idx = netbk->pending_ring[index];
 
                 data_len = (txreq.size > PKT_PROT_LEN &&
-                            ret < MAX_SKB_FRAGS) ?
+                            ret < max_skb_slots) ?
                         PKT_PROT_LEN : txreq.size;
 
                 skb = alloc_skb(data_len + NET_SKB_PAD + NET_IP_ALIGN,
@@ -1409,6 +1506,7 @@ static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk)
                 memcpy(&netbk->pending_tx_info[pending_idx].req,
                        &txreq, sizeof(txreq));
                 netbk->pending_tx_info[pending_idx].vif = vif;
+                netbk->pending_tx_info[pending_idx].head = index;
                 *((u16 *)skb->data) = pending_idx;
 
                 __skb_put(skb, data_len);
@@ -1539,7 +1637,10 @@ static void xen_netbk_idx_release(struct xen_netbk *netbk, u16 pending_idx,
 {
         struct xenvif *vif;
         struct pending_tx_info *pending_tx_info;
-        pending_ring_idx_t index;
+        pending_ring_idx_t head;
+        u16 peek; /* peek into next tx request */
+
+        BUG_ON(netbk->mmap_pages[pending_idx] == (void *)(~0UL));
 
         /* Already complete? */
         if (netbk->mmap_pages[pending_idx] == NULL)
@@ -1548,13 +1649,34 @@ static void xen_netbk_idx_release(struct xen_netbk *netbk, u16 pending_idx,
         pending_tx_info = &netbk->pending_tx_info[pending_idx];
 
         vif = pending_tx_info->vif;
+        head = pending_tx_info->head;
 
-        make_tx_response(vif, &pending_tx_info->req, status);
+        BUG_ON(head == INVALID_PENDING_RING_IDX);
+        BUG_ON(netbk->pending_ring[pending_index(head)] != pending_idx);
 
-        index = pending_index(netbk->pending_prod++);
-        netbk->pending_ring[index] = pending_idx;
+        do {
+                pending_ring_idx_t index;
+                pending_ring_idx_t idx = pending_index(head);
+                u16 info_idx = netbk->pending_ring[idx];
 
-        xenvif_put(vif);
+                pending_tx_info = &netbk->pending_tx_info[info_idx];
+                make_tx_response(vif, &pending_tx_info->req, status);
+
+                /* Setting any number other than
+                 * INVALID_PENDING_RING_IDX indicates this slot is
+                 * starting a new packet / ending a previous packet.
+                 */
+                pending_tx_info->head = 0;
+
+                index = pending_index(netbk->pending_prod++);
+                netbk->pending_ring[index] = netbk->pending_ring[info_idx];
+
+                xenvif_put(vif);
+
+                peek = netbk->pending_ring[pending_index(++head)];
+
+        } while (netbk->pending_tx_info[peek].head
+                 == INVALID_PENDING_RING_IDX);
 
         netbk->mmap_pages[pending_idx]->mapping = 0;
         put_page(netbk->mmap_pages[pending_idx]);
@@ -1613,7 +1735,7 @@ static inline int rx_work_todo(struct xen_netbk *netbk)
 static inline int tx_work_todo(struct xen_netbk *netbk)
 {
 
-        if (((nr_pending_reqs(netbk) + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
+        if (((nr_pending_reqs(netbk) + max_skb_slots) < MAX_PENDING_REQS) &&
             !list_empty(&netbk->net_schedule_list))
                 return 1;
 
-- 
1.7.10.4
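For illustration, here is a minimal userspace sketch (not kernel code;
the slot sizes are made up) of the coalescing arithmetic the patch
introduces: slots are packed into PAGE_SIZE destination pages, and a
slot that straddles a page boundary is split into two copy operations,
which is why tx_copy_ops is sized 2*MAX_PENDING_REQS.

#include <stdio.h>

#define PAGE_SIZE 4096u

/* count grant-copy ops needed to pack nr_slots slots into pages */
static unsigned int count_copy_ops(const unsigned int *slot_size,
                                   int nr_slots)
{
        unsigned int ops = 0, dst_offset = 0, size;
        int slot = 0;

        size = slot_size[0];
        while (slot < nr_slots) {
                if (dst_offset == PAGE_SIZE)
                        dst_offset = 0;         /* open a new page */
                if (dst_offset + size > PAGE_SIZE) {
                        /* first op: copy up to the page boundary */
                        size -= PAGE_SIZE - dst_offset;
                        dst_offset = PAGE_SIZE;
                } else {
                        /* second (or only) op: the remainder fits */
                        dst_offset += size;
                        if (++slot < nr_slots)
                                size = slot_size[slot];
                }
                ops++;                  /* one gnttab_copy per step */
        }
        return ops;
}

int main(void)
{
        unsigned int slots[] = { 4000, 4000, 200 };     /* hypothetical */

        /* prints 5: each slot needs at most 2 ops, never more */
        printf("%u copy ops for 3 slots\n", count_copy_ops(slots, 3));
        return 0;
}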
David Vrabel
2013-Mar-25 15:13 UTC
Re: [PATCH 5/6] xen-netback: coalesce slots before copying
On 25/03/13 11:08, Wei Liu wrote:
> This patch tries to coalesce tx requests when constructing grant copy
> structures. It enables netback to deal with the situation when the
> frontend's MAX_SKB_FRAGS is larger than the backend's MAX_SKB_FRAGS.
>
> It defines max_skb_slots, which is an estimation of the maximum number
> of slots a guest can send; anything bigger than that is considered
> malicious. Now it is set to 20, which should be enough to accommodate
> Linux (16 to 19).

This maximum needs to be defined as part of the protocol and added to
the interface header.

> Also change the variable name from "frags" to "slots" in
> netbk_count_requests.

I think this renaming should have been done as a separate patch.

> --- a/drivers/net/xen-netback/netback.c
> +++ b/drivers/net/xen-netback/netback.c
> @@ -47,11 +47,26 @@
>  #include <asm/xen/hypercall.h>
>  #include <asm/xen/page.h>
>
> +/*
> + * This is an estimation of the maximum possible frags a SKB might
> + * have, anything larger than this is considered malicious. Typically
> + * Linux has 16 to 19.
> + */

Do you mean "max possible /slots/" a packet might have?

> @@ -968,48 +987,114 @@ static struct gnttab_copy *xen_netbk_get_requests(struct xen_netbk *netbk,
[...]
> +                                /* Poison these fields, corresponding
> +                                 * fields for head tx req will be set
> +                                 * to correct values after the loop.
> +                                 */
> +                                netbk->mmap_pages[pending_idx] = (void *)(~0UL);
> +                                pending_tx_info[pending_idx].head =
> +                                        INVALID_PENDING_RING_IDX;

Do you need to poison both values?

> +
> +                                if (unlikely(!first)) {

This isn't unlikely, is it?

> +                                        first = &pending_tx_info[pending_idx];
> +                                        start_idx = index;
> +                                        head_idx = pending_idx;
> +                                }

Instead of setting first here why not move the code below here?

> +                first->req.offset = 0;
> +                first->req.size = dst_offset;
> +                first->head = start_idx;
> +                set_page_ext(page, netbk, head_idx);
> +                netbk->mmap_pages[head_idx] = page;
> +                frag_set_pending_idx(&frags[shinfo->nr_frags], head_idx);

> @@ -1548,13 +1649,34 @@ static void xen_netbk_idx_release(struct xen_netbk *netbk, u16 pending_idx,
[...]
> +                /* Setting any number other than
> +                 * INVALID_PENDING_RING_IDX indicates this slot is
> +                 * starting a new packet / ending a previous packet.
> +                 */
> +                pending_tx_info->head = 0;

This doesn't look needed. It will be initialized again when reusing
this pending_tx_info again, right?

> +                index = pending_index(netbk->pending_prod++);
> +                netbk->pending_ring[index] = netbk->pending_ring[info_idx];
> +
> +                xenvif_put(vif);
> +
> +                peek = netbk->pending_ring[pending_index(++head)];
> +
> +        } while (netbk->pending_tx_info[peek].head
> +                 == INVALID_PENDING_RING_IDX);
>
>          netbk->mmap_pages[pending_idx]->mapping = 0;
>          put_page(netbk->mmap_pages[pending_idx]);

David
On Mon, Mar 25, 2013 at 3:13 PM, David Vrabel <david.vrabel@citrix.com> wrote:
> On 25/03/13 11:08, Wei Liu wrote:
>> This patch tries to coalesce tx requests when constructing grant copy
>> structures. It enables netback to deal with the situation when the
>> frontend's MAX_SKB_FRAGS is larger than the backend's MAX_SKB_FRAGS.
>>
>> It defines max_skb_slots, which is an estimation of the maximum number
>> of slots a guest can send; anything bigger than that is considered
>> malicious. Now it is set to 20, which should be enough to accommodate
>> Linux (16 to 19).
>
> This maximum needs to be defined as part of the protocol and added to
> the interface header.
>

No, this is not part of the protocol and not a hard limit. It is
configurable by the system administrator.

>> Also change the variable name from "frags" to "slots" in
>> netbk_count_requests.
>
> I think this renaming should have been done as a separate patch.
>

The frag -> slot change only makes sense after this patch. If I did
that, I would need to partially revert what I've done in a previous
patch (like the frag -> slot change in the comment you pointed out
below). :-(

>> --- a/drivers/net/xen-netback/netback.c
>> +++ b/drivers/net/xen-netback/netback.c
>> @@ -47,11 +47,26 @@
>>  #include <asm/xen/hypercall.h>
>>  #include <asm/xen/page.h>
>>
>> +/*
>> + * This is an estimation of the maximum possible frags a SKB might
>> + * have, anything larger than this is considered malicious. Typically
>> + * Linux has 16 to 19.
>> + */
>
> Do you mean "max possible /slots/" a packet might have?
>

Yes.

>> @@ -968,48 +987,114 @@ static struct gnttab_copy *xen_netbk_get_requests(struct xen_netbk *netbk,
> [...]
>> +                                /* Poison these fields, corresponding
>> +                                 * fields for head tx req will be set
>> +                                 * to correct values after the loop.
>> +                                 */
>> +                                netbk->mmap_pages[pending_idx] = (void *)(~0UL);
>> +                                pending_tx_info[pending_idx].head =
>> +                                        INVALID_PENDING_RING_IDX;
>
> Do you need to poison both values?
>

Just for safety. I have BUG_ONs in the release path to check for
possible errors.

>> +
>> +                                if (unlikely(!first)) {
>
> This isn't unlikely, is it?
>

For a big packet the chance is 1 in max_skb_slots, so 5% (1/20) in the
default case.

>> +                                        first = &pending_tx_info[pending_idx];
>> +                                        start_idx = index;
>> +                                        head_idx = pending_idx;
>> +                                }
>
> Instead of setting first here why not move the code below here?
>

Because first->req.size needs to be set to dst_offset, it has to be
here anyway. So I put the other fields here as well.

>> +                first->req.offset = 0;
>> +                first->req.size = dst_offset;
>> +                first->head = start_idx;
>> +                set_page_ext(page, netbk, head_idx);
>> +                netbk->mmap_pages[head_idx] = page;
>> +                frag_set_pending_idx(&frags[shinfo->nr_frags], head_idx);
>
>> @@ -1548,13 +1649,34 @@ static void xen_netbk_idx_release(struct xen_netbk *netbk, u16 pending_idx,
> [...]
>> +                /* Setting any number other than
>> +                 * INVALID_PENDING_RING_IDX indicates this slot is
>> +                 * starting a new packet / ending a previous packet.
>> +                 */
>> +                pending_tx_info->head = 0;
>
> This doesn't look needed. It will be initialized again when reusing
> this pending_tx_info again, right?
>

Yes, it is needed. Otherwise netback responds to invalid tx_info and
causes netfront to crash before coming into the re-initialization path.

Wei.
David Miller
2013-Mar-25 16:20 UTC
Re: [PATCH 5/6] xen-netback: coalesce slots before copying
From: Wei Liu <wei.liu2@citrix.com>
Date: Mon, 25 Mar 2013 11:08:21 +0000

> This patch tries to coalesce tx requests when constructing grant copy
> structures. It enables netback to deal with the situation when the
> frontend's MAX_SKB_FRAGS is larger than the backend's MAX_SKB_FRAGS.
>
> It defines max_skb_slots, which is an estimation of the maximum number
> of slots a guest can send; anything bigger than that is considered
> malicious. Now it is set to 20, which should be enough to accommodate
> Linux (16 to 19).
>
> Also change the variable name from "frags" to "slots" in
> netbk_count_requests.
>
> Signed-off-by: Wei Liu <wei.liu2@citrix.com>

Please resubmit after you've incorporated the feedback you've received.
David Vrabel
2013-Mar-25 16:34 UTC
Re: [PATCH 5/6] xen-netback: coalesce slots before copying
On 25/03/13 15:47, Wei Liu wrote:
> On Mon, Mar 25, 2013 at 3:13 PM, David Vrabel <david.vrabel@citrix.com> wrote:
>> On 25/03/13 11:08, Wei Liu wrote:
>>> This patch tries to coalesce tx requests when constructing grant copy
>>> structures. It enables netback to deal with the situation when the
>>> frontend's MAX_SKB_FRAGS is larger than the backend's MAX_SKB_FRAGS.
>>>
>>> It defines max_skb_slots, which is an estimation of the maximum number
>>> of slots a guest can send; anything bigger than that is considered
>>> malicious. Now it is set to 20, which should be enough to accommodate
>>> Linux (16 to 19).
>>
>> This maximum needs to be defined as part of the protocol and added to
>> the interface header.
>>
>
> No, this is not part of the protocol and not a hard limit. It is
> configurable by the system administrator.

There is no mechanism by which the front and back ends can negotiate
this value, so it does need to be a fixed value that is equal to or
greater than the max from any front or back end that has ever existed.

The reason for this patch is that this wasn't properly specified and
changes outside of netback broke the protocol.

>>> +
>>> +                                if (unlikely(!first)) {
>>
>> This isn't unlikely, is it?
>>
>
> For a big packet the chance is 1 in max_skb_slots, so 5% (1/20) in the
> default case.

I don't understand your reasoning here. The "if (!first)" branch is
taken once per page. It will be 100% if each slot goes into its own
page and only 5% if the packet is less than PAGE_SIZE in length but
split into 20 slots.

>>> +                                        first = &pending_tx_info[pending_idx];
>>> +                                        start_idx = index;
>>> +                                        head_idx = pending_idx;
>>> +                                }
>>
>> Instead of setting first here why not move the code below here?
>>
>
> Because first->req.size needs to be set to dst_offset, it has to be
> here anyway. So I put the other fields here as well.

Hmmm, I'd probably accumulate first->req.size, but ok. It was a minor
point anyway.

>>> +                first->req.offset = 0;
>>> +                first->req.size = dst_offset;
>>> +                first->head = start_idx;
>>> +                set_page_ext(page, netbk, head_idx);
>>> +                netbk->mmap_pages[head_idx] = page;
>>> +                frag_set_pending_idx(&frags[shinfo->nr_frags], head_idx);
>>
>>> @@ -1548,13 +1649,34 @@ static void xen_netbk_idx_release(struct xen_netbk *netbk, u16 pending_idx,
>> [...]
>>> +                /* Setting any number other than
>>> +                 * INVALID_PENDING_RING_IDX indicates this slot is
>>> +                 * starting a new packet / ending a previous packet.
>>> +                 */
>>> +                pending_tx_info->head = 0;
>>
>> This doesn't look needed. It will be initialized again when reusing
>> this pending_tx_info again, right?
>>
>
> Yes, it is needed. Otherwise netback responds to invalid tx_info and
> causes netfront to crash before coming into the re-initialization path.

Maybe I'm missing something, but this is after the make_tx_response()
call, and immediately after this the pending_tx_info is returned to the
pending ring as free.

David
Konrad Rzeszutek Wilk
2013-Mar-25 16:57 UTC
Re: [PATCH 5/6] xen-netback: coalesce slots before copying
On Mon, Mar 25, 2013 at 11:08:21AM +0000, Wei Liu wrote:
> This patch tries to coalesce tx requests when constructing grant copy
> structures. It enables netback to deal with the situation when the
> frontend's MAX_SKB_FRAGS is larger than the backend's MAX_SKB_FRAGS.
>
> It defines max_skb_slots, which is an estimation of the maximum number
> of slots a guest can send; anything bigger than that is considered
> malicious. Now it is set to 20, which should be enough to accommodate
> Linux (16 to 19).
>
> Also change the variable name from "frags" to "slots" in
> netbk_count_requests.
>

This should probably also CC stable@vger.kernel.org

> Signed-off-by: Wei Liu <wei.liu2@citrix.com>
> ---
>  drivers/net/xen-netback/netback.c | 220 ++++++++++++++++++++++++++++---------
>  1 file changed, 171 insertions(+), 49 deletions(-)

[The rest of the patch was quoted in full without further comments;
see the original posting above.]
On Mon, Mar 25, 2013 at 4:34 PM, David Vrabel <david.vrabel@citrix.com> wrote:
> On 25/03/13 15:47, Wei Liu wrote:
>> On Mon, Mar 25, 2013 at 3:13 PM, David Vrabel <david.vrabel@citrix.com> wrote:
>>> On 25/03/13 11:08, Wei Liu wrote:
>>>> This patch tries to coalesce tx requests when constructing grant copy
>>>> structures. It enables netback to deal with the situation when the
>>>> frontend's MAX_SKB_FRAGS is larger than the backend's MAX_SKB_FRAGS.
>>>>
>>>> It defines max_skb_slots, which is an estimation of the maximum number
>>>> of slots a guest can send; anything bigger than that is considered
>>>> malicious. Now it is set to 20, which should be enough to accommodate
>>>> Linux (16 to 19).
>>>
>>> This maximum needs to be defined as part of the protocol and added to
>>> the interface header.
>>>
>>
>> No, this is not part of the protocol and not a hard limit. It is
>> configurable by the system administrator.
>
> There is no mechanism by which the front and back ends can negotiate
> this value, so it does need to be a fixed value that is equal to or
> greater than the max from any front or back end that has ever existed.
>

Are you suggesting moving the default macro value to the header file?
It is just an estimation; I have no knowledge of the accurate maximum
value, so I think making it part of the protocol is a bad idea. Do you
have a handle on the maximum value?

> The reason for this patch is that this wasn't properly specified and
> changes outside of netback broke the protocol.
>
>>>> +
>>>> +                                if (unlikely(!first)) {
>>>
>>> This isn't unlikely, is it?
>>>
>>
>> For a big packet the chance is 1 in max_skb_slots, so 5% (1/20) in the
>> default case.
>
> I don't understand your reasoning here. The "if (!first)" branch is
> taken once per page. It will be 100% if each slot goes into its own
> page and only 5% if the packet is less than PAGE_SIZE in length but
> split into 20 slots.
>

My mistake. It should be: a small packet split into multiple slots.

>>> [...]
>>>> +                /* Setting any number other than
>>>> +                 * INVALID_PENDING_RING_IDX indicates this slot is
>>>> +                 * starting a new packet / ending a previous packet.
>>>> +                 */
>>>> +                pending_tx_info->head = 0;
>>>
>>> This doesn't look needed. It will be initialized again when reusing
>>> this pending_tx_info again, right?
>>>
>>
>> Yes, it is needed. Otherwise netback responds to invalid tx_info and
>> causes netfront to crash before coming into the re-initialization path.
>
> Maybe I'm missing something, but this is after the make_tx_response()
> call, and immediately after this the pending_tx_info is returned to
> the pending ring as free.
>

So it is a bit tricky here. Let me clarify: the head field is used to
indicate the start of a new tx request queue and the end of the
previous queue.

Imagine a sequence of head fields (I = INVALID_PENDING_RING_IDX below),
where a number is the starting index in the pending ring:

.... 0 I I I 5 I I ...

Consuming all the tx_info but not setting I to 0 (or any number other
than I) leaves the sequence the same as before. Then a subsequent call
to process the next SKB, which has 3 extra slots, makes the sequence
look like

.... 8 I I I I I I ...

but in fact the correct sequence should be

.... 8 I I I 0 I I ...

The wrong sequence makes netbk_idx_release respond to more slots than
required, which causes netfront to crash miserably.

Wei.

> David
On Mon, Mar 25, 2013 at 4:58 PM, Wei Liu <liuw@liuw.name> wrote:
>
> So it is a bit tricky here. Let me clarify: the head field is used to
> indicate the start of a new tx request queue and the end of the
> previous queue.
>
> Imagine a sequence of head fields (I = INVALID_PENDING_RING_IDX below),
> where a number is the starting index in the pending ring:
>
> .... 0 I I I 5 I I ...
>
> Consuming all the tx_info but not setting I to 0 (or any number other
> than I) leaves the sequence the same as before. Then a subsequent call
> to process the next SKB, which has 3 extra slots, makes the

Sorry, should be "4 extra slots", and the sequence looks like

.... 8 I I I I I I ...

but in fact the correct sequence should be

.... 8 I I I I 0 0 ...

Wei.
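To make the head-marking scheme above concrete, here is a small
userspace model (an illustration, not the netback code): ring slots
belonging to one coalesced packet carry INVALID in their head field,
and the release path walks forward until it meets a slot whose head is
valid, which is why head must be reset to a valid value on release.

#include <stdio.h>

#define RING 8
#define INVALID (~0u)

static unsigned int head[RING];

/* respond to every slot of the packet whose head slot is 'start' */
static void release(unsigned int start)
{
        unsigned int i = start;

        do {
                printf("respond slot %u\n", i % RING);
                head[i % RING] = 0;     /* no longer INVALID: ends the walk */
                i++;
        } while (head[i % RING] == INVALID);
}

int main(void)
{
        unsigned int i;

        /* two packets: slots 0-3 (head 0) and slots 4-6 (head 4) */
        for (i = 0; i < RING; i++)
                head[i] = INVALID;
        head[0] = 0;
        head[4] = 4;
        head[7] = 7;            /* next packet's head, terminates 2nd walk */

        release(0);             /* responds to slots 0,1,2,3 only */
        release(4);             /* responds to slots 4,5,6 only */
        return 0;
}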
David Vrabel
2013-Mar-25 18:29 UTC
Re: [PATCH 5/6] xen-netback: coalesce slots before copying
On 25/03/13 16:58, Wei Liu wrote:
> On Mon, Mar 25, 2013 at 4:34 PM, David Vrabel <david.vrabel@citrix.com> wrote:
>> On 25/03/13 15:47, Wei Liu wrote:
>>> On Mon, Mar 25, 2013 at 3:13 PM, David Vrabel <david.vrabel@citrix.com> wrote:
>>>> On 25/03/13 11:08, Wei Liu wrote:
>>>>> This patch tries to coalesce tx requests when constructing grant copy
>>>>> structures. It enables netback to deal with the situation when the
>>>>> frontend's MAX_SKB_FRAGS is larger than the backend's MAX_SKB_FRAGS.
>>>>>
>>>>> It defines max_skb_slots, which is an estimation of the maximum number
>>>>> of slots a guest can send; anything bigger than that is considered
>>>>> malicious. Now it is set to 20, which should be enough to accommodate
>>>>> Linux (16 to 19).
>>>>
>>>> This maximum needs to be defined as part of the protocol and added to
>>>> the interface header.
>>>>
>>>
>>> No, this is not part of the protocol and not a hard limit. It is
>>> configurable by the system administrator.
>>
>> There is no mechanism by which the front and back ends can negotiate
>> this value, so it does need to be a fixed value that is equal to or
>> greater than the max from any front or back end that has ever existed.
>>
>
> Are you suggesting moving the default macro value to the header file?
> It is just an estimation; I have no knowledge of the accurate maximum
> value, so I think making it part of the protocol is a bad idea.

How is the author of a new frontend supposed to know how many slots
they can use per packet if it is not precisely defined?

> Do you have a handle on the maximum value?

Backends should provide the value to the frontend via a xenstore key
(e.g., max-slots-per-frame). This value should be at least 18 (the
historical value of MAX_SKB_FRAGS).

The frontend may use up to this specified value, or 17 if the
max-slots-per-frame key is missing.

Supporting at least 18 in the backend is required for existing
frontends. Limiting frontends to 17 allows them to work with all
backends (including recent Linux versions that only supported 17).

It's not clear why 19 or 20 were suggested as possible values. I
checked back to 2.6.18 and MAX_SKB_FRAGS there is (65536/PAGE_SIZE + 2)
== 18.

Separately, it may be sensible for the backend to drop packets with
more frags than max-slots-per-frame up to some threshold where anything
more is considered malicious (i.e., 1-18 slots is a valid packet, 19-20
are dropped and 21 or more is a fatal error).

>> The reason for this patch is that this wasn't properly specified and
>> changes outside of netback broke the protocol.
>>
>>>> [...]
>>>>> +                /* Setting any number other than
>>>>> +                 * INVALID_PENDING_RING_IDX indicates this slot is
>>>>> +                 * starting a new packet / ending a previous packet.
>>>>> +                 */
>>>>> +                pending_tx_info->head = 0;
>>>>
>>>> This doesn't look needed. It will be initialized again when reusing
>>>> this pending_tx_info again, right?
>>>>
>>>
>>> Yes, it is needed. Otherwise netback responds to invalid tx_info and
>>> causes netfront to crash before coming into the re-initialization path.
>>
>> Maybe I'm missing something, but this is after the make_tx_response()
>> call, and immediately after this the pending_tx_info is returned to
>> the pending ring as free.
>>
>
> So it is a bit tricky here. Let me clarify: the head field is used to
> indicate the start of a new tx request queue and the end of the
> previous queue.

Ok, I get this now.

David
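A hedged sketch of the advertisement David proposes, assuming netback's
xenbus setup context. The key name "max-slots-per-frame" is his example
rather than an established protocol node, and this is not code from any
tree:

#include <xen/xenbus.h>

/* write the backend's per-packet slot limit into its xenstore area */
static int advertise_max_slots(struct xenbus_device *dev,
                               unsigned int max_skb_slots)
{
        int err;

        err = xenbus_printf(XBT_NIL, dev->nodename,
                            "max-slots-per-frame", "%u", max_skb_slots);
        if (err)
                xenbus_dev_fatal(dev, err, "writing max-slots-per-frame");

        return err;
}

A frontend would then read the key during connection setup and fall
back to 17 if it is absent, per David's suggested negotiation rule.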
On Mon, Mar 25, 2013 at 06:29:29PM +0000, David Vrabel wrote:
> >
> > Are you suggesting moving the default macro value to the header file?
> > It is just an estimation; I have no knowledge of the accurate maximum
> > value, so I think making it part of the protocol is a bad idea.
>
> How is the author of a new frontend supposed to know how many slots
> they can use per packet if it is not precisely defined?
>

A new frontend should use the scheme you mentioned below to get the
maximum value. For old frontends that cannot be fixed, the
administrator can configure max_skb_slots to accommodate their needs.

> > Do you have a handle on the maximum value?
>
> Backends should provide the value to the frontend via a xenstore key
> (e.g., max-slots-per-frame). This value should be at least 18 (the
> historical value of MAX_SKB_FRAGS).
>
> The frontend may use up to this specified value, or 17 if the
> max-slots-per-frame key is missing.
>
> Supporting at least 18 in the backend is required for existing
> frontends. Limiting frontends to 17 allows them to work with all
> backends (including recent Linux versions that only supported 17).
>
> It's not clear why 19 or 20 were suggested as possible values. I
> checked back to 2.6.18 and MAX_SKB_FRAGS there is (65536/PAGE_SIZE + 2)

Because the check was >= MAX_SKB_FRAGS originally, and James Harper
told me that "Windows stops counting on 20".

> == 18.
>
> Separately, it may be sensible for the backend to drop packets with
> more frags than max-slots-per-frame up to some threshold where
> anything more is considered malicious (i.e., 1-18 slots is a valid
> packet, 19-20 are dropped and 21 or more is a fatal error).
>

Why drop the packet when we are able to process it? The frontend cannot
know it has crossed the line anyway.

Wei.
Paul Durrant
2013-Mar-26 09:16 UTC
Re: [PATCH 5/6] xen-netback: coalesce slots before copying
> -----Original Message-----
> From: xen-devel-bounces@lists.xen.org [mailto:xen-devel-
> bounces@lists.xen.org] On Behalf Of Wei Liu
> Sent: 25 March 2013 19:09
> To: David Vrabel
> Cc: Ian Campbell; konrad.wilk@oracle.com; netdev@vger.kernel.org; Wei
> Liu; xen-devel@lists.xen.org; annie.li@oracle.com
> Subject: Re: [Xen-devel] [PATCH 5/6] xen-netback: coalesce slots before
> copying
>
> On Mon, Mar 25, 2013 at 06:29:29PM +0000, David Vrabel wrote:
> > >
> > > Are you suggesting moving the default macro value to the header file?
> > > It is just an estimation; I have no knowledge of the accurate maximum
> > > value, so I think making it part of the protocol is a bad idea.
> >
> > How is the author of a new frontend supposed to know how many slots
> > they can use per packet if it is not precisely defined?
> >
>
> A new frontend should use the scheme you mentioned below to get the
> maximum value. For old frontends that cannot be fixed, the
> administrator can configure max_skb_slots to accommodate their needs.
>
> > > Do you have a handle on the maximum value?
> >
> > Backends should provide the value to the frontend via a xenstore key
> > (e.g., max-slots-per-frame). This value should be at least 18 (the
> > historical value of MAX_SKB_FRAGS).
> >
> > The frontend may use up to this specified value, or 17 if the
> > max-slots-per-frame key is missing.
> >
> > Supporting at least 18 in the backend is required for existing
> > frontends. Limiting frontends to 17 allows them to work with all
> > backends (including recent Linux versions that only supported 17).
> >
> > It's not clear why 19 or 20 were suggested as possible values. I
> > checked back to 2.6.18 and MAX_SKB_FRAGS there is (65536/PAGE_SIZE +
> > 2)
>
> Because the check was >= MAX_SKB_FRAGS originally, and James Harper
> told me that "Windows stops counting on 20".
>

For the Citrix PV drivers I lifted the #define of MAX_SKB_FRAGS from
the dom0 kernel (i.e. 18). If a packet coming from the stack has more
than that number of fragments then it's copied and coalesced. The value
advertised for TSO size is chosen such that a maximally sized TSO will
always fit in 18 fragments after coalescing, but (since this is
Windows) the drivers don't trust the stack to stick to that limit and
will drop a packet if it won't fit.

It seems reasonable that, since the backend is copying anyway, it
should handle any fragment list coming from the frontend that it can.
This would allow the copy-and-coalesce code to be removed from the
frontend (and the double copy avoided). If there is a maximum backend
packet size, though, then I think this needs to be advertised to the
frontend. The backend should clearly bin packets coming from the
frontend that exceed that limit, but advertising that limit in xenstore
allows the frontend to choose the right TSO maximum size to advertise
to its stack, rather than having to base it on some historical value
that actually has little meaning (in the absence of grant mapping).

Paul
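A back-of-envelope check (an illustration only, assuming 4K pages) of
the TSO sizing Paul describes: a maximally sized TSO must fit in 18
fragments after coalescing, allowing one extra slot for a payload that
starts mid-page.

#include <stdio.h>

#define PAGE_SIZE     4096u
#define MAX_SKB_FRAGS 18u       /* the historical limit Paul lifted */

int main(void)
{
        unsigned int tso_size = 65536;
        /* +1 slot for a payload that starts mid-page */
        unsigned int slots = (tso_size + PAGE_SIZE - 1) / PAGE_SIZE + 1;

        printf("a %u byte TSO needs up to %u slots (limit %u)\n",
               tso_size, slots, MAX_SKB_FRAGS);         /* 17 <= 18 */
        return 0;
}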
On Tue, Mar 26, 2013 at 09:16:10AM +0000, Paul Durrant wrote:
> >
> > > It's not clear why 19 or 20 were suggested as possible values. I
> > > checked back to 2.6.18 and MAX_SKB_FRAGS there is (65536/PAGE_SIZE
> > > + 2)
> >
> > Because the check was >= MAX_SKB_FRAGS originally, and James Harper
> > told me that "Windows stops counting on 20".
> >
>
> For the Citrix PV drivers I lifted the #define of MAX_SKB_FRAGS from
> the dom0 kernel (i.e. 18). If a packet coming from the stack has more

That means the Citrix PV drivers cannot run on a vanilla kernel
backend, because upstream MAX_SKB_FRAGS is now 17. The scope of this
patch is to fix this sort of problem without touching the frontend,
because it is impossible for us to fix every frontend in the world.

> than that number of fragments then it's copied and coalesced. The
> value advertised for TSO size is chosen such that a maximally sized
> TSO will always fit in 18 fragments after coalescing, but (since this
> is Windows) the drivers don't trust the stack to stick to that limit
> and will drop a packet if it won't fit.
>
> It seems reasonable that, since the backend is copying anyway, it
> should handle any fragment list coming from the frontend that it can.
> This would allow the copy-and-coalesce code to be removed from the
> frontend (and the double copy avoided). If there is a maximum backend
> packet size, though, then I think this needs to be advertised to the
> frontend. The backend should clearly bin packets coming from the
> frontend that exceed that limit, but advertising that limit in
> xenstore allows the frontend to choose the right TSO maximum size to
> advertise to its stack, rather than having to base it on some
> historical value that actually has little meaning (in the absence of
> grant mapping).
>

The historical value was chosen on the basis that it should cope with
all known frontend limits and leave a bit more space for frontends that
slightly cross the line. Advertising through xenstore is the future
plan, as it cannot be done without modifying the frontend.

Wei.
James Harper
2013-Mar-26 10:52 UTC
Re: [PATCH 5/6] xen-netback: coalesce slots before copying
> > It's not clear why 19 or 20 were suggested as possible values. I
> > checked back to 2.6.18 and MAX_SKB_FRAGS there is (65536/PAGE_SIZE +
> > 2)
>
> Because the check was >= MAX_SKB_FRAGS originally, and James Harper
> told me that "Windows stops counting on 20".
>

I've obviously not been clear enough here... GPLPV stopped counting at
20 (it only needed to know whether the count was < 20 or not). Windows
itself can submit a packet to NDIS with hundreds of buffers. It doesn't
really matter if it's 21 or 1021; I just didn't want to be misquoted.

James
James Harper
2013-Mar-26 11:00 UTC
Re: [PATCH 5/6] xen-netback: coalesce slots before copying
> > Because the check was >= MAX_SKB_FRAGS originally, and James Harper
> > told me that "Windows stops counting on 20".
> >
>
> For the Citrix PV drivers I lifted the #define of MAX_SKB_FRAGS from
> the dom0 kernel (i.e. 18). If a packet coming from the stack has more
> than that number of fragments then it's copied and coalesced. The
> value advertised for TSO size is chosen such that a maximally sized
> TSO will always fit in 18 fragments after coalescing, but (since this
> is Windows) the drivers don't trust the stack to stick to that limit
> and will drop a packet if it won't fit.
>
> It seems reasonable that, since the backend is copying anyway, it
> should handle any fragment list coming from the frontend that it can.
> This would allow the copy-and-coalesce code to be removed from the
> frontend (and the double copy avoided). If there is a maximum backend
> packet size, though, then I think this needs to be advertised to the
> frontend. The backend should clearly bin packets coming from the
> frontend that exceed that limit, but advertising that limit in
> xenstore allows the frontend to choose the right TSO maximum size to
> advertise to its stack, rather than having to base it on some
> historical value that actually has little meaning (in the absence of
> grant mapping).
>

As stated previously, I've observed Windows issuing staggering numbers
of buffers to NDIS miniport drivers, so you will need to coalesce in a
Windows driver anyway. I'm not sure what the break-even point is, but I
think it's safe to say that in the choice between using 1000 (worst
case) ring slots (with the resulting mapping overheads) and coalescing
in the frontend, coalescing is going to be the better option.

James
David Vrabel
2013-Mar-26 11:06 UTC
Re: [PATCH 5/6] xen-netback: coalesce slots before copying
On 26/03/13 10:52, James Harper wrote:
>>> It's not clear why 19 or 20 were suggested as possible values.
>>> I checked back to 2.6.18 and MAX_SKB_FRAGS there is
>>> (65536/PAGE_SIZE + 2)
>>
>> Because the check was >= MAX_SKB_FRAGS originally, and James Harper
>> told me that "Windows stops counting on 20".
>>
>
> I've obviously not been clear enough here... GPLPV stopped counting
> at 20 (it only needed to know whether the count was < 20 or not).
> Windows itself can submit a packet to NDIS with hundreds of buffers.
> It doesn't really matter if it's 21 or 1021; I just didn't want to be
> misquoted.

This still isn't clear. What's the maximum number of ring entries that
the GPLPV driver will use per packet? Are you saying it's 20? If so,
how has the GPLPV driver ever worked well with Linux's netback (with
its historical limit of 18)?

David
David Vrabel
2013-Mar-26 11:13 UTC
Re: [PATCH 5/6] xen-netback: coalesce slots before copying
On 25/03/13 19:09, Wei Liu wrote:
> On Mon, Mar 25, 2013 at 06:29:29PM +0000, David Vrabel wrote:
>>>
>>> Are you suggesting moving the default macro value to the header file?
>>> It is just an estimation; I have no knowledge of the accurate maximum
>>> value, so I think making it part of the protocol is a bad idea.
>>
>> How is the author of a new frontend supposed to know how many slots
>> they can use per packet if it is not precisely defined?
>>
>
> A new frontend should use the scheme you mentioned below to get the
> maximum value. For old frontends that cannot be fixed, the
> administrator can configure max_skb_slots to accommodate their needs.

I'm happy for the threshold for fatal errors to be configurable via a
module parameter.

>>> Do you have a handle on the maximum value?
>>
>> Backends should provide the value to the frontend via a xenstore key
>> (e.g., max-slots-per-frame). This value should be at least 18 (the
>> historical value of MAX_SKB_FRAGS).
>>
>> The frontend may use up to this specified value, or 17 if the
>> max-slots-per-frame key is missing.
>>
>> Supporting at least 18 in the backend is required for existing
>> frontends. Limiting frontends to 17 allows them to work with all
>> backends (including recent Linux versions that only supported 17).
>>
>> It's not clear why 19 or 20 were suggested as possible values. I
>> checked back to 2.6.18 and MAX_SKB_FRAGS there is (65536/PAGE_SIZE + 2)
>
> Because the check was >= MAX_SKB_FRAGS originally, and James Harper
> told me that "Windows stops counting on 20".
>
>> == 18.
>>
>> Separately, it may be sensible for the backend to drop packets with
>> more frags than max-slots-per-frame up to some threshold where
>> anything more is considered malicious (i.e., 1-18 slots is a valid
>> packet, 19-20 are dropped and 21 or more is a fatal error).
>>
>
> Why drop the packet when we are able to process it? The frontend
> cannot know it has crossed the line anyway.

Because it's a change to the protocol, and we do not want to do this
for a regression fix. As a separate fix we can consider increasing the
number of slots per packet once there is a mechanism to report this to
the frontend.

David
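For clarity, the slot-count policy David outlines can be sketched as a
small classifier; the thresholds are his proposal, not an implemented
policy.

enum tx_verdict { TX_OK, TX_DROP, TX_FATAL };

#define SLOTS_VALID_MAX 18      /* historical MAX_SKB_FRAGS */
#define SLOTS_DROP_MAX  20      /* tolerated but dropped */

static enum tx_verdict classify_slots(unsigned int slots)
{
        if (slots <= SLOTS_VALID_MAX)
                return TX_OK;           /* process normally */
        if (slots <= SLOTS_DROP_MAX)
                return TX_DROP;         /* drop, keep the vif alive */
        return TX_FATAL;                /* treat the frontend as malicious */
}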
James Harper
2013-Mar-26 11:15 UTC
Re: [PATCH 5/6] xen-netback: coalesce slots before copying
> > On 26/03/13 10:52, James Harper wrote:
> >>> It's not clear why 19 or 20 were suggested as possible values.
> >>> I checked back to 2.6.18 and MAX_SKB_FRAGS there is
> >>> (65536/PAGE_SIZE + 2)
> >>
> >> Because the check was >= MAX_SKB_FRAGS originally, and James Harper
> >> told me that "Windows stops counting on 20".
> >>
> >
> > I've obviously not been clear enough here... GPLPV stopped counting
> > at 20 (it only needed to know whether the count was < 20 or not).
> > Windows itself can submit a packet to NDIS with hundreds of buffers.
> > It doesn't really matter if it's 21 or 1021; I just didn't want to
> > be misquoted.
>
> This still isn't clear. What's the maximum number of ring entries that
> the GPLPV driver will use per packet? Are you saying it's 20? If so,
> how has the GPLPV driver ever worked well with Linux's netback (with
> its historical limit of 18)?
>

GPLPV will limit itself to 19, which I thought was the historic Linux
limit, but obviously not. I'd better look into that. I added a debug
statement to catch what Windows would give to GPLPV, and it seemed that
the maximum was 20, but then I double-checked and GPLPV only needs to
know whether there are > 19 frags or not, so it stops counting at 20.
The actual number Windows will use internally is not limited, so
coalescing is required, and no sane amount of bumping up the Linux
limit will remove the requirement that a Windows driver coalesce.

James
Paul Durrant
2013-Mar-26 11:24 UTC
Re: [PATCH 5/6] xen-netback: coalesce slots before copying
> -----Original Message-----
> From: James Harper [mailto:james.harper@bendigoit.com.au]
> Sent: 26 March 2013 11:01
> To: Paul Durrant; Wei Liu; David Vrabel
> Cc: Ian Campbell; Wei Liu; netdev@vger.kernel.org;
> konrad.wilk@oracle.com; xen-devel@lists.xen.org; annie.li@oracle.com
> Subject: RE: [Xen-devel] [PATCH 5/6] xen-netback: coalesce slots before
> copying
>
> > > Because the check is >= MAX_SKB_FRAGS originally and James Harper
> > > told me that "Windows stops counting on 20".
> > >
> >
> > For the Citrix PV drivers I lifted the #define of MAX_SKB_FRAGS from
> > the dom0 kernel (i.e. 18). If a packet coming from the stack has more
> > than that number of fragments then it's copied and coalesced. The
> > value advertised for TSO size is chosen such that a maximally sized
> > TSO will always fit in 18 fragments after coalescing, but (since this
> > is Windows) the drivers don't trust the stack to stick to that limit
> > and will drop a packet if it won't fit.
> >
> > It seems reasonable that, since the backend is copying anyway, it
> > should handle any fragment list coming from the frontend that it can.
> > This would allow the copy-and-coalesce code to be removed from the
> > frontend (and the double-copy avoided). If there is a maximum backend
> > packet size though, then I think this needs to be advertised to the
> > frontend. The backend should clearly bin packets coming from the
> > frontend that exceed that limit, but advertising that limit in
> > xenstore allows the frontend to choose the right TSO maximum size to
> > advertise to its stack, rather than having to base it on some
> > historical value that actually has little meaning (in the absence of
> > grant mapping).
> >
>
> As stated previously, I've observed Windows issuing staggering numbers
> of buffers to NDIS miniport drivers, so you will need to coalesce in a
> Windows driver anyway. I'm not sure what the break-even point is, but I
> think it's safe to say that in the choice between using 1000 (worst
> case) ring slots (with the resulting mapping overheads) and coalescing
> in the frontend, coalescing is going to be the better option.
>

Oh quite, if the backend is mapping and not copying then coalescing in
the frontend is the right way to go. I guess coalescing once the frag
count reaches a full ring count is probably necessary (since we can't
push a partial packet), but it would be nice not to have to do it if
the backend is going to copy anyway.

  Paul
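One plausible back-of-envelope for the TSO sizing Paul describes:
advertise the largest payload that, after coalescing into page-sized
fragments, still fits the slot budget. The constants and the
one-slot-for-headers assumption below are guesses, not the actual
Citrix PV computation:

#define PAGE_SIZE	4096	/* defined locally for self-containment */
#define SLOT_BUDGET	18	/* lifted from dom0's MAX_SKB_FRAGS */
#define HEADER_SLOTS	1	/* assume headers consume one slot */

/* Largest TSO payload guaranteed to fit in SLOT_BUDGET slots once
 * coalesced: (18 - 1) * 4096 = 69632 bytes. */
static unsigned int tso_max_payload(void)
{
	return (SLOT_BUDGET - HEADER_SLOTS) * PAGE_SIZE;
}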
David Laight
2013-Mar-26 11:27 UTC
Re: [PATCH 5/6] xen-netback: coalesce slots before copying
> As stated previously, I've observed Windows issuing staggering
> numbers of buffers to NDIS miniport drivers, so you will need
> to coalesce in a Windows driver anyway. I'm not sure what the
> break-even point is, but I think it's safe to say that in the
> choice between using 1000 (worst case) ring slots (with the
> resulting mapping overheads) and coalescing in the frontend,
> coalescing is going to be the better option.

A long time ago we did some calculations on a sparc mbus/sbus system
(which has an iommu requiring setup for dma) and got a break-even point
of (about) 1k. (And I'm not sure we arranged to do aligned copies.)
Clearly that isn't directly relevant here...

It is even likely that the ethernet chips will underrun if requested
to do too many ring operations - especially at their maximum speed.
I guess none of the modern ones require the first fragment to be at
least 100 bytes in order to guarantee retransmission after a collision.

	David
James Harper
2013-Mar-26 11:29 UTC
Re: [PATCH 5/6] xen-netback: coalesce slots before copying
> > As stated previously, I've observed Windows issuing staggering
> > numbers of buffers to NDIS miniport drivers, so you will need to
> > coalesce in a Windows driver anyway. I'm not sure what the break-even
> > point is, but I think it's safe to say that in the choice between
> > using 1000 (worst case) ring slots (with the resulting mapping
> > overheads) and coalescing in the frontend, coalescing is going to be
> > the better option.
> >
>
> Oh quite, if the backend is mapping and not copying then coalescing in
> the frontend is the right way to go. I guess coalescing once the frag
> count reaches a full ring count is probably necessary (since we can't
> push a partial packet), but it would be nice not to have to do it if
> the backend is going to copy anyway.
>

For a 9k packet with 100 frags (not a common case, but an example), what
is the cost of mapping those 100 frags into the backend vs coalescing to
three pages in the frontend and mapping those?

I may be misremembering, but wasn't there a patch floating around for
persistent mapping to avoid some of this overhead? (Not applicable here,
but I thought it meant that the cost wasn't insignificant.)

James
On Tue, Mar 26, 2013 at 11:13:38AM +0000, David Vrabel wrote:
> >>
> >> Separately, it may be sensible for the backend to drop packets with
> >> more frags than max-slots-per-frame up to some threshold where
> >> anything more is considered malicious (i.e., 1-18 slots is a valid
> >> packet, 19-20 are dropped and 21 or more is a fatal error).
> >>
> >
> > Why drop the packet when we are able to process it? Frontend cannot
> > know it has crossed the line anyway.
>
> Because it's a change to the protocol and we do not want to do this for
> a regression fix.
>

If I understand correctly, the regression you are talking about was
introduced by the harsh punishment in XSA-39? If so, this is the patch
you need to fix that.

The frontend only knows whether it has connectivity or not. This patch
guarantees that an old netfront with a larger MAX_SKB_FRAGS still sees
the same thing from its point of view. Netfront cannot know the
intermediate state between 18 and 20.

> As a separate fix we can consider increasing the number of slots
> per-packet once there is a mechanism to report this to the front end.
>

Sure, that's on my TODO list.

Wei.
Paul Durrant
2013-Mar-26 11:38 UTC
Re: [PATCH 5/6] xen-netback: coalesce slots before copying
> -----Original Message-----
> From: James Harper [mailto:james.harper@bendigoit.com.au]
> Sent: 26 March 2013 11:29
> To: Paul Durrant; Wei Liu; David Vrabel
> Cc: Ian Campbell; Wei Liu; netdev@vger.kernel.org;
> konrad.wilk@oracle.com; xen-devel@lists.xen.org; annie.li@oracle.com
> Subject: RE: [Xen-devel] [PATCH 5/6] xen-netback: coalesce slots before
> copying
>
> > > As stated previously, I've observed Windows issuing staggering
> > > numbers of buffers to NDIS miniport drivers, so you will need to
> > > coalesce in a Windows driver anyway. I'm not sure what the
> > > break-even point is, but I think it's safe to say that in the
> > > choice between using 1000 (worst case) ring slots (with the
> > > resulting mapping overheads) and coalescing in the frontend,
> > > coalescing is going to be the better option.
> > >
> >
> > Oh quite, if the backend is mapping and not copying then coalescing
> > in the frontend is the right way to go. I guess coalescing once the
> > frag count reaches a full ring count is probably necessary (since we
> > can't push a partial packet), but it would be nice not to have to do
> > it if the backend is going to copy anyway.
> >
>
> For a 9k packet with 100 frags (not a common case, but an example),
> what is the cost of mapping those 100 frags into the backend vs
> coalescing to three pages in the frontend and mapping those?
>
> I may be misremembering, but wasn't there a patch floating around for
> persistent mapping to avoid some of this overhead? (Not applicable
> here, but I thought it meant that the cost wasn't insignificant.)
>

The current version of netback does not map; it always grant-copies.

  Paul
David Vrabel
2013-Mar-26 18:02 UTC
Re: [PATCH 5/6] xen-netback: coalesce slots before copying
On 25/03/13 11:08, Wei Liu wrote:
> This patch tries to coalesce tx requests when constructing grant copy
> structures. It enables netback to deal with the situation when the
> frontend's MAX_SKB_FRAGS is larger than the backend's MAX_SKB_FRAGS.
>
> It defines max_skb_slots, which is an estimation of the maximum number
> of slots a guest can send; anything bigger than that is considered
> malicious. Now it is set to 20, which should be enough to accommodate
> Linux (16 to 19).
>
> Also change variable name from "frags" to "slots" in
> netbk_count_requests.

It is worth summarizing an (off-line) discussion I had with Wei on this
patch.

There are two regressions that need to be addressed here.

1. The reduction of the number of supported ring entries (slots) per
   packet (from 18 to 17).

2. The XSA-39 security fix turning "too many frags" errors from just
   dropping the packet into a fatal error that disables the VIF.

The key root cause of the problem is that the protocol is poorly
specified, using a property external to the netback/netfront drivers
(i.e., MAX_SKB_FRAGS).

A secondary problem is that some frontends have used a max slots per
packet value that is larger than netback has supported. E.g., the
Windows GPLPV drivers use up to 19. These packets have always been
dropped.

The first step is to properly specify the maximum slots per packet as
part of the interface. This should be specified as 18 (the historical
value).

The second step is to define a threshold for slots per packet, above
which the guest is considered to be malicious and the error is fatal.
20 seems a sensible value here.

The behavior of netback for a packet is thus:

  1-18 slots:  valid
  19-20 slots: drop and respond with an error
  21+ slots:   fatal error

Note that we do not make 19-20 valid, as this is a change to the
protocol and guests may end up relying on it, which would then break
the guests if they migrate to or start on a host with a limit of 18.

A third (and future) step would be to investigate whether increasing
the slots-per-packet limit is sensible. There would then need to be a
mechanism to negotiate this limit between the front and back ends.

David
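The three bands David summarizes translate directly into a check like
the following sketch; the constant and function names are illustrative,
assuming netback's existing netbk_fatal_tx_err() error path:

#include <linux/errno.h>

#define SLOTS_VALID_MAX	18	/* historical MAX_SKB_FRAGS */
#define SLOTS_DROP_MAX	20	/* beyond this the guest is malicious */

static int check_slot_count(struct xenvif *vif, int slots)
{
	if (slots <= SLOTS_VALID_MAX)
		return 0;		/* 1-18: process normally */

	if (slots <= SLOTS_DROP_MAX)
		return -EIO;		/* 19-20: drop, respond with an error */

	netbk_fatal_tx_err(vif);	/* 21+: fatal, disable the VIF */
	return -E2BIG;
}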
Ian Campbell
2013-Apr-09 15:10 UTC
Re: [PATCH 5/6] xen-netback: coalesce slots before copying
On Mon, 2013-03-25 at 16:57 +0000, Konrad Rzeszutek Wilk wrote:
> On Mon, Mar 25, 2013 at 11:08:21AM +0000, Wei Liu wrote:
> > This patch tries to coalesce tx requests when constructing grant copy
> > structures. It enables netback to deal with the situation when the
> > frontend's MAX_SKB_FRAGS is larger than the backend's MAX_SKB_FRAGS.
> >
> > It defines max_skb_slots, which is an estimation of the maximum
> > number of slots a guest can send; anything bigger than that is
> > considered malicious. Now it is set to 20, which should be enough to
> > accommodate Linux (16 to 19).
> >
> > Also change variable name from "frags" to "slots" in
> > netbk_count_requests.
>
> This should probably also CC stable@vger.kernel.org

DaveM prefers that net patches not CC stable directly; he takes care of
forwarding patches once he is happy (i.e. after they've been in
his/Linus' tree for a bit).

Ian