This patch tries to coalesce tx requests when constructing grant copy
structures. It enables netback to deal with the situation when the
frontend's MAX_SKB_FRAGS is larger than the backend's MAX_SKB_FRAGS.

It defines max_skb_slots, which is an estimation of the maximum number
of slots a guest can send; anything bigger than that is considered
malicious. Now it is set to 20, which should be enough to accommodate
Linux (16 to 19).

Also change the variable name from "frags" to "slots" in
netbk_count_requests.

Signed-off-by: Wei Liu <wei.liu2@citrix.com>
---
 drivers/net/xen-netback/netback.c | 220 ++++++++++++++++++++++++++++---------
 1 file changed, 171 insertions(+), 49 deletions(-)

diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 6e8e51a..a634dc5 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -47,11 +47,26 @@
 #include <asm/xen/hypercall.h>
 #include <asm/xen/page.h>
 
+/*
+ * This is an estimation of the maximum possible frags a SKB might
+ * have, anything larger than this is considered malicious. Typically
+ * Linux has 16 to 19.
+ */
+#define MAX_SKB_SLOTS_DEFAULT 20
+static unsigned int max_skb_slots = MAX_SKB_SLOTS_DEFAULT;
+module_param(max_skb_slots, uint, 0444);
+
+typedef unsigned int pending_ring_idx_t;
+#define INVALID_PENDING_RING_IDX (~0U)
+
 struct pending_tx_info {
-        struct xen_netif_tx_request req;
+        struct xen_netif_tx_request req; /* coalesced tx request */
         struct xenvif *vif;
+        pending_ring_idx_t head; /* head != INVALID_PENDING_RING_IDX
+                                  * if it is head of several merged
+                                  * tx reqs
+                                  */
 };
-typedef unsigned int pending_ring_idx_t;
 
 struct netbk_rx_meta {
         int id;
@@ -102,7 +117,11 @@ struct xen_netbk {
         atomic_t netfront_count;
 
         struct pending_tx_info pending_tx_info[MAX_PENDING_REQS];
-        struct gnttab_copy tx_copy_ops[MAX_PENDING_REQS];
+        /* Coalescing tx requests before copying makes number of grant
+         * copy ops greater or equal to number of slots required. In
+         * worst case a tx request consumes 2 gnttab_copy.
+         */
+        struct gnttab_copy tx_copy_ops[2*MAX_PENDING_REQS];
 
         u16 pending_ring[MAX_PENDING_REQS];
 
@@ -251,7 +270,7 @@ static int max_required_rx_slots(struct xenvif *vif)
         int max = DIV_ROUND_UP(vif->dev->mtu, PAGE_SIZE);
 
         if (vif->can_sg || vif->gso || vif->gso_prefix)
-                max += MAX_SKB_FRAGS + 1; /* extra_info + frags */
+                max += max_skb_slots + 1; /* extra_info + frags */
 
         return max;
 }
@@ -657,7 +676,7 @@ static void xen_netbk_rx_action(struct xen_netbk *netbk)
                 __skb_queue_tail(&rxq, skb);
 
                 /* Filled the batch queue? */
-                if (count + MAX_SKB_FRAGS >= XEN_NETIF_RX_RING_SIZE)
+                if (count + max_skb_slots >= XEN_NETIF_RX_RING_SIZE)
                         break;
         }
 
@@ -908,34 +927,34 @@ static int netbk_count_requests(struct xenvif *vif,
                                int work_to_do)
 {
         RING_IDX cons = vif->tx.req_cons;
-        int frags = 0;
+        int slots = 0;
 
         if (!(first->flags & XEN_NETTXF_more_data))
                 return 0;
 
         do {
-                if (frags >= work_to_do) {
-                        netdev_err(vif->dev, "Need more frags\n");
+                if (slots >= work_to_do) {
+                        netdev_err(vif->dev, "Need more slots\n");
                         netbk_fatal_tx_err(vif);
                         return -ENODATA;
                 }
 
-                if (unlikely(frags >= MAX_SKB_FRAGS)) {
-                        netdev_err(vif->dev, "Too many frags\n");
+                if (unlikely(slots >= max_skb_slots)) {
+                        netdev_err(vif->dev, "Too many slots\n");
                         netbk_fatal_tx_err(vif);
                         return -E2BIG;
                 }
 
-                memcpy(txp, RING_GET_REQUEST(&vif->tx, cons + frags),
+                memcpy(txp, RING_GET_REQUEST(&vif->tx, cons + slots),
                        sizeof(*txp));
                 if (txp->size > first->size) {
-                        netdev_err(vif->dev, "Frag is bigger than frame.\n");
+                        netdev_err(vif->dev, "Packet is bigger than frame.\n");
                         netbk_fatal_tx_err(vif);
                         return -EIO;
                 }
 
                 first->size -= txp->size;
-                frags++;
+                slots++;
 
                 if (unlikely((txp->offset + txp->size) > PAGE_SIZE)) {
                         netdev_err(vif->dev, "txp->offset: %x, size: %u\n",
@@ -944,7 +963,7 @@ static int netbk_count_requests(struct xenvif *vif,
                         return -EINVAL;
                 }
         } while ((txp++)->flags & XEN_NETTXF_more_data);
-        return frags;
+        return slots;
 }
 
 static struct page *xen_netbk_alloc_page(struct xen_netbk *netbk,
@@ -968,48 +987,114 @@ static struct gnttab_copy *xen_netbk_get_requests(struct xen_netbk *netbk,
         struct skb_shared_info *shinfo = skb_shinfo(skb);
         skb_frag_t *frags = shinfo->frags;
         u16 pending_idx = *((u16 *)skb->data);
-        int i, start;
+        u16 head_idx = 0;
+        int slot, start;
+        struct page *page;
+        pending_ring_idx_t index, start_idx = 0;
+        uint16_t dst_offset;
+        unsigned int nr_slots;
+        struct pending_tx_info *first = NULL;
+
+        /* At this point shinfo->nr_frags is in fact the number of
+         * slots, which can be as large as max_skb_slots.
+         */
+        nr_slots = shinfo->nr_frags;
 
         /* Skip first skb fragment if it is on same page as header fragment. */
         start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
 
-        for (i = start; i < shinfo->nr_frags; i++, txp++) {
-                struct page *page;
-                pending_ring_idx_t index;
+        /* Coalesce tx requests, at this point the packet passed in
+         * should be <= 64K. Any packets larger than 64K have been
+         * handled in netbk_count_requests().
+         */
+        for (shinfo->nr_frags = slot = start; slot < nr_slots;
+             shinfo->nr_frags++) {
                 struct pending_tx_info *pending_tx_info =
                         netbk->pending_tx_info;
 
-                index = pending_index(netbk->pending_cons++);
-                pending_idx = netbk->pending_ring[index];
-                page = xen_netbk_alloc_page(netbk, pending_idx);
+                page = alloc_page(GFP_KERNEL|__GFP_COLD);
                 if (!page)
                         goto err;
 
-                gop->source.u.ref = txp->gref;
-                gop->source.domid = vif->domid;
-                gop->source.offset = txp->offset;
-
-                gop->dest.u.gmfn = virt_to_mfn(page_address(page));
-                gop->dest.domid = DOMID_SELF;
-                gop->dest.offset = txp->offset;
-
-                gop->len = txp->size;
-                gop->flags = GNTCOPY_source_gref;
+                dst_offset = 0;
+                first = NULL;
+                while (dst_offset < PAGE_SIZE && slot < nr_slots) {
+                        gop->flags = GNTCOPY_source_gref;
+
+                        gop->source.u.ref = txp->gref;
+                        gop->source.domid = vif->domid;
+                        gop->source.offset = txp->offset;
+
+                        gop->dest.domid = DOMID_SELF;
+
+                        gop->dest.offset = dst_offset;
+                        gop->dest.u.gmfn = virt_to_mfn(page_address(page));
+
+                        if (dst_offset + txp->size > PAGE_SIZE) {
+                                /* This page can only merge a portion
+                                 * of tx request. Do not increment any
+                                 * pointer / counter here. The txp
+                                 * will be dealt with in future
+                                 * rounds, eventually hitting the
+                                 * `else` branch.
+                                 */
+                                gop->len = PAGE_SIZE - dst_offset;
+                                txp->offset += gop->len;
+                                txp->size -= gop->len;
+                                dst_offset += gop->len; /* quit loop */
+                        } else {
+                                /* This tx request can be merged in the page */
+                                gop->len = txp->size;
+                                dst_offset += gop->len;
+
+                                index = pending_index(netbk->pending_cons++);
+
+                                pending_idx = netbk->pending_ring[index];
+
+                                memcpy(&pending_tx_info[pending_idx].req, txp,
+                                       sizeof(*txp));
+                                xenvif_get(vif);
+
+                                pending_tx_info[pending_idx].vif = vif;
+
+                                /* Poison these fields, corresponding
+                                 * fields for head tx req will be set
+                                 * to correct values after the loop.
+                                 */
+                                netbk->mmap_pages[pending_idx] = (void *)(~0UL);
+                                pending_tx_info[pending_idx].head =
+                                        INVALID_PENDING_RING_IDX;
+
+                                if (unlikely(!first)) {
+                                        first = &pending_tx_info[pending_idx];
+                                        start_idx = index;
+                                        head_idx = pending_idx;
+                                }
+
+                                txp++;
+                                slot++;
+                        }
 
-                gop++;
+                        gop++;
+                }
 
-                memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp));
-                xenvif_get(vif);
-                pending_tx_info[pending_idx].vif = vif;
-                frag_set_pending_idx(&frags[i], pending_idx);
+                first->req.offset = 0;
+                first->req.size = dst_offset;
+                first->head = start_idx;
+                set_page_ext(page, netbk, head_idx);
+                netbk->mmap_pages[head_idx] = page;
+                frag_set_pending_idx(&frags[shinfo->nr_frags], head_idx);
         }
 
+        BUG_ON(shinfo->nr_frags > MAX_SKB_FRAGS);
+
         return gop;
 err:
         /* Unwind, freeing all pages and sending error responses. */
-        while (i-- > start) {
-                xen_netbk_idx_release(netbk, frag_get_pending_idx(&frags[i]),
-                                      XEN_NETIF_RSP_ERROR);
+        while (shinfo->nr_frags-- > start) {
+                xen_netbk_idx_release(netbk,
+                                frag_get_pending_idx(&frags[shinfo->nr_frags]),
+                                XEN_NETIF_RSP_ERROR);
         }
         /* The head too, if necessary. */
         if (start)
@@ -1025,8 +1110,10 @@ static int xen_netbk_tx_check_gop(struct xen_netbk *netbk,
         struct gnttab_copy *gop = *gopp;
         u16 pending_idx = *((u16 *)skb->data);
         struct skb_shared_info *shinfo = skb_shinfo(skb);
+        struct pending_tx_info *tx_info;
         int nr_frags = shinfo->nr_frags;
         int i, err, start;
+        u16 peek; /* peek into next tx request */
 
         /* Check status of header. */
         err = gop->status;
@@ -1038,11 +1125,21 @@ static int xen_netbk_tx_check_gop(struct xen_netbk *netbk,
 
         for (i = start; i < nr_frags; i++) {
                 int j, newerr;
+                pending_ring_idx_t head;
 
                 pending_idx = frag_get_pending_idx(&shinfo->frags[i]);
+                tx_info = &netbk->pending_tx_info[pending_idx];
+                head = tx_info->head;
 
                 /* Check error status: if okay then remember grant handle. */
-                newerr = (++gop)->status;
+                do {
+                        newerr = (++gop)->status;
+                        if (newerr)
+                                break;
+                        peek = netbk->pending_ring[pending_index(++head)];
+                } while (netbk->pending_tx_info[peek].head
+                         == INVALID_PENDING_RING_IDX);
+
                 if (likely(!newerr)) {
                         /* Had a previous error? Invalidate this fragment. */
                         if (unlikely(err))
@@ -1267,11 +1364,11 @@ static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk)
         struct sk_buff *skb;
         int ret;
 
-        while (((nr_pending_reqs(netbk) + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
+        while (((nr_pending_reqs(netbk) + max_skb_slots) < MAX_PENDING_REQS) &&
                 !list_empty(&netbk->net_schedule_list)) {
                 struct xenvif *vif;
                 struct xen_netif_tx_request txreq;
-                struct xen_netif_tx_request txfrags[MAX_SKB_FRAGS];
+                struct xen_netif_tx_request txfrags[max_skb_slots];
                 struct page *page;
                 struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX-1];
                 u16 pending_idx;
@@ -1359,7 +1456,7 @@ static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk)
                 pending_idx = netbk->pending_ring[index];
 
                 data_len = (txreq.size > PKT_PROT_LEN &&
-                            ret < MAX_SKB_FRAGS) ?
+                            ret < max_skb_slots) ?
                         PKT_PROT_LEN : txreq.size;
 
                 skb = alloc_skb(data_len + NET_SKB_PAD + NET_IP_ALIGN,
@@ -1409,6 +1506,7 @@ static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk)
                 memcpy(&netbk->pending_tx_info[pending_idx].req,
                        &txreq, sizeof(txreq));
                 netbk->pending_tx_info[pending_idx].vif = vif;
+                netbk->pending_tx_info[pending_idx].head = index;
                 *((u16 *)skb->data) = pending_idx;
 
                 __skb_put(skb, data_len);
@@ -1539,7 +1637,10 @@ static void xen_netbk_idx_release(struct xen_netbk *netbk, u16 pending_idx,
 {
         struct xenvif *vif;
         struct pending_tx_info *pending_tx_info;
-        pending_ring_idx_t index;
+        pending_ring_idx_t head;
+        u16 peek; /* peek into next tx request */
+
+        BUG_ON(netbk->mmap_pages[pending_idx] == (void *)(~0UL));
 
         /* Already complete? */
         if (netbk->mmap_pages[pending_idx] == NULL)
@@ -1548,13 +1649,34 @@ static void xen_netbk_idx_release(struct xen_netbk *netbk, u16 pending_idx,
         pending_tx_info = &netbk->pending_tx_info[pending_idx];
 
         vif = pending_tx_info->vif;
+        head = pending_tx_info->head;
 
-        make_tx_response(vif, &pending_tx_info->req, status);
+        BUG_ON(head == INVALID_PENDING_RING_IDX);
+        BUG_ON(netbk->pending_ring[pending_index(head)] != pending_idx);
 
-        index = pending_index(netbk->pending_prod++);
-        netbk->pending_ring[index] = pending_idx;
+        do {
+                pending_ring_idx_t index;
+                pending_ring_idx_t idx = pending_index(head);
+                u16 info_idx = netbk->pending_ring[idx];
 
-        xenvif_put(vif);
+                pending_tx_info = &netbk->pending_tx_info[info_idx];
+                make_tx_response(vif, &pending_tx_info->req, status);
+
+                /* Setting any number other than
+                 * INVALID_PENDING_RING_IDX indicates this slot is
+                 * starting a new packet / ending a previous packet.
+                 */
+                pending_tx_info->head = 0;
+
+                index = pending_index(netbk->pending_prod++);
+                netbk->pending_ring[index] = netbk->pending_ring[info_idx];
+
+                xenvif_put(vif);
+
+                peek = netbk->pending_ring[pending_index(++head)];
+
+        } while (netbk->pending_tx_info[peek].head
+                 == INVALID_PENDING_RING_IDX);
 
         netbk->mmap_pages[pending_idx]->mapping = 0;
         put_page(netbk->mmap_pages[pending_idx]);
@@ -1613,7 +1735,7 @@ static inline int rx_work_todo(struct xen_netbk *netbk)
 static inline int tx_work_todo(struct xen_netbk *netbk)
 {
 
-        if (((nr_pending_reqs(netbk) + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
+        if (((nr_pending_reqs(netbk) + max_skb_slots) < MAX_PENDING_REQS) &&
             !list_empty(&netbk->net_schedule_list))
                 return 1;
 
-- 
1.7.10.4
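For illustration, here is a minimal userspace sketch (not kernel code;
the slot sizes are made up) of the coalescing arithmetic the patch
introduces: slots are packed into PAGE_SIZE destination pages, and a
slot that straddles a page boundary is split into two copy operations,
which is why tx_copy_ops is sized 2*MAX_PENDING_REQS.

#include <stdio.h>

#define PAGE_SIZE 4096u

/* count grant-copy ops needed to pack nr_slots slots into pages */
static unsigned int count_copy_ops(const unsigned int *slot_size,
                                   int nr_slots)
{
        unsigned int ops = 0, dst_offset = 0, size;
        int slot = 0;

        size = slot_size[0];
        while (slot < nr_slots) {
                if (dst_offset == PAGE_SIZE)
                        dst_offset = 0;         /* open a new page */
                if (dst_offset + size > PAGE_SIZE) {
                        /* first op: copy up to the page boundary */
                        size -= PAGE_SIZE - dst_offset;
                        dst_offset = PAGE_SIZE;
                } else {
                        /* second (or only) op: the remainder fits */
                        dst_offset += size;
                        if (++slot < nr_slots)
                                size = slot_size[slot];
                }
                ops++;                  /* one gnttab_copy per step */
        }
        return ops;
}

int main(void)
{
        unsigned int slots[] = { 4000, 4000, 200 };     /* hypothetical */

        /* prints 5: each slot needs at most 2 ops, never more */
        printf("%u copy ops for 3 slots\n", count_copy_ops(slots, 3));
        return 0;
}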
David Vrabel
2013-Mar-25 15:13 UTC
Re: [PATCH 5/6] xen-netback: coalesce slots before copying
On 25/03/13 11:08, Wei Liu wrote:
> This patch tries to coalesce tx requests when constructing grant copy
> structures. It enables netback to deal with the situation when the
> frontend's MAX_SKB_FRAGS is larger than the backend's MAX_SKB_FRAGS.
>
> It defines max_skb_slots, which is an estimation of the maximum number
> of slots a guest can send; anything bigger than that is considered
> malicious. Now it is set to 20, which should be enough to accommodate
> Linux (16 to 19).

This maximum needs to be defined as part of the protocol and added to
the interface header.

> Also change the variable name from "frags" to "slots" in
> netbk_count_requests.

I think this renaming should have been done as a separate patch.

> --- a/drivers/net/xen-netback/netback.c
> +++ b/drivers/net/xen-netback/netback.c
> @@ -47,11 +47,26 @@
>  #include <asm/xen/hypercall.h>
>  #include <asm/xen/page.h>
>
> +/*
> + * This is an estimation of the maximum possible frags a SKB might
> + * have, anything larger than this is considered malicious. Typically
> + * Linux has 16 to 19.
> + */

Do you mean "max possible /slots/" a packet might have?

> @@ -968,48 +987,114 @@ static struct gnttab_copy *xen_netbk_get_requests(struct xen_netbk *netbk,
[...]
> +                                /* Poison these fields, corresponding
> +                                 * fields for head tx req will be set
> +                                 * to correct values after the loop.
> +                                 */
> +                                netbk->mmap_pages[pending_idx] = (void *)(~0UL);
> +                                pending_tx_info[pending_idx].head =
> +                                        INVALID_PENDING_RING_IDX;

Do you need to poison both values?

> +
> +                                if (unlikely(!first)) {

This isn't unlikely, is it?

> +                                        first = &pending_tx_info[pending_idx];
> +                                        start_idx = index;
> +                                        head_idx = pending_idx;
> +                                }

Instead of setting first here why not move the code below here?

> +                first->req.offset = 0;
> +                first->req.size = dst_offset;
> +                first->head = start_idx;
> +                set_page_ext(page, netbk, head_idx);
> +                netbk->mmap_pages[head_idx] = page;
> +                frag_set_pending_idx(&frags[shinfo->nr_frags], head_idx);

> @@ -1548,13 +1649,34 @@ static void xen_netbk_idx_release(struct xen_netbk *netbk, u16 pending_idx,
[...]
> +                /* Setting any number other than
> +                 * INVALID_PENDING_RING_IDX indicates this slot is
> +                 * starting a new packet / ending a previous packet.
> +                 */
> +                pending_tx_info->head = 0;

This doesn't look needed. It will be initialized again when reusing
this pending_tx_info again, right?

> +                index = pending_index(netbk->pending_prod++);
> +                netbk->pending_ring[index] = netbk->pending_ring[info_idx];
> +
> +                xenvif_put(vif);
> +
> +                peek = netbk->pending_ring[pending_index(++head)];
> +
> +        } while (netbk->pending_tx_info[peek].head
> +                 == INVALID_PENDING_RING_IDX);
>
>          netbk->mmap_pages[pending_idx]->mapping = 0;
>          put_page(netbk->mmap_pages[pending_idx]);

David
On Mon, Mar 25, 2013 at 3:13 PM, David Vrabel <david.vrabel@citrix.com> wrote:
> On 25/03/13 11:08, Wei Liu wrote:
>> This patch tries to coalesce tx requests when constructing grant copy
>> structures. It enables netback to deal with the situation when the
>> frontend's MAX_SKB_FRAGS is larger than the backend's MAX_SKB_FRAGS.
>>
>> It defines max_skb_slots, which is an estimation of the maximum number
>> of slots a guest can send; anything bigger than that is considered
>> malicious. Now it is set to 20, which should be enough to accommodate
>> Linux (16 to 19).
>
> This maximum needs to be defined as part of the protocol and added to
> the interface header.
>

No, this is not part of the protocol and not a hard limit. It is
configurable by the system administrator.

>> Also change the variable name from "frags" to "slots" in
>> netbk_count_requests.
>
> I think this renaming should have been done as a separate patch.
>

The frag -> slot change only makes sense after this patch. If I did
that, I would need to partially revert what I've done in a previous
patch (like the frag -> slot change in the comment you pointed out
below). :-(

>> --- a/drivers/net/xen-netback/netback.c
>> +++ b/drivers/net/xen-netback/netback.c
>> @@ -47,11 +47,26 @@
>>  #include <asm/xen/hypercall.h>
>>  #include <asm/xen/page.h>
>>
>> +/*
>> + * This is an estimation of the maximum possible frags a SKB might
>> + * have, anything larger than this is considered malicious. Typically
>> + * Linux has 16 to 19.
>> + */
>
> Do you mean "max possible /slots/" a packet might have?
>

Yes.

>> @@ -968,48 +987,114 @@ static struct gnttab_copy *xen_netbk_get_requests(struct xen_netbk *netbk,
> [...]
>> +                                /* Poison these fields, corresponding
>> +                                 * fields for head tx req will be set
>> +                                 * to correct values after the loop.
>> +                                 */
>> +                                netbk->mmap_pages[pending_idx] = (void *)(~0UL);
>> +                                pending_tx_info[pending_idx].head =
>> +                                        INVALID_PENDING_RING_IDX;
>
> Do you need to poison both values?
>

Just for safety. I have BUG_ONs in the release path to check for
possible errors.

>> +
>> +                                if (unlikely(!first)) {
>
> This isn't unlikely, is it?
>

For a big packet the chance is 1 in max_skb_slots, so 5% (1/20) in the
default case.

>> +                                        first = &pending_tx_info[pending_idx];
>> +                                        start_idx = index;
>> +                                        head_idx = pending_idx;
>> +                                }
>
> Instead of setting first here why not move the code below here?
>

Because first->req.size needs to be set to dst_offset, it has to be
here anyway. So I put the other fields here as well.

>> +                first->req.offset = 0;
>> +                first->req.size = dst_offset;
>> +                first->head = start_idx;
>> +                set_page_ext(page, netbk, head_idx);
>> +                netbk->mmap_pages[head_idx] = page;
>> +                frag_set_pending_idx(&frags[shinfo->nr_frags], head_idx);
>
>> @@ -1548,13 +1649,34 @@ static void xen_netbk_idx_release(struct xen_netbk *netbk, u16 pending_idx,
> [...]
>> +                /* Setting any number other than
>> +                 * INVALID_PENDING_RING_IDX indicates this slot is
>> +                 * starting a new packet / ending a previous packet.
>> +                 */
>> +                pending_tx_info->head = 0;
>
> This doesn't look needed. It will be initialized again when reusing
> this pending_tx_info again, right?
>

Yes, it is needed. Otherwise netback responds to invalid tx_info and
causes netfront to crash before coming into the re-initialization path.

Wei.
David Miller
2013-Mar-25 16:20 UTC
Re: [PATCH 5/6] xen-netback: coalesce slots before copying
From: Wei Liu <wei.liu2@citrix.com>
Date: Mon, 25 Mar 2013 11:08:21 +0000

> This patch tries to coalesce tx requests when constructing grant copy
> structures. It enables netback to deal with the situation when the
> frontend's MAX_SKB_FRAGS is larger than the backend's MAX_SKB_FRAGS.
>
> It defines max_skb_slots, which is an estimation of the maximum number
> of slots a guest can send; anything bigger than that is considered
> malicious. Now it is set to 20, which should be enough to accommodate
> Linux (16 to 19).
>
> Also change the variable name from "frags" to "slots" in
> netbk_count_requests.
>
> Signed-off-by: Wei Liu <wei.liu2@citrix.com>

Please resubmit after you've incorporated the feedback you've received.
David Vrabel
2013-Mar-25 16:34 UTC
Re: [PATCH 5/6] xen-netback: coalesce slots before copying
On 25/03/13 15:47, Wei Liu wrote:
> On Mon, Mar 25, 2013 at 3:13 PM, David Vrabel <david.vrabel@citrix.com> wrote:
>> On 25/03/13 11:08, Wei Liu wrote:
>>> This patch tries to coalesce tx requests when constructing grant copy
>>> structures. It enables netback to deal with the situation when the
>>> frontend's MAX_SKB_FRAGS is larger than the backend's MAX_SKB_FRAGS.
>>>
>>> It defines max_skb_slots, which is an estimation of the maximum number
>>> of slots a guest can send; anything bigger than that is considered
>>> malicious. Now it is set to 20, which should be enough to accommodate
>>> Linux (16 to 19).
>>
>> This maximum needs to be defined as part of the protocol and added to
>> the interface header.
>>
>
> No, this is not part of the protocol and not a hard limit. It is
> configurable by the system administrator.

There is no mechanism by which the front and back ends can negotiate
this value, so it does need to be a fixed value that is equal to or
greater than the max from any front or back end that has ever existed.

The reason for this patch is that this wasn't properly specified and
changes outside of netback broke the protocol.

>>> +
>>> +                                if (unlikely(!first)) {
>>
>> This isn't unlikely, is it?
>>
>
> For a big packet the chance is 1 in max_skb_slots, so 5% (1/20) in the
> default case.

I don't understand your reasoning here. The "if (!first)" branch is
taken once per page. It will be 100% if each slot goes into its own
page and only 5% if the packet is less than PAGE_SIZE in length but
split into 20 slots.

>>> +                                        first = &pending_tx_info[pending_idx];
>>> +                                        start_idx = index;
>>> +                                        head_idx = pending_idx;
>>> +                                }
>>
>> Instead of setting first here why not move the code below here?
>>
>
> Because first->req.size needs to be set to dst_offset, it has to be
> here anyway. So I put the other fields here as well.

Hmmm, I'd probably accumulate first->req.size, but ok. It was a minor
point anyway.

>>> +                first->req.offset = 0;
>>> +                first->req.size = dst_offset;
>>> +                first->head = start_idx;
>>> +                set_page_ext(page, netbk, head_idx);
>>> +                netbk->mmap_pages[head_idx] = page;
>>> +                frag_set_pending_idx(&frags[shinfo->nr_frags], head_idx);
>>
>>> @@ -1548,13 +1649,34 @@ static void xen_netbk_idx_release(struct xen_netbk *netbk, u16 pending_idx,
>> [...]
>>> +                /* Setting any number other than
>>> +                 * INVALID_PENDING_RING_IDX indicates this slot is
>>> +                 * starting a new packet / ending a previous packet.
>>> +                 */
>>> +                pending_tx_info->head = 0;
>>
>> This doesn't look needed. It will be initialized again when reusing
>> this pending_tx_info again, right?
>>
>
> Yes, it is needed. Otherwise netback responds to invalid tx_info and
> causes netfront to crash before coming into the re-initialization path.

Maybe I'm missing something, but this is after the make_tx_response()
call, and immediately after this the pending_tx_info is returned to the
pending ring as free.

David
Konrad Rzeszutek Wilk
2013-Mar-25 16:57 UTC
Re: [PATCH 5/6] xen-netback: coalesce slots before copying
On Mon, Mar 25, 2013 at 11:08:21AM +0000, Wei Liu wrote:
> This patch tries to coalesce tx requests when constructing grant copy
> structures. It enables netback to deal with the situation when the
> frontend's MAX_SKB_FRAGS is larger than the backend's MAX_SKB_FRAGS.
>
> It defines max_skb_slots, which is an estimation of the maximum number
> of slots a guest can send; anything bigger than that is considered
> malicious. Now it is set to 20, which should be enough to accommodate
> Linux (16 to 19).
>
> Also change the variable name from "frags" to "slots" in
> netbk_count_requests.
>

This should probably also CC stable@vger.kernel.org

> Signed-off-by: Wei Liu <wei.liu2@citrix.com>
> ---
>  drivers/net/xen-netback/netback.c | 220 ++++++++++++++++++++++++++++---------
>  1 file changed, 171 insertions(+), 49 deletions(-)

[The rest of the patch was quoted in full without further comments;
see the original posting above.]
On Mon, Mar 25, 2013 at 4:34 PM, David Vrabel <david.vrabel@citrix.com> wrote:
> On 25/03/13 15:47, Wei Liu wrote:
>> On Mon, Mar 25, 2013 at 3:13 PM, David Vrabel <david.vrabel@citrix.com> wrote:
>>> On 25/03/13 11:08, Wei Liu wrote:
>>>> This patch tries to coalesce tx requests when constructing grant copy
>>>> structures. It enables netback to deal with the situation when the
>>>> frontend's MAX_SKB_FRAGS is larger than the backend's MAX_SKB_FRAGS.
>>>>
>>>> It defines max_skb_slots, which is an estimation of the maximum number
>>>> of slots a guest can send; anything bigger than that is considered
>>>> malicious. Now it is set to 20, which should be enough to accommodate
>>>> Linux (16 to 19).
>>>
>>> This maximum needs to be defined as part of the protocol and added to
>>> the interface header.
>>>
>>
>> No, this is not part of the protocol and not a hard limit. It is
>> configurable by the system administrator.
>
> There is no mechanism by which the front and back ends can negotiate
> this value, so it does need to be a fixed value that is equal to or
> greater than the max from any front or back end that has ever existed.
>

Are you suggesting moving the default macro value to the header file?
It is just an estimation; I have no knowledge of the accurate maximum
value, so I think making it part of the protocol is a bad idea. Do you
have a handle on the maximum value?

> The reason for this patch is that this wasn't properly specified and
> changes outside of netback broke the protocol.
>
>>>> +
>>>> +                                if (unlikely(!first)) {
>>>
>>> This isn't unlikely, is it?
>>>
>>
>> For a big packet the chance is 1 in max_skb_slots, so 5% (1/20) in the
>> default case.
>
> I don't understand your reasoning here. The "if (!first)" branch is
> taken once per page. It will be 100% if each slot goes into its own
> page and only 5% if the packet is less than PAGE_SIZE in length but
> split into 20 slots.
>

My mistake. It should be: a small packet split into multiple slots.

>>> [...]
>>>> +                /* Setting any number other than
>>>> +                 * INVALID_PENDING_RING_IDX indicates this slot is
>>>> +                 * starting a new packet / ending a previous packet.
>>>> +                 */
>>>> +                pending_tx_info->head = 0;
>>>
>>> This doesn't look needed. It will be initialized again when reusing
>>> this pending_tx_info again, right?
>>>
>>
>> Yes, it is needed. Otherwise netback responds to invalid tx_info and
>> causes netfront to crash before coming into the re-initialization path.
>
> Maybe I'm missing something, but this is after the make_tx_response()
> call, and immediately after this the pending_tx_info is returned to
> the pending ring as free.
>

So it is a bit tricky here. Let me clarify: the head field is used to
indicate the start of a new tx request queue and the end of the
previous queue.

Imagine a sequence of head fields (I = INVALID_PENDING_RING_IDX below),
where a number is the starting index in the pending ring:

.... 0 I I I 5 I I ...

Consuming all the tx_info but not setting I to 0 (or any number other
than I) leaves the sequence the same as before. Then a subsequent call
to process the next SKB, which has 3 extra slots, makes the sequence
look like

.... 8 I I I I I I ...

but in fact the correct sequence should be

.... 8 I I I 0 I I ...

The wrong sequence makes netbk_idx_release respond to more slots than
required, which causes netfront to crash miserably.

Wei.

> David
On Mon, Mar 25, 2013 at 4:58 PM, Wei Liu <liuw@liuw.name> wrote:
>
> So it is a bit tricky here. Let me clarify: the head field is used to
> indicate the start of a new tx request queue and the end of the
> previous queue.
>
> Imagine a sequence of head fields (I = INVALID_PENDING_RING_IDX below),
> where a number is the starting index in the pending ring:
>
> .... 0 I I I 5 I I ...
>
> Consuming all the tx_info but not setting I to 0 (or any number other
> than I) leaves the sequence the same as before. Then a subsequent call
> to process the next SKB, which has 3 extra slots, makes the

Sorry, should be "4 extra slots", and the sequence looks like

.... 8 I I I I I I ...

but in fact the correct sequence should be

.... 8 I I I I 0 0 ...

Wei.
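To make the head-marking scheme above concrete, here is a small
userspace model (an illustration, not the netback code): ring slots
belonging to one coalesced packet carry INVALID in their head field,
and the release path walks forward until it meets a slot whose head is
valid, which is why head must be reset to a valid value on release.

#include <stdio.h>

#define RING 8
#define INVALID (~0u)

static unsigned int head[RING];

/* respond to every slot of the packet whose head slot is 'start' */
static void release(unsigned int start)
{
        unsigned int i = start;

        do {
                printf("respond slot %u\n", i % RING);
                head[i % RING] = 0;     /* no longer INVALID: ends the walk */
                i++;
        } while (head[i % RING] == INVALID);
}

int main(void)
{
        unsigned int i;

        /* two packets: slots 0-3 (head 0) and slots 4-6 (head 4) */
        for (i = 0; i < RING; i++)
                head[i] = INVALID;
        head[0] = 0;
        head[4] = 4;
        head[7] = 7;            /* next packet's head, terminates 2nd walk */

        release(0);             /* responds to slots 0,1,2,3 only */
        release(4);             /* responds to slots 4,5,6 only */
        return 0;
}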
David Vrabel
2013-Mar-25 18:29 UTC
Re: [PATCH 5/6] xen-netback: coalesce slots before copying
On 25/03/13 16:58, Wei Liu wrote:
> On Mon, Mar 25, 2013 at 4:34 PM, David Vrabel <david.vrabel@citrix.com> wrote:
>> On 25/03/13 15:47, Wei Liu wrote:
>>> On Mon, Mar 25, 2013 at 3:13 PM, David Vrabel <david.vrabel@citrix.com> wrote:
>>>> On 25/03/13 11:08, Wei Liu wrote:
>>>>> This patch tries to coalesce tx requests when constructing grant copy
>>>>> structures. It enables netback to deal with the situation when the
>>>>> frontend's MAX_SKB_FRAGS is larger than the backend's MAX_SKB_FRAGS.
>>>>>
>>>>> It defines max_skb_slots, which is an estimation of the maximum number
>>>>> of slots a guest can send; anything bigger than that is considered
>>>>> malicious. Now it is set to 20, which should be enough to accommodate
>>>>> Linux (16 to 19).
>>>>
>>>> This maximum needs to be defined as part of the protocol and added to
>>>> the interface header.
>>>>
>>>
>>> No, this is not part of the protocol and not a hard limit. It is
>>> configurable by the system administrator.
>>
>> There is no mechanism by which the front and back ends can negotiate
>> this value, so it does need to be a fixed value that is equal to or
>> greater than the max from any front or back end that has ever existed.
>>
>
> Are you suggesting moving the default macro value to the header file?
> It is just an estimation; I have no knowledge of the accurate maximum
> value, so I think making it part of the protocol is a bad idea.

How is the author of a new frontend supposed to know how many slots
they can use per packet if it is not precisely defined?

> Do you have a handle on the maximum value?

Backends should provide the value to the frontend via a xenstore key
(e.g., max-slots-per-frame). This value should be at least 18 (the
historical value of MAX_SKB_FRAGS).

The frontend may use up to this specified value, or 17 if the
max-slots-per-frame key is missing.

Supporting at least 18 in the backend is required for existing
frontends. Limiting frontends to 17 allows them to work with all
backends (including recent Linux versions that only supported 17).

It's not clear why 19 or 20 were suggested as possible values. I
checked back to 2.6.18 and MAX_SKB_FRAGS there is (65536/PAGE_SIZE + 2)
== 18.

Separately, it may be sensible for the backend to drop packets with
more frags than max-slots-per-frame up to some threshold where anything
more is considered malicious (i.e., 1-18 slots is a valid packet, 19-20
are dropped and 21 or more is a fatal error).

>> The reason for this patch is that this wasn't properly specified and
>> changes outside of netback broke the protocol.
>>
>>>> [...]
>>>>> +                /* Setting any number other than
>>>>> +                 * INVALID_PENDING_RING_IDX indicates this slot is
>>>>> +                 * starting a new packet / ending a previous packet.
>>>>> +                 */
>>>>> +                pending_tx_info->head = 0;
>>>>
>>>> This doesn't look needed. It will be initialized again when reusing
>>>> this pending_tx_info again, right?
>>>>
>>>
>>> Yes, it is needed. Otherwise netback responds to invalid tx_info and
>>> causes netfront to crash before coming into the re-initialization path.
>>
>> Maybe I'm missing something, but this is after the make_tx_response()
>> call, and immediately after this the pending_tx_info is returned to
>> the pending ring as free.
>>
>
> So it is a bit tricky here. Let me clarify: the head field is used to
> indicate the start of a new tx request queue and the end of the
> previous queue.

Ok, I get this now.

David
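A hedged sketch of the advertisement David proposes, assuming netback's
xenbus setup context. The key name "max-slots-per-frame" is his example
rather than an established protocol node, and this is not code from any
tree:

#include <xen/xenbus.h>

/* write the backend's per-packet slot limit into its xenstore area */
static int advertise_max_slots(struct xenbus_device *dev,
                               unsigned int max_skb_slots)
{
        int err;

        err = xenbus_printf(XBT_NIL, dev->nodename,
                            "max-slots-per-frame", "%u", max_skb_slots);
        if (err)
                xenbus_dev_fatal(dev, err, "writing max-slots-per-frame");

        return err;
}

A frontend would then read the key during connection setup and fall
back to 17 if it is absent, per David's suggested negotiation rule.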
On Mon, Mar 25, 2013 at 06:29:29PM +0000, David Vrabel wrote:
> >
> > Are you suggesting moving the default macro value to the header file?
> > It is just an estimation; I have no knowledge of the accurate maximum
> > value, so I think making it part of the protocol is a bad idea.
>
> How is the author of a new frontend supposed to know how many slots
> they can use per packet if it is not precisely defined?
>

A new frontend should use the scheme you mentioned below to get the
maximum value. For old frontends that cannot be fixed, the
administrator can configure max_skb_slots to accommodate their needs.

> > Do you have a handle on the maximum value?
>
> Backends should provide the value to the frontend via a xenstore key
> (e.g., max-slots-per-frame). This value should be at least 18 (the
> historical value of MAX_SKB_FRAGS).
>
> The frontend may use up to this specified value, or 17 if the
> max-slots-per-frame key is missing.
>
> Supporting at least 18 in the backend is required for existing
> frontends. Limiting frontends to 17 allows them to work with all
> backends (including recent Linux versions that only supported 17).
>
> It's not clear why 19 or 20 were suggested as possible values. I
> checked back to 2.6.18 and MAX_SKB_FRAGS there is (65536/PAGE_SIZE + 2)

Because the check was >= MAX_SKB_FRAGS originally, and James Harper
told me that "Windows stops counting on 20".

> == 18.
>
> Separately, it may be sensible for the backend to drop packets with
> more frags than max-slots-per-frame up to some threshold where
> anything more is considered malicious (i.e., 1-18 slots is a valid
> packet, 19-20 are dropped and 21 or more is a fatal error).
>

Why drop the packet when we are able to process it? The frontend cannot
know it has crossed the line anyway.

Wei.
Paul Durrant
2013-Mar-26 09:16 UTC
Re: [PATCH 5/6] xen-netback: coalesce slots before copying
> -----Original Message-----
> From: xen-devel-bounces@lists.xen.org [mailto:xen-devel-
> bounces@lists.xen.org] On Behalf Of Wei Liu
> Sent: 25 March 2013 19:09
> To: David Vrabel
> Cc: Ian Campbell; konrad.wilk@oracle.com; netdev@vger.kernel.org; Wei
> Liu; xen-devel@lists.xen.org; annie.li@oracle.com
> Subject: Re: [Xen-devel] [PATCH 5/6] xen-netback: coalesce slots before
> copying
>
> On Mon, Mar 25, 2013 at 06:29:29PM +0000, David Vrabel wrote:
> > >
> > > Are you suggesting moving the default macro value to the header file?
> > > It is just an estimation; I have no knowledge of the accurate maximum
> > > value, so I think making it part of the protocol is a bad idea.
> >
> > How is the author of a new frontend supposed to know how many slots
> > they can use per packet if it is not precisely defined?
> >
>
> A new frontend should use the scheme you mentioned below to get the
> maximum value. For old frontends that cannot be fixed, the
> administrator can configure max_skb_slots to accommodate their needs.
>
> > > Do you have a handle on the maximum value?
> >
> > Backends should provide the value to the frontend via a xenstore key
> > (e.g., max-slots-per-frame). This value should be at least 18 (the
> > historical value of MAX_SKB_FRAGS).
> >
> > The frontend may use up to this specified value, or 17 if the
> > max-slots-per-frame key is missing.
> >
> > Supporting at least 18 in the backend is required for existing
> > frontends. Limiting frontends to 17 allows them to work with all
> > backends (including recent Linux versions that only supported 17).
> >
> > It's not clear why 19 or 20 were suggested as possible values. I
> > checked back to 2.6.18 and MAX_SKB_FRAGS there is (65536/PAGE_SIZE +
> > 2)
>
> Because the check was >= MAX_SKB_FRAGS originally, and James Harper
> told me that "Windows stops counting on 20".
>

For the Citrix PV drivers I lifted the #define of MAX_SKB_FRAGS from
the dom0 kernel (i.e. 18). If a packet coming from the stack has more
than that number of fragments then it's copied and coalesced. The value
advertised for TSO size is chosen such that a maximally sized TSO will
always fit in 18 fragments after coalescing, but (since this is
Windows) the drivers don't trust the stack to stick to that limit and
will drop a packet if it won't fit.

It seems reasonable that, since the backend is copying anyway, it
should handle any fragment list coming from the frontend that it can.
This would allow the copy-and-coalesce code to be removed from the
frontend (and the double copy avoided). If there is a maximum backend
packet size, though, then I think this needs to be advertised to the
frontend. The backend should clearly bin packets coming from the
frontend that exceed that limit, but advertising that limit in xenstore
allows the frontend to choose the right TSO maximum size to advertise
to its stack, rather than having to base it on some historical value
that actually has little meaning (in the absence of grant mapping).

Paul
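A back-of-envelope check (an illustration only, assuming 4K pages) of
the TSO sizing Paul describes: a maximally sized TSO must fit in 18
fragments after coalescing, allowing one extra slot for a payload that
starts mid-page.

#include <stdio.h>

#define PAGE_SIZE     4096u
#define MAX_SKB_FRAGS 18u       /* the historical limit Paul lifted */

int main(void)
{
        unsigned int tso_size = 65536;
        /* +1 slot for a payload that starts mid-page */
        unsigned int slots = (tso_size + PAGE_SIZE - 1) / PAGE_SIZE + 1;

        printf("a %u byte TSO needs up to %u slots (limit %u)\n",
               tso_size, slots, MAX_SKB_FRAGS);         /* 17 <= 18 */
        return 0;
}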
On Tue, Mar 26, 2013 at 09:16:10AM +0000, Paul Durrant wrote:
> >
> > > It's not clear why 19 or 20 were suggested as possible values. I
> > > checked back to 2.6.18 and MAX_SKB_FRAGS there is (65536/PAGE_SIZE
> > > + 2)
> >
> > Because the check was >= MAX_SKB_FRAGS originally, and James Harper
> > told me that "Windows stops counting on 20".
> >
>
> For the Citrix PV drivers I lifted the #define of MAX_SKB_FRAGS from
> the dom0 kernel (i.e. 18). If a packet coming from the stack has more

That means the Citrix PV drivers cannot run on a vanilla kernel
backend, because upstream MAX_SKB_FRAGS is now 17. The scope of this
patch is to fix this sort of problem without touching the frontend,
because it is impossible for us to fix every frontend in the world.

> than that number of fragments then it's copied and coalesced. The
> value advertised for TSO size is chosen such that a maximally sized
> TSO will always fit in 18 fragments after coalescing, but (since this
> is Windows) the drivers don't trust the stack to stick to that limit
> and will drop a packet if it won't fit.
>
> It seems reasonable that, since the backend is copying anyway, it
> should handle any fragment list coming from the frontend that it can.
> This would allow the copy-and-coalesce code to be removed from the
> frontend (and the double copy avoided). If there is a maximum backend
> packet size, though, then I think this needs to be advertised to the
> frontend. The backend should clearly bin packets coming from the
> frontend that exceed that limit, but advertising that limit in
> xenstore allows the frontend to choose the right TSO maximum size to
> advertise to its stack, rather than having to base it on some
> historical value that actually has little meaning (in the absence of
> grant mapping).
>

The historical value was chosen on the basis that it should cope with
all known frontend limits and leave a bit more space for frontends that
slightly cross the line. Advertising through xenstore is the future
plan, as it cannot be done without modifying the frontend.

Wei.
James Harper
2013-Mar-26 10:52 UTC
Re: [PATCH 5/6] xen-netback: coalesce slots before copying
> > It's not clear why 19 or 20 were suggested as possible values. I
> > checked back to 2.6.18 and MAX_SKB_FRAGS there is (65536/PAGE_SIZE +
> > 2)
>
> Because the check was >= MAX_SKB_FRAGS originally, and James Harper
> told me that "Windows stops counting on 20".
>

I've obviously not been clear enough here... GPLPV stopped counting at
20 (it only needed to know whether the count was < 20 or not). Windows
itself can submit a packet to NDIS with hundreds of buffers. It doesn't
really matter if it's 21 or 1021; I just didn't want to be misquoted.

James
James Harper
2013-Mar-26 11:00 UTC
Re: [PATCH 5/6] xen-netback: coalesce slots before copying
> > Because the check was >= MAX_SKB_FRAGS originally, and James Harper
> > told me that "Windows stops counting on 20".
> >
>
> For the Citrix PV drivers I lifted the #define of MAX_SKB_FRAGS from
> the dom0 kernel (i.e. 18). If a packet coming from the stack has more
> than that number of fragments then it's copied and coalesced. The
> value advertised for TSO size is chosen such that a maximally sized
> TSO will always fit in 18 fragments after coalescing, but (since this
> is Windows) the drivers don't trust the stack to stick to that limit
> and will drop a packet if it won't fit.
>
> It seems reasonable that, since the backend is copying anyway, it
> should handle any fragment list coming from the frontend that it can.
> This would allow the copy-and-coalesce code to be removed from the
> frontend (and the double copy avoided). If there is a maximum backend
> packet size, though, then I think this needs to be advertised to the
> frontend. The backend should clearly bin packets coming from the
> frontend that exceed that limit, but advertising that limit in
> xenstore allows the frontend to choose the right TSO maximum size to
> advertise to its stack, rather than having to base it on some
> historical value that actually has little meaning (in the absence of
> grant mapping).
>

As stated previously, I've observed Windows issuing staggering numbers
of buffers to NDIS miniport drivers, so you will need to coalesce in a
Windows driver anyway. I'm not sure what the break-even point is, but I
think it's safe to say that in the choice between using 1000 (worst
case) ring slots (with the resulting mapping overheads) and coalescing
in the frontend, coalescing is going to be the better option.

James
David Vrabel
2013-Mar-26 11:06 UTC
Re: [PATCH 5/6] xen-netback: coalesce slots before copying
On 26/03/13 10:52, James Harper wrote:
>>> It's not clear why 19 or 20 were suggested as possible values.
>>> I checked back to 2.6.18 and MAX_SKB_FRAGS there is
>>> (65536/PAGE_SIZE + 2)
>>
>> Because the check was >= MAX_SKB_FRAGS originally, and James Harper
>> told me that "Windows stops counting on 20".
>>
>
> I've obviously not been clear enough here... GPLPV stopped counting
> at 20 (it only needed to know whether the count was < 20 or not).
> Windows itself can submit a packet to NDIS with hundreds of buffers.
> It doesn't really matter if it's 21 or 1021; I just didn't want to be
> misquoted.

This still isn't clear. What's the maximum number of ring entries that
the GPLPV driver will use per packet? Are you saying it's 20? If so,
how has the GPLPV driver ever worked well with Linux's netback (with
its historical limit of 18)?

David
David Vrabel
2013-Mar-26 11:13 UTC
Re: [PATCH 5/6] xen-netback: coalesce slots before copying
On 25/03/13 19:09, Wei Liu wrote:
> On Mon, Mar 25, 2013 at 06:29:29PM +0000, David Vrabel wrote:
>>>
>>> Are you suggesting moving the default macro value to the header file?
>>> It is just an estimation; I have no knowledge of the accurate maximum
>>> value, so I think making it part of the protocol is a bad idea.
>>
>> How is the author of a new frontend supposed to know how many slots
>> they can use per packet if it is not precisely defined?
>>
>
> A new frontend should use the scheme you mentioned below to get the
> maximum value. For old frontends that cannot be fixed, the
> administrator can configure max_skb_slots to accommodate their needs.

I'm happy for the threshold for fatal errors to be configurable via a
module parameter.

>>> Do you have a handle on the maximum value?
>>
>> Backends should provide the value to the frontend via a xenstore key
>> (e.g., max-slots-per-frame). This value should be at least 18 (the
>> historical value of MAX_SKB_FRAGS).
>>
>> The frontend may use up to this specified value, or 17 if the
>> max-slots-per-frame key is missing.
>>
>> Supporting at least 18 in the backend is required for existing
>> frontends. Limiting frontends to 17 allows them to work with all
>> backends (including recent Linux versions that only supported 17).
>>
>> It's not clear why 19 or 20 were suggested as possible values. I
>> checked back to 2.6.18 and MAX_SKB_FRAGS there is (65536/PAGE_SIZE + 2)
>
> Because the check was >= MAX_SKB_FRAGS originally, and James Harper
> told me that "Windows stops counting on 20".
>
>> == 18.
>>
>> Separately, it may be sensible for the backend to drop packets with
>> more frags than max-slots-per-frame up to some threshold where
>> anything more is considered malicious (i.e., 1-18 slots is a valid
>> packet, 19-20 are dropped and 21 or more is a fatal error).
>>
>
> Why drop the packet when we are able to process it? The frontend
> cannot know it has crossed the line anyway.

Because it's a change to the protocol, and we do not want to do this
for a regression fix. As a separate fix we can consider increasing the
number of slots per packet once there is a mechanism to report this to
the frontend.

David
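For clarity, the slot-count policy David outlines can be sketched as a
small classifier; the thresholds are his proposal, not an implemented
policy.

enum tx_verdict { TX_OK, TX_DROP, TX_FATAL };

#define SLOTS_VALID_MAX 18      /* historical MAX_SKB_FRAGS */
#define SLOTS_DROP_MAX  20      /* tolerated but dropped */

static enum tx_verdict classify_slots(unsigned int slots)
{
        if (slots <= SLOTS_VALID_MAX)
                return TX_OK;           /* process normally */
        if (slots <= SLOTS_DROP_MAX)
                return TX_DROP;         /* drop, keep the vif alive */
        return TX_FATAL;                /* treat the frontend as malicious */
}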
James Harper
2013-Mar-26 11:15 UTC
Re: [PATCH 5/6] xen-netback: coalesce slots before copying
> > On 26/03/13 10:52, James Harper wrote:
> >>> It's not clear why 19 or 20 were suggested as possible values.
> >>> I checked back to 2.6.18 and MAX_SKB_FRAGS there is
> >>> (65536/PAGE_SIZE + 2)
> >>
> >> Because the check was >= MAX_SKB_FRAGS originally, and James Harper
> >> told me that "Windows stops counting on 20".
> >>
> >
> > I've obviously not been clear enough here... GPLPV stopped counting
> > at 20 (it only needed to know whether the count was < 20 or not).
> > Windows itself can submit a packet to NDIS with hundreds of buffers.
> > It doesn't really matter if it's 21 or 1021; I just didn't want to
> > be misquoted.
>
> This still isn't clear. What's the maximum number of ring entries that
> the GPLPV driver will use per packet? Are you saying it's 20? If so,
> how has the GPLPV driver ever worked well with Linux's netback (with
> its historical limit of 18)?
>

GPLPV will limit itself to 19, which I thought was the historic Linux
limit, but obviously not. I'd better look into that. I added a debug
statement to catch what Windows would give to GPLPV, and it seemed that
the maximum was 20, but then I double-checked and GPLPV only needs to
know whether there are > 19 frags or not, so it stops counting at 20.
The actual number Windows will use internally is not limited, so
coalescing is required, and no sane amount of bumping up the Linux
limit will remove the requirement that a Windows driver coalesce.

James
Paul Durrant
2013-Mar-26 11:24 UTC
Re: [PATCH 5/6] xen-netback: coalesce slots before copying
> -----Original Message-----
> From: James Harper [mailto:james.harper@bendigoit.com.au]
> Sent: 26 March 2013 11:01
> To: Paul Durrant; Wei Liu; David Vrabel
> Cc: Ian Campbell; Wei Liu; netdev@vger.kernel.org;
> konrad.wilk@oracle.com; xen-devel@lists.xen.org; annie.li@oracle.com
> Subject: RE: [Xen-devel] [PATCH 5/6] xen-netback: coalesce slots before
> copying
>
> > > Because the check is >= MAX_SKB_FRAGS originally and James Harper
> > > told me that "Windows stops counting on 20".
> > >
> >
> > For the Citrix PV drivers I lifted the #define of MAX_SKB_FRAGS from
> > the dom0 kernel (i.e. 18). If a packet coming from the stack has more
> > than that number of fragments then it's copied and coalesced. The
> > value advertised for TSO size is chosen such that a maximally sized
> > TSO will always fit in 18 fragments after coalescing, but (since this
> > is Windows) the drivers don't trust the stack to stick to that limit
> > and will drop a packet if it won't fit.
> >
> > It seems reasonable that, since the backend is copying anyway, it
> > should handle any fragment list coming from the frontend that it can.
> > This would allow the copy-and-coalesce code to be removed from the
> > frontend (and the double-copy avoided). If there is a maximum backend
> > packet size though, then I think this needs to be advertised to the
> > frontend. The backend should clearly bin packets coming from the
> > frontend that exceed that limit, but advertising that limit in
> > xenstore allows the frontend to choose the right TSO maximum size to
> > advertise to its stack, rather than having to base it on some
> > historical value that actually has little meaning (in the absence of
> > grant mapping).
> >
>
> As stated previously, I've observed Windows issuing staggering numbers
> of buffers to NDIS miniport drivers, so you will need to coalesce in a
> Windows driver anyway. I'm not sure what the break-even point is, but I
> think it's safe to say that in the choice between using 1000 (worst
> case) ring slots (with the resulting mapping overheads) and coalescing
> in the frontend, coalescing is going to be the better option.
>

Oh quite, if the backend is mapping and not copying then coalescing in
the frontend is the right way to go. I guess coalescing once the frag
count reaches a full ring count is probably necessary (since we can't
push a partial packet), but it would be nice not to have to do it if
the backend is going to copy anyway.

  Paul
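One plausible back-of-envelope for the TSO sizing Paul describes:
advertise the largest payload that, after coalescing into page-sized
fragments, still fits the slot budget. The constants and the
one-slot-for-headers assumption below are guesses, not the actual
Citrix PV computation:

#define PAGE_SIZE	4096	/* defined locally for self-containment */
#define SLOT_BUDGET	18	/* lifted from dom0's MAX_SKB_FRAGS */
#define HEADER_SLOTS	1	/* assume headers consume one slot */

/* Largest TSO payload guaranteed to fit in SLOT_BUDGET slots once
 * coalesced: (18 - 1) * 4096 = 69632 bytes. */
static unsigned int tso_max_payload(void)
{
	return (SLOT_BUDGET - HEADER_SLOTS) * PAGE_SIZE;
}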
David Laight
2013-Mar-26 11:27 UTC
Re: [PATCH 5/6] xen-netback: coalesce slots before copying
> As stated previously, I've observed Windows issuing staggering
> numbers of buffers to NDIS miniport drivers, so you will need
> to coalesce in a Windows driver anyway. I'm not sure what the
> break-even point is, but I think it's safe to say that in the
> choice between using 1000 (worst case) ring slots (with the
> resulting mapping overheads) and coalescing in the frontend,
> coalescing is going to be the better option.

A long time ago we did some calculations on a sparc mbus/sbus system
(which has an iommu requiring setup for dma) and got a break-even point
of (about) 1k. (And I'm not sure we arranged to do aligned copies.)
Clearly that isn't directly relevant here...

It is even likely that the ethernet chips will underrun if requested
to do too many ring operations - especially at their maximum speed.
I guess none of the modern ones require the first fragment to be at
least 100 bytes in order to guarantee retransmission after a collision.

	David
James Harper
2013-Mar-26 11:29 UTC
Re: [PATCH 5/6] xen-netback: coalesce slots before copying
> > As stated previously, I've observed Windows issuing staggering
> > numbers of buffers to NDIS miniport drivers, so you will need to
> > coalesce in a Windows driver anyway. I'm not sure what the break-even
> > point is, but I think it's safe to say that in the choice between
> > using 1000 (worst case) ring slots (with the resulting mapping
> > overheads) and coalescing in the frontend, coalescing is going to be
> > the better option.
> >
>
> Oh quite, if the backend is mapping and not copying then coalescing in
> the frontend is the right way to go. I guess coalescing once the frag
> count reaches a full ring count is probably necessary (since we can't
> push a partial packet), but it would be nice not to have to do it if
> the backend is going to copy anyway.
>

For a 9k packet with 100 frags (not a common case, but an example), what
is the cost of mapping those 100 frags into the backend vs coalescing to
three pages in the frontend and mapping those?

I may be misremembering, but wasn't there a patch floating around for
persistent mapping to avoid some of this overhead? (Not applicable here,
but I thought it meant that the cost wasn't insignificant.)

James
On Tue, Mar 26, 2013 at 11:13:38AM +0000, David Vrabel wrote:
> >>
> >> Separately, it may be sensible for the backend to drop packets with
> >> more frags than max-slots-per-frame up to some threshold where
> >> anything more is considered malicious (i.e., 1-18 slots is a valid
> >> packet, 19-20 are dropped and 21 or more is a fatal error).
> >>
> >
> > Why drop the packet when we are able to process it? Frontend cannot
> > know it has crossed the line anyway.
>
> Because it's a change to the protocol and we do not want to do this for
> a regression fix.
>

If I understand correctly, the regression you are talking about was
introduced by the harsh punishment in XSA-39? If so, this is the patch
you need to fix that.

The frontend only knows whether it has connectivity or not. This patch
guarantees that an old netfront with a larger MAX_SKB_FRAGS still sees
the same thing from its point of view. Netfront cannot know the
intermediate state between 18 and 20.

> As a separate fix we can consider increasing the number of slots
> per-packet once there is a mechanism to report this to the front end.
>

Sure, that's on my TODO list.

Wei.
Paul Durrant
2013-Mar-26 11:38 UTC
Re: [PATCH 5/6] xen-netback: coalesce slots before copying
> -----Original Message-----
> From: James Harper [mailto:james.harper@bendigoit.com.au]
> Sent: 26 March 2013 11:29
> To: Paul Durrant; Wei Liu; David Vrabel
> Cc: Ian Campbell; Wei Liu; netdev@vger.kernel.org;
> konrad.wilk@oracle.com; xen-devel@lists.xen.org; annie.li@oracle.com
> Subject: RE: [Xen-devel] [PATCH 5/6] xen-netback: coalesce slots before
> copying
>
> > > As stated previously, I've observed Windows issuing staggering
> > > numbers of buffers to NDIS miniport drivers, so you will need to
> > > coalesce in a Windows driver anyway. I'm not sure what the
> > > break-even point is, but I think it's safe to say that in the
> > > choice between using 1000 (worst case) ring slots (with the
> > > resulting mapping overheads) and coalescing in the frontend,
> > > coalescing is going to be the better option.
> > >
> >
> > Oh quite, if the backend is mapping and not copying then coalescing
> > in the frontend is the right way to go. I guess coalescing once the
> > frag count reaches a full ring count is probably necessary (since we
> > can't push a partial packet), but it would be nice not to have to do
> > it if the backend is going to copy anyway.
> >
>
> For a 9k packet with 100 frags (not a common case, but an example),
> what is the cost of mapping those 100 frags into the backend vs
> coalescing to three pages in the frontend and mapping those?
>
> I may be misremembering, but wasn't there a patch floating around for
> persistent mapping to avoid some of this overhead? (Not applicable
> here, but I thought it meant that the cost wasn't insignificant.)
>

The current version of netback does not map; it always grant-copies.

  Paul
David Vrabel
2013-Mar-26 18:02 UTC
Re: [PATCH 5/6] xen-netback: coalesce slots before copying
On 25/03/13 11:08, Wei Liu wrote:
> This patch tries to coalesce tx requests when constructing grant copy
> structures. It enables netback to deal with the situation when the
> frontend's MAX_SKB_FRAGS is larger than the backend's MAX_SKB_FRAGS.
>
> It defines max_skb_slots, which is an estimation of the maximum number
> of slots a guest can send; anything bigger than that is considered
> malicious. Now it is set to 20, which should be enough to accommodate
> Linux (16 to 19).
>
> Also change variable name from "frags" to "slots" in
> netbk_count_requests.

It is worth summarizing an (off-line) discussion I had with Wei on this
patch.

There are two regressions that need to be addressed here.

1. The reduction of the number of supported ring entries (slots) per
   packet (from 18 to 17).

2. The XSA-39 security fix turning "too many frags" errors from just
   dropping the packet into a fatal error that disables the VIF.

The key root cause of the problem is that the protocol is poorly
specified, using a property external to the netback/netfront drivers
(i.e., MAX_SKB_FRAGS).

A secondary problem is that some frontends have used a max slots per
packet value that is larger than netback has supported. E.g., the
Windows GPLPV drivers use up to 19. These packets have always been
dropped.

The first step is to properly specify the maximum slots per packet as
part of the interface. This should be specified as 18 (the historical
value).

The second step is to define a threshold for slots per packet, above
which the guest is considered to be malicious and the error is fatal.
20 seems a sensible value here.

The behavior of netback for a packet is thus:

  1-18 slots:  valid
  19-20 slots: drop and respond with an error
  21+ slots:   fatal error

Note that we do not make 19-20 valid, as this is a change to the
protocol and guests may end up relying on it, which would then break
the guests if they migrate to or start on a host with a limit of 18.

A third (and future) step would be to investigate whether increasing
the slots-per-packet limit is sensible. There would then need to be a
mechanism to negotiate this limit between the front and back ends.

David
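The three bands David summarizes translate directly into a check like
the following sketch; the constant and function names are illustrative,
assuming netback's existing netbk_fatal_tx_err() error path:

#include <linux/errno.h>

#define SLOTS_VALID_MAX	18	/* historical MAX_SKB_FRAGS */
#define SLOTS_DROP_MAX	20	/* beyond this the guest is malicious */

static int check_slot_count(struct xenvif *vif, int slots)
{
	if (slots <= SLOTS_VALID_MAX)
		return 0;		/* 1-18: process normally */

	if (slots <= SLOTS_DROP_MAX)
		return -EIO;		/* 19-20: drop, respond with an error */

	netbk_fatal_tx_err(vif);	/* 21+: fatal, disable the VIF */
	return -E2BIG;
}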
Ian Campbell
2013-Apr-09 15:10 UTC
Re: [PATCH 5/6] xen-netback: coalesce slots before copying
On Mon, 2013-03-25 at 16:57 +0000, Konrad Rzeszutek Wilk wrote:
> On Mon, Mar 25, 2013 at 11:08:21AM +0000, Wei Liu wrote:
> > This patch tries to coalesce tx requests when constructing grant copy
> > structures. It enables netback to deal with the situation when the
> > frontend's MAX_SKB_FRAGS is larger than the backend's MAX_SKB_FRAGS.
> >
> > It defines max_skb_slots, which is an estimation of the maximum
> > number of slots a guest can send; anything bigger than that is
> > considered malicious. Now it is set to 20, which should be enough to
> > accommodate Linux (16 to 19).
> >
> > Also change variable name from "frags" to "slots" in
> > netbk_count_requests.
>
> This should probably also CC stable@vger.kernel.org

DaveM prefers that net patches not CC stable directly; he takes care of
forwarding patches once he is happy (i.e. after they've been in
his/Linus' tree for a bit).

Ian