Wei Liu
2013-Mar-14 18:26 UTC
[PATCH RFC] xen-netback: coalesce frags before copying

This patch tries to coalesce txps when constructing grant copy
structures. It enables netback to deal with the situation when the
frontend's MAX_SKB_FRAGS is larger than the backend's MAX_SKB_FRAGS.

It also defines max_skb_frags, which is an estimation of the maximum
number of frags a guest can send; anything bigger than that is
considered malicious. Now it is set to 20, which should be enough to
accommodate Linux (16 to 19) and possibly Windows (19?).

Signed-off-by: Wei Liu <wei.liu2@citrix.com>
---
 drivers/net/xen-netback/netback.c | 164 +++++++++++++++++++++++++++++--------
 1 file changed, 128 insertions(+), 36 deletions(-)

diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 6e8e51a..e27f81a 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -47,9 +47,20 @@
 #include <asm/xen/hypercall.h>
 #include <asm/xen/page.h>
 
+/*
+ * This is an estimation of the maximum possible frags a SKB might
+ * have, anything larger than this is considered malicious. Typically
+ * Linux has 16 to 19, Windows has 19.
+ */
+#define NETBK_MAX_SKB_FRAGS_DEFAULT 20
+static unsigned int max_skb_frags = NETBK_MAX_SKB_FRAGS_DEFAULT;
+module_param(max_skb_frags, uint, 0444);
+
 struct pending_tx_info {
-	struct xen_netif_tx_request req;
+	struct xen_netif_tx_request req; /* points to the first txp */
 	struct xenvif *vif;
+	unsigned int nr_tx_req; /* how many tx reqs in the chain, at least 1 */
+	unsigned int start_idx; /* starting index in the pending ring */
 };
 typedef unsigned int pending_ring_idx_t;
 
@@ -251,7 +262,7 @@ static int max_required_rx_slots(struct xenvif *vif)
 	int max = DIV_ROUND_UP(vif->dev->mtu, PAGE_SIZE);
 
 	if (vif->can_sg || vif->gso || vif->gso_prefix)
-		max += MAX_SKB_FRAGS + 1; /* extra_info + frags */
+		max += max_skb_frags + 1; /* extra_info + frags */
 
 	return max;
 }
@@ -657,7 +668,7 @@ static void xen_netbk_rx_action(struct xen_netbk *netbk)
 		__skb_queue_tail(&rxq, skb);
 
 		/* Filled the batch queue? */
-		if (count + MAX_SKB_FRAGS >= XEN_NETIF_RX_RING_SIZE)
+		if (count + max_skb_frags >= XEN_NETIF_RX_RING_SIZE)
 			break;
 	}
 
@@ -920,7 +931,7 @@ static int netbk_count_requests(struct xenvif *vif,
 			return -ENODATA;
 		}
 
-		if (unlikely(frags >= MAX_SKB_FRAGS)) {
+		if (unlikely(frags >= max_skb_frags)) {
 			netdev_err(vif->dev, "Too many frags\n");
 			netbk_fatal_tx_err(vif);
 			return -E2BIG;
@@ -968,42 +979,98 @@ static struct gnttab_copy *xen_netbk_get_requests(struct xen_netbk *netbk,
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
 	skb_frag_t *frags = shinfo->frags;
 	u16 pending_idx = *((u16 *)skb->data);
-	int i, start;
+	u16 head_idx = 0;
+	int i, j, start;
+	struct page *page;
+	pending_ring_idx_t index;
+	uint16_t dst_offset;
+	int total_frags_merged;
+	unsigned int nr_frags = shinfo->nr_frags; /* This can be as large as max_skb_frags */
+	struct pending_tx_info *first = NULL;
+	int nr_txp;
 
 	/* Skip first skb fragment if it is on same page as header fragment. */
 	start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
 
-	for (i = start; i < shinfo->nr_frags; i++, txp++) {
-		struct page *page;
-		pending_ring_idx_t index;
+	/* Coalesce */
+	total_frags_merged = 0;
+	for (i = j = start; i < MAX_SKB_FRAGS && j < nr_frags; i++) {
 		struct pending_tx_info *pending_tx_info =
 			netbk->pending_tx_info;
 
-		index = pending_index(netbk->pending_cons++);
-		pending_idx = netbk->pending_ring[index];
-		page = xen_netbk_alloc_page(netbk, pending_idx);
+		page = alloc_page(GFP_KERNEL|__GFP_COLD);
 		if (!page)
 			goto err;
 
-		gop->source.u.ref = txp->gref;
-		gop->source.domid = vif->domid;
-		gop->source.offset = txp->offset;
-
-		gop->dest.u.gmfn = virt_to_mfn(page_address(page));
-		gop->dest.domid = DOMID_SELF;
-		gop->dest.offset = txp->offset;
-
-		gop->len = txp->size;
-		gop->flags = GNTCOPY_source_gref;
+		nr_txp = 0;
+		dst_offset = 0;
+		first = NULL;
+		while (dst_offset < PAGE_SIZE && j < nr_frags) {
+			gop->flags = GNTCOPY_source_gref;
+
+			gop->source.u.ref = txp->gref;
+			gop->source.domid = vif->domid;
+			gop->source.offset = txp->offset;
+
+			gop->dest.domid = DOMID_SELF;
+
+			gop->dest.offset = dst_offset;
+			gop->dest.u.gmfn = virt_to_mfn(page_address(page));
+
+			if (dst_offset + txp->size > PAGE_SIZE) {
+				/* This page can only merge a portion of txp */
+				gop->len = PAGE_SIZE - dst_offset;
+				txp->offset += gop->len;
+				txp->size -= gop->len;
+				dst_offset += gop->len; /* == PAGE_SIZE, will quit loop */
+			} else {
+				/* This txp can be merged in the page */
+				gop->len = txp->size;
+				dst_offset += gop->len;
+
+				index = pending_index(netbk->pending_cons++);
+
+				pending_idx = netbk->pending_ring[index];
+
+				memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp));
+				xenvif_get(vif);
+
+				pending_tx_info[pending_idx].vif = vif;
+
+				/*
+				 * Poison these fields, corrsponding
+				 * fields for head txp will be set to
+				 * correct values after the loop.
+				 */
+				pending_tx_info[pending_idx].nr_tx_req = (u16)(~(u16)0);
+				netbk->mmap_pages[pending_idx] = (void *)(~0UL);
+				pending_tx_info[pending_idx].start_idx = -1;
+
+				if (unlikely(!first)) {
+					first = &pending_tx_info[pending_idx];
+					first->start_idx = index;
+					head_idx = pending_idx;
+				}
+
+				txp++;
+				nr_txp++;
+				j++;
+			}
 
-		gop++;
+			gop++;
+		}
 
-		memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp));
-		xenvif_get(vif);
-		pending_tx_info[pending_idx].vif = vif;
-		frag_set_pending_idx(&frags[i], pending_idx);
+		first->req.offset = 0;
+		first->req.size = dst_offset;
+		first->nr_tx_req = nr_txp;
+		total_frags_merged += nr_txp - 1;
+		set_page_ext(page, netbk, head_idx);
+		netbk->mmap_pages[head_idx] = page;
+		frag_set_pending_idx(&frags[i], head_idx);
 	}
 
+	shinfo->nr_frags -= total_frags_merged;
+
 	return gop;
 err:
 	/* Unwind, freeing all pages and sending error responses. */
@@ -1025,6 +1092,7 @@ static int xen_netbk_tx_check_gop(struct xen_netbk *netbk,
 	struct gnttab_copy *gop = *gopp;
 	u16 pending_idx = *((u16 *)skb->data);
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
+	struct pending_tx_info *tx_info;
 	int nr_frags = shinfo->nr_frags;
 	int i, err, start;
 
@@ -1037,12 +1105,14 @@ static int xen_netbk_tx_check_gop(struct xen_netbk *netbk,
 	start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
 
 	for (i = start; i < nr_frags; i++) {
-		int j, newerr;
+		int j, newerr = 0, n;
 
 		pending_idx = frag_get_pending_idx(&shinfo->frags[i]);
+		tx_info = &netbk->pending_tx_info[pending_idx];
 
 		/* Check error status: if okay then remember grant handle. */
-		newerr = (++gop)->status;
+		for (n = 0; n < tx_info->nr_tx_req; n++)
+			newerr |= (++gop)->status;
 		if (likely(!newerr)) {
 			/* Had a previous error? Invalidate this fragment. */
 			if (unlikely(err))
@@ -1267,11 +1337,11 @@ static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk)
 		struct sk_buff *skb;
 		int ret;
 
-	while (((nr_pending_reqs(netbk) + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
+	while (((nr_pending_reqs(netbk) + max_skb_frags) < MAX_PENDING_REQS) &&
 		!list_empty(&netbk->net_schedule_list)) {
 		struct xenvif *vif;
 		struct xen_netif_tx_request txreq;
-		struct xen_netif_tx_request txfrags[MAX_SKB_FRAGS];
+		struct xen_netif_tx_request txfrags[max_skb_frags];
 		struct page *page;
 		struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX-1];
 		u16 pending_idx;
@@ -1359,7 +1429,7 @@ static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk)
 		pending_idx = netbk->pending_ring[index];
 
 		data_len = (txreq.size > PKT_PROT_LEN &&
-			    ret < MAX_SKB_FRAGS) ?
+			    ret < max_skb_frags) ?
 			PKT_PROT_LEN : txreq.size;
 
 		skb = alloc_skb(data_len + NET_SKB_PAD + NET_IP_ALIGN,
@@ -1409,11 +1479,14 @@ static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk)
 		memcpy(&netbk->pending_tx_info[pending_idx].req,
		       &txreq, sizeof(txreq));
 		netbk->pending_tx_info[pending_idx].vif = vif;
+		netbk->pending_tx_info[pending_idx].start_idx = index;
+		netbk->pending_tx_info[pending_idx].nr_tx_req = 1;
 		*((u16 *)skb->data) = pending_idx;
 
 		__skb_put(skb, data_len);
 
 		skb_shinfo(skb)->nr_frags = ret;
+		/* The first skb fragment will be on the same page as the header fragment. */
 		if (data_len < txreq.size) {
 			skb_shinfo(skb)->nr_frags++;
 			frag_set_pending_idx(&skb_shinfo(skb)->frags[0],
@@ -1540,6 +1613,11 @@ static void xen_netbk_idx_release(struct xen_netbk *netbk, u16 pending_idx,
 	struct xenvif *vif;
 	struct pending_tx_info *pending_tx_info;
 	pending_ring_idx_t index;
+	unsigned int nr = 0;
+	unsigned int i = 0;
+	unsigned int start_idx = 0;
+
+	BUG_ON(netbk->mmap_pages[pending_idx] == (void *)(~0UL));
 
 	/* Already complete? */
 	if (netbk->mmap_pages[pending_idx] == NULL)
@@ -1548,13 +1626,27 @@ static void xen_netbk_idx_release(struct xen_netbk *netbk, u16 pending_idx,
 	pending_tx_info = &netbk->pending_tx_info[pending_idx];
 
 	vif = pending_tx_info->vif;
+	nr = pending_tx_info->nr_tx_req;
+	start_idx = pending_tx_info->start_idx;
 
-	make_tx_response(vif, &pending_tx_info->req, status);
+	BUG_ON(nr == (u16)(~(u16)0));
 
-	index = pending_index(netbk->pending_prod++);
-	netbk->pending_ring[index] = pending_idx;
+	BUG_ON(netbk->pending_ring[pending_index(start_idx)] != pending_idx);
 
-	xenvif_put(vif);
+	for (i = 0; i < nr; i++) {
+		struct xen_netif_tx_request *txp;
+		unsigned int idx = pending_index(start_idx + i);
+		unsigned int info_idx = netbk->pending_ring[idx];
+
+		pending_tx_info = &netbk->pending_tx_info[info_idx];
+		txp = &pending_tx_info->req;
+		make_tx_response(vif, &pending_tx_info->req, status);
+
+		index = pending_index(netbk->pending_prod++);
+		netbk->pending_ring[index] = netbk->pending_ring[info_idx];
+
+		xenvif_put(vif);
+	}
 
 	netbk->mmap_pages[pending_idx]->mapping = 0;
 	put_page(netbk->mmap_pages[pending_idx]);
@@ -1613,7 +1705,7 @@ static inline int rx_work_todo(struct xen_netbk *netbk)
 static inline int tx_work_todo(struct xen_netbk *netbk)
 {
 
-	if (((nr_pending_reqs(netbk) + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
+	if (((nr_pending_reqs(netbk)+max_skb_frags) < MAX_PENDING_REQS) &&
 	    !list_empty(&netbk->net_schedule_list))
		return 1;
 
-- 
1.7.10.4
Too bad I don't know how to make frags > backend_default_value at the
moment. :-(

I only tested it in the normal case, and it works. If you see any
problem, please let me know.


Wei.
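[As an aside, the packing rule the patch's inner loop implements can be
seen in isolation with a small standalone sketch. This is not netback
code: the 4 KiB PAGE_SIZE, the slot sizes, and the main() driver below
are made up for illustration. Slots are appended into a destination
page at dst_offset; a slot that would straddle the page boundary is
split, with the part that fits copied now and the remainder becoming
the first copy op of the next page.]

#include <stdio.h>

#define PAGE_SIZE 4096u /* assumed 4 KiB pages */

/* Toy stand-in for a frontend tx request (txp): offset and size only. */
struct slot { unsigned int offset, size; };

int main(void)
{
	/* Hypothetical slot sizes; the third one straddles a page. */
	struct slot slots[] = { {0, 1000}, {0, 2000}, {0, 3000}, {0, 500} };
	unsigned int nslots = sizeof(slots) / sizeof(slots[0]);
	unsigned int j = 0, page = 0;

	while (j < nslots) {
		unsigned int dst_offset = 0;

		/* One destination page, mirroring the inner while loop. */
		while (dst_offset < PAGE_SIZE && j < nslots) {
			unsigned int len;

			if (dst_offset + slots[j].size > PAGE_SIZE) {
				/* Split: copy what fits; the remainder is
				 * picked up by the next page's first op. */
				len = PAGE_SIZE - dst_offset;
				slots[j].offset += len;
				slots[j].size -= len;
			} else {
				/* Whole remaining slot fits in this page. */
				len = slots[j].size;
				j++; /* the slot is consumed only here */
			}
			printf("gop: page %u dst_off %4u len %4u\n",
			       page, dst_offset, len);
			dst_offset += len;
		}
		page++;
	}
	return 0;
}

[With these numbers the third slot is emitted as a 1096-byte copy at
the end of page 0 and a 1904-byte copy at the start of page 1, which is
the same dst_offset/gop->len arithmetic the patch performs.]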
Jacek Milewicz
2013-Mar-14 18:58 UTC
Re: [PATCH RFC] xen-netback: coalesce frags before copying
> Too bad I don't know how to make frags > backend_default_value at the
> moment. :-(
>
> I only tested it in the normal case, and it works. If you see any
> problem, please let me know.
>

Heavy NFS load triggered it reliably after only a few MBs transferred.
On Thu, 2013-03-14 at 18:58 +0000, Jacek Milewicz wrote:
> > Too bad I don't know how to make frags > backend_default_value at the
> > moment. :-(
> >
> > I only tested it in the normal case, and it works. If you see any
> > problem, please let me know.
> >
> Heavy NFS load triggered it reliably after only a few MBs transferred.
>

So I just tested it with NFS. I saw frags from the frontend go up to 18
while my backend only supports 17. It worked. :-)


Wei.
Ian Campbell
2013-Mar-15 09:23 UTC
Re: [PATCH RFC] xen-netback: coalesce frags before copying
On Thu, 2013-03-14 at 18:26 +0000, Wei Liu wrote:
> This patch tries to coalesce txps when constructing grant copy
> structures. It enables netback to deal with the situation when the
> frontend's MAX_SKB_FRAGS is larger than the backend's MAX_SKB_FRAGS.
>
> It also defines max_skb_frags, which is an estimation of the maximum
> number of frags a guest can send; anything bigger than that is
> considered malicious. Now it is set to 20, which should be enough to
> accommodate Linux (16 to 19) and possibly Windows (19?).
>
> Signed-off-by: Wei Liu <wei.liu2@citrix.com>
> ---
>  drivers/net/xen-netback/netback.c | 164 +++++++++++++++++++++++++++++--------
>  1 file changed, 128 insertions(+), 36 deletions(-)
>
> diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
> index 6e8e51a..e27f81a 100644
> --- a/drivers/net/xen-netback/netback.c
> +++ b/drivers/net/xen-netback/netback.c
> @@ -47,9 +47,20 @@
>  #include <asm/xen/hypercall.h>
>  #include <asm/xen/page.h>
>
> +/*
> + * This is an estimation of the maximum possible frags a SKB might
> + * have, anything larger than this is considered malicious. Typically
> + * Linux has 16 to 19, Windows has 19.

Perhaps rather than this static (although configurable)

> + */
> +#define NETBK_MAX_SKB_FRAGS_DEFAULT 20
> +static unsigned int max_skb_frags = NETBK_MAX_SKB_FRAGS_DEFAULT;
> +module_param(max_skb_frags, uint, 0444);

You could at least make this 0644 so root can up the limit dynamically?

> [...]
Ian Campbell
2013-Mar-15 09:29 UTC
Re: [PATCH RFC] xen-netback: coalesce frags before copying
On Fri, 2013-03-15 at 09:23 +0000, Ian Campbell wrote:
> On Thu, 2013-03-14 at 18:26 +0000, Wei Liu wrote:
> > This patch tries to coalesce txps when constructing grant copy
> > structures. It enables netback to deal with the situation when the
> > frontend's MAX_SKB_FRAGS is larger than the backend's MAX_SKB_FRAGS.
> >
> > It also defines max_skb_frags, which is an estimation of the maximum
> > number of frags a guest can send; anything bigger than that is
> > considered malicious. Now it is set to 20, which should be enough to
> > accommodate Linux (16 to 19) and possibly Windows (19?).
> >
> > Signed-off-by: Wei Liu <wei.liu2@citrix.com>
> > ---
> >  drivers/net/xen-netback/netback.c | 164 +++++++++++++++++++++++++++++--------
> >  1 file changed, 128 insertions(+), 36 deletions(-)
> >
> > diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
> > index 6e8e51a..e27f81a 100644
> > --- a/drivers/net/xen-netback/netback.c
> > +++ b/drivers/net/xen-netback/netback.c
> > @@ -47,9 +47,20 @@
> >  #include <asm/xen/hypercall.h>
> >  #include <asm/xen/page.h>
> >
> > +/*
> > + * This is an estimation of the maximum possible frags a SKB might
> > + * have, anything larger than this is considered malicious. Typically
> > + * Linux has 16 to 19, Windows has 19.
>
> Perhaps rather than this static (although configurable)

Slipped and hit send... proper reply is forthcoming...
Ian Campbell
2013-Mar-15 10:42 UTC
Re: [PATCH RFC] xen-netback: coalesce frags before copying
On Thu, 2013-03-14 at 18:26 +0000, Wei Liu wrote:
> This patch tries to coalesce txps when constructing grant copy
> structures. It enables netback to deal with the situation when the
> frontend's MAX_SKB_FRAGS is larger than the backend's MAX_SKB_FRAGS.
>
> It also defines max_skb_frags, which is an estimation of the maximum
> number of frags a guest can send; anything bigger than that is
> considered malicious. Now it is set to 20, which should be enough to
> accommodate Linux (16 to 19) and possibly Windows (19?).
>
> Signed-off-by: Wei Liu <wei.liu2@citrix.com>
> ---
>  drivers/net/xen-netback/netback.c | 164 +++++++++++++++++++++++++++++--------
>  1 file changed, 128 insertions(+), 36 deletions(-)
>
> diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
> index 6e8e51a..e27f81a 100644
> --- a/drivers/net/xen-netback/netback.c
> +++ b/drivers/net/xen-netback/netback.c
> @@ -47,9 +47,20 @@
>  #include <asm/xen/hypercall.h>
>  #include <asm/xen/page.h>
>
> +/*
> + * This is an estimation of the maximum possible frags a SKB might
> + * have, anything larger than this is considered malicious. Typically
> + * Linux has 16 to 19, Windows has 19.

Perhaps rather than this static (although configurable) limit we could
avoid malicious guests by aborting only if we see a slot with invalid
contents? I think the XSA-39 issue doesn't exist so long as each request
results in a response (which a valid slot does).

Although I'm also sympathetic to the idea that this approach is simpler
and therefore expedient in the short term, and we can consider the more
complicated approach above for the future.

> + */
> +#define NETBK_MAX_SKB_FRAGS_DEFAULT 20
> +static unsigned int max_skb_frags = NETBK_MAX_SKB_FRAGS_DEFAULT;
> +module_param(max_skb_frags, uint, 0444);

You could at least make this 0644 so root can up the limit dynamically?

> @@ -968,42 +979,98 @@ static struct gnttab_copy *xen_netbk_get_requests(struct xen_netbk *netbk,
>  	struct skb_shared_info *shinfo = skb_shinfo(skb);
>  	skb_frag_t *frags = shinfo->frags;
>  	u16 pending_idx = *((u16 *)skb->data);
> -	int i, start;
> +	u16 head_idx = 0;
> +	int i, j, start;
> +	struct page *page;
> +	pending_ring_idx_t index;
> +	uint16_t dst_offset;
> +	int total_frags_merged;
> +	unsigned int nr_frags = shinfo->nr_frags; /* This can be as large as max_skb_frags */
> +	struct pending_tx_info *first = NULL;
> +	int nr_txp;
>
>  	/* Skip first skb fragment if it is on same page as header fragment. */
>  	start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
>
> -	for (i = start; i < shinfo->nr_frags; i++, txp++) {
> -		struct page *page;
> -		pending_ring_idx_t index;
> +	/* Coalesce */
> +	total_frags_merged = 0;
> +	for (i = j = start; i < MAX_SKB_FRAGS && j < nr_frags; i++) {

I can't see any code which causes us to drop the frag if we end up with
i >= MAX_SKB_FRAGS? Is it just too subtle for me?

I wonder if this code might all be a bit clearer if:
        s/nr_frags/nr_slots/
        s/j/slots/
        s/i/frags/
? Or possibly replace i with shinfo->nr_frags (if that works).

I think s/frags/slots/ in max_skb_frags too BTW. Keeping the concept of
skb frags separate from the number of ring slots required to represent
those frags is less confusing here and also helpful when you consider
higher order frags.

>  		struct pending_tx_info *pending_tx_info =
>  			netbk->pending_tx_info;
>
> -		index = pending_index(netbk->pending_cons++);
> -		pending_idx = netbk->pending_ring[index];
> -		page = xen_netbk_alloc_page(netbk, pending_idx);
> +		page = alloc_page(GFP_KERNEL|__GFP_COLD);

Possibly for future enhancement we could use netdev_alloc_frag() and
take advantage of higher order pages here?

>  		if (!page)
>  			goto err;
>
> -		gop->source.u.ref = txp->gref;
> -		gop->source.domid = vif->domid;
> -		gop->source.offset = txp->offset;
> -
> -		gop->dest.u.gmfn = virt_to_mfn(page_address(page));
> -		gop->dest.domid = DOMID_SELF;
> -		gop->dest.offset = txp->offset;
> -
> -		gop->len = txp->size;
> -		gop->flags = GNTCOPY_source_gref;
> +		nr_txp = 0;
> +		dst_offset = 0;
> +		first = NULL;
> +		while (dst_offset < PAGE_SIZE && j < nr_frags) {

I think splitting the handling of a single request slot into a function
would be useful for clarity, e.g. xen_netbk_handle_one_request or
something.

> +			gop->flags = GNTCOPY_source_gref;
> +
> +			gop->source.u.ref = txp->gref;
> +			gop->source.domid = vif->domid;
> +			gop->source.offset = txp->offset;
> +
> +			gop->dest.domid = DOMID_SELF;
> +
> +			gop->dest.offset = dst_offset;
> +			gop->dest.u.gmfn = virt_to_mfn(page_address(page));
> +
> +			if (dst_offset + txp->size > PAGE_SIZE) {
> +				/* This page can only merge a portion of txp */
> +				gop->len = PAGE_SIZE - dst_offset;
> +				txp->offset += gop->len;
> +				txp->size -= gop->len;
> +				dst_offset += gop->len; /* == PAGE_SIZE, will quit loop */

In this case we don't touch pending_cons etc? Is that because when we
hit this case we always go around again and will eventually hit the else
case below, ensuring we do the pending_cons stuff exactly once for the
tail end of the buffer? I think that is OK but it is worthy of a
comment.

What happens if we hit SKB_FRAG_MAX right as we hit this case? Might we
fail to respond to the final request which has been split in this way?

> +			} else {
> +				/* This txp can be merged in the page */
> +				gop->len = txp->size;
> +				dst_offset += gop->len;
> +
> +				index = pending_index(netbk->pending_cons++);
> +
> +				pending_idx = netbk->pending_ring[index];
> +
> +				memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp));
> +				xenvif_get(vif);
> +
> +				pending_tx_info[pending_idx].vif = vif;
> +
> +				/*
> +				 * Poison these fields, corrsponding

"corresponding"

> +				 * fields for head txp will be set to
> +				 * correct values after the loop.
> +				 */
> +				pending_tx_info[pending_idx].nr_tx_req = (u16)(~(u16)0);
> +				netbk->mmap_pages[pending_idx] = (void *)(~0UL);
> +				pending_tx_info[pending_idx].start_idx = -1;
> +
> +				if (unlikely(!first)) {
> +					first = &pending_tx_info[pending_idx];
> +					first->start_idx = index;
> +					head_idx = pending_idx;
> +				}
> +
> +				txp++;
> +				nr_txp++;
> +				j++;
> +			}
>
> -		gop++;
> +			gop++;
> +		}
>
> -		memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp));
> -		xenvif_get(vif);
> -		pending_tx_info[pending_idx].vif = vif;
> -		frag_set_pending_idx(&frags[i], pending_idx);
> +		first->req.offset = 0;
> +		first->req.size = dst_offset;
> +		first->nr_tx_req = nr_txp;
> +		total_frags_merged += nr_txp - 1;
> +		set_page_ext(page, netbk, head_idx);
> +		netbk->mmap_pages[head_idx] = page;
> +		frag_set_pending_idx(&frags[i], head_idx);
>  	}
>
> +	shinfo->nr_frags -= total_frags_merged;

This adjustment seems a bit odd, why not just count them correctly in
the first place?

> +
>  	return gop;
>  err:
>  	/* Unwind, freeing all pages and sending error responses. */
> @@ -1025,6 +1092,7 @@ static int xen_netbk_tx_check_gop(struct xen_netbk *netbk,
>  	struct gnttab_copy *gop = *gopp;
>  	u16 pending_idx = *((u16 *)skb->data);
>  	struct skb_shared_info *shinfo = skb_shinfo(skb);
> +	struct pending_tx_info *tx_info;
>  	int nr_frags = shinfo->nr_frags;
>  	int i, err, start;
>
> @@ -1037,12 +1105,14 @@ static int xen_netbk_tx_check_gop(struct xen_netbk *netbk,
>  	start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
>
>  	for (i = start; i < nr_frags; i++) {
> -		int j, newerr;
> +		int j, newerr = 0, n;
>
>  		pending_idx = frag_get_pending_idx(&shinfo->frags[i]);
> +		tx_info = &netbk->pending_tx_info[pending_idx];
>
>  		/* Check error status: if okay then remember grant handle. */
> -		newerr = (++gop)->status;
> +		for (n = 0; n < tx_info->nr_tx_req; n++)
> +			newerr |= (++gop)->status;

I don't think GNTST_* are or-able in this way... I think for each
request the response should contain the status of the first failed grant
op relating to that slot, or success.

>  		if (likely(!newerr)) {
>  			/* Had a previous error? Invalidate this fragment. */
>  			if (unlikely(err))
> @@ -1267,11 +1337,11 @@ static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk)
>  		struct sk_buff *skb;
>  		int ret;
>
> -	while (((nr_pending_reqs(netbk) + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
> +	while (((nr_pending_reqs(netbk) + max_skb_frags) < MAX_PENDING_REQS) &&
>  		!list_empty(&netbk->net_schedule_list)) {
>  		struct xenvif *vif;
>  		struct xen_netif_tx_request txreq;
> -		struct xen_netif_tx_request txfrags[MAX_SKB_FRAGS];
> +		struct xen_netif_tx_request txfrags[max_skb_frags];

I guess this stack variable puts an implicit limit on max_skb_frags. It
also argues against being able to change it dynamically as I suggested
above (imagine a race between setting up the stack here and checking
against max_skb_frags later).

> @@ -1548,13 +1626,27 @@ static void xen_netbk_idx_release(struct xen_netbk *netbk, u16 pending_idx,
>  	pending_tx_info = &netbk->pending_tx_info[pending_idx];
>
>  	vif = pending_tx_info->vif;
> +	nr = pending_tx_info->nr_tx_req;
> +	start_idx = pending_tx_info->start_idx;
>
> -	make_tx_response(vif, &pending_tx_info->req, status);
> +	BUG_ON(nr == (u16)(~(u16)0));

Is one of those casts not redundant?

Ian.
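[For concreteness, the "first failed grant op" semantics suggested above
could look like the sketch below. This is a standalone toy, not the
actual follow-up patch. It assumes the usual grant-table status
convention (GNTST_okay == 0, failures are distinct negative codes),
which is also why OR-ing statuses is wrong: OR-ing two different
negative codes yields a bit pattern, typically -1, that need not match
the op that actually failed.]

#include <stdio.h>

#define GNTST_okay 0 /* success; real GNTST_* errors are negative */

int main(void)
{
	/* Stand-ins for the per-slot (++gop)->status values making up
	 * one coalesced frag: ok, then a hypothetical -3, then ok. */
	int status[] = { GNTST_okay, -3, GNTST_okay };
	int nr_tx_req = 3, n;
	int newerr = GNTST_okay;

	for (n = 0; n < nr_tx_req; n++) {
		/* First failure wins; later statuses never overwrite it. */
		if (status[n] != GNTST_okay && newerr == GNTST_okay)
			newerr = status[n];
	}

	printf("newerr = %d\n", newerr); /* -3, not an OR-ed blend */
	return 0;
}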
On Fri, 2013-03-15 at 10:42 +0000, Ian Campbell wrote:
> On Thu, 2013-03-14 at 18:26 +0000, Wei Liu wrote:
> > This patch tries to coalesce txps when constructing grant copy
> > structures. It enables netback to deal with the situation when the
> > frontend's MAX_SKB_FRAGS is larger than the backend's MAX_SKB_FRAGS.
> >
> > It also defines max_skb_frags, which is an estimation of the maximum
> > number of frags a guest can send; anything bigger than that is
> > considered malicious. Now it is set to 20, which should be enough to
> > accommodate Linux (16 to 19) and possibly Windows (19?).
> >
> > Signed-off-by: Wei Liu <wei.liu2@citrix.com>
> > ---
> >  drivers/net/xen-netback/netback.c | 164 +++++++++++++++++++++++++++++--------
> >  1 file changed, 128 insertions(+), 36 deletions(-)
> >
> > diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
> > index 6e8e51a..e27f81a 100644
> > --- a/drivers/net/xen-netback/netback.c
> > +++ b/drivers/net/xen-netback/netback.c
> > @@ -47,9 +47,20 @@
> >  #include <asm/xen/hypercall.h>
> >  #include <asm/xen/page.h>
> >
> > +/*
> > + * This is an estimation of the maximum possible frags a SKB might
> > + * have, anything larger than this is considered malicious. Typically
> > + * Linux has 16 to 19, Windows has 19.
>
> Perhaps rather than this static (although configurable) limit we could
> avoid malicious guests by aborting only if we see a slot with invalid
> contents? I think the XSA-39 issue doesn't exist so long as each request
> results in a response (which a valid slot does).
>
> Although I'm also sympathetic to the idea that this approach is simpler
> and therefore expedient in the short term, and we can consider the more
> complicated approach above for the future.
>
> > + */
> > +#define NETBK_MAX_SKB_FRAGS_DEFAULT 20
> > +static unsigned int max_skb_frags = NETBK_MAX_SKB_FRAGS_DEFAULT;
> > +module_param(max_skb_frags, uint, 0444);
>
> You could at least make this 0644 so root can up the limit dynamically?
>

This was done on purpose. Changing this on the fly can cause a race
condition. If we are up to changing this, we need to write more code to
make sure all worker threads are in a quiescent state, which adds a
layer of complexity.

> > @@ -968,42 +979,98 @@ static struct gnttab_copy *xen_netbk_get_requests(struct xen_netbk *netbk,
> >  	struct skb_shared_info *shinfo = skb_shinfo(skb);
> >  	skb_frag_t *frags = shinfo->frags;
> >  	u16 pending_idx = *((u16 *)skb->data);
> > -	int i, start;
> > +	u16 head_idx = 0;
> > +	int i, j, start;
> > +	struct page *page;
> > +	pending_ring_idx_t index;
> > +	uint16_t dst_offset;
> > +	int total_frags_merged;
> > +	unsigned int nr_frags = shinfo->nr_frags; /* This can be as large as max_skb_frags */
> > +	struct pending_tx_info *first = NULL;
> > +	int nr_txp;
> >
> >  	/* Skip first skb fragment if it is on same page as header fragment. */
> >  	start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
> >
> > -	for (i = start; i < shinfo->nr_frags; i++, txp++) {
> > -		struct page *page;
> > -		pending_ring_idx_t index;
> > +	/* Coalesce */
> > +	total_frags_merged = 0;
> > +	for (i = j = start; i < MAX_SKB_FRAGS && j < nr_frags; i++) {
>
> I can't see any code which causes us to drop the frag if we end up with
> i >= MAX_SKB_FRAGS? Is it just too subtle for me?
>

The basic assumption is that it is not possible to reach i >=
MAX_SKB_FRAGS, since the maximum packet size supported in the backend is
64K and we can always accommodate a packet. Any packet larger than 64K
causes a fatal error early on.

I could remove "i < MAX_SKB_FRAGS" if this causes confusion, it is just
a redundant check.

> I wonder if this code might all be a bit clearer if:
>         s/nr_frags/nr_slots/
>         s/j/slots/
>         s/i/frags/
> ? Or possibly replace i with shinfo->nr_frags (if that works).
>
> I think s/frags/slots/ in max_skb_frags too BTW. Keeping the concept of
> skb frags separate from the number of ring slots required to represent
> those frags is less confusing here and also helpful when you consider
> higher order frags.
>

Sure.

> >  		struct pending_tx_info *pending_tx_info =
> >  			netbk->pending_tx_info;
> >
> > -		index = pending_index(netbk->pending_cons++);
> > -		pending_idx = netbk->pending_ring[index];
> > -		page = xen_netbk_alloc_page(netbk, pending_idx);
> > +		page = alloc_page(GFP_KERNEL|__GFP_COLD);
>
> Possibly for future enhancement we could use netdev_alloc_frag() and
> take advantage of higher order pages here?
>

Sure, but I think that deserves a separate patch. This patch is more of
a fix than an improvement.

> >  		if (!page)
> >  			goto err;
> >
> > -		gop->source.u.ref = txp->gref;
> > -		gop->source.domid = vif->domid;
> > -		gop->source.offset = txp->offset;
> > -
> > -		gop->dest.u.gmfn = virt_to_mfn(page_address(page));
> > -		gop->dest.domid = DOMID_SELF;
> > -		gop->dest.offset = txp->offset;
> > -
> > -		gop->len = txp->size;
> > -		gop->flags = GNTCOPY_source_gref;
> > +		nr_txp = 0;
> > +		dst_offset = 0;
> > +		first = NULL;
> > +		while (dst_offset < PAGE_SIZE && j < nr_frags) {
>
> I think splitting the handling of a single request slot into a function
> would be useful for clarity, e.g. xen_netbk_handle_one_request or
> something.
>
> > +			gop->flags = GNTCOPY_source_gref;
> > +
> > +			gop->source.u.ref = txp->gref;
> > +			gop->source.domid = vif->domid;
> > +			gop->source.offset = txp->offset;
> > +
> > +			gop->dest.domid = DOMID_SELF;
> > +
> > +			gop->dest.offset = dst_offset;
> > +			gop->dest.u.gmfn = virt_to_mfn(page_address(page));
> > +
> > +			if (dst_offset + txp->size > PAGE_SIZE) {
> > +				/* This page can only merge a portion of txp */
> > +				gop->len = PAGE_SIZE - dst_offset;
> > +				txp->offset += gop->len;
> > +				txp->size -= gop->len;
> > +				dst_offset += gop->len; /* == PAGE_SIZE, will quit loop */
>
> In this case we don't touch pending_cons etc? Is that because when we
> hit this case we always go around again and will eventually hit the else
> case below, ensuring we do the pending_cons stuff exactly once for the
> tail end of the buffer? I think that is OK but it is worthy of a
> comment.
>

Yes, we cannot touch pending_cons in this case, because this slot needs
to be coped with in the next loop iteration, using another gop.

> What happens if we hit SKB_FRAG_MAX right as we hit this case? Might we
> fail to respond to the final request which has been split in this way?
>

As stated above, we won't hit i >= MAX_SKB_FRAGS.

> > +			} else {
[...]
> > +		}
> >
> > -		memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp));
> > -		xenvif_get(vif);
> > -		pending_tx_info[pending_idx].vif = vif;
> > -		frag_set_pending_idx(&frags[i], pending_idx);
> > +		first->req.offset = 0;
> > +		first->req.size = dst_offset;
> > +		first->nr_tx_req = nr_txp;
> > +		total_frags_merged += nr_txp - 1;
> > +		set_page_ext(page, netbk, head_idx);
> > +		netbk->mmap_pages[head_idx] = page;
> > +		frag_set_pending_idx(&frags[i], head_idx);
> >  	}
> >
> > +	shinfo->nr_frags -= total_frags_merged;
>
> This adjustment seems a bit odd, why not just count them correctly in
> the first place?
>

Sure.

> > +
> >  	return gop;
> >  err:
> >  	/* Unwind, freeing all pages and sending error responses. */
> > @@ -1025,6 +1092,7 @@ static int xen_netbk_tx_check_gop(struct xen_netbk *netbk,
> >  	struct gnttab_copy *gop = *gopp;
> >  	u16 pending_idx = *((u16 *)skb->data);
> >  	struct skb_shared_info *shinfo = skb_shinfo(skb);
> > +	struct pending_tx_info *tx_info;
> >  	int nr_frags = shinfo->nr_frags;
> >  	int i, err, start;
> >
> > @@ -1037,12 +1105,14 @@ static int xen_netbk_tx_check_gop(struct xen_netbk *netbk,
> >  	start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
> >
> >  	for (i = start; i < nr_frags; i++) {
> > -		int j, newerr;
> > +		int j, newerr = 0, n;
> >
> >  		pending_idx = frag_get_pending_idx(&shinfo->frags[i]);
> > +		tx_info = &netbk->pending_tx_info[pending_idx];
> >
> >  		/* Check error status: if okay then remember grant handle. */
> > -		newerr = (++gop)->status;
> > +		for (n = 0; n < tx_info->nr_tx_req; n++)
> > +			newerr |= (++gop)->status;
>
> I don't think GNTST_* are or-able in this way... I think for each
> request the response should contain the status of the first failed grant
> op relating to that slot, or success.
>

Fixed.

> >  		if (likely(!newerr)) {
> >  			/* Had a previous error? Invalidate this fragment. */
> >  			if (unlikely(err))
> > @@ -1267,11 +1337,11 @@ static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk)
> >  		struct sk_buff *skb;
> >  		int ret;
> >
> > -	while (((nr_pending_reqs(netbk) + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
> > +	while (((nr_pending_reqs(netbk) + max_skb_frags) < MAX_PENDING_REQS) &&
> > 		!list_empty(&netbk->net_schedule_list)) {
> >  		struct xenvif *vif;
> >  		struct xen_netif_tx_request txreq;
> > -		struct xen_netif_tx_request txfrags[MAX_SKB_FRAGS];
> > +		struct xen_netif_tx_request txfrags[max_skb_frags];
>
> I guess this stack variable puts an implicit limit on max_skb_frags. It
> also argues against being able to change it dynamically as I suggested
> above (imagine a race between setting up the stack here and checking
> against max_skb_frags later).
>

Yes. See above explanation.


Wei.
Ian Campbell
2013-Mar-15 13:25 UTC
Re: [PATCH RFC] xen-netback: coalesce frags before copying
On Fri, 2013-03-15 at 12:31 +0000, Wei Liu wrote:
> On Fri, 2013-03-15 at 10:42 +0000, Ian Campbell wrote:
> > On Thu, 2013-03-14 at 18:26 +0000, Wei Liu wrote:
> >
> > > + */
> > > +#define NETBK_MAX_SKB_FRAGS_DEFAULT 20
> > > +static unsigned int max_skb_frags = NETBK_MAX_SKB_FRAGS_DEFAULT;
> > > +module_param(max_skb_frags, uint, 0444);
> >
> > You could at least make this 0644 so root can up the limit dynamically?
> >
>
> This was done on purpose. Changing this on the fly can cause a race
> condition. If we are up to changing this, we need to write more code to
> make sure all worker threads are in a quiescent state, which adds a
> layer of complexity.

Right, I figured this out myself further down ;-)

> > > -	for (i = start; i < shinfo->nr_frags; i++, txp++) {
> > > -		struct page *page;
> > > -		pending_ring_idx_t index;
> > > +	/* Coalesce */
> > > +	total_frags_merged = 0;
> > > +	for (i = j = start; i < MAX_SKB_FRAGS && j < nr_frags; i++) {
> >
> > I can't see any code which causes us to drop the frag if we end up
> > with i >= MAX_SKB_FRAGS? Is it just too subtle for me?
> >
>
> The basic assumption is that it is not possible to reach i >=
> MAX_SKB_FRAGS, since the maximum packet size supported in the backend is
> 64K and we can always accommodate a packet. Any packet larger than 64K
> causes a fatal error early on.

Does that early check also account for frontends which lie in the
initial slot->size, or is that otherwise caught later too? i.e. do we do
the right thing when there are more bytes in the additional slots than
the initial header said there would be? I expect we already do, I just
want to be sure this doesn't subtly alter the conditions.

> I could remove "i < MAX_SKB_FRAGS" if this causes confusion, it is just
> a redundant check.

Is it an actual invariant? i.e. could it be BUG_ON?

> > >  		struct pending_tx_info *pending_tx_info =
> > >  			netbk->pending_tx_info;
> > >
> > > -		index = pending_index(netbk->pending_cons++);
> > > -		pending_idx = netbk->pending_ring[index];
> > > -		page = xen_netbk_alloc_page(netbk, pending_idx);
> > > +		page = alloc_page(GFP_KERNEL|__GFP_COLD);
> >
> > Possibly for future enhancement we could use netdev_alloc_frag() and
> > take advantage of higher order pages here?
> >
>
> Sure, but I think that deserves a separate patch. This patch is more of
> a fix than an improvement.

Right, by "future enhancement" I meant a future patch, not a change to
this one.

> > > +			gop->flags = GNTCOPY_source_gref;
> > > +
> > > +			gop->source.u.ref = txp->gref;
> > > +			gop->source.domid = vif->domid;
> > > +			gop->source.offset = txp->offset;
> > > +
> > > +			gop->dest.domid = DOMID_SELF;
> > > +
> > > +			gop->dest.offset = dst_offset;
> > > +			gop->dest.u.gmfn = virt_to_mfn(page_address(page));
> > > +
> > > +			if (dst_offset + txp->size > PAGE_SIZE) {
> > > +				/* This page can only merge a portion of txp */
> > > +				gop->len = PAGE_SIZE - dst_offset;
> > > +				txp->offset += gop->len;
> > > +				txp->size -= gop->len;
> > > +				dst_offset += gop->len; /* == PAGE_SIZE, will quit loop */
> >
> > In this case we don't touch pending_cons etc? Is that because when we
> > hit this case we always go around again and will eventually hit the
> > else case below, ensuring we do the pending_cons stuff exactly once
> > for the tail end of the buffer? I think that is OK but it is worthy of
> > a comment.
> >
>
> Yes, we cannot touch pending_cons in this case, because this slot needs
> to be coped with in the next loop iteration, using another gop.
>
> > What happens if we hit SKB_FRAG_MAX right as we hit this case? Might
> > we fail to respond to the final request which has been split in this
> > way?
> >
>
> As stated above, we won't hit i >= MAX_SKB_FRAGS.

OK, same question for i == MAX_SKB_FRAGS - 1 then ;-) Or whatever
condition would be needed for us to exit here without ever getting to
this else clause.

Ian.
On Fri, 2013-03-15 at 13:25 +0000, Ian Campbell wrote:
> > > > -	for (i = start; i < shinfo->nr_frags; i++, txp++) {
> > > > -		struct page *page;
> > > > -		pending_ring_idx_t index;
> > > > +	/* Coalesce */
> > > > +	total_frags_merged = 0;
> > > > +	for (i = j = start; i < MAX_SKB_FRAGS && j < nr_frags; i++) {
> > >
> > > I can't see any code which causes us to drop the frag if we end up
> > > with i >= MAX_SKB_FRAGS? Is it just too subtle for me?
> > >
> >
> > The basic assumption is that it is not possible to reach i >=
> > MAX_SKB_FRAGS, since the maximum packet size supported in the backend
> > is 64K and we can always accommodate a packet. Any packet larger than
> > 64K causes a fatal error early on.
>
> Does that early check also account for frontends which lie in the
> initial slot->size, or is that otherwise caught later too? i.e. do we do
> the right thing when there are more bytes in the additional slots than
> the initial header said there would be? I expect we already do, I just
> want to be sure this doesn't subtly alter the conditions.
>

We already do that in netbk_count_requests().

> > I could remove "i < MAX_SKB_FRAGS" if this causes confusion, it is
> > just a redundant check.
>
> Is it an actual invariant? i.e. could it be BUG_ON?
>

Yes. I've added BUG_ON(shinfo->nr_frags > MAX_SKB_FRAGS) before
returning to the caller.

> > > > +			gop->flags = GNTCOPY_source_gref;
> > > > +
> > > > +			gop->source.u.ref = txp->gref;
> > > > +			gop->source.domid = vif->domid;
> > > > +			gop->source.offset = txp->offset;
> > > > +
> > > > +			gop->dest.domid = DOMID_SELF;
> > > > +
> > > > +			gop->dest.offset = dst_offset;
> > > > +			gop->dest.u.gmfn = virt_to_mfn(page_address(page));
> > > > +
> > > > +			if (dst_offset + txp->size > PAGE_SIZE) {
> > > > +				/* This page can only merge a portion of txp */
> > > > +				gop->len = PAGE_SIZE - dst_offset;
> > > > +				txp->offset += gop->len;
> > > > +				txp->size -= gop->len;
> > > > +				dst_offset += gop->len; /* == PAGE_SIZE, will quit loop */
> > >
> > > In this case we don't touch pending_cons etc? Is that because when
> > > we hit this case we always go around again and will eventually hit
> > > the else case below, ensuring we do the pending_cons stuff exactly
> > > once for the tail end of the buffer? I think that is OK but it is
> > > worthy of a comment.
> > >
> >
> > Yes, we cannot touch pending_cons in this case, because this slot
> > needs to be coped with in the next loop iteration, using another gop.
> >
> > > What happens if we hit SKB_FRAG_MAX right as we hit this case? Might
> > > we fail to respond to the final request which has been split in this
> > > way?
> > >
> >
> > As stated above, we won't hit i >= MAX_SKB_FRAGS.
>
> OK, same question for i == MAX_SKB_FRAGS - 1 then ;-) Or whatever
> condition would be needed for us to exit here without ever getting to
> this else clause.
>

If we exit the outer for-loop before processing all tx requests for a
packet, that means this packet is larger than 64K. But at this point we
already know that this packet will not be larger than 64K. After
removing the "i < MAX_SKB_FRAGS" check, we can only exit the for-loop
once we finish processing all tx requests for a packet. And the slot
counter j only advances after we finish processing a tx request, in the
'else' clause.


Wei.
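[A back-of-envelope version of that invariant, assuming 4 KiB pages and
the usual Linux definition MAX_SKB_FRAGS = 65536/PAGE_SIZE + 2 = 18;
both are assumptions, not figures quoted from the thread:

  max packet size          = 64 KiB (larger packets hit a fatal error earlier)
  coalesced pages needed  <= 64 KiB / 4 KiB = 16
                             (every page except the last is filled
                             completely before i advances)
  header-page frag         = at most 1 (when start == 1)

  worst case for i         = 16 + 1 = 17 < MAX_SKB_FRAGS = 18

so the outer loop runs out of tx requests before it can run out of frag
slots.]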
On Fri, 2013-03-15 at 10:42 +0000, Ian Campbell wrote:
> >  			goto err;
> >
> > -		gop->source.u.ref = txp->gref;
> > -		gop->source.domid = vif->domid;
> > -		gop->source.offset = txp->offset;
> > -
> > -		gop->dest.u.gmfn = virt_to_mfn(page_address(page));
> > -		gop->dest.domid = DOMID_SELF;
> > -		gop->dest.offset = txp->offset;
> > -
> > -		gop->len = txp->size;
> > -		gop->flags = GNTCOPY_source_gref;
> > +		nr_txp = 0;
> > +		dst_offset = 0;
> > +		first = NULL;
> > +		while (dst_offset < PAGE_SIZE && j < nr_frags) {
>
> I think splitting the handling of a single request slot into a function
> would be useful for clarity, e.g. xen_netbk_handle_one_request or
> something.
>

Regarding this comment, I think it looks better as-is. This loop is
closely coupled to the outer loop.


Wei.