Liang Chen
2023-May-26 05:46 UTC
[PATCH net-next 1/5] virtio_net: Fix an unsafe reference to the page chain
The "private" field of a buffer page is currently used in big mode to
chain pages. But in mergeable mode that field can mean something else,
e.g. when a page_pool page is used instead. So exclude mergeable mode to
avoid such a problem.

Signed-off-by: Liang Chen <liangchen.linux at gmail.com>
---
 drivers/net/virtio_net.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 5a7f7a76b920..c5dca0d92e64 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -497,7 +497,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 			return NULL;
 
 		page = (struct page *)page->private;
-		if (page)
+		if (!vi->mergeable_rx_bufs && page)
 			give_pages(rq, page);
 		goto ok;
 	}
-- 
2.31.1
Liang Chen
2023-May-26 05:46 UTC
[PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
The implementation at the moment uses one page per packet in both the
normal and XDP path. In addition, introduce a module parameter to enable
or disable the use of page pool (disabled by default).

In single-core VM testing environments, it gives a modest performance gain
in the normal path.
  Upstream codebase: 47.5 Gbits/sec
  Upstream codebase + page_pool support: 50.2 Gbits/sec

In multi-core VM testing environments, the most significant performance
gain is observed in XDP cpumap:
  Upstream codebase: 1.38 Gbits/sec
  Upstream codebase + page_pool support: 9.74 Gbits/sec

With this foundation, we can further integrate page pool fragmentation and
DMA map/unmap support.

Signed-off-by: Liang Chen <liangchen.linux at gmail.com>
---
 drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++---------
 1 file changed, 146 insertions(+), 42 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index c5dca0d92e64..99c0ca0c1781 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -31,6 +31,9 @@ module_param(csum, bool, 0444);
 module_param(gso, bool, 0444);
 module_param(napi_tx, bool, 0644);
 
+static bool page_pool_enabled;
+module_param(page_pool_enabled, bool, 0400);
+
 /* FIXME: MTU in config. */
 #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
 #define GOOD_COPY_LEN	128
@@ -159,6 +162,9 @@ struct receive_queue {
 	/* Chain pages by the private ptr. */
 	struct page *pages;
 
+	/* Page pool */
+	struct page_pool *page_pool;
+
 	/* Average packet length for mergeable receive buffers. */
 	struct ewma_pkt_len mrg_avg_pkt_len;
 
@@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen,
 	return skb;
 }
 
+static void virtnet_put_page(struct receive_queue *rq, struct page *page)
+{
+	if (rq->page_pool)
+		page_pool_put_full_page(rq->page_pool, page, true);
+	else
+		put_page(page);
+}
+
 /* Called from bottom half context */
 static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 				   struct receive_queue *rq,
@@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 	hdr = skb_vnet_hdr(skb);
 	memcpy(hdr, hdr_p, hdr_len);
 	if (page_to_free)
-		put_page(page_to_free);
+		virtnet_put_page(rq, page_to_free);
 
 	return skb;
 }
@@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev,
 	return ret;
 }
 
-static void put_xdp_frags(struct xdp_buff *xdp)
+static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq)
 {
 	struct skb_shared_info *shinfo;
 	struct page *xdp_page;
@@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp)
 		shinfo = xdp_get_shared_info_from_buff(xdp);
 		for (i = 0; i < shinfo->nr_frags; i++) {
 			xdp_page = skb_frag_page(&shinfo->frags[i]);
-			put_page(xdp_page);
+			virtnet_put_page(rq, xdp_page);
 		}
 	}
 }
@@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
 	if (page_off + *len + tailroom > PAGE_SIZE)
 		return NULL;
 
-	page = alloc_page(GFP_ATOMIC);
+	if (rq->page_pool)
+		page = page_pool_dev_alloc_pages(rq->page_pool);
+	else
+		page = alloc_page(GFP_ATOMIC);
+
 	if (!page)
 		return NULL;
 
@@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
 		 * is sending packet larger than the MTU.
 		 */
 		if ((page_off + buflen + tailroom) > PAGE_SIZE) {
-			put_page(p);
+			virtnet_put_page(rq, p);
 			goto err_buf;
 		}
 
 		memcpy(page_address(page) + page_off,
 		       page_address(p) + off, buflen);
 		page_off += buflen;
-		put_page(p);
+		virtnet_put_page(rq, p);
 	}
 
 	/* Headroom does not contribute to packet length */
 	*len = page_off - VIRTIO_XDP_HEADROOM;
 	return page;
 err_buf:
-	__free_pages(page, 0);
+	if (rq->page_pool)
+		page_pool_put_full_page(rq->page_pool, page, true);
+	else
+		__free_pages(page, 0);
 	return NULL;
 }
 
@@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf,
 		}
 		stats->bytes += len;
 		page = virt_to_head_page(buf);
-		put_page(page);
+		virtnet_put_page(rq, page);
 	}
 }
 
@@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
 		cur_frag_size = truesize;
 		xdp_frags_truesz += cur_frag_size;
 		if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) {
-			put_page(page);
+			virtnet_put_page(rq, page);
 			pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
 				 dev->name, len, (unsigned long)(truesize - room));
 			dev->stats.rx_length_errors++;
@@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
 	return 0;
 
 err:
-	put_xdp_frags(xdp);
+	put_xdp_frags(xdp, rq);
 	return -EINVAL;
 }
 
@@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
 		if (*len + xdp_room > PAGE_SIZE)
 			return NULL;
 
-		xdp_page = alloc_page(GFP_ATOMIC);
+		if (rq->page_pool)
+			xdp_page = page_pool_dev_alloc_pages(rq->page_pool);
+		else
+			xdp_page = alloc_page(GFP_ATOMIC);
 		if (!xdp_page)
 			return NULL;
 
@@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
 
 	*frame_sz = PAGE_SIZE;
 
-	put_page(*page);
+	virtnet_put_page(rq, *page);
 
 	*page = xdp_page;
 
@@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
 		head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz);
 		if (unlikely(!head_skb))
 			break;
+		if (rq->page_pool)
+			skb_mark_for_recycle(head_skb);
 		return head_skb;
 
 	case XDP_TX:
@@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
 		break;
 	}
 
-	put_xdp_frags(&xdp);
+	put_xdp_frags(&xdp, rq);
 
 err_xdp:
-	put_page(page);
+	virtnet_put_page(rq, page);
 	mergeable_buf_free(rq, num_buf, dev, stats);
 
 	stats->xdp_drops++;
@@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 	head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom);
 	curr_skb = head_skb;
 
+	if (rq->page_pool)
+		skb_mark_for_recycle(curr_skb);
+
 	if (unlikely(!curr_skb))
 		goto err_skb;
 	while (--num_buf) {
@@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 			curr_skb = nskb;
 			head_skb->truesize += nskb->truesize;
 			num_skb_frags = 0;
+			if (rq->page_pool)
+				skb_mark_for_recycle(curr_skb);
 		}
 		if (curr_skb != head_skb) {
 			head_skb->data_len += len;
@@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 		}
 		offset = buf - page_address(page);
 		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
-			put_page(page);
+			virtnet_put_page(rq, page);
 			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
 					     len, truesize);
 		} else {
@@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 	return head_skb;
 
 err_skb:
-	put_page(page);
+	virtnet_put_page(rq, page);
 	mergeable_buf_free(rq, num_buf, dev, stats);
 
 err_buf:
@@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
 	 * disabled GSO for XDP, it won't be a big issue.
 	 */
 	len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
-	if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
-		return -ENOMEM;
+	if (rq->page_pool) {
+		struct page *page;
 
-	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
-	buf += headroom; /* advance address leaving hole at front of pkt */
-	get_page(alloc_frag->page);
-	alloc_frag->offset += len + room;
-	hole = alloc_frag->size - alloc_frag->offset;
-	if (hole < len + room) {
-		/* To avoid internal fragmentation, if there is very likely not
-		 * enough space for another buffer, add the remaining space to
-		 * the current buffer.
-		 * XDP core assumes that frame_size of xdp_buff and the length
-		 * of the frag are PAGE_SIZE, so we disable the hole mechanism.
-		 */
-		if (!headroom)
-			len += hole;
-		alloc_frag->offset += hole;
-	}
+		page = page_pool_dev_alloc_pages(rq->page_pool);
+		if (unlikely(!page))
+			return -ENOMEM;
+		buf = (char *)page_address(page);
+		buf += headroom; /* advance address leaving hole at front of pkt */
+	} else {
+		if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
+			return -ENOMEM;
+		buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
+		buf += headroom; /* advance address leaving hole at front of pkt */
+		get_page(alloc_frag->page);
+		alloc_frag->offset += len + room;
+		hole = alloc_frag->size - alloc_frag->offset;
+		if (hole < len + room) {
+			/* To avoid internal fragmentation, if there is very likely not
+			 * enough space for another buffer, add the remaining space to
+			 * the current buffer.
+			 * XDP core assumes that frame_size of xdp_buff and the length
+			 * of the frag are PAGE_SIZE, so we disable the hole mechanism.
+			 */
+			if (!headroom)
+				len += hole;
+			alloc_frag->offset += hole;
+		}
+	}
 
 	sg_init_one(rq->sg, buf, len);
 	ctx = mergeable_len_to_ctx(len + room, headroom);
 	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
 	if (err < 0)
-		put_page(virt_to_head_page(buf));
+		virtnet_put_page(rq, virt_to_head_page(buf));
 
 	return err;
 }
@@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
 	if (err < 0)
 		return err;
 
-	err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
-					 MEM_TYPE_PAGE_SHARED, NULL);
+	if (vi->rq[qp_index].page_pool)
+		err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
+						 MEM_TYPE_PAGE_POOL,
+						 vi->rq[qp_index].page_pool);
+	else
+		err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
+						 MEM_TYPE_PAGE_SHARED,
+						 NULL);
+
 	if (err < 0)
 		goto err_xdp_reg_mem_model;
 
@@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
 			ethtool_sprintf(&p, "tx_queue_%u_%s", i,
 					virtnet_sq_stats_desc[j].desc);
 		}
+		page_pool_ethtool_stats_get_strings(p);
 		break;
 	}
 }
@@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset)
 	switch (sset) {
 	case ETH_SS_STATS:
 		return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
-					       VIRTNET_SQ_STATS_LEN);
+					       VIRTNET_SQ_STATS_LEN +
+					       (page_pool_enabled && vi->mergeable_rx_bufs ?
+						page_pool_ethtool_stats_get_count() : 0));
 	default:
 		return -EOPNOTSUPP;
 	}
 }
 
+static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data)
+{
+#ifdef CONFIG_PAGE_POOL_STATS
+	struct virtnet_info *vi = netdev_priv(dev);
+	struct page_pool_stats pp_stats = {};
+	int i;
+
+	for (i = 0; i < vi->curr_queue_pairs; i++) {
+		if (!vi->rq[i].page_pool)
+			continue;
+		page_pool_get_stats(vi->rq[i].page_pool, &pp_stats);
+	}
+	page_pool_ethtool_stats_get(data, &pp_stats);
+#endif /* CONFIG_PAGE_POOL_STATS */
+}
+
 static void virtnet_get_ethtool_stats(struct net_device *dev,
 				      struct ethtool_stats *stats, u64 *data)
 {
@@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev,
 		} while (u64_stats_fetch_retry(&sq->stats.syncp, start));
 		idx += VIRTNET_SQ_STATS_LEN;
 	}
+
+	virtnet_get_page_pool_stats(dev, &data[idx]);
 }
 
 static void virtnet_get_channels(struct net_device *dev,
@@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi)
 	for (i = 0; i < vi->max_queue_pairs; i++) {
 		__netif_napi_del(&vi->rq[i].napi);
 		__netif_napi_del(&vi->sq[i].napi);
+		if (vi->rq[i].page_pool)
+			page_pool_destroy(vi->rq[i].page_pool);
 	}
 
 	/* We called __netif_napi_del(),
@@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf)
 	struct virtnet_info *vi = vq->vdev->priv;
 	int i = vq2rxq(vq);
 
-	if (vi->mergeable_rx_bufs)
-		put_page(virt_to_head_page(buf));
-	else if (vi->big_packets)
+	if (vi->mergeable_rx_bufs) {
+		if (vi->rq[i].page_pool) {
+			page_pool_put_full_page(vi->rq[i].page_pool,
+						virt_to_head_page(buf),
+						true);
+		} else {
+			put_page(virt_to_head_page(buf));
+		}
+	} else if (vi->big_packets) {
 		give_pages(&vi->rq[i], buf);
-	else
+	} else {
 		put_page(virt_to_head_page(buf));
+	}
 }
 
 static void free_unused_bufs(struct virtnet_info *vi)
@@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi)
 	virtnet_free_queues(vi);
 }
 
+static void virtnet_alloc_page_pool(struct receive_queue *rq)
+{
+	struct virtio_device *vdev = rq->vq->vdev;
+
+	struct page_pool_params pp_params = {
+		.order = 0,
+		.pool_size = rq->vq->num_max,
+		.nid = dev_to_node(vdev->dev.parent),
+		.dev = vdev->dev.parent,
+		.offset = 0,
+	};
+
+	rq->page_pool = page_pool_create(&pp_params);
+	if (IS_ERR(rq->page_pool)) {
+		dev_warn(&vdev->dev, "page pool creation failed: %ld\n",
+			 PTR_ERR(rq->page_pool));
+		rq->page_pool = NULL;
+	}
+}
+
 /* How large should a single buffer be so a queue full of these can fit at
  * least one full packet?
  * Logic below assumes the mergeable buffer header is used.
@@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
 		vi->rq[i].vq = vqs[rxq2vq(i)];
 		vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
 		vi->sq[i].vq = vqs[txq2vq(i)];
+
+		if (page_pool_enabled && vi->mergeable_rx_bufs)
+			virtnet_alloc_page_pool(&vi->rq[i]);
+		else
+			dev_warn(&vi->vdev->dev,
+				 "page pool only support mergeable mode\n");
+
 	}
 
 	/* run here: ret == 0. */
-- 
2.31.1
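The per-queue page_pool lifecycle that the patch wires into the RX path can be condensed into the sketch below. It only restates calls that already appear in the diff above (page_pool_create(), page_pool_dev_alloc_pages(), page_pool_put_full_page(), page_pool_destroy()); the standalone function and its name are illustrative, not part of the patch.

#include <linux/err.h>
#include <net/page_pool.h>

/* Condensed sketch of the page_pool lifecycle this patch gives each RX
 * queue; error handling and the surrounding driver state are trimmed.
 */
static int page_pool_lifecycle_sketch(struct device *dma_dev,
				      unsigned int ring_size)
{
	struct page_pool_params pp_params = {
		.order		= 0,			/* one page per packet */
		.pool_size	= ring_size,		/* sized to the RX ring */
		.nid		= dev_to_node(dma_dev),
		.dev		= dma_dev,
		.offset		= 0,
	};
	struct page_pool *pool;
	struct page *page;

	pool = page_pool_create(&pp_params);
	if (IS_ERR(pool))
		return PTR_ERR(pool);

	/* RX refill: pull a page from the pool instead of alloc_page(). */
	page = page_pool_dev_alloc_pages(pool);
	if (!page) {
		page_pool_destroy(pool);
		return -ENOMEM;
	}

	/* Error/teardown path: hand the page back to the pool for recycling. */
	page_pool_put_full_page(pool, page, true);

	page_pool_destroy(pool);
	return 0;
}

Pages that end up attached to an skb are instead recycled through skb_mark_for_recycle(), and XDP buffers are returned via the MEM_TYPE_PAGE_POOL memory model registered in virtnet_enable_queue_pair(), both of which appear in the diff above.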
Liang Chen
2023-May-26 05:46 UTC
[PATCH net-next 3/5] virtio_net: Add page pool fragmentation support
To further enhance performance, implement page pool fragmentation support
and introduce a module parameter to enable or disable it.

In single-core VM testing environments, there is an additional performance
gain observed in the normal path compared to the one-packet-per-page
approach.
  Upstream codebase: 47.5 Gbits/sec
  Upstream codebase with page pool: 50.2 Gbits/sec
  Upstream codebase with page pool fragmentation support: 52.3 Gbits/sec

There is also some performance gain for XDP cpumap.
  Upstream codebase: 1.38 Gbits/sec
  Upstream codebase with page pool: 9.74 Gbits/sec
  Upstream codebase with page pool fragmentation: 10.3 Gbits/sec

Signed-off-by: Liang Chen <liangchen.linux at gmail.com>
---
 drivers/net/virtio_net.c | 72 ++++++++++++++++++++++++++++++----------
 1 file changed, 55 insertions(+), 17 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 99c0ca0c1781..ac40b8c66c59 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -32,7 +32,9 @@ module_param(gso, bool, 0444);
 module_param(napi_tx, bool, 0644);
 
 static bool page_pool_enabled;
+static bool page_pool_frag;
 module_param(page_pool_enabled, bool, 0400);
+module_param(page_pool_frag, bool, 0400);
 
 /* FIXME: MTU in config. */
 #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
@@ -909,23 +911,32 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
 				       struct page *p,
 				       int offset,
 				       int page_off,
-				       unsigned int *len)
+				       unsigned int *len,
+				       unsigned int *pp_frag_offset)
 {
 	int tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 	struct page *page;
+	unsigned int pp_frag_offset_val;
 
 	if (page_off + *len + tailroom > PAGE_SIZE)
 		return NULL;
 
 	if (rq->page_pool)
-		page = page_pool_dev_alloc_pages(rq->page_pool);
+		if (rq->page_pool->p.flags & PP_FLAG_PAGE_FRAG)
+			page = page_pool_dev_alloc_frag(rq->page_pool, pp_frag_offset,
+							PAGE_SIZE);
+		else
+			page = page_pool_dev_alloc_pages(rq->page_pool);
 	else
 		page = alloc_page(GFP_ATOMIC);
 
 	if (!page)
 		return NULL;
 
-	memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
+	pp_frag_offset_val = pp_frag_offset ? *pp_frag_offset : 0;
+
+	memcpy(page_address(page) + page_off + pp_frag_offset_val,
+	       page_address(p) + offset, *len);
 	page_off += *len;
 
 	while (--*num_buf) {
@@ -948,7 +959,7 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
 			goto err_buf;
 		}
 
-		memcpy(page_address(page) + page_off,
+		memcpy(page_address(page) + page_off + pp_frag_offset_val,
 		       page_address(p) + off, buflen);
 		page_off += buflen;
 		virtnet_put_page(rq, p);
@@ -1029,7 +1040,7 @@ static struct sk_buff *receive_small_xdp(struct net_device *dev,
 				  SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 		xdp_page = xdp_linearize_page(rq, &num_buf, page,
 					      offset, header_offset,
-					      &tlen);
+					      &tlen, NULL);
 		if (!xdp_page)
 			goto err_xdp;
 
@@ -1323,6 +1334,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
 	unsigned int headroom = mergeable_ctx_to_headroom(ctx);
 	struct page *xdp_page;
 	unsigned int xdp_room;
+	unsigned int page_frag_offset = 0;
 
 	/* Transient failure which in theory could occur if
 	 * in-flight packets from before XDP was enabled reach
@@ -1356,7 +1368,8 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
 		xdp_page = xdp_linearize_page(rq, num_buf,
 					      *page, offset,
 					      VIRTIO_XDP_HEADROOM,
-					      len);
+					      len,
+					      &page_frag_offset);
 		if (!xdp_page)
 			return NULL;
 	} else {
@@ -1366,14 +1379,19 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
 			return NULL;
 
 		if (rq->page_pool)
-			xdp_page = page_pool_dev_alloc_pages(rq->page_pool);
+			if (rq->page_pool->p.flags & PP_FLAG_PAGE_FRAG)
+				xdp_page = page_pool_dev_alloc_frag(rq->page_pool,
+								    &page_frag_offset, PAGE_SIZE);
+			else
+				xdp_page = page_pool_dev_alloc_pages(rq->page_pool);
 		else
 			xdp_page = alloc_page(GFP_ATOMIC);
+
 		if (!xdp_page)
 			return NULL;
 
-		memcpy(page_address(xdp_page) + VIRTIO_XDP_HEADROOM,
-		       page_address(*page) + offset, *len);
+		memcpy(page_address(xdp_page) + VIRTIO_XDP_HEADROOM +
+		       page_frag_offset, page_address(*page) + offset, *len);
 	}
 
 	*frame_sz = PAGE_SIZE;
@@ -1382,7 +1400,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
 
 	*page = xdp_page;
 
-	return page_address(*page) + VIRTIO_XDP_HEADROOM;
+	return page_address(*page) + VIRTIO_XDP_HEADROOM + page_frag_offset;
 }
 
 static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
@@ -1762,6 +1780,7 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
 	void *ctx;
 	int err;
 	unsigned int len, hole;
+	unsigned int pp_frag_offset;
 
 	/* Extra tailroom is needed to satisfy XDP's assumption. This
 	 * means rx frags coalescing won't work, but consider we've
@@ -1769,13 +1788,29 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
 	 */
 	len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
 	if (rq->page_pool) {
-		struct page *page;
+		if (rq->page_pool->p.flags & PP_FLAG_PAGE_FRAG) {
+			if (unlikely(!page_pool_dev_alloc_frag(rq->page_pool,
+							       &pp_frag_offset, len + room)))
+				return -ENOMEM;
+			buf = (char *)page_address(rq->page_pool->frag_page) +
+				pp_frag_offset;
+			buf += headroom; /* advance address leaving hole at front of pkt */
+			hole = (PAGE_SIZE << rq->page_pool->p.order)
+				- rq->page_pool->frag_offset;
+			if (hole < len + room) {
+				if (!headroom)
+					len += hole;
+				rq->page_pool->frag_offset += hole;
+			}
+		} else {
+			struct page *page;
 
-		page = page_pool_dev_alloc_pages(rq->page_pool);
-		if (unlikely(!page))
-			return -ENOMEM;
-		buf = (char *)page_address(page);
-		buf += headroom; /* advance address leaving hole at front of pkt */
+			page = page_pool_dev_alloc_pages(rq->page_pool);
+			if (unlikely(!page))
+				return -ENOMEM;
+			buf = (char *)page_address(page);
+			buf += headroom; /* advance address leaving hole at front of pkt */
+		}
 	} else {
 		if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
 			return -ENOMEM;
@@ -3800,13 +3835,16 @@ static void virtnet_alloc_page_pool(struct receive_queue *rq)
 	struct virtio_device *vdev = rq->vq->vdev;
 
 	struct page_pool_params pp_params = {
-		.order = 0,
+		.order = page_pool_frag ? SKB_FRAG_PAGE_ORDER : 0,
 		.pool_size = rq->vq->num_max,
 		.nid = dev_to_node(vdev->dev.parent),
 		.dev = vdev->dev.parent,
 		.offset = 0,
	};
 
+	if (page_pool_frag)
+		pp_params.flags |= PP_FLAG_PAGE_FRAG;
+
 	rq->page_pool = page_pool_create(&pp_params);
 	if (IS_ERR(rq->page_pool)) {
 		dev_warn(&vdev->dev, "page pool creation failed: %ld\n",
-- 
2.31.1
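The fragmentation mode carves per-buffer chunks out of a higher-order pool page instead of dedicating a full page to every packet. A minimal sketch of the allocation step, assuming a pool created with PP_FLAG_PAGE_FRAG and a non-zero order as in the diff above (the helper name is illustrative):

#include <linux/mm.h>
#include <net/page_pool.h>

/* Minimal sketch: allocate a per-packet fragment from a page_pool created
 * with PP_FLAG_PAGE_FRAG (the patch uses SKB_FRAG_PAGE_ORDER for .order).
 */
static void *alloc_frag_sketch(struct page_pool *pool, unsigned int size)
{
	unsigned int offset;
	struct page *page;

	/* Returns the (possibly shared) pool page and the fragment offset
	 * within it; the page keeps a reference per outstanding fragment.
	 */
	page = page_pool_dev_alloc_frag(pool, &offset, size);
	if (!page)
		return NULL;

	/* The packet buffer starts at the fragment offset inside the page. */
	return page_address(page) + offset;
}

Release still goes through the virtnet_put_page()/page_pool_put_full_page() path introduced in patch 2/5, regardless of which allocation mode is active.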
Liang Chen
2023-May-26 05:46 UTC
[PATCH net-next 4/5] virtio_ring: Introduce DMA pre-handler
Currently, DMA operations on virtio devices' data buffers are encapsulated
within the underlying virtqueue implementation. DMA map/unmap operations
are performed for each data buffer attached to or detached from the
virtqueue, which is transparent and invisible to the higher-level virtio
device drivers. This encapsulation makes it unviable for device drivers to
introduce certain mechanisms, such as page pool, that require explicit
management of DMA map/unmap. Therefore, by inserting a pre-handler before
the generic DMA map/unmap operations, virtio device drivers get the
opportunity to participate in DMA operations.

Signed-off-by: Liang Chen <liangchen.linux at gmail.com>
---
 drivers/virtio/virtio_ring.c | 73 +++++++++++++++++++++++++++++++++---
 include/linux/virtio.h       | 18 +++++++++
 2 files changed, 85 insertions(+), 6 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index c5310eaf8b46..a99641260555 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -213,6 +213,9 @@ struct vring_virtqueue {
 	bool last_add_time_valid;
 	ktime_t last_add_time;
 #endif
+
+	/* DMA mapping Pre-handler for virtio device driver */
+	struct virtqueue_pre_dma_ops *pre_dma_ops;
 };
 
 static struct virtqueue *__vring_new_virtqueue(unsigned int index,
@@ -369,6 +372,19 @@ static dma_addr_t vring_map_one_sg(const struct vring_virtqueue *vq,
 		return (dma_addr_t)sg_phys(sg);
 	}
 
+	/* Allow virtio drivers to perform customized mapping operation, and
+	 * fallback to the generic path if it fails to handle the mapping.
+	 */
+	if (vq->pre_dma_ops && vq->pre_dma_ops->map_page) {
+		dma_addr_t addr;
+
+		addr = vq->pre_dma_ops->map_page(vring_dma_dev(vq),
+				sg_page(sg), sg->offset, sg->length,
+				direction, 0);
+		if (addr)
+			return addr;
+	}
+
 	/*
 	 * We can't use dma_map_sg, because we don't use scatterlists in
 	 * the way it expects (we don't guarantee that the scatterlist
@@ -432,6 +448,15 @@ static void vring_unmap_one_split_indirect(const struct vring_virtqueue *vq,
 
 	flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);
 
+	if (vq->pre_dma_ops && vq->pre_dma_ops->unmap_page) {
+		if (vq->pre_dma_ops->unmap_page(vring_dma_dev(vq),
+				virtio64_to_cpu(vq->vq.vdev, desc->addr),
+				virtio32_to_cpu(vq->vq.vdev, desc->len),
+				(flags & VRING_DESC_F_WRITE) ?
+				DMA_FROM_DEVICE : DMA_TO_DEVICE, 0))
+			return;
+	}
+
 	dma_unmap_page(vring_dma_dev(vq),
 		       virtio64_to_cpu(vq->vq.vdev, desc->addr),
 		       virtio32_to_cpu(vq->vq.vdev, desc->len),
@@ -456,14 +481,22 @@ static unsigned int vring_unmap_one_split(const struct vring_virtqueue *vq,
 				 extra[i].len,
 				 (flags & VRING_DESC_F_WRITE) ?
 				 DMA_FROM_DEVICE : DMA_TO_DEVICE);
-	} else {
-		dma_unmap_page(vring_dma_dev(vq),
-			       extra[i].addr,
-			       extra[i].len,
-			       (flags & VRING_DESC_F_WRITE) ?
-			       DMA_FROM_DEVICE : DMA_TO_DEVICE);
+		goto out;
+	} else if (vq->pre_dma_ops && vq->pre_dma_ops->unmap_page) {
+		if (vq->pre_dma_ops->unmap_page(vring_dma_dev(vq),
+				extra[i].addr,
+				extra[i].len,
+				(flags & VRING_DESC_F_WRITE) ?
+				DMA_FROM_DEVICE : DMA_TO_DEVICE, 0))
+			goto out;
 	}
 
+	dma_unmap_page(vring_dma_dev(vq),
+		       extra[i].addr,
+		       extra[i].len,
+		       (flags & VRING_DESC_F_WRITE) ?
+		       DMA_FROM_DEVICE : DMA_TO_DEVICE);
+out:
 	return extra[i].next;
 }
 
@@ -1206,10 +1239,19 @@ static void vring_unmap_extra_packed(const struct vring_virtqueue *vq,
 				 (flags & VRING_DESC_F_WRITE) ?
 				 DMA_FROM_DEVICE : DMA_TO_DEVICE);
 	} else {
+		if (vq->pre_dma_ops && vq->pre_dma_ops->unmap_page) {
+			if (vq->pre_dma_ops->unmap_page(vring_dma_dev(vq),
+					extra->addr,
+					extra->len,
+					(flags & VRING_DESC_F_WRITE) ?
+					DMA_FROM_DEVICE : DMA_TO_DEVICE, 0))
+				return;
+		}
 		dma_unmap_page(vring_dma_dev(vq),
 			       extra->addr, extra->len,
 			       (flags & VRING_DESC_F_WRITE) ?
 			       DMA_FROM_DEVICE : DMA_TO_DEVICE);
+
 	}
 }
 
@@ -1223,6 +1265,15 @@ static void vring_unmap_desc_packed(const struct vring_virtqueue *vq,
 
 	flags = le16_to_cpu(desc->flags);
 
+	if (vq->pre_dma_ops && vq->pre_dma_ops->unmap_page) {
+		if (vq->pre_dma_ops->unmap_page(vring_dma_dev(vq),
+				le64_to_cpu(desc->addr),
+				le32_to_cpu(desc->len),
+				(flags & VRING_DESC_F_WRITE) ?
+				DMA_FROM_DEVICE : DMA_TO_DEVICE, 0))
+			return;
+	}
+
 	dma_unmap_page(vring_dma_dev(vq),
 		       le64_to_cpu(desc->addr),
 		       le32_to_cpu(desc->len),
@@ -2052,6 +2103,7 @@ static struct virtqueue *vring_create_virtqueue_packed(
 	vq->packed_ring = true;
 	vq->dma_dev = dma_dev;
 	vq->use_dma_api = vring_use_dma_api(vdev);
+	vq->pre_dma_ops = NULL;
 
 	vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
 		!context;
@@ -2541,6 +2593,7 @@ static struct virtqueue *__vring_new_virtqueue(unsigned int index,
 #endif
 	vq->dma_dev = dma_dev;
 	vq->use_dma_api = vring_use_dma_api(vdev);
+	vq->pre_dma_ops = NULL;
 
 	vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
 		!context;
@@ -2945,4 +2998,12 @@ const struct vring *virtqueue_get_vring(const struct virtqueue *vq)
 }
 EXPORT_SYMBOL_GPL(virtqueue_get_vring);
 
+/* The virtio device driver can register its own DMA map/unmap pre-handler. */
+void virtqueue_register_pre_dma_ops(struct virtqueue *vq,
+				    struct virtqueue_pre_dma_ops *pre_dma_ops)
+{
+	to_vvq(vq)->pre_dma_ops = pre_dma_ops;
+}
+EXPORT_SYMBOL_GPL(virtqueue_register_pre_dma_ops);
+
 MODULE_LICENSE("GPL");
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index b93238db94e3..1d5755b5e03f 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -9,6 +9,7 @@
 #include <linux/device.h>
 #include <linux/mod_devicetable.h>
 #include <linux/gfp.h>
+#include <linux/dma-map-ops.h>
 
 /**
  * struct virtqueue - a queue to register buffers for sending or receiving.
@@ -203,4 +204,21 @@ void unregister_virtio_driver(struct virtio_driver *drv);
 #define module_virtio_driver(__virtio_driver) \
 	module_driver(__virtio_driver, register_virtio_driver, \
 			unregister_virtio_driver)
+/**
+ * struct virtqueue_pre_dma_ops - DMA pre-handler for virtio device driver
+ * @map_page: map a single page of memory for DMA
+ * @unmap_page: unmap a single page of memory for DMA
+ */
+struct virtqueue_pre_dma_ops {
+	dma_addr_t (*map_page)(struct device *dev, struct page *page,
+			       unsigned long offset, size_t size,
+			       enum dma_data_direction dir, unsigned long attrs);
+	bool (*unmap_page)(struct device *dev, dma_addr_t dma_handle,
+			   size_t size, enum dma_data_direction dir,
+			   unsigned long attrs);
+};
+
+void virtqueue_register_pre_dma_ops(struct virtqueue *vq,
+				    struct virtqueue_pre_dma_ops *pre_dma_ops);
+
 #endif /* _LINUX_VIRTIO_H */
-- 
2.31.1
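In short, the hook is an optional fast path: a driver supplies map_page/unmap_page callbacks and registers them per virtqueue, and returning 0 from map_page or false from unmap_page falls back to the generic dma_map_page()/dma_unmap_page() path. A driver-side sketch against the API added above (the callback bodies are stubs and the function names are illustrative):

#include <linux/dma-mapping.h>
#include <linux/virtio.h>

/* Stub callbacks: each one declines to handle the operation, so the ring
 * falls back to the generic DMA path. A real driver would return a cached
 * mapping here (as patch 5/5 does with page_pool).
 */
static dma_addr_t drv_pre_map_page(struct device *dev, struct page *page,
				   unsigned long offset, size_t size,
				   enum dma_data_direction dir,
				   unsigned long attrs)
{
	/* Returning 0 tells vring_map_one_sg() to use dma_map_page(). */
	return 0;
}

static bool drv_pre_unmap_page(struct device *dev, dma_addr_t dma_handle,
			       size_t size, enum dma_data_direction dir,
			       unsigned long attrs)
{
	/* Returning false falls back to dma_unmap_page(). */
	return false;
}

static struct virtqueue_pre_dma_ops drv_pre_dma_ops = {
	.map_page	= drv_pre_map_page,
	.unmap_page	= drv_pre_unmap_page,
};

/* Called once per virtqueue during device setup. */
static void drv_register_pre_dma(struct virtqueue *vq)
{
	virtqueue_register_pre_dma_ops(vq, &drv_pre_dma_ops);
}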
Liang Chen
2023-May-26 05:46 UTC
[PATCH net-next 5/5] virtio_net: Implement DMA pre-handler
Add a DMA pre-handler that uses the page pool to manage DMA mappings. When
an IOMMU is enabled, turning on the page_pool_dma_map module parameter to
let the page pool handle DMA mapping management gives a significant
reduction in the overhead caused by DMA mappings.

In testing environments with a single-core VM and a QEMU-emulated IOMMU,
significant performance improvements can be observed:
  Upstream codebase: 1.76 Gbits/sec
  Upstream codebase with page pool fragmentation support: 1.81 Gbits/sec
  Upstream codebase with page pool fragmentation and DMA support: 19.3 Gbits/sec

Signed-off-by: Liang Chen <liangchen.linux at gmail.com>
---
 drivers/net/virtio_net.c | 55 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index ac40b8c66c59..73cc4f9fe4fa 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -22,6 +22,7 @@
 #include <net/route.h>
 #include <net/xdp.h>
 #include <net/net_failover.h>
+#include <linux/iommu.h>
 
 static int napi_weight = NAPI_POLL_WEIGHT;
 module_param(napi_weight, int, 0444);
@@ -33,8 +34,10 @@ module_param(napi_tx, bool, 0644);
 
 static bool page_pool_enabled;
 static bool page_pool_frag;
+static bool page_pool_dma_map;
 module_param(page_pool_enabled, bool, 0400);
 module_param(page_pool_frag, bool, 0400);
+module_param(page_pool_dma_map, bool, 0400);
 
 /* FIXME: MTU in config. */
 #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
@@ -3830,6 +3833,49 @@ static void virtnet_del_vqs(struct virtnet_info *vi)
 	virtnet_free_queues(vi);
 }
 
+static dma_addr_t virtnet_pp_dma_map_page(struct device *dev, struct page *page,
+					  unsigned long offset, size_t size,
+					  enum dma_data_direction dir, unsigned long attrs)
+{
+	struct page *head_page;
+
+	if (dir != DMA_FROM_DEVICE)
+		return 0;
+
+	head_page = compound_head(page);
+	return page_pool_get_dma_addr(head_page)
+		+ (page - head_page) * PAGE_SIZE
+		+ offset;
+}
+
+static bool virtnet_pp_dma_unmap_page(struct device *dev, dma_addr_t dma_handle,
+				      size_t size, enum dma_data_direction dir,
+				      unsigned long attrs)
+{
+	phys_addr_t phys;
+
+	/* Handle only the RX direction, and sync the DMA memory only if it's not
+	 * a DMA coherent architecture.
+	 */
+	if (dir != DMA_FROM_DEVICE)
+		return false;
+
+	if (dev_is_dma_coherent(dev))
+		return true;
+
+	phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle);
+	if (WARN_ON(!phys))
+		return false;
+
+	arch_sync_dma_for_cpu(phys, size, dir);
+	return true;
+}
+
+static struct virtqueue_pre_dma_ops virtnet_pp_pre_dma_ops = {
+	.map_page = virtnet_pp_dma_map_page,
+	.unmap_page = virtnet_pp_dma_unmap_page,
+};
+
 static void virtnet_alloc_page_pool(struct receive_queue *rq)
 {
 	struct virtio_device *vdev = rq->vq->vdev;
@@ -3845,6 +3891,15 @@ static void virtnet_alloc_page_pool(struct receive_queue *rq)
 	if (page_pool_frag)
 		pp_params.flags |= PP_FLAG_PAGE_FRAG;
 
+	/* Consider using page pool DMA support only when DMA API is used. */
+	if (virtio_has_feature(vdev, VIRTIO_F_ACCESS_PLATFORM) &&
+	    page_pool_dma_map) {
+		pp_params.flags |= PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV;
+		pp_params.dma_dir = DMA_FROM_DEVICE;
+		pp_params.max_len = PAGE_SIZE << pp_params.order;
+		virtqueue_register_pre_dma_ops(rq->vq, &virtnet_pp_pre_dma_ops);
+	}
+
 	rq->page_pool = page_pool_create(&pp_params);
 	if (IS_ERR(rq->page_pool)) {
 		dev_warn(&vdev->dev, "page pool creation failed: %ld\n",
-- 
2.31.1
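The idea here is that the page pool owns the mapping: PP_FLAG_DMA_MAP makes it map each page once when the page enters the pool, so the map_page pre-handler only returns the cached address instead of calling dma_map_page() per buffer. A condensed sketch restating the calls used in the diff above (the helper names are illustrative):

#include <linux/dma-direction.h>
#include <net/page_pool.h>

/* Create a pool that maps pages for DMA itself and syncs them for the
 * device on recycle (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV).
 */
static struct page_pool *dma_mapping_pool_sketch(struct device *dma_dev,
						 unsigned int ring_size)
{
	struct page_pool_params pp_params = {
		.order		= 0,
		.pool_size	= ring_size,
		.nid		= dev_to_node(dma_dev),
		.dev		= dma_dev,
		.flags		= PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
		.dma_dir	= DMA_FROM_DEVICE,
		.max_len	= PAGE_SIZE,
	};

	return page_pool_create(&pp_params);
}

/* The map_page pre-handler then reduces to a lookup of the cached address:
 * page_pool stores the mapping on the head page, so add this page's offset
 * within the compound page plus the buffer offset.
 */
static dma_addr_t cached_dma_addr_sketch(struct page *page, unsigned long offset)
{
	struct page *head = compound_head(page);

	return page_pool_get_dma_addr(head) +
	       (page - head) * PAGE_SIZE + offset;
}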
Jason Wang
2023-May-26 06:38 UTC
[PATCH net-next 1/5] virtio_net: Fix an unsafe reference to the page chain
On Fri, May 26, 2023 at 1:46 PM Liang Chen <liangchen.linux at gmail.com> wrote:
>
> The "private" field of a buffer page is currently used in big mode to
> chain pages. But in mergeable mode that field can mean something else,
> e.g. when a page_pool page is used instead. So exclude mergeable mode to
> avoid such a problem.

If this issue happens only in the case of page_pool, it would be better
to squash it there.

Thanks

>
> Signed-off-by: Liang Chen <liangchen.linux at gmail.com>
> ---
>  drivers/net/virtio_net.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 5a7f7a76b920..c5dca0d92e64 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -497,7 +497,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
>  			return NULL;
>
>  		page = (struct page *)page->private;
> -		if (page)
> +		if (!vi->mergeable_rx_bufs && page)
>  			give_pages(rq, page);
>  		goto ok;
>  	}
> --
> 2.31.1
>
Michael S. Tsirkin
2023-May-28 06:16 UTC
[PATCH net-next 1/5] virtio_net: Fix an unsafe reference to the page chain
On Fri, May 26, 2023 at 01:46:17PM +0800, Liang Chen wrote:
> The "private" field of a buffer page is currently used in big mode to
> chain pages. But in mergeable mode that field can mean something else,
> e.g. when a page_pool page is used instead. So exclude mergeable mode to
> avoid such a problem.
>
> Signed-off-by: Liang Chen <liangchen.linux at gmail.com>

Ugh, the subject makes it look like the current code has a problem, but I
don't think so, because I don't think anything besides big packets uses
page->private. The reason the patch is needed is that follow-up patches
use page_pool. Pls adjust the commit log and subject to make all this
clear.

> ---
>  drivers/net/virtio_net.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 5a7f7a76b920..c5dca0d92e64 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -497,7 +497,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
>  			return NULL;
>
>  		page = (struct page *)page->private;
> -		if (page)
> +		if (!vi->mergeable_rx_bufs && page)

To be safe let's limit to big packets too:

	if (!vi->mergeable_rx_bufs && vi->big_packets && page)

>  			give_pages(rq, page);
>  		goto ok;
>  	}
> --
> 2.31.1
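For clarity, a sketch of the page_to_skb() cleanup path with the stricter condition suggested here, pulled out into a standalone helper; the helper and its name are illustrative and not part of any posted patch.

/* Illustrative only: follow the page->private chain (as give_pages() does)
 * strictly for big-packets buffers, never for mergeable/page_pool pages.
 */
static void virtnet_maybe_give_chained_pages(struct virtnet_info *vi,
					     struct receive_queue *rq,
					     struct page *page)
{
	page = (struct page *)page->private;
	if (!vi->mergeable_rx_bufs && vi->big_packets && page)
		give_pages(rq, page);
}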