Hi all:
This series is an update version of multiqueue virtio-net driver based on
Krishna Kumar's work to let virtio-net use multiple rx/tx queues to do the
packets reception and transmission. Please review and comments.
A protype implementation of qemu-kvm support could by found in
git://github.com/jasowang/qemu-kvm-mq.git. To start a guest with two queues, you
could specify the queues parameters to both tap and virtio-net like:
./qemu-kvm -netdev tap,queues=2,... -device virtio-net-pci,queues=2,...
then enable the multiqueue through ethtool by:
ethtool -L eth0 combined 2
Changes from V1:
Addressing Michael's comments:
- fix typos in commit log
- don't move virtnet_open()
- don't set to NULL in virtnet_free_queues()
- style & comment fixes
- conditionally set the irq affinity hint based on online cpus and queue pairs
- move the virnet_del_vqs to patch 1
- change the meaningless kzalloc() to kmalloc()
- open code the err handling
- store the name of virtqueue in send/receive queue
- avoid type cast in virtnet_find_vqs()
- fix the mem leak and freeing issue of names in virtnet_find_vqs()
- check cvq during before setting the max_queue_pairs in virtnet_probe()
- check the cvq and VIRTIO_NET_F_RFS in virtnet_set_queues()
- set the curr_queue_pairs in virtnet_set_queue()
- use the err report by virtnet_set_queue() as the return value of
  ethtool_set_channels()
Changes from RFC v7:
Addressing Rusty's comments:
- align the implementation (location of cvq) to v5.
- fix the style issue.
- use a global refill instead of per-vq one.
- check the VIRTIO_NET_F_RFS before calling virtnet_set_queues()
Addresing Michael's comments
- rename the curr_queue_pairs in virtnet_probe() to max_queue_pairs
- validate the number of queue pairs supported by the device against
  VIRTIO_NET_CTRL_RFS_VQ_PAIRS_MIN and VIRTIO_NET_CTRL_RFS_VQ_PAIRS_MAX.
- don't crash when failing to change the number of virtqueues
- don't set the affinity hint when onle single queue is used or there's
too much
  virtqueues
- add a TODO of handling cpu hotplug
- allow user to set the nubmer of queue pairs between 1 and max_queue_pairs
Changes from RFC v6:
- Align the implementation with the RFC spec update v5
- Addressing Rusty's comments:
  * split the patches
  * rename to max_queue_pairs and curr_queue_pairs
  * remove the useless status
  * fix the hibernation bug
- Addressing Ben's comments:
  * check other parameters in ethtool_set_queues
Changes from RFC v5:
- Align the implementation with the RFC spec update v4
- Switch the mode between single mode and multiqueue mode without reset
- Remove the 256 limitation of queues
- Use helpers to do the mapping between virtqueues and tx/rx queues
- Use commbined channels instead of separated rx/tx queus when do the queue
  number configuartion
- Other coding style comments from Michael
Changes from RFC v4:
- Add ability to negotiate the number of queues through control virtqueue
- Ethtool -{L|l} support and default the tx/rx queue number to 1
- Expose the API to set irq affinity instead of irq itself
Changes from RFC v3:
- Rebase to the net-next
- Let queue 2 to be the control virtqueue to obey the spec
- Prodives irq affinity
- Choose txq based on processor id
Reference:
- Virtio spec RFC: http://patchwork.ozlabs.org/patch/201303/
- V1: https://lkml.org/lkml/2012/11/27/177
- RFC V7: https://lkml.org/lkml/2012/11/27/177a
- RFC V6: https://lkml.org/lkml/2012/10/30/127
- RFC V5: http://lwn.net/Articles/505388/
- RFC V4: https://lkml.org/lkml/2012/6/25/120
- RFC V2: http://lwn.net/Articles/467283/
Perf Numbers:
Will do some basic test and post as a reply to this mail.
Jason Wang (3):
  virtio-net: separate fields of sending/receiving queue from
    virtnet_info
  virtio_net: multiqueue support
  virtio-net: support changing the number of queue pairs through
    ethtool
 drivers/net/virtio_net.c        |  726 +++++++++++++++++++++++++++++----------
 include/uapi/linux/virtio_net.h |   27 ++
 2 files changed, 567 insertions(+), 186 deletions(-)
Jason Wang
2012-Dec-05  10:37 UTC
[PATCH net-next v2 1/3] virtio-net: separate fields of sending/receiving queue from virtnet_info
To support multiqueue transmitq/receiveq, the first step is to separate queue
related structure from virtnet_info. This patch introduce send_queue and
receive_queue structure and use the pointer to them as the parameter in
functions handling sending/receiving.
Signed-off-by: Krishna Kumar <krkumar2 at in.ibm.com>
Signed-off-by: Jason Wang <jasowang at redhat.com>
---
 drivers/net/virtio_net.c |  282 ++++++++++++++++++++++++++--------------------
 1 files changed, 158 insertions(+), 124 deletions(-)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 8262232..0dcaee7 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -51,16 +51,40 @@ struct virtnet_stats {
 	u64 rx_packets;
 };
 
-struct virtnet_info {
-	struct virtio_device *vdev;
-	struct virtqueue *rvq, *svq, *cvq;
-	struct net_device *dev;
+/* Internal representation of a send virtqueue */
+struct send_queue {
+	/* Virtqueue associated with this send _queue */
+	struct virtqueue *vq;
+
+	/* TX: fragments + linear part + virtio header */
+	struct scatterlist sg[MAX_SKB_FRAGS + 2];
+};
+
+/* Internal representation of a receive virtqueue */
+struct receive_queue {
+	/* Virtqueue associated with this receive_queue */
+	struct virtqueue *vq;
+
 	struct napi_struct napi;
-	unsigned int status;
 
 	/* Number of input buffers, and max we've ever had. */
 	unsigned int num, max;
 
+	/* Chain pages by the private ptr. */
+	struct page *pages;
+
+	/* RX: fragments + linear part + virtio header */
+	struct scatterlist sg[MAX_SKB_FRAGS + 2];
+};
+
+struct virtnet_info {
+	struct virtio_device *vdev;
+	struct virtqueue *cvq;
+	struct net_device *dev;
+	struct send_queue sq;
+	struct receive_queue rq;
+	unsigned int status;
+
 	/* I like... big packets and I cannot lie! */
 	bool big_packets;
 
@@ -81,13 +105,6 @@ struct virtnet_info {
 
 	/* Lock for config space updates */
 	struct mutex config_lock;
-
-	/* Chain pages by the private ptr. */
-	struct page *pages;
-
-	/* fragments + linear part + virtio header */
-	struct scatterlist rx_sg[MAX_SKB_FRAGS + 2];
-	struct scatterlist tx_sg[MAX_SKB_FRAGS + 2];
 };
 
 struct skb_vnet_hdr {
@@ -117,22 +134,22 @@ static inline struct skb_vnet_hdr *skb_vnet_hdr(struct
sk_buff *skb)
  * private is used to chain pages for big packets, put the whole
  * most recent used list in the beginning for reuse
  */
-static void give_pages(struct virtnet_info *vi, struct page *page)
+static void give_pages(struct receive_queue *rq, struct page *page)
 {
 	struct page *end;
 
-	/* Find end of list, sew whole thing into vi->pages. */
+	/* Find end of list, sew whole thing into vi->rq.pages. */
 	for (end = page; end->private; end = (struct page *)end->private);
-	end->private = (unsigned long)vi->pages;
-	vi->pages = page;
+	end->private = (unsigned long)rq->pages;
+	rq->pages = page;
 }
 
-static struct page *get_a_page(struct virtnet_info *vi, gfp_t gfp_mask)
+static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
 {
-	struct page *p = vi->pages;
+	struct page *p = rq->pages;
 
 	if (p) {
-		vi->pages = (struct page *)p->private;
+		rq->pages = (struct page *)p->private;
 		/* clear private here, it is used to chain pages */
 		p->private = 0;
 	} else
@@ -140,12 +157,12 @@ static struct page *get_a_page(struct virtnet_info *vi,
gfp_t gfp_mask)
 	return p;
 }
 
-static void skb_xmit_done(struct virtqueue *svq)
+static void skb_xmit_done(struct virtqueue *vq)
 {
-	struct virtnet_info *vi = svq->vdev->priv;
+	struct virtnet_info *vi = vq->vdev->priv;
 
 	/* Suppress further interrupts. */
-	virtqueue_disable_cb(svq);
+	virtqueue_disable_cb(vq);
 
 	/* We were probably waiting for more output buffers. */
 	netif_wake_queue(vi->dev);
@@ -167,9 +184,10 @@ static void set_skb_frag(struct sk_buff *skb, struct page
*page,
 }
 
 /* Called from bottom half context */
-static struct sk_buff *page_to_skb(struct virtnet_info *vi,
+static struct sk_buff *page_to_skb(struct receive_queue *rq,
 				   struct page *page, unsigned int len)
 {
+	struct virtnet_info *vi = rq->vq->vdev->priv;
 	struct sk_buff *skb;
 	struct skb_vnet_hdr *hdr;
 	unsigned int copy, hdr_len, offset;
@@ -224,12 +242,12 @@ static struct sk_buff *page_to_skb(struct virtnet_info
*vi,
 	}
 
 	if (page)
-		give_pages(vi, page);
+		give_pages(rq, page);
 
 	return skb;
 }
 
-static int receive_mergeable(struct virtnet_info *vi, struct sk_buff *skb)
+static int receive_mergeable(struct receive_queue *rq, struct sk_buff *skb)
 {
 	struct skb_vnet_hdr *hdr = skb_vnet_hdr(skb);
 	struct page *page;
@@ -243,7 +261,7 @@ static int receive_mergeable(struct virtnet_info *vi, struct
sk_buff *skb)
 			skb->dev->stats.rx_length_errors++;
 			return -EINVAL;
 		}
-		page = virtqueue_get_buf(vi->rvq, &len);
+		page = virtqueue_get_buf(rq->vq, &len);
 		if (!page) {
 			pr_debug("%s: rx error: %d buffers missing\n",
 				 skb->dev->name, hdr->mhdr.num_buffers);
@@ -256,14 +274,15 @@ static int receive_mergeable(struct virtnet_info *vi,
struct sk_buff *skb)
 
 		set_skb_frag(skb, page, 0, &len);
 
-		--vi->num;
+		--rq->num;
 	}
 	return 0;
 }
 
-static void receive_buf(struct net_device *dev, void *buf, unsigned int len)
+static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
 {
-	struct virtnet_info *vi = netdev_priv(dev);
+	struct virtnet_info *vi = rq->vq->vdev->priv;
+	struct net_device *dev = vi->dev;
 	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
 	struct sk_buff *skb;
 	struct page *page;
@@ -273,7 +292,7 @@ static void receive_buf(struct net_device *dev, void *buf,
unsigned int len)
 		pr_debug("%s: short packet %i\n", dev->name, len);
 		dev->stats.rx_length_errors++;
 		if (vi->mergeable_rx_bufs || vi->big_packets)
-			give_pages(vi, buf);
+			give_pages(rq, buf);
 		else
 			dev_kfree_skb(buf);
 		return;
@@ -285,14 +304,14 @@ static void receive_buf(struct net_device *dev, void *buf,
unsigned int len)
 		skb_trim(skb, len);
 	} else {
 		page = buf;
-		skb = page_to_skb(vi, page, len);
+		skb = page_to_skb(rq, page, len);
 		if (unlikely(!skb)) {
 			dev->stats.rx_dropped++;
-			give_pages(vi, page);
+			give_pages(rq, page);
 			return;
 		}
 		if (vi->mergeable_rx_bufs)
-			if (receive_mergeable(vi, skb)) {
+			if (receive_mergeable(rq, skb)) {
 				dev_kfree_skb(skb);
 				return;
 			}
@@ -359,8 +378,9 @@ frame_err:
 	dev_kfree_skb(skb);
 }
 
-static int add_recvbuf_small(struct virtnet_info *vi, gfp_t gfp)
+static int add_recvbuf_small(struct receive_queue *rq, gfp_t gfp)
 {
+	struct virtnet_info *vi = rq->vq->vdev->priv;
 	struct sk_buff *skb;
 	struct skb_vnet_hdr *hdr;
 	int err;
@@ -372,77 +392,77 @@ static int add_recvbuf_small(struct virtnet_info *vi,
gfp_t gfp)
 	skb_put(skb, MAX_PACKET_LEN);
 
 	hdr = skb_vnet_hdr(skb);
-	sg_set_buf(vi->rx_sg, &hdr->hdr, sizeof hdr->hdr);
+	sg_set_buf(rq->sg, &hdr->hdr, sizeof hdr->hdr);
 
-	skb_to_sgvec(skb, vi->rx_sg + 1, 0, skb->len);
+	skb_to_sgvec(skb, rq->sg + 1, 0, skb->len);
 
-	err = virtqueue_add_buf(vi->rvq, vi->rx_sg, 0, 2, skb, gfp);
+	err = virtqueue_add_buf(rq->vq, rq->sg, 0, 2, skb, gfp);
 	if (err < 0)
 		dev_kfree_skb(skb);
 
 	return err;
 }
 
-static int add_recvbuf_big(struct virtnet_info *vi, gfp_t gfp)
+static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp)
 {
 	struct page *first, *list = NULL;
 	char *p;
 	int i, err, offset;
 
-	/* page in vi->rx_sg[MAX_SKB_FRAGS + 1] is list tail */
+	/* page in rq->sg[MAX_SKB_FRAGS + 1] is list tail */
 	for (i = MAX_SKB_FRAGS + 1; i > 1; --i) {
-		first = get_a_page(vi, gfp);
+		first = get_a_page(rq, gfp);
 		if (!first) {
 			if (list)
-				give_pages(vi, list);
+				give_pages(rq, list);
 			return -ENOMEM;
 		}
-		sg_set_buf(&vi->rx_sg[i], page_address(first), PAGE_SIZE);
+		sg_set_buf(&rq->sg[i], page_address(first), PAGE_SIZE);
 
 		/* chain new page in list head to match sg */
 		first->private = (unsigned long)list;
 		list = first;
 	}
 
-	first = get_a_page(vi, gfp);
+	first = get_a_page(rq, gfp);
 	if (!first) {
-		give_pages(vi, list);
+		give_pages(rq, list);
 		return -ENOMEM;
 	}
 	p = page_address(first);
 
-	/* vi->rx_sg[0], vi->rx_sg[1] share the same page */
-	/* a separated vi->rx_sg[0] for virtio_net_hdr only due to QEMU bug */
-	sg_set_buf(&vi->rx_sg[0], p, sizeof(struct virtio_net_hdr));
+	/* rq->sg[0], rq->sg[1] share the same page */
+	/* a separated rq->sg[0] for virtio_net_hdr only due to QEMU bug */
+	sg_set_buf(&rq->sg[0], p, sizeof(struct virtio_net_hdr));
 
-	/* vi->rx_sg[1] for data packet, from offset */
+	/* rq->sg[1] for data packet, from offset */
 	offset = sizeof(struct padded_vnet_hdr);
-	sg_set_buf(&vi->rx_sg[1], p + offset, PAGE_SIZE - offset);
+	sg_set_buf(&rq->sg[1], p + offset, PAGE_SIZE - offset);
 
 	/* chain first in list head */
 	first->private = (unsigned long)list;
-	err = virtqueue_add_buf(vi->rvq, vi->rx_sg, 0, MAX_SKB_FRAGS + 2,
+	err = virtqueue_add_buf(rq->vq, rq->sg, 0, MAX_SKB_FRAGS + 2,
 				first, gfp);
 	if (err < 0)
-		give_pages(vi, first);
+		give_pages(rq, first);
 
 	return err;
 }
 
-static int add_recvbuf_mergeable(struct virtnet_info *vi, gfp_t gfp)
+static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
 {
 	struct page *page;
 	int err;
 
-	page = get_a_page(vi, gfp);
+	page = get_a_page(rq, gfp);
 	if (!page)
 		return -ENOMEM;
 
-	sg_init_one(vi->rx_sg, page_address(page), PAGE_SIZE);
+	sg_init_one(rq->sg, page_address(page), PAGE_SIZE);
 
-	err = virtqueue_add_buf(vi->rvq, vi->rx_sg, 0, 1, page, gfp);
+	err = virtqueue_add_buf(rq->vq, rq->sg, 0, 1, page, gfp);
 	if (err < 0)
-		give_pages(vi, page);
+		give_pages(rq, page);
 
 	return err;
 }
@@ -454,65 +474,68 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
gfp_t gfp)
  * before we're receiving packets, or from refill_work which is
  * careful to disable receiving (using napi_disable).
  */
-static bool try_fill_recv(struct virtnet_info *vi, gfp_t gfp)
+static bool try_fill_recv(struct receive_queue *rq, gfp_t gfp)
 {
+	struct virtnet_info *vi = rq->vq->vdev->priv;
 	int err;
 	bool oom;
 
 	do {
 		if (vi->mergeable_rx_bufs)
-			err = add_recvbuf_mergeable(vi, gfp);
+			err = add_recvbuf_mergeable(rq, gfp);
 		else if (vi->big_packets)
-			err = add_recvbuf_big(vi, gfp);
+			err = add_recvbuf_big(rq, gfp);
 		else
-			err = add_recvbuf_small(vi, gfp);
+			err = add_recvbuf_small(rq, gfp);
 
 		oom = err == -ENOMEM;
 		if (err < 0)
 			break;
-		++vi->num;
+		++rq->num;
 	} while (err > 0);
-	if (unlikely(vi->num > vi->max))
-		vi->max = vi->num;
-	virtqueue_kick(vi->rvq);
+	if (unlikely(rq->num > rq->max))
+		rq->max = rq->num;
+	virtqueue_kick(rq->vq);
 	return !oom;
 }
 
 static void skb_recv_done(struct virtqueue *rvq)
 {
 	struct virtnet_info *vi = rvq->vdev->priv;
+	struct receive_queue *rq = &vi->rq;
+
 	/* Schedule NAPI, Suppress further interrupts if successful. */
-	if (napi_schedule_prep(&vi->napi)) {
+	if (napi_schedule_prep(&rq->napi)) {
 		virtqueue_disable_cb(rvq);
-		__napi_schedule(&vi->napi);
+		__napi_schedule(&rq->napi);
 	}
 }
 
-static void virtnet_napi_enable(struct virtnet_info *vi)
+static void virtnet_napi_enable(struct receive_queue *rq)
 {
-	napi_enable(&vi->napi);
+	napi_enable(&rq->napi);
 
 	/* If all buffers were filled by other side before we napi_enabled, we
 	 * won't get another interrupt, so process any outstanding packets
 	 * now.  virtnet_poll wants re-enable the queue, so we disable here.
 	 * We synchronize against interrupts via NAPI_STATE_SCHED */
-	if (napi_schedule_prep(&vi->napi)) {
-		virtqueue_disable_cb(vi->rvq);
+	if (napi_schedule_prep(&rq->napi)) {
+		virtqueue_disable_cb(rq->vq);
 		local_bh_disable();
-		__napi_schedule(&vi->napi);
+		__napi_schedule(&rq->napi);
 		local_bh_enable();
 	}
 }
 
 static void refill_work(struct work_struct *work)
 {
-	struct virtnet_info *vi;
+	struct virtnet_info *vi +		container_of(work, struct virtnet_info,
refill.work);
 	bool still_empty;
 
-	vi = container_of(work, struct virtnet_info, refill.work);
-	napi_disable(&vi->napi);
-	still_empty = !try_fill_recv(vi, GFP_KERNEL);
-	virtnet_napi_enable(vi);
+	napi_disable(&vi->rq.napi);
+	still_empty = !try_fill_recv(&vi->rq, GFP_KERNEL);
+	virtnet_napi_enable(&vi->rq);
 
 	/* In theory, this can happen: if we don't get any buffers in
 	 * we will *never* try to fill again. */
@@ -522,29 +545,31 @@ static void refill_work(struct work_struct *work)
 
 static int virtnet_poll(struct napi_struct *napi, int budget)
 {
-	struct virtnet_info *vi = container_of(napi, struct virtnet_info, napi);
+	struct receive_queue *rq +		container_of(napi, struct receive_queue, napi);
+	struct virtnet_info *vi = rq->vq->vdev->priv;
 	void *buf;
 	unsigned int len, received = 0;
 
 again:
 	while (received < budget &&
-	       (buf = virtqueue_get_buf(vi->rvq, &len)) != NULL) {
-		receive_buf(vi->dev, buf, len);
-		--vi->num;
+	       (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
+		receive_buf(rq, buf, len);
+		--rq->num;
 		received++;
 	}
 
-	if (vi->num < vi->max / 2) {
-		if (!try_fill_recv(vi, GFP_ATOMIC))
+	if (rq->num < rq->max / 2) {
+		if (!try_fill_recv(rq, GFP_ATOMIC))
 			schedule_delayed_work(&vi->refill, 0);
 	}
 
 	/* Out of packets? */
 	if (received < budget) {
 		napi_complete(napi);
-		if (unlikely(!virtqueue_enable_cb(vi->rvq)) &&
+		if (unlikely(!virtqueue_enable_cb(rq->vq)) &&
 		    napi_schedule_prep(napi)) {
-			virtqueue_disable_cb(vi->rvq);
+			virtqueue_disable_cb(rq->vq);
 			__napi_schedule(napi);
 			goto again;
 		}
@@ -553,13 +578,14 @@ again:
 	return received;
 }
 
-static unsigned int free_old_xmit_skbs(struct virtnet_info *vi)
+static unsigned int free_old_xmit_skbs(struct send_queue *sq)
 {
 	struct sk_buff *skb;
 	unsigned int len, tot_sgs = 0;
+	struct virtnet_info *vi = sq->vq->vdev->priv;
 	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
 
-	while ((skb = virtqueue_get_buf(vi->svq, &len)) != NULL) {
+	while ((skb = virtqueue_get_buf(sq->vq, &len)) != NULL) {
 		pr_debug("Sent skb %p\n", skb);
 
 		u64_stats_update_begin(&stats->tx_syncp);
@@ -573,10 +599,11 @@ static unsigned int free_old_xmit_skbs(struct virtnet_info
*vi)
 	return tot_sgs;
 }
 
-static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb)
+static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
 {
 	struct skb_vnet_hdr *hdr = skb_vnet_hdr(skb);
 	const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
+	struct virtnet_info *vi = sq->vq->vdev->priv;
 
 	pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest);
 
@@ -611,25 +638,26 @@ static int xmit_skb(struct virtnet_info *vi, struct
sk_buff *skb)
 
 	/* Encode metadata header at front. */
 	if (vi->mergeable_rx_bufs)
-		sg_set_buf(vi->tx_sg, &hdr->mhdr, sizeof hdr->mhdr);
+		sg_set_buf(sq->sg, &hdr->mhdr, sizeof hdr->mhdr);
 	else
-		sg_set_buf(vi->tx_sg, &hdr->hdr, sizeof hdr->hdr);
+		sg_set_buf(sq->sg, &hdr->hdr, sizeof hdr->hdr);
 
-	hdr->num_sg = skb_to_sgvec(skb, vi->tx_sg + 1, 0, skb->len) + 1;
-	return virtqueue_add_buf(vi->svq, vi->tx_sg, hdr->num_sg,
+	hdr->num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len) + 1;
+	return virtqueue_add_buf(sq->vq, sq->sg, hdr->num_sg,
 				 0, skb, GFP_ATOMIC);
 }
 
 static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
+	struct send_queue *sq = &vi->sq;
 	int capacity;
 
 	/* Free up any pending old buffers before queueing new ones. */
-	free_old_xmit_skbs(vi);
+	free_old_xmit_skbs(sq);
 
 	/* Try to transmit */
-	capacity = xmit_skb(vi, skb);
+	capacity = xmit_skb(sq, skb);
 
 	/* This can happen with OOM and indirect buffers. */
 	if (unlikely(capacity < 0)) {
@@ -648,7 +676,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct
net_device *dev)
 		kfree_skb(skb);
 		return NETDEV_TX_OK;
 	}
-	virtqueue_kick(vi->svq);
+	virtqueue_kick(sq->vq);
 
 	/* Don't wait up for transmitted skbs to be freed. */
 	skb_orphan(skb);
@@ -658,12 +686,12 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct
net_device *dev)
 	 * before it gets out of hand.  Naturally, this wastes entries. */
 	if (capacity < 2+MAX_SKB_FRAGS) {
 		netif_stop_queue(dev);
-		if (unlikely(!virtqueue_enable_cb_delayed(vi->svq))) {
+		if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
 			/* More just got used, free them then recheck. */
-			capacity += free_old_xmit_skbs(vi);
+			capacity += free_old_xmit_skbs(sq);
 			if (capacity >= 2+MAX_SKB_FRAGS) {
 				netif_start_queue(dev);
-				virtqueue_disable_cb(vi->svq);
+				virtqueue_disable_cb(sq->vq);
 			}
 		}
 	}
@@ -731,7 +759,7 @@ static void virtnet_netpoll(struct net_device *dev)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
 
-	napi_schedule(&vi->napi);
+	napi_schedule(&vi->rq.napi);
 }
 #endif
 
@@ -740,10 +768,10 @@ static int virtnet_open(struct net_device *dev)
 	struct virtnet_info *vi = netdev_priv(dev);
 
 	/* Make sure we have some buffers: if oom use wq. */
-	if (!try_fill_recv(vi, GFP_KERNEL))
+	if (!try_fill_recv(&vi->rq, GFP_KERNEL))
 		schedule_delayed_work(&vi->refill, 0);
 
-	virtnet_napi_enable(vi);
+	virtnet_napi_enable(&vi->rq);
 	return 0;
 }
 
@@ -808,7 +836,7 @@ static int virtnet_close(struct net_device *dev)
 
 	/* Make sure refill_work doesn't re-enable napi! */
 	cancel_delayed_work_sync(&vi->refill);
-	napi_disable(&vi->napi);
+	napi_disable(&vi->rq.napi);
 
 	return 0;
 }
@@ -920,11 +948,10 @@ static void virtnet_get_ringparam(struct net_device *dev,
 {
 	struct virtnet_info *vi = netdev_priv(dev);
 
-	ring->rx_max_pending = virtqueue_get_vring_size(vi->rvq);
-	ring->tx_max_pending = virtqueue_get_vring_size(vi->svq);
+	ring->rx_max_pending = virtqueue_get_vring_size(vi->rq.vq);
+	ring->tx_max_pending = virtqueue_get_vring_size(vi->sq.vq);
 	ring->rx_pending = ring->rx_max_pending;
 	ring->tx_pending = ring->tx_max_pending;
-
 }
 
 
@@ -1019,6 +1046,13 @@ static void virtnet_config_changed(struct virtio_device
*vdev)
 	schedule_work(&vi->config_work);
 }
 
+static void virtnet_del_vqs(struct virtnet_info *vi)
+{
+	struct virtio_device *vdev = vi->vdev;
+
+	vdev->config->del_vqs(vdev);
+}
+
 static int init_vqs(struct virtnet_info *vi)
 {
 	struct virtqueue *vqs[3];
@@ -1034,8 +1068,8 @@ static int init_vqs(struct virtnet_info *vi)
 	if (err)
 		return err;
 
-	vi->rvq = vqs[0];
-	vi->svq = vqs[1];
+	vi->rq.vq = vqs[0];
+	vi->sq.vq = vqs[1];
 
 	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)) {
 		vi->cvq = vqs[2];
@@ -1100,11 +1134,11 @@ static int virtnet_probe(struct virtio_device *vdev)
 
 	/* Set up our device-specific information */
 	vi = netdev_priv(dev);
-	netif_napi_add(dev, &vi->napi, virtnet_poll, napi_weight);
+	netif_napi_add(dev, &vi->rq.napi, virtnet_poll, napi_weight);
 	vi->dev = dev;
 	vi->vdev = vdev;
 	vdev->priv = vi;
-	vi->pages = NULL;
+	vi->rq.pages = NULL;
 	vi->stats = alloc_percpu(struct virtnet_stats);
 	err = -ENOMEM;
 	if (vi->stats == NULL)
@@ -1114,8 +1148,8 @@ static int virtnet_probe(struct virtio_device *vdev)
 	mutex_init(&vi->config_lock);
 	vi->config_enable = true;
 	INIT_WORK(&vi->config_work, virtnet_config_changed_work);
-	sg_init_table(vi->rx_sg, ARRAY_SIZE(vi->rx_sg));
-	sg_init_table(vi->tx_sg, ARRAY_SIZE(vi->tx_sg));
+	sg_init_table(vi->rq.sg, ARRAY_SIZE(vi->rq.sg));
+	sg_init_table(vi->sq.sg, ARRAY_SIZE(vi->sq.sg));
 
 	/* If we can receive ANY GSO packets, we must allocate large ones. */
 	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
@@ -1137,10 +1171,10 @@ static int virtnet_probe(struct virtio_device *vdev)
 	}
 
 	/* Last of all, set up some receive buffers. */
-	try_fill_recv(vi, GFP_KERNEL);
+	try_fill_recv(&vi->rq, GFP_KERNEL);
 
 	/* If we didn't even get one input buffer, we're useless. */
-	if (vi->num == 0) {
+	if (vi->rq.num == 0) {
 		err = -ENOMEM;
 		goto unregister;
 	}
@@ -1161,7 +1195,7 @@ static int virtnet_probe(struct virtio_device *vdev)
 unregister:
 	unregister_netdev(dev);
 free_vqs:
-	vdev->config->del_vqs(vdev);
+	virtnet_del_vqs(vi);
 free_stats:
 	free_percpu(vi->stats);
 free:
@@ -1173,22 +1207,22 @@ static void free_unused_bufs(struct virtnet_info *vi)
 {
 	void *buf;
 	while (1) {
-		buf = virtqueue_detach_unused_buf(vi->svq);
+		buf = virtqueue_detach_unused_buf(vi->sq.vq);
 		if (!buf)
 			break;
 		dev_kfree_skb(buf);
 	}
 	while (1) {
-		buf = virtqueue_detach_unused_buf(vi->rvq);
+		buf = virtqueue_detach_unused_buf(vi->rq.vq);
 		if (!buf)
 			break;
 		if (vi->mergeable_rx_bufs || vi->big_packets)
-			give_pages(vi, buf);
+			give_pages(&vi->rq, buf);
 		else
 			dev_kfree_skb(buf);
-		--vi->num;
+		--vi->rq.num;
 	}
-	BUG_ON(vi->num != 0);
+	BUG_ON(vi->rq.num != 0);
 }
 
 static void remove_vq_common(struct virtnet_info *vi)
@@ -1198,10 +1232,10 @@ static void remove_vq_common(struct virtnet_info *vi)
 	/* Free unused buffers in both send and recv, if any. */
 	free_unused_bufs(vi);
 
-	vi->vdev->config->del_vqs(vi->vdev);
+	virtnet_del_vqs(vi);
 
-	while (vi->pages)
-		__free_pages(get_a_page(vi, GFP_KERNEL), 0);
+	while (vi->rq.pages)
+		__free_pages(get_a_page(&vi->rq, GFP_KERNEL), 0);
 }
 
 static void __devexit virtnet_remove(struct virtio_device *vdev)
@@ -1237,7 +1271,7 @@ static int virtnet_freeze(struct virtio_device *vdev)
 	cancel_delayed_work_sync(&vi->refill);
 
 	if (netif_running(vi->dev))
-		napi_disable(&vi->napi);
+		napi_disable(&vi->rq.napi);
 
 	remove_vq_common(vi);
 
@@ -1256,11 +1290,11 @@ static int virtnet_restore(struct virtio_device *vdev)
 		return err;
 
 	if (netif_running(vi->dev))
-		virtnet_napi_enable(vi);
+		virtnet_napi_enable(&vi->rq);
 
 	netif_device_attach(vi->dev);
 
-	if (!try_fill_recv(vi, GFP_KERNEL))
+	if (!try_fill_recv(&vi->rq, GFP_KERNEL))
 		schedule_delayed_work(&vi->refill, 0);
 
 	mutex_lock(&vi->config_lock);
-- 
1.7.1
This patch adds the multiqueue (VIRTIO_NET_F_RFS) support to virtio_net
driver. VIRTIO_NET_F_RFS capable device could allow the driver to do packet
transmission and reception through multiple queue pairs and does the packet
steering to get better performance. By default, one one queue pair is used, user
could change the number of queue pairs by ethtool in the next patch.
When multiple queue pairs is used and the number of queue pairs is equal to the
number of vcpus. Driver does the following optimizations to implement per-cpu
virt queue pairs:
- select the txq based on the smp processor id.
- smp affinity hint to the cpu that owns the queue pairs.
This could be used with the flow steering support of the device to guarantee the
packets of a single flow is handled by the same cpu.
Signed-off-by: Krishna Kumar <krkumar2 at in.ibm.com>
Signed-off-by: Jason Wang <jasowang at redhat.com>
---
 drivers/net/virtio_net.c        |  473 +++++++++++++++++++++++++++++++--------
 include/uapi/linux/virtio_net.h |   27 +++
 2 files changed, 402 insertions(+), 98 deletions(-)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 0dcaee7..def11ce 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -58,6 +58,9 @@ struct send_queue {
 
 	/* TX: fragments + linear part + virtio header */
 	struct scatterlist sg[MAX_SKB_FRAGS + 2];
+
+	/* Name of the send queue: output.$index */
+	char name[40];
 };
 
 /* Internal representation of a receive virtqueue */
@@ -75,22 +78,34 @@ struct receive_queue {
 
 	/* RX: fragments + linear part + virtio header */
 	struct scatterlist sg[MAX_SKB_FRAGS + 2];
+
+	/* Name of this receive queue: input.$index */
+	char name[40];
 };
 
 struct virtnet_info {
 	struct virtio_device *vdev;
 	struct virtqueue *cvq;
 	struct net_device *dev;
-	struct send_queue sq;
-	struct receive_queue rq;
+	struct send_queue *sq;
+	struct receive_queue *rq;
 	unsigned int status;
 
+	/* Max # of queue pairs supported by the device */
+	u16 max_queue_pairs;
+
+	/* # of queue pairs currently used by the driver */
+	u16 curr_queue_pairs;
+
 	/* I like... big packets and I cannot lie! */
 	bool big_packets;
 
 	/* Host will merge rx buffers for big packets (shake it! shake it!) */
 	bool mergeable_rx_bufs;
 
+	/* Has control virtqueue */
+	bool has_cvq;
+
 	/* enable config space updates */
 	bool config_enable;
 
@@ -105,6 +120,9 @@ struct virtnet_info {
 
 	/* Lock for config space updates */
 	struct mutex config_lock;
+
+	/* Does the affinity hint is set for virtqueues? */
+	bool affinity_hint_set;
 };
 
 struct skb_vnet_hdr {
@@ -125,6 +143,29 @@ struct padded_vnet_hdr {
 	char padding[6];
 };
 
+/* Converting between virtqueue no. and kernel tx/rx queue no.
+ * 0:rx0 1:tx0 2:rx1 3:tx1 ... 2N:rxN 2N+1:txN 2N+2:cvq
+ */
+static int vq2txq(struct virtqueue *vq)
+{
+	return (virtqueue_get_queue_index(vq) - 1) / 2;
+}
+
+static int txq2vq(int txq)
+{
+	return txq * 2 + 1;
+}
+
+static int vq2rxq(struct virtqueue *vq)
+{
+	return virtqueue_get_queue_index(vq) / 2;
+}
+
+static int rxq2vq(int rxq)
+{
+	return rxq * 2;
+}
+
 static inline struct skb_vnet_hdr *skb_vnet_hdr(struct sk_buff *skb)
 {
 	return (struct skb_vnet_hdr *)skb->cb;
@@ -165,7 +206,7 @@ static void skb_xmit_done(struct virtqueue *vq)
 	virtqueue_disable_cb(vq);
 
 	/* We were probably waiting for more output buffers. */
-	netif_wake_queue(vi->dev);
+	netif_wake_subqueue(vi->dev, vq2txq(vq));
 }
 
 static void set_skb_frag(struct sk_buff *skb, struct page *page,
@@ -502,7 +543,7 @@ static bool try_fill_recv(struct receive_queue *rq, gfp_t
gfp)
 static void skb_recv_done(struct virtqueue *rvq)
 {
 	struct virtnet_info *vi = rvq->vdev->priv;
-	struct receive_queue *rq = &vi->rq;
+	struct receive_queue *rq = &vi->rq[vq2rxq(rvq)];
 
 	/* Schedule NAPI, Suppress further interrupts if successful. */
 	if (napi_schedule_prep(&rq->napi)) {
@@ -532,15 +573,21 @@ static void refill_work(struct work_struct *work)
 	struct virtnet_info *vi  		container_of(work, struct virtnet_info,
refill.work);
 	bool still_empty;
+	int i;
+
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		struct receive_queue *rq = &vi->rq[i];
 
-	napi_disable(&vi->rq.napi);
-	still_empty = !try_fill_recv(&vi->rq, GFP_KERNEL);
-	virtnet_napi_enable(&vi->rq);
+		napi_disable(&rq->napi);
+		still_empty = !try_fill_recv(rq, GFP_KERNEL);
+		virtnet_napi_enable(rq);
 
-	/* In theory, this can happen: if we don't get any buffers in
-	 * we will *never* try to fill again. */
-	if (still_empty)
-		schedule_delayed_work(&vi->refill, HZ/2);
+		/* In theory, this can happen: if we don't get any buffers in
+		 * we will *never* try to fill again.
+		 */
+		if (still_empty)
+			schedule_delayed_work(&vi->refill, HZ/2);
+	}
 }
 
 static int virtnet_poll(struct napi_struct *napi, int budget)
@@ -578,6 +625,21 @@ again:
 	return received;
 }
 
+static int virtnet_open(struct net_device *dev)
+{
+	struct virtnet_info *vi = netdev_priv(dev);
+	int i;
+
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		/* Make sure we have some buffers: if oom use wq. */
+		if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
+			schedule_delayed_work(&vi->refill, 0);
+		virtnet_napi_enable(&vi->rq[i]);
+	}
+
+	return 0;
+}
+
 static unsigned int free_old_xmit_skbs(struct send_queue *sq)
 {
 	struct sk_buff *skb;
@@ -650,7 +712,8 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff
*skb)
 static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
-	struct send_queue *sq = &vi->sq;
+	int qnum = skb_get_queue_mapping(skb);
+	struct send_queue *sq = &vi->sq[qnum];
 	int capacity;
 
 	/* Free up any pending old buffers before queueing new ones. */
@@ -664,13 +727,14 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct
net_device *dev)
 		if (likely(capacity == -ENOMEM)) {
 			if (net_ratelimit())
 				dev_warn(&dev->dev,
-					 "TX queue failure: out of memory\n");
+					 "TXQ (%d) failure: out of memory\n",
+					 qnum);
 		} else {
 			dev->stats.tx_fifo_errors++;
 			if (net_ratelimit())
 				dev_warn(&dev->dev,
-					 "Unexpected TX queue failure: %d\n",
-					 capacity);
+					 "Unexpected TXQ (%d) failure: %d\n",
+					 qnum, capacity);
 		}
 		dev->stats.tx_dropped++;
 		kfree_skb(skb);
@@ -685,12 +749,12 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct
net_device *dev)
 	/* Apparently nice girls don't return TX_BUSY; stop the queue
 	 * before it gets out of hand.  Naturally, this wastes entries. */
 	if (capacity < 2+MAX_SKB_FRAGS) {
-		netif_stop_queue(dev);
+		netif_stop_subqueue(dev, qnum);
 		if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
 			/* More just got used, free them then recheck. */
 			capacity += free_old_xmit_skbs(sq);
 			if (capacity >= 2+MAX_SKB_FRAGS) {
-				netif_start_queue(dev);
+				netif_start_subqueue(dev, qnum);
 				virtqueue_disable_cb(sq->vq);
 			}
 		}
@@ -758,23 +822,13 @@ static struct rtnl_link_stats64 *virtnet_stats(struct
net_device *dev,
 static void virtnet_netpoll(struct net_device *dev)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
+	int i;
 
-	napi_schedule(&vi->rq.napi);
+	for (i = 0; i < vi->curr_queue_pairs; i++)
+		napi_schedule(&vi->rq[i].napi);
 }
 #endif
 
-static int virtnet_open(struct net_device *dev)
-{
-	struct virtnet_info *vi = netdev_priv(dev);
-
-	/* Make sure we have some buffers: if oom use wq. */
-	if (!try_fill_recv(&vi->rq, GFP_KERNEL))
-		schedule_delayed_work(&vi->refill, 0);
-
-	virtnet_napi_enable(&vi->rq);
-	return 0;
-}
-
 /*
  * Send command via the control virtqueue and check status.  Commands
  * supported by the hypervisor, as indicated by feature bits, should
@@ -830,13 +884,39 @@ static void virtnet_ack_link_announce(struct virtnet_info
*vi)
 	rtnl_unlock();
 }
 
+static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
+{
+	struct scatterlist sg;
+	struct virtio_net_ctrl_rfs s;
+	struct net_device *dev = vi->dev;
+
+	if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_RFS))
+		return 0;
+
+	s.virtqueue_pairs = queue_pairs;
+	sg_init_one(&sg, &s, sizeof(s));
+
+	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RFS,
+				  VIRTIO_NET_CTRL_RFS_VQ_PAIRS_SET, &sg, 1, 0)){
+		dev_warn(&dev->dev, "Fail to set num of queue pairs to
%d\n",
+			 queue_pairs);
+		return -EINVAL;
+	} else
+		vi->curr_queue_pairs = queue_pairs;
+
+	return 0;
+}
+
 static int virtnet_close(struct net_device *dev)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
+	int i;
 
 	/* Make sure refill_work doesn't re-enable napi! */
 	cancel_delayed_work_sync(&vi->refill);
-	napi_disable(&vi->rq.napi);
+
+	for (i = 0; i < vi->max_queue_pairs; i++)
+		napi_disable(&vi->rq[i].napi);
 
 	return 0;
 }
@@ -943,13 +1023,41 @@ static int virtnet_vlan_rx_kill_vid(struct net_device
*dev, u16 vid)
 	return 0;
 }
 
+static void virtnet_set_affinity(struct virtnet_info *vi, bool set)
+{
+	int i;
+
+	/* In multiqueue mode, when the number of cpu is equal to the number of
+	 * queue pairs, we let the queue pairs to be private to one cpu by
+	 * setting the affinity hint to eliminate the contention.
+	 */
+	if ((vi->curr_queue_pairs == 1 ||
+	     vi->max_queue_pairs != num_online_cpus()) && set) {
+		if (vi->affinity_hint_set)
+			set = false;
+		else
+			return;
+	}
+
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		int cpu = set ? i : -1;
+		virtqueue_set_affinity(vi->rq[i].vq, cpu);
+		virtqueue_set_affinity(vi->sq[i].vq, cpu);
+	}
+
+	if (set)
+		vi->affinity_hint_set = true;
+	else
+		vi->affinity_hint_set = false;
+}
+
 static void virtnet_get_ringparam(struct net_device *dev,
 				struct ethtool_ringparam *ring)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
 
-	ring->rx_max_pending = virtqueue_get_vring_size(vi->rq.vq);
-	ring->tx_max_pending = virtqueue_get_vring_size(vi->sq.vq);
+	ring->rx_max_pending = virtqueue_get_vring_size(vi->rq[0].vq);
+	ring->tx_max_pending = virtqueue_get_vring_size(vi->sq[0].vq);
 	ring->rx_pending = ring->rx_max_pending;
 	ring->tx_pending = ring->tx_max_pending;
 }
@@ -984,6 +1092,21 @@ static int virtnet_change_mtu(struct net_device *dev, int
new_mtu)
 	return 0;
 }
 
+/* To avoid contending a lock hold by a vcpu who would exit to host, select the
+ * txq based on the processor id.
+ * TODO: handle cpu hotplug.
+ */
+static u16 virtnet_select_queue(struct net_device *dev, struct sk_buff *skb)
+{
+	int txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) :
+		  smp_processor_id();
+
+	while (unlikely(txq >= dev->real_num_tx_queues))
+		txq -= dev->real_num_tx_queues;
+
+	return txq;
+}
+
 static const struct net_device_ops virtnet_netdev = {
 	.ndo_open            = virtnet_open,
 	.ndo_stop   	     = virtnet_close,
@@ -995,6 +1118,7 @@ static const struct net_device_ops virtnet_netdev = {
 	.ndo_get_stats64     = virtnet_stats,
 	.ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
 	.ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
+	.ndo_select_queue     = virtnet_select_queue,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller = virtnet_netpoll,
 #endif
@@ -1030,10 +1154,10 @@ static void virtnet_config_changed_work(struct
work_struct *work)
 
 	if (vi->status & VIRTIO_NET_S_LINK_UP) {
 		netif_carrier_on(vi->dev);
-		netif_wake_queue(vi->dev);
+		netif_tx_wake_all_queues(vi->dev);
 	} else {
 		netif_carrier_off(vi->dev);
-		netif_stop_queue(vi->dev);
+		netif_tx_stop_all_queues(vi->dev);
 	}
 done:
 	mutex_unlock(&vi->config_lock);
@@ -1046,48 +1170,203 @@ static void virtnet_config_changed(struct virtio_device
*vdev)
 	schedule_work(&vi->config_work);
 }
 
+static void virtnet_free_queues(struct virtnet_info *vi)
+{
+	kfree(vi->rq);
+	kfree(vi->sq);
+}
+
+static void free_receive_bufs(struct virtnet_info *vi)
+{
+	int i;
+
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		while (vi->rq[i].pages)
+			__free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0);
+	}
+}
+
+static void free_unused_bufs(struct virtnet_info *vi)
+{
+	void *buf;
+	int i;
+
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		struct virtqueue *vq = vi->sq[i].vq;
+		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
+			dev_kfree_skb(buf);
+	}
+
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		struct virtqueue *vq = vi->rq[i].vq;
+
+		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
+			if (vi->mergeable_rx_bufs || vi->big_packets)
+				give_pages(&vi->rq[i], buf);
+			else
+				dev_kfree_skb(buf);
+			--vi->rq[i].num;
+		}
+		BUG_ON(vi->rq[i].num != 0);
+	}
+}
+
 static void virtnet_del_vqs(struct virtnet_info *vi)
 {
 	struct virtio_device *vdev = vi->vdev;
 
+	virtnet_set_affinity(vi, false);
+
 	vdev->config->del_vqs(vdev);
+
+	virtnet_free_queues(vi);
 }
 
-static int init_vqs(struct virtnet_info *vi)
+static int virtnet_find_vqs(struct virtnet_info *vi)
 {
-	struct virtqueue *vqs[3];
-	vq_callback_t *callbacks[] = { skb_recv_done, skb_xmit_done, NULL};
-	const char *names[] = { "input", "output",
"control" };
-	int nvqs, err;
-
-	/* We expect two virtqueues, receive then send,
-	 * and optionally control. */
-	nvqs = virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ) ? 3 : 2;
-
-	err = vi->vdev->config->find_vqs(vi->vdev, nvqs, vqs, callbacks,
names);
-	if (err)
-		return err;
+	vq_callback_t **callbacks;
+	struct virtqueue **vqs;
+	int ret = -ENOMEM;
+	int i, total_vqs;
+	const char **names;
+
+	/* We expect 1 RX virtqueue followed by 1 TX virtqueue, followed by
+	 * possible N-1 RX/TX queue pairs used in multiqueue mode, followed by
+	 * possible control vq.
+	 */
+	total_vqs = vi->max_queue_pairs * 2 +
+		    virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ);
+
+	/* Allocate space for find_vqs parameters */
+	vqs = kzalloc(total_vqs * sizeof(*vqs), GFP_KERNEL);
+	if (!vqs)
+		goto err_vq;
+	callbacks = kmalloc(total_vqs * sizeof(*callbacks), GFP_KERNEL);
+	if (!callbacks)
+		goto err_callback;
+	names = kmalloc(total_vqs * sizeof(*names), GFP_KERNEL);
+	if (!names)
+		goto err_names;
+
+	/* Parameters for control virtqueue, if any */
+	if (vi->has_cvq) {
+		callbacks[total_vqs - 1] = NULL;
+		names[total_vqs - 1] = "control";
+	}
 
-	vi->rq.vq = vqs[0];
-	vi->sq.vq = vqs[1];
+	/* Allocate/initialize parameters for send/receive virtqueues */
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		callbacks[rxq2vq(i)] = skb_recv_done;
+		callbacks[txq2vq(i)] = skb_xmit_done;
+		sprintf(vi->rq[i].name, "input.%d", i);
+		sprintf(vi->sq[i].name, "output.%d", i);
+		names[rxq2vq(i)] = vi->rq[i].name;
+		names[txq2vq(i)] = vi->sq[i].name;
+	}
 
-	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)) {
-		vi->cvq = vqs[2];
+	ret = vi->vdev->config->find_vqs(vi->vdev, total_vqs, vqs,
callbacks,
+					 names);
+	if (ret)
+		goto err_find;
 
+	if (vi->has_cvq) {
+		vi->cvq = vqs[total_vqs - 1];
 		if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN))
 			vi->dev->features |= NETIF_F_HW_VLAN_FILTER;
 	}
+
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		vi->rq[i].vq = vqs[rxq2vq(i)];
+		vi->sq[i].vq = vqs[txq2vq(i)];
+	}
+
+	kfree(names);
+	kfree(callbacks);
+	kfree(vqs);
+
 	return 0;
+
+err_find:
+	kfree(names);
+err_names:
+	kfree(callbacks);
+err_callback:
+	kfree(vqs);
+err_vq:
+	return ret;
+}
+
+static int virtnet_alloc_queues(struct virtnet_info *vi)
+{
+	int i;
+
+	vi->sq = kzalloc(sizeof(*vi->sq) * vi->max_queue_pairs, GFP_KERNEL);
+	if (!vi->sq)
+		goto err_sq;
+	vi->rq = kzalloc(sizeof(*vi->rq) * vi->max_queue_pairs, GFP_KERNEL);
+	if (!vi->sq)
+		goto err_rq;
+
+	INIT_DELAYED_WORK(&vi->refill, refill_work);
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		vi->rq[i].pages = NULL;
+		netif_napi_add(vi->dev, &vi->rq[i].napi, virtnet_poll,
+			       napi_weight);
+
+		sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
+		sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));
+	}
+
+	return 0;
+
+err_rq:
+	kfree(vi->sq);
+err_sq:
+	return -ENOMEM;
+}
+
+static int init_vqs(struct virtnet_info *vi)
+{
+	int ret;
+
+	/* Allocate send & receive queues */
+	ret = virtnet_alloc_queues(vi);
+	if (ret)
+		goto err;
+
+	ret = virtnet_find_vqs(vi);
+	if (ret)
+		goto err_free;
+
+	virtnet_set_affinity(vi, true);
+	return 0;
+
+err_free:
+	virtnet_free_queues(vi);
+err:
+	return ret;
 }
 
 static int virtnet_probe(struct virtio_device *vdev)
 {
-	int err;
+	int i, err;
 	struct net_device *dev;
 	struct virtnet_info *vi;
+	u16 max_queue_pairs;
+
+	/* Find if host supports multiqueue virtio_net device */
+	err = virtio_config_val(vdev, VIRTIO_NET_F_RFS,
+				offsetof(struct virtio_net_config,
+				max_virtqueue_pairs), &max_queue_pairs);
+
+	/* We need at least 2 queue's */
+	if (err || max_queue_pairs < VIRTIO_NET_CTRL_RFS_VQ_PAIRS_MIN ||
+	    max_queue_pairs > VIRTIO_NET_CTRL_RFS_VQ_PAIRS_MAX ||
+	    !virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
+		max_queue_pairs = 1;
 
 	/* Allocate ourselves a network device with room for our info */
-	dev = alloc_etherdev(sizeof(struct virtnet_info));
+	dev = alloc_etherdev_mq(sizeof(struct virtnet_info), max_queue_pairs);
 	if (!dev)
 		return -ENOMEM;
 
@@ -1134,22 +1413,17 @@ static int virtnet_probe(struct virtio_device *vdev)
 
 	/* Set up our device-specific information */
 	vi = netdev_priv(dev);
-	netif_napi_add(dev, &vi->rq.napi, virtnet_poll, napi_weight);
 	vi->dev = dev;
 	vi->vdev = vdev;
 	vdev->priv = vi;
-	vi->rq.pages = NULL;
 	vi->stats = alloc_percpu(struct virtnet_stats);
 	err = -ENOMEM;
 	if (vi->stats == NULL)
 		goto free;
 
-	INIT_DELAYED_WORK(&vi->refill, refill_work);
 	mutex_init(&vi->config_lock);
 	vi->config_enable = true;
 	INIT_WORK(&vi->config_work, virtnet_config_changed_work);
-	sg_init_table(vi->rq.sg, ARRAY_SIZE(vi->rq.sg));
-	sg_init_table(vi->sq.sg, ARRAY_SIZE(vi->sq.sg));
 
 	/* If we can receive ANY GSO packets, we must allocate large ones. */
 	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
@@ -1160,10 +1434,21 @@ static int virtnet_probe(struct virtio_device *vdev)
 	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
 		vi->mergeable_rx_bufs = true;
 
+	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
+		vi->has_cvq = true;
+
+	/* Use single tx/rx queue pair as default */
+	vi->curr_queue_pairs = 1;
+	vi->max_queue_pairs = max_queue_pairs;
+
+	/* Allocate/initialize the rx/tx queues, and invoke find_vqs */
 	err = init_vqs(vi);
 	if (err)
 		goto free_stats;
 
+	netif_set_real_num_tx_queues(dev, 1);
+	netif_set_real_num_rx_queues(dev, 1);
+
 	err = register_netdev(dev);
 	if (err) {
 		pr_debug("virtio_net: registering device failed\n");
@@ -1171,12 +1456,15 @@ static int virtnet_probe(struct virtio_device *vdev)
 	}
 
 	/* Last of all, set up some receive buffers. */
-	try_fill_recv(&vi->rq, GFP_KERNEL);
-
-	/* If we didn't even get one input buffer, we're useless. */
-	if (vi->rq.num == 0) {
-		err = -ENOMEM;
-		goto unregister;
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		try_fill_recv(&vi->rq[i], GFP_KERNEL);
+
+		/* If we didn't even get one input buffer, we're useless. */
+		if (vi->rq[i].num == 0) {
+			free_unused_bufs(vi);
+			err = -ENOMEM;
+			goto free_recv_bufs;
+		}
 	}
 
 	/* Assume link up if device can't report link status,
@@ -1189,12 +1477,16 @@ static int virtnet_probe(struct virtio_device *vdev)
 		netif_carrier_on(dev);
 	}
 
-	pr_debug("virtnet: registered device %s\n", dev->name);
+	pr_debug("virtnet: registered device %s with %d RX and TX
vq's\n",
+		 dev->name, max_queue_pairs);
+
 	return 0;
 
-unregister:
+free_recv_bufs:
+	free_receive_bufs(vi);
 	unregister_netdev(dev);
 free_vqs:
+	cancel_delayed_work_sync(&vi->refill);
 	virtnet_del_vqs(vi);
 free_stats:
 	free_percpu(vi->stats);
@@ -1203,28 +1495,6 @@ free:
 	return err;
 }
 
-static void free_unused_bufs(struct virtnet_info *vi)
-{
-	void *buf;
-	while (1) {
-		buf = virtqueue_detach_unused_buf(vi->sq.vq);
-		if (!buf)
-			break;
-		dev_kfree_skb(buf);
-	}
-	while (1) {
-		buf = virtqueue_detach_unused_buf(vi->rq.vq);
-		if (!buf)
-			break;
-		if (vi->mergeable_rx_bufs || vi->big_packets)
-			give_pages(&vi->rq, buf);
-		else
-			dev_kfree_skb(buf);
-		--vi->rq.num;
-	}
-	BUG_ON(vi->rq.num != 0);
-}
-
 static void remove_vq_common(struct virtnet_info *vi)
 {
 	vi->vdev->config->reset(vi->vdev);
@@ -1232,10 +1502,9 @@ static void remove_vq_common(struct virtnet_info *vi)
 	/* Free unused buffers in both send and recv, if any. */
 	free_unused_bufs(vi);
 
-	virtnet_del_vqs(vi);
+	free_receive_bufs(vi);
 
-	while (vi->rq.pages)
-		__free_pages(get_a_page(&vi->rq, GFP_KERNEL), 0);
+	virtnet_del_vqs(vi);
 }
 
 static void __devexit virtnet_remove(struct virtio_device *vdev)
@@ -1261,6 +1530,7 @@ static void __devexit virtnet_remove(struct virtio_device
*vdev)
 static int virtnet_freeze(struct virtio_device *vdev)
 {
 	struct virtnet_info *vi = vdev->priv;
+	int i;
 
 	/* Prevent config work handler from accessing the device */
 	mutex_lock(&vi->config_lock);
@@ -1271,7 +1541,10 @@ static int virtnet_freeze(struct virtio_device *vdev)
 	cancel_delayed_work_sync(&vi->refill);
 
 	if (netif_running(vi->dev))
-		napi_disable(&vi->rq.napi);
+		for (i = 0; i < vi->max_queue_pairs; i++) {
+			napi_disable(&vi->rq[i].napi);
+			netif_napi_del(&vi->rq[i].napi);
+		}
 
 	remove_vq_common(vi);
 
@@ -1283,24 +1556,28 @@ static int virtnet_freeze(struct virtio_device *vdev)
 static int virtnet_restore(struct virtio_device *vdev)
 {
 	struct virtnet_info *vi = vdev->priv;
-	int err;
+	int err, i;
 
 	err = init_vqs(vi);
 	if (err)
 		return err;
 
 	if (netif_running(vi->dev))
-		virtnet_napi_enable(&vi->rq);
+		for (i = 0; i < vi->max_queue_pairs; i++)
+			virtnet_napi_enable(&vi->rq[i]);
 
 	netif_device_attach(vi->dev);
 
-	if (!try_fill_recv(&vi->rq, GFP_KERNEL))
-		schedule_delayed_work(&vi->refill, 0);
+	for (i = 0; i < vi->max_queue_pairs; i++)
+		if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
+			schedule_delayed_work(&vi->refill, 0);
 
 	mutex_lock(&vi->config_lock);
 	vi->config_enable = true;
 	mutex_unlock(&vi->config_lock);
 
+	virtnet_set_queues(vi, vi->curr_queue_pairs);
+
 	return 0;
 }
 #endif
@@ -1318,7 +1595,7 @@ static unsigned int features[] = {
 	VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO,
 	VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ,
 	VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN,
-	VIRTIO_NET_F_GUEST_ANNOUNCE,
+	VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_RFS,
 };
 
 static struct virtio_driver virtio_net_driver = {
diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h
index 2470f54..7f45f3a 100644
--- a/include/uapi/linux/virtio_net.h
+++ b/include/uapi/linux/virtio_net.h
@@ -51,6 +51,8 @@
 #define VIRTIO_NET_F_CTRL_RX_EXTRA 20	/* Extra RX mode control support */
 #define VIRTIO_NET_F_GUEST_ANNOUNCE 21	/* Guest can announce device on the
 					 * network */
+#define VIRTIO_NET_F_RFS	22	/* Device supports Receive Flow
+					 * Steering */
 
 #define VIRTIO_NET_S_LINK_UP	1	/* Link is up */
 #define VIRTIO_NET_S_ANNOUNCE	2	/* Announcement is needed */
@@ -60,6 +62,11 @@ struct virtio_net_config {
 	__u8 mac[6];
 	/* See VIRTIO_NET_F_STATUS and VIRTIO_NET_S_* above */
 	__u16 status;
+	/* Maximum number of each of transmit and receive queues;
+	 * see VIRTIO_NET_F_RFS and VIRTIO_NET_CTRL_RFS.
+	 * Legal values are between 1 and 0x8000
+	 */
+	__u16 max_virtqueue_pairs;
 } __attribute__((packed));
 
 /* This is the first element of the scatter-gather list.  If you don't
@@ -166,4 +173,24 @@ struct virtio_net_ctrl_mac {
 #define VIRTIO_NET_CTRL_ANNOUNCE       3
  #define VIRTIO_NET_CTRL_ANNOUNCE_ACK         0
 
+/*
+ * Control Receive Flow Steering
+ *
+ * The command VIRTIO_NET_CTRL_RFS_VQ_PAIRS_SET
+ * enables Receive Flow Steering, specifying the number of the transmit and
+ * receive queues that will be used. After the command is consumed and acked by
+ * the device, the device will not steer new packets on receive virtqueues
+ * other than specified nor read from transmit virtqueues other than specified.
+ * Accordingly, driver should not transmit new packets  on virtqueues other
than
+ * specified.
+ */
+struct virtio_net_ctrl_rfs {
+	u16 virtqueue_pairs;
+};
+
+#define VIRTIO_NET_CTRL_RFS   4
+ #define VIRTIO_NET_CTRL_RFS_VQ_PAIRS_SET        0
+ #define VIRTIO_NET_CTRL_RFS_VQ_PAIRS_MIN        1
+ #define VIRTIO_NET_CTRL_RFS_VQ_PAIRS_MAX        0x8000
+
 #endif /* _LINUX_VIRTIO_NET_H */
-- 
1.7.1
Jason Wang
2012-Dec-05  10:37 UTC
[PATCH net-next v2 3/3] virtio-net: support changing the number of queue pairs through ethtool
This patch implements the ethtool_{set|get}_channels method of virtio-net to
allow user to change the number of queues when the device is running on demand.
Signed-off-by: Jason Wang <jasowang at redhat.com>
---
 drivers/net/virtio_net.c |   43 +++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 43 insertions(+), 0 deletions(-)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index def11ce..51c9b75 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1075,10 +1075,53 @@ static void virtnet_get_drvinfo(struct net_device *dev,
 
 }
 
+/* TODO: Eliminate OOO packets during switching */
+static int virtnet_set_channels(struct net_device *dev,
+				struct ethtool_channels *channels)
+{
+	struct virtnet_info *vi = netdev_priv(dev);
+	u16 queue_pairs = channels->combined_count;
+	int err;
+
+	/* We don't support separate rx/tx channels.
+	 * We don't allow setting 'other' channels.
+	 */
+	if (channels->rx_count || channels->tx_count ||
channels->other_count)
+		return -EINVAL;
+
+	if (queue_pairs > vi->max_queue_pairs)
+		return -EINVAL;
+
+	err = virtnet_set_queues(vi, queue_pairs);
+	if (!err) {
+		netif_set_real_num_tx_queues(dev, queue_pairs);
+		netif_set_real_num_rx_queues(dev, queue_pairs);
+
+		virtnet_set_affinity(vi, true);
+	}
+
+	return err;
+}
+
+static void virtnet_get_channels(struct net_device *dev,
+				 struct ethtool_channels *channels)
+{
+	struct virtnet_info *vi = netdev_priv(dev);
+
+	channels->combined_count = vi->curr_queue_pairs;
+	channels->max_combined = vi->max_queue_pairs;
+	channels->max_other = 0;
+	channels->rx_count = 0;
+	channels->tx_count = 0;
+	channels->other_count = 0;
+}
+
 static const struct ethtool_ops virtnet_ethtool_ops = {
 	.get_drvinfo = virtnet_get_drvinfo,
 	.get_link = ethtool_op_get_link,
 	.get_ringparam = virtnet_get_ringparam,
+	.set_channels = virtnet_set_channels,
+	.get_channels = virtnet_get_channels,
 };
 
 #define MIN_MTU 68
-- 
1.7.1