This patchset implements a bio-based IO path for virtio-blk to improve performance. Fio tests show that it gives 28%, 24%, 21%, and 16% IOPS boosts and 32%, 17%, 21%, and 16% latency improvements for sequential read, sequential write, random read, and random write respectively. Asias He (3): block: Introduce __blk_segment_map_sg() helper block: Add blk_bio_map_sg() helper virtio-blk: Add bio-based IO path for virtio-blk block/blk-merge.c | 117 +++++++++++++++++-------- drivers/block/virtio_blk.c | 203 +++++++++++++++++++++++++++++++++++--------- include/linux/blkdev.h | 2 + 3 files changed, 247 insertions(+), 75 deletions(-) -- 1.7.10.2
Split the mapping code in blk_rq_map_sg() into a helper,
__blk_segment_map_sg(), so that other mapping functions, e.g.
blk_bio_map_sg(), can share the code.
Cc: Jens Axboe <axboe at kernel.dk>
Cc: Tejun Heo <tj at kernel.org>
Cc: Shaohua Li <shli at kernel.org>
Cc: linux-kernel at vger.kernel.org
Suggested-by: Tejun Heo <tj at kernel.org>
Suggested-by: Jens Axboe <axboe at kernel.dk>
Signed-off-by: Asias He <asias at redhat.com>
---
 block/blk-merge.c |   80 ++++++++++++++++++++++++++++++-----------------------
 1 file changed, 45 insertions(+), 35 deletions(-)
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 160035f..576b68e 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -110,6 +110,49 @@ static int blk_phys_contig_segment(struct request_queue *q,
struct bio *bio,
 	return 0;
 }
 
+static void
+__blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
+		     struct scatterlist *sglist, struct bio_vec **bvprv,
+		     struct scatterlist **sg, int *nsegs, int *cluster)
+{
+
+	int nbytes = bvec->bv_len;
+
+	if (*bvprv && *cluster) {
+		if ((*sg)->length + nbytes > queue_max_segment_size(q))
+			goto new_segment;
+
+		if (!BIOVEC_PHYS_MERGEABLE(*bvprv, bvec))
+			goto new_segment;
+		if (!BIOVEC_SEG_BOUNDARY(q, *bvprv, bvec))
+			goto new_segment;
+
+		(*sg)->length += nbytes;
+	} else {
+new_segment:
+		if (!*sg)
+			*sg = sglist;
+		else {
+			/*
+			 * If the driver previously mapped a shorter
+			 * list, we could see a termination bit
+			 * prematurely unless it fully inits the sg
+			 * table on each mapping. We KNOW that there
+			 * must be more entries here or the driver
+			 * would be buggy, so force clear the
+			 * termination bit to avoid doing a full
+			 * sg_init_table() in drivers for each command.
+			 */
+			(*sg)->page_link &= ~0x02;
+			*sg = sg_next(*sg);
+		}
+
+		sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset);
+		(*nsegs)++;
+	}
+	*bvprv = bvec;
+}
+
 /*
  * map a request to scatterlist, return number of sg entries setup. Caller
  * must make sure sg can hold rq->nr_phys_segments entries
@@ -131,41 +174,8 @@ int blk_rq_map_sg(struct request_queue *q, struct request
*rq,
 	bvprv = NULL;
 	sg = NULL;
 	rq_for_each_segment(bvec, rq, iter) {
-		int nbytes = bvec->bv_len;
-
-		if (bvprv && cluster) {
-			if (sg->length + nbytes > queue_max_segment_size(q))
-				goto new_segment;
-
-			if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec))
-				goto new_segment;
-			if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec))
-				goto new_segment;
-
-			sg->length += nbytes;
-		} else {
-new_segment:
-			if (!sg)
-				sg = sglist;
-			else {
-				/*
-				 * If the driver previously mapped a shorter
-				 * list, we could see a termination bit
-				 * prematurely unless it fully inits the sg
-				 * table on each mapping. We KNOW that there
-				 * must be more entries here or the driver
-				 * would be buggy, so force clear the
-				 * termination bit to avoid doing a full
-				 * sg_init_table() in drivers for each command.
-				 */
-				sg->page_link &= ~0x02;
-				sg = sg_next(sg);
-			}
-
-			sg_set_page(sg, bvec->bv_page, nbytes, bvec->bv_offset);
-			nsegs++;
-		}
-		bvprv = bvec;
+		__blk_segment_map_sg(q, bvec, sglist, &bvprv, &sg,
+				     &nsegs, &cluster);
 	} /* segments in rq */
 
 
-- 
1.7.10.2
Add a helper to map a bio to a scatterlist, modelled after
blk_rq_map_sg.
This helper is useful for any driver that wants to create
a scatterlist from its ->make_request_fn method.
Changes in v2:
 - Use __blk_segment_map_sg to avoid duplicated code
 - Add docbook style function comment
Cc: Jens Axboe <axboe at kernel.dk>
Cc: Tejun Heo <tj at kernel.org>
Cc: Shaohua Li <shli at kernel.org>
Cc: linux-kernel at vger.kernel.org
Signed-off-by: Christoph Hellwig <hch at lst.de>
Signed-off-by: Minchan Kim <minchan.kim at gmail.com>
Signed-off-by: Asias He <asias at redhat.com>
---
 block/blk-merge.c      |   37 +++++++++++++++++++++++++++++++++++++
 include/linux/blkdev.h |    2 ++
 2 files changed, 39 insertions(+)
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 576b68e..e76279e 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -209,6 +209,43 @@ int blk_rq_map_sg(struct request_queue *q, struct request
*rq,
 }
 EXPORT_SYMBOL(blk_rq_map_sg);
 
+/**
+ * blk_bio_map_sg - map a bio to a scatterlist
+ * @q: request_queue in question
+ * @bio: bio being mapped
+ * @sglist: scatterlist being mapped
+ *
+ * Note:
+ *    Caller must make sure sg can hold bio->bi_phys_segments entries
+ *
+ * Will return the number of sg entries setup
+ */
+int blk_bio_map_sg(struct request_queue *q, struct bio *bio,
+		   struct scatterlist *sglist)
+{
+	struct bio_vec *bvec, *bvprv;
+	struct scatterlist *sg;
+	int nsegs, cluster;
+	unsigned long i;
+
+	nsegs = 0;
+	cluster = blk_queue_cluster(q);
+
+	bvprv = NULL;
+	sg = NULL;
+	bio_for_each_segment(bvec, bio, i) {
+		__blk_segment_map_sg(q, bvec, sglist, &bvprv, &sg,
+				     &nsegs, &cluster);
+	} /* segments in bio */
+
+	if (sg)
+		sg_mark_end(sg);
+
+	BUG_ON(bio->bi_phys_segments && nsegs >
bio->bi_phys_segments);
+	return nsegs;
+}
+EXPORT_SYMBOL(blk_bio_map_sg);
+
 static inline int ll_new_hw_segment(struct request_queue *q,
 				    struct request *req,
 				    struct bio *bio)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ba43f40..46916af 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -884,6 +884,8 @@ extern void blk_queue_flush_queueable(struct request_queue
*q, bool queueable);
 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device
*bdev);
 
 extern int blk_rq_map_sg(struct request_queue *, struct request *, struct
scatterlist *);
+extern int blk_bio_map_sg(struct request_queue *q, struct bio *bio,
+			  struct scatterlist *sglist);
 extern void blk_dump_rq_flags(struct request *, char *);
 extern long nr_blockdev_pages(void);
 
-- 
1.7.10.2
Asias He
2012-Jun-18  06:53 UTC
[PATCH 3/3] virtio-blk: Add bio-based IO path for virtio-blk
This patch introduces a bio-based IO path for virtio-blk.
Compared to the request-based IO path, the bio-based IO path uses a
driver-provided ->make_request_fn() method to bypass the IO scheduler. It
hands the bio to the device directly without allocating a request in the block
layer. This shortens the IO path in the guest kernel to achieve higher IOPS
and lower latency. The downside is that the guest cannot use the IO
scheduler to merge and sort requests. However, this is not a big problem
if the backend disk on the host side is a fast disk device.
When the bio-based IO path is not enabled, virtio-blk still uses the
original request-based IO path, and no performance difference is observed.
Performance evaluation:
-----------------------------
The fio test is performed in an 8-vcpu guest with ramdisk based guest using
kvm tool.
Short version:
 With bio-based IO path, sequential read/write, random read/write
 IOPS boost         : 28%, 24%, 21%, 16%
 Latency improvement: 32%, 17%, 21%, 16%
Long version:
 With bio-based IO path:
  seq-read  : io=2048.0MB, bw=116996KB/s, iops=233991 , runt= 17925msec
  seq-write : io=2048.0MB, bw=100829KB/s, iops=201658 , runt= 20799msec
  rand-read : io=3095.7MB, bw=112134KB/s, iops=224268 , runt= 28269msec
  rand-write: io=3095.7MB, bw=96198KB/s,  iops=192396 , runt= 32952msec
    clat (usec): min=0 , max=2631.6K, avg=58716.99, stdev=191377.30
    clat (usec): min=0 , max=1753.2K, avg=66423.25, stdev=81774.35
    clat (usec): min=0 , max=2915.5K, avg=61685.70, stdev=120598.39
    clat (usec): min=0 , max=1933.4K, avg=76935.12, stdev=96603.45
  cpu : usr=74.08%, sys=703.84%, ctx=29661403, majf=21354, minf=22460954
  cpu : usr=70.92%, sys=702.81%, ctx=77219828, majf=13980, minf=27713137
  cpu : usr=72.23%, sys=695.37%, ctx=88081059, majf=18475, minf=28177648
  cpu : usr=69.69%, sys=654.13%, ctx=145476035, majf=15867, minf=26176375
 With request-based IO path:
  seq-read  : io=2048.0MB, bw=91074KB/s, iops=182147 , runt= 23027msec
  seq-write : io=2048.0MB, bw=80725KB/s, iops=161449 , runt= 25979msec
  rand-read : io=3095.7MB, bw=92106KB/s, iops=184211 , runt= 34416msec
  rand-write: io=3095.7MB, bw=82815KB/s, iops=165630 , runt= 38277msec
    clat (usec): min=0 , max=1932.4K, avg=77824.17, stdev=170339.49
    clat (usec): min=0 , max=2510.2K, avg=78023.96, stdev=146949.15
    clat (usec): min=0 , max=3037.2K, avg=74746.53, stdev=128498.27
    clat (usec): min=0 , max=1363.4K, avg=89830.75, stdev=114279.68
  cpu : usr=53.28%, sys=724.19%, ctx=37988895, majf=17531, minf=23577622
  cpu : usr=49.03%, sys=633.20%, ctx=205935380, majf=18197, minf=27288959
  cpu : usr=55.78%, sys=722.40%, ctx=101525058, majf=19273, minf=28067082
  cpu : usr=56.55%, sys=690.83%, ctx=228205022, majf=18039, minf=26551985
How to use:
-----------------------------
Add 'virtio_blk.use_bio=1' to kernel cmdline or 'modprobe virtio_blk
use_bio=1' to enable ->make_request_fn() based I/O path.
Cc: Rusty Russell <rusty at rustcorp.com.au>
Cc: "Michael S. Tsirkin" <mst at redhat.com>
Cc: virtualization at lists.linux-foundation.org
Cc: kvm at vger.kernel.org
Cc: linux-kernel at vger.kernel.org
Signed-off-by: Christoph Hellwig <hch at lst.de>
Signed-off-by: Minchan Kim <minchan.kim at gmail.com>
Signed-off-by: Asias He <asias at redhat.com>
---
 drivers/block/virtio_blk.c |  203 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 163 insertions(+), 40 deletions(-)
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 774c31d..b2d5002 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -14,6 +14,9 @@
 
 #define PART_BITS 4
 
+static bool use_bio;
+module_param(use_bio, bool, S_IRUGO);
+
 static int major;
 static DEFINE_IDA(vd_index_ida);
 
@@ -23,6 +26,7 @@ struct virtio_blk
 {
 	struct virtio_device *vdev;
 	struct virtqueue *vq;
+	wait_queue_head_t queue_wait;
 
 	/* The disk structure for the kernel. */
 	struct gendisk *disk;
@@ -51,53 +55,87 @@ struct virtio_blk
 struct virtblk_req
 {
 	struct request *req;
+	struct bio *bio;
 	struct virtio_blk_outhdr out_hdr;
 	struct virtio_scsi_inhdr in_hdr;
 	u8 status;
+	struct scatterlist sg[];
 };
 
-static void blk_done(struct virtqueue *vq)
+static inline int virtblk_result(struct virtblk_req *vbr)
+{
+	switch (vbr->status) {
+	case VIRTIO_BLK_S_OK:
+		return 0;
+	case VIRTIO_BLK_S_UNSUPP:
+		return -ENOTTY;
+	default:
+		return -EIO;
+	}
+}
+
+static inline void virtblk_request_done(struct virtio_blk *vblk,
+					struct virtblk_req *vbr)
+{
+	struct request *req = vbr->req;
+	int error = virtblk_result(vbr);
+
+	if (req->cmd_type == REQ_TYPE_BLOCK_PC) {
+		req->resid_len = vbr->in_hdr.residual;
+		req->sense_len = vbr->in_hdr.sense_len;
+		req->errors = vbr->in_hdr.errors;
+	} else if (req->cmd_type == REQ_TYPE_SPECIAL) {
+		req->errors = (error != 0);
+	}
+
+	__blk_end_request_all(req, error);
+	mempool_free(vbr, vblk->pool);
+}
+
+static inline void virtblk_bio_done(struct virtio_blk *vblk,
+				    struct virtblk_req *vbr)
+{
+	bio_endio(vbr->bio, virtblk_result(vbr));
+	mempool_free(vbr, vblk->pool);
+}
+
+static void virtblk_done(struct virtqueue *vq)
 {
 	struct virtio_blk *vblk = vq->vdev->priv;
+	unsigned long bio_done = 0, req_done = 0;
 	struct virtblk_req *vbr;
-	unsigned int len;
 	unsigned long flags;
+	unsigned int len;
 
 	spin_lock_irqsave(vblk->disk->queue->queue_lock, flags);
 	while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) {
-		int error;
-
-		switch (vbr->status) {
-		case VIRTIO_BLK_S_OK:
-			error = 0;
-			break;
-		case VIRTIO_BLK_S_UNSUPP:
-			error = -ENOTTY;
-			break;
-		default:
-			error = -EIO;
-			break;
-		}
-
-		switch (vbr->req->cmd_type) {
-		case REQ_TYPE_BLOCK_PC:
-			vbr->req->resid_len = vbr->in_hdr.residual;
-			vbr->req->sense_len = vbr->in_hdr.sense_len;
-			vbr->req->errors = vbr->in_hdr.errors;
-			break;
-		case REQ_TYPE_SPECIAL:
-			vbr->req->errors = (error != 0);
-			break;
-		default:
-			break;
+		if (vbr->bio) {
+			virtblk_bio_done(vblk, vbr);
+			bio_done++;
+		} else {
+			virtblk_request_done(vblk, vbr);
+			req_done++;
 		}
-
-		__blk_end_request_all(vbr->req, error);
-		mempool_free(vbr, vblk->pool);
 	}
 	/* In case queue is stopped waiting for more buffers. */
-	blk_start_queue(vblk->disk->queue);
+	if (req_done)
+		blk_start_queue(vblk->disk->queue);
 	spin_unlock_irqrestore(vblk->disk->queue->queue_lock, flags);
+
+	if (bio_done)
+		wake_up(&vblk->queue_wait);
+}
+
+static inline struct virtblk_req *virtblk_alloc_req(struct virtio_blk *vblk,
+						    gfp_t gfp_mask)
+{
+	struct virtblk_req *vbr;
+
+	vbr = mempool_alloc(vblk->pool, gfp_mask);
+	if (vbr && use_bio)
+		sg_init_table(vbr->sg, vblk->sg_elems);
+
+	return vbr;
 }
 
 static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
@@ -106,13 +144,13 @@ static bool do_req(struct request_queue *q, struct
virtio_blk *vblk,
 	unsigned long num, out = 0, in = 0;
 	struct virtblk_req *vbr;
 
-	vbr = mempool_alloc(vblk->pool, GFP_ATOMIC);
+	vbr = virtblk_alloc_req(vblk, GFP_ATOMIC);
 	if (!vbr)
 		/* When another request finishes we'll try again. */
 		return false;
 
 	vbr->req = req;
-
+	vbr->bio = NULL;
 	if (req->cmd_flags & REQ_FLUSH) {
 		vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH;
 		vbr->out_hdr.sector = 0;
@@ -172,7 +210,8 @@ static bool do_req(struct request_queue *q, struct
virtio_blk *vblk,
 		}
 	}
 
-	if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr,
GFP_ATOMIC)<0) {
+	if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr,
+			      GFP_ATOMIC) < 0) {
 		mempool_free(vbr, vblk->pool);
 		return false;
 	}
@@ -180,7 +219,7 @@ static bool do_req(struct request_queue *q, struct
virtio_blk *vblk,
 	return true;
 }
 
-static void do_virtblk_request(struct request_queue *q)
+static void virtblk_request(struct request_queue *q)
 {
 	struct virtio_blk *vblk = q->queuedata;
 	struct request *req;
@@ -203,6 +242,82 @@ static void do_virtblk_request(struct request_queue *q)
 		virtqueue_kick(vblk->vq);
 }
 
+static void virtblk_add_buf_wait(struct virtio_blk *vblk,
+				 struct virtblk_req *vbr,
+				 unsigned long out,
+				 unsigned long in)
+{
+	DEFINE_WAIT(wait);
+
+	for (;;) {
+		prepare_to_wait_exclusive(&vblk->queue_wait, &wait,
+					  TASK_UNINTERRUPTIBLE);
+
+		spin_lock_irq(vblk->disk->queue->queue_lock);
+		if (virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr,
+				      GFP_ATOMIC) < 0) {
+			spin_unlock_irq(vblk->disk->queue->queue_lock);
+			io_schedule();
+		} else {
+			virtqueue_kick(vblk->vq);
+			spin_unlock_irq(vblk->disk->queue->queue_lock);
+			break;
+		}
+
+	}
+
+	finish_wait(&vblk->queue_wait, &wait);
+}
+
+static void virtblk_make_request(struct request_queue *q, struct bio *bio)
+{
+	struct virtio_blk *vblk = q->queuedata;
+	unsigned int num, out = 0, in = 0;
+	struct virtblk_req *vbr;
+
+	BUG_ON(bio->bi_phys_segments + 2 > vblk->sg_elems);
+	BUG_ON(bio->bi_rw & (REQ_FLUSH | REQ_FUA));
+
+	vbr = virtblk_alloc_req(vblk, GFP_NOIO);
+	if (!vbr) {
+		bio_endio(bio, -ENOMEM);
+		return;
+	}
+
+	vbr->bio = bio;
+	vbr->req = NULL;
+	vbr->out_hdr.type = 0;
+	vbr->out_hdr.sector = bio->bi_sector;
+	vbr->out_hdr.ioprio = bio_prio(bio);
+
+	sg_set_buf(&vbr->sg[out++], &vbr->out_hdr,
sizeof(vbr->out_hdr));
+
+	num = blk_bio_map_sg(q, bio, vbr->sg + out);
+
+	sg_set_buf(&vbr->sg[num + out + in++], &vbr->status,
+		   sizeof(vbr->status));
+
+	if (num) {
+		if (bio->bi_rw & REQ_WRITE) {
+			vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
+			out += num;
+		} else {
+			vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
+			in += num;
+		}
+	}
+
+	spin_lock_irq(vblk->disk->queue->queue_lock);
+	if (virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr,
+			      GFP_ATOMIC) < 0) {
+		spin_unlock_irq(vblk->disk->queue->queue_lock);
+		virtblk_add_buf_wait(vblk, vbr, out, in);
+	} else {
+		virtqueue_kick(vblk->vq);
+		spin_unlock_irq(vblk->disk->queue->queue_lock);
+	}
+}
+
 /* return id (s/n) string for *disk to *id_str
  */
 static int virtblk_get_id(struct gendisk *disk, char *id_str)
@@ -360,7 +475,7 @@ static int init_vq(struct virtio_blk *vblk)
 	int err = 0;
 
 	/* We expect one virtqueue, for output. */
-	vblk->vq = virtio_find_single_vq(vblk->vdev, blk_done,
"requests");
+	vblk->vq = virtio_find_single_vq(vblk->vdev, virtblk_done,
"requests");
 	if (IS_ERR(vblk->vq))
 		err = PTR_ERR(vblk->vq);
 
@@ -400,6 +515,8 @@ static int __devinit virtblk_probe(struct virtio_device
*vdev)
 	struct virtio_blk *vblk;
 	struct request_queue *q;
 	int err, index;
+	int pool_size;
+
 	u64 cap;
 	u32 v, blk_size, sg_elems, opt_io_size;
 	u16 min_io_size;
@@ -429,10 +546,12 @@ static int __devinit virtblk_probe(struct virtio_device
*vdev)
 		goto out_free_index;
 	}
 
+	init_waitqueue_head(&vblk->queue_wait);
 	vblk->vdev = vdev;
 	vblk->sg_elems = sg_elems;
 	sg_init_table(vblk->sg, vblk->sg_elems);
 	mutex_init(&vblk->config_lock);
+
 	INIT_WORK(&vblk->config_work, virtblk_config_changed_work);
 	vblk->config_enable = true;
 
@@ -440,7 +559,10 @@ static int __devinit virtblk_probe(struct virtio_device
*vdev)
 	if (err)
 		goto out_free_vblk;
 
-	vblk->pool = mempool_create_kmalloc_pool(1,sizeof(struct virtblk_req));
+	pool_size = sizeof(struct virtblk_req);
+	if (use_bio)
+		pool_size += sizeof(struct scatterlist) * sg_elems;
+	vblk->pool = mempool_create_kmalloc_pool(1, pool_size);
 	if (!vblk->pool) {
 		err = -ENOMEM;
 		goto out_free_vq;
@@ -453,12 +575,14 @@ static int __devinit virtblk_probe(struct virtio_device
*vdev)
 		goto out_mempool;
 	}
 
-	q = vblk->disk->queue = blk_init_queue(do_virtblk_request, NULL);
+	q = vblk->disk->queue = blk_init_queue(virtblk_request, NULL);
 	if (!q) {
 		err = -ENOMEM;
 		goto out_put_disk;
 	}
 
+	if (use_bio)
+		blk_queue_make_request(q, virtblk_make_request);
 	q->queuedata = vblk;
 
 	virtblk_name_format("vd", index, vblk->disk->disk_name,
DISK_NAME_LEN);
@@ -471,7 +595,7 @@ static int __devinit virtblk_probe(struct virtio_device
*vdev)
 	vblk->index = index;
 
 	/* configure queue flush support */
-	if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH))
+	if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH) && !use_bio)
 		blk_queue_flush(q, REQ_FLUSH);
 
 	/* If disk is read-only in the host, the guest should obey */
@@ -544,7 +668,6 @@ static int __devinit virtblk_probe(struct virtio_device
*vdev)
 	if (!err && opt_io_size)
 		blk_queue_io_opt(q, blk_size * opt_io_size);
 
-
 	add_disk(vblk->disk);
 	err = device_create_file(disk_to_dev(vblk->disk), &dev_attr_serial);
 	if (err)
-- 
1.7.10.2
Rusty Russell
2012-Jun-18  07:46 UTC
[PATCH 3/3] virtio-blk: Add bio-based IO path for virtio-blk
On Mon, 18 Jun 2012 14:53:10 +0800, Asias He <asias at redhat.com> wrote:
> This patch introduces bio-based IO path for virtio-blk.

Why make it optional?

Thanks,
Rusty.
Asias He
2012-Jun-18  08:03 UTC
[PATCH 3/3] virtio-blk: Add bio-based IO path for virtio-blk
On 06/18/2012 03:46 PM, Rusty Russell wrote:
> On Mon, 18 Jun 2012 14:53:10 +0800, Asias He <asias at redhat.com> wrote:
>> This patch introduces bio-based IO path for virtio-blk.
>
> Why make it optional?

The request-based IO path is useful for users who do not want to bypass the IO scheduler in the guest kernel, e.g. users using a spinning disk. Users with a fast disk device, e.g. an SSD, can use the bio-based IO path.

-- 
Asias
On Mon, Jun 18, 2012 at 7:53 AM, Asias He <asias at redhat.com> wrote:
> Fio test shows it gives, 28%, 24%, 21%, 16% IOPS boost and 32%, 17%, 21%, 16%
> latency improvement for sequential read/write, random read/write respectively.

Sounds great. What storage configuration did you use (single spinning disk, SSD, storage array) and are these numbers for parallel I/O or sequential I/O?

What changed since Minchan worked on this? I remember he wasn't satisfied that this was a clear win. Your numbers are strong so either you fixed something important or you are looking at different benchmark configurations.

Stefan
Stefan Hajnoczi
2012-Jun-18  09:37 UTC
[PATCH 3/3] virtio-blk: Add bio-based IO path for virtio-blk
On Mon, Jun 18, 2012 at 7:53 AM, Asias He <asias at redhat.com> wrote:
> +static void virtblk_add_buf_wait(struct virtio_blk *vblk,
> +				 struct virtblk_req *vbr,
> +				 unsigned long out,
> +				 unsigned long in)
> +{
> +	DEFINE_WAIT(wait);
> +
> +	for (;;) {
> +		prepare_to_wait_exclusive(&vblk->queue_wait, &wait,
> +					  TASK_UNINTERRUPTIBLE);
> +
> +		spin_lock_irq(vblk->disk->queue->queue_lock);
> +		if (virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr,
> +				      GFP_ATOMIC) < 0) {
> +			spin_unlock_irq(vblk->disk->queue->queue_lock);
> +			io_schedule();
> +		} else {
> +			virtqueue_kick(vblk->vq);
> +			spin_unlock_irq(vblk->disk->queue->queue_lock);
> +			break;
> +		}
> +
> +	}

Is there a meaningful condition where we would cancel this request (e.g. signal)? If the vring fills up for some reason and we get stuck here the user might wish to kill hung processes.

Stefan
Rusty Russell
2012-Jun-18  10:05 UTC
[PATCH 3/3] virtio-blk: Add bio-based IO path for virtio-blk
On Mon, 18 Jun 2012 16:03:23 +0800, Asias He <asias at redhat.com> wrote:
> On 06/18/2012 03:46 PM, Rusty Russell wrote:
> > On Mon, 18 Jun 2012 14:53:10 +0800, Asias He <asias at redhat.com> wrote:
> >> This patch introduces bio-based IO path for virtio-blk.
> >
> > Why make it optional?
>
> request-based IO path is useful for users who do not want to bypass the
> IO scheduler in guest kernel, e.g. users using spinning disk. For users
> using fast disk device, e.g. SSD device, they can use bio-based IO path.

Users using a spinning disk still get IO scheduling in the host though. What benefit is there in doing it in the guest as well?

Cheers,
Rusty.
Michael S. Tsirkin
2012-Jun-18  10:13 UTC
[PATCH 3/3] virtio-blk: Add bio-based IO path for virtio-blk
On Mon, Jun 18, 2012 at 04:03:23PM +0800, Asias He wrote:
> On 06/18/2012 03:46 PM, Rusty Russell wrote:
> >On Mon, 18 Jun 2012 14:53:10 +0800, Asias He <asias at redhat.com> wrote:
> >>This patch introduces bio-based IO path for virtio-blk.
> >
> >Why make it optional?
>
> request-based IO path is useful for users who do not want to bypass
> the IO scheduler in guest kernel, e.g. users using spinning disk.
> For users using fast disk device, e.g. SSD device, they can use
> bio-based IO path.

OK, I guess, but then it should be per-device. There could be a mix of slow and fast disks :)

> --
> Asias
>
Michael S. Tsirkin
2012-Jun-18  10:21 UTC
[PATCH 3/3] virtio-blk: Add bio-based IO path for virtio-blk
On Mon, Jun 18, 2012 at 02:53:10PM +0800, Asias He wrote:> +static void virtblk_make_request(struct request_queue *q, struct bio *bio) > +{ > + struct virtio_blk *vblk = q->queuedata; > + unsigned int num, out = 0, in = 0; > + struct virtblk_req *vbr; > + > + BUG_ON(bio->bi_phys_segments + 2 > vblk->sg_elems); > + BUG_ON(bio->bi_rw & (REQ_FLUSH | REQ_FUA)); > + > + vbr = virtblk_alloc_req(vblk, GFP_NOIO); > + if (!vbr) { > + bio_endio(bio, -ENOMEM); > + return; > + } > + > + vbr->bio = bio; > + vbr->req = NULL; > + vbr->out_hdr.type = 0; > + vbr->out_hdr.sector = bio->bi_sector; > + vbr->out_hdr.ioprio = bio_prio(bio); > + > + sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr)); > + > + num = blk_bio_map_sg(q, bio, vbr->sg + out); > + > + sg_set_buf(&vbr->sg[num + out + in++], &vbr->status, > + sizeof(vbr->status)); > + > + if (num) { > + if (bio->bi_rw & REQ_WRITE) { > + vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; > + out += num; > + } else { > + vbr->out_hdr.type |= VIRTIO_BLK_T_IN; > + in += num; > + } > + } > + > + spin_lock_irq(vblk->disk->queue->queue_lock); > + if (virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr, > + GFP_ATOMIC) < 0) { > + spin_unlock_irq(vblk->disk->queue->queue_lock);Any implications of dropping lock like that? E.g. for suspend. like we are still discussing with unlocked kick?> + virtblk_add_buf_wait(vblk, vbr, out, in); > + } else { > + virtqueue_kick(vblk->vq);Why special case the first call? task state manipulation so expensive?> + spin_unlock_irq(vblk->disk->queue->queue_lock); > + } > +} > +
Tejun Heo
2012-Jun-18  21:31 UTC
[PATCH 1/3] block: Introduce __blk_segment_map_sg() helper
Hello, Asias. On Mon, Jun 18, 2012 at 02:53:08PM +0800, Asias He wrote:> Split the mapping code in blk_rq_map_sg() to a helper > __blk_segment_map_sg(), so that other mapping function, e.g. > blk_bio_map_sg(), can share the code. > > Cc: Jens Axboe <axboe at kernel.dk> > Cc: Tejun Heo <tj at kernel.org> > Cc: Shaohua Li <shli at kernel.org> > Cc: linux-kernel at vger.kernel.org > Suggested-by: Tejun Heo <tj at kernel.org> > Suggested-by: Jens Axboe <axboe at kernel.dk> > Signed-off-by: Asias He <asias at redhat.com> > --- > block/blk-merge.c | 80 ++++++++++++++++++++++++++++++----------------------- > 1 file changed, 45 insertions(+), 35 deletions(-) > > diff --git a/block/blk-merge.c b/block/blk-merge.c > index 160035f..576b68e 100644 > --- a/block/blk-merge.c > +++ b/block/blk-merge.c > @@ -110,6 +110,49 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, > return 0; > } > > +static void > +__blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec, > + struct scatterlist *sglist, struct bio_vec **bvprv, > + struct scatterlist **sg, int *nsegs, int *cluster) > +{ > + > + int nbytes = bvec->bv_len; > + > + if (*bvprv && *cluster) { > + if ((*sg)->length + nbytes > queue_max_segment_size(q)) > + goto new_segment; > + > + if (!BIOVEC_PHYS_MERGEABLE(*bvprv, bvec)) > + goto new_segment; > + if (!BIOVEC_SEG_BOUNDARY(q, *bvprv, bvec)) > + goto new_segment; > + > + (*sg)->length += nbytes; > + } else { > +new_segment: > + if (!*sg) > + *sg = sglist; > + else { > + /* > + * If the driver previously mapped a shorter > + * list, we could see a termination bit > + * prematurely unless it fully inits the sg > + * table on each mapping. We KNOW that there > + * must be more entries here or the driver > + * would be buggy, so force clear the > + * termination bit to avoid doing a full > + * sg_init_table() in drivers for each command. 
> + */ > + (*sg)->page_link &= ~0x02; > + *sg = sg_next(*sg); > + } > + > + sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset); > + (*nsegs)++; > + } > + *bvprv = bvec; > +}I *hope* this is a bit prettier. e.g. Do we really need to pass in @sglist and keep using "goto new_segment"? Thanks. -- tejun
Asias He
2012-Jun-19  02:30 UTC
[PATCH 3/3] virtio-blk: Add bio-based IO path for virtio-blk
On 06/18/2012 06:21 PM, Michael S. Tsirkin wrote:> On Mon, Jun 18, 2012 at 02:53:10PM +0800, Asias He wrote: >> +static void virtblk_make_request(struct request_queue *q, struct bio *bio) >> +{ >> + struct virtio_blk *vblk = q->queuedata; >> + unsigned int num, out = 0, in = 0; >> + struct virtblk_req *vbr; >> + >> + BUG_ON(bio->bi_phys_segments + 2 > vblk->sg_elems); >> + BUG_ON(bio->bi_rw & (REQ_FLUSH | REQ_FUA)); >> + >> + vbr = virtblk_alloc_req(vblk, GFP_NOIO); >> + if (!vbr) { >> + bio_endio(bio, -ENOMEM); >> + return; >> + } >> + >> + vbr->bio = bio; >> + vbr->req = NULL; >> + vbr->out_hdr.type = 0; >> + vbr->out_hdr.sector = bio->bi_sector; >> + vbr->out_hdr.ioprio = bio_prio(bio); >> + >> + sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr)); >> + >> + num = blk_bio_map_sg(q, bio, vbr->sg + out); >> + >> + sg_set_buf(&vbr->sg[num + out + in++], &vbr->status, >> + sizeof(vbr->status)); >> + >> + if (num) { >> + if (bio->bi_rw & REQ_WRITE) { >> + vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; >> + out += num; >> + } else { >> + vbr->out_hdr.type |= VIRTIO_BLK_T_IN; >> + in += num; >> + } >> + } >> + >> + spin_lock_irq(vblk->disk->queue->queue_lock); >> + if (virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr, >> + GFP_ATOMIC) < 0) { >> + spin_unlock_irq(vblk->disk->queue->queue_lock); > > Any implications of dropping lock like that? > E.g. for suspend. like we are still discussing with > unlocked kick? > >> + virtblk_add_buf_wait(vblk, vbr, out, in); >> + } else { >> + virtqueue_kick(vblk->vq); > > Why special case the first call? task state manipulation so expensive?Hmm. Will switch them.> >> + spin_unlock_irq(vblk->disk->queue->queue_lock); >> + } >> +} >> + > -- > To unsubscribe from this list: send the line "unsubscribe kvm" in > the body of a message to majordomo at vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html >-- Asias
Asias He
2012-Jul-03  14:02 UTC
[PATCH 3/3] virtio-blk: Add bio-based IO path for virtio-blk
On 07/03/2012 09:31 PM, Paolo Bonzini wrote:
> Il 02/07/2012 08:41, Rusty Russell ha scritto:
>>> With the same workload in guest, the guest fires 200K requests to host
>>> with merges enabled in guest (echo 0 > /sys/block/vdb/queue/nomerges),
>>> while the guest fires 40000K requests to host with merges disabled in
>>> guest (echo 2 > /sys/block/vdb/queue/nomerges). This show that the merge
>>> in block layer reduces the total number of requests fire to host a lot
>>> (40000K / 200K = 20).
>>>
>
> 40000 / 200 is 200, not 20. :)

Crap, I wrote one zero too many here. Actually, it is 4000K. So the factor is still 4000K/200K = 20 ;-)

-- 
Asias