Hi all, I add a blkback QoS patch. You can config(dynamic/static) different I/O speed for different VM disk by this patch. ---------------------------------------------------------------------------- diff -urNp blkback/blkback.c blkback-qos/blkback.c --- blkback/blkback.c 2011-06-22 07:54:19.000000000 +0800 +++ blkback-qos/blkback.c 2011-06-22 07:53:18.000000000 +0800 @@ -44,6 +44,11 @@ #include <asm/hypervisor.h> #include "common.h" +#undef DPRINTK +#define DPRINTK(fmt, args...) \ + printk("blkback/blkback (%s:%d) " fmt ".\n", \ + __FUNCTION__, __LINE__, ##args) + /* * These are rather arbitrary. They are fairly large because adjacent requests * pulled from a communication ring are quite likely to end up being part of @@ -110,7 +115,8 @@ static inline unsigned long vaddr(pendin static int do_block_io_op(blkif_t *blkif); static int dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req, - pending_req_t *pending_req); + pending_req_t *pending_req, + int *done_nr_sects); static void make_response(blkif_t *blkif, u64 id, unsigned short op, int st); @@ -206,10 +212,20 @@ static void print_stats(blkif_t *blkif) blkif->st_pk_req = 0; } +static void refill_reqcount(blkif_t *blkif) +{ + blkif->reqtime = jiffies + msecs_to_jiffies(1000); + blkif->reqcount = blkif->reqrate; + if (blkif->reqcount < blkif->reqmin) + blkif->reqcount = blkif->reqmin; +} + int blkif_schedule(void *arg) { blkif_t *blkif = arg; struct vbd *vbd = &blkif->vbd; + int ret = 0; + struct timeval cur_time; blkif_get(blkif); @@ -232,12 +248,34 @@ int blkif_schedule(void *arg) blkif->waiting_reqs = 0; smp_mb(); /* clear flag *before* checking for work */ - if (do_block_io_op(blkif)) + ret = do_block_io_op(blkif); + if (ret) blkif->waiting_reqs = 1; unplug_queue(blkif); + if(blkif->reqmin){ + if(2 == ret && (blkif->reqtime > jiffies)){ + jiffies_to_timeval(jiffies, &cur_time); + if(log_stats && (cur_time.tv_sec % 10 ==1 )) + printk(KERN_DEBUG "%s: going to sleep %d millsecs(rate=%d)\n", + current->comm, + jiffies_to_msecs(blkif->reqtime - jiffies), + blkif->reqrate); + + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(blkif->reqtime - jiffies); + + if(log_stats && (cur_time.tv_sec % 10 ==1 )) + printk(KERN_DEBUG "%s: sleep end(rate=%d)\n", + current->comm,blkif->reqrate); + } + if (time_after(jiffies, blkif->reqtime)) + refill_reqcount(blkif); + } + if (log_stats && time_after(jiffies, blkif->st_print)) print_stats(blkif); + } if (log_stats) @@ -306,7 +344,6 @@ irqreturn_t blkif_be_int(int irq, void * /****************************************************************** * DOWNWARD CALLS -- These interface with the block-device layer proper. */ - static int do_block_io_op(blkif_t *blkif) { blkif_back_rings_t *blk_rings = &blkif->blk_rings; @@ -314,15 +351,27 @@ static int do_block_io_op(blkif_t *blkif pending_req_t *pending_req; RING_IDX rc, rp; int more_to_do = 0, ret; + static int last_done_nr_sects = 0; rc = blk_rings->common.req_cons; rp = blk_rings->common.sring->req_prod; rmb(); /* Ensure we see queued requests up to ''rp''. */ + + if (blkif->reqmin && blkif->reqcount <= 0) + return (rc != rp) ? 2 : 0; while ((rc != rp) || (blkif->is_suspended_req)) { if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc)) break; + + if(blkif->reqmin){ + blkif->reqcount -= last_done_nr_sects; + if (blkif->reqcount <= 0) { + more_to_do = 2; + break; + } + } if (kthread_should_stop()) { more_to_do = 1; @@ -367,14 +416,14 @@ handle_request: switch (req.operation) { case BLKIF_OP_READ: blkif->st_rd_req++; - ret = dispatch_rw_block_io(blkif, &req, pending_req); + ret = dispatch_rw_block_io(blkif, &req, pending_req,&last_done_nr_sects); break; case BLKIF_OP_WRITE_BARRIER: blkif->st_br_req++; /* fall through */ case BLKIF_OP_WRITE: blkif->st_wr_req++; - ret = dispatch_rw_block_io(blkif, &req, pending_req); + ret = dispatch_rw_block_io(blkif, &req, pending_req,&last_done_nr_sects); break; case BLKIF_OP_PACKET: DPRINTK("error: block operation BLKIF_OP_PACKET not implemented\n"); @@ -412,9 +461,29 @@ handle_request: return more_to_do; } +static char* operation2str(int operation) +{ + char* ret_str = NULL; + switch (operation) { + case BLKIF_OP_READ: + ret_str = "READ"; + break; + case BLKIF_OP_WRITE: + ret_str = "WRITE"; + break; + case BLKIF_OP_WRITE_BARRIER: + ret_str = "WRITE_BARRIER"; + break; + default: + ret_str = "0"; + } + return ret_str; +} + static int dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req, - pending_req_t *pending_req) + pending_req_t *pending_req, + int *done_nr_sects) { extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; @@ -426,6 +495,9 @@ static int dispatch_rw_block_io(blkif_t struct bio *bio = NULL; int ret, i; int operation; + struct timeval cur_time; + + *done_nr_sects = 0; switch (req->operation) { case BLKIF_OP_READ: @@ -582,6 +654,12 @@ static int dispatch_rw_block_io(blkif_t else if (operation == WRITE || operation == WRITE_BARRIER) blkif->st_wr_sect += preq.nr_sects; + *done_nr_sects = preq.nr_sects; + jiffies_to_timeval(jiffies, &cur_time); + if ((log_stats == 2) && (cur_time.tv_sec % 10 ==1 )) + printk(KERN_DEBUG " operation=%s sects=%d\n", + operation2str(req->operation),preq.nr_sects); + return 0; fail_flush: @@ -695,6 +773,8 @@ static int __init blkif_init(void) blkif_xenbus_init(); + DPRINTK("blkif_inited\n"); + return 0; out_of_memory: diff -urNp blkback/cdrom.c blkback-qos/cdrom.c --- blkback/cdrom.c 2010-05-20 18:07:00.000000000 +0800 +++ blkback-qos/cdrom.c 2011-06-22 07:34:50.000000000 +0800 @@ -35,9 +35,9 @@ #include "common.h" #undef DPRINTK -#define DPRINTK(_f, _a...) \ - printk("(%s() file=%s, line=%d) " _f "\n", \ - __PRETTY_FUNCTION__, __FILE__ , __LINE__ , ##_a ) +#define DPRINTK(fmt, args...) \ + printk("blkback/cdrom (%s:%d) " fmt ".\n", \ + __FUNCTION__, __LINE__, ##args) #define MEDIA_PRESENT "media-present" diff -urNp blkback/common.h blkback-qos/common.h --- blkback/common.h 2010-05-20 18:07:00.000000000 +0800 +++ blkback-qos/common.h 2011-06-22 07:34:50.000000000 +0800 @@ -100,8 +100,17 @@ typedef struct blkif_st { grant_handle_t shmem_handle; grant_ref_t shmem_ref; + + /* qos information */ + unsigned long reqtime; + int reqcount; + int reqmin; + int reqrate; + } blkif_t; +#define VBD_QOS_MIN_RATE_LIMIT 2*1024 /* 1MBs */ + struct backend_info { struct xenbus_device *dev; @@ -111,6 +120,8 @@ struct backend_info unsigned major; unsigned minor; char *mode; + struct xenbus_watch rate_watch; + int have_rate_watch; }; blkif_t *blkif_alloc(domid_t domid); diff -urNp blkback/vbd.c blkback-qos/vbd.c --- blkback/vbd.c 2010-05-20 18:07:00.000000000 +0800 +++ blkback-qos/vbd.c 2011-06-22 07:34:50.000000000 +0800 @@ -35,6 +35,11 @@ #define vbd_sz(_v) ((_v)->bdev->bd_part ? \ (_v)->bdev->bd_part->nr_sects : get_capacity((_v)->bdev->bd_disk)) +#undef DPRINTK +#define DPRINTK(fmt, args...) \ + printk("blkback/vbd (%s:%d) " fmt ".\n", \ + __FUNCTION__, __LINE__, ##args) + unsigned long long vbd_size(struct vbd *vbd) { return vbd_sz(vbd); @@ -87,7 +92,7 @@ int vbd_create(blkif_t *blkif, blkif_vde if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE) vbd->type |= VDISK_REMOVABLE; - DPRINTK("Successful creation of handle=%04x (dom=%u)\n", + DPRINTK("Successful creation of handle=%04x (dom=%u)", handle, blkif->domid); return 0; } diff -urNp blkback/xenbus.c blkback-qos/xenbus.c --- blkback/xenbus.c 2010-05-20 18:07:00.000000000 +0800 +++ blkback-qos/xenbus.c 2011-06-22 07:34:50.000000000 +0800 @@ -25,13 +25,14 @@ #undef DPRINTK #define DPRINTK(fmt, args...) \ - pr_debug("blkback/xenbus (%s:%d) " fmt ".\n", \ + printk("blkback/xenbus (%s:%d) " fmt ".\n", \ __FUNCTION__, __LINE__, ##args) static void connect(struct backend_info *); static int connect_ring(struct backend_info *); static void backend_changed(struct xenbus_watch *, const char **, unsigned int); +static void unregister_rate_watch(struct backend_info *be); static int blkback_name(blkif_t *blkif, char *buf) { @@ -59,8 +60,10 @@ static void update_blkif_status(blkif_t char name[TASK_COMM_LEN]; /* Not ready to connect? */ - if (!blkif->irq || !blkif->vbd.bdev) + if (!blkif->irq || !blkif->vbd.bdev){ + DPRINTK("Not ready to connect"); return; + } /* Already connected? */ if (blkif->be->dev->state == XenbusStateConnected) @@ -193,6 +196,8 @@ static int blkback_remove(struct xenbus_ be->cdrom_watch.node = NULL; } + unregister_rate_watch(be); + if (be->blkif) { blkif_disconnect(be->blkif); vbd_free(&be->blkif->vbd); @@ -251,6 +256,10 @@ static int blkback_probe(struct xenbus_d err = xenbus_watch_path2(dev, dev->nodename, "physical-device", &be->backend_watch, backend_changed); + + DPRINTK("blkback_probe called"); + DPRINTK("dev->nodename=%s/physical-device",dev->nodename); + if (err) goto fail; @@ -266,7 +275,6 @@ fail: return err; } - /** * Callback received when the hotplug scripts have placed the physical-device * node. Read it and the mode node, and create a vbd. If the frontend is @@ -283,8 +291,9 @@ static void backend_changed(struct xenbu struct xenbus_device *dev = be->dev; int cdrom = 0; char *device_type; + char name[TASK_COMM_LEN]; - DPRINTK(""); + DPRINTK("backend_changed called"); err = xenbus_scanf(XBT_NIL, dev->nodename, "physical-device", "%x:%x", &major, &minor); @@ -322,6 +331,34 @@ static void backend_changed(struct xenbu kfree(device_type); } + /* gather information about QoS policy for this device. */ + err = blkback_name(be->blkif, name); + if (err) { + xenbus_dev_error(be->dev, err, "get blkback dev name"); + return; + } + + err = xenbus_gather(XBT_NIL, dev->otherend, + "tokens-rate", "%d", &be->blkif->reqrate, + NULL); + if(err){ + DPRINTK("%s xenbus_gather(tokens-min,tokens-rate) error",name); + }else{ + if(be->blkif->reqrate <= 0){ + be->blkif->reqmin = 0 ; + DPRINTK("%s tokens-rate == 0,no limit",name); + }else{ + DPRINTK("%s xenbus_gather(tokens-rate=%d)",name,be->blkif->reqrate); + be->blkif->reqrate *= 2; + be->blkif->reqmin = VBD_QOS_MIN_RATE_LIMIT; + if(be->blkif->reqmin > be->blkif->reqrate){ + be->blkif->reqrate = be->blkif->reqmin; + DPRINTK("%s reset default value(tokens-rate=%d)",name,be->blkif->reqrate); + } + } + } + be->blkif->reqtime = jiffies; + if (be->major == 0 && be->minor == 0) { /* Front end dir is a number, which is used as the handle. */ @@ -414,6 +451,49 @@ static void frontend_changed(struct xenb /* ** Connection ** */ +static void unregister_rate_watch(struct backend_info *be) +{ + if (be->have_rate_watch) { + unregister_xenbus_watch(&be->rate_watch); + kfree(be->rate_watch.node); + } + be->have_rate_watch = 0; +} + +static void rate_changed(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + + struct backend_info *be=container_of(watch,struct backend_info, rate_watch); + int err; + char name[TASK_COMM_LEN]; + + err = blkback_name(be->blkif, name); + if (err) { + xenbus_dev_error(be->dev, err, "get blkback dev name"); + return; + } + + err = xenbus_gather(XBT_NIL,be->dev->otherend, + "tokens-rate", "%d", + &be->blkif->reqrate,NULL); + if(err){ + DPRINTK("%s xenbus_gather(tokens-rate) error",name); + }else{ + if(be->blkif->reqrate <= 0){ + be->blkif->reqmin = 0; + DPRINTK("%s tokens-rate == 0,no limit",name); + }else{ + DPRINTK("%s xenbus_gather(tokens-rate=%d)",name,be->blkif->reqrate); + be->blkif->reqrate *= 2; + be->blkif->reqmin = VBD_QOS_MIN_RATE_LIMIT; + if(be->blkif->reqmin > be->blkif->reqrate){ + be->blkif->reqrate = be->blkif->reqmin; + DPRINTK("%s reset default value(tokens-rate=%d)",name,be->blkif->reqrate); + } + } + } +} /** * Write the physical details regarding the block device to the store, and @@ -439,6 +519,14 @@ again: if (err) goto abort; + /*add by andrew for centos pv*/ + err = xenbus_printf(xbt, dev->nodename,"feature-flush-cache", "1"); + if (err){ + xenbus_dev_fatal(dev, err, "writing %s/feature-flush-cache", + dev->nodename); + goto abort; + } + err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", vbd_size(&be->blkif->vbd)); if (err) { @@ -469,11 +557,22 @@ again: if (err) xenbus_dev_fatal(dev, err, "ending transaction"); + DPRINTK("xenbus_switch_to XenbusStateConnected"); + err = xenbus_switch_state(dev, XenbusStateConnected); if (err) xenbus_dev_fatal(dev, err, "switching to Connected state", dev->nodename); + unregister_rate_watch(be); + err=xenbus_watch_path2(dev, dev->otherend, "tokens-rate", + &be->rate_watch,rate_changed); + if (!err) + be->have_rate_watch = 1; + else + xenbus_dev_fatal(dev, err, "watching tokens-rate", + dev->nodename); + return; abort: xenbus_transaction_end(xbt, 1); _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
On Tue, Jun 21, 2011 at 04:29:35PM +0800, Andrew Xu wrote:> Hi all, > > I add a blkback QoS patch.What tree is this against? There is a xen-blkback in 3.0-rc4, can you rebase it against that please. What is the patch solving? Why can''t it be done with dm-ioband?> You can config(dynamic/static) different I/O speed for different VM disk > by this patch. > > ---------------------------------------------------------------------------- > > diff -urNp blkback/blkback.c blkback-qos/blkback.c > --- blkback/blkback.c 2011-06-22 07:54:19.000000000 +0800 > +++ blkback-qos/blkback.c 2011-06-22 07:53:18.000000000 +0800 > @@ -44,6 +44,11 @@ > #include <asm/hypervisor.h> > #include "common.h" > > +#undef DPRINTK > +#define DPRINTK(fmt, args...) \ > + printk("blkback/blkback (%s:%d) " fmt ".\n", \ > + __FUNCTION__, __LINE__, ##args) > + > /* > * These are rather arbitrary. They are fairly large because adjacent requests > * pulled from a communication ring are quite likely to end up being part of > @@ -110,7 +115,8 @@ static inline unsigned long vaddr(pendin > static int do_block_io_op(blkif_t *blkif); > static int dispatch_rw_block_io(blkif_t *blkif, > blkif_request_t *req, > - pending_req_t *pending_req); > + pending_req_t *pending_req, > + int *done_nr_sects); > static void make_response(blkif_t *blkif, u64 id, > unsigned short op, int st); > > @@ -206,10 +212,20 @@ static void print_stats(blkif_t *blkif) > blkif->st_pk_req = 0; > } > > +static void refill_reqcount(blkif_t *blkif) > +{ > + blkif->reqtime = jiffies + msecs_to_jiffies(1000); > + blkif->reqcount = blkif->reqrate; > + if (blkif->reqcount < blkif->reqmin) > + blkif->reqcount = blkif->reqmin; > +} > + > int blkif_schedule(void *arg) > { > blkif_t *blkif = arg; > struct vbd *vbd = &blkif->vbd; > + int ret = 0; > + struct timeval cur_time; > > blkif_get(blkif); > > @@ -232,12 +248,34 @@ int blkif_schedule(void *arg) > blkif->waiting_reqs = 0; > smp_mb(); /* clear flag *before* checking for work */ > > - if (do_block_io_op(blkif)) > + ret = do_block_io_op(blkif); > + if (ret) > blkif->waiting_reqs = 1; > unplug_queue(blkif); > > + if(blkif->reqmin){ > + if(2 == ret && (blkif->reqtime > jiffies)){ > + jiffies_to_timeval(jiffies, &cur_time); > + if(log_stats && (cur_time.tv_sec % 10 ==1 )) > + printk(KERN_DEBUG "%s: going to sleep %d millsecs(rate=%d)\n", > + current->comm, > + jiffies_to_msecs(blkif->reqtime - jiffies), > + blkif->reqrate); > + > + set_current_state(TASK_INTERRUPTIBLE); > + schedule_timeout(blkif->reqtime - jiffies); > + > + if(log_stats && (cur_time.tv_sec % 10 ==1 )) > + printk(KERN_DEBUG "%s: sleep end(rate=%d)\n", > + current->comm,blkif->reqrate); > + } > + if (time_after(jiffies, blkif->reqtime)) > + refill_reqcount(blkif); > + } > + > if (log_stats && time_after(jiffies, blkif->st_print)) > print_stats(blkif); > + > } > > if (log_stats) > @@ -306,7 +344,6 @@ irqreturn_t blkif_be_int(int irq, void * > /****************************************************************** > * DOWNWARD CALLS -- These interface with the block-device layer proper. > */ > - > static int do_block_io_op(blkif_t *blkif) > { > blkif_back_rings_t *blk_rings = &blkif->blk_rings; > @@ -314,15 +351,27 @@ static int do_block_io_op(blkif_t *blkif > pending_req_t *pending_req; > RING_IDX rc, rp; > int more_to_do = 0, ret; > + static int last_done_nr_sects = 0; > > rc = blk_rings->common.req_cons; > rp = blk_rings->common.sring->req_prod; > rmb(); /* Ensure we see queued requests up to ''rp''. */ > + > + if (blkif->reqmin && blkif->reqcount <= 0) > + return (rc != rp) ? 2 : 0; > > while ((rc != rp) || (blkif->is_suspended_req)) { > > if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc)) > break; > + > + if(blkif->reqmin){ > + blkif->reqcount -= last_done_nr_sects; > + if (blkif->reqcount <= 0) { > + more_to_do = 2; > + break; > + } > + } > > if (kthread_should_stop()) { > more_to_do = 1; > @@ -367,14 +416,14 @@ handle_request: > switch (req.operation) { > case BLKIF_OP_READ: > blkif->st_rd_req++; > - ret = dispatch_rw_block_io(blkif, &req, pending_req); > + ret = dispatch_rw_block_io(blkif, &req, pending_req,&last_done_nr_sects); > break; > case BLKIF_OP_WRITE_BARRIER: > blkif->st_br_req++; > /* fall through */ > case BLKIF_OP_WRITE: > blkif->st_wr_req++; > - ret = dispatch_rw_block_io(blkif, &req, pending_req); > + ret = dispatch_rw_block_io(blkif, &req, pending_req,&last_done_nr_sects); > break; > case BLKIF_OP_PACKET: > DPRINTK("error: block operation BLKIF_OP_PACKET not implemented\n"); > @@ -412,9 +461,29 @@ handle_request: > return more_to_do; > } > > +static char* operation2str(int operation) > +{ > + char* ret_str = NULL; > + switch (operation) { > + case BLKIF_OP_READ: > + ret_str = "READ"; > + break; > + case BLKIF_OP_WRITE: > + ret_str = "WRITE"; > + break; > + case BLKIF_OP_WRITE_BARRIER: > + ret_str = "WRITE_BARRIER"; > + break; > + default: > + ret_str = "0"; > + } > + return ret_str; > +} > + > static int dispatch_rw_block_io(blkif_t *blkif, > blkif_request_t *req, > - pending_req_t *pending_req) > + pending_req_t *pending_req, > + int *done_nr_sects) > { > extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); > struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; > @@ -426,6 +495,9 @@ static int dispatch_rw_block_io(blkif_t > struct bio *bio = NULL; > int ret, i; > int operation; > + struct timeval cur_time; > + > + *done_nr_sects = 0; > > switch (req->operation) { > case BLKIF_OP_READ: > @@ -582,6 +654,12 @@ static int dispatch_rw_block_io(blkif_t > else if (operation == WRITE || operation == WRITE_BARRIER) > blkif->st_wr_sect += preq.nr_sects; > > + *done_nr_sects = preq.nr_sects; > + jiffies_to_timeval(jiffies, &cur_time); > + if ((log_stats == 2) && (cur_time.tv_sec % 10 ==1 )) > + printk(KERN_DEBUG " operation=%s sects=%d\n", > + operation2str(req->operation),preq.nr_sects); > + > return 0; > > fail_flush: > @@ -695,6 +773,8 @@ static int __init blkif_init(void) > > blkif_xenbus_init(); > > + DPRINTK("blkif_inited\n"); > + > return 0; > > out_of_memory: > diff -urNp blkback/cdrom.c blkback-qos/cdrom.c > --- blkback/cdrom.c 2010-05-20 18:07:00.000000000 +0800 > +++ blkback-qos/cdrom.c 2011-06-22 07:34:50.000000000 +0800 > @@ -35,9 +35,9 @@ > #include "common.h" > > #undef DPRINTK > -#define DPRINTK(_f, _a...) \ > - printk("(%s() file=%s, line=%d) " _f "\n", \ > - __PRETTY_FUNCTION__, __FILE__ , __LINE__ , ##_a ) > +#define DPRINTK(fmt, args...) \ > + printk("blkback/cdrom (%s:%d) " fmt ".\n", \ > + __FUNCTION__, __LINE__, ##args) > > > #define MEDIA_PRESENT "media-present" > diff -urNp blkback/common.h blkback-qos/common.h > --- blkback/common.h 2010-05-20 18:07:00.000000000 +0800 > +++ blkback-qos/common.h 2011-06-22 07:34:50.000000000 +0800 > @@ -100,8 +100,17 @@ typedef struct blkif_st { > > grant_handle_t shmem_handle; > grant_ref_t shmem_ref; > + > + /* qos information */ > + unsigned long reqtime; > + int reqcount; > + int reqmin; > + int reqrate; > + > } blkif_t; > > +#define VBD_QOS_MIN_RATE_LIMIT 2*1024 /* 1MBs */ > + > struct backend_info > { > struct xenbus_device *dev; > @@ -111,6 +120,8 @@ struct backend_info > unsigned major; > unsigned minor; > char *mode; > + struct xenbus_watch rate_watch; > + int have_rate_watch; > }; > > blkif_t *blkif_alloc(domid_t domid); > diff -urNp blkback/vbd.c blkback-qos/vbd.c > --- blkback/vbd.c 2010-05-20 18:07:00.000000000 +0800 > +++ blkback-qos/vbd.c 2011-06-22 07:34:50.000000000 +0800 > @@ -35,6 +35,11 @@ > #define vbd_sz(_v) ((_v)->bdev->bd_part ? \ > (_v)->bdev->bd_part->nr_sects : get_capacity((_v)->bdev->bd_disk)) > > +#undef DPRINTK > +#define DPRINTK(fmt, args...) \ > + printk("blkback/vbd (%s:%d) " fmt ".\n", \ > + __FUNCTION__, __LINE__, ##args) > + > unsigned long long vbd_size(struct vbd *vbd) > { > return vbd_sz(vbd); > @@ -87,7 +92,7 @@ int vbd_create(blkif_t *blkif, blkif_vde > if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE) > vbd->type |= VDISK_REMOVABLE; > > - DPRINTK("Successful creation of handle=%04x (dom=%u)\n", > + DPRINTK("Successful creation of handle=%04x (dom=%u)", > handle, blkif->domid); > return 0; > } > diff -urNp blkback/xenbus.c blkback-qos/xenbus.c > --- blkback/xenbus.c 2010-05-20 18:07:00.000000000 +0800 > +++ blkback-qos/xenbus.c 2011-06-22 07:34:50.000000000 +0800 > @@ -25,13 +25,14 @@ > > #undef DPRINTK > #define DPRINTK(fmt, args...) \ > - pr_debug("blkback/xenbus (%s:%d) " fmt ".\n", \ > + printk("blkback/xenbus (%s:%d) " fmt ".\n", \ > __FUNCTION__, __LINE__, ##args) > > static void connect(struct backend_info *); > static int connect_ring(struct backend_info *); > static void backend_changed(struct xenbus_watch *, const char **, > unsigned int); > +static void unregister_rate_watch(struct backend_info *be); > > static int blkback_name(blkif_t *blkif, char *buf) > { > @@ -59,8 +60,10 @@ static void update_blkif_status(blkif_t > char name[TASK_COMM_LEN]; > > /* Not ready to connect? */ > - if (!blkif->irq || !blkif->vbd.bdev) > + if (!blkif->irq || !blkif->vbd.bdev){ > + DPRINTK("Not ready to connect"); > return; > + } > > /* Already connected? */ > if (blkif->be->dev->state == XenbusStateConnected) > @@ -193,6 +196,8 @@ static int blkback_remove(struct xenbus_ > be->cdrom_watch.node = NULL; > } > > + unregister_rate_watch(be); > + > if (be->blkif) { > blkif_disconnect(be->blkif); > vbd_free(&be->blkif->vbd); > @@ -251,6 +256,10 @@ static int blkback_probe(struct xenbus_d > > err = xenbus_watch_path2(dev, dev->nodename, "physical-device", > &be->backend_watch, backend_changed); > + > + DPRINTK("blkback_probe called"); > + DPRINTK("dev->nodename=%s/physical-device",dev->nodename); > + > if (err) > goto fail; > > @@ -266,7 +275,6 @@ fail: > return err; > } > > - > /** > * Callback received when the hotplug scripts have placed the physical-device > * node. Read it and the mode node, and create a vbd. If the frontend is > @@ -283,8 +291,9 @@ static void backend_changed(struct xenbu > struct xenbus_device *dev = be->dev; > int cdrom = 0; > char *device_type; > + char name[TASK_COMM_LEN]; > > - DPRINTK(""); > + DPRINTK("backend_changed called"); > > err = xenbus_scanf(XBT_NIL, dev->nodename, "physical-device", "%x:%x", > &major, &minor); > @@ -322,6 +331,34 @@ static void backend_changed(struct xenbu > kfree(device_type); > } > > + /* gather information about QoS policy for this device. */ > + err = blkback_name(be->blkif, name); > + if (err) { > + xenbus_dev_error(be->dev, err, "get blkback dev name"); > + return; > + } > + > + err = xenbus_gather(XBT_NIL, dev->otherend, > + "tokens-rate", "%d", &be->blkif->reqrate, > + NULL); > + if(err){ > + DPRINTK("%s xenbus_gather(tokens-min,tokens-rate) error",name); > + }else{ > + if(be->blkif->reqrate <= 0){ > + be->blkif->reqmin = 0 ; > + DPRINTK("%s tokens-rate == 0,no limit",name); > + }else{ > + DPRINTK("%s xenbus_gather(tokens-rate=%d)",name,be->blkif->reqrate); > + be->blkif->reqrate *= 2; > + be->blkif->reqmin = VBD_QOS_MIN_RATE_LIMIT; > + if(be->blkif->reqmin > be->blkif->reqrate){ > + be->blkif->reqrate = be->blkif->reqmin; > + DPRINTK("%s reset default value(tokens-rate=%d)",name,be->blkif->reqrate); > + } > + } > + } > + be->blkif->reqtime = jiffies; > + > if (be->major == 0 && be->minor == 0) { > /* Front end dir is a number, which is used as the handle. */ > > @@ -414,6 +451,49 @@ static void frontend_changed(struct xenb > > /* ** Connection ** */ > > +static void unregister_rate_watch(struct backend_info *be) > +{ > + if (be->have_rate_watch) { > + unregister_xenbus_watch(&be->rate_watch); > + kfree(be->rate_watch.node); > + } > + be->have_rate_watch = 0; > +} > + > +static void rate_changed(struct xenbus_watch *watch, > + const char **vec, unsigned int len) > +{ > + > + struct backend_info *be=container_of(watch,struct backend_info, rate_watch); > + int err; > + char name[TASK_COMM_LEN]; > + > + err = blkback_name(be->blkif, name); > + if (err) { > + xenbus_dev_error(be->dev, err, "get blkback dev name"); > + return; > + } > + > + err = xenbus_gather(XBT_NIL,be->dev->otherend, > + "tokens-rate", "%d", > + &be->blkif->reqrate,NULL); > + if(err){ > + DPRINTK("%s xenbus_gather(tokens-rate) error",name); > + }else{ > + if(be->blkif->reqrate <= 0){ > + be->blkif->reqmin = 0; > + DPRINTK("%s tokens-rate == 0,no limit",name); > + }else{ > + DPRINTK("%s xenbus_gather(tokens-rate=%d)",name,be->blkif->reqrate); > + be->blkif->reqrate *= 2; > + be->blkif->reqmin = VBD_QOS_MIN_RATE_LIMIT; > + if(be->blkif->reqmin > be->blkif->reqrate){ > + be->blkif->reqrate = be->blkif->reqmin; > + DPRINTK("%s reset default value(tokens-rate=%d)",name,be->blkif->reqrate); > + } > + } > + } > +} > > /** > * Write the physical details regarding the block device to the store, and > @@ -439,6 +519,14 @@ again: > if (err) > goto abort; > > + /*add by andrew for centos pv*/ > + err = xenbus_printf(xbt, dev->nodename,"feature-flush-cache", "1"); > + if (err){ > + xenbus_dev_fatal(dev, err, "writing %s/feature-flush-cache", > + dev->nodename); > + goto abort; > + } > + > err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", > vbd_size(&be->blkif->vbd)); > if (err) { > @@ -469,11 +557,22 @@ again: > if (err) > xenbus_dev_fatal(dev, err, "ending transaction"); > > + DPRINTK("xenbus_switch_to XenbusStateConnected"); > + > err = xenbus_switch_state(dev, XenbusStateConnected); > if (err) > xenbus_dev_fatal(dev, err, "switching to Connected state", > dev->nodename); > > + unregister_rate_watch(be); > + err=xenbus_watch_path2(dev, dev->otherend, "tokens-rate", > + &be->rate_watch,rate_changed); > + if (!err) > + be->have_rate_watch = 1; > + else > + xenbus_dev_fatal(dev, err, "watching tokens-rate", > + dev->nodename); > + > return; > abort: > xenbus_transaction_end(xbt, 1);> _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xensource.com > http://lists.xensource.com/xen-devel_______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
On Tue, 21 Jun 2011 09:33:37 -0400 Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:> On Tue, Jun 21, 2011 at 04:29:35PM +0800, Andrew Xu wrote: > > Hi all, > > > > I add a blkback QoS patch. > > What tree is this against?This patch is based on suse11.sp1(2.6.32) xen-blkback source. (2.6.18 "Xenlinux" based source trees?)> There is a xen-blkback in 3.0-rc4, can you rebase > it against that please. >Ok, I will rebase it.> What is the patch solving? >With this path, you can set different speed I/O for different VM disk. For example, I set vm17-disk1 4MKB/s vm17-disk2 1MKB/s vm18-disk3 3MKB/s I/O speed, by writing follow xenstore key-values. /local/domain/17/device/vbd/768/tokens-rate = "4096" /local/domain/17/device/vbd/2048/tokens-rate = "1024" /local/domain/18/device/vbd/768/tokens-rate = "3096"> Why can''t it be done with dm-ioband?Of cause, I/O speed limit also can be done with dm-ioband. But with my patch, there is no need to load dm-ioband any more. This patch do speed-limit more close disk, more lightweight. How it doing? 1) It record how many sectors(512Bytes) have submit to disk in 1 second. 2) If it exceed 2*tokens-rate, then sleep (1 - time passed) seconds. 3) Next second come, going to 1)> > You can config(dynamic/static) different I/O speed for different VM disk > > by this patch. > > > > ---------------------------------------------------------------------------- > > > > diff -urNp blkback/blkback.c blkback-qos/blkback.c > > --- blkback/blkback.c 2011-06-22 07:54:19.000000000 +0800 > > +++ blkback-qos/blkback.c 2011-06-22 07:53:18.000000000 +0800 > > @@ -44,6 +44,11 @@ > > #include <asm/hypervisor.h> > > #include "common.h" > > > > +#undef DPRINTK > > +#define DPRINTK(fmt, args...) \ > > + printk("blkback/blkback (%s:%d) " fmt ".\n", \ > > + __FUNCTION__, __LINE__, ##args) > > + > > /* > > * These are rather arbitrary. They are fairly large because adjacent requests > > * pulled from a communication ring are quite likely to end up being part of > > @@ -110,7 +115,8 @@ static inline unsigned long vaddr(pendin > > static int do_block_io_op(blkif_t *blkif); > > static int dispatch_rw_block_io(blkif_t *blkif, > > blkif_request_t *req, > > - pending_req_t *pending_req); > > + pending_req_t *pending_req, > > + int *done_nr_sects); > > static void make_response(blkif_t *blkif, u64 id, > > unsigned short op, int st); > > > > @@ -206,10 +212,20 @@ static void print_stats(blkif_t *blkif) > > blkif->st_pk_req = 0; > > } > > > > +static void refill_reqcount(blkif_t *blkif) > > +{ > > + blkif->reqtime = jiffies + msecs_to_jiffies(1000); > > + blkif->reqcount = blkif->reqrate; > > + if (blkif->reqcount < blkif->reqmin) > > + blkif->reqcount = blkif->reqmin; > > +} > > + > > int blkif_schedule(void *arg) > > { > > blkif_t *blkif = arg; > > struct vbd *vbd = &blkif->vbd; > > + int ret = 0; > > + struct timeval cur_time; > > > > blkif_get(blkif); > > > > @@ -232,12 +248,34 @@ int blkif_schedule(void *arg) > > blkif->waiting_reqs = 0; > > smp_mb(); /* clear flag *before* checking for work */ > > > > - if (do_block_io_op(blkif)) > > + ret = do_block_io_op(blkif); > > + if (ret) > > blkif->waiting_reqs = 1; > > unplug_queue(blkif); > > > > + if(blkif->reqmin){ > > + if(2 == ret && (blkif->reqtime > jiffies)){ > > + jiffies_to_timeval(jiffies, &cur_time); > > + if(log_stats && (cur_time.tv_sec % 10 ==1 )) > > + printk(KERN_DEBUG "%s: going to sleep %d millsecs(rate=%d)\n", > > + current->comm, > > + jiffies_to_msecs(blkif->reqtime - jiffies), > > + blkif->reqrate); > > + > > + set_current_state(TASK_INTERRUPTIBLE); > > + schedule_timeout(blkif->reqtime - jiffies); > > + > > + if(log_stats && (cur_time.tv_sec % 10 ==1 )) > > + printk(KERN_DEBUG "%s: sleep end(rate=%d)\n", > > + current->comm,blkif->reqrate); > > + } > > + if (time_after(jiffies, blkif->reqtime)) > > + refill_reqcount(blkif); > > + } > > + > > if (log_stats && time_after(jiffies, blkif->st_print)) > > print_stats(blkif); > > + > > } > > > > if (log_stats) > > @@ -306,7 +344,6 @@ irqreturn_t blkif_be_int(int irq, void * > > /****************************************************************** > > * DOWNWARD CALLS -- These interface with the block-device layer proper. > > */ > > - > > static int do_block_io_op(blkif_t *blkif) > > { > > blkif_back_rings_t *blk_rings = &blkif->blk_rings; > > @@ -314,15 +351,27 @@ static int do_block_io_op(blkif_t *blkif > > pending_req_t *pending_req; > > RING_IDX rc, rp; > > int more_to_do = 0, ret; > > + static int last_done_nr_sects = 0; > > > > rc = blk_rings->common.req_cons; > > rp = blk_rings->common.sring->req_prod; > > rmb(); /* Ensure we see queued requests up to ''rp''. */ > > + > > + if (blkif->reqmin && blkif->reqcount <= 0) > > + return (rc != rp) ? 2 : 0; > > > > while ((rc != rp) || (blkif->is_suspended_req)) { > > > > if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc)) > > break; > > + > > + if(blkif->reqmin){ > > + blkif->reqcount -= last_done_nr_sects; > > + if (blkif->reqcount <= 0) { > > + more_to_do = 2; > > + break; > > + } > > + } > > > > if (kthread_should_stop()) { > > more_to_do = 1; > > @@ -367,14 +416,14 @@ handle_request: > > switch (req.operation) { > > case BLKIF_OP_READ: > > blkif->st_rd_req++; > > - ret = dispatch_rw_block_io(blkif, &req, pending_req); > > + ret = dispatch_rw_block_io(blkif, &req, pending_req,&last_done_nr_sects); > > break; > > case BLKIF_OP_WRITE_BARRIER: > > blkif->st_br_req++; > > /* fall through */ > > case BLKIF_OP_WRITE: > > blkif->st_wr_req++; > > - ret = dispatch_rw_block_io(blkif, &req, pending_req); > > + ret = dispatch_rw_block_io(blkif, &req, pending_req,&last_done_nr_sects); > > break; > > case BLKIF_OP_PACKET: > > DPRINTK("error: block operation BLKIF_OP_PACKET not implemented\n"); > > @@ -412,9 +461,29 @@ handle_request: > > return more_to_do; > > } > > > > +static char* operation2str(int operation) > > +{ > > + char* ret_str = NULL; > > + switch (operation) { > > + case BLKIF_OP_READ: > > + ret_str = "READ"; > > + break; > > + case BLKIF_OP_WRITE: > > + ret_str = "WRITE"; > > + break; > > + case BLKIF_OP_WRITE_BARRIER: > > + ret_str = "WRITE_BARRIER"; > > + break; > > + default: > > + ret_str = "0"; > > + } > > + return ret_str; > > +} > > + > > static int dispatch_rw_block_io(blkif_t *blkif, > > blkif_request_t *req, > > - pending_req_t *pending_req) > > + pending_req_t *pending_req, > > + int *done_nr_sects) > > { > > extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); > > struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; > > @@ -426,6 +495,9 @@ static int dispatch_rw_block_io(blkif_t > > struct bio *bio = NULL; > > int ret, i; > > int operation; > > + struct timeval cur_time; > > + > > + *done_nr_sects = 0; > > > > switch (req->operation) { > > case BLKIF_OP_READ: > > @@ -582,6 +654,12 @@ static int dispatch_rw_block_io(blkif_t > > else if (operation == WRITE || operation == WRITE_BARRIER) > > blkif->st_wr_sect += preq.nr_sects; > > > > + *done_nr_sects = preq.nr_sects; > > + jiffies_to_timeval(jiffies, &cur_time); > > + if ((log_stats == 2) && (cur_time.tv_sec % 10 ==1 )) > > + printk(KERN_DEBUG " operation=%s sects=%d\n", > > + operation2str(req->operation),preq.nr_sects); > > + > > return 0; > > > > fail_flush: > > @@ -695,6 +773,8 @@ static int __init blkif_init(void) > > > > blkif_xenbus_init(); > > > > + DPRINTK("blkif_inited\n"); > > + > > return 0; > > > > out_of_memory: > > diff -urNp blkback/cdrom.c blkback-qos/cdrom.c > > --- blkback/cdrom.c 2010-05-20 18:07:00.000000000 +0800 > > +++ blkback-qos/cdrom.c 2011-06-22 07:34:50.000000000 +0800 > > @@ -35,9 +35,9 @@ > > #include "common.h" > > > > #undef DPRINTK > > -#define DPRINTK(_f, _a...) \ > > - printk("(%s() file=%s, line=%d) " _f "\n", \ > > - __PRETTY_FUNCTION__, __FILE__ , __LINE__ , ##_a ) > > +#define DPRINTK(fmt, args...) \ > > + printk("blkback/cdrom (%s:%d) " fmt ".\n", \ > > + __FUNCTION__, __LINE__, ##args) > > > > > > #define MEDIA_PRESENT "media-present" > > diff -urNp blkback/common.h blkback-qos/common.h > > --- blkback/common.h 2010-05-20 18:07:00.000000000 +0800 > > +++ blkback-qos/common.h 2011-06-22 07:34:50.000000000 +0800 > > @@ -100,8 +100,17 @@ typedef struct blkif_st { > > > > grant_handle_t shmem_handle; > > grant_ref_t shmem_ref; > > + > > + /* qos information */ > > + unsigned long reqtime; > > + int reqcount; > > + int reqmin; > > + int reqrate; > > + > > } blkif_t; > > > > +#define VBD_QOS_MIN_RATE_LIMIT 2*1024 /* 1MBs */ > > + > > struct backend_info > > { > > struct xenbus_device *dev; > > @@ -111,6 +120,8 @@ struct backend_info > > unsigned major; > > unsigned minor; > > char *mode; > > + struct xenbus_watch rate_watch; > > + int have_rate_watch; > > }; > > > > blkif_t *blkif_alloc(domid_t domid); > > diff -urNp blkback/vbd.c blkback-qos/vbd.c > > --- blkback/vbd.c 2010-05-20 18:07:00.000000000 +0800 > > +++ blkback-qos/vbd.c 2011-06-22 07:34:50.000000000 +0800 > > @@ -35,6 +35,11 @@ > > #define vbd_sz(_v) ((_v)->bdev->bd_part ? \ > > (_v)->bdev->bd_part->nr_sects : get_capacity((_v)->bdev->bd_disk)) > > > > +#undef DPRINTK > > +#define DPRINTK(fmt, args...) \ > > + printk("blkback/vbd (%s:%d) " fmt ".\n", \ > > + __FUNCTION__, __LINE__, ##args) > > + > > unsigned long long vbd_size(struct vbd *vbd) > > { > > return vbd_sz(vbd); > > @@ -87,7 +92,7 @@ int vbd_create(blkif_t *blkif, blkif_vde > > if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE) > > vbd->type |= VDISK_REMOVABLE; > > > > - DPRINTK("Successful creation of handle=%04x (dom=%u)\n", > > + DPRINTK("Successful creation of handle=%04x (dom=%u)", > > handle, blkif->domid); > > return 0; > > } > > diff -urNp blkback/xenbus.c blkback-qos/xenbus.c > > --- blkback/xenbus.c 2010-05-20 18:07:00.000000000 +0800 > > +++ blkback-qos/xenbus.c 2011-06-22 07:34:50.000000000 +0800 > > @@ -25,13 +25,14 @@ > > > > #undef DPRINTK > > #define DPRINTK(fmt, args...) \ > > - pr_debug("blkback/xenbus (%s:%d) " fmt ".\n", \ > > + printk("blkback/xenbus (%s:%d) " fmt ".\n", \ > > __FUNCTION__, __LINE__, ##args) > > > > static void connect(struct backend_info *); > > static int connect_ring(struct backend_info *); > > static void backend_changed(struct xenbus_watch *, const char **, > > unsigned int); > > +static void unregister_rate_watch(struct backend_info *be); > > > > static int blkback_name(blkif_t *blkif, char *buf) > > { > > @@ -59,8 +60,10 @@ static void update_blkif_status(blkif_t > > char name[TASK_COMM_LEN]; > > > > /* Not ready to connect? */ > > - if (!blkif->irq || !blkif->vbd.bdev) > > + if (!blkif->irq || !blkif->vbd.bdev){ > > + DPRINTK("Not ready to connect"); > > return; > > + } > > > > /* Already connected? */ > > if (blkif->be->dev->state == XenbusStateConnected) > > @@ -193,6 +196,8 @@ static int blkback_remove(struct xenbus_ > > be->cdrom_watch.node = NULL; > > } > > > > + unregister_rate_watch(be); > > + > > if (be->blkif) { > > blkif_disconnect(be->blkif); > > vbd_free(&be->blkif->vbd); > > @@ -251,6 +256,10 @@ static int blkback_probe(struct xenbus_d > > > > err = xenbus_watch_path2(dev, dev->nodename, "physical-device", > > &be->backend_watch, backend_changed); > > + > > + DPRINTK("blkback_probe called"); > > + DPRINTK("dev->nodename=%s/physical-device",dev->nodename); > > + > > if (err) > > goto fail; > > > > @@ -266,7 +275,6 @@ fail: > > return err; > > } > > > > - > > /** > > * Callback received when the hotplug scripts have placed the physical-device > > * node. Read it and the mode node, and create a vbd. If the frontend is > > @@ -283,8 +291,9 @@ static void backend_changed(struct xenbu > > struct xenbus_device *dev = be->dev; > > int cdrom = 0; > > char *device_type; > > + char name[TASK_COMM_LEN]; > > > > - DPRINTK(""); > > + DPRINTK("backend_changed called"); > > > > err = xenbus_scanf(XBT_NIL, dev->nodename, "physical-device", "%x:%x", > > &major, &minor); > > @@ -322,6 +331,34 @@ static void backend_changed(struct xenbu > > kfree(device_type); > > } > > > > + /* gather information about QoS policy for this device. */ > > + err = blkback_name(be->blkif, name); > > + if (err) { > > + xenbus_dev_error(be->dev, err, "get blkback dev name"); > > + return; > > + } > > + > > + err = xenbus_gather(XBT_NIL, dev->otherend, > > + "tokens-rate", "%d", &be->blkif->reqrate, > > + NULL); > > + if(err){ > > + DPRINTK("%s xenbus_gather(tokens-min,tokens-rate) error",name); > > + }else{ > > + if(be->blkif->reqrate <= 0){ > > + be->blkif->reqmin = 0 ; > > + DPRINTK("%s tokens-rate == 0,no limit",name); > > + }else{ > > + DPRINTK("%s xenbus_gather(tokens-rate=%d)",name,be->blkif->reqrate); > > + be->blkif->reqrate *= 2; > > + be->blkif->reqmin = VBD_QOS_MIN_RATE_LIMIT; > > + if(be->blkif->reqmin > be->blkif->reqrate){ > > + be->blkif->reqrate = be->blkif->reqmin; > > + DPRINTK("%s reset default value(tokens-rate=%d)",name,be->blkif->reqrate); > > + } > > + } > > + } > > + be->blkif->reqtime = jiffies; > > + > > if (be->major == 0 && be->minor == 0) { > > /* Front end dir is a number, which is used as the handle. */ > > > > @@ -414,6 +451,49 @@ static void frontend_changed(struct xenb > > > > /* ** Connection ** */ > > > > +static void unregister_rate_watch(struct backend_info *be) > > +{ > > + if (be->have_rate_watch) { > > + unregister_xenbus_watch(&be->rate_watch); > > + kfree(be->rate_watch.node); > > + } > > + be->have_rate_watch = 0; > > +} > > + > > +static void rate_changed(struct xenbus_watch *watch, > > + const char **vec, unsigned int len) > > +{ > > + > > + struct backend_info *be=container_of(watch,struct backend_info, rate_watch); > > + int err; > > + char name[TASK_COMM_LEN]; > > + > > + err = blkback_name(be->blkif, name); > > + if (err) { > > + xenbus_dev_error(be->dev, err, "get blkback dev name"); > > + return; > > + } > > + > > + err = xenbus_gather(XBT_NIL,be->dev->otherend, > > + "tokens-rate", "%d", > > + &be->blkif->reqrate,NULL); > > + if(err){ > > + DPRINTK("%s xenbus_gather(tokens-rate) error",name); > > + }else{ > > + if(be->blkif->reqrate <= 0){ > > + be->blkif->reqmin = 0; > > + DPRINTK("%s tokens-rate == 0,no limit",name); > > + }else{ > > + DPRINTK("%s xenbus_gather(tokens-rate=%d)",name,be->blkif->reqrate); > > + be->blkif->reqrate *= 2; > > + be->blkif->reqmin = VBD_QOS_MIN_RATE_LIMIT; > > + if(be->blkif->reqmin > be->blkif->reqrate){ > > + be->blkif->reqrate = be->blkif->reqmin; > > + DPRINTK("%s reset default value(tokens-rate=%d)",name,be->blkif->reqrate); > > + } > > + } > > + } > > +} > > > > /** > > * Write the physical details regarding the block device to the store, and > > @@ -439,6 +519,14 @@ again: > > if (err) > > goto abort; > > > > + /*add by andrew for centos pv*/ > > + err = xenbus_printf(xbt, dev->nodename,"feature-flush-cache", "1"); > > + if (err){ > > + xenbus_dev_fatal(dev, err, "writing %s/feature-flush-cache", > > + dev->nodename); > > + goto abort; > > + } > > + > > err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", > > vbd_size(&be->blkif->vbd)); > > if (err) { > > @@ -469,11 +557,22 @@ again: > > if (err) > > xenbus_dev_fatal(dev, err, "ending transaction"); > > > > + DPRINTK("xenbus_switch_to XenbusStateConnected"); > > + > > err = xenbus_switch_state(dev, XenbusStateConnected); > > if (err) > > xenbus_dev_fatal(dev, err, "switching to Connected state", > > dev->nodename); > > > > + unregister_rate_watch(be); > > + err=xenbus_watch_path2(dev, dev->otherend, "tokens-rate", > > + &be->rate_watch,rate_changed); > > + if (!err) > > + be->have_rate_watch = 1; > > + else > > + xenbus_dev_fatal(dev, err, "watching tokens-rate", > > + dev->nodename); > > + > > return; > > abort: > > xenbus_transaction_end(xbt, 1); > > > > _______________________________________________ > > Xen-devel mailing list > > Xen-devel@lists.xensource.com > > http://lists.xensource.com/xen-devel > > > _______________________________________________ > Xen-users mailing list > Xen-users@lists.xensource.com > http://lists.xensource.com/xen-users************************************************** 徐安(Andrew Xu) 部门 :云快线 - 运营支撑中心 - 研发中心 手机 : 18910391796 E-mail:xu.an@cloudex.cn 地址 :北京市朝阳区酒仙桥东路1号M5楼 ************************************************** _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Florian Heigl
2011-Jun-22 12:16 UTC
Re: [Xen-users] Re: [Xen-devel] VM disk I/O limit patch
Hi, relying on DM is not advisable since this is yet another layer that breaks write barriers, plus it kills portability to non-linux OS. btw: It would be cool if a block shaping patch finally made it in the Xen mainline, since the last one that was submitted in 2009 was apparently forgotten... Regards, Florian 2011/6/21 Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>:> On Tue, Jun 21, 2011 at 04:29:35PM +0800, Andrew Xu wrote: >> Hi all, >> >> I add a blkback QoS patch. > > What tree is this against? There is a xen-blkback in 3.0-rc4, can you rebase > it against that please. > > What is the patch solving? Why can''t it be done with dm-ioband? > >> You can config(dynamic/static) different I/O speed for different VM disk >> by this patch. >> >> ---------------------------------------------------------------------------- >> >> diff -urNp blkback/blkback.c blkback-qos/blkback.c >> --- blkback/blkback.c 2011-06-22 07:54:19.000000000 +0800 >> +++ blkback-qos/blkback.c 2011-06-22 07:53:18.000000000 +0800 >> @@ -44,6 +44,11 @@ >> #include <asm/hypervisor.h> >> #include "common.h" >> >> +#undef DPRINTK >> +#define DPRINTK(fmt, args...) \ >> + printk("blkback/blkback (%s:%d) " fmt ".\n", \ >> + __FUNCTION__, __LINE__, ##args) >> + >> /* >> * These are rather arbitrary. They are fairly large because adjacent requests >> * pulled from a communication ring are quite likely to end up being part of >> @@ -110,7 +115,8 @@ static inline unsigned long vaddr(pendin >> static int do_block_io_op(blkif_t *blkif); >> static int dispatch_rw_block_io(blkif_t *blkif, >> blkif_request_t *req, >> - pending_req_t *pending_req); >> + pending_req_t *pending_req, >> + int *done_nr_sects); >> static void make_response(blkif_t *blkif, u64 id, >> unsigned short op, int st); >> >> @@ -206,10 +212,20 @@ static void print_stats(blkif_t *blkif) >> blkif->st_pk_req = 0; >> } >> >> +static void refill_reqcount(blkif_t *blkif) >> +{ >> + blkif->reqtime = jiffies + msecs_to_jiffies(1000); >> + blkif->reqcount = blkif->reqrate; >> + if (blkif->reqcount < blkif->reqmin) >> + blkif->reqcount = blkif->reqmin; >> +} >> + >> int blkif_schedule(void *arg) >> { >> blkif_t *blkif = arg; >> struct vbd *vbd = &blkif->vbd; >> + int ret = 0; >> + struct timeval cur_time; >> >> blkif_get(blkif); >> >> @@ -232,12 +248,34 @@ int blkif_schedule(void *arg) >> blkif->waiting_reqs = 0; >> smp_mb(); /* clear flag *before* checking for work */ >> >> - if (do_block_io_op(blkif)) >> + ret = do_block_io_op(blkif); >> + if (ret) >> blkif->waiting_reqs = 1; >> unplug_queue(blkif); >> >> + if(blkif->reqmin){ >> + if(2 == ret && (blkif->reqtime > jiffies)){ >> + jiffies_to_timeval(jiffies, &cur_time); >> + if(log_stats && (cur_time.tv_sec % 10 ==1 )) >> + printk(KERN_DEBUG "%s: going to sleep %d millsecs(rate=%d)\n", >> + current->comm, >> + jiffies_to_msecs(blkif->reqtime - jiffies), >> + blkif->reqrate); >> + >> + set_current_state(TASK_INTERRUPTIBLE); >> + schedule_timeout(blkif->reqtime - jiffies); >> + >> + if(log_stats && (cur_time.tv_sec % 10 ==1 )) >> + printk(KERN_DEBUG "%s: sleep end(rate=%d)\n", >> + current->comm,blkif->reqrate); >> + } >> + if (time_after(jiffies, blkif->reqtime)) >> + refill_reqcount(blkif); >> + } >> + >> if (log_stats && time_after(jiffies, blkif->st_print)) >> print_stats(blkif); >> + >> } >> >> if (log_stats) >> @@ -306,7 +344,6 @@ irqreturn_t blkif_be_int(int irq, void * >> /****************************************************************** >> * DOWNWARD CALLS -- These interface with the block-device layer proper. >> */ >> - >> static int do_block_io_op(blkif_t *blkif) >> { >> blkif_back_rings_t *blk_rings = &blkif->blk_rings; >> @@ -314,15 +351,27 @@ static int do_block_io_op(blkif_t *blkif >> pending_req_t *pending_req; >> RING_IDX rc, rp; >> int more_to_do = 0, ret; >> + static int last_done_nr_sects = 0; >> >> rc = blk_rings->common.req_cons; >> rp = blk_rings->common.sring->req_prod; >> rmb(); /* Ensure we see queued requests up to ''rp''. */ >> + >> + if (blkif->reqmin && blkif->reqcount <= 0) >> + return (rc != rp) ? 2 : 0; >> >> while ((rc != rp) || (blkif->is_suspended_req)) { >> >> if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc)) >> break; >> + >> + if(blkif->reqmin){ >> + blkif->reqcount -= last_done_nr_sects; >> + if (blkif->reqcount <= 0) { >> + more_to_do = 2; >> + break; >> + } >> + } >> >> if (kthread_should_stop()) { >> more_to_do = 1; >> @@ -367,14 +416,14 @@ handle_request: >> switch (req.operation) { >> case BLKIF_OP_READ: >> blkif->st_rd_req++; >> - ret = dispatch_rw_block_io(blkif, &req, pending_req); >> + ret = dispatch_rw_block_io(blkif, &req, pending_req,&last_done_nr_sects); >> break; >> case BLKIF_OP_WRITE_BARRIER: >> blkif->st_br_req++; >> /* fall through */ >> case BLKIF_OP_WRITE: >> blkif->st_wr_req++; >> - ret = dispatch_rw_block_io(blkif, &req, pending_req); >> + ret = dispatch_rw_block_io(blkif, &req, pending_req,&last_done_nr_sects); >> break; >> case BLKIF_OP_PACKET: >> DPRINTK("error: block operation BLKIF_OP_PACKET not implemented\n"); >> @@ -412,9 +461,29 @@ handle_request: >> return more_to_do; >> } >> >> +static char* operation2str(int operation) >> +{ >> + char* ret_str = NULL; >> + switch (operation) { >> + case BLKIF_OP_READ: >> + ret_str = "READ"; >> + break; >> + case BLKIF_OP_WRITE: >> + ret_str = "WRITE"; >> + break; >> + case BLKIF_OP_WRITE_BARRIER: >> + ret_str = "WRITE_BARRIER"; >> + break; >> + default: >> + ret_str = "0"; >> + } >> + return ret_str; >> +} >> + >> static int dispatch_rw_block_io(blkif_t *blkif, >> blkif_request_t *req, >> - pending_req_t *pending_req) >> + pending_req_t *pending_req, >> + int *done_nr_sects) >> { >> extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); >> struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; >> @@ -426,6 +495,9 @@ static int dispatch_rw_block_io(blkif_t >> struct bio *bio = NULL; >> int ret, i; >> int operation; >> + struct timeval cur_time; >> + >> + *done_nr_sects = 0; >> >> switch (req->operation) { >> case BLKIF_OP_READ: >> @@ -582,6 +654,12 @@ static int dispatch_rw_block_io(blkif_t >> else if (operation == WRITE || operation == WRITE_BARRIER) >> blkif->st_wr_sect += preq.nr_sects; >> >> + *done_nr_sects = preq.nr_sects; >> + jiffies_to_timeval(jiffies, &cur_time); >> + if ((log_stats == 2) && (cur_time.tv_sec % 10 ==1 )) >> + printk(KERN_DEBUG " operation=%s sects=%d\n", >> + operation2str(req->operation),preq.nr_sects); >> + >> return 0; >> >> fail_flush: >> @@ -695,6 +773,8 @@ static int __init blkif_init(void) >> >> blkif_xenbus_init(); >> >> + DPRINTK("blkif_inited\n"); >> + >> return 0; >> >> out_of_memory: >> diff -urNp blkback/cdrom.c blkback-qos/cdrom.c >> --- blkback/cdrom.c 2010-05-20 18:07:00.000000000 +0800 >> +++ blkback-qos/cdrom.c 2011-06-22 07:34:50.000000000 +0800 >> @@ -35,9 +35,9 @@ >> #include "common.h" >> >> #undef DPRINTK >> -#define DPRINTK(_f, _a...) \ >> - printk("(%s() file=%s, line=%d) " _f "\n", \ >> - __PRETTY_FUNCTION__, __FILE__ , __LINE__ , ##_a ) >> +#define DPRINTK(fmt, args...) \ >> + printk("blkback/cdrom (%s:%d) " fmt ".\n", \ >> + __FUNCTION__, __LINE__, ##args) >> >> >> #define MEDIA_PRESENT "media-present" >> diff -urNp blkback/common.h blkback-qos/common.h >> --- blkback/common.h 2010-05-20 18:07:00.000000000 +0800 >> +++ blkback-qos/common.h 2011-06-22 07:34:50.000000000 +0800 >> @@ -100,8 +100,17 @@ typedef struct blkif_st { >> >> grant_handle_t shmem_handle; >> grant_ref_t shmem_ref; >> + >> + /* qos information */ >> + unsigned long reqtime; >> + int reqcount; >> + int reqmin; >> + int reqrate; >> + >> } blkif_t; >> >> +#define VBD_QOS_MIN_RATE_LIMIT 2*1024 /* 1MBs */ >> + >> struct backend_info >> { >> struct xenbus_device *dev; >> @@ -111,6 +120,8 @@ struct backend_info >> unsigned major; >> unsigned minor; >> char *mode; >> + struct xenbus_watch rate_watch; >> + int have_rate_watch; >> }; >> >> blkif_t *blkif_alloc(domid_t domid); >> diff -urNp blkback/vbd.c blkback-qos/vbd.c >> --- blkback/vbd.c 2010-05-20 18:07:00.000000000 +0800 >> +++ blkback-qos/vbd.c 2011-06-22 07:34:50.000000000 +0800 >> @@ -35,6 +35,11 @@ >> #define vbd_sz(_v) ((_v)->bdev->bd_part ? \ >> (_v)->bdev->bd_part->nr_sects : get_capacity((_v)->bdev->bd_disk)) >> >> +#undef DPRINTK >> +#define DPRINTK(fmt, args...) \ >> + printk("blkback/vbd (%s:%d) " fmt ".\n", \ >> + __FUNCTION__, __LINE__, ##args) >> + >> unsigned long long vbd_size(struct vbd *vbd) >> { >> return vbd_sz(vbd); >> @@ -87,7 +92,7 @@ int vbd_create(blkif_t *blkif, blkif_vde >> if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE) >> vbd->type |= VDISK_REMOVABLE; >> >> - DPRINTK("Successful creation of handle=%04x (dom=%u)\n", >> + DPRINTK("Successful creation of handle=%04x (dom=%u)", >> handle, blkif->domid); >> return 0; >> } >> diff -urNp blkback/xenbus.c blkback-qos/xenbus.c >> --- blkback/xenbus.c 2010-05-20 18:07:00.000000000 +0800 >> +++ blkback-qos/xenbus.c 2011-06-22 07:34:50.000000000 +0800 >> @@ -25,13 +25,14 @@ >> >> #undef DPRINTK >> #define DPRINTK(fmt, args...) \ >> - pr_debug("blkback/xenbus (%s:%d) " fmt ".\n", \ >> + printk("blkback/xenbus (%s:%d) " fmt ".\n", \ >> __FUNCTION__, __LINE__, ##args) >> >> static void connect(struct backend_info *); >> static int connect_ring(struct backend_info *); >> static void backend_changed(struct xenbus_watch *, const char **, >> unsigned int); >> +static void unregister_rate_watch(struct backend_info *be); >> >> static int blkback_name(blkif_t *blkif, char *buf) >> { >> @@ -59,8 +60,10 @@ static void update_blkif_status(blkif_t >> char name[TASK_COMM_LEN]; >> >> /* Not ready to connect? */ >> - if (!blkif->irq || !blkif->vbd.bdev) >> + if (!blkif->irq || !blkif->vbd.bdev){ >> + DPRINTK("Not ready to connect"); >> return; >> + } >> >> /* Already connected? */ >> if (blkif->be->dev->state == XenbusStateConnected) >> @@ -193,6 +196,8 @@ static int blkback_remove(struct xenbus_ >> be->cdrom_watch.node = NULL; >> } >> >> + unregister_rate_watch(be); >> + >> if (be->blkif) { >> blkif_disconnect(be->blkif); >> vbd_free(&be->blkif->vbd); >> @@ -251,6 +256,10 @@ static int blkback_probe(struct xenbus_d >> >> err = xenbus_watch_path2(dev, dev->nodename, "physical-device", >> &be->backend_watch, backend_changed); >> + >> + DPRINTK("blkback_probe called"); >> + DPRINTK("dev->nodename=%s/physical-device",dev->nodename); >> + >> if (err) >> goto fail; >> >> @@ -266,7 +275,6 @@ fail: >> return err; >> } >> >> - >> /** >> * Callback received when the hotplug scripts have placed the physical-device >> * node. Read it and the mode node, and create a vbd. If the frontend is >> @@ -283,8 +291,9 @@ static void backend_changed(struct xenbu >> struct xenbus_device *dev = be->dev; >> int cdrom = 0; >> char *device_type; >> + char name[TASK_COMM_LEN]; >> >> - DPRINTK(""); >> + DPRINTK("backend_changed called"); >> >> err = xenbus_scanf(XBT_NIL, dev->nodename, "physical-device", "%x:%x", >> &major, &minor); >> @@ -322,6 +331,34 @@ static void backend_changed(struct xenbu >> kfree(device_type); >> } >> >> + /* gather information about QoS policy for this device. */ >> + err = blkback_name(be->blkif, name); >> + if (err) { >> + xenbus_dev_error(be->dev, err, "get blkback dev name"); >> + return; >> + } >> + >> + err = xenbus_gather(XBT_NIL, dev->otherend, >> + "tokens-rate", "%d", &be->blkif->reqrate, >> + NULL); >> + if(err){ >> + DPRINTK("%s xenbus_gather(tokens-min,tokens-rate) error",name); >> + }else{ >> + if(be->blkif->reqrate <= 0){ >> + be->blkif->reqmin = 0 ; >> + DPRINTK("%s tokens-rate == 0,no limit",name); >> + }else{ >> + DPRINTK("%s xenbus_gather(tokens-rate=%d)",name,be->blkif->reqrate); >> + be->blkif->reqrate *= 2; >> + be->blkif->reqmin = VBD_QOS_MIN_RATE_LIMIT; >> + if(be->blkif->reqmin > be->blkif->reqrate){ >> + be->blkif->reqrate = be->blkif->reqmin; >> + DPRINTK("%s reset default value(tokens-rate=%d)",name,be->blkif->reqrate); >> + } >> + } >> + } >> + be->blkif->reqtime = jiffies; >> + >> if (be->major == 0 && be->minor == 0) { >> /* Front end dir is a number, which is used as the handle. */ >> >> @@ -414,6 +451,49 @@ static void frontend_changed(struct xenb >> >> /* ** Connection ** */ >> >> +static void unregister_rate_watch(struct backend_info *be) >> +{ >> + if (be->have_rate_watch) { >> + unregister_xenbus_watch(&be->rate_watch); >> + kfree(be->rate_watch.node); >> + } >> + be->have_rate_watch = 0; >> +} >> + >> +static void rate_changed(struct xenbus_watch *watch, >> + const char **vec, unsigned int len) >> +{ >> + >> + struct backend_info *be=container_of(watch,struct backend_info, rate_watch); >> + int err; >> + char name[TASK_COMM_LEN]; >> + >> + err = blkback_name(be->blkif, name); >> + if (err) { >> + xenbus_dev_error(be->dev, err, "get blkback dev name"); >> + return; >> + } >> + >> + err = xenbus_gather(XBT_NIL,be->dev->otherend, >> + "tokens-rate", "%d", >> + &be->blkif->reqrate,NULL); >> + if(err){ >> + DPRINTK("%s xenbus_gather(tokens-rate) error",name); >> + }else{ >> + if(be->blkif->reqrate <= 0){ >> + be->blkif->reqmin = 0; >> + DPRINTK("%s tokens-rate == 0,no limit",name); >> + }else{ >> + DPRINTK("%s xenbus_gather(tokens-rate=%d)",name,be->blkif->reqrate); >> + be->blkif->reqrate *= 2; >> + be->blkif->reqmin = VBD_QOS_MIN_RATE_LIMIT; >> + if(be->blkif->reqmin > be->blkif->reqrate){ >> + be->blkif->reqrate = be->blkif->reqmin; >> + DPRINTK("%s reset default value(tokens-rate=%d)",name,be->blkif->reqrate); >> + } >> + } >> + } >> +} >> >> /** >> * Write the physical details regarding the block device to the store, and >> @@ -439,6 +519,14 @@ again: >> if (err) >> goto abort; >> >> + /*add by andrew for centos pv*/ >> + err = xenbus_printf(xbt, dev->nodename,"feature-flush-cache", "1"); >> + if (err){ >> + xenbus_dev_fatal(dev, err, "writing %s/feature-flush-cache", >> + dev->nodename); >> + goto abort; >> + } >> + >> err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", >> vbd_size(&be->blkif->vbd)); >> if (err) { >> @@ -469,11 +557,22 @@ again: >> if (err) >> xenbus_dev_fatal(dev, err, "ending transaction"); >> >> + DPRINTK("xenbus_switch_to XenbusStateConnected"); >> + >> err = xenbus_switch_state(dev, XenbusStateConnected); >> if (err) >> xenbus_dev_fatal(dev, err, "switching to Connected state", >> dev->nodename); >> >> + unregister_rate_watch(be); >> + err=xenbus_watch_path2(dev, dev->otherend, "tokens-rate", >> + &be->rate_watch,rate_changed); >> + if (!err) >> + be->have_rate_watch = 1; >> + else >> + xenbus_dev_fatal(dev, err, "watching tokens-rate", >> + dev->nodename); >> + >> return; >> abort: >> xenbus_transaction_end(xbt, 1); > > >> _______________________________________________ >> Xen-devel mailing list >> Xen-devel@lists.xensource.com >> http://lists.xensource.com/xen-devel > > > _______________________________________________ > Xen-users mailing list > Xen-users@lists.xensource.com > http://lists.xensource.com/xen-users >-- the purpose of libvirt is to provide an abstraction layer hiding all xen features added since 2006 until they were finally understood and copied by the kvm devs. _______________________________________________ Xen-users mailing list Xen-users@lists.xensource.com http://lists.xensource.com/xen-users
Konrad Rzeszutek Wilk
2011-Jun-22 13:07 UTC
Re: [Xen-users] Re: [Xen-devel] VM disk I/O limit patch
On Wed, Jun 22, 2011 at 02:16:09PM +0200, Florian Heigl wrote:> Hi, > > relying on DM is not advisable since this is yet another layer that > breaks write barriers, plus it kills portability to non-linux OS.How so? And write barriers don''t exist anymore - I think you mean flush support. And I believe they are fixed now - flush requests are passed through to the bottom layer (the real disk). But let me double check that shortly. Portability to non-linux OS? ? What do you mean by that? As in using this patch on NetBSD or OpenSolaris backend? The backend in those OS-es is quite different so it wouldn''t be "portable"> > btw: > It would be cool if a block shaping patch finally made it in the Xen > mainline, since the last one that was submitted in 2009 was apparently > forgotten...Why can''t dm-ioband do this for you? Hm, I don''t remember seeing that patch at all? _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Konrad Rzeszutek Wilk
2011-Jun-22 13:11 UTC
Re: [Xen-users] Re: [Xen-devel] VM disk I/O limit patch
On Wed, Jun 22, 2011 at 08:06:23PM +0800, Andrew Xu wrote:> > On Tue, 21 Jun 2011 09:33:37 -0400 > Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote: > > > On Tue, Jun 21, 2011 at 04:29:35PM +0800, Andrew Xu wrote: > > > Hi all, > > > > > > I add a blkback QoS patch. > > > > What tree is this against? > This patch is based on suse11.sp1(2.6.32) xen-blkback source. > (2.6.18 "Xenlinux" based source trees?) > > > There is a xen-blkback in 3.0-rc4, can you rebase > > it against that please. > > > Ok, I will rebase it.Hold on, lets talk about the problem you are trying to solve first.> > > What is the patch solving? > > > With this path, you can set different speed I/O for different VM disk. > For example, I set vm17-disk1 4MKB/s > vm17-disk2 1MKB/s > vm18-disk3 3MKB/s > I/O speed, by writing follow xenstore key-values. > /local/domain/17/device/vbd/768/tokens-rate = "4096" > /local/domain/17/device/vbd/2048/tokens-rate = "1024" > /local/domain/18/device/vbd/768/tokens-rate = "3096" > > > Why can''t it be done with dm-ioband? > Of cause, I/O speed limit also can be done with dm-ioband. > But with my patch, there is no need to load dm-ioband any more. > This patch do speed-limit more close disk, more lightweight.I am not convienced this will be easier to maintain than using existing code (dm-ioband) that Linux kernel provides already. Are there other technical reasons ''dm-ioband'' is not sufficient enough? Could it be possible to fix ''dm-ioband'' to not have those bugs? Florian mentioned flush requests not passing through the DM layers but I am pretty sure those have been fixed. _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Andrew Xu
2011-Jun-22 14:12 UTC
Re[2]: [Xen-users] Re: [Xen-devel] VM disk I/O limit patch
On Wed, 22 Jun 2011 09:11:21 -0400 Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:> On Wed, Jun 22, 2011 at 08:06:23PM +0800, Andrew Xu wrote: > > > > On Tue, 21 Jun 2011 09:33:37 -0400 > > Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote: > > > > > On Tue, Jun 21, 2011 at 04:29:35PM +0800, Andrew Xu wrote: > > > > Hi all, > > > > > > > > I add a blkback QoS patch. > > > > > > What tree is this against? > > This patch is based on suse11.sp1(2.6.32) xen-blkback source. > > (2.6.18 "Xenlinux" based source trees?) > > > > > There is a xen-blkback in 3.0-rc4, can you rebase > > > it against that please. > > > > > Ok, I will rebase it. > > Hold on, lets talk about the problem you are trying to solve first. > > > > > What is the patch solving? > > > > > With this path, you can set different speed I/O for different VM disk. > > For example, I set vm17-disk1 4MKB/s > > vm17-disk2 1MKB/s > > vm18-disk3 3MKB/s > > I/O speed, by writing follow xenstore key-values. > > /local/domain/17/device/vbd/768/tokens-rate = "4096" > > /local/domain/17/device/vbd/2048/tokens-rate = "1024" > > /local/domain/18/device/vbd/768/tokens-rate = "3096" > > > > > Why can''t it be done with dm-ioband? > > Of cause, I/O speed limit also can be done with dm-ioband. > > But with my patch, there is no need to load dm-ioband any more. > > This patch do speed-limit more close disk, more lightweight. > > I am not convienced this will be easier to maintain than > using existing code (dm-ioband) that Linux kernel provides already. > > Are there other technical reasons ''dm-ioband'' is not sufficient > enough? Could it be possible to fix ''dm-ioband'' to not have those > bugs? Florian mentioned flush requests not passing through > the DM layers but I am pretty sure those have been fixed. >I don''t find dm-ioband''s bug, so I can''t answer your question. But, xen-vm-disk I/O limitation shoud done by xen module, is not it?> _______________________________________________ > Xen-users mailing list > Xen-users@lists.xensource.com > http://lists.xensource.com/xen-users_______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Konrad Rzeszutek Wilk
2011-Jun-22 14:39 UTC
Re: [Xen-users] Re: [Xen-devel] VM disk I/O limit patch
> > I am not convienced this will be easier to maintain than > > using existing code (dm-ioband) that Linux kernel provides already. > > > > Are there other technical reasons ''dm-ioband'' is not sufficient > > enough? Could it be possible to fix ''dm-ioband'' to not have those > > bugs? Florian mentioned flush requests not passing through > > the DM layers but I am pretty sure those have been fixed. > > > I don''t find dm-ioband''s bug, so I can''t answer your question. > > But, xen-vm-disk I/O limitation shoud done by xen module, is not it?Not all I/O''s for a guest go through the xen-blkback. For example the tap, qcow, are all inside of QEMU which has no notion of xen-blkback (this is upstream QEMU BTW) - so they would not benefit from this at all. While if all of the "disks" that are assigned to a guest go through dm-ioband the system admin has only one interface that can cover _all_ of the I/O sinks. And it is standard enough so that if that system admin is switching over from KVM to Xen they don''t have to alter a lot of their infrastructure. _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Does this match only limit throughput or can it also limit the guest by disk IOPS? christopher aker had a patch way back for UML that did disk based qos. What i really liked about that patch was that it allowed for bursting by using a bucket. If i remember correctly you specified that a guest''s bucket could hold say 4000 tokens, and the bucket would be filled with 10 tokens a second. Each IO took one token from the bucket. When the bucket was empty IO was paused and processed as the bucket was filled. This allowed a guest to burst for a short period of time until that bucket was empty and then it would slowely be filled back up. Also what was nice is that the guest had a /proc/ entry that told the customer how many tokens they currently had in their bucket. I would like to see somthing like this in Xen, I''ve even thought about posting to the devel forums seeing if somebody wanted to write one for $$$ ~Shaun _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Ian Jackson
2011-Jun-24 14:29 UTC
Re[2]: [Xen-users] Re: [Xen-devel] VM disk I/O limit patch
Andrew Xu writes ("Re[2]: [Xen-users] Re: [Xen-devel] VM disk I/O limit patch"):> On Wed, 22 Jun 2011 09:11:21 -0400 > Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote: > > I am not convienced this will be easier to maintain than > > using existing code (dm-ioband) that Linux kernel provides already. > > But, xen-vm-disk I/O limitation shoud done by xen module, is not it?Certainly not. It is much better for Xen systems to use general facilities where those are sufficient. Ian. _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
On Thu, Jun 23, 2011 at 01:45:36PM -0700, Shaun Reitan wrote:> Does this match only limit throughput or can it also limit the guest > by disk IOPS? christopher aker had a patch way back for UML thatJust throughpout.> did disk based qos. What i really liked about that patch was that > it allowed for bursting by using a bucket. If i remember correctly > you specified that a guest''s bucket could hold say 4000 tokens, and > the bucket would be filled with 10 tokens a second. Each IO took > one token from the bucket. When the bucket was empty IO was paused > and processed as the bucket was filled. This allowed a guest to > burst for a short period of time until that bucket was empty and > then it would slowely be filled back up.Uhhh... are you sure you are talking about the same patch.> > Also what was nice is that the guest had a /proc/ entry that told > the customer how many tokens they currently had in their bucket.OK.. but how would this help the customers? They don''t have access to the /proc in Dom0.> > I would like to see somthing like this in Xen, I''ve even thought > about posting to the devel forums seeing if somebody wanted to write > one for $$$Why not use dm-ioband (here is a doc about it: http://lwn.net/Articles/344441/) which has much more options and also provide the bucket and tokens you are looking for. [edit: Looks like dm-ioband never made it in the Linux kernel. But there was something that I thought Vivek wrote that was superior to dm-ioband.. Ah, yes: blkio-controller.txt. Look in Documentation/cgroups/blkio-controller.txt] _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
The patch i''m talking about can be found here http://theshore.net/~caker/uml/patches/ It''s called the token limiter. The entries in proc where under the guests proc. dm-ioband is something i just found a few weeks ago, i do plan to do some testing with it. ~Shaun _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Florian Heigl
2011-Jun-28 11:32 UTC
Re: [Xen-users] Re: [Xen-devel] Re: VM disk I/O limit patch
Hi Konrad, Thanks for providing links to info about both dm-ioband and bklio. This is surely something to test with and might be the best choice for Xen from 2.6.34 and up. Question/food for thought: since: - 2.6.18 still has CFQ1 which has notable issues with processes starving each other (some people have seen this and some havent, but it exists and it''s one of the worst issues that exist. Normally people will switch to deadline scheduler and ... experience they no longer can priorize now, and even then they''ll still see their dom0 go sluggish if a domU is too IO heavy) - Both blkio patch and dm-ioband are not in 2.6.18 and not even in 2.6.32(!!!) - The patch from last week was for 2.618... would it be possible to add the patch to the 2.6.18-ish Xen trees and not into the 3.x one? We could have a (hopefully) working solution for a problem that exists now on the deployments that are in use now and that could easily go into a XenServer 5.6 Patch123456 or XCP or OracleVM. This might also be the more time-conserving way to do it, since right now the cgroups mechanisms in Linux are nice, but it should be obvious that there''s still a year or two to go from setting up every single stuff via /sys after a process is started to a working solution that can be pre-configured for all VMs. Unless anybody thinks this is enough ;) 2011/6/27 Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>:> On Thu, Jun 23, 2011 at 01:45:36PM -0700, Shaun Reitan wrote: >> Does this match only limit throughput or can it also limit the guest >> by disk IOPS? christopher aker had a patch way back for UML that > > Just throughpout. > >> did disk based qos. What i really liked about that patch was that >> it allowed for bursting by using a bucket. If i remember correctlyanything that is able to employ limits and keeps them burstable is just perfect :) -- the purpose of libvirt is to provide an abstraction layer hiding all xen features added since 2006 until they were finally understood and copied by the kvm devs. _______________________________________________ Xen-users mailing list Xen-users@lists.xensource.com http://lists.xensource.com/xen-users
Konrad Rzeszutek Wilk
2011-Jun-28 13:29 UTC
Re: [Xen-users] Re: [Xen-devel] Re: VM disk I/O limit patch
On Tue, Jun 28, 2011 at 01:32:14PM +0200, Florian Heigl wrote:> Hi Konrad, > > Thanks for providing links to info about both dm-ioband and bklio. > This is surely something to test with and might be the best choice for > Xen from 2.6.34 and up. > > Question/food for thought: > since: > - 2.6.18 still has CFQ1 which has notable issues with processes > starving each other (some people have seen this and some havent, but > it exists and it''s one of the worst issues that exist. Normally people > will switch to deadline scheduler and ... experience they no longer > can priorize now, and even then they''ll still see their dom0 go > sluggish if a domU is too IO heavy) > - Both blkio patch and dm-ioband are not in 2.6.18 and not even in 2.6.32(!!!)Right, so you can upgrade to 3.0 or 2.6.39.> - The patch from last week was for 2.618...Ah, not idea who is the maintainer for the 2.6.18 tree anymore.> > would it be possible to add the patch to the 2.6.18-ish Xen trees and > not into the 3.x one?You are welcome to do this, but I don''t think anybody else is going to do this. I am definitly not going to take the patch for the 3.0 tree.> We could have a (hopefully) working solution for a problem that exists > now on the deployments that are in use now and that could easily go > into a XenServer 5.6 Patch123456 or XCP or OracleVM. > > > This might also be the more time-conserving way to do it, since right > now the cgroups mechanisms in Linux are nice, but it should be obvious > that there''s still a year or two to go from setting up every single > stuff via /sys after a process is started to a working solution that > can be pre-configured for all VMs. > > Unless anybody thinks this is enough ;) > > > 2011/6/27 Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>: > > On Thu, Jun 23, 2011 at 01:45:36PM -0700, Shaun Reitan wrote: > >> Does this match only limit throughput or can it also limit the guest > >> by disk IOPS? christopher aker had a patch way back for UML that > > > > Just throughpout. > > > >> did disk based qos. What i really liked about that patch was that > >> it allowed for bursting by using a bucket. If i remember correctly > > anything that is able to employ limits and keeps them burstable is > just perfect :) > > > -- > the purpose of libvirt is to provide an abstraction layer hiding all > xen features added since 2006 until they were finally understood and > copied by the kvm devs._______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel