Blue Swirl
2013-Jan-04 21:01 UTC
Re: [Qemu-devel] [PATCH RFC 3/3] xen_disk: add persistent grant support to xen_disk backend
On Mon, Dec 31, 2012 at 12:16 PM, Roger Pau Monne <roger.pau@citrix.com> wrote:> This protocol extension reuses the same set of grant pages for all > transactions between the front/back drivers, avoiding expensive tlb > flushes, grant table lock contention and switches between userspace > and kernel space. The full description of the protocol can be found in > the public blkif.h header. > > Speed improvement with 15 guests performing I/O is ~450%. > > Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> > Cc: xen-devel@lists.xen.org > Cc: Stefano Stabellini <Stefano.Stabellini@eu.citrix.com> > Cc: Anthony PERARD <anthony.perard@citrix.com> > --- > Performance comparison with the previous implementation can be seen in > the followign graph: > > http://xenbits.xen.org/people/royger/persistent_read_qemu.png > --- > hw/xen_disk.c | 155 ++++++++++++++++++++++++++++++++++++++++++++++++++------ > 1 files changed, 138 insertions(+), 17 deletions(-) > > diff --git a/hw/xen_disk.c b/hw/xen_disk.c > index 1eb485a..bafeceb 100644 > --- a/hw/xen_disk.c > +++ b/hw/xen_disk.c > @@ -52,6 +52,11 @@ static int max_requests = 32; > #define BLOCK_SIZE 512 > #define IOCB_COUNT (BLKIF_MAX_SEGMENTS_PER_REQUEST + 2) > > +struct persistent_gnt {This should be PersistentGrant and don't forget to add a typedef.> + void *page; > + struct XenBlkDev *blkdev; > +}; > + > struct ioreq { > blkif_request_t req; > int16_t status; > @@ -69,6 +74,7 @@ struct ioreq { > int prot; > void *page[BLKIF_MAX_SEGMENTS_PER_REQUEST]; > void *pages; > + int num_unmap; > > /* aio status */ > int aio_inflight; > @@ -105,6 +111,12 @@ struct XenBlkDev { > int requests_inflight; > int requests_finished; > > + /* Persistent grants extension */ > + gboolean feature_persistent; > + GTree *persistent_gnts; > + unsigned int persistent_gnt_c; > + unsigned int max_grants; > + > /* qemu block driver */ > DriveInfo *dinfo; > BlockDriverState *bs; > @@ -138,6 +150,29 @@ static void ioreq_reset(struct ioreq *ioreq) > qemu_iovec_reset(&ioreq->v); > } > > +static gint int_cmp(gconstpointer a, gconstpointer b, gpointer user_data) > +{ > + uint ua = GPOINTER_TO_UINT(a); > + uint ub = GPOINTER_TO_UINT(b); > + return (ua > ub) - (ua < ub); > +} > + > +static void destroy_grant(gpointer pgnt) > +{ > + struct persistent_gnt *grant = pgnt; > + XenGnttab gnt = grant->blkdev->xendev.gnttabdev; > + > + if (xc_gnttab_munmap(gnt, grant->page, 1) != 0) { > + xen_be_printf(&grant->blkdev->xendev, 0, > + "xc_gnttab_munmap failed: %s\n", > + strerror(errno)); > + } > + grant->blkdev->persistent_gnt_c--; > + xen_be_printf(&grant->blkdev->xendev, 3, > + "unmapped grant %p\n", grant->page); > + g_free(grant); > +} > + > static struct ioreq *ioreq_start(struct XenBlkDev *blkdev) > { > struct ioreq *ioreq = NULL; > @@ -266,21 +301,21 @@ static void ioreq_unmap(struct ioreq *ioreq) > XenGnttab gnt = ioreq->blkdev->xendev.gnttabdev; > int i; > > - if (ioreq->v.niov == 0 || ioreq->mapped == 0) { > + if (ioreq->num_unmap == 0 || ioreq->mapped == 0) { > return; > } > if (batch_maps) { > if (!ioreq->pages) { > return; > } > - if (xc_gnttab_munmap(gnt, ioreq->pages, ioreq->v.niov) != 0) { > + if (xc_gnttab_munmap(gnt, ioreq->pages, ioreq->num_unmap) != 0) { > xen_be_printf(&ioreq->blkdev->xendev, 0, "xc_gnttab_munmap failed: %s\n", > strerror(errno)); > } > - ioreq->blkdev->cnt_map -= ioreq->v.niov; > + ioreq->blkdev->cnt_map -= ioreq->num_unmap; > ioreq->pages = NULL; > } else { > - for (i = 0; i < ioreq->v.niov; i++) { > + for (i = 0; i < ioreq->num_unmap; i++) { > if (!ioreq->page[i]) { > continue; > } > @@ -298,41 +333,107 @@ static void ioreq_unmap(struct ioreq *ioreq) > static int ioreq_map(struct ioreq *ioreq) > { > XenGnttab gnt = ioreq->blkdev->xendev.gnttabdev; > - int i; > + uint32_t domids[BLKIF_MAX_SEGMENTS_PER_REQUEST]; > + uint32_t refs[BLKIF_MAX_SEGMENTS_PER_REQUEST]; > + void *page[BLKIF_MAX_SEGMENTS_PER_REQUEST]; > + int i, j, new_maps = 0; > + struct persistent_gnt *grant; > > if (ioreq->v.niov == 0 || ioreq->mapped == 1) { > return 0; > } > - if (batch_maps) { > + if (ioreq->blkdev->feature_persistent) { > + for (i = 0; i < ioreq->v.niov; i++) { > + grant = g_tree_lookup(ioreq->blkdev->persistent_gnts, > + GUINT_TO_POINTER(ioreq->refs[i])); > + > + if (grant != NULL) { > + page[i] = grant->page; > + xen_be_printf(&ioreq->blkdev->xendev, 3, > + "using persistent-grant %" PRIu32 "\n", > + ioreq->refs[i]); > + } else { > + /* Add the grant to the list of grants that > + * should be mapped > + */ > + domids[new_maps] = ioreq->domids[i]; > + refs[new_maps] = ioreq->refs[i]; > + page[i] = NULL; > + new_maps++; > + } > + } > + /* Set the protection to RW, since grants may be reused later > + * with a different protection than the one needed for this request > + */ > + ioreq->prot = PROT_WRITE | PROT_READ; > + } else { > + /* All grants in the request should be mapped */ > + memcpy(refs, ioreq->refs, sizeof(refs)); > + memcpy(domids, ioreq->domids, sizeof(domids)); > + memset(page, 0, sizeof(page)); > + new_maps = ioreq->v.niov; > + } > + > + if (batch_maps && new_maps) { > ioreq->pages = xc_gnttab_map_grant_refs > - (gnt, ioreq->v.niov, ioreq->domids, ioreq->refs, ioreq->prot); > + (gnt, new_maps, domids, refs, ioreq->prot); > if (ioreq->pages == NULL) { > xen_be_printf(&ioreq->blkdev->xendev, 0, > "can't map %d grant refs (%s, %d maps)\n", > - ioreq->v.niov, strerror(errno), ioreq->blkdev->cnt_map); > + new_maps, strerror(errno), ioreq->blkdev->cnt_map); > return -1; > } > - for (i = 0; i < ioreq->v.niov; i++) { > - ioreq->v.iov[i].iov_base = ioreq->pages + i * XC_PAGE_SIZE + > - (uintptr_t)ioreq->v.iov[i].iov_base; > + for (i = 0, j = 0; i < ioreq->v.niov; i++) { > + if (page[i] == NULL) > + page[i] = ioreq->pages + (j++) * XC_PAGE_SIZE; > } > - ioreq->blkdev->cnt_map += ioreq->v.niov; > - } else { > - for (i = 0; i < ioreq->v.niov; i++) { > + ioreq->blkdev->cnt_map += new_maps; > + } else if (new_maps) { > + for (i = 0; i < new_maps; i++) { > ioreq->page[i] = xc_gnttab_map_grant_ref > - (gnt, ioreq->domids[i], ioreq->refs[i], ioreq->prot); > + (gnt, domids[i], refs[i], ioreq->prot); > if (ioreq->page[i] == NULL) { > xen_be_printf(&ioreq->blkdev->xendev, 0, > "can't map grant ref %d (%s, %d maps)\n", > - ioreq->refs[i], strerror(errno), ioreq->blkdev->cnt_map); > + refs[i], strerror(errno), ioreq->blkdev->cnt_map); > ioreq_unmap(ioreq); > return -1; > } > - ioreq->v.iov[i].iov_base = ioreq->page[i] + (uintptr_t)ioreq->v.iov[i].iov_base; > ioreq->blkdev->cnt_map++; > } > + for (i = 0, j = 0; i < ioreq->v.niov; i++) { > + if (page[i] == NULL) > + page[i] = ioreq->page[j++]; > + } > + } > + if (ioreq->blkdev->feature_persistent) { > + while((ioreq->blkdev->persistent_gnt_c < ioreq->blkdev->max_grants) && > + new_maps) { > + /* Go through the list of newly mapped grants and add as many > + * as possible to the list of persistently mapped grants > + */ > + grant = g_malloc0(sizeof(*grant)); > + new_maps--; > + if (batch_maps) > + grant->page = ioreq->pages + (new_maps) * XC_PAGE_SIZE; > + else > + grant->page = ioreq->page[new_maps]; > + grant->blkdev = ioreq->blkdev; > + xen_be_printf(&ioreq->blkdev->xendev, 3, > + "adding grant %" PRIu32 " page: %p\n", > + refs[new_maps], grant->page); > + g_tree_insert(ioreq->blkdev->persistent_gnts, > + GUINT_TO_POINTER(refs[new_maps]), > + grant); > + ioreq->blkdev->persistent_gnt_c++; > + } > + } > + for (i = 0; i < ioreq->v.niov; i++) { > + ioreq->v.iov[i].iov_base = page[i] + > + (uintptr_t)ioreq->v.iov[i].iov_base; > } > ioreq->mapped = 1; > + ioreq->num_unmap = new_maps; > return 0; > } > > @@ -690,6 +791,7 @@ static int blk_init(struct XenDevice *xendev) > > /* fill info */ > xenstore_write_be_int(&blkdev->xendev, "feature-barrier", 1); > + xenstore_write_be_int(&blkdev->xendev, "feature-persistent", 1); > xenstore_write_be_int(&blkdev->xendev, "info", info); > xenstore_write_be_int(&blkdev->xendev, "sector-size", blkdev->file_blk); > xenstore_write_be_int(&blkdev->xendev, "sectors", > @@ -713,6 +815,7 @@ out_error: > static int blk_connect(struct XenDevice *xendev) > { > struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev); > + int pers; > > if (xenstore_read_fe_int(&blkdev->xendev, "ring-ref", &blkdev->ring_ref) == -1) { > return -1; > @@ -721,6 +824,11 @@ static int blk_connect(struct XenDevice *xendev) > &blkdev->xendev.remote_port) == -1) { > return -1; > } > + if (xenstore_read_fe_int(&blkdev->xendev, "feature-persistent", &pers)) { > + blkdev->feature_persistent = FALSE; > + } else { > + blkdev->feature_persistent = !!pers; > + } > > blkdev->protocol = BLKIF_PROTOCOL_NATIVE; > if (blkdev->xendev.protocol) { > @@ -764,6 +872,15 @@ static int blk_connect(struct XenDevice *xendev) > } > } > > + if (blkdev->feature_persistent) { > + /* Init persistent grants */ > + blkdev->max_grants = max_requests * BLKIF_MAX_SEGMENTS_PER_REQUEST; > + blkdev->persistent_gnts = g_tree_new_full((GCompareDataFunc)int_cmp, > + NULL, NULL, > + (GDestroyNotify)destroy_grant); > + blkdev->persistent_gnt_c = 0; > + } > + > xen_be_bind_evtchn(&blkdev->xendev); > > xen_be_printf(&blkdev->xendev, 1, "ok: proto %s, ring-ref %d, " > @@ -804,6 +921,10 @@ static int blk_free(struct XenDevice *xendev) > blk_disconnect(xendev); > } > > + /* Free persistent grants */ > + if (blkdev->feature_persistent) > + g_tree_destroy(blkdev->persistent_gnts); > + > while (!QLIST_EMPTY(&blkdev->freelist)) { > ioreq = QLIST_FIRST(&blkdev->freelist); > QLIST_REMOVE(ioreq, list); > -- > 1.7.7.5 (Apple Git-26) > >_______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel