Roger Pau Monne
2012-Dec-31 12:16 UTC
[PATCH RFC 3/3] xen_disk: add persistent grant support to xen_disk backend
This protocol extension reuses the same set of grant pages for all
transactions between the front/back drivers, avoiding expensive TLB
flushes, grant table lock contention and switches between userspace
and kernel space. The full description of the protocol can be found in
the public blkif.h header.

Speed improvement with 15 guests performing I/O is ~450%.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Cc: xen-devel@lists.xen.org
Cc: Stefano Stabellini <Stefano.Stabellini@eu.citrix.com>
Cc: Anthony PERARD <anthony.perard@citrix.com>
---
Performance comparison with the previous implementation can be seen in
the following graph:

http://xenbits.xen.org/people/royger/persistent_read_qemu.png
---
 hw/xen_disk.c |  155 ++++++++++++++++++++++++++++++++++++++++++++++++++------
 1 files changed, 138 insertions(+), 17 deletions(-)

diff --git a/hw/xen_disk.c b/hw/xen_disk.c
index 1eb485a..bafeceb 100644
--- a/hw/xen_disk.c
+++ b/hw/xen_disk.c
@@ -52,6 +52,11 @@ static int max_requests = 32;
 #define BLOCK_SIZE  512
 #define IOCB_COUNT  (BLKIF_MAX_SEGMENTS_PER_REQUEST + 2)
 
+struct persistent_gnt {
+    void *page;
+    struct XenBlkDev *blkdev;
+};
+
 struct ioreq {
     blkif_request_t     req;
     int16_t             status;
@@ -69,6 +74,7 @@ struct ioreq {
     int                 prot;
     void                *page[BLKIF_MAX_SEGMENTS_PER_REQUEST];
     void                *pages;
+    int                 num_unmap;
 
     /* aio status */
     int                 aio_inflight;
@@ -105,6 +111,12 @@ struct XenBlkDev {
     int                 requests_inflight;
     int                 requests_finished;
 
+    /* Persistent grants extension */
+    gboolean            feature_persistent;
+    GTree               *persistent_gnts;
+    unsigned int        persistent_gnt_c;
+    unsigned int        max_grants;
+
     /* qemu block driver */
     DriveInfo           *dinfo;
     BlockDriverState    *bs;
@@ -138,6 +150,29 @@ static void ioreq_reset(struct ioreq *ioreq)
     qemu_iovec_reset(&ioreq->v);
 }
 
+static gint int_cmp(gconstpointer a, gconstpointer b, gpointer user_data)
+{
+    uint ua = GPOINTER_TO_UINT(a);
+    uint ub = GPOINTER_TO_UINT(b);
+    return (ua > ub) - (ua < ub);
+}
+
+static void destroy_grant(gpointer pgnt)
+{
+    struct persistent_gnt *grant = pgnt;
+    XenGnttab gnt = grant->blkdev->xendev.gnttabdev;
+
+    if (xc_gnttab_munmap(gnt, grant->page, 1) != 0) {
+        xen_be_printf(&grant->blkdev->xendev, 0,
+                      "xc_gnttab_munmap failed: %s\n",
+                      strerror(errno));
+    }
+    grant->blkdev->persistent_gnt_c--;
+    xen_be_printf(&grant->blkdev->xendev, 3,
+                  "unmapped grant %p\n", grant->page);
+    g_free(grant);
+}
+
 static struct ioreq *ioreq_start(struct XenBlkDev *blkdev)
 {
     struct ioreq *ioreq = NULL;
@@ -266,21 +301,21 @@ static void ioreq_unmap(struct ioreq *ioreq)
     XenGnttab gnt = ioreq->blkdev->xendev.gnttabdev;
     int i;
 
-    if (ioreq->v.niov == 0 || ioreq->mapped == 0) {
+    if (ioreq->num_unmap == 0 || ioreq->mapped == 0) {
         return;
     }
     if (batch_maps) {
         if (!ioreq->pages) {
             return;
         }
-        if (xc_gnttab_munmap(gnt, ioreq->pages, ioreq->v.niov) != 0) {
+        if (xc_gnttab_munmap(gnt, ioreq->pages, ioreq->num_unmap) != 0) {
             xen_be_printf(&ioreq->blkdev->xendev, 0, "xc_gnttab_munmap failed: %s\n",
                           strerror(errno));
         }
-        ioreq->blkdev->cnt_map -= ioreq->v.niov;
+        ioreq->blkdev->cnt_map -= ioreq->num_unmap;
         ioreq->pages = NULL;
     } else {
-        for (i = 0; i < ioreq->v.niov; i++) {
+        for (i = 0; i < ioreq->num_unmap; i++) {
             if (!ioreq->page[i]) {
                 continue;
             }
@@ -298,41 +333,107 @@ static void ioreq_unmap(struct ioreq *ioreq)
 static int ioreq_map(struct ioreq *ioreq)
 {
     XenGnttab gnt = ioreq->blkdev->xendev.gnttabdev;
-    int i;
+    uint32_t domids[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+    uint32_t refs[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+    void *page[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+    int i, j, new_maps = 0;
+    struct persistent_gnt *grant;
 
     if (ioreq->v.niov == 0 || ioreq->mapped == 1) {
         return 0;
     }
-    if (batch_maps) {
+    if (ioreq->blkdev->feature_persistent) {
+        for (i = 0; i < ioreq->v.niov; i++) {
+            grant = g_tree_lookup(ioreq->blkdev->persistent_gnts,
+                                  GUINT_TO_POINTER(ioreq->refs[i]));
+
+            if (grant != NULL) {
+                page[i] = grant->page;
+                xen_be_printf(&ioreq->blkdev->xendev, 3,
+                              "using persistent-grant %" PRIu32 "\n",
+                              ioreq->refs[i]);
+            } else {
+                /* Add the grant to the list of grants that
+                 * should be mapped
+                 */
+                domids[new_maps] = ioreq->domids[i];
+                refs[new_maps] = ioreq->refs[i];
+                page[i] = NULL;
+                new_maps++;
+            }
+        }
+        /* Set the protection to RW, since grants may be reused later
+         * with a different protection than the one needed for this request
+         */
+        ioreq->prot = PROT_WRITE | PROT_READ;
+    } else {
+        /* All grants in the request should be mapped */
+        memcpy(refs, ioreq->refs, sizeof(refs));
+        memcpy(domids, ioreq->domids, sizeof(domids));
+        memset(page, 0, sizeof(page));
+        new_maps = ioreq->v.niov;
+    }
+
+    if (batch_maps && new_maps) {
         ioreq->pages = xc_gnttab_map_grant_refs
-        (gnt, ioreq->v.niov, ioreq->domids, ioreq->refs, ioreq->prot);
+        (gnt, new_maps, domids, refs, ioreq->prot);
         if (ioreq->pages == NULL) {
             xen_be_printf(&ioreq->blkdev->xendev, 0,
                           "can't map %d grant refs (%s, %d maps)\n",
-                          ioreq->v.niov, strerror(errno), ioreq->blkdev->cnt_map);
+                          new_maps, strerror(errno), ioreq->blkdev->cnt_map);
             return -1;
         }
-        for (i = 0; i < ioreq->v.niov; i++) {
-            ioreq->v.iov[i].iov_base = ioreq->pages + i * XC_PAGE_SIZE +
-                (uintptr_t)ioreq->v.iov[i].iov_base;
+        for (i = 0, j = 0; i < ioreq->v.niov; i++) {
+            if (page[i] == NULL)
+                page[i] = ioreq->pages + (j++) * XC_PAGE_SIZE;
         }
-        ioreq->blkdev->cnt_map += ioreq->v.niov;
-    } else {
-        for (i = 0; i < ioreq->v.niov; i++) {
+        ioreq->blkdev->cnt_map += new_maps;
+    } else if (new_maps) {
+        for (i = 0; i < new_maps; i++) {
             ioreq->page[i] = xc_gnttab_map_grant_ref
-            (gnt, ioreq->domids[i], ioreq->refs[i], ioreq->prot);
+            (gnt, domids[i], refs[i], ioreq->prot);
             if (ioreq->page[i] == NULL) {
                 xen_be_printf(&ioreq->blkdev->xendev, 0,
                               "can't map grant ref %d (%s, %d maps)\n",
-                              ioreq->refs[i], strerror(errno), ioreq->blkdev->cnt_map);
+                              refs[i], strerror(errno), ioreq->blkdev->cnt_map);
                 ioreq_unmap(ioreq);
                 return -1;
             }
-            ioreq->v.iov[i].iov_base = ioreq->page[i] + (uintptr_t)ioreq->v.iov[i].iov_base;
             ioreq->blkdev->cnt_map++;
         }
+        for (i = 0, j = 0; i < ioreq->v.niov; i++) {
+            if (page[i] == NULL)
+                page[i] = ioreq->page[j++];
+        }
+    }
+    if (ioreq->blkdev->feature_persistent) {
+        while((ioreq->blkdev->persistent_gnt_c < ioreq->blkdev->max_grants) &&
+              new_maps) {
+            /* Go through the list of newly mapped grants and add as many
+             * as possible to the list of persistently mapped grants
+             */
+            grant = g_malloc0(sizeof(*grant));
+            new_maps--;
+            if (batch_maps)
+                grant->page = ioreq->pages + (new_maps) * XC_PAGE_SIZE;
+            else
+                grant->page = ioreq->page[new_maps];
+            grant->blkdev = ioreq->blkdev;
+            xen_be_printf(&ioreq->blkdev->xendev, 3,
+                          "adding grant %" PRIu32 " page: %p\n",
+                          refs[new_maps], grant->page);
+            g_tree_insert(ioreq->blkdev->persistent_gnts,
+                          GUINT_TO_POINTER(refs[new_maps]),
+                          grant);
+            ioreq->blkdev->persistent_gnt_c++;
+        }
+    }
+    for (i = 0; i < ioreq->v.niov; i++) {
+        ioreq->v.iov[i].iov_base = page[i] +
+            (uintptr_t)ioreq->v.iov[i].iov_base;
     }
     ioreq->mapped = 1;
+    ioreq->num_unmap = new_maps;
     return 0;
 }
 
@@ -690,6 +791,7 @@ static int blk_init(struct XenDevice *xendev)
 
     /* fill info */
     xenstore_write_be_int(&blkdev->xendev, "feature-barrier", 1);
+    xenstore_write_be_int(&blkdev->xendev, "feature-persistent", 1);
     xenstore_write_be_int(&blkdev->xendev, "info", info);
     xenstore_write_be_int(&blkdev->xendev, "sector-size", blkdev->file_blk);
     xenstore_write_be_int(&blkdev->xendev, "sectors",
@@ -713,6 +815,7 @@ out_error:
 static int blk_connect(struct XenDevice *xendev)
 {
     struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
+    int pers;
 
     if (xenstore_read_fe_int(&blkdev->xendev, "ring-ref", &blkdev->ring_ref) == -1) {
         return -1;
@@ -721,6 +824,11 @@ static int blk_connect(struct XenDevice *xendev)
                              &blkdev->xendev.remote_port) == -1) {
         return -1;
     }
+    if (xenstore_read_fe_int(&blkdev->xendev, "feature-persistent", &pers)) {
+        blkdev->feature_persistent = FALSE;
+    } else {
+        blkdev->feature_persistent = !!pers;
+    }
 
     blkdev->protocol = BLKIF_PROTOCOL_NATIVE;
     if (blkdev->xendev.protocol) {
@@ -764,6 +872,15 @@ static int blk_connect(struct XenDevice *xendev)
         }
     }
 
+    if (blkdev->feature_persistent) {
+        /* Init persistent grants */
+        blkdev->max_grants = max_requests * BLKIF_MAX_SEGMENTS_PER_REQUEST;
+        blkdev->persistent_gnts = g_tree_new_full((GCompareDataFunc)int_cmp,
+                                                  NULL, NULL,
+                                                  (GDestroyNotify)destroy_grant);
+        blkdev->persistent_gnt_c = 0;
+    }
+
     xen_be_bind_evtchn(&blkdev->xendev);
 
     xen_be_printf(&blkdev->xendev, 1, "ok: proto %s, ring-ref %d, "
@@ -804,6 +921,10 @@ static int blk_free(struct XenDevice *xendev)
         blk_disconnect(xendev);
     }
 
+    /* Free persistent grants */
+    if (blkdev->feature_persistent)
+        g_tree_destroy(blkdev->persistent_gnts);
+
     while (!QLIST_EMPTY(&blkdev->freelist)) {
         ioreq = QLIST_FIRST(&blkdev->freelist);
         QLIST_REMOVE(ioreq, list);
-- 
1.7.7.5 (Apple Git-26)
Stefano Stabellini
2013-Jan-04 16:42 UTC
Re: [PATCH RFC 3/3] xen_disk: add persistent grant support to xen_disk backend
On Mon, 31 Dec 2012, Roger Pau Monne wrote:
> This protocol extension reuses the same set of grant pages for all
> transactions between the front/back drivers, avoiding expensive TLB
> flushes, grant table lock contention and switches between userspace
> and kernel space. The full description of the protocol can be found in
> the public blkif.h header.

it would be great if you could add a link to a webpage too

> Speed improvement with 15 guests performing I/O is ~450%.
>
> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> Cc: xen-devel@lists.xen.org
> Cc: Stefano Stabellini <Stefano.Stabellini@eu.citrix.com>
> Cc: Anthony PERARD <anthony.perard@citrix.com>
> ---
> Performance comparison with the previous implementation can be seen in
> the following graph:
>
> http://xenbits.xen.org/people/royger/persistent_read_qemu.png

this is what I like to see!

[...]

> @@ -105,6 +111,12 @@ struct XenBlkDev {
>      int                 requests_inflight;
>      int                 requests_finished;
>  
> +    /* Persistent grants extension */
> +    gboolean            feature_persistent;
> +    GTree               *persistent_gnts;
> +    unsigned int        persistent_gnt_c;

can you come up with a better name for this variable?

> +    unsigned int        max_grants;

[...]

> @@ -266,21 +301,21 @@ static void ioreq_unmap(struct ioreq *ioreq)
>      XenGnttab gnt = ioreq->blkdev->xendev.gnttabdev;
>      int i;
>  
> -    if (ioreq->v.niov == 0 || ioreq->mapped == 0) {
> +    if (ioreq->num_unmap == 0 || ioreq->mapped == 0) {
>          return;
>      }
>      if (batch_maps) {
>          if (!ioreq->pages) {
>              return;
>          }
> -        if (xc_gnttab_munmap(gnt, ioreq->pages, ioreq->v.niov) != 0) {
> +        if (xc_gnttab_munmap(gnt, ioreq->pages, ioreq->num_unmap) != 0) {
>              xen_be_printf(&ioreq->blkdev->xendev, 0, "xc_gnttab_munmap failed: %s\n",
>                            strerror(errno));
>          }

Could you please add a comment saying that only the first num_unmap
pages are going to be unmapped here? The others are going to be
unmapped one by one by destroy_grant.

> -        ioreq->blkdev->cnt_map -= ioreq->v.niov;
> +        ioreq->blkdev->cnt_map -= ioreq->num_unmap;
>          ioreq->pages = NULL;
>      } else {
> -        for (i = 0; i < ioreq->v.niov; i++) {
> +        for (i = 0; i < ioreq->num_unmap; i++) {
>              if (!ioreq->page[i]) {
>                  continue;
>              }

I would be tempted to remove persistent grants support for the
!batch_maps case, if it is going to make the code simpler

> @@ -298,41 +333,107 @@ static void ioreq_unmap(struct ioreq *ioreq)
>  static int ioreq_map(struct ioreq *ioreq)
>  {
>      XenGnttab gnt = ioreq->blkdev->xendev.gnttabdev;
> -    int i;
> +    uint32_t domids[BLKIF_MAX_SEGMENTS_PER_REQUEST];
> +    uint32_t refs[BLKIF_MAX_SEGMENTS_PER_REQUEST];
> +    void *page[BLKIF_MAX_SEGMENTS_PER_REQUEST];
> +    int i, j, new_maps = 0;
> +    struct persistent_gnt *grant;
>  
>      if (ioreq->v.niov == 0 || ioreq->mapped == 1) {
>          return 0;
>      }
> -    if (batch_maps) {
> +    if (ioreq->blkdev->feature_persistent) {
> +        for (i = 0; i < ioreq->v.niov; i++) {
> +            grant = g_tree_lookup(ioreq->blkdev->persistent_gnts,
> +                                  GUINT_TO_POINTER(ioreq->refs[i]));
> +
> +            if (grant != NULL) {
> +                page[i] = grant->page;
> +                xen_be_printf(&ioreq->blkdev->xendev, 3,
> +                              "using persistent-grant %" PRIu32 "\n",
> +                              ioreq->refs[i]);
> +            } else {
> +                /* Add the grant to the list of grants that
> +                 * should be mapped
> +                 */
> +                domids[new_maps] = ioreq->domids[i];
> +                refs[new_maps] = ioreq->refs[i];
> +                page[i] = NULL;
> +                new_maps++;
> +            }
> +        }
> +        /* Set the protection to RW, since grants may be reused later
> +         * with a different protection than the one needed for this request
> +         */
> +        ioreq->prot = PROT_WRITE | PROT_READ;
> +    } else {
> +        /* All grants in the request should be mapped */
> +        memcpy(refs, ioreq->refs, sizeof(refs));
> +        memcpy(domids, ioreq->domids, sizeof(domids));
> +        memset(page, 0, sizeof(page));
> +        new_maps = ioreq->v.niov;
> +    }
> +
> +    if (batch_maps && new_maps) {
>          ioreq->pages = xc_gnttab_map_grant_refs
> -        (gnt, ioreq->v.niov, ioreq->domids, ioreq->refs, ioreq->prot);
> +        (gnt, new_maps, domids, refs, ioreq->prot);
>          if (ioreq->pages == NULL) {
>              xen_be_printf(&ioreq->blkdev->xendev, 0,
>                            "can't map %d grant refs (%s, %d maps)\n",
> -                          ioreq->v.niov, strerror(errno), ioreq->blkdev->cnt_map);
> +                          new_maps, strerror(errno), ioreq->blkdev->cnt_map);
>              return -1;
>          }
> -        for (i = 0; i < ioreq->v.niov; i++) {
> -            ioreq->v.iov[i].iov_base = ioreq->pages + i * XC_PAGE_SIZE +
> -                (uintptr_t)ioreq->v.iov[i].iov_base;
> +        for (i = 0, j = 0; i < ioreq->v.niov; i++) {
> +            if (page[i] == NULL)
> +                page[i] = ioreq->pages + (j++) * XC_PAGE_SIZE;
>          }

Code style.

Is it correct to assume that the refs to map are always the first set?
What if the first 10 refs are already mapped but 10-20 are not? It
seems to me that we are going to try to map the first 10 again.
Considering that this behaviour can be triggered by the frontend, it is
a potential security issue too.

This loop should be shared between the two cases batch_maps and
!batch_maps.

> -        ioreq->blkdev->cnt_map += ioreq->v.niov;
> -    } else {
> -        for (i = 0; i < ioreq->v.niov; i++) {
> +        ioreq->blkdev->cnt_map += new_maps;
> +    } else if (new_maps) {
> +        for (i = 0; i < new_maps; i++) {
>              ioreq->page[i] = xc_gnttab_map_grant_ref
> -            (gnt, ioreq->domids[i], ioreq->refs[i], ioreq->prot);
> +            (gnt, domids[i], refs[i], ioreq->prot);

same assumption

>              if (ioreq->page[i] == NULL) {
>                  xen_be_printf(&ioreq->blkdev->xendev, 0,
>                                "can't map grant ref %d (%s, %d maps)\n",
> -                              ioreq->refs[i], strerror(errno), ioreq->blkdev->cnt_map);
> +                              refs[i], strerror(errno), ioreq->blkdev->cnt_map);
>                  ioreq_unmap(ioreq);
>                  return -1;
>              }
> -            ioreq->v.iov[i].iov_base = ioreq->page[i] + (uintptr_t)ioreq->v.iov[i].iov_base;
>              ioreq->blkdev->cnt_map++;
>          }
> +        for (i = 0, j = 0; i < ioreq->v.niov; i++) {
> +            if (page[i] == NULL)
> +                page[i] = ioreq->page[j++];
> +        }

same comments about this loop

> +    }
> +    if (ioreq->blkdev->feature_persistent) {
> +        while((ioreq->blkdev->persistent_gnt_c < ioreq->blkdev->max_grants) &&
> +              new_maps) {
> +            /* Go through the list of newly mapped grants and add as many
> +             * as possible to the list of persistently mapped grants
> +             */
> +            grant = g_malloc0(sizeof(*grant));
> +            new_maps--;
> +            if (batch_maps)
> +                grant->page = ioreq->pages + (new_maps) * XC_PAGE_SIZE;
> +            else
> +                grant->page = ioreq->page[new_maps];
> +            grant->blkdev = ioreq->blkdev;
> +            xen_be_printf(&ioreq->blkdev->xendev, 3,
> +                          "adding grant %" PRIu32 " page: %p\n",
> +                          refs[new_maps], grant->page);
> +            g_tree_insert(ioreq->blkdev->persistent_gnts,
> +                          GUINT_TO_POINTER(refs[new_maps]),
> +                          grant);
> +            ioreq->blkdev->persistent_gnt_c++;
> +        }
> +    }

Please run this patch through checkpatch.pl.

You are still exploiting the same assumption that only the first
new_maps pages are being mapped (page[new_maps] corresponds to
refs[new_maps]).

I would like a comment saying that the first num_unmap pages of the
array are left out and need to be unmapped separately.

Wouldn't it be clearer if you just did this inside the previous loop
where you set page[i]?

> +    for (i = 0; i < ioreq->v.niov; i++) {
> +        ioreq->v.iov[i].iov_base = page[i] +
> +            (uintptr_t)ioreq->v.iov[i].iov_base;
>      }

I would also put this inside the same loop

>      ioreq->mapped = 1;
> +    ioreq->num_unmap = new_maps;
>      return 0;
>  }

[...]

> +    /* Free persistent grants */
> +    if (blkdev->feature_persistent)
> +        g_tree_destroy(blkdev->persistent_gnts);

code style
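[To make the merge-the-loops suggestion concrete, one hypothetical shape
for it is a single pass that both fills the blanks in page[] and rebases
the iovecs. This is a sketch against the patch's variable names only, not
an actual revision; the fill_and_rebase helper and its parameter list are
invented for illustration.]

    #include <stdint.h>
    #include <sys/uio.h>

    #define XC_PAGE_SIZE 4096

    /* Hypothetical merged loop: fill the blanks in page[] from either
     * the batched mapping region or the per-ref array, and rebase the
     * iovecs, in one pass instead of two fill loops plus a fix-up pass. */
    void fill_and_rebase(struct iovec *iov, int niov, void *page[],
                         void *batched, void *single[], int batch_maps)
    {
        int i, j;

        for (i = 0, j = 0; i < niov; i++) {
            if (page[i] == NULL) {
                page[i] = batch_maps ? (char *)batched + (j++) * XC_PAGE_SIZE
                                     : single[j++];
            }
            iov[i].iov_base = (char *)page[i] + (uintptr_t)iov[i].iov_base;
        }
    }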
Roger Pau Monné
2013-Jan-04 17:37 UTC
Re: [PATCH RFC 3/3] xen_disk: add persistent grant support to xen_disk backend
On 04/01/13 17:42, Stefano Stabellini wrote:
> On Mon, 31 Dec 2012, Roger Pau Monne wrote:
>> This protocol extension reuses the same set of grant pages for all
>> transactions between the front/back drivers, avoiding expensive TLB
>> flushes, grant table lock contention and switches between userspace
>> and kernel space. The full description of the protocol can be found in
>> the public blkif.h header.
>
> it would be great if you could add a link to a webpage too

Sure, thanks for the review.

[...]

>> +    /* Persistent grants extension */
>> +    gboolean            feature_persistent;
>> +    GTree               *persistent_gnts;
>> +    unsigned int        persistent_gnt_c;
>
> can you come up with a better name for this variable?

persistent_gnt_num or persistent_gnt_count?

[...]

>> -        if (xc_gnttab_munmap(gnt, ioreq->pages, ioreq->v.niov) != 0) {
>> +        if (xc_gnttab_munmap(gnt, ioreq->pages, ioreq->num_unmap) != 0) {
>>              xen_be_printf(&ioreq->blkdev->xendev, 0, "xc_gnttab_munmap failed: %s\n",
>>                            strerror(errno));
>>          }
>
> Could you please add a comment saying that only the first num_unmap
> pages are going to be unmapped here?

Sure

> The others are going to be unmapped one by one by destroy_grant.
>
>> -        ioreq->blkdev->cnt_map -= ioreq->v.niov;
>> +        ioreq->blkdev->cnt_map -= ioreq->num_unmap;
>>          ioreq->pages = NULL;
>>      } else {
>> -        for (i = 0; i < ioreq->v.niov; i++) {
>> +        for (i = 0; i < ioreq->num_unmap; i++) {
>>              if (!ioreq->page[i]) {
>>                  continue;
>>              }
>
> I would be tempted to remove persistent grants support for the
> !batch_maps case, if it is going to make the code simpler

It is probably going to make the code simpler.

[...]

>> -        for (i = 0; i < ioreq->v.niov; i++) {
>> -            ioreq->v.iov[i].iov_base = ioreq->pages + i * XC_PAGE_SIZE +
>> -                (uintptr_t)ioreq->v.iov[i].iov_base;
>> +        for (i = 0, j = 0; i < ioreq->v.niov; i++) {
>> +            if (page[i] == NULL)
>> +                page[i] = ioreq->pages + (j++) * XC_PAGE_SIZE;
>>          }
>
> Code style.
>
> Is it correct to assume that the refs to map are always the first set?

"refs" only contains grant references that should be mapped. Note that
refs is not ioreq->refs, it is the subset of references in ioreq->refs
that are not yet mapped.

> What if the first 10 refs are already mapped but 10-20 are not?

"refs" should only contain unmapped references, and its size is
"new_maps"

> It seems to me that we are going to try to map the first 10 again.
> Considering that this behaviour can be triggered by the frontend, it is
> a potential security issue too.

I think I should add a comment to ioreq_map, because its logic can be a
bit tricky. The basic approach is that "refs" and "domids" should only
contain the information about the grants that should be mapped, and the
number of grants to map is "new_maps".

After fetching all the persistent grants, and mapping the ones that we
need to complete the request, "page" contains the address of the memory
pages where data should be read or written. ioreq->page or ioreq->pages
will contain the memory addresses of the pages that have been newly
mapped for this request.

Then, if we still have room to store more persistent grants, we will
start from the end of ioreq->page or ioreq->pages and make those granted
pages persistent, decreasing new_maps as we go. So at the end new_maps
will contain the number of maps that cannot be made persistent, and
ioreq->page or ioreq->pages will contain the memory address of those
pages (that should be unmapped) at the start.

> This loop should be shared between the two cases batch_maps and
> !batch_maps.
>
>> -        ioreq->blkdev->cnt_map += ioreq->v.niov;
>> -    } else {
>> -        for (i = 0; i < ioreq->v.niov; i++) {
>> +        ioreq->blkdev->cnt_map += new_maps;
>> +    } else if (new_maps) {
>> +        for (i = 0; i < new_maps; i++) {
>>              ioreq->page[i] = xc_gnttab_map_grant_ref
>> -            (gnt, ioreq->domids[i], ioreq->refs[i], ioreq->prot);
>> +            (gnt, domids[i], refs[i], ioreq->prot);
>
> same assumption
>
>>              if (ioreq->page[i] == NULL) {
>>                  xen_be_printf(&ioreq->blkdev->xendev, 0,
>>                                "can't map grant ref %d (%s, %d maps)\n",
>> -                              ioreq->refs[i], strerror(errno), ioreq->blkdev->cnt_map);
>> +                              refs[i], strerror(errno), ioreq->blkdev->cnt_map);
>>                  ioreq_unmap(ioreq);
>>                  return -1;
>>              }
>> -            ioreq->v.iov[i].iov_base = ioreq->page[i] + (uintptr_t)ioreq->v.iov[i].iov_base;
>>              ioreq->blkdev->cnt_map++;
>>          }
>> +        for (i = 0, j = 0; i < ioreq->v.niov; i++) {
>> +            if (page[i] == NULL)
>> +                page[i] = ioreq->page[j++];
>> +        }
>
> same comments about this loop

Here we fill the blanks in the page array; those blanks are grants that
are not (yet) persistently mapped, so we need to set them to the pages
that have just been mapped in the above call to xc_gnttab_map_grant_ref.

[...]

> Please run this patch through checkpatch.pl.
>
> You are still exploiting the same assumption that only the first
> new_maps pages are being mapped (page[new_maps] corresponds to
> refs[new_maps]).

This is not true: page[new_maps] doesn't correspond to refs[new_maps]
(this is true for ioreq->page[new_maps] and refs[new_maps]), since
page[x] contains the page address of all the granted references needed
to complete the request, and refs[x] only contains the grant references
of the grants that have been mapped to complete this request.

> I would like a comment saying that the first num_unmap pages of the
> array are left out and need to be unmapped separately.
>
> Wouldn't it be clearer if you just did this inside the previous loop
> where you set page[i]?
>
>> +    for (i = 0; i < ioreq->v.niov; i++) {
>> +        ioreq->v.iov[i].iov_base = page[i] +
>> +            (uintptr_t)ioreq->v.iov[i].iov_base;
>>      }
>
> I would also put this inside the same loop

[...]
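[Roger's description of the index bookkeeping is easier to see with a
worked example. The standalone toy program below is illustrative only:
lookup_persistent stands in for the g_tree_lookup call, and local buffers
stand in for mapped grant pages. It runs the same two passes, first
partitioning the segments into cached vs. to-map, then filling the blanks
in segment order.]

    #include <stdio.h>

    #define NIOV 4

    /* Stand-in for g_tree_lookup(): pretend grant refs 10 and 30 are
     * already persistently mapped and return a fake page address. */
    static void *lookup_persistent(unsigned ref)
    {
        static char persistent_pages[2][16];

        if (ref == 10) {
            return persistent_pages[0];
        }
        if (ref == 30) {
            return persistent_pages[1];
        }
        return NULL;
    }

    int main(void)
    {
        unsigned ioreq_refs[NIOV] = { 10, 20, 30, 40 }; /* ioreq->refs */
        unsigned refs[NIOV];  /* compacted: only the refs still to map */
        void *page[NIOV];     /* final page address for every segment */
        static char newly_mapped[NIOV][16]; /* stand-in for the new maps */
        int i, j, new_maps = 0;

        /* Pass 1: refs[] is compacted as it is filled, so refs[k] is the
         * k-th grant that needs mapping; its index k says nothing about
         * where that grant sits in ioreq_refs[]. */
        for (i = 0; i < NIOV; i++) {
            page[i] = lookup_persistent(ioreq_refs[i]);
            if (page[i] == NULL) {
                refs[new_maps++] = ioreq_refs[i];
            }
        }
        for (i = 0; i < new_maps; i++) {
            printf("to map: refs[%d] = %u\n", i, refs[i]);
        }

        /* (The real code would now map refs[0..new_maps) in one batch.) */

        /* Pass 2: fill the blanks in page[] from the newly mapped pages,
         * preserving the original segment order. */
        for (i = 0, j = 0; i < NIOV; i++) {
            if (page[i] == NULL) {
                page[i] = newly_mapped[j++];
            }
        }
        for (i = 0; i < NIOV; i++) {
            printf("segment %d: ref %u -> %p\n", i, ioreq_refs[i], page[i]);
        }
        return 0;
    }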
Stefano Stabellini
2013-Jan-04 18:02 UTC
Re: [PATCH RFC 3/3] xen_disk: add persistent grant support to xen_disk backend
On Fri, 4 Jan 2013, Roger Pau Monne wrote:
> On 04/01/13 17:42, Stefano Stabellini wrote:
>> On Mon, 31 Dec 2012, Roger Pau Monne wrote:
>>> This protocol extension reuses the same set of grant pages for all
>>> transactions between the front/back drivers, avoiding expensive TLB
>>> flushes, grant table lock contention and switches between userspace
>>> and kernel space. The full description of the protocol can be found in
>>> the public blkif.h header.
>>
>> it would be great if you could add a link to a webpage too
>
> Sure, thanks for the review.

[...]

>>> +    /* Persistent grants extension */
>>> +    gboolean            feature_persistent;
>>> +    GTree               *persistent_gnts;
>>> +    unsigned int        persistent_gnt_c;
>>
>> can you come up with a better name for this variable?
>
> persistent_gnt_num or persistent_gnt_count?

persistent_gnt_count

[...]

>>> +        for (i = 0, j = 0; i < ioreq->v.niov; i++) {
>>> +            if (page[i] == NULL)
>>> +                page[i] = ioreq->pages + (j++) * XC_PAGE_SIZE;
>>>          }
>>
>> Code style.
>>
>> Is it correct to assume that the refs to map are always the first set?
>
> "refs" only contains grant references that should be mapped. Note that
> refs is not ioreq->refs, it is the subset of references in ioreq->refs
> that are not yet mapped.

Oh right, I missed that