Mats Petersson
2012-Nov-16 14:45 UTC
[PATCH] Improve performance of IOCTL_PRIVCMD_MMAPBATCH_V2
This patch makes the IOCTL_PRIVCMD_MMAPBATCH_V2 (and older V1 version)
map multiple (typically 1024) pages at a time rather than one page at a
time, despite the pages being non-consecutive MFNs. The main change is
to pass a pointer to an array of mfns, rather than one mfn. To support
error reporting, we also pass an err_ptr. If err_ptr is NULL, it indicates
we want the contiguous pages behaviour, so the mfn value is incremented
rather than the pointer itself. Performance of mapping guest memory into
Dom0 is improved by a factor of around 6 with this change.

Signed-off-by: Mats Petersson <mats.petersson@citrix.com>
---
 arch/x86/xen/mmu.c    | 47 ++++++++++++++++++++++++++-------
 drivers/xen/privcmd.c | 70 ++++++++++++++++++++++++++++++++++++++++++-------
 include/xen/xen-ops.h |  5 ++--
 3 files changed, 100 insertions(+), 22 deletions(-)

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index dcf5f2d..c5e23ba 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -2477,7 +2477,8 @@ void __init xen_hvm_init_mmu_ops(void)
 #define REMAP_BATCH_SIZE 16
 
 struct remap_data {
-	unsigned long mfn;
+	unsigned long *mfn;
+	int contiguous;
 	pgprot_t prot;
 	struct mmu_update *mmu_update;
 };
@@ -2486,7 +2487,13 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
 				 unsigned long addr, void *data)
 {
 	struct remap_data *rmd = data;
-	pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot));
+	pte_t pte = pte_mkspecial(pfn_pte(*rmd->mfn, rmd->prot));
+	/* If we have a contigious range, just update the mfn itself,
+	   else update pointer to be "next mfn". */
+	if (rmd->contiguous)
+		(*rmd->mfn)++;
+	else
+		rmd->mfn++;
 
 	rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
 	rmd->mmu_update->val = pte_val_ma(pte);
@@ -2495,16 +2502,17 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
 	return 0;
 }
 
+
 int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
 			       unsigned long addr,
-			       unsigned long mfn, int nr,
-			       pgprot_t prot, unsigned domid)
+			       unsigned long *mfn, int nr,
+			       int *err_ptr, pgprot_t prot,
+			       unsigned domid)
 {
+	int err;
 	struct remap_data rmd;
 	struct mmu_update mmu_update[REMAP_BATCH_SIZE];
-	int batch;
 	unsigned long range;
-	int err = 0;
 
 	if (xen_feature(XENFEAT_auto_translated_physmap))
 		return -EINVAL;
@@ -2515,9 +2523,15 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
 
 	rmd.mfn = mfn;
 	rmd.prot = prot;
+	/* We use the err_ptr to indicate if there we are doing a contigious
+	 * mapping or a discontigious mapping. */
+	rmd.contiguous = !err_ptr;
 
 	while (nr) {
-		batch = min(REMAP_BATCH_SIZE, nr);
+		int index = 0;
+		int done = 0;
+		int batch = min(REMAP_BATCH_SIZE, nr);
+		int batch_left = batch;
 		range = (unsigned long)batch << PAGE_SHIFT;
 
 		rmd.mmu_update = mmu_update;
@@ -2526,12 +2540,25 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
 		if (err)
 			goto out;
 
-		err = HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid);
-		if (err < 0)
-			goto out;
+		/* We record the error for each page that gives an error, but
+		 * continue mapping until the whole set is done */
+		do {
+			err = HYPERVISOR_mmu_update(&mmu_update[index],
+						    batch_left, &done, domid);
+			if (err < 0) {
+				if (err_ptr)
+					err_ptr[index] = err;
+				else /* exit if error and no err_ptr */
+					goto out;
+			}
+			batch_left -= done;
+			index += done;
+		} while (batch_left);
 
 		nr -= batch;
 		addr += range;
+		if (err_ptr)
+			err_ptr += batch;
 	}
 
 	err = 0;
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
index 24aec2f..68bc619 100644
--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -151,6 +151,41 @@ static int traverse_pages(unsigned nelem, size_t size,
 	return ret;
 }
 
+
+/*
+ * Similar to traverse_pages, but use each page as a "block" of
+ * data to be processed as one unit.
+ */
+static int traverse_pages_block(unsigned nelem, size_t size,
+				struct list_head *pos,
+				int (*fn)(void *data, int nr, void *state),
+				void *state)
+{
+	void *pagedata;
+	unsigned pageidx;
+	int ret = 0;
+
+	BUG_ON(size > PAGE_SIZE);
+
+	pageidx = PAGE_SIZE;
+
+	while (nelem) {
+		int nr = (PAGE_SIZE/size);
+		struct page *page;
+		if (nr > nelem)
+			nr = nelem;
+		pos = pos->next;
+		page = list_entry(pos, struct page, lru);
+		pagedata = page_address(page);
+		ret = (*fn)(pagedata, nr, state);
+		if (ret)
+			break;
+		nelem -= nr;
+	}
+
+	return ret;
+}
+
 struct mmap_mfn_state {
 	unsigned long va;
 	struct vm_area_struct *vma;
@@ -163,6 +198,7 @@ static int mmap_mfn_range(void *data, void *state)
 	struct mmap_mfn_state *st = state;
 	struct vm_area_struct *vma = st->vma;
 	int rc;
+	xen_pfn_t mfn;
 
 	/* Do not allow range to wrap the address space. */
 	if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) ||
@@ -174,9 +210,15 @@ static int mmap_mfn_range(void *data, void *state)
 	    ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end))
 		return -EINVAL;
 
+	/* We need to pass a xen_pfn_t. Check that it fits, or pass EINVAL! */
+	mfn = (xen_pfn_t)msg->mfn;
+	if (mfn != msg->mfn)
+		return -EINVAL;
+
 	rc = xen_remap_domain_mfn_range(vma,
 					msg->va & PAGE_MASK,
-					msg->mfn, msg->npages,
+					&mfn, msg->npages,
+					NULL,
 					vma->vm_page_prot,
 					st->domain);
 	if (rc < 0)
@@ -250,7 +292,7 @@ struct mmap_batch_state {
 	 *      0 for no errors
 	 *      1 if at least one error has happened (and no
 	 *          -ENOENT errors have happened)
-	 *      -ENOENT if at least 1 -ENOENT has happened.
+	 *      -ENOENT if at least one -ENOENT has happened.
 	 */
 	int global_error;
 	/* An array for individual errors */
@@ -260,17 +302,21 @@ struct mmap_batch_state {
 	xen_pfn_t __user *user_mfn;
 };
 
-static int mmap_batch_fn(void *data, void *state)
+static int mmap_batch_fn(void *data, int nr, void *state)
 {
 	xen_pfn_t *mfnp = data;
 	struct mmap_batch_state *st = state;
 	int ret;
 
-	ret = xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1,
-					 st->vma->vm_page_prot, st->domain);
+	BUG_ON(nr < 0);
 
-	/* Store error code for second pass. */
-	*(st->err++) = ret;
+	ret = xen_remap_domain_mfn_range(st->vma,
+					 st->va & PAGE_MASK,
+					 mfnp, nr,
+					 st->err,
+					 st->vma->vm_page_prot,
+					 st->domain);
 
 	/* And see if it affects the global_error. */
 	if (ret < 0) {
@@ -282,7 +328,7 @@ static int mmap_batch_fn(void *data, void *state)
 			st->global_error = 1;
 		}
 	}
-	st->va += PAGE_SIZE;
+	st->va += PAGE_SIZE * nr;
 
 	return 0;
 }
@@ -303,6 +349,8 @@ static int mmap_return_errors_v1(void *data, void *state)
 	return __put_user(*mfnp, st->user_mfn++);
 }
 
+
+
 static struct vm_operations_struct privcmd_vm_ops;
 
 static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
@@ -319,6 +367,8 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
 	if (!xen_initial_domain())
 		return -EPERM;
 
+
+
 	switch (version) {
 	case 1:
 		if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch)))
@@ -378,8 +428,8 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
 	state.err           = err_array;
 
 	/* mmap_batch_fn guarantees ret == 0 */
-	BUG_ON(traverse_pages(m.num, sizeof(xen_pfn_t),
-			      &pagelist, mmap_batch_fn, &state));
+	BUG_ON(traverse_pages_block(m.num, sizeof(xen_pfn_t),
+				    &pagelist, mmap_batch_fn, &state));
 
 	up_write(&mm->mmap_sem);
 
diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
index 6a198e4..1bcda39 100644
--- a/include/xen/xen-ops.h
+++ b/include/xen/xen-ops.h
@@ -26,7 +26,8 @@ void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order);
 struct vm_area_struct;
 int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
 			       unsigned long addr,
-			       unsigned long mfn, int nr,
-			       pgprot_t prot, unsigned domid);
+			       unsigned long *mfn, int nr,
+			       int *err_ptr, pgprot_t prot,
+			       unsigned domid);
 
 #endif /* INCLUDE_XEN_OPS_H */
-- 
1.7.9.5
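For reference, this is roughly how the ioctl being optimised is driven from user space: the caller mmap()s a window of the privcmd device and then asks the kernel to populate it from an array of foreign MFNs in a single call. The sketch below is illustrative only, not part of the patch; the struct and ioctl names follow the privcmd uapi header as we understand it, header paths and error handling are simplified, and a 4 KiB page size is assumed.

/* Illustrative sketch, not from the patch: map 'num' (possibly
 * non-contiguous) frames of domain 'dom' in one MMAPBATCH_V2 call.
 * err[i] receives a per-page error code (0 on success). */
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <xen/privcmd.h>	/* struct privcmd_mmapbatch_v2 (assumed path) */

#define X86_PAGE_SIZE 4096UL

static void *map_foreign_pages(int privcmd_fd, domid_t dom,
			       const xen_pfn_t *mfns, int *err,
			       unsigned int num)
{
	/* Reserve a window backed by the privcmd device. */
	void *addr = mmap(NULL, num * X86_PAGE_SIZE, PROT_READ | PROT_WRITE,
			  MAP_SHARED, privcmd_fd, 0);
	struct privcmd_mmapbatch_v2 req = {
		.num  = num,
		.dom  = dom,
		.addr = (unsigned long)addr,
		.arr  = mfns,
		.err  = err,
	};

	if (addr == MAP_FAILED)
		return NULL;
	/* One ioctl covers the whole batch; before this patch the kernel
	 * still walked it one page (one hypercall) at a time. */
	if (ioctl(privcmd_fd, IOCTL_PRIVCMD_MMAPBATCH_V2, &req) < 0)
		return NULL;	/* per-page codes are left in err[] */
	return addr;
}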
David Vrabel
2012-Nov-16 15:48 UTC
Re: [PATCH] Improve performance of IOCTL_PRIVCMD_MMAPBATCH_V2
On 16/11/12 14:45, Mats Petersson wrote:

Add "xen/privcmd:" prefix to subject.

> This patch makes the IOCTL_PRIVCMD_MMAPBATCH_V2 (and older V1 version)
> map multiple (typically 1024) pages at a time rather than one page at a
> time, despite the pages being non-consecutive MFNs. The main change is
> to pass a pointer to an array of mfns, rather than one mfn. To support
> error reporting, we also pass an err_ptr. If err_ptr is NULL, it indicates
> we want the contiguous pages behaviour, so the mfn value is incremented
> rather than the pointer itself. Performance of mapping guest memory into
> Dom0 is improved by a factor of around 6 with this change.

Can you include details on the test and the raw figures as well?

> Signed-off-by: Mats Petersson <mats.petersson@citrix.com>
> ---
>  arch/x86/xen/mmu.c    | 47 ++++++++++++++++++++++++++-------
>  drivers/xen/privcmd.c | 70 ++++++++++++++++++++++++++++++++++++++++++-------
>  include/xen/xen-ops.h |  5 ++--
>  3 files changed, 100 insertions(+), 22 deletions(-)
>
> diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
> index dcf5f2d..c5e23ba 100644
> --- a/arch/x86/xen/mmu.c
> +++ b/arch/x86/xen/mmu.c
> @@ -2477,7 +2477,8 @@ void __init xen_hvm_init_mmu_ops(void)
>  #define REMAP_BATCH_SIZE 16
>
>  struct remap_data {
> -	unsigned long mfn;
> +	unsigned long *mfn;
> +	int contiguous;

bool.

>  	pgprot_t prot;
>  	struct mmu_update *mmu_update;
>  };
> @@ -2486,7 +2487,13 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
>  				 unsigned long addr, void *data)
>  {
>  	struct remap_data *rmd = data;
> -	pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot));
> +	pte_t pte = pte_mkspecial(pfn_pte(*rmd->mfn, rmd->prot));
> +	/* If we have a contigious range, just update the mfn itself,
> +	   else update pointer to be "next mfn". */
> +	if (rmd->contiguous)
> +		(*rmd->mfn)++;
> +	else
> +		rmd->mfn++;
>
>  	rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
>  	rmd->mmu_update->val = pte_val_ma(pte);
> @@ -2495,16 +2502,17 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
>  	return 0;
>  }
>
> +
>  int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
>  			       unsigned long addr,
> -			       unsigned long mfn, int nr,
> -			       pgprot_t prot, unsigned domid)
> +			       unsigned long *mfn, int nr,
> +			       int *err_ptr, pgprot_t prot,
> +			       unsigned domid)
>  {
> +	int err;
>  	struct remap_data rmd;
>  	struct mmu_update mmu_update[REMAP_BATCH_SIZE];
> -	int batch;
>  	unsigned long range;
> -	int err = 0;
>
>  	if (xen_feature(XENFEAT_auto_translated_physmap))
>  		return -EINVAL;
> @@ -2515,9 +2523,15 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
>
>  	rmd.mfn = mfn;
>  	rmd.prot = prot;
> +	/* We use the err_ptr to indicate if there we are doing a contigious
> +	 * mapping or a discontigious mapping. */
> +	rmd.contiguous = !err_ptr;

This is non-obvious for an API call.  Suggest having two wrapper
functions for the two different use cases that share a common internal
implementation.

e.g.,

int xen_remap_domain_mfn_array(struct vm_area_struct *vma,
			       unsigned long addr,
			       unsigned long *mfns, int nr,
			       int *err_ptr,
			       pgprot_t prot, unsigned domid)

int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
			       unsigned long addr,
			       unsigned long start_mfn, unsigned long end_mfn,
			       pgprot_t prot, unsigned domid)

It would be nice if the API calls had some docs as well (e.g., in
kernel-doc style).

David
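One way the suggested split could look (a sketch only: the common helper name and its bool flag are invented here for illustration, and the eventual implementation may differ):

/* Sketch of the suggested wrappers around one internal helper;
 * "do_remap_mfn" is a made-up name. */
static int do_remap_mfn(struct vm_area_struct *vma, unsigned long addr,
			unsigned long *mfn, int nr, int *err_ptr,
			bool contiguous, pgprot_t prot, unsigned domid);

/* Array of discontiguous MFNs, per-page errors reported in err_ptr. */
int xen_remap_domain_mfn_array(struct vm_area_struct *vma,
			       unsigned long addr,
			       unsigned long *mfns, int nr, int *err_ptr,
			       pgprot_t prot, unsigned domid)
{
	return do_remap_mfn(vma, addr, mfns, nr, err_ptr, false, prot, domid);
}

/* Contiguous range [start_mfn, end_mfn); the first failure aborts. */
int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
			       unsigned long addr,
			       unsigned long start_mfn, unsigned long end_mfn,
			       pgprot_t prot, unsigned domid)
{
	unsigned long mfn = start_mfn;

	return do_remap_mfn(vma, addr, &mfn, end_mfn - start_mfn, NULL,
			    true, prot, domid);
}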
Mats Petersson
2012-Nov-16 16:03 UTC
Re: [PATCH] Improve performance of IOCTL_PRIVCMD_MMAPBATCH_V2
On 16/11/12 15:48, David Vrabel wrote:
> On 16/11/12 14:45, Mats Petersson wrote:
>
> Add "xen/privcmd:" prefix to subject.
>
>> This patch makes the IOCTL_PRIVCMD_MMAPBATCH_V2 (and older V1 version)
>> map multiple (typically 1024) pages at a time rather than one page at a
>> time, despite the pages being non-consecutive MFNs. The main change is
>> to pass a pointer to an array of mfns, rather than one mfn. To support
>> error reporting, we also pass an err_ptr. If err_ptr is NULL, it indicates
>> we want the contiguous pages behaviour, so the mfn value is incremented
>> rather than the pointer itself. Performance of mapping guest memory into
>> Dom0 is improved by a factor of around 6 with this change.
> Can you include details on the test and the raw figures as well?
>
>> Signed-off-by: Mats Petersson <mats.petersson@citrix.com>
>> ---
>>  arch/x86/xen/mmu.c    | 47 ++++++++++++++++++++++++++-------
>>  drivers/xen/privcmd.c | 70 ++++++++++++++++++++++++++++++++++++++++++-------
>>  include/xen/xen-ops.h |  5 ++--
>>  3 files changed, 100 insertions(+), 22 deletions(-)
>>
>> diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
>> index dcf5f2d..c5e23ba 100644
>> --- a/arch/x86/xen/mmu.c
>> +++ b/arch/x86/xen/mmu.c
>> @@ -2477,7 +2477,8 @@ void __init xen_hvm_init_mmu_ops(void)
>>  #define REMAP_BATCH_SIZE 16
>>
>>  struct remap_data {
>> -	unsigned long mfn;
>> +	unsigned long *mfn;
>> +	int contiguous;
> bool.
>
>>  	pgprot_t prot;
>>  	struct mmu_update *mmu_update;
>>  };
>> @@ -2486,7 +2487,13 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
>>  				 unsigned long addr, void *data)
>>  {
>>  	struct remap_data *rmd = data;
>> -	pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot));
>> +	pte_t pte = pte_mkspecial(pfn_pte(*rmd->mfn, rmd->prot));
>> +	/* If we have a contigious range, just update the mfn itself,
>> +	   else update pointer to be "next mfn". */
>> +	if (rmd->contiguous)
>> +		(*rmd->mfn)++;
>> +	else
>> +		rmd->mfn++;
>>
>>  	rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
>>  	rmd->mmu_update->val = pte_val_ma(pte);
>> @@ -2495,16 +2502,17 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
>>  	return 0;
>>  }
>>
>> +
>>  int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
>>  			       unsigned long addr,
>> -			       unsigned long mfn, int nr,
>> -			       pgprot_t prot, unsigned domid)
>> +			       unsigned long *mfn, int nr,
>> +			       int *err_ptr, pgprot_t prot,
>> +			       unsigned domid)
>>  {
>> +	int err;
>>  	struct remap_data rmd;
>>  	struct mmu_update mmu_update[REMAP_BATCH_SIZE];
>> -	int batch;
>>  	unsigned long range;
>> -	int err = 0;
>>
>>  	if (xen_feature(XENFEAT_auto_translated_physmap))
>>  		return -EINVAL;
>> @@ -2515,9 +2523,15 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
>>
>>  	rmd.mfn = mfn;
>>  	rmd.prot = prot;
>> +	/* We use the err_ptr to indicate if there we are doing a contigious
>> +	 * mapping or a discontigious mapping. */
>> +	rmd.contiguous = !err_ptr;
> This is non-obvious for an API call.  Suggest having two wrapper
> functions for the two different use cases that share a common internal
> implementation.

I originally had another argument to the function of "contiguous" - but
found that in both places this function is called, it had both contiguous
and err_ptr either "1, NULL" or "0, not-NULL". So I thought "why not save
one argument".

Do you foresee these functions being called from many other places?

>
> e.g.,
>
> int xen_remap_domain_mfn_array(struct vm_area_struct *vma,
> 			       unsigned long addr,
> 			       unsigned long *mfns, int nr,
> 			       int *err_ptr,
> 			       pgprot_t prot, unsigned domid)
>
> int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
> 			       unsigned long addr,
> 			       unsigned long start_mfn, unsigned long end_mfn,
> 			       pgprot_t prot, unsigned domid)

The original xen_remap_mfn_range() takes mfn and nr, so I think if we
have two functions, the original function should remain as before (to
reduce the amount of changes). It also makes the common function more
symmetrical between the two cases, as we'd have to back-calculate the
"nr" argument - we clearly can't know "end_mfn" in the array case.

>
> It would be nice if the API calls had some docs as well (e.g., in
> kernel-doc style).

Yes, that would be nice, wouldn't it. I'll add something.

--
Mats

>
> David
>
>
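For the record, a kernel-doc block for the proposed array variant might look something like the following; the wording is illustrative only and is not taken from any posted version of the patch.

/**
 * xen_remap_domain_mfn_array() - map an array of foreign frames into a VMA
 * @vma:     VMA to install the mappings in
 * @addr:    page-aligned start address within @vma
 * @mfn:     array of @nr machine frame numbers to map
 * @nr:      number of frames
 * @err_ptr: array of @nr ints; each entry is set to 0 or a -errno value
 * @prot:    protection bits for the new PTEs
 * @domid:   domain owning the frames
 *
 * Individual frames that fail to map are reported through @err_ptr and do
 * not abort the rest of the batch.  Returns 0 unless the batch could not
 * be processed at all, in which case a negative errno is returned.
 */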
Jan Beulich
2012-Nov-21 10:29 UTC
Re: [PATCH] Improve performance of IOCTL_PRIVCMD_MMAPBATCH_V2
>>> On 16.11.12 at 15:45, Mats Petersson <mats.petersson@citrix.com> wrote:
> @@ -2526,12 +2540,25 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
>  		if (err)
>  			goto out;
>
> -		err = HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid);
> -		if (err < 0)
> -			goto out;
> +		/* We record the error for each page that gives an error, but
> +		 * continue mapping until the whole set is done */
> +		do {
> +			err = HYPERVISOR_mmu_update(&mmu_update[index],
> +						    batch_left, &done, domid);
> +			if (err < 0) {
> +				if (err_ptr)
> +					err_ptr[index] = err;

Shouldn't you increment "done" here, in order to not retry the failed
slot immediately?

> +				else /* exit if error and no err_ptr */
> +					goto out;
> +			}
> +			batch_left -= done;
> +			index += done;
> +		} while (batch_left);
>
>  		nr -= batch;
>  		addr += range;
> +		if (err_ptr)
> +			err_ptr += batch;
>  	}
>
>  	err = 0;
> @@ -303,6 +349,8 @@ static int mmap_return_errors_v1(void *data, void *state)
>  	return __put_user(*mfnp, st->user_mfn++);
>  }
>
> +
> +

???

>  static struct vm_operations_struct privcmd_vm_ops;
>
>  static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
> @@ -319,6 +367,8 @@ static long privcmd_ioctl_mmap_batch(void __user *udata,
> int version)
>  	if (!xen_initial_domain())
>  		return -EPERM;
>
> +
> +

???

Jan

> 	switch (version) {
> 	case 1:
> 		if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch)))
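In other words, something along these lines: a sketch of the behaviour Jan is describing, assuming the "done" count returned by HYPERVISOR_mmu_update covers the entries processed before the failure; the actual V3 code may differ.

		/* Sketch only, not the posted patch: record the error for
		 * the failing entry and step over it, otherwise the next
		 * iteration retries the same slot forever. */
		do {
			int done = 0;

			err = HYPERVISOR_mmu_update(&mmu_update[index],
						    batch_left, &done, domid);
			if (err < 0) {
				if (!err_ptr)
					goto out;
				/* entry 'index + done' is the one that failed */
				err_ptr[index + done] = err;
				done++;		/* skip the failed slot */
			}
			batch_left -= done;
			index += done;
		} while (batch_left);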
Mats Petersson
2012-Nov-21 10:51 UTC
Re: [PATCH] Improve performance of IOCTL_PRIVCMD_MMAPBATCH_V2
On 21/11/12 10:29, Jan Beulich wrote:
>>>> On 16.11.12 at 15:45, Mats Petersson <mats.petersson@citrix.com> wrote:
>> @@ -2526,12 +2540,25 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
>>  		if (err)
>>  			goto out;
>>
>> -		err = HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid);
>> -		if (err < 0)
>> -			goto out;
>> +		/* We record the error for each page that gives an error, but
>> +		 * continue mapping until the whole set is done */
>> +		do {
>> +			err = HYPERVISOR_mmu_update(&mmu_update[index],
>> +						    batch_left, &done, domid);
>> +			if (err < 0) {
>> +				if (err_ptr)
>> +					err_ptr[index] = err;
> Shouldn't you increment "done" here, in order to not retry the failed
> slot immediately?

Yes, good spot - for some reason, I have double checked the behaviour
of "done", and it returns the index of the item which gave the error,
not actually "how many were processed".

I have rewritten this part of code for V3 of this patch, but I think it
still requires an increment of "done" to make it work correctly.

>
>> +				else /* exit if error and no err_ptr */
>> +					goto out;
>> +			}
>> +			batch_left -= done;
>> +			index += done;
>> +		} while (batch_left);
>>
>>  		nr -= batch;
>>  		addr += range;
>> +		if (err_ptr)
>> +			err_ptr += batch;
>>  	}
>>
>>  	err = 0;
>> @@ -303,6 +349,8 @@ static int mmap_return_errors_v1(void *data, void *state)
>>  	return __put_user(*mfnp, st->user_mfn++);
>>  }
>>
>> +
>> +
> ???

This is cleaned up in the V3 patch.

>>  static struct vm_operations_struct privcmd_vm_ops;
>>
>>  static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
>> @@ -319,6 +367,8 @@ static long privcmd_ioctl_mmap_batch(void __user *udata,
>> int version)
>>  	if (!xen_initial_domain())
>>  		return -EPERM;
>>
>> +
>> +
> ???

As above.

--
Mats

> Jan
>
>> 	switch (version) {
>> 	case 1:
>> 		if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch)))
>
>
>
Jan Beulich
2012-Nov-21 10:57 UTC
Re: [PATCH] Improve performance of IOCTL_PRIVCMD_MMAPBATCH_V2
>>> On 21.11.12 at 11:51, Mats Petersson <mats.petersson@citrix.com> wrote:
> On 21/11/12 10:29, Jan Beulich wrote:
>>>>> On 16.11.12 at 15:45, Mats Petersson <mats.petersson@citrix.com> wrote:
>>> @@ -2526,12 +2540,25 @@ int xen_remap_domain_mfn_range(struct vm_area_struct
> *vma,
>>>  		if (err)
>>>  			goto out;
>>>
>>> -		err = HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid);
>>> -		if (err < 0)
>>> -			goto out;
>>> +		/* We record the error for each page that gives an error, but
>>> +		 * continue mapping until the whole set is done */
>>> +		do {
>>> +			err = HYPERVISOR_mmu_update(&mmu_update[index],
>>> +						    batch_left, &done, domid);
>>> +			if (err < 0) {
>>> +				if (err_ptr)
>>> +					err_ptr[index] = err;
>> Shouldn't you increment "done" here, in order to not retry the failed
>> slot immediately?
>
> Yes, good spot - for some reason, I have double checked the behaviour of
> "done", and it returns the index of the item which gave the error, not
> actually "how many were processed".

Isn't the index of failure equal to the number of successfully
processed ones?

> I have rewritten this part of code for V3 of this patch, but I think it
> still requires an increment of "done" to make it work correctly.
>>
>>> +				else /* exit if error and no err_ptr */
>>> +					goto out;
>>> +			}
>>> +			batch_left -= done;
>>> +			index += done;
>>> +		} while (batch_left);
>>>
>>>  		nr -= batch;
>>>  		addr += range;
>>> +		if (err_ptr)
>>> +			err_ptr += batch;
>>>  	}
>>>
>>>  	err = 0;
>>> @@ -303,6 +349,8 @@ static int mmap_return_errors_v1(void *data, void *state)
>>>  	return __put_user(*mfnp, st->user_mfn++);
>>>  }
>>>
>>> +
>>> +
>> ???
> This is cleaned up in the V3 patch.

Did I miss that (and v2 too)?

Jan