David Vrabel
2012-Aug-30 12:58 UTC
[PATCHv3 0/2] xen/privcmd: support for paged-out frames
This series is a straight forward-port of some functionality from classic kernels to support Xen hosts that do paging of guests. This isn't functionality that XenServer makes use of, so I've not tested these with paging in use. Changes since v2: - Better docs/comments, - Return -ENOENT if any frame failed with -ENOENT (even if other frames fail for other reasons). Changes since v1: - Don't change PRIVCMD_MMAPBATCH (except to #define a constant for the error). It's broken and not really fixable sensibly, and libxc will use V2 if it is available. - Return -ENOENT if all failures were -ENOENT. - Clear arg->err on success (libxc expected this). I think this should probably get a "Tested-by" from Andres or someone else who uses paging before being applied. David
David Vrabel
2012-Aug-30 12:58 UTC
[PATCH 1/2] xen/mm: return more precise error from xen_remap_domain_range()
From: David Vrabel <david.vrabel@citrix.com> Callers of xen_remap_domain_range() need to know if the remap failed because frame is currently paged out. So they can retry the remap later on. Return -ENOENT in this case. This assumes that the error codes returned by Xen are a subset of those used by the kernel. It is unclear if this is defined as part of the hypercall ABI. Signed-off-by: David Vrabel <david.vrabel@citrix.com> --- arch/x86/xen/mmu.c | 4 ++-- 1 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index b65a761..fb187ea 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -2355,8 +2355,8 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma, if (err) goto out; - err = -EFAULT; - if (HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0) + err = HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid); + if (err < 0) goto out; nr -= batch; -- 1.7.2.5
David Vrabel
2012-Aug-30 12:58 UTC
[PATCH 2/2] xen/privcmd: add PRIVCMD_MMAPBATCH_V2 ioctl
From: David Vrabel <david.vrabel@citrix.com> PRIVCMD_MMAPBATCH_V2 extends PRIVCMD_MMAPBATCH with an additional field for reporting the error code for every frame that could not be mapped. libxc prefers PRIVCMD_MMAPBATCH_V2 over PRIVCMD_MMAPBATCH. Signed-off-by: David Vrabel <david.vrabel@citrix.com> --- drivers/xen/privcmd.c | 99 +++++++++++++++++++++++++++++++++++++++--------- include/xen/privcmd.h | 23 +++++++++++- 2 files changed, 102 insertions(+), 20 deletions(-) diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index ccee0f1..c0e89e7 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -76,7 +76,7 @@ static void free_page_list(struct list_head *pages) */ static int gather_array(struct list_head *pagelist, unsigned nelem, size_t size, - void __user *data) + const void __user *data) { unsigned pageidx; void *pagedata; @@ -248,18 +248,37 @@ struct mmap_batch_state { struct vm_area_struct *vma; int err; - xen_pfn_t __user *user; + xen_pfn_t __user *user_mfn; + int __user *user_err; }; static int mmap_batch_fn(void *data, void *state) { xen_pfn_t *mfnp = data; struct mmap_batch_state *st = state; + int ret; - if (xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, - st->vma->vm_page_prot, st->domain) < 0) { - *mfnp |= 0xf0000000U; - st->err++; + ret = xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, + st->vma->vm_page_prot, st->domain); + if (ret < 0) { + /* + * Error reporting is a mess but userspace relies on + * it behaving this way. + * + * V2 needs to a) return the result of each frame''s + * remap; and b) return -ENOENT if any frame failed + * with -ENOENT. + * + * In this first pass the error code is saved by + * overwriting the mfn and an error is indicated in + * st->err. + * + * The second pass by mmap_return_errors() will write + * the error codes to user space and get the right + * ioctl return value. 
+ */ + *(int *)mfnp = ret; + st->err = ret; } st->va += PAGE_SIZE; @@ -270,16 +289,33 @@ static int mmap_return_errors(void *data, void *state) { xen_pfn_t *mfnp = data; struct mmap_batch_state *st = state; + int ret; + + if (st->user_err) { + int err = *(int *)mfnp; + + if (err == -ENOENT) + st->err = err; - return put_user(*mfnp, st->user++); + return __put_user(err, st->user_err++); + } else { + xen_pfn_t mfn; + + ret = __get_user(mfn, st->user_mfn); + if (ret < 0) + return ret; + + mfn |= PRIVCMD_MMAPBATCH_MFN_ERROR; + return __put_user(mfn, st->user_mfn++); + } } static struct vm_operations_struct privcmd_vm_ops; -static long privcmd_ioctl_mmap_batch(void __user *udata) +static long privcmd_ioctl_mmap_batch(void __user *udata, int version) { int ret; - struct privcmd_mmapbatch m; + struct privcmd_mmapbatch_v2 m; struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long nr_pages; @@ -289,15 +325,31 @@ static long privcmd_ioctl_mmap_batch(void __user *udata) if (!xen_initial_domain()) return -EPERM; - if (copy_from_user(&m, udata, sizeof(m))) - return -EFAULT; + switch (version) { + case 1: + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch))) + return -EFAULT; + /* Returns per-frame error in m.arr. */ + m.err = NULL; + if (!access_ok(VERIFY_WRITE, m.arr, m.num * sizeof(*m.arr))) + return -EFAULT; + break; + case 2: + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch_v2))) + return -EFAULT; + /* Returns per-frame error code in m.err. 
*/ + if (!access_ok(VERIFY_WRITE, m.err, m.num * (sizeof(*m.err)))) + return -EFAULT; + break; + default: + return -EINVAL; + } nr_pages = m.num; if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) return -EINVAL; - ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), - m.arr); + ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), m.arr); if (ret || list_empty(&pagelist)) goto out; @@ -325,12 +377,17 @@ static long privcmd_ioctl_mmap_batch(void __user *udata) up_write(&mm->mmap_sem); - if (state.err > 0) { - state.user = m.arr; + if (state.err) { + state.err = 0; + state.user_mfn = (xen_pfn_t *)m.arr; + state.user_err = m.err; ret = traverse_pages(m.num, sizeof(xen_pfn_t), - &pagelist, - mmap_return_errors, &state); - } + &pagelist, + mmap_return_errors, &state); + if (ret >= 0) + ret = state.err; + } else if (m.err) + __clear_user(m.err, m.num * sizeof(*m.err)); out: free_page_list(&pagelist); @@ -354,7 +411,11 @@ static long privcmd_ioctl(struct file *file, break; case IOCTL_PRIVCMD_MMAPBATCH: - ret = privcmd_ioctl_mmap_batch(udata); + ret = privcmd_ioctl_mmap_batch(udata, 1); + break; + + case IOCTL_PRIVCMD_MMAPBATCH_V2: + ret = privcmd_ioctl_mmap_batch(udata, 2); break; default: diff --git a/include/xen/privcmd.h b/include/xen/privcmd.h index 17857fb..f60d75c 100644 --- a/include/xen/privcmd.h +++ b/include/xen/privcmd.h @@ -59,13 +59,32 @@ struct privcmd_mmapbatch { int num; /* number of pages to populate */ domid_t dom; /* target domain */ __u64 addr; /* virtual address */ - xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */ + xen_pfn_t __user *arr; /* array of mfns - or''d with + PRIVCMD_MMAPBATCH_MFN_ERROR on err */ +}; + +#define PRIVCMD_MMAPBATCH_MFN_ERROR 0xf0000000U + +struct privcmd_mmapbatch_v2 { + unsigned int num; /* number of pages to populate */ + domid_t dom; /* target domain */ + __u64 addr; /* virtual address */ + const xen_pfn_t __user *arr; /* array of mfns */ + int __user *err; /* array of error codes */ }; /* * 
@cmd: IOCTL_PRIVCMD_HYPERCALL * @arg: &privcmd_hypercall_t * Return: Value returned from execution of the specified hypercall. + * + * @cmd: IOCTL_PRIVCMD_MMAPBATCH_V2 + * @arg: &struct privcmd_mmapbatch_v2 + * Return: 0 on success (i.e., arg->err contains valid error codes for + * each frame). On an error other than a failed frame remap, -1 is + * returned and errno is set to EINVAL, EFAULT etc. As an exception, + * if the operation was otherwise successful but any frame failed with + * -ENOENT, then -1 is returned and errno is set to ENOENT. */ #define IOCTL_PRIVCMD_HYPERCALL \ _IOC(_IOC_NONE, ''P'', 0, sizeof(struct privcmd_hypercall)) @@ -73,5 +92,7 @@ struct privcmd_mmapbatch { _IOC(_IOC_NONE, ''P'', 2, sizeof(struct privcmd_mmap)) #define IOCTL_PRIVCMD_MMAPBATCH \ _IOC(_IOC_NONE, ''P'', 3, sizeof(struct privcmd_mmapbatch)) +#define IOCTL_PRIVCMD_MMAPBATCH_V2 \ + _IOC(_IOC_NONE, ''P'', 4, sizeof(struct privcmd_mmapbatch_v2)) #endif /* __LINUX_PUBLIC_PRIVCMD_H__ */ -- 1.7.2.5
Andres Lagar-Cavilla
2012-Aug-30 15:07 UTC
Re: [PATCH 1/2] xen/mm: return more precise error from xen_remap_domain_range()
On Aug 30, 2012, at 8:58 AM, David Vrabel wrote:> From: David Vrabel <david.vrabel@citrix.com> > > Callers of xen_remap_domain_range() need to know if the remap failed > because frame is currently paged out. So they can retry the remap > later on. Return -ENOENT in this case. > > This assumes that the error codes returned by Xen are a subset of > those used by the kernel. It is unclear if this is defined as part of > the hypercall ABI. > > Signed-off-by: David Vrabel <david.vrabel@citrix.com>Acked-by: Andres Lagar-Cavilla <andres@lagarcavilla.org> Thanks, Andres> --- > arch/x86/xen/mmu.c | 4 ++-- > 1 files changed, 2 insertions(+), 2 deletions(-) > > diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c > index b65a761..fb187ea 100644 > --- a/arch/x86/xen/mmu.c > +++ b/arch/x86/xen/mmu.c > @@ -2355,8 +2355,8 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma, > if (err) > goto out; > > - err = -EFAULT; > - if (HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0) > + err = HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid); > + if (err < 0) > goto out; > > nr -= batch; > -- > 1.7.2.5 >
Andres Lagar-Cavilla
2012-Aug-30 16:41 UTC
Re: [PATCH 2/2] xen/privcmd: add PRIVCMD_MMAPBATCH_V2 ioctl
David, The patch looks functionally ok, but I still have two lingering concerns: - the hideous casting of mfn into err - why not signal paged out frames for V1 Rather than keep writing English, I wrote some C :) And took the liberty to include your signed-off. David & Konrad, let me know what you think, and once we settle on either version we can move into unit testing this. Thanks Andres commit 3c0c619f11a26b7bc3f12a1c477cf969c25de231 Author: Andres Lagar-Cavilla <andres@lagarcavilla.org> Date: Thu Aug 30 12:23:33 2012 -0400 xen/privcmd: add PRIVCMD_MMAPBATCH_V2 ioctl PRIVCMD_MMAPBATCH_V2 extends PRIVCMD_MMAPBATCH with an additional field for reporting the error code for every frame that could not be mapped. libxc prefers PRIVCMD_MMAPBATCH_V2 over PRIVCMD_MMAPBATCH. Also expand PRIVCMD_MMAPBATCH to return appropriate error-encoding top nibble in the mfn array. Signed-off-by: David Vrabel <david.vrabel@citrix.com> Signed-off-by: Andres Lagar-Cavilla <andres@lagarcavilla.org> diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 85226cb..6562e29 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -76,7 +76,7 @@ static void free_page_list(struct list_head *pages) */ static int gather_array(struct list_head *pagelist, unsigned nelem, size_t size, - void __user *data) + const void __user *data) { unsigned pageidx; void *pagedata; @@ -246,20 +246,54 @@ struct mmap_batch_state { domid_t domain; unsigned long va; struct vm_area_struct *vma; + /* A tristate: + * 0 for no errors + * 1 if at least one error has happened (and no + * -ENOENT errors have happened) + * -ENOENT if at least 1 -ENOENT has happened. 
+ */ int err; - xen_pfn_t __user *user; + xen_pfn_t __user *user_mfn; + int __user *user_err; }; static int mmap_batch_fn(void *data, void *state) { xen_pfn_t *mfnp = data; struct mmap_batch_state *st = state; + int ret; + + ret = xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, + st->vma->vm_page_prot, st->domain); + if (ret < 0) { + /* + * V2 provides a user-space (pre-checked for access) user_err + * pointer, in which we store the individual map error codes. + * + * V1 encodes the error codes in the 32bit top nibble of the + * mfn (with its known limitations vis-a-vis 64 bit callers). + * + * In either case, global state.err is zero unless one or more + * individual maps fail with -ENOENT, in which case it is -ENOENT. + * + */ + if (st->user_err) + BUG_ON(__put_user(ret, st->user_err++)); + else { + xen_pfn_t nibble = (ret == -ENOENT) ? + PRIVCMD_MMAPBATCH_PAGED_ERROR : + PRIVCMD_MMAPBATCH_MFN_ERROR; + *mfnp |= nibble; + } - if (xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, - st->vma->vm_page_prot, st->domain) < 0) { - *mfnp |= 0xf0000000U; - st->err++; + if (ret == -ENOENT) + st->err = -ENOENT; + else { + /* Record that at least one error has happened. 
*/ + if (st->err == 0) + st->err = 1; + } } st->va += PAGE_SIZE; @@ -271,15 +305,18 @@ static int mmap_return_errors(void *data, void *state) xen_pfn_t *mfnp = data; struct mmap_batch_state *st = state; - return put_user(*mfnp, st->user++); + if (st->user_err == NULL) + return __put_user(*mfnp, st->user_mfn++); + + return 0; } static struct vm_operations_struct privcmd_vm_ops; -static long privcmd_ioctl_mmap_batch(void __user *udata) +static long privcmd_ioctl_mmap_batch(void __user *udata, int version) { int ret; - struct privcmd_mmapbatch m; + struct privcmd_mmapbatch_v2 m; struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long nr_pages; @@ -289,15 +326,31 @@ static long privcmd_ioctl_mmap_batch(void __user *udata) if (!xen_initial_domain()) return -EPERM; - if (copy_from_user(&m, udata, sizeof(m))) - return -EFAULT; + switch (version) { + case 1: + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch))) + return -EFAULT; + /* Returns per-frame error in m.arr. */ + m.err = NULL; + if (!access_ok(VERIFY_WRITE, m.arr, m.num * sizeof(*m.arr))) + return -EFAULT; + break; + case 2: + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch_v2))) + return -EFAULT; + /* Returns per-frame error code in m.err. 
*/ + if (!access_ok(VERIFY_WRITE, m.err, m.num * (sizeof(*m.err)))) + return -EFAULT; + break; + default: + return -EINVAL; + } nr_pages = m.num; if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) return -EINVAL; - ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), - m.arr); + ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), m.arr); if (ret || list_empty(&pagelist)) goto out; @@ -315,22 +368,34 @@ static long privcmd_ioctl_mmap_batch(void __user *udata) goto out; } - state.domain = m.dom; - state.vma = vma; - state.va = m.addr; - state.err = 0; + state.domain = m.dom; + state.vma = vma; + state.va = m.addr; + state.err = 0; + state.user_err = m.err; - ret = traverse_pages(m.num, sizeof(xen_pfn_t), - &pagelist, mmap_batch_fn, &state); + /* mmap_batch_fn guarantees ret == 0 */ + BUG_ON(traverse_pages(m.num, sizeof(xen_pfn_t), + &pagelist, mmap_batch_fn, &state)); up_write(&mm->mmap_sem); - if (state.err > 0) { - state.user = m.arr; - ret = traverse_pages(m.num, sizeof(xen_pfn_t), - &pagelist, - mmap_return_errors, &state); - } + if (state.err) { + if (state.err == -ENOENT) + ret = -ENOENT; + /* V1 still needs to write back nibbles. 
*/ + if (m.err == NULL) + { + int efault; + state.user_mfn = (xen_pfn_t *)m.arr; + efault = traverse_pages(m.num, sizeof(xen_pfn_t), + &pagelist, + mmap_return_errors, &state); + if (efault) + ret = efault; + } + } else if (m.err) + __clear_user(m.err, m.num * sizeof(*m.err)); out: free_page_list(&pagelist); @@ -354,7 +419,11 @@ static long privcmd_ioctl(struct file *file, break; case IOCTL_PRIVCMD_MMAPBATCH: - ret = privcmd_ioctl_mmap_batch(udata); + ret = privcmd_ioctl_mmap_batch(udata, 1); + break; + + case IOCTL_PRIVCMD_MMAPBATCH_V2: + ret = privcmd_ioctl_mmap_batch(udata, 2); break; default: diff --git a/include/xen/privcmd.h b/include/xen/privcmd.h index 45c1aa1..a853168 100644 --- a/include/xen/privcmd.h +++ b/include/xen/privcmd.h @@ -58,13 +58,33 @@ struct privcmd_mmapbatch { int num; /* number of pages to populate */ domid_t dom; /* target domain */ __u64 addr; /* virtual address */ - xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */ + xen_pfn_t __user *arr; /* array of mfns - or''d with + PRIVCMD_MMAPBATCH_*_ERROR on err */ +}; + +#define PRIVCMD_MMAPBATCH_MFN_ERROR 0xf0000000U +#define PRIVCMD_MMAPBATCH_PAGED_ERROR 0x80000000U + +struct privcmd_mmapbatch_v2 { + unsigned int num; /* number of pages to populate */ + domid_t dom; /* target domain */ + __u64 addr; /* virtual address */ + const xen_pfn_t __user *arr; /* array of mfns */ + int __user *err; /* array of error codes */ }; /* * @cmd: IOCTL_PRIVCMD_HYPERCALL * @arg: &privcmd_hypercall_t * Return: Value returned from execution of the specified hypercall. + * + * @cmd: IOCTL_PRIVCMD_MMAPBATCH_V2 + * @arg: &struct privcmd_mmapbatch_v2 + * Return: 0 on success (i.e., arg->err contains valid error codes for + * each frame). On an error other than a failed frame remap, -1 is + * returned and errno is set to EINVAL, EFAULT etc. As an exception, + * if the operation was otherwise successful but any frame failed with + * -ENOENT, then -1 is returned and errno is set to ENOENT. 
*/ #define IOCTL_PRIVCMD_HYPERCALL \ _IOC(_IOC_NONE, ''P'', 0, sizeof(struct privcmd_hypercall)) @@ -72,5 +92,7 @@ struct privcmd_mmapbatch { _IOC(_IOC_NONE, ''P'', 2, sizeof(struct privcmd_mmap)) #define IOCTL_PRIVCMD_MMAPBATCH \ _IOC(_IOC_NONE, ''P'', 3, sizeof(struct privcmd_mmapbatch)) +#define IOCTL_PRIVCMD_MMAPBATCH_V2 \ + _IOC(_IOC_NONE, ''P'', 4, sizeof(struct privcmd_mmapbatch_v2)) #endif /* __LINUX_PUBLIC_PRIVCMD_H__ */ On Aug 30, 2012, at 8:58 AM, David Vrabel wrote:> From: David Vrabel <david.vrabel@citrix.com> > > PRIVCMD_MMAPBATCH_V2 extends PRIVCMD_MMAPBATCH with an additional > field for reporting the error code for every frame that could not be > mapped. libxc prefers PRIVCMD_MMAPBATCH_V2 over PRIVCMD_MMAPBATCH. > > Signed-off-by: David Vrabel <david.vrabel@citrix.com> > --- > drivers/xen/privcmd.c | 99 +++++++++++++++++++++++++++++++++++++++--------- > include/xen/privcmd.h | 23 +++++++++++- > 2 files changed, 102 insertions(+), 20 deletions(-) > > diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c > index ccee0f1..c0e89e7 100644 > --- a/drivers/xen/privcmd.c > +++ b/drivers/xen/privcmd.c > @@ -76,7 +76,7 @@ static void free_page_list(struct list_head *pages) > */ > static int gather_array(struct list_head *pagelist, > unsigned nelem, size_t size, > - void __user *data) > + const void __user *data) > { > unsigned pageidx; > void *pagedata; > @@ -248,18 +248,37 @@ struct mmap_batch_state { > struct vm_area_struct *vma; > int err; > > - xen_pfn_t __user *user; > + xen_pfn_t __user *user_mfn; > + int __user *user_err; > }; > > static int mmap_batch_fn(void *data, void *state) > { > xen_pfn_t *mfnp = data; > struct mmap_batch_state *st = state; > + int ret; > > - if (xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, > - st->vma->vm_page_prot, st->domain) < 0) { > - *mfnp |= 0xf0000000U; > - st->err++; > + ret = xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, > + st->vma->vm_page_prot, st->domain); > + if (ret 
< 0) { > + /* > + * Error reporting is a mess but userspace relies on > + * it behaving this way. > + * > + * V2 needs to a) return the result of each frame''s > + * remap; and b) return -ENOENT if any frame failed > + * with -ENOENT. > + * > + * In this first pass the error code is saved by > + * overwriting the mfn and an error is indicated in > + * st->err. > + * > + * The second pass by mmap_return_errors() will write > + * the error codes to user space and get the right > + * ioctl return value. > + */ > + *(int *)mfnp = ret; > + st->err = ret; > } > st->va += PAGE_SIZE; > > @@ -270,16 +289,33 @@ static int mmap_return_errors(void *data, void *state) > { > xen_pfn_t *mfnp = data; > struct mmap_batch_state *st = state; > + int ret; > + > + if (st->user_err) { > + int err = *(int *)mfnp; > + > + if (err == -ENOENT) > + st->err = err; > > - return put_user(*mfnp, st->user++); > + return __put_user(err, st->user_err++); > + } else { > + xen_pfn_t mfn; > + > + ret = __get_user(mfn, st->user_mfn); > + if (ret < 0) > + return ret; > + > + mfn |= PRIVCMD_MMAPBATCH_MFN_ERROR; > + return __put_user(mfn, st->user_mfn++); > + } > } > > static struct vm_operations_struct privcmd_vm_ops; > > -static long privcmd_ioctl_mmap_batch(void __user *udata) > +static long privcmd_ioctl_mmap_batch(void __user *udata, int version) > { > int ret; > - struct privcmd_mmapbatch m; > + struct privcmd_mmapbatch_v2 m; > struct mm_struct *mm = current->mm; > struct vm_area_struct *vma; > unsigned long nr_pages; > @@ -289,15 +325,31 @@ static long privcmd_ioctl_mmap_batch(void __user *udata) > if (!xen_initial_domain()) > return -EPERM; > > - if (copy_from_user(&m, udata, sizeof(m))) > - return -EFAULT; > + switch (version) { > + case 1: > + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch))) > + return -EFAULT; > + /* Returns per-frame error in m.arr. 
*/ > + m.err = NULL; > + if (!access_ok(VERIFY_WRITE, m.arr, m.num * sizeof(*m.arr))) > + return -EFAULT; > + break; > + case 2: > + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch_v2))) > + return -EFAULT; > + /* Returns per-frame error code in m.err. */ > + if (!access_ok(VERIFY_WRITE, m.err, m.num * (sizeof(*m.err)))) > + return -EFAULT; > + break; > + default: > + return -EINVAL; > + } > > nr_pages = m.num; > if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) > return -EINVAL; > > - ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), > - m.arr); > + ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), m.arr); > > if (ret || list_empty(&pagelist)) > goto out; > @@ -325,12 +377,17 @@ static long privcmd_ioctl_mmap_batch(void __user *udata) > > up_write(&mm->mmap_sem); > > - if (state.err > 0) { > - state.user = m.arr; > + if (state.err) { > + state.err = 0; > + state.user_mfn = (xen_pfn_t *)m.arr; > + state.user_err = m.err; > ret = traverse_pages(m.num, sizeof(xen_pfn_t), > - &pagelist, > - mmap_return_errors, &state); > - } > + &pagelist, > + mmap_return_errors, &state); > + if (ret >= 0) > + ret = state.err; > + } else if (m.err) > + __clear_user(m.err, m.num * sizeof(*m.err)); > > out: > free_page_list(&pagelist); > @@ -354,7 +411,11 @@ static long privcmd_ioctl(struct file *file, > break; > > case IOCTL_PRIVCMD_MMAPBATCH: > - ret = privcmd_ioctl_mmap_batch(udata); > + ret = privcmd_ioctl_mmap_batch(udata, 1); > + break; > + > + case IOCTL_PRIVCMD_MMAPBATCH_V2: > + ret = privcmd_ioctl_mmap_batch(udata, 2); > break; > > default: > diff --git a/include/xen/privcmd.h b/include/xen/privcmd.h > index 17857fb..f60d75c 100644 > --- a/include/xen/privcmd.h > +++ b/include/xen/privcmd.h > @@ -59,13 +59,32 @@ struct privcmd_mmapbatch { > int num; /* number of pages to populate */ > domid_t dom; /* target domain */ > __u64 addr; /* virtual address */ > - xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */ > + xen_pfn_t __user 
*arr; /* array of mfns - or''d with > + PRIVCMD_MMAPBATCH_MFN_ERROR on err */ > +}; > + > +#define PRIVCMD_MMAPBATCH_MFN_ERROR 0xf0000000U > + > +struct privcmd_mmapbatch_v2 { > + unsigned int num; /* number of pages to populate */ > + domid_t dom; /* target domain */ > + __u64 addr; /* virtual address */ > + const xen_pfn_t __user *arr; /* array of mfns */ > + int __user *err; /* array of error codes */ > }; > > /* > * @cmd: IOCTL_PRIVCMD_HYPERCALL > * @arg: &privcmd_hypercall_t > * Return: Value returned from execution of the specified hypercall. > + * > + * @cmd: IOCTL_PRIVCMD_MMAPBATCH_V2 > + * @arg: &struct privcmd_mmapbatch_v2 > + * Return: 0 on success (i.e., arg->err contains valid error codes for > + * each frame). On an error other than a failed frame remap, -1 is > + * returned and errno is set to EINVAL, EFAULT etc. As an exception, > + * if the operation was otherwise successful but any frame failed with > + * -ENOENT, then -1 is returned and errno is set to ENOENT. > */ > #define IOCTL_PRIVCMD_HYPERCALL \ > _IOC(_IOC_NONE, ''P'', 0, sizeof(struct privcmd_hypercall)) > @@ -73,5 +92,7 @@ struct privcmd_mmapbatch { > _IOC(_IOC_NONE, ''P'', 2, sizeof(struct privcmd_mmap)) > #define IOCTL_PRIVCMD_MMAPBATCH \ > _IOC(_IOC_NONE, ''P'', 3, sizeof(struct privcmd_mmapbatch)) > +#define IOCTL_PRIVCMD_MMAPBATCH_V2 \ > + _IOC(_IOC_NONE, ''P'', 4, sizeof(struct privcmd_mmapbatch_v2)) > > #endif /* __LINUX_PUBLIC_PRIVCMD_H__ */ > -- > 1.7.2.5 >
David Vrabel
2012-Aug-30 17:04 UTC
Re: [PATCH 2/2] xen/privcmd: add PRIVCMD_MMAPBATCH_V2 ioctl
On 30/08/12 17:41, Andres Lagar-Cavilla wrote:> David, > The patch looks functionally ok, but I still have two lingering concerns: > - the hideous casting of mfn into errI considered couple of other approaches (unions, extending gather_array() to add gaps for the int return). They were all worse. I also tried your proposal here but it doesn''t work. See below.> - why not signal paged out frames for V1Because V1 is broken on 64bit and there doesn''t seem to be any point in changing it given that libxc won''t call it if V2 is present,> Rather than keep writing English, I wrote some C :) > > And took the liberty to include your signed-off. David & Konrad, let > me know what you think, and once we settle on either version we can move > into unit testing this.[...]> static int mmap_batch_fn(void *data, void *state) > { > xen_pfn_t *mfnp = data; > struct mmap_batch_state *st = state; > + int ret; > + > + ret = xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, > + st->vma->vm_page_prot, st->domain); > + if (ret < 0) { > + /* > + * V2 provides a user-space (pre-checked for access) user_err > + * pointer, in which we store the individual map error codes. > + * > + * V1 encodes the error codes in the 32bit top nibble of the > + * mfn (with its known limitations vis-a-vis 64 bit callers). > + * > + * In either case, global state.err is zero unless one or more > + * individual maps fail with -ENOENT, in which case it is -ENOENT. > + * > + */ > + if (st->user_err) > + BUG_ON(__put_user(ret, st->user_err++));You can''t access user space pages here while holding current->mm->mmap_sem. I tried this and it would sometimes deadlock in the page fault handler. access_ok() only checks if the pointer is in the user space virtual address space - not that a valid mapping exists and is writable. So BUG_ON(__put_user()) should not be done. David
Andres Lagar-Cavilla
2012-Aug-30 18:29 UTC
Re: [PATCH 2/2] xen/privcmd: add PRIVCMD_MMAPBATCH_V2 ioctl
On Aug 30, 2012, at 1:04 PM, David Vrabel wrote:> On 30/08/12 17:41, Andres Lagar-Cavilla wrote: >> David, >> The patch looks functionally ok, but I still have two lingering concerns: >> - the hideous casting of mfn into err > > I considered couple of other approaches (unions, extending > gather_array() to add gaps for the int return). They were all worse. > > I also tried your proposal here but it doesn''t work. See below. > >> - why not signal paged out frames for V1 > > Because V1 is broken on 64bit and there doesn''t seem to be any point in > changing it given that libxc won''t call it if V2 is present, > >> Rather than keep writing English, I wrote some C :) >> >> And took the liberty to include your signed-off. David & Konrad, let >> me know what you think, and once we settle on either version we can move >> into unit testing this. > [...] >> static int mmap_batch_fn(void *data, void *state) >> { >> xen_pfn_t *mfnp = data; >> struct mmap_batch_state *st = state; >> + int ret; >> + >> + ret = xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, >> + st->vma->vm_page_prot, st->domain); >> + if (ret < 0) { >> + /* >> + * V2 provides a user-space (pre-checked for access) user_err >> + * pointer, in which we store the individual map error codes. >> + * >> + * V1 encodes the error codes in the 32bit top nibble of the >> + * mfn (with its known limitations vis-a-vis 64 bit callers). >> + * >> + * In either case, global state.err is zero unless one or more >> + * individual maps fail with -ENOENT, in which case it is -ENOENT. >> + * >> + */ >> + if (st->user_err) >> + BUG_ON(__put_user(ret, st->user_err++)); > > You can''t access user space pages here while holding > current->mm->mmap_sem. I tried this and it would sometimes deadlock in > the page fault handler. > > access_ok() only checks if the pointer is in the user space virtual > address space - not that a valid mapping exists and is writable. So > BUG_ON(__put_user()) should not be done.Very true. 
Thanks for the pointer. Clearly the reason for the gather_array/traverse_pages structure. Re-posting my version Andres> > David
Andres Lagar-Cavilla
2012-Aug-30 18:32 UTC
Re: [PATCH 2/2] xen/privcmd: add PRIVCMD_MMAPBATCH_V2 ioctl
Second repost of my version, heavily based on David''s. Complementary to this patch, on the xen tree I intend to add PRIVCMD_MMAPBATCH_*_ERROR into the libxc header files, and remove XEN_DOMCTL_PFINFO_PAGEDTAB from domctl.h Please review. Thanks Andres commit 3f40e8d79b7e032527ee207a97499ddbc81ca12b Author: Andres Lagar-Cavilla <andres@lagarcavilla.org> Date: Thu Aug 30 12:23:33 2012 -0400 xen/privcmd: add PRIVCMD_MMAPBATCH_V2 ioctl PRIVCMD_MMAPBATCH_V2 extends PRIVCMD_MMAPBATCH with an additional field for reporting the error code for every frame that could not be mapped. libxc prefers PRIVCMD_MMAPBATCH_V2 over PRIVCMD_MMAPBATCH. Also expand PRIVCMD_MMAPBATCH to return appropriate error-encoding top nibble in the mfn array. Signed-off-by: David Vrabel <david.vrabel@citrix.com> Signed-off-by: Andres Lagar-Cavilla <andres@lagarcavilla.org> diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 85226cb..5a03dc1 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -76,7 +76,7 @@ static void free_page_list(struct list_head *pages) */ static int gather_array(struct list_head *pagelist, unsigned nelem, size_t size, - void __user *data) + const void __user *data) { unsigned pageidx; void *pagedata; @@ -246,20 +246,42 @@ struct mmap_batch_state { domid_t domain; unsigned long va; struct vm_area_struct *vma; - int err; - - xen_pfn_t __user *user; + /* A tristate: + * 0 for no errors + * 1 if at least one error has happened (and no + * -ENOENT errors have happened) + * -ENOENT if at least 1 -ENOENT has happened. + */ + int global_error; + /* An array for individual errors */ + int *err; + + /* User-space pointers to store errors in the second pass. 
*/ + xen_pfn_t __user *user_mfn; + int __user *user_err; }; static int mmap_batch_fn(void *data, void *state) { xen_pfn_t *mfnp = data; struct mmap_batch_state *st = state; + int ret; - if (xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, - st->vma->vm_page_prot, st->domain) < 0) { - *mfnp |= 0xf0000000U; - st->err++; + ret = xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, + st->vma->vm_page_prot, st->domain); + + /* Store error code for second pass. */ + *(st->err++) = ret; + + /* And see if it affects the global global_error. */ + if (ret < 0) { + if (ret == -ENOENT) + st->global_error = -ENOENT; + else { + /* Record that at least one error has happened. */ + if (st->global_error == 0) + st->global_error = 1; + } } st->va += PAGE_SIZE; @@ -270,37 +292,76 @@ static int mmap_return_errors(void *data, void *state) { xen_pfn_t *mfnp = data; struct mmap_batch_state *st = state; - - return put_user(*mfnp, st->user++); + int err = *(st->err++); + + /* + * V2 provides a user-space user_err pointer, in which we store the + * individual map error codes. + */ + if (st->user_err) + return __put_user(err, st->user_err++); + + /* + * V1 encodes the error codes in the 32bit top nibble of the + * mfn (with its known limitations vis-a-vis 64 bit callers). + */ + *mfnp |= (err == -ENOENT) ? 
+ PRIVCMD_MMAPBATCH_PAGED_ERROR : + PRIVCMD_MMAPBATCH_MFN_ERROR; + return __put_user(*mfnp, st->user_mfn++); } static struct vm_operations_struct privcmd_vm_ops; -static long privcmd_ioctl_mmap_batch(void __user *udata) +static long privcmd_ioctl_mmap_batch(void __user *udata, int version) { int ret; - struct privcmd_mmapbatch m; + struct privcmd_mmapbatch_v2 m; struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long nr_pages; LIST_HEAD(pagelist); + int *err_array; struct mmap_batch_state state; if (!xen_initial_domain()) return -EPERM; - if (copy_from_user(&m, udata, sizeof(m))) - return -EFAULT; + switch (version) { + case 1: + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch))) + return -EFAULT; + /* Returns per-frame error in m.arr. */ + m.err = NULL; + if (!access_ok(VERIFY_WRITE, m.arr, m.num * sizeof(*m.arr))) + return -EFAULT; + break; + case 2: + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch_v2))) + return -EFAULT; + /* Returns per-frame error code in m.err. 
*/ + if (!access_ok(VERIFY_WRITE, m.err, m.num * (sizeof(*m.err)))) + return -EFAULT; + break; + default: + return -EINVAL; + } nr_pages = m.num; if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) return -EINVAL; - ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), - m.arr); + ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), m.arr); if (ret || list_empty(&pagelist)) - goto out; + goto out_no_err_array; + + err_array = kmalloc_array(m.num, sizeof(int), GFP_KERNEL); + if (err_array == NULL) + { + ret = -ENOMEM; + goto out_no_err_array; + } down_write(&mm->mmap_sem); @@ -315,24 +376,38 @@ static long privcmd_ioctl_mmap_batch(void __user *udata) goto out; } - state.domain = m.dom; - state.vma = vma; - state.va = m.addr; - state.err = 0; + state.domain = m.dom; + state.vma = vma; + state.va = m.addr; + state.global_error = 0; + state.err = err_array; - ret = traverse_pages(m.num, sizeof(xen_pfn_t), - &pagelist, mmap_batch_fn, &state); + /* mmap_batch_fn guarantees ret == 0 */ + BUG_ON(traverse_pages(m.num, sizeof(xen_pfn_t), + &pagelist, mmap_batch_fn, &state)); up_write(&mm->mmap_sem); - if (state.err > 0) { - state.user = m.arr; - ret = traverse_pages(m.num, sizeof(xen_pfn_t), - &pagelist, - mmap_return_errors, &state); - } + if (state.global_error) { + int efault; + + if (state.global_error == -ENOENT) + ret = -ENOENT; + + /* Write back errors in second pass. 
*/ + state.user_mfn = (xen_pfn_t *)m.arr; + state.user_err = m.err; + state.err = err_array; + efault = traverse_pages(m.num, sizeof(xen_pfn_t), + &pagelist, mmap_return_errors, &state); + if (efault) + ret = efault; + } else if (m.err) + ret = __clear_user(m.err, m.num * sizeof(*m.err)); out: + kfree(err_array); +out_no_err_array: free_page_list(&pagelist); return ret; @@ -354,7 +429,11 @@ static long privcmd_ioctl(struct file *file, break; case IOCTL_PRIVCMD_MMAPBATCH: - ret = privcmd_ioctl_mmap_batch(udata); + ret = privcmd_ioctl_mmap_batch(udata, 1); + break; + + case IOCTL_PRIVCMD_MMAPBATCH_V2: + ret = privcmd_ioctl_mmap_batch(udata, 2); break; default: diff --git a/include/xen/privcmd.h b/include/xen/privcmd.h index 45c1aa1..a853168 100644 --- a/include/xen/privcmd.h +++ b/include/xen/privcmd.h @@ -58,13 +58,33 @@ struct privcmd_mmapbatch { int num; /* number of pages to populate */ domid_t dom; /* target domain */ __u64 addr; /* virtual address */ - xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */ + xen_pfn_t __user *arr; /* array of mfns - or''d with + PRIVCMD_MMAPBATCH_*_ERROR on err */ +}; + +#define PRIVCMD_MMAPBATCH_MFN_ERROR 0xf0000000U +#define PRIVCMD_MMAPBATCH_PAGED_ERROR 0x80000000U + +struct privcmd_mmapbatch_v2 { + unsigned int num; /* number of pages to populate */ + domid_t dom; /* target domain */ + __u64 addr; /* virtual address */ + const xen_pfn_t __user *arr; /* array of mfns */ + int __user *err; /* array of error codes */ }; /* * @cmd: IOCTL_PRIVCMD_HYPERCALL * @arg: &privcmd_hypercall_t * Return: Value returned from execution of the specified hypercall. + * + * @cmd: IOCTL_PRIVCMD_MMAPBATCH_V2 + * @arg: &struct privcmd_mmapbatch_v2 + * Return: 0 on success (i.e., arg->err contains valid error codes for + * each frame). On an error other than a failed frame remap, -1 is + * returned and errno is set to EINVAL, EFAULT etc. 
As an exception, + * if the operation was otherwise successful but any frame failed with + * -ENOENT, then -1 is returned and errno is set to ENOENT. */ #define IOCTL_PRIVCMD_HYPERCALL \ _IOC(_IOC_NONE, ''P'', 0, sizeof(struct privcmd_hypercall)) @@ -72,5 +92,7 @@ struct privcmd_mmapbatch { _IOC(_IOC_NONE, ''P'', 2, sizeof(struct privcmd_mmap)) #define IOCTL_PRIVCMD_MMAPBATCH \ _IOC(_IOC_NONE, ''P'', 3, sizeof(struct privcmd_mmapbatch)) +#define IOCTL_PRIVCMD_MMAPBATCH_V2 \ + _IOC(_IOC_NONE, ''P'', 4, sizeof(struct privcmd_mmapbatch_v2)) #endif /* __LINUX_PUBLIC_PRIVCMD_H__ */ On Aug 30, 2012, at 8:58 AM, David Vrabel wrote:> From: David Vrabel <david.vrabel@citrix.com> > > PRIVCMD_MMAPBATCH_V2 extends PRIVCMD_MMAPBATCH with an additional > field for reporting the error code for every frame that could not be > mapped. libxc prefers PRIVCMD_MMAPBATCH_V2 over PRIVCMD_MMAPBATCH. > > Signed-off-by: David Vrabel <david.vrabel@citrix.com> > --- > drivers/xen/privcmd.c | 99 +++++++++++++++++++++++++++++++++++++++--------- > include/xen/privcmd.h | 23 +++++++++++- > 2 files changed, 102 insertions(+), 20 deletions(-) > > diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c > index ccee0f1..c0e89e7 100644 > --- a/drivers/xen/privcmd.c > +++ b/drivers/xen/privcmd.c > @@ -76,7 +76,7 @@ static void free_page_list(struct list_head *pages) > */ > static int gather_array(struct list_head *pagelist, > unsigned nelem, size_t size, > - void __user *data) > + const void __user *data) > { > unsigned pageidx; > void *pagedata; > @@ -248,18 +248,37 @@ struct mmap_batch_state { > struct vm_area_struct *vma; > int err; > > - xen_pfn_t __user *user; > + xen_pfn_t __user *user_mfn; > + int __user *user_err; > }; > > static int mmap_batch_fn(void *data, void *state) > { > xen_pfn_t *mfnp = data; > struct mmap_batch_state *st = state; > + int ret; > > - if (xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, > - st->vma->vm_page_prot, st->domain) < 0) { > - *mfnp |= 
0xf0000000U; > - st->err++; > + ret = xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, > + st->vma->vm_page_prot, st->domain); > + if (ret < 0) { > + /* > + * Error reporting is a mess but userspace relies on > + * it behaving this way. > + * > + * V2 needs to a) return the result of each frame''s > + * remap; and b) return -ENOENT if any frame failed > + * with -ENOENT. > + * > + * In this first pass the error code is saved by > + * overwriting the mfn and an error is indicated in > + * st->err. > + * > + * The second pass by mmap_return_errors() will write > + * the error codes to user space and get the right > + * ioctl return value. > + */ > + *(int *)mfnp = ret; > + st->err = ret; > } > st->va += PAGE_SIZE; > > @@ -270,16 +289,33 @@ static int mmap_return_errors(void *data, void *state) > { > xen_pfn_t *mfnp = data; > struct mmap_batch_state *st = state; > + int ret; > + > + if (st->user_err) { > + int err = *(int *)mfnp; > + > + if (err == -ENOENT) > + st->err = err; > > - return put_user(*mfnp, st->user++); > + return __put_user(err, st->user_err++); > + } else { > + xen_pfn_t mfn; > + > + ret = __get_user(mfn, st->user_mfn); > + if (ret < 0) > + return ret; > + > + mfn |= PRIVCMD_MMAPBATCH_MFN_ERROR; > + return __put_user(mfn, st->user_mfn++); > + } > } > > static struct vm_operations_struct privcmd_vm_ops; > > -static long privcmd_ioctl_mmap_batch(void __user *udata) > +static long privcmd_ioctl_mmap_batch(void __user *udata, int version) > { > int ret; > - struct privcmd_mmapbatch m; > + struct privcmd_mmapbatch_v2 m; > struct mm_struct *mm = current->mm; > struct vm_area_struct *vma; > unsigned long nr_pages; > @@ -289,15 +325,31 @@ static long privcmd_ioctl_mmap_batch(void __user *udata) > if (!xen_initial_domain()) > return -EPERM; > > - if (copy_from_user(&m, udata, sizeof(m))) > - return -EFAULT; > + switch (version) { > + case 1: > + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch))) > + return -EFAULT; > + /* Returns 
per-frame error in m.arr. */ > + m.err = NULL; > + if (!access_ok(VERIFY_WRITE, m.arr, m.num * sizeof(*m.arr))) > + return -EFAULT; > + break; > + case 2: > + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch_v2))) > + return -EFAULT; > + /* Returns per-frame error code in m.err. */ > + if (!access_ok(VERIFY_WRITE, m.err, m.num * (sizeof(*m.err)))) > + return -EFAULT; > + break; > + default: > + return -EINVAL; > + } > > nr_pages = m.num; > if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) > return -EINVAL; > > - ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), > - m.arr); > + ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), m.arr); > > if (ret || list_empty(&pagelist)) > goto out; > @@ -325,12 +377,17 @@ static long privcmd_ioctl_mmap_batch(void __user *udata) > > up_write(&mm->mmap_sem); > > - if (state.err > 0) { > - state.user = m.arr; > + if (state.err) { > + state.err = 0; > + state.user_mfn = (xen_pfn_t *)m.arr; > + state.user_err = m.err; > ret = traverse_pages(m.num, sizeof(xen_pfn_t), > - &pagelist, > - mmap_return_errors, &state); > - } > + &pagelist, > + mmap_return_errors, &state); > + if (ret >= 0) > + ret = state.err; > + } else if (m.err) > + __clear_user(m.err, m.num * sizeof(*m.err)); > > out: > free_page_list(&pagelist); > @@ -354,7 +411,11 @@ static long privcmd_ioctl(struct file *file, > break; > > case IOCTL_PRIVCMD_MMAPBATCH: > - ret = privcmd_ioctl_mmap_batch(udata); > + ret = privcmd_ioctl_mmap_batch(udata, 1); > + break; > + > + case IOCTL_PRIVCMD_MMAPBATCH_V2: > + ret = privcmd_ioctl_mmap_batch(udata, 2); > break; > > default: > diff --git a/include/xen/privcmd.h b/include/xen/privcmd.h > index 17857fb..f60d75c 100644 > --- a/include/xen/privcmd.h > +++ b/include/xen/privcmd.h > @@ -59,13 +59,32 @@ struct privcmd_mmapbatch { > int num; /* number of pages to populate */ > domid_t dom; /* target domain */ > __u64 addr; /* virtual address */ > - xen_pfn_t __user *arr; /* array of mfns - top nibble set on 
err */ > + xen_pfn_t __user *arr; /* array of mfns - or''d with > + PRIVCMD_MMAPBATCH_MFN_ERROR on err */ > +}; > + > +#define PRIVCMD_MMAPBATCH_MFN_ERROR 0xf0000000U > + > +struct privcmd_mmapbatch_v2 { > + unsigned int num; /* number of pages to populate */ > + domid_t dom; /* target domain */ > + __u64 addr; /* virtual address */ > + const xen_pfn_t __user *arr; /* array of mfns */ > + int __user *err; /* array of error codes */ > }; > > /* > * @cmd: IOCTL_PRIVCMD_HYPERCALL > * @arg: &privcmd_hypercall_t > * Return: Value returned from execution of the specified hypercall. > + * > + * @cmd: IOCTL_PRIVCMD_MMAPBATCH_V2 > + * @arg: &struct privcmd_mmapbatch_v2 > + * Return: 0 on success (i.e., arg->err contains valid error codes for > + * each frame). On an error other than a failed frame remap, -1 is > + * returned and errno is set to EINVAL, EFAULT etc. As an exception, > + * if the operation was otherwise successful but any frame failed with > + * -ENOENT, then -1 is returned and errno is set to ENOENT. > */ > #define IOCTL_PRIVCMD_HYPERCALL \ > _IOC(_IOC_NONE, ''P'', 0, sizeof(struct privcmd_hypercall)) > @@ -73,5 +92,7 @@ struct privcmd_mmapbatch { > _IOC(_IOC_NONE, ''P'', 2, sizeof(struct privcmd_mmap)) > #define IOCTL_PRIVCMD_MMAPBATCH \ > _IOC(_IOC_NONE, ''P'', 3, sizeof(struct privcmd_mmapbatch)) > +#define IOCTL_PRIVCMD_MMAPBATCH_V2 \ > + _IOC(_IOC_NONE, ''P'', 4, sizeof(struct privcmd_mmapbatch_v2)) > > #endif /* __LINUX_PUBLIC_PRIVCMD_H__ */ > -- > 1.7.2.5 >
Konrad Rzeszutek Wilk
2012-Aug-30 20:05 UTC
Re: [PATCHv3 0/2] xen/privcmd: support for paged-out frames
> I think this should probably get a "Tested-by" Andres or someone else
> who uses paging before being applied.

How do I test it? Is there an easy explanation/tutorial Andres?
Andres Lagar-Cavilla
2012-Aug-30 20:12 UTC
Re: [PATCHv3 0/2] xen/privcmd: support for paged-out frames
On Aug 30, 2012, at 4:05 PM, Konrad Rzeszutek Wilk wrote:

>> I think this should probably get a "Tested-by" Andres or someone else
>> who uses paging before being applied.
>
> How do I test it? Is there an easy explanation/tutorial Andres?

I have a unit test lying somewhere but I'll need a day to dig it out. Or you can start an hvm, pause it, fire up xenpaging with debug output so it tells you what it paged out, and then trace libxc as you xc_map_foreign_bulk a known batch of evicted pfns. On the first try it should get rc == -1, errno == ENOENT, error field for the pfn == ENOENT. On the second try it should all be dandy.

Thanks
Andres
Ian Campbell
2012-Aug-31 07:02 UTC
Re: [PATCH 2/2] xen/privcmd: add PRIVCMD_MMAPBATCH_V2 ioctl
On Thu, 2012-08-30 at 19:29 +0100, Andres Lagar-Cavilla wrote:

> On Aug 30, 2012, at 1:04 PM, David Vrabel wrote:
> > You can't access user space pages here while holding
> > current->mm->mmap_sem. I tried this and it would sometimes deadlock in
> > the page fault handler.
> >
> > access_ok() only checks if the pointer is in the user space virtual
> > address space - not that a valid mapping exists and is writable. So
> > BUG_ON(__put_user()) should not be done.
>
> Very true. Thanks for the pointer. Clearly the reason for the gather_array/traverse_pages structure.

/me has flashbacks to the mammoth debugging session which led to
http://xenbits.xen.org/hg/linux-2.6.18-xen.hg/rev/043dc7488c11.

Ian.
David Vrabel
2012-Aug-31 13:08 UTC
Re: [PATCH 2/2] xen/privcmd: add PRIVCMD_MMAPBATCH_V2 ioctl
On 30/08/12 19:32, Andres Lagar-Cavilla wrote:> Second repost of my version, heavily based on David''s.Doing another allocation that may be larger than a page (and its associated additional error paths) seems to me to be a hammer to crack the (admittedly bit wonky) casting nut. That said, it''s up to Konrad which version he prefers. There are also some minor improvements you could make if you respin this patch.> Complementary to this patch, on the xen tree I intend to add > PRIVCMD_MMAPBATCH_*_ERROR into the libxc header files, and remove > XEN_DOMCTL_PFINFO_PAGEDTAB from domctl.hYes, a good idea. There''s no correspondence between the ioctl''s error reporting values and the DOMCTL_PFINFO flags.> commit 3f40e8d79b7e032527ee207a97499ddbc81ca12b > Author: Andres Lagar-Cavilla <andres@lagarcavilla.org> > Date: Thu Aug 30 12:23:33 2012 -0400 > > xen/privcmd: add PRIVCMD_MMAPBATCH_V2 ioctl > > PRIVCMD_MMAPBATCH_V2 extends PRIVCMD_MMAPBATCH with an additional > field for reporting the error code for every frame that could not be > mapped. libxc prefers PRIVCMD_MMAPBATCH_V2 over PRIVCMD_MMAPBATCH. > > Also expand PRIVCMD_MMAPBATCH to return appropriate error-encoding top nibble > in the mfn array. 
> > Signed-off-by: David Vrabel <david.vrabel@citrix.com> > Signed-off-by: Andres Lagar-Cavilla <andres@lagarcavilla.org>[...]> -static long privcmd_ioctl_mmap_batch(void __user *udata) > +static long privcmd_ioctl_mmap_batch(void __user *udata, int version) > { > int ret; > - struct privcmd_mmapbatch m; > + struct privcmd_mmapbatch_v2 m; > struct mm_struct *mm = current->mm; > struct vm_area_struct *vma; > unsigned long nr_pages; > LIST_HEAD(pagelist); > + int *err_array;int *err_array = NULL; and you could avoid the additional jump label as kfree(NULL) is safe.> struct mmap_batch_state state; > > if (!xen_initial_domain()) > return -EPERM; > > - if (copy_from_user(&m, udata, sizeof(m))) > - return -EFAULT; > + switch (version) { > + case 1: > + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch))) > + return -EFAULT; > + /* Returns per-frame error in m.arr. */ > + m.err = NULL; > + if (!access_ok(VERIFY_WRITE, m.arr, m.num * sizeof(*m.arr))) > + return -EFAULT; > + break; > + case 2: > + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch_v2))) > + return -EFAULT; > + /* Returns per-frame error code in m.err. */ > + if (!access_ok(VERIFY_WRITE, m.err, m.num * (sizeof(*m.err)))) > + return -EFAULT; > + break; > + default: > + return -EINVAL; > + } > > nr_pages = m.num; > if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) > return -EINVAL; > > - ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), > - m.arr); > + ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), m.arr); > > if (ret || list_empty(&pagelist)) > - goto out; > + goto out_no_err_array; > + > + err_array = kmalloc_array(m.num, sizeof(int), GFP_KERNEL);kcalloc() (see below).> + if (err_array == NULL) > + {Style: if (err_array == NULL) {> + if (state.global_error) { > + int efault; > + > + if (state.global_error == -ENOENT) > + ret = -ENOENT; > + > + /* Write back errors in second pass. 
*/ > + state.user_mfn = (xen_pfn_t *)m.arr; > + state.user_err = m.err; > + state.err = err_array; > + efault = traverse_pages(m.num, sizeof(xen_pfn_t), > + &pagelist, mmap_return_errors, &state); > + if (efault) > + ret = efault; > + } else if (m.err) > + ret = __clear_user(m.err, m.num * sizeof(*m.err));Since you have an array of errors already there''s no need to iterate through all the MFNs again for V2. A simple copy_to_user() is sufficient provided err_array was zeroed with kcalloc(). if (m.err) ret = __copy_to_user(m.err, err_array, m.num * sizeof(*m.err)); else { /* Write back errors in second pass. */ state.user_mfn = (xen_pfn_t *)m.arr; state.user_err = m.err; state.err = err_array; ret = traverse_pages(m.num, sizeof(xen_pfn_t), &pagelist, mmap_return_errors, &state); } if (ret == 0 && state.global_error == -ENOENT) ret = -ENOENT; David
Andres Lagar-Cavilla
2012-Aug-31 13:13 UTC
Re: [PATCH 2/2] xen/privcmd: add PRIVCMD_MMAPBATCH_V2 ioctl
On Aug 31, 2012, at 9:08 AM, David Vrabel wrote:> On 30/08/12 19:32, Andres Lagar-Cavilla wrote: >> Second repost of my version, heavily based on David''s. > > Doing another allocation that may be larger than a page (and its > associated additional error paths) seems to me to be a hammer to crack > the (admittedly bit wonky) casting nut. > > That said, it''s up to Konrad which version he prefers.Yeah absolutely.> > There are also some minor improvements you could make if you respin this > patch. > >> Complementary to this patch, on the xen tree I intend to add >> PRIVCMD_MMAPBATCH_*_ERROR into the libxc header files, and remove >> XEN_DOMCTL_PFINFO_PAGEDTAB from domctl.h > > Yes, a good idea. There''s no correspondence between the ioctl''s error > reporting values and the DOMCTL_PFINFO flags. > >> commit 3f40e8d79b7e032527ee207a97499ddbc81ca12b >> Author: Andres Lagar-Cavilla <andres@lagarcavilla.org> >> Date: Thu Aug 30 12:23:33 2012 -0400 >> >> xen/privcmd: add PRIVCMD_MMAPBATCH_V2 ioctl >> >> PRIVCMD_MMAPBATCH_V2 extends PRIVCMD_MMAPBATCH with an additional >> field for reporting the error code for every frame that could not be >> mapped. libxc prefers PRIVCMD_MMAPBATCH_V2 over PRIVCMD_MMAPBATCH. >> >> Also expand PRIVCMD_MMAPBATCH to return appropriate error-encoding top nibble >> in the mfn array. >> >> Signed-off-by: David Vrabel <david.vrabel@citrix.com> >> Signed-off-by: Andres Lagar-Cavilla <andres@lagarcavilla.org> > [...] 
>> -static long privcmd_ioctl_mmap_batch(void __user *udata) >> +static long privcmd_ioctl_mmap_batch(void __user *udata, int version) >> { >> int ret; >> - struct privcmd_mmapbatch m; >> + struct privcmd_mmapbatch_v2 m; >> struct mm_struct *mm = current->mm; >> struct vm_area_struct *vma; >> unsigned long nr_pages; >> LIST_HEAD(pagelist); >> + int *err_array; > > int *err_array = NULL; > > and you could avoid the additional jump label as kfree(NULL) is safe.Didn''t know, great.> >> struct mmap_batch_state state; >> >> if (!xen_initial_domain()) >> return -EPERM; >> >> - if (copy_from_user(&m, udata, sizeof(m))) >> - return -EFAULT; >> + switch (version) { >> + case 1: >> + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch))) >> + return -EFAULT; >> + /* Returns per-frame error in m.arr. */ >> + m.err = NULL; >> + if (!access_ok(VERIFY_WRITE, m.arr, m.num * sizeof(*m.arr))) >> + return -EFAULT; >> + break; >> + case 2: >> + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch_v2))) >> + return -EFAULT; >> + /* Returns per-frame error code in m.err. */ >> + if (!access_ok(VERIFY_WRITE, m.err, m.num * (sizeof(*m.err)))) >> + return -EFAULT; >> + break; >> + default: >> + return -EINVAL; >> + } >> >> nr_pages = m.num; >> if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) >> return -EINVAL; >> >> - ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), >> - m.arr); >> + ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), m.arr); >> >> if (ret || list_empty(&pagelist)) >> - goto out; >> + goto out_no_err_array; >> + >> + err_array = kmalloc_array(m.num, sizeof(int), GFP_KERNEL); > > kcalloc() (see below). > >> + if (err_array == NULL) >> + { > > Style: if (err_array == NULL) { > >> + if (state.global_error) { >> + int efault; >> + >> + if (state.global_error == -ENOENT) >> + ret = -ENOENT; >> + >> + /* Write back errors in second pass. 
*/ >> + state.user_mfn = (xen_pfn_t *)m.arr; >> + state.user_err = m.err; >> + state.err = err_array; >> + efault = traverse_pages(m.num, sizeof(xen_pfn_t), >> + &pagelist, mmap_return_errors, &state); >> + if (efault) >> + ret = efault; >> + } else if (m.err) >> + ret = __clear_user(m.err, m.num * sizeof(*m.err)); > > Since you have an array of errors already there''s no need to iterate > through all the MFNs again for V2. A simple copy_to_user() is > sufficient provided err_array was zeroed with kcalloc().I can kcalloc for extra paranoia, but the code will set all error slots, and is guaranteed to not jump over anything. I +1 the shortcut you outline below. Re-spin coming. Andres> > if (m.err) > ret = __copy_to_user(m.err, err_array, m.num * sizeof(*m.err)); > else { > /* Write back errors in second pass. */ > state.user_mfn = (xen_pfn_t *)m.arr; > state.user_err = m.err; > state.err = err_array; > ret = traverse_pages(m.num, sizeof(xen_pfn_t), > &pagelist, mmap_return_errors, &state); > } > > if (ret == 0 && state.global_error == -ENOENT) > ret = -ENOENT; > > David
Andres Lagar-Cavilla
2012-Aug-31 13:59 UTC
Re: [PATCH 2/2] xen/privcmd: add PRIVCMD_MMAPBATCH_V2 ioctl
Re-spin of alternative patch after David''s feedback. Thanks Andres commit ab351a5cef1797935b083c2f6e72800a8949c515 Author: Andres Lagar-Cavilla <andres@lagarcavilla.org> Date: Thu Aug 30 12:23:33 2012 -0400 xen/privcmd: add PRIVCMD_MMAPBATCH_V2 ioctl PRIVCMD_MMAPBATCH_V2 extends PRIVCMD_MMAPBATCH with an additional field for reporting the error code for every frame that could not be mapped. libxc prefers PRIVCMD_MMAPBATCH_V2 over PRIVCMD_MMAPBATCH. Also expand PRIVCMD_MMAPBATCH to return appropriate error-encoding top nibble in the mfn array. Signed-off-by: David Vrabel <david.vrabel@citrix.com> Signed-off-by: Andres Lagar-Cavilla <andres@lagarcavilla.org> diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 85226cb..5386f20 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -76,7 +76,7 @@ static void free_page_list(struct list_head *pages) */ static int gather_array(struct list_head *pagelist, unsigned nelem, size_t size, - void __user *data) + const void __user *data) { unsigned pageidx; void *pagedata; @@ -246,61 +246,117 @@ struct mmap_batch_state { domid_t domain; unsigned long va; struct vm_area_struct *vma; - int err; - - xen_pfn_t __user *user; + /* A tristate: + * 0 for no errors + * 1 if at least one error has happened (and no + * -ENOENT errors have happened) + * -ENOENT if at least 1 -ENOENT has happened. + */ + int global_error; + /* An array for individual errors */ + int *err; + + /* User-space mfn array to store errors in the second pass for V1. */ + xen_pfn_t __user *user_mfn; }; static int mmap_batch_fn(void *data, void *state) { xen_pfn_t *mfnp = data; struct mmap_batch_state *st = state; + int ret; - if (xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, - st->vma->vm_page_prot, st->domain) < 0) { - *mfnp |= 0xf0000000U; - st->err++; + ret = xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, + st->vma->vm_page_prot, st->domain); + + /* Store error code for second pass. 
*/ + *(st->err++) = ret; + + /* And see if it affects the global_error. */ + if (ret < 0) { + if (ret == -ENOENT) + st->global_error = -ENOENT; + else { + /* Record that at least one error has happened. */ + if (st->global_error == 0) + st->global_error = 1; + } } st->va += PAGE_SIZE; return 0; } -static int mmap_return_errors(void *data, void *state) +static int mmap_return_errors_v1(void *data, void *state) { xen_pfn_t *mfnp = data; struct mmap_batch_state *st = state; - - return put_user(*mfnp, st->user++); + int err = *(st->err++); + + /* + * V1 encodes the error codes in the 32bit top nibble of the + * mfn (with its known limitations vis-a-vis 64 bit callers). + */ + *mfnp |= (err == -ENOENT) ? + PRIVCMD_MMAPBATCH_PAGED_ERROR : + PRIVCMD_MMAPBATCH_MFN_ERROR; + return __put_user(*mfnp, st->user_mfn++); } static struct vm_operations_struct privcmd_vm_ops; -static long privcmd_ioctl_mmap_batch(void __user *udata) +static long privcmd_ioctl_mmap_batch(void __user *udata, int version) { int ret; - struct privcmd_mmapbatch m; + struct privcmd_mmapbatch_v2 m; struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long nr_pages; LIST_HEAD(pagelist); + int *err_array = NULL; struct mmap_batch_state state; if (!xen_initial_domain()) return -EPERM; - if (copy_from_user(&m, udata, sizeof(m))) - return -EFAULT; + switch (version) { + case 1: + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch))) + return -EFAULT; + /* Returns per-frame error in m.arr. */ + m.err = NULL; + if (!access_ok(VERIFY_WRITE, m.arr, m.num * sizeof(*m.arr))) + return -EFAULT; + break; + case 2: + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch_v2))) + return -EFAULT; + /* Returns per-frame error code in m.err. 
*/ + if (!access_ok(VERIFY_WRITE, m.err, m.num * (sizeof(*m.err)))) + return -EFAULT; + break; + default: + return -EINVAL; + } nr_pages = m.num; if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) return -EINVAL; - ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), - m.arr); + ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), m.arr); + + if (ret) + goto out; + if (list_empty(&pagelist)) { + ret = -EINVAL; + goto out; + } - if (ret || list_empty(&pagelist)) + err_array = kcalloc(m.num, sizeof(int), GFP_KERNEL); + if (err_array == NULL) { + ret = -ENOMEM; goto out; + } down_write(&mm->mmap_sem); @@ -315,24 +371,34 @@ static long privcmd_ioctl_mmap_batch(void __user *udata) goto out; } - state.domain = m.dom; - state.vma = vma; - state.va = m.addr; - state.err = 0; + state.domain = m.dom; + state.vma = vma; + state.va = m.addr; + state.global_error = 0; + state.err = err_array; - ret = traverse_pages(m.num, sizeof(xen_pfn_t), - &pagelist, mmap_batch_fn, &state); + /* mmap_batch_fn guarantees ret == 0 */ + BUG_ON(traverse_pages(m.num, sizeof(xen_pfn_t), + &pagelist, mmap_batch_fn, &state)); up_write(&mm->mmap_sem); - if (state.err > 0) { - state.user = m.arr; + if (state.global_error && (version == 1)) { + /* Write back errors in second pass. */ + state.user_mfn = (xen_pfn_t *)m.arr; + state.err = err_array; ret = traverse_pages(m.num, sizeof(xen_pfn_t), - &pagelist, - mmap_return_errors, &state); - } + &pagelist, mmap_return_errors_v1, &state); + } else + ret = __copy_to_user(m.err, err_array, m.num * sizeof(int)); + + /* If we have not had any EFAULT-like global errors then set the global + * error to -ENOENT if necessary. 
*/ + if ((ret == 0) && (state.global_error == -ENOENT)) + ret = -ENOENT; out: + kfree(err_array); free_page_list(&pagelist); return ret; @@ -354,7 +420,11 @@ static long privcmd_ioctl(struct file *file, break; case IOCTL_PRIVCMD_MMAPBATCH: - ret = privcmd_ioctl_mmap_batch(udata); + ret = privcmd_ioctl_mmap_batch(udata, 1); + break; + + case IOCTL_PRIVCMD_MMAPBATCH_V2: + ret = privcmd_ioctl_mmap_batch(udata, 2); break; default: diff --git a/include/xen/privcmd.h b/include/xen/privcmd.h index 45c1aa1..a853168 100644 --- a/include/xen/privcmd.h +++ b/include/xen/privcmd.h @@ -58,13 +58,33 @@ struct privcmd_mmapbatch { int num; /* number of pages to populate */ domid_t dom; /* target domain */ __u64 addr; /* virtual address */ - xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */ + xen_pfn_t __user *arr; /* array of mfns - or''d with + PRIVCMD_MMAPBATCH_*_ERROR on err */ +}; + +#define PRIVCMD_MMAPBATCH_MFN_ERROR 0xf0000000U +#define PRIVCMD_MMAPBATCH_PAGED_ERROR 0x80000000U + +struct privcmd_mmapbatch_v2 { + unsigned int num; /* number of pages to populate */ + domid_t dom; /* target domain */ + __u64 addr; /* virtual address */ + const xen_pfn_t __user *arr; /* array of mfns */ + int __user *err; /* array of error codes */ }; /* * @cmd: IOCTL_PRIVCMD_HYPERCALL * @arg: &privcmd_hypercall_t * Return: Value returned from execution of the specified hypercall. + * + * @cmd: IOCTL_PRIVCMD_MMAPBATCH_V2 + * @arg: &struct privcmd_mmapbatch_v2 + * Return: 0 on success (i.e., arg->err contains valid error codes for + * each frame). On an error other than a failed frame remap, -1 is + * returned and errno is set to EINVAL, EFAULT etc. As an exception, + * if the operation was otherwise successful but any frame failed with + * -ENOENT, then -1 is returned and errno is set to ENOENT. 
*/ #define IOCTL_PRIVCMD_HYPERCALL \ _IOC(_IOC_NONE, ''P'', 0, sizeof(struct privcmd_hypercall)) @@ -72,5 +92,7 @@ struct privcmd_mmapbatch { _IOC(_IOC_NONE, ''P'', 2, sizeof(struct privcmd_mmap)) #define IOCTL_PRIVCMD_MMAPBATCH \ _IOC(_IOC_NONE, ''P'', 3, sizeof(struct privcmd_mmapbatch)) +#define IOCTL_PRIVCMD_MMAPBATCH_V2 \ + _IOC(_IOC_NONE, ''P'', 4, sizeof(struct privcmd_mmapbatch_v2)) #endif /* __LINUX_PUBLIC_PRIVCMD_H__ */ On Aug 30, 2012, at 8:58 AM, David Vrabel wrote:> From: David Vrabel <david.vrabel@citrix.com> > > PRIVCMD_MMAPBATCH_V2 extends PRIVCMD_MMAPBATCH with an additional > field for reporting the error code for every frame that could not be > mapped. libxc prefers PRIVCMD_MMAPBATCH_V2 over PRIVCMD_MMAPBATCH. > > Signed-off-by: David Vrabel <david.vrabel@citrix.com> > --- > drivers/xen/privcmd.c | 99 +++++++++++++++++++++++++++++++++++++++--------- > include/xen/privcmd.h | 23 +++++++++++- > 2 files changed, 102 insertions(+), 20 deletions(-) > > diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c > index ccee0f1..c0e89e7 100644 > --- a/drivers/xen/privcmd.c > +++ b/drivers/xen/privcmd.c > @@ -76,7 +76,7 @@ static void free_page_list(struct list_head *pages) > */ > static int gather_array(struct list_head *pagelist, > unsigned nelem, size_t size, > - void __user *data) > + const void __user *data) > { > unsigned pageidx; > void *pagedata; > @@ -248,18 +248,37 @@ struct mmap_batch_state { > struct vm_area_struct *vma; > int err; > > - xen_pfn_t __user *user; > + xen_pfn_t __user *user_mfn; > + int __user *user_err; > }; > > static int mmap_batch_fn(void *data, void *state) > { > xen_pfn_t *mfnp = data; > struct mmap_batch_state *st = state; > + int ret; > > - if (xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, > - st->vma->vm_page_prot, st->domain) < 0) { > - *mfnp |= 0xf0000000U; > - st->err++; > + ret = xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, > + st->vma->vm_page_prot, st->domain); > + if (ret 
< 0) { > + /* > + * Error reporting is a mess but userspace relies on > + * it behaving this way. > + * > + * V2 needs to a) return the result of each frame''s > + * remap; and b) return -ENOENT if any frame failed > + * with -ENOENT. > + * > + * In this first pass the error code is saved by > + * overwriting the mfn and an error is indicated in > + * st->err. > + * > + * The second pass by mmap_return_errors() will write > + * the error codes to user space and get the right > + * ioctl return value. > + */ > + *(int *)mfnp = ret; > + st->err = ret; > } > st->va += PAGE_SIZE; > > @@ -270,16 +289,33 @@ static int mmap_return_errors(void *data, void *state) > { > xen_pfn_t *mfnp = data; > struct mmap_batch_state *st = state; > + int ret; > + > + if (st->user_err) { > + int err = *(int *)mfnp; > + > + if (err == -ENOENT) > + st->err = err; > > - return put_user(*mfnp, st->user++); > + return __put_user(err, st->user_err++); > + } else { > + xen_pfn_t mfn; > + > + ret = __get_user(mfn, st->user_mfn); > + if (ret < 0) > + return ret; > + > + mfn |= PRIVCMD_MMAPBATCH_MFN_ERROR; > + return __put_user(mfn, st->user_mfn++); > + } > } > > static struct vm_operations_struct privcmd_vm_ops; > > -static long privcmd_ioctl_mmap_batch(void __user *udata) > +static long privcmd_ioctl_mmap_batch(void __user *udata, int version) > { > int ret; > - struct privcmd_mmapbatch m; > + struct privcmd_mmapbatch_v2 m; > struct mm_struct *mm = current->mm; > struct vm_area_struct *vma; > unsigned long nr_pages; > @@ -289,15 +325,31 @@ static long privcmd_ioctl_mmap_batch(void __user *udata) > if (!xen_initial_domain()) > return -EPERM; > > - if (copy_from_user(&m, udata, sizeof(m))) > - return -EFAULT; > + switch (version) { > + case 1: > + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch))) > + return -EFAULT; > + /* Returns per-frame error in m.arr. 
*/ > + m.err = NULL; > + if (!access_ok(VERIFY_WRITE, m.arr, m.num * sizeof(*m.arr))) > + return -EFAULT; > + break; > + case 2: > + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch_v2))) > + return -EFAULT; > + /* Returns per-frame error code in m.err. */ > + if (!access_ok(VERIFY_WRITE, m.err, m.num * (sizeof(*m.err)))) > + return -EFAULT; > + break; > + default: > + return -EINVAL; > + } > > nr_pages = m.num; > if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) > return -EINVAL; > > - ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), > - m.arr); > + ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), m.arr); > > if (ret || list_empty(&pagelist)) > goto out; > @@ -325,12 +377,17 @@ static long privcmd_ioctl_mmap_batch(void __user *udata) > > up_write(&mm->mmap_sem); > > - if (state.err > 0) { > - state.user = m.arr; > + if (state.err) { > + state.err = 0; > + state.user_mfn = (xen_pfn_t *)m.arr; > + state.user_err = m.err; > ret = traverse_pages(m.num, sizeof(xen_pfn_t), > - &pagelist, > - mmap_return_errors, &state); > - } > + &pagelist, > + mmap_return_errors, &state); > + if (ret >= 0) > + ret = state.err; > + } else if (m.err) > + __clear_user(m.err, m.num * sizeof(*m.err)); > > out: > free_page_list(&pagelist); > @@ -354,7 +411,11 @@ static long privcmd_ioctl(struct file *file, > break; > > case IOCTL_PRIVCMD_MMAPBATCH: > - ret = privcmd_ioctl_mmap_batch(udata); > + ret = privcmd_ioctl_mmap_batch(udata, 1); > + break; > + > + case IOCTL_PRIVCMD_MMAPBATCH_V2: > + ret = privcmd_ioctl_mmap_batch(udata, 2); > break; > > default: > diff --git a/include/xen/privcmd.h b/include/xen/privcmd.h > index 17857fb..f60d75c 100644 > --- a/include/xen/privcmd.h > +++ b/include/xen/privcmd.h > @@ -59,13 +59,32 @@ struct privcmd_mmapbatch { > int num; /* number of pages to populate */ > domid_t dom; /* target domain */ > __u64 addr; /* virtual address */ > - xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */ > + xen_pfn_t __user 
*arr; /* array of mfns - or''d with > + PRIVCMD_MMAPBATCH_MFN_ERROR on err */ > +}; > + > +#define PRIVCMD_MMAPBATCH_MFN_ERROR 0xf0000000U > + > +struct privcmd_mmapbatch_v2 { > + unsigned int num; /* number of pages to populate */ > + domid_t dom; /* target domain */ > + __u64 addr; /* virtual address */ > + const xen_pfn_t __user *arr; /* array of mfns */ > + int __user *err; /* array of error codes */ > }; > > /* > * @cmd: IOCTL_PRIVCMD_HYPERCALL > * @arg: &privcmd_hypercall_t > * Return: Value returned from execution of the specified hypercall. > + * > + * @cmd: IOCTL_PRIVCMD_MMAPBATCH_V2 > + * @arg: &struct privcmd_mmapbatch_v2 > + * Return: 0 on success (i.e., arg->err contains valid error codes for > + * each frame). On an error other than a failed frame remap, -1 is > + * returned and errno is set to EINVAL, EFAULT etc. As an exception, > + * if the operation was otherwise successful but any frame failed with > + * -ENOENT, then -1 is returned and errno is set to ENOENT. > */ > #define IOCTL_PRIVCMD_HYPERCALL \ > _IOC(_IOC_NONE, ''P'', 0, sizeof(struct privcmd_hypercall)) > @@ -73,5 +92,7 @@ struct privcmd_mmapbatch { > _IOC(_IOC_NONE, ''P'', 2, sizeof(struct privcmd_mmap)) > #define IOCTL_PRIVCMD_MMAPBATCH \ > _IOC(_IOC_NONE, ''P'', 3, sizeof(struct privcmd_mmapbatch)) > +#define IOCTL_PRIVCMD_MMAPBATCH_V2 \ > + _IOC(_IOC_NONE, ''P'', 4, sizeof(struct privcmd_mmapbatch_v2)) > > #endif /* __LINUX_PUBLIC_PRIVCMD_H__ */ > -- > 1.7.2.5 >
Konrad Rzeszutek Wilk
2012-Sep-05 16:17 UTC
Re: [PATCH 2/2] xen/privcmd: add PRIVCMD_MMAPBATCH_V2 ioctl
On Fri, Aug 31, 2012 at 09:13:44AM -0400, Andres Lagar-Cavilla wrote:> > On Aug 31, 2012, at 9:08 AM, David Vrabel wrote: > > > On 30/08/12 19:32, Andres Lagar-Cavilla wrote: > >> Second repost of my version, heavily based on David''s. > > > > Doing another allocation that may be larger than a page (and its > > associated additional error paths) seems to me to be a hammer to crack > > the (admittedly bit wonky) casting nut. > > > > That said, it''s up to Konrad which version he prefers. > > Yeah absolutely.This one (with the comments) looks nicer.> > > > > There are also some minor improvements you could make if you respin this > > patch. > > > >> Complementary to this patch, on the xen tree I intend to add > >> PRIVCMD_MMAPBATCH_*_ERROR into the libxc header files, and remove > >> XEN_DOMCTL_PFINFO_PAGEDTAB from domctl.h > > > > Yes, a good idea. There''s no correspondence between the ioctl''s error > > reporting values and the DOMCTL_PFINFO flags. > > > >> commit 3f40e8d79b7e032527ee207a97499ddbc81ca12b > >> Author: Andres Lagar-Cavilla <andres@lagarcavilla.org> > >> Date: Thu Aug 30 12:23:33 2012 -0400 > >> > >> xen/privcmd: add PRIVCMD_MMAPBATCH_V2 ioctl > >> > >> PRIVCMD_MMAPBATCH_V2 extends PRIVCMD_MMAPBATCH with an additional > >> field for reporting the error code for every frame that could not be > >> mapped. libxc prefers PRIVCMD_MMAPBATCH_V2 over PRIVCMD_MMAPBATCH. > >> > >> Also expand PRIVCMD_MMAPBATCH to return appropriate error-encoding top nibble > >> in the mfn array. > >> > >> Signed-off-by: David Vrabel <david.vrabel@citrix.com> > >> Signed-off-by: Andres Lagar-Cavilla <andres@lagarcavilla.org> > > [...] 
> >> -static long privcmd_ioctl_mmap_batch(void __user *udata) > >> +static long privcmd_ioctl_mmap_batch(void __user *udata, int version) > >> { > >> int ret; > >> - struct privcmd_mmapbatch m; > >> + struct privcmd_mmapbatch_v2 m; > >> struct mm_struct *mm = current->mm; > >> struct vm_area_struct *vma; > >> unsigned long nr_pages; > >> LIST_HEAD(pagelist); > >> + int *err_array; > > > > int *err_array = NULL; > > > > and you could avoid the additional jump label as kfree(NULL) is safe. > > Didn''t know, great. > > > > >> struct mmap_batch_state state; > >> > >> if (!xen_initial_domain()) > >> return -EPERM; > >> > >> - if (copy_from_user(&m, udata, sizeof(m))) > >> - return -EFAULT; > >> + switch (version) { > >> + case 1: > >> + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch))) > >> + return -EFAULT; > >> + /* Returns per-frame error in m.arr. */ > >> + m.err = NULL; > >> + if (!access_ok(VERIFY_WRITE, m.arr, m.num * sizeof(*m.arr))) > >> + return -EFAULT; > >> + break; > >> + case 2: > >> + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch_v2))) > >> + return -EFAULT; > >> + /* Returns per-frame error code in m.err. */ > >> + if (!access_ok(VERIFY_WRITE, m.err, m.num * (sizeof(*m.err)))) > >> + return -EFAULT; > >> + break; > >> + default: > >> + return -EINVAL; > >> + } > >> > >> nr_pages = m.num; > >> if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) > >> return -EINVAL; > >> > >> - ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), > >> - m.arr); > >> + ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), m.arr); > >> > >> if (ret || list_empty(&pagelist)) > >> - goto out; > >> + goto out_no_err_array; > >> + > >> + err_array = kmalloc_array(m.num, sizeof(int), GFP_KERNEL); > > > > kcalloc() (see below). 
> > > >> + if (err_array == NULL) > >> + { > > > > Style: if (err_array == NULL) { > > > >> + if (state.global_error) { > >> + int efault; > >> + > >> + if (state.global_error == -ENOENT) > >> + ret = -ENOENT; > >> + > >> + /* Write back errors in second pass. */ > >> + state.user_mfn = (xen_pfn_t *)m.arr; > >> + state.user_err = m.err; > >> + state.err = err_array; > >> + efault = traverse_pages(m.num, sizeof(xen_pfn_t), > >> + &pagelist, mmap_return_errors, &state); > >> + if (efault) > >> + ret = efault; > >> + } else if (m.err) > >> + ret = __clear_user(m.err, m.num * sizeof(*m.err)); > > > > Since you have an array of errors already there''s no need to iterate > > through all the MFNs again for V2. A simple copy_to_user() is > > sufficient provided err_array was zeroed with kcalloc(). > I can kcalloc for extra paranoia, but the code will set all error slots, and is guaranteed to not jump over anything. I +1 the shortcut you outline below. Re-spin coming. > Andres > > > > if (m.err) > > ret = __copy_to_user(m.err, err_array, m.num * sizeof(*m.err)); > > else { > > /* Write back errors in second pass. */ > > state.user_mfn = (xen_pfn_t *)m.arr; > > state.user_err = m.err; > > state.err = err_array; > > ret = traverse_pages(m.num, sizeof(xen_pfn_t), > > &pagelist, mmap_return_errors, &state); > > } > > > > if (ret == 0 && state.global_error == -ENOENT) > > ret = -ENOENT; > > > > David
Konrad Rzeszutek Wilk
2012-Sep-05 16:21 UTC
Re: [PATCH 2/2] xen/privcmd: add PRIVCMD_MMAPBATCH_V2 ioctl
On Fri, Aug 31, 2012 at 09:59:30AM -0400, Andres Lagar-Cavilla wrote:> Re-spin of alternative patch after David''s feedback. > Thanks > Andresapplied. fixed some whitespace issues.> > commit ab351a5cef1797935b083c2f6e72800a8949c515 > Author: Andres Lagar-Cavilla <andres@lagarcavilla.org> > Date: Thu Aug 30 12:23:33 2012 -0400 > > xen/privcmd: add PRIVCMD_MMAPBATCH_V2 ioctl > > PRIVCMD_MMAPBATCH_V2 extends PRIVCMD_MMAPBATCH with an additional > field for reporting the error code for every frame that could not be > mapped. libxc prefers PRIVCMD_MMAPBATCH_V2 over PRIVCMD_MMAPBATCH. > > Also expand PRIVCMD_MMAPBATCH to return appropriate error-encoding top nibble > in the mfn array. > > Signed-off-by: David Vrabel <david.vrabel@citrix.com> > Signed-off-by: Andres Lagar-Cavilla <andres@lagarcavilla.org> > > diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c > index 85226cb..5386f20 100644 > --- a/drivers/xen/privcmd.c > +++ b/drivers/xen/privcmd.c > @@ -76,7 +76,7 @@ static void free_page_list(struct list_head *pages) > */ > static int gather_array(struct list_head *pagelist, > unsigned nelem, size_t size, > - void __user *data) > + const void __user *data) > { > unsigned pageidx; > void *pagedata; > @@ -246,61 +246,117 @@ struct mmap_batch_state { > domid_t domain; > unsigned long va; > struct vm_area_struct *vma; > - int err; > - > - xen_pfn_t __user *user; > + /* A tristate: > + * 0 for no errors > + * 1 if at least one error has happened (and no > + * -ENOENT errors have happened) > + * -ENOENT if at least 1 -ENOENT has happened. > + */ > + int global_error; > + /* An array for individual errors */ > + int *err; > + > + /* User-space mfn array to store errors in the second pass for V1. 
*/ > + xen_pfn_t __user *user_mfn; > }; > > static int mmap_batch_fn(void *data, void *state) > { > xen_pfn_t *mfnp = data; > struct mmap_batch_state *st = state; > + int ret; > > - if (xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, > - st->vma->vm_page_prot, st->domain) < 0) { > - *mfnp |= 0xf0000000U; > - st->err++; > + ret = xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, > + st->vma->vm_page_prot, st->domain); > + > + /* Store error code for second pass. */ > + *(st->err++) = ret; > + > + /* And see if it affects the global_error. */ > + if (ret < 0) { > + if (ret == -ENOENT) > + st->global_error = -ENOENT; > + else { > + /* Record that at least one error has happened. */ > + if (st->global_error == 0) > + st->global_error = 1; > + } > } > st->va += PAGE_SIZE; > > return 0; > } > > -static int mmap_return_errors(void *data, void *state) > +static int mmap_return_errors_v1(void *data, void *state) > { > xen_pfn_t *mfnp = data; > struct mmap_batch_state *st = state; > - > - return put_user(*mfnp, st->user++); > + int err = *(st->err++); > + > + /* > + * V1 encodes the error codes in the 32bit top nibble of the > + * mfn (with its known limitations vis-a-vis 64 bit callers). > + */ > + *mfnp |= (err == -ENOENT) ? 
> + PRIVCMD_MMAPBATCH_PAGED_ERROR : > + PRIVCMD_MMAPBATCH_MFN_ERROR; > + return __put_user(*mfnp, st->user_mfn++); > } > > static struct vm_operations_struct privcmd_vm_ops; > > -static long privcmd_ioctl_mmap_batch(void __user *udata) > +static long privcmd_ioctl_mmap_batch(void __user *udata, int version) > { > int ret; > - struct privcmd_mmapbatch m; > + struct privcmd_mmapbatch_v2 m; > struct mm_struct *mm = current->mm; > struct vm_area_struct *vma; > unsigned long nr_pages; > LIST_HEAD(pagelist); > + int *err_array = NULL; > struct mmap_batch_state state; > > if (!xen_initial_domain()) > return -EPERM; > > - if (copy_from_user(&m, udata, sizeof(m))) > - return -EFAULT; > + switch (version) { > + case 1: > + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch))) > + return -EFAULT; > + /* Returns per-frame error in m.arr. */ > + m.err = NULL; > + if (!access_ok(VERIFY_WRITE, m.arr, m.num * sizeof(*m.arr))) > + return -EFAULT; > + break; > + case 2: > + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch_v2))) > + return -EFAULT; > + /* Returns per-frame error code in m.err. 
*/ > + if (!access_ok(VERIFY_WRITE, m.err, m.num * (sizeof(*m.err)))) > + return -EFAULT; > + break; > + default: > + return -EINVAL; > + } > > nr_pages = m.num; > if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) > return -EINVAL; > > - ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), > - m.arr); > + ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), m.arr); > + > + if (ret) > + goto out; > + if (list_empty(&pagelist)) { > + ret = -EINVAL; > + goto out; > + } > > - if (ret || list_empty(&pagelist)) > + err_array = kcalloc(m.num, sizeof(int), GFP_KERNEL); > + if (err_array == NULL) { > + ret = -ENOMEM; > goto out; > + } > > down_write(&mm->mmap_sem); > > @@ -315,24 +371,34 @@ static long privcmd_ioctl_mmap_batch(void __user *udata) > goto out; > } > > - state.domain = m.dom; > - state.vma = vma; > - state.va = m.addr; > - state.err = 0; > + state.domain = m.dom; > + state.vma = vma; > + state.va = m.addr; > + state.global_error = 0; > + state.err = err_array; > > - ret = traverse_pages(m.num, sizeof(xen_pfn_t), > - &pagelist, mmap_batch_fn, &state); > + /* mmap_batch_fn guarantees ret == 0 */ > + BUG_ON(traverse_pages(m.num, sizeof(xen_pfn_t), > + &pagelist, mmap_batch_fn, &state)); > > up_write(&mm->mmap_sem); > > - if (state.err > 0) { > - state.user = m.arr; > + if (state.global_error && (version == 1)) { > + /* Write back errors in second pass. */ > + state.user_mfn = (xen_pfn_t *)m.arr; > + state.err = err_array; > ret = traverse_pages(m.num, sizeof(xen_pfn_t), > - &pagelist, > - mmap_return_errors, &state); > - } > + &pagelist, mmap_return_errors_v1, &state); > + } else > + ret = __copy_to_user(m.err, err_array, m.num * sizeof(int)); > + > + /* If we have not had any EFAULT-like global errors then set the global > + * error to -ENOENT if necessary. 
*/ > + if ((ret == 0) && (state.global_error == -ENOENT)) > + ret = -ENOENT; > > out: > + kfree(err_array); > free_page_list(&pagelist); > > return ret; > @@ -354,7 +420,11 @@ static long privcmd_ioctl(struct file *file, > break; > > case IOCTL_PRIVCMD_MMAPBATCH: > - ret = privcmd_ioctl_mmap_batch(udata); > + ret = privcmd_ioctl_mmap_batch(udata, 1); > + break; > + > + case IOCTL_PRIVCMD_MMAPBATCH_V2: > + ret = privcmd_ioctl_mmap_batch(udata, 2); > break; > > default: > diff --git a/include/xen/privcmd.h b/include/xen/privcmd.h > index 45c1aa1..a853168 100644 > --- a/include/xen/privcmd.h > +++ b/include/xen/privcmd.h > @@ -58,13 +58,33 @@ struct privcmd_mmapbatch { > int num; /* number of pages to populate */ > domid_t dom; /* target domain */ > __u64 addr; /* virtual address */ > - xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */ > + xen_pfn_t __user *arr; /* array of mfns - or''d with > + PRIVCMD_MMAPBATCH_*_ERROR on err */ > +}; > + > +#define PRIVCMD_MMAPBATCH_MFN_ERROR 0xf0000000U > +#define PRIVCMD_MMAPBATCH_PAGED_ERROR 0x80000000U > + > +struct privcmd_mmapbatch_v2 { > + unsigned int num; /* number of pages to populate */ > + domid_t dom; /* target domain */ > + __u64 addr; /* virtual address */ > + const xen_pfn_t __user *arr; /* array of mfns */ > + int __user *err; /* array of error codes */ > }; > > /* > * @cmd: IOCTL_PRIVCMD_HYPERCALL > * @arg: &privcmd_hypercall_t > * Return: Value returned from execution of the specified hypercall. > + * > + * @cmd: IOCTL_PRIVCMD_MMAPBATCH_V2 > + * @arg: &struct privcmd_mmapbatch_v2 > + * Return: 0 on success (i.e., arg->err contains valid error codes for > + * each frame). On an error other than a failed frame remap, -1 is > + * returned and errno is set to EINVAL, EFAULT etc. As an exception, > + * if the operation was otherwise successful but any frame failed with > + * -ENOENT, then -1 is returned and errno is set to ENOENT. 
> */ > #define IOCTL_PRIVCMD_HYPERCALL \ > _IOC(_IOC_NONE, ''P'', 0, sizeof(struct privcmd_hypercall)) > @@ -72,5 +92,7 @@ struct privcmd_mmapbatch { > _IOC(_IOC_NONE, ''P'', 2, sizeof(struct privcmd_mmap)) > #define IOCTL_PRIVCMD_MMAPBATCH \ > _IOC(_IOC_NONE, ''P'', 3, sizeof(struct privcmd_mmapbatch)) > +#define IOCTL_PRIVCMD_MMAPBATCH_V2 \ > + _IOC(_IOC_NONE, ''P'', 4, sizeof(struct privcmd_mmapbatch_v2)) > > #endif /* __LINUX_PUBLIC_PRIVCMD_H__ */ > > On Aug 30, 2012, at 8:58 AM, David Vrabel wrote: > > > From: David Vrabel <david.vrabel@citrix.com> > > > > PRIVCMD_MMAPBATCH_V2 extends PRIVCMD_MMAPBATCH with an additional > > field for reporting the error code for every frame that could not be > > mapped. libxc prefers PRIVCMD_MMAPBATCH_V2 over PRIVCMD_MMAPBATCH. > > > > Signed-off-by: David Vrabel <david.vrabel@citrix.com> > > --- > > drivers/xen/privcmd.c | 99 +++++++++++++++++++++++++++++++++++++++--------- > > include/xen/privcmd.h | 23 +++++++++++- > > 2 files changed, 102 insertions(+), 20 deletions(-) > > > > diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c > > index ccee0f1..c0e89e7 100644 > > --- a/drivers/xen/privcmd.c > > +++ b/drivers/xen/privcmd.c > > @@ -76,7 +76,7 @@ static void free_page_list(struct list_head *pages) > > */ > > static int gather_array(struct list_head *pagelist, > > unsigned nelem, size_t size, > > - void __user *data) > > + const void __user *data) > > { > > unsigned pageidx; > > void *pagedata; > > @@ -248,18 +248,37 @@ struct mmap_batch_state { > > struct vm_area_struct *vma; > > int err; > > > > - xen_pfn_t __user *user; > > + xen_pfn_t __user *user_mfn; > > + int __user *user_err; > > }; > > > > static int mmap_batch_fn(void *data, void *state) > > { > > xen_pfn_t *mfnp = data; > > struct mmap_batch_state *st = state; > > + int ret; > > > > - if (xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, > > - st->vma->vm_page_prot, st->domain) < 0) { > > - *mfnp |= 0xf0000000U; > > - st->err++; > > + ret = 
xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, > > + st->vma->vm_page_prot, st->domain); > > + if (ret < 0) { > > + /* > > + * Error reporting is a mess but userspace relies on > > + * it behaving this way. > > + * > > + * V2 needs to a) return the result of each frame''s > > + * remap; and b) return -ENOENT if any frame failed > > + * with -ENOENT. > > + * > > + * In this first pass the error code is saved by > > + * overwriting the mfn and an error is indicated in > > + * st->err. > > + * > > + * The second pass by mmap_return_errors() will write > > + * the error codes to user space and get the right > > + * ioctl return value. > > + */ > > + *(int *)mfnp = ret; > > + st->err = ret; > > } > > st->va += PAGE_SIZE; > > > > @@ -270,16 +289,33 @@ static int mmap_return_errors(void *data, void *state) > > { > > xen_pfn_t *mfnp = data; > > struct mmap_batch_state *st = state; > > + int ret; > > + > > + if (st->user_err) { > > + int err = *(int *)mfnp; > > + > > + if (err == -ENOENT) > > + st->err = err; > > > > - return put_user(*mfnp, st->user++); > > + return __put_user(err, st->user_err++); > > + } else { > > + xen_pfn_t mfn; > > + > > + ret = __get_user(mfn, st->user_mfn); > > + if (ret < 0) > > + return ret; > > + > > + mfn |= PRIVCMD_MMAPBATCH_MFN_ERROR; > > + return __put_user(mfn, st->user_mfn++); > > + } > > } > > > > static struct vm_operations_struct privcmd_vm_ops; > > > > -static long privcmd_ioctl_mmap_batch(void __user *udata) > > +static long privcmd_ioctl_mmap_batch(void __user *udata, int version) > > { > > int ret; > > - struct privcmd_mmapbatch m; > > + struct privcmd_mmapbatch_v2 m; > > struct mm_struct *mm = current->mm; > > struct vm_area_struct *vma; > > unsigned long nr_pages; > > @@ -289,15 +325,31 @@ static long privcmd_ioctl_mmap_batch(void __user *udata) > > if (!xen_initial_domain()) > > return -EPERM; > > > > - if (copy_from_user(&m, udata, sizeof(m))) > > - return -EFAULT; > > + switch (version) { > > + case 1: > > + 
if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch))) > > + return -EFAULT; > > + /* Returns per-frame error in m.arr. */ > > + m.err = NULL; > > + if (!access_ok(VERIFY_WRITE, m.arr, m.num * sizeof(*m.arr))) > > + return -EFAULT; > > + break; > > + case 2: > > + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch_v2))) > > + return -EFAULT; > > + /* Returns per-frame error code in m.err. */ > > + if (!access_ok(VERIFY_WRITE, m.err, m.num * (sizeof(*m.err)))) > > + return -EFAULT; > > + break; > > + default: > > + return -EINVAL; > > + } > > > > nr_pages = m.num; > > if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) > > return -EINVAL; > > > > - ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), > > - m.arr); > > + ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), m.arr); > > > > if (ret || list_empty(&pagelist)) > > goto out; > > @@ -325,12 +377,17 @@ static long privcmd_ioctl_mmap_batch(void __user *udata) > > > > up_write(&mm->mmap_sem); > > > > - if (state.err > 0) { > > - state.user = m.arr; > > + if (state.err) { > > + state.err = 0; > > + state.user_mfn = (xen_pfn_t *)m.arr; > > + state.user_err = m.err; > > ret = traverse_pages(m.num, sizeof(xen_pfn_t), > > - &pagelist, > > - mmap_return_errors, &state); > > - } > > + &pagelist, > > + mmap_return_errors, &state); > > + if (ret >= 0) > > + ret = state.err; > > + } else if (m.err) > > + __clear_user(m.err, m.num * sizeof(*m.err)); > > > > out: > > free_page_list(&pagelist); > > @@ -354,7 +411,11 @@ static long privcmd_ioctl(struct file *file, > > break; > > > > case IOCTL_PRIVCMD_MMAPBATCH: > > - ret = privcmd_ioctl_mmap_batch(udata); > > + ret = privcmd_ioctl_mmap_batch(udata, 1); > > + break; > > + > > + case IOCTL_PRIVCMD_MMAPBATCH_V2: > > + ret = privcmd_ioctl_mmap_batch(udata, 2); > > break; > > > > default: > > diff --git a/include/xen/privcmd.h b/include/xen/privcmd.h > > index 17857fb..f60d75c 100644 > > --- a/include/xen/privcmd.h > > +++ 
b/include/xen/privcmd.h > > @@ -59,13 +59,32 @@ struct privcmd_mmapbatch { > > int num; /* number of pages to populate */ > > domid_t dom; /* target domain */ > > __u64 addr; /* virtual address */ > > - xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */ > > + xen_pfn_t __user *arr; /* array of mfns - or''d with > > + PRIVCMD_MMAPBATCH_MFN_ERROR on err */ > > +}; > > + > > +#define PRIVCMD_MMAPBATCH_MFN_ERROR 0xf0000000U > > + > > +struct privcmd_mmapbatch_v2 { > > + unsigned int num; /* number of pages to populate */ > > + domid_t dom; /* target domain */ > > + __u64 addr; /* virtual address */ > > + const xen_pfn_t __user *arr; /* array of mfns */ > > + int __user *err; /* array of error codes */ > > }; > > > > /* > > * @cmd: IOCTL_PRIVCMD_HYPERCALL > > * @arg: &privcmd_hypercall_t > > * Return: Value returned from execution of the specified hypercall. > > + * > > + * @cmd: IOCTL_PRIVCMD_MMAPBATCH_V2 > > + * @arg: &struct privcmd_mmapbatch_v2 > > + * Return: 0 on success (i.e., arg->err contains valid error codes for > > + * each frame). On an error other than a failed frame remap, -1 is > > + * returned and errno is set to EINVAL, EFAULT etc. As an exception, > > + * if the operation was otherwise successful but any frame failed with > > + * -ENOENT, then -1 is returned and errno is set to ENOENT. > > */ > > #define IOCTL_PRIVCMD_HYPERCALL \ > > _IOC(_IOC_NONE, ''P'', 0, sizeof(struct privcmd_hypercall)) > > @@ -73,5 +92,7 @@ struct privcmd_mmapbatch { > > _IOC(_IOC_NONE, ''P'', 2, sizeof(struct privcmd_mmap)) > > #define IOCTL_PRIVCMD_MMAPBATCH \ > > _IOC(_IOC_NONE, ''P'', 3, sizeof(struct privcmd_mmapbatch)) > > +#define IOCTL_PRIVCMD_MMAPBATCH_V2 \ > > + _IOC(_IOC_NONE, ''P'', 4, sizeof(struct privcmd_mmapbatch_v2)) > > > > #endif /* __LINUX_PUBLIC_PRIVCMD_H__ */ > > -- > > 1.7.2.5 > >
Andres Lagar-Cavilla
2012-Sep-05 17:09 UTC
Re: [PATCH 2/2] xen/privcmd: add PRIVCMD_MMAPBATCH_V2 ioctl
Super. To which branch are you applying these now? (n00b question but have to ask) Andres On Sep 5, 2012, at 12:21 PM, Konrad Rzeszutek Wilk wrote:> On Fri, Aug 31, 2012 at 09:59:30AM -0400, Andres Lagar-Cavilla wrote: >> Re-spin of alternative patch after David''s feedback. >> Thanks >> Andres > > applied. fixed some whitespace issues. >> >> commit ab351a5cef1797935b083c2f6e72800a8949c515 >> Author: Andres Lagar-Cavilla <andres@lagarcavilla.org> >> Date: Thu Aug 30 12:23:33 2012 -0400 >> >> xen/privcmd: add PRIVCMD_MMAPBATCH_V2 ioctl >> >> PRIVCMD_MMAPBATCH_V2 extends PRIVCMD_MMAPBATCH with an additional >> field for reporting the error code for every frame that could not be >> mapped. libxc prefers PRIVCMD_MMAPBATCH_V2 over PRIVCMD_MMAPBATCH. >> >> Also expand PRIVCMD_MMAPBATCH to return appropriate error-encoding top nibble >> in the mfn array. >> >> Signed-off-by: David Vrabel <david.vrabel@citrix.com> >> Signed-off-by: Andres Lagar-Cavilla <andres@lagarcavilla.org> >> >> diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c >> index 85226cb..5386f20 100644 >> --- a/drivers/xen/privcmd.c >> +++ b/drivers/xen/privcmd.c >> @@ -76,7 +76,7 @@ static void free_page_list(struct list_head *pages) >> */ >> static int gather_array(struct list_head *pagelist, >> unsigned nelem, size_t size, >> - void __user *data) >> + const void __user *data) >> { >> unsigned pageidx; >> void *pagedata; >> @@ -246,61 +246,117 @@ struct mmap_batch_state { >> domid_t domain; >> unsigned long va; >> struct vm_area_struct *vma; >> - int err; >> - >> - xen_pfn_t __user *user; >> + /* A tristate: >> + * 0 for no errors >> + * 1 if at least one error has happened (and no >> + * -ENOENT errors have happened) >> + * -ENOENT if at least 1 -ENOENT has happened. >> + */ >> + int global_error; >> + /* An array for individual errors */ >> + int *err; >> + >> + /* User-space mfn array to store errors in the second pass for V1. 
*/ >> + xen_pfn_t __user *user_mfn; >> }; >> >> static int mmap_batch_fn(void *data, void *state) >> { >> xen_pfn_t *mfnp = data; >> struct mmap_batch_state *st = state; >> + int ret; >> >> - if (xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, >> - st->vma->vm_page_prot, st->domain) < 0) { >> - *mfnp |= 0xf0000000U; >> - st->err++; >> + ret = xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, >> + st->vma->vm_page_prot, st->domain); >> + >> + /* Store error code for second pass. */ >> + *(st->err++) = ret; >> + >> + /* And see if it affects the global_error. */ >> + if (ret < 0) { >> + if (ret == -ENOENT) >> + st->global_error = -ENOENT; >> + else { >> + /* Record that at least one error has happened. */ >> + if (st->global_error == 0) >> + st->global_error = 1; >> + } >> } >> st->va += PAGE_SIZE; >> >> return 0; >> } >> >> -static int mmap_return_errors(void *data, void *state) >> +static int mmap_return_errors_v1(void *data, void *state) >> { >> xen_pfn_t *mfnp = data; >> struct mmap_batch_state *st = state; >> - >> - return put_user(*mfnp, st->user++); >> + int err = *(st->err++); >> + >> + /* >> + * V1 encodes the error codes in the 32bit top nibble of the >> + * mfn (with its known limitations vis-a-vis 64 bit callers). >> + */ >> + *mfnp |= (err == -ENOENT) ? 
>> + PRIVCMD_MMAPBATCH_PAGED_ERROR : >> + PRIVCMD_MMAPBATCH_MFN_ERROR; >> + return __put_user(*mfnp, st->user_mfn++); >> } >> >> static struct vm_operations_struct privcmd_vm_ops; >> >> -static long privcmd_ioctl_mmap_batch(void __user *udata) >> +static long privcmd_ioctl_mmap_batch(void __user *udata, int version) >> { >> int ret; >> - struct privcmd_mmapbatch m; >> + struct privcmd_mmapbatch_v2 m; >> struct mm_struct *mm = current->mm; >> struct vm_area_struct *vma; >> unsigned long nr_pages; >> LIST_HEAD(pagelist); >> + int *err_array = NULL; >> struct mmap_batch_state state; >> >> if (!xen_initial_domain()) >> return -EPERM; >> >> - if (copy_from_user(&m, udata, sizeof(m))) >> - return -EFAULT; >> + switch (version) { >> + case 1: >> + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch))) >> + return -EFAULT; >> + /* Returns per-frame error in m.arr. */ >> + m.err = NULL; >> + if (!access_ok(VERIFY_WRITE, m.arr, m.num * sizeof(*m.arr))) >> + return -EFAULT; >> + break; >> + case 2: >> + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch_v2))) >> + return -EFAULT; >> + /* Returns per-frame error code in m.err. 
*/ >> + if (!access_ok(VERIFY_WRITE, m.err, m.num * (sizeof(*m.err)))) >> + return -EFAULT; >> + break; >> + default: >> + return -EINVAL; >> + } >> >> nr_pages = m.num; >> if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) >> return -EINVAL; >> >> - ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), >> - m.arr); >> + ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), m.arr); >> + >> + if (ret) >> + goto out; >> + if (list_empty(&pagelist)) { >> + ret = -EINVAL; >> + goto out; >> + } >> >> - if (ret || list_empty(&pagelist)) >> + err_array = kcalloc(m.num, sizeof(int), GFP_KERNEL); >> + if (err_array == NULL) { >> + ret = -ENOMEM; >> goto out; >> + } >> >> down_write(&mm->mmap_sem); >> >> @@ -315,24 +371,34 @@ static long privcmd_ioctl_mmap_batch(void __user *udata) >> goto out; >> } >> >> - state.domain = m.dom; >> - state.vma = vma; >> - state.va = m.addr; >> - state.err = 0; >> + state.domain = m.dom; >> + state.vma = vma; >> + state.va = m.addr; >> + state.global_error = 0; >> + state.err = err_array; >> >> - ret = traverse_pages(m.num, sizeof(xen_pfn_t), >> - &pagelist, mmap_batch_fn, &state); >> + /* mmap_batch_fn guarantees ret == 0 */ >> + BUG_ON(traverse_pages(m.num, sizeof(xen_pfn_t), >> + &pagelist, mmap_batch_fn, &state)); >> >> up_write(&mm->mmap_sem); >> >> - if (state.err > 0) { >> - state.user = m.arr; >> + if (state.global_error && (version == 1)) { >> + /* Write back errors in second pass. */ >> + state.user_mfn = (xen_pfn_t *)m.arr; >> + state.err = err_array; >> ret = traverse_pages(m.num, sizeof(xen_pfn_t), >> - &pagelist, >> - mmap_return_errors, &state); >> - } >> + &pagelist, mmap_return_errors_v1, &state); >> + } else >> + ret = __copy_to_user(m.err, err_array, m.num * sizeof(int)); >> + >> + /* If we have not had any EFAULT-like global errors then set the global >> + * error to -ENOENT if necessary. 
*/ >> + if ((ret == 0) && (state.global_error == -ENOENT)) >> + ret = -ENOENT; >> >> out: >> + kfree(err_array); >> free_page_list(&pagelist); >> >> return ret; >> @@ -354,7 +420,11 @@ static long privcmd_ioctl(struct file *file, >> break; >> >> case IOCTL_PRIVCMD_MMAPBATCH: >> - ret = privcmd_ioctl_mmap_batch(udata); >> + ret = privcmd_ioctl_mmap_batch(udata, 1); >> + break; >> + >> + case IOCTL_PRIVCMD_MMAPBATCH_V2: >> + ret = privcmd_ioctl_mmap_batch(udata, 2); >> break; >> >> default: >> diff --git a/include/xen/privcmd.h b/include/xen/privcmd.h >> index 45c1aa1..a853168 100644 >> --- a/include/xen/privcmd.h >> +++ b/include/xen/privcmd.h >> @@ -58,13 +58,33 @@ struct privcmd_mmapbatch { >> int num; /* number of pages to populate */ >> domid_t dom; /* target domain */ >> __u64 addr; /* virtual address */ >> - xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */ >> + xen_pfn_t __user *arr; /* array of mfns - or''d with >> + PRIVCMD_MMAPBATCH_*_ERROR on err */ >> +}; >> + >> +#define PRIVCMD_MMAPBATCH_MFN_ERROR 0xf0000000U >> +#define PRIVCMD_MMAPBATCH_PAGED_ERROR 0x80000000U >> + >> +struct privcmd_mmapbatch_v2 { >> + unsigned int num; /* number of pages to populate */ >> + domid_t dom; /* target domain */ >> + __u64 addr; /* virtual address */ >> + const xen_pfn_t __user *arr; /* array of mfns */ >> + int __user *err; /* array of error codes */ >> }; >> >> /* >> * @cmd: IOCTL_PRIVCMD_HYPERCALL >> * @arg: &privcmd_hypercall_t >> * Return: Value returned from execution of the specified hypercall. >> + * >> + * @cmd: IOCTL_PRIVCMD_MMAPBATCH_V2 >> + * @arg: &struct privcmd_mmapbatch_v2 >> + * Return: 0 on success (i.e., arg->err contains valid error codes for >> + * each frame). On an error other than a failed frame remap, -1 is >> + * returned and errno is set to EINVAL, EFAULT etc. As an exception, >> + * if the operation was otherwise successful but any frame failed with >> + * -ENOENT, then -1 is returned and errno is set to ENOENT. 
>> */ >> #define IOCTL_PRIVCMD_HYPERCALL \ >> _IOC(_IOC_NONE, ''P'', 0, sizeof(struct privcmd_hypercall)) >> @@ -72,5 +92,7 @@ struct privcmd_mmapbatch { >> _IOC(_IOC_NONE, ''P'', 2, sizeof(struct privcmd_mmap)) >> #define IOCTL_PRIVCMD_MMAPBATCH \ >> _IOC(_IOC_NONE, ''P'', 3, sizeof(struct privcmd_mmapbatch)) >> +#define IOCTL_PRIVCMD_MMAPBATCH_V2 \ >> + _IOC(_IOC_NONE, ''P'', 4, sizeof(struct privcmd_mmapbatch_v2)) >> >> #endif /* __LINUX_PUBLIC_PRIVCMD_H__ */ >> >> On Aug 30, 2012, at 8:58 AM, David Vrabel wrote: >> >>> From: David Vrabel <david.vrabel@citrix.com> >>> >>> PRIVCMD_MMAPBATCH_V2 extends PRIVCMD_MMAPBATCH with an additional >>> field for reporting the error code for every frame that could not be >>> mapped. libxc prefers PRIVCMD_MMAPBATCH_V2 over PRIVCMD_MMAPBATCH. >>> >>> Signed-off-by: David Vrabel <david.vrabel@citrix.com> >>> --- >>> drivers/xen/privcmd.c | 99 +++++++++++++++++++++++++++++++++++++++--------- >>> include/xen/privcmd.h | 23 +++++++++++- >>> 2 files changed, 102 insertions(+), 20 deletions(-) >>> >>> diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c >>> index ccee0f1..c0e89e7 100644 >>> --- a/drivers/xen/privcmd.c >>> +++ b/drivers/xen/privcmd.c >>> @@ -76,7 +76,7 @@ static void free_page_list(struct list_head *pages) >>> */ >>> static int gather_array(struct list_head *pagelist, >>> unsigned nelem, size_t size, >>> - void __user *data) >>> + const void __user *data) >>> { >>> unsigned pageidx; >>> void *pagedata; >>> @@ -248,18 +248,37 @@ struct mmap_batch_state { >>> struct vm_area_struct *vma; >>> int err; >>> >>> - xen_pfn_t __user *user; >>> + xen_pfn_t __user *user_mfn; >>> + int __user *user_err; >>> }; >>> >>> static int mmap_batch_fn(void *data, void *state) >>> { >>> xen_pfn_t *mfnp = data; >>> struct mmap_batch_state *st = state; >>> + int ret; >>> >>> - if (xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, >>> - st->vma->vm_page_prot, st->domain) < 0) { >>> - *mfnp |= 0xf0000000U; >>> - 
st->err++; >>> + ret = xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, >>> + st->vma->vm_page_prot, st->domain); >>> + if (ret < 0) { >>> + /* >>> + * Error reporting is a mess but userspace relies on >>> + * it behaving this way. >>> + * >>> + * V2 needs to a) return the result of each frame''s >>> + * remap; and b) return -ENOENT if any frame failed >>> + * with -ENOENT. >>> + * >>> + * In this first pass the error code is saved by >>> + * overwriting the mfn and an error is indicated in >>> + * st->err. >>> + * >>> + * The second pass by mmap_return_errors() will write >>> + * the error codes to user space and get the right >>> + * ioctl return value. >>> + */ >>> + *(int *)mfnp = ret; >>> + st->err = ret; >>> } >>> st->va += PAGE_SIZE; >>> >>> @@ -270,16 +289,33 @@ static int mmap_return_errors(void *data, void *state) >>> { >>> xen_pfn_t *mfnp = data; >>> struct mmap_batch_state *st = state; >>> + int ret; >>> + >>> + if (st->user_err) { >>> + int err = *(int *)mfnp; >>> + >>> + if (err == -ENOENT) >>> + st->err = err; >>> >>> - return put_user(*mfnp, st->user++); >>> + return __put_user(err, st->user_err++); >>> + } else { >>> + xen_pfn_t mfn; >>> + >>> + ret = __get_user(mfn, st->user_mfn); >>> + if (ret < 0) >>> + return ret; >>> + >>> + mfn |= PRIVCMD_MMAPBATCH_MFN_ERROR; >>> + return __put_user(mfn, st->user_mfn++); >>> + } >>> } >>> >>> static struct vm_operations_struct privcmd_vm_ops; >>> >>> -static long privcmd_ioctl_mmap_batch(void __user *udata) >>> +static long privcmd_ioctl_mmap_batch(void __user *udata, int version) >>> { >>> int ret; >>> - struct privcmd_mmapbatch m; >>> + struct privcmd_mmapbatch_v2 m; >>> struct mm_struct *mm = current->mm; >>> struct vm_area_struct *vma; >>> unsigned long nr_pages; >>> @@ -289,15 +325,31 @@ static long privcmd_ioctl_mmap_batch(void __user *udata) >>> if (!xen_initial_domain()) >>> return -EPERM; >>> >>> - if (copy_from_user(&m, udata, sizeof(m))) >>> - return -EFAULT; >>> + switch (version) 
{ >>> + case 1: >>> + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch))) >>> + return -EFAULT; >>> + /* Returns per-frame error in m.arr. */ >>> + m.err = NULL; >>> + if (!access_ok(VERIFY_WRITE, m.arr, m.num * sizeof(*m.arr))) >>> + return -EFAULT; >>> + break; >>> + case 2: >>> + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch_v2))) >>> + return -EFAULT; >>> + /* Returns per-frame error code in m.err. */ >>> + if (!access_ok(VERIFY_WRITE, m.err, m.num * (sizeof(*m.err)))) >>> + return -EFAULT; >>> + break; >>> + default: >>> + return -EINVAL; >>> + } >>> >>> nr_pages = m.num; >>> if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) >>> return -EINVAL; >>> >>> - ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), >>> - m.arr); >>> + ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), m.arr); >>> >>> if (ret || list_empty(&pagelist)) >>> goto out; >>> @@ -325,12 +377,17 @@ static long privcmd_ioctl_mmap_batch(void __user *udata) >>> >>> up_write(&mm->mmap_sem); >>> >>> - if (state.err > 0) { >>> - state.user = m.arr; >>> + if (state.err) { >>> + state.err = 0; >>> + state.user_mfn = (xen_pfn_t *)m.arr; >>> + state.user_err = m.err; >>> ret = traverse_pages(m.num, sizeof(xen_pfn_t), >>> - &pagelist, >>> - mmap_return_errors, &state); >>> - } >>> + &pagelist, >>> + mmap_return_errors, &state); >>> + if (ret >= 0) >>> + ret = state.err; >>> + } else if (m.err) >>> + __clear_user(m.err, m.num * sizeof(*m.err)); >>> >>> out: >>> free_page_list(&pagelist); >>> @@ -354,7 +411,11 @@ static long privcmd_ioctl(struct file *file, >>> break; >>> >>> case IOCTL_PRIVCMD_MMAPBATCH: >>> - ret = privcmd_ioctl_mmap_batch(udata); >>> + ret = privcmd_ioctl_mmap_batch(udata, 1); >>> + break; >>> + >>> + case IOCTL_PRIVCMD_MMAPBATCH_V2: >>> + ret = privcmd_ioctl_mmap_batch(udata, 2); >>> break; >>> >>> default: >>> diff --git a/include/xen/privcmd.h b/include/xen/privcmd.h >>> index 17857fb..f60d75c 100644 >>> --- a/include/xen/privcmd.h >>> +++ 
b/include/xen/privcmd.h >>> @@ -59,13 +59,32 @@ struct privcmd_mmapbatch { >>> int num; /* number of pages to populate */ >>> domid_t dom; /* target domain */ >>> __u64 addr; /* virtual address */ >>> - xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */ >>> + xen_pfn_t __user *arr; /* array of mfns - or''d with >>> + PRIVCMD_MMAPBATCH_MFN_ERROR on err */ >>> +}; >>> + >>> +#define PRIVCMD_MMAPBATCH_MFN_ERROR 0xf0000000U >>> + >>> +struct privcmd_mmapbatch_v2 { >>> + unsigned int num; /* number of pages to populate */ >>> + domid_t dom; /* target domain */ >>> + __u64 addr; /* virtual address */ >>> + const xen_pfn_t __user *arr; /* array of mfns */ >>> + int __user *err; /* array of error codes */ >>> }; >>> >>> /* >>> * @cmd: IOCTL_PRIVCMD_HYPERCALL >>> * @arg: &privcmd_hypercall_t >>> * Return: Value returned from execution of the specified hypercall. >>> + * >>> + * @cmd: IOCTL_PRIVCMD_MMAPBATCH_V2 >>> + * @arg: &struct privcmd_mmapbatch_v2 >>> + * Return: 0 on success (i.e., arg->err contains valid error codes for >>> + * each frame). On an error other than a failed frame remap, -1 is >>> + * returned and errno is set to EINVAL, EFAULT etc. As an exception, >>> + * if the operation was otherwise successful but any frame failed with >>> + * -ENOENT, then -1 is returned and errno is set to ENOENT. >>> */ >>> #define IOCTL_PRIVCMD_HYPERCALL \ >>> _IOC(_IOC_NONE, ''P'', 0, sizeof(struct privcmd_hypercall)) >>> @@ -73,5 +92,7 @@ struct privcmd_mmapbatch { >>> _IOC(_IOC_NONE, ''P'', 2, sizeof(struct privcmd_mmap)) >>> #define IOCTL_PRIVCMD_MMAPBATCH \ >>> _IOC(_IOC_NONE, ''P'', 3, sizeof(struct privcmd_mmapbatch)) >>> +#define IOCTL_PRIVCMD_MMAPBATCH_V2 \ >>> + _IOC(_IOC_NONE, ''P'', 4, sizeof(struct privcmd_mmapbatch_v2)) >>> >>> #endif /* __LINUX_PUBLIC_PRIVCMD_H__ */ >>> -- >>> 1.7.2.5 >>>
Konrad Rzeszutek Wilk
2012-Sep-05 17:40 UTC
Re: [PATCH 2/2] xen/privcmd: add PRIVCMD_MMAPBATCH_V2 ioctl
On Wed, Sep 05, 2012 at 01:09:32PM -0400, Andres Lagar-Cavilla wrote:> Super. To which branch are you applying these now? (n00b question but have to ask)They will show up on stable/for-linus-3.7 once the overnight tests pass.> Andres > On Sep 5, 2012, at 12:21 PM, Konrad Rzeszutek Wilk wrote: > > > On Fri, Aug 31, 2012 at 09:59:30AM -0400, Andres Lagar-Cavilla wrote: > >> Re-spin of alternative patch after David''s feedback. > >> Thanks > >> Andres > > > > applied. fixed some whitespace issues. > >> > >> commit ab351a5cef1797935b083c2f6e72800a8949c515 > >> Author: Andres Lagar-Cavilla <andres@lagarcavilla.org> > >> Date: Thu Aug 30 12:23:33 2012 -0400 > >> > >> xen/privcmd: add PRIVCMD_MMAPBATCH_V2 ioctl > >> > >> PRIVCMD_MMAPBATCH_V2 extends PRIVCMD_MMAPBATCH with an additional > >> field for reporting the error code for every frame that could not be > >> mapped. libxc prefers PRIVCMD_MMAPBATCH_V2 over PRIVCMD_MMAPBATCH. > >> > >> Also expand PRIVCMD_MMAPBATCH to return appropriate error-encoding top nibble > >> in the mfn array. 
> >> > >> Signed-off-by: David Vrabel <david.vrabel@citrix.com> > >> Signed-off-by: Andres Lagar-Cavilla <andres@lagarcavilla.org> > >> > >> diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c > >> index 85226cb..5386f20 100644 > >> --- a/drivers/xen/privcmd.c > >> +++ b/drivers/xen/privcmd.c > >> @@ -76,7 +76,7 @@ static void free_page_list(struct list_head *pages) > >> */ > >> static int gather_array(struct list_head *pagelist, > >> unsigned nelem, size_t size, > >> - void __user *data) > >> + const void __user *data) > >> { > >> unsigned pageidx; > >> void *pagedata; > >> @@ -246,61 +246,117 @@ struct mmap_batch_state { > >> domid_t domain; > >> unsigned long va; > >> struct vm_area_struct *vma; > >> - int err; > >> - > >> - xen_pfn_t __user *user; > >> + /* A tristate: > >> + * 0 for no errors > >> + * 1 if at least one error has happened (and no > >> + * -ENOENT errors have happened) > >> + * -ENOENT if at least 1 -ENOENT has happened. > >> + */ > >> + int global_error; > >> + /* An array for individual errors */ > >> + int *err; > >> + > >> + /* User-space mfn array to store errors in the second pass for V1. */ > >> + xen_pfn_t __user *user_mfn; > >> }; > >> > >> static int mmap_batch_fn(void *data, void *state) > >> { > >> xen_pfn_t *mfnp = data; > >> struct mmap_batch_state *st = state; > >> + int ret; > >> > >> - if (xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, > >> - st->vma->vm_page_prot, st->domain) < 0) { > >> - *mfnp |= 0xf0000000U; > >> - st->err++; > >> + ret = xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, > >> + st->vma->vm_page_prot, st->domain); > >> + > >> + /* Store error code for second pass. */ > >> + *(st->err++) = ret; > >> + > >> + /* And see if it affects the global_error. */ > >> + if (ret < 0) { > >> + if (ret == -ENOENT) > >> + st->global_error = -ENOENT; > >> + else { > >> + /* Record that at least one error has happened. 
*/ > >> + if (st->global_error == 0) > >> + st->global_error = 1; > >> + } > >> } > >> st->va += PAGE_SIZE; > >> > >> return 0; > >> } > >> > >> -static int mmap_return_errors(void *data, void *state) > >> +static int mmap_return_errors_v1(void *data, void *state) > >> { > >> xen_pfn_t *mfnp = data; > >> struct mmap_batch_state *st = state; > >> - > >> - return put_user(*mfnp, st->user++); > >> + int err = *(st->err++); > >> + > >> + /* > >> + * V1 encodes the error codes in the 32bit top nibble of the > >> + * mfn (with its known limitations vis-a-vis 64 bit callers). > >> + */ > >> + *mfnp |= (err == -ENOENT) ? > >> + PRIVCMD_MMAPBATCH_PAGED_ERROR : > >> + PRIVCMD_MMAPBATCH_MFN_ERROR; > >> + return __put_user(*mfnp, st->user_mfn++); > >> } > >> > >> static struct vm_operations_struct privcmd_vm_ops; > >> > >> -static long privcmd_ioctl_mmap_batch(void __user *udata) > >> +static long privcmd_ioctl_mmap_batch(void __user *udata, int version) > >> { > >> int ret; > >> - struct privcmd_mmapbatch m; > >> + struct privcmd_mmapbatch_v2 m; > >> struct mm_struct *mm = current->mm; > >> struct vm_area_struct *vma; > >> unsigned long nr_pages; > >> LIST_HEAD(pagelist); > >> + int *err_array = NULL; > >> struct mmap_batch_state state; > >> > >> if (!xen_initial_domain()) > >> return -EPERM; > >> > >> - if (copy_from_user(&m, udata, sizeof(m))) > >> - return -EFAULT; > >> + switch (version) { > >> + case 1: > >> + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch))) > >> + return -EFAULT; > >> + /* Returns per-frame error in m.arr. */ > >> + m.err = NULL; > >> + if (!access_ok(VERIFY_WRITE, m.arr, m.num * sizeof(*m.arr))) > >> + return -EFAULT; > >> + break; > >> + case 2: > >> + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch_v2))) > >> + return -EFAULT; > >> + /* Returns per-frame error code in m.err. 
*/ > >> + if (!access_ok(VERIFY_WRITE, m.err, m.num * (sizeof(*m.err)))) > >> + return -EFAULT; > >> + break; > >> + default: > >> + return -EINVAL; > >> + } > >> > >> nr_pages = m.num; > >> if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) > >> return -EINVAL; > >> > >> - ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), > >> - m.arr); > >> + ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), m.arr); > >> + > >> + if (ret) > >> + goto out; > >> + if (list_empty(&pagelist)) { > >> + ret = -EINVAL; > >> + goto out; > >> + } > >> > >> - if (ret || list_empty(&pagelist)) > >> + err_array = kcalloc(m.num, sizeof(int), GFP_KERNEL); > >> + if (err_array == NULL) { > >> + ret = -ENOMEM; > >> goto out; > >> + } > >> > >> down_write(&mm->mmap_sem); > >> > >> @@ -315,24 +371,34 @@ static long privcmd_ioctl_mmap_batch(void __user *udata) > >> goto out; > >> } > >> > >> - state.domain = m.dom; > >> - state.vma = vma; > >> - state.va = m.addr; > >> - state.err = 0; > >> + state.domain = m.dom; > >> + state.vma = vma; > >> + state.va = m.addr; > >> + state.global_error = 0; > >> + state.err = err_array; > >> > >> - ret = traverse_pages(m.num, sizeof(xen_pfn_t), > >> - &pagelist, mmap_batch_fn, &state); > >> + /* mmap_batch_fn guarantees ret == 0 */ > >> + BUG_ON(traverse_pages(m.num, sizeof(xen_pfn_t), > >> + &pagelist, mmap_batch_fn, &state)); > >> > >> up_write(&mm->mmap_sem); > >> > >> - if (state.err > 0) { > >> - state.user = m.arr; > >> + if (state.global_error && (version == 1)) { > >> + /* Write back errors in second pass. 
*/ > >> + state.user_mfn = (xen_pfn_t *)m.arr; > >> + state.err = err_array; > >> ret = traverse_pages(m.num, sizeof(xen_pfn_t), > >> - &pagelist, > >> - mmap_return_errors, &state); > >> - } > >> + &pagelist, mmap_return_errors_v1, &state); > >> + } else > >> + ret = __copy_to_user(m.err, err_array, m.num * sizeof(int)); > >> + > >> + /* If we have not had any EFAULT-like global errors then set the global > >> + * error to -ENOENT if necessary. */ > >> + if ((ret == 0) && (state.global_error == -ENOENT)) > >> + ret = -ENOENT; > >> > >> out: > >> + kfree(err_array); > >> free_page_list(&pagelist); > >> > >> return ret; > >> @@ -354,7 +420,11 @@ static long privcmd_ioctl(struct file *file, > >> break; > >> > >> case IOCTL_PRIVCMD_MMAPBATCH: > >> - ret = privcmd_ioctl_mmap_batch(udata); > >> + ret = privcmd_ioctl_mmap_batch(udata, 1); > >> + break; > >> + > >> + case IOCTL_PRIVCMD_MMAPBATCH_V2: > >> + ret = privcmd_ioctl_mmap_batch(udata, 2); > >> break; > >> > >> default: > >> diff --git a/include/xen/privcmd.h b/include/xen/privcmd.h > >> index 45c1aa1..a853168 100644 > >> --- a/include/xen/privcmd.h > >> +++ b/include/xen/privcmd.h > >> @@ -58,13 +58,33 @@ struct privcmd_mmapbatch { > >> int num; /* number of pages to populate */ > >> domid_t dom; /* target domain */ > >> __u64 addr; /* virtual address */ > >> - xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */ > >> + xen_pfn_t __user *arr; /* array of mfns - or''d with > >> + PRIVCMD_MMAPBATCH_*_ERROR on err */ > >> +}; > >> + > >> +#define PRIVCMD_MMAPBATCH_MFN_ERROR 0xf0000000U > >> +#define PRIVCMD_MMAPBATCH_PAGED_ERROR 0x80000000U > >> + > >> +struct privcmd_mmapbatch_v2 { > >> + unsigned int num; /* number of pages to populate */ > >> + domid_t dom; /* target domain */ > >> + __u64 addr; /* virtual address */ > >> + const xen_pfn_t __user *arr; /* array of mfns */ > >> + int __user *err; /* array of error codes */ > >> }; > >> > >> /* > >> * @cmd: IOCTL_PRIVCMD_HYPERCALL > >> * @arg: 
&privcmd_hypercall_t > >> * Return: Value returned from execution of the specified hypercall. > >> + * > >> + * @cmd: IOCTL_PRIVCMD_MMAPBATCH_V2 > >> + * @arg: &struct privcmd_mmapbatch_v2 > >> + * Return: 0 on success (i.e., arg->err contains valid error codes for > >> + * each frame). On an error other than a failed frame remap, -1 is > >> + * returned and errno is set to EINVAL, EFAULT etc. As an exception, > >> + * if the operation was otherwise successful but any frame failed with > >> + * -ENOENT, then -1 is returned and errno is set to ENOENT. > >> */ > >> #define IOCTL_PRIVCMD_HYPERCALL \ > >> _IOC(_IOC_NONE, ''P'', 0, sizeof(struct privcmd_hypercall)) > >> @@ -72,5 +92,7 @@ struct privcmd_mmapbatch { > >> _IOC(_IOC_NONE, ''P'', 2, sizeof(struct privcmd_mmap)) > >> #define IOCTL_PRIVCMD_MMAPBATCH \ > >> _IOC(_IOC_NONE, ''P'', 3, sizeof(struct privcmd_mmapbatch)) > >> +#define IOCTL_PRIVCMD_MMAPBATCH_V2 \ > >> + _IOC(_IOC_NONE, ''P'', 4, sizeof(struct privcmd_mmapbatch_v2)) > >> > >> #endif /* __LINUX_PUBLIC_PRIVCMD_H__ */ > >> > >> On Aug 30, 2012, at 8:58 AM, David Vrabel wrote: > >> > >>> From: David Vrabel <david.vrabel@citrix.com> > >>> > >>> PRIVCMD_MMAPBATCH_V2 extends PRIVCMD_MMAPBATCH with an additional > >>> field for reporting the error code for every frame that could not be > >>> mapped. libxc prefers PRIVCMD_MMAPBATCH_V2 over PRIVCMD_MMAPBATCH. 
> >>> > >>> Signed-off-by: David Vrabel <david.vrabel@citrix.com> > >>> --- > >>> drivers/xen/privcmd.c | 99 +++++++++++++++++++++++++++++++++++++++--------- > >>> include/xen/privcmd.h | 23 +++++++++++- > >>> 2 files changed, 102 insertions(+), 20 deletions(-) > >>> > >>> diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c > >>> index ccee0f1..c0e89e7 100644 > >>> --- a/drivers/xen/privcmd.c > >>> +++ b/drivers/xen/privcmd.c > >>> @@ -76,7 +76,7 @@ static void free_page_list(struct list_head *pages) > >>> */ > >>> static int gather_array(struct list_head *pagelist, > >>> unsigned nelem, size_t size, > >>> - void __user *data) > >>> + const void __user *data) > >>> { > >>> unsigned pageidx; > >>> void *pagedata; > >>> @@ -248,18 +248,37 @@ struct mmap_batch_state { > >>> struct vm_area_struct *vma; > >>> int err; > >>> > >>> - xen_pfn_t __user *user; > >>> + xen_pfn_t __user *user_mfn; > >>> + int __user *user_err; > >>> }; > >>> > >>> static int mmap_batch_fn(void *data, void *state) > >>> { > >>> xen_pfn_t *mfnp = data; > >>> struct mmap_batch_state *st = state; > >>> + int ret; > >>> > >>> - if (xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, > >>> - st->vma->vm_page_prot, st->domain) < 0) { > >>> - *mfnp |= 0xf0000000U; > >>> - st->err++; > >>> + ret = xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, > >>> + st->vma->vm_page_prot, st->domain); > >>> + if (ret < 0) { > >>> + /* > >>> + * Error reporting is a mess but userspace relies on > >>> + * it behaving this way. > >>> + * > >>> + * V2 needs to a) return the result of each frame''s > >>> + * remap; and b) return -ENOENT if any frame failed > >>> + * with -ENOENT. > >>> + * > >>> + * In this first pass the error code is saved by > >>> + * overwriting the mfn and an error is indicated in > >>> + * st->err. > >>> + * > >>> + * The second pass by mmap_return_errors() will write > >>> + * the error codes to user space and get the right > >>> + * ioctl return value. 
> >>> + */ > >>> + *(int *)mfnp = ret; > >>> + st->err = ret; > >>> } > >>> st->va += PAGE_SIZE; > >>> > >>> @@ -270,16 +289,33 @@ static int mmap_return_errors(void *data, void *state) > >>> { > >>> xen_pfn_t *mfnp = data; > >>> struct mmap_batch_state *st = state; > >>> + int ret; > >>> + > >>> + if (st->user_err) { > >>> + int err = *(int *)mfnp; > >>> + > >>> + if (err == -ENOENT) > >>> + st->err = err; > >>> > >>> - return put_user(*mfnp, st->user++); > >>> + return __put_user(err, st->user_err++); > >>> + } else { > >>> + xen_pfn_t mfn; > >>> + > >>> + ret = __get_user(mfn, st->user_mfn); > >>> + if (ret < 0) > >>> + return ret; > >>> + > >>> + mfn |= PRIVCMD_MMAPBATCH_MFN_ERROR; > >>> + return __put_user(mfn, st->user_mfn++); > >>> + } > >>> } > >>> > >>> static struct vm_operations_struct privcmd_vm_ops; > >>> > >>> -static long privcmd_ioctl_mmap_batch(void __user *udata) > >>> +static long privcmd_ioctl_mmap_batch(void __user *udata, int version) > >>> { > >>> int ret; > >>> - struct privcmd_mmapbatch m; > >>> + struct privcmd_mmapbatch_v2 m; > >>> struct mm_struct *mm = current->mm; > >>> struct vm_area_struct *vma; > >>> unsigned long nr_pages; > >>> @@ -289,15 +325,31 @@ static long privcmd_ioctl_mmap_batch(void __user *udata) > >>> if (!xen_initial_domain()) > >>> return -EPERM; > >>> > >>> - if (copy_from_user(&m, udata, sizeof(m))) > >>> - return -EFAULT; > >>> + switch (version) { > >>> + case 1: > >>> + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch))) > >>> + return -EFAULT; > >>> + /* Returns per-frame error in m.arr. */ > >>> + m.err = NULL; > >>> + if (!access_ok(VERIFY_WRITE, m.arr, m.num * sizeof(*m.arr))) > >>> + return -EFAULT; > >>> + break; > >>> + case 2: > >>> + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch_v2))) > >>> + return -EFAULT; > >>> + /* Returns per-frame error code in m.err. 
*/ > >>> + if (!access_ok(VERIFY_WRITE, m.err, m.num * (sizeof(*m.err)))) > >>> + return -EFAULT; > >>> + break; > >>> + default: > >>> + return -EINVAL; > >>> + } > >>> > >>> nr_pages = m.num; > >>> if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) > >>> return -EINVAL; > >>> > >>> - ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), > >>> - m.arr); > >>> + ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), m.arr); > >>> > >>> if (ret || list_empty(&pagelist)) > >>> goto out; > >>> @@ -325,12 +377,17 @@ static long privcmd_ioctl_mmap_batch(void __user *udata) > >>> > >>> up_write(&mm->mmap_sem); > >>> > >>> - if (state.err > 0) { > >>> - state.user = m.arr; > >>> + if (state.err) { > >>> + state.err = 0; > >>> + state.user_mfn = (xen_pfn_t *)m.arr; > >>> + state.user_err = m.err; > >>> ret = traverse_pages(m.num, sizeof(xen_pfn_t), > >>> - &pagelist, > >>> - mmap_return_errors, &state); > >>> - } > >>> + &pagelist, > >>> + mmap_return_errors, &state); > >>> + if (ret >= 0) > >>> + ret = state.err; > >>> + } else if (m.err) > >>> + __clear_user(m.err, m.num * sizeof(*m.err)); > >>> > >>> out: > >>> free_page_list(&pagelist); > >>> @@ -354,7 +411,11 @@ static long privcmd_ioctl(struct file *file, > >>> break; > >>> > >>> case IOCTL_PRIVCMD_MMAPBATCH: > >>> - ret = privcmd_ioctl_mmap_batch(udata); > >>> + ret = privcmd_ioctl_mmap_batch(udata, 1); > >>> + break; > >>> + > >>> + case IOCTL_PRIVCMD_MMAPBATCH_V2: > >>> + ret = privcmd_ioctl_mmap_batch(udata, 2); > >>> break; > >>> > >>> default: > >>> diff --git a/include/xen/privcmd.h b/include/xen/privcmd.h > >>> index 17857fb..f60d75c 100644 > >>> --- a/include/xen/privcmd.h > >>> +++ b/include/xen/privcmd.h > >>> @@ -59,13 +59,32 @@ struct privcmd_mmapbatch { > >>> int num; /* number of pages to populate */ > >>> domid_t dom; /* target domain */ > >>> __u64 addr; /* virtual address */ > >>> - xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */ > >>> + xen_pfn_t __user *arr; /* array 
of mfns - or''d with > >>> + PRIVCMD_MMAPBATCH_MFN_ERROR on err */ > >>> +}; > >>> + > >>> +#define PRIVCMD_MMAPBATCH_MFN_ERROR 0xf0000000U > >>> + > >>> +struct privcmd_mmapbatch_v2 { > >>> + unsigned int num; /* number of pages to populate */ > >>> + domid_t dom; /* target domain */ > >>> + __u64 addr; /* virtual address */ > >>> + const xen_pfn_t __user *arr; /* array of mfns */ > >>> + int __user *err; /* array of error codes */ > >>> }; > >>> > >>> /* > >>> * @cmd: IOCTL_PRIVCMD_HYPERCALL > >>> * @arg: &privcmd_hypercall_t > >>> * Return: Value returned from execution of the specified hypercall. > >>> + * > >>> + * @cmd: IOCTL_PRIVCMD_MMAPBATCH_V2 > >>> + * @arg: &struct privcmd_mmapbatch_v2 > >>> + * Return: 0 on success (i.e., arg->err contains valid error codes for > >>> + * each frame). On an error other than a failed frame remap, -1 is > >>> + * returned and errno is set to EINVAL, EFAULT etc. As an exception, > >>> + * if the operation was otherwise successful but any frame failed with > >>> + * -ENOENT, then -1 is returned and errno is set to ENOENT. > >>> */ > >>> #define IOCTL_PRIVCMD_HYPERCALL \ > >>> _IOC(_IOC_NONE, ''P'', 0, sizeof(struct privcmd_hypercall)) > >>> @@ -73,5 +92,7 @@ struct privcmd_mmapbatch { > >>> _IOC(_IOC_NONE, ''P'', 2, sizeof(struct privcmd_mmap)) > >>> #define IOCTL_PRIVCMD_MMAPBATCH \ > >>> _IOC(_IOC_NONE, ''P'', 3, sizeof(struct privcmd_mmapbatch)) > >>> +#define IOCTL_PRIVCMD_MMAPBATCH_V2 \ > >>> + _IOC(_IOC_NONE, ''P'', 4, sizeof(struct privcmd_mmapbatch_v2)) > >>> > >>> #endif /* __LINUX_PUBLIC_PRIVCMD_H__ */ > >>> -- > >>> 1.7.2.5 > >>>
Konrad Rzeszutek Wilk
2012-Sep-05 18:57 UTC
Re: [PATCHv3 0/2] xen/privcmd: support for paged-out frames
On Thu, Aug 30, 2012 at 04:12:03PM -0400, Andres Lagar-Cavilla wrote:> On Aug 30, 2012, at 4:05 PM, Konrad Rzeszutek Wilk wrote: > > >> I think this should probably get a "Tested-by" Andres or someone else > >> who uses paging before being applied. > > > > How do I test it? Is there an easy explanation/tutorial Andres? > > I have a unit test lying somewhere but I'll need a day to dig it out. If you can send that that would be super..> > Or you can start an hvm, pause it, fire up xenpaging with debug output so it tells you what it paged out, and then trace libxc as you xc_map_foreign_bulk a known batch of evicted pfns. On the first try it should get rc == -1, errno == ENOENT, error field for the pfn == ENOENT. On the second try it should all be dandy. >This works well with Xen 4.1 or should I use Xen 4.2?> Thanks > Andres
Andres Lagar-Cavilla
2012-Sep-05 19:51 UTC
Re: [PATCHv3 0/2] xen/privcmd: support for paged-out frames
On Sep 5, 2012, at 2:57 PM, Konrad Rzeszutek Wilk wrote:> On Thu, Aug 30, 2012 at 04:12:03PM -0400, Andres Lagar-Cavilla wrote: >> On Aug 30, 2012, at 4:05 PM, Konrad Rzeszutek Wilk wrote: >> >>>> I think this should probably get a "Tested-by" Andres or someone else >>>> who uses paging before being applied. >>> >>> How do I test it? Is there an easy explanation/tutorial Andres? >> >> I have a unit test lying somewhere but I'll need a day to dig it out. > > If you can send that that would be super.. Yeah. Sorry, forgot. Will take me a bit though.>> >> Or you can start an hvm, pause it, fire up xenpaging with debug output so it tells you what it paged out, and then trace libxc as you xc_map_foreign_bulk a known batch of evicted pfns. On the first try it should get rc == -1, errno == ENOENT, error field for the pfn == ENOENT. On the second try it should all be dandy. >> > > This works well with Xen 4.1 or should I use Xen 4.2? Xen-4.1 is advertised to work with this but I've never tried. Xen-4.2 is what I use all the time Andres>> Thanks >> Andres
Konrad Rzeszutek Wilk
2012-Sep-05 20:05 UTC
Re: [PATCHv3 0/2] xen/privcmd: support for paged-out frames
On Wed, Sep 05, 2012 at 03:51:03PM -0400, Andres Lagar-Cavilla wrote:> > On Sep 5, 2012, at 2:57 PM, Konrad Rzeszutek Wilk wrote: > > > On Thu, Aug 30, 2012 at 04:12:03PM -0400, Andres Lagar-Cavilla wrote: > >> On Aug 30, 2012, at 4:05 PM, Konrad Rzeszutek Wilk wrote: > >> > >>>> I think this should probably get a "Tested-by" Andres or someone else > >>>> who uses paging before being applied. > >>> > >>> How do I test it? Is there an easy explanation/tutorial Andres? > >> > >> I have a unit test lying somewhere but I'll need a day to dig it out. > > > > If you can send that that would be super.. > Yeah. Sorry, forgot. Will take me a bit though. > >> > >> Or you can start an hvm, pause it, fire up xenpaging with debug output so it tells you what it paged out, and then trace libxc as you xc_map_foreign_bulk a known batch of evicted pfns. On the first try it should get rc == -1, errno == ENOENT, error field for the pfn == ENOENT. On the second try it should all be dandy. > >> > > > > This works well with Xen 4.1 or should I use Xen 4.2? > > Xen-4.1 is advertised to work with this but I've never tried. Xen-4.2 is what I use all the time. OK, Well I can also hoist the testing of this to you. So let me make sure it works overnight and then push out the patches to my #linux-next tree and you can test it and make sure nothing is amiss.> > Andres > > >> Thanks > >> Andres
Andres Lagar-Cavilla
2012-Sep-06 13:41 UTC
Re: [PATCH 2/2] xen/privcmd: add PRIVCMD_MMAPBATCH_V2 ioctl
On Sep 5, 2012, at 1:40 PM, Konrad Rzeszutek Wilk wrote:> On Wed, Sep 05, 2012 at 01:09:32PM -0400, Andres Lagar-Cavilla wrote: >> Super. To which branch are you applying these now? (n00b question but have to ask) > > They will show up on stable/for-linus-3.7 once the overnight tests pass.I would be very surprised if this passed your nighties. This is because the following hunk is necessary: diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 5386f20..e4dfa3b 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -389,7 +389,7 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version) state.err = err_array; ret = traverse_pages(m.num, sizeof(xen_pfn_t), &pagelist, mmap_return_errors_v1, &state); - } else + } else if (version == 2) ret = __copy_to_user(m.err, err_array, m.num * sizeof(int)); /* If we have not had any EFAULT-like global errors then set the global I can either resubmit the original patch with this squashed in, or send this stand-alone. Nightlies may have passed if your libxc never exercises v1 in favor of v2. But this is broken for v1 as it will unconditionally attempt a copy to user on a NULL target and this set rc to EFAULT. Andres> >> Andres >> On Sep 5, 2012, at 12:21 PM, Konrad Rzeszutek Wilk wrote: >> >>> On Fri, Aug 31, 2012 at 09:59:30AM -0400, Andres Lagar-Cavilla wrote: >>>> Re-spin of alternative patch after David''s feedback. >>>> Thanks >>>> Andres >>> >>> applied. fixed some whitespace issues. >>>> >>>> commit ab351a5cef1797935b083c2f6e72800a8949c515 >>>> Author: Andres Lagar-Cavilla <andres@lagarcavilla.org> >>>> Date: Thu Aug 30 12:23:33 2012 -0400 >>>> >>>> xen/privcmd: add PRIVCMD_MMAPBATCH_V2 ioctl >>>> >>>> PRIVCMD_MMAPBATCH_V2 extends PRIVCMD_MMAPBATCH with an additional >>>> field for reporting the error code for every frame that could not be >>>> mapped. libxc prefers PRIVCMD_MMAPBATCH_V2 over PRIVCMD_MMAPBATCH. 
>>>> >>>> Also expand PRIVCMD_MMAPBATCH to return appropriate error-encoding top nibble >>>> in the mfn array. >>>> >>>> Signed-off-by: David Vrabel <david.vrabel@citrix.com> >>>> Signed-off-by: Andres Lagar-Cavilla <andres@lagarcavilla.org> >>>> >>>> diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c >>>> index 85226cb..5386f20 100644 >>>> --- a/drivers/xen/privcmd.c >>>> +++ b/drivers/xen/privcmd.c >>>> @@ -76,7 +76,7 @@ static void free_page_list(struct list_head *pages) >>>> */ >>>> static int gather_array(struct list_head *pagelist, >>>> unsigned nelem, size_t size, >>>> - void __user *data) >>>> + const void __user *data) >>>> { >>>> unsigned pageidx; >>>> void *pagedata; >>>> @@ -246,61 +246,117 @@ struct mmap_batch_state { >>>> domid_t domain; >>>> unsigned long va; >>>> struct vm_area_struct *vma; >>>> - int err; >>>> - >>>> - xen_pfn_t __user *user; >>>> + /* A tristate: >>>> + * 0 for no errors >>>> + * 1 if at least one error has happened (and no >>>> + * -ENOENT errors have happened) >>>> + * -ENOENT if at least 1 -ENOENT has happened. >>>> + */ >>>> + int global_error; >>>> + /* An array for individual errors */ >>>> + int *err; >>>> + >>>> + /* User-space mfn array to store errors in the second pass for V1. */ >>>> + xen_pfn_t __user *user_mfn; >>>> }; >>>> >>>> static int mmap_batch_fn(void *data, void *state) >>>> { >>>> xen_pfn_t *mfnp = data; >>>> struct mmap_batch_state *st = state; >>>> + int ret; >>>> >>>> - if (xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, >>>> - st->vma->vm_page_prot, st->domain) < 0) { >>>> - *mfnp |= 0xf0000000U; >>>> - st->err++; >>>> + ret = xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, >>>> + st->vma->vm_page_prot, st->domain); >>>> + >>>> + /* Store error code for second pass. */ >>>> + *(st->err++) = ret; >>>> + >>>> + /* And see if it affects the global_error. 
*/ >>>> + if (ret < 0) { >>>> + if (ret == -ENOENT) >>>> + st->global_error = -ENOENT; >>>> + else { >>>> + /* Record that at least one error has happened. */ >>>> + if (st->global_error == 0) >>>> + st->global_error = 1; >>>> + } >>>> } >>>> st->va += PAGE_SIZE; >>>> >>>> return 0; >>>> } >>>> >>>> -static int mmap_return_errors(void *data, void *state) >>>> +static int mmap_return_errors_v1(void *data, void *state) >>>> { >>>> xen_pfn_t *mfnp = data; >>>> struct mmap_batch_state *st = state; >>>> - >>>> - return put_user(*mfnp, st->user++); >>>> + int err = *(st->err++); >>>> + >>>> + /* >>>> + * V1 encodes the error codes in the 32bit top nibble of the >>>> + * mfn (with its known limitations vis-a-vis 64 bit callers). >>>> + */ >>>> + *mfnp |= (err == -ENOENT) ? >>>> + PRIVCMD_MMAPBATCH_PAGED_ERROR : >>>> + PRIVCMD_MMAPBATCH_MFN_ERROR; >>>> + return __put_user(*mfnp, st->user_mfn++); >>>> } >>>> >>>> static struct vm_operations_struct privcmd_vm_ops; >>>> >>>> -static long privcmd_ioctl_mmap_batch(void __user *udata) >>>> +static long privcmd_ioctl_mmap_batch(void __user *udata, int version) >>>> { >>>> int ret; >>>> - struct privcmd_mmapbatch m; >>>> + struct privcmd_mmapbatch_v2 m; >>>> struct mm_struct *mm = current->mm; >>>> struct vm_area_struct *vma; >>>> unsigned long nr_pages; >>>> LIST_HEAD(pagelist); >>>> + int *err_array = NULL; >>>> struct mmap_batch_state state; >>>> >>>> if (!xen_initial_domain()) >>>> return -EPERM; >>>> >>>> - if (copy_from_user(&m, udata, sizeof(m))) >>>> - return -EFAULT; >>>> + switch (version) { >>>> + case 1: >>>> + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch))) >>>> + return -EFAULT; >>>> + /* Returns per-frame error in m.arr. 
*/ >>>> + m.err = NULL; >>>> + if (!access_ok(VERIFY_WRITE, m.arr, m.num * sizeof(*m.arr))) >>>> + return -EFAULT; >>>> + break; >>>> + case 2: >>>> + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch_v2))) >>>> + return -EFAULT; >>>> + /* Returns per-frame error code in m.err. */ >>>> + if (!access_ok(VERIFY_WRITE, m.err, m.num * (sizeof(*m.err)))) >>>> + return -EFAULT; >>>> + break; >>>> + default: >>>> + return -EINVAL; >>>> + } >>>> >>>> nr_pages = m.num; >>>> if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) >>>> return -EINVAL; >>>> >>>> - ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), >>>> - m.arr); >>>> + ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), m.arr); >>>> + >>>> + if (ret) >>>> + goto out; >>>> + if (list_empty(&pagelist)) { >>>> + ret = -EINVAL; >>>> + goto out; >>>> + } >>>> >>>> - if (ret || list_empty(&pagelist)) >>>> + err_array = kcalloc(m.num, sizeof(int), GFP_KERNEL); >>>> + if (err_array == NULL) { >>>> + ret = -ENOMEM; >>>> goto out; >>>> + } >>>> >>>> down_write(&mm->mmap_sem); >>>> >>>> @@ -315,24 +371,34 @@ static long privcmd_ioctl_mmap_batch(void __user *udata) >>>> goto out; >>>> } >>>> >>>> - state.domain = m.dom; >>>> - state.vma = vma; >>>> - state.va = m.addr; >>>> - state.err = 0; >>>> + state.domain = m.dom; >>>> + state.vma = vma; >>>> + state.va = m.addr; >>>> + state.global_error = 0; >>>> + state.err = err_array; >>>> >>>> - ret = traverse_pages(m.num, sizeof(xen_pfn_t), >>>> - &pagelist, mmap_batch_fn, &state); >>>> + /* mmap_batch_fn guarantees ret == 0 */ >>>> + BUG_ON(traverse_pages(m.num, sizeof(xen_pfn_t), >>>> + &pagelist, mmap_batch_fn, &state)); >>>> >>>> up_write(&mm->mmap_sem); >>>> >>>> - if (state.err > 0) { >>>> - state.user = m.arr; >>>> + if (state.global_error && (version == 1)) { >>>> + /* Write back errors in second pass. 
*/ >>>> + state.user_mfn = (xen_pfn_t *)m.arr; >>>> + state.err = err_array; >>>> ret = traverse_pages(m.num, sizeof(xen_pfn_t), >>>> - &pagelist, >>>> - mmap_return_errors, &state); >>>> - } >>>> + &pagelist, mmap_return_errors_v1, &state); >>>> + } else >>>> + ret = __copy_to_user(m.err, err_array, m.num * sizeof(int)); >>>> + >>>> + /* If we have not had any EFAULT-like global errors then set the global >>>> + * error to -ENOENT if necessary. */ >>>> + if ((ret == 0) && (state.global_error == -ENOENT)) >>>> + ret = -ENOENT; >>>> >>>> out: >>>> + kfree(err_array); >>>> free_page_list(&pagelist); >>>> >>>> return ret; >>>> @@ -354,7 +420,11 @@ static long privcmd_ioctl(struct file *file, >>>> break; >>>> >>>> case IOCTL_PRIVCMD_MMAPBATCH: >>>> - ret = privcmd_ioctl_mmap_batch(udata); >>>> + ret = privcmd_ioctl_mmap_batch(udata, 1); >>>> + break; >>>> + >>>> + case IOCTL_PRIVCMD_MMAPBATCH_V2: >>>> + ret = privcmd_ioctl_mmap_batch(udata, 2); >>>> break; >>>> >>>> default: >>>> diff --git a/include/xen/privcmd.h b/include/xen/privcmd.h >>>> index 45c1aa1..a853168 100644 >>>> --- a/include/xen/privcmd.h >>>> +++ b/include/xen/privcmd.h >>>> @@ -58,13 +58,33 @@ struct privcmd_mmapbatch { >>>> int num; /* number of pages to populate */ >>>> domid_t dom; /* target domain */ >>>> __u64 addr; /* virtual address */ >>>> - xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */ >>>> + xen_pfn_t __user *arr; /* array of mfns - or''d with >>>> + PRIVCMD_MMAPBATCH_*_ERROR on err */ >>>> +}; >>>> + >>>> +#define PRIVCMD_MMAPBATCH_MFN_ERROR 0xf0000000U >>>> +#define PRIVCMD_MMAPBATCH_PAGED_ERROR 0x80000000U >>>> + >>>> +struct privcmd_mmapbatch_v2 { >>>> + unsigned int num; /* number of pages to populate */ >>>> + domid_t dom; /* target domain */ >>>> + __u64 addr; /* virtual address */ >>>> + const xen_pfn_t __user *arr; /* array of mfns */ >>>> + int __user *err; /* array of error codes */ >>>> }; >>>> >>>> /* >>>> * @cmd: IOCTL_PRIVCMD_HYPERCALL >>>> * @arg: 
&privcmd_hypercall_t >>>> * Return: Value returned from execution of the specified hypercall. >>>> + * >>>> + * @cmd: IOCTL_PRIVCMD_MMAPBATCH_V2 >>>> + * @arg: &struct privcmd_mmapbatch_v2 >>>> + * Return: 0 on success (i.e., arg->err contains valid error codes for >>>> + * each frame). On an error other than a failed frame remap, -1 is >>>> + * returned and errno is set to EINVAL, EFAULT etc. As an exception, >>>> + * if the operation was otherwise successful but any frame failed with >>>> + * -ENOENT, then -1 is returned and errno is set to ENOENT. >>>> */ >>>> #define IOCTL_PRIVCMD_HYPERCALL \ >>>> _IOC(_IOC_NONE, ''P'', 0, sizeof(struct privcmd_hypercall)) >>>> @@ -72,5 +92,7 @@ struct privcmd_mmapbatch { >>>> _IOC(_IOC_NONE, ''P'', 2, sizeof(struct privcmd_mmap)) >>>> #define IOCTL_PRIVCMD_MMAPBATCH \ >>>> _IOC(_IOC_NONE, ''P'', 3, sizeof(struct privcmd_mmapbatch)) >>>> +#define IOCTL_PRIVCMD_MMAPBATCH_V2 \ >>>> + _IOC(_IOC_NONE, ''P'', 4, sizeof(struct privcmd_mmapbatch_v2)) >>>> >>>> #endif /* __LINUX_PUBLIC_PRIVCMD_H__ */ >>>> >>>> On Aug 30, 2012, at 8:58 AM, David Vrabel wrote: >>>> >>>>> From: David Vrabel <david.vrabel@citrix.com> >>>>> >>>>> PRIVCMD_MMAPBATCH_V2 extends PRIVCMD_MMAPBATCH with an additional >>>>> field for reporting the error code for every frame that could not be >>>>> mapped. libxc prefers PRIVCMD_MMAPBATCH_V2 over PRIVCMD_MMAPBATCH. 
>>>>> >>>>> Signed-off-by: David Vrabel <david.vrabel@citrix.com> >>>>> --- >>>>> drivers/xen/privcmd.c | 99 +++++++++++++++++++++++++++++++++++++++--------- >>>>> include/xen/privcmd.h | 23 +++++++++++- >>>>> 2 files changed, 102 insertions(+), 20 deletions(-) >>>>> >>>>> diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c >>>>> index ccee0f1..c0e89e7 100644 >>>>> --- a/drivers/xen/privcmd.c >>>>> +++ b/drivers/xen/privcmd.c >>>>> @@ -76,7 +76,7 @@ static void free_page_list(struct list_head *pages) >>>>> */ >>>>> static int gather_array(struct list_head *pagelist, >>>>> unsigned nelem, size_t size, >>>>> - void __user *data) >>>>> + const void __user *data) >>>>> { >>>>> unsigned pageidx; >>>>> void *pagedata; >>>>> @@ -248,18 +248,37 @@ struct mmap_batch_state { >>>>> struct vm_area_struct *vma; >>>>> int err; >>>>> >>>>> - xen_pfn_t __user *user; >>>>> + xen_pfn_t __user *user_mfn; >>>>> + int __user *user_err; >>>>> }; >>>>> >>>>> static int mmap_batch_fn(void *data, void *state) >>>>> { >>>>> xen_pfn_t *mfnp = data; >>>>> struct mmap_batch_state *st = state; >>>>> + int ret; >>>>> >>>>> - if (xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, >>>>> - st->vma->vm_page_prot, st->domain) < 0) { >>>>> - *mfnp |= 0xf0000000U; >>>>> - st->err++; >>>>> + ret = xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, >>>>> + st->vma->vm_page_prot, st->domain); >>>>> + if (ret < 0) { >>>>> + /* >>>>> + * Error reporting is a mess but userspace relies on >>>>> + * it behaving this way. >>>>> + * >>>>> + * V2 needs to a) return the result of each frame''s >>>>> + * remap; and b) return -ENOENT if any frame failed >>>>> + * with -ENOENT. >>>>> + * >>>>> + * In this first pass the error code is saved by >>>>> + * overwriting the mfn and an error is indicated in >>>>> + * st->err. >>>>> + * >>>>> + * The second pass by mmap_return_errors() will write >>>>> + * the error codes to user space and get the right >>>>> + * ioctl return value. 
>>>>> + */ >>>>> + *(int *)mfnp = ret; >>>>> + st->err = ret; >>>>> } >>>>> st->va += PAGE_SIZE; >>>>> >>>>> @@ -270,16 +289,33 @@ static int mmap_return_errors(void *data, void *state) >>>>> { >>>>> xen_pfn_t *mfnp = data; >>>>> struct mmap_batch_state *st = state; >>>>> + int ret; >>>>> + >>>>> + if (st->user_err) { >>>>> + int err = *(int *)mfnp; >>>>> + >>>>> + if (err == -ENOENT) >>>>> + st->err = err; >>>>> >>>>> - return put_user(*mfnp, st->user++); >>>>> + return __put_user(err, st->user_err++); >>>>> + } else { >>>>> + xen_pfn_t mfn; >>>>> + >>>>> + ret = __get_user(mfn, st->user_mfn); >>>>> + if (ret < 0) >>>>> + return ret; >>>>> + >>>>> + mfn |= PRIVCMD_MMAPBATCH_MFN_ERROR; >>>>> + return __put_user(mfn, st->user_mfn++); >>>>> + } >>>>> } >>>>> >>>>> static struct vm_operations_struct privcmd_vm_ops; >>>>> >>>>> -static long privcmd_ioctl_mmap_batch(void __user *udata) >>>>> +static long privcmd_ioctl_mmap_batch(void __user *udata, int version) >>>>> { >>>>> int ret; >>>>> - struct privcmd_mmapbatch m; >>>>> + struct privcmd_mmapbatch_v2 m; >>>>> struct mm_struct *mm = current->mm; >>>>> struct vm_area_struct *vma; >>>>> unsigned long nr_pages; >>>>> @@ -289,15 +325,31 @@ static long privcmd_ioctl_mmap_batch(void __user *udata) >>>>> if (!xen_initial_domain()) >>>>> return -EPERM; >>>>> >>>>> - if (copy_from_user(&m, udata, sizeof(m))) >>>>> - return -EFAULT; >>>>> + switch (version) { >>>>> + case 1: >>>>> + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch))) >>>>> + return -EFAULT; >>>>> + /* Returns per-frame error in m.arr. */ >>>>> + m.err = NULL; >>>>> + if (!access_ok(VERIFY_WRITE, m.arr, m.num * sizeof(*m.arr))) >>>>> + return -EFAULT; >>>>> + break; >>>>> + case 2: >>>>> + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch_v2))) >>>>> + return -EFAULT; >>>>> + /* Returns per-frame error code in m.err. 
*/ >>>>> + if (!access_ok(VERIFY_WRITE, m.err, m.num * (sizeof(*m.err)))) >>>>> + return -EFAULT; >>>>> + break; >>>>> + default: >>>>> + return -EINVAL; >>>>> + } >>>>> >>>>> nr_pages = m.num; >>>>> if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) >>>>> return -EINVAL; >>>>> >>>>> - ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), >>>>> - m.arr); >>>>> + ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), m.arr); >>>>> >>>>> if (ret || list_empty(&pagelist)) >>>>> goto out; >>>>> @@ -325,12 +377,17 @@ static long privcmd_ioctl_mmap_batch(void __user *udata) >>>>> >>>>> up_write(&mm->mmap_sem); >>>>> >>>>> - if (state.err > 0) { >>>>> - state.user = m.arr; >>>>> + if (state.err) { >>>>> + state.err = 0; >>>>> + state.user_mfn = (xen_pfn_t *)m.arr; >>>>> + state.user_err = m.err; >>>>> ret = traverse_pages(m.num, sizeof(xen_pfn_t), >>>>> - &pagelist, >>>>> - mmap_return_errors, &state); >>>>> - } >>>>> + &pagelist, >>>>> + mmap_return_errors, &state); >>>>> + if (ret >= 0) >>>>> + ret = state.err; >>>>> + } else if (m.err) >>>>> + __clear_user(m.err, m.num * sizeof(*m.err)); >>>>> >>>>> out: >>>>> free_page_list(&pagelist); >>>>> @@ -354,7 +411,11 @@ static long privcmd_ioctl(struct file *file, >>>>> break; >>>>> >>>>> case IOCTL_PRIVCMD_MMAPBATCH: >>>>> - ret = privcmd_ioctl_mmap_batch(udata); >>>>> + ret = privcmd_ioctl_mmap_batch(udata, 1); >>>>> + break; >>>>> + >>>>> + case IOCTL_PRIVCMD_MMAPBATCH_V2: >>>>> + ret = privcmd_ioctl_mmap_batch(udata, 2); >>>>> break; >>>>> >>>>> default: >>>>> diff --git a/include/xen/privcmd.h b/include/xen/privcmd.h >>>>> index 17857fb..f60d75c 100644 >>>>> --- a/include/xen/privcmd.h >>>>> +++ b/include/xen/privcmd.h >>>>> @@ -59,13 +59,32 @@ struct privcmd_mmapbatch { >>>>> int num; /* number of pages to populate */ >>>>> domid_t dom; /* target domain */ >>>>> __u64 addr; /* virtual address */ >>>>> - xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */ >>>>> + xen_pfn_t __user *arr; /* array 
of mfns - or''d with >>>>> + PRIVCMD_MMAPBATCH_MFN_ERROR on err */ >>>>> +}; >>>>> + >>>>> +#define PRIVCMD_MMAPBATCH_MFN_ERROR 0xf0000000U >>>>> + >>>>> +struct privcmd_mmapbatch_v2 { >>>>> + unsigned int num; /* number of pages to populate */ >>>>> + domid_t dom; /* target domain */ >>>>> + __u64 addr; /* virtual address */ >>>>> + const xen_pfn_t __user *arr; /* array of mfns */ >>>>> + int __user *err; /* array of error codes */ >>>>> }; >>>>> >>>>> /* >>>>> * @cmd: IOCTL_PRIVCMD_HYPERCALL >>>>> * @arg: &privcmd_hypercall_t >>>>> * Return: Value returned from execution of the specified hypercall. >>>>> + * >>>>> + * @cmd: IOCTL_PRIVCMD_MMAPBATCH_V2 >>>>> + * @arg: &struct privcmd_mmapbatch_v2 >>>>> + * Return: 0 on success (i.e., arg->err contains valid error codes for >>>>> + * each frame). On an error other than a failed frame remap, -1 is >>>>> + * returned and errno is set to EINVAL, EFAULT etc. As an exception, >>>>> + * if the operation was otherwise successful but any frame failed with >>>>> + * -ENOENT, then -1 is returned and errno is set to ENOENT. >>>>> */ >>>>> #define IOCTL_PRIVCMD_HYPERCALL \ >>>>> _IOC(_IOC_NONE, ''P'', 0, sizeof(struct privcmd_hypercall)) >>>>> @@ -73,5 +92,7 @@ struct privcmd_mmapbatch { >>>>> _IOC(_IOC_NONE, ''P'', 2, sizeof(struct privcmd_mmap)) >>>>> #define IOCTL_PRIVCMD_MMAPBATCH \ >>>>> _IOC(_IOC_NONE, ''P'', 3, sizeof(struct privcmd_mmapbatch)) >>>>> +#define IOCTL_PRIVCMD_MMAPBATCH_V2 \ >>>>> + _IOC(_IOC_NONE, ''P'', 4, sizeof(struct privcmd_mmapbatch_v2)) >>>>> >>>>> #endif /* __LINUX_PUBLIC_PRIVCMD_H__ */ >>>>> -- >>>>> 1.7.2.5 >>>>>
Konrad Rzeszutek Wilk
2012-Sep-06 16:20 UTC
Re: [PATCH 2/2] xen/privcmd: add PRIVCMD_MMAPBATCH_V2 ioctl
On Thu, Sep 06, 2012 at 09:41:44AM -0400, Andres Lagar-Cavilla wrote:
> On Sep 5, 2012, at 1:40 PM, Konrad Rzeszutek Wilk wrote:
>
> > On Wed, Sep 05, 2012 at 01:09:32PM -0400, Andres Lagar-Cavilla wrote:
> >> Super. To which branch are you applying these now? (n00b question but have to ask)
> >
> > They will show up on stable/for-linus-3.7 once the overnight tests pass.
>
> I would be very surprised if this passed your nighties. This is because the following hunk is necessary:

It did :-) I guess the Xen 4.1 isn't using this that much.

>
> diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
> index 5386f20..e4dfa3b 100644
> --- a/drivers/xen/privcmd.c
> +++ b/drivers/xen/privcmd.c
> @@ -389,7 +389,7 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
>  state.err = err_array;
>  ret = traverse_pages(m.num, sizeof(xen_pfn_t),
>  &pagelist, mmap_return_errors_v1, &state);
> - } else
> + } else if (version == 2)
>  ret = __copy_to_user(m.err, err_array, m.num * sizeof(int));
>
>  /* If we have not had any EFAULT-like global errors then set the global
>
> I can either resubmit the original patch with this squashed in, or send this stand-alone. Nightlies may have passed if your libxc never exercises v1 in favor of v2. But this is broken for v1 as it will unconditionally attempt a copy to user on a NULL target and this sets rc to EFAULT.
>

Send it stand alone pls.