Jan Beulich
2010-Jan-11 14:49 UTC
[Xen-devel] [PATCH, RFC 1/4] linux: add new (replacement) mmap-batch ioctl
While the error indicator of IOCTL_PRIVCMD_MMAPBATCH should be in the top nibble (it is documented that way in include/xen/public/privcmd.h and include/xen/compat_ioctl.h), it really wasn''t for 64-bit implementations. With MFNs now possibly being 32 or more bits wide on x86-64, using bits 28-31 as failure indicator (and bit 31 as paged-out indicator) is not longer acceptable. Instead, a new ioctl with a separate error indication array is being introduced. As usual, written against 2.6.32.3 and made apply to the 2.6.18 tree without further testing. Signed-off-by: Jan Beulich <jbeulich@novell.com> --- head-2010-01-04.orig/drivers/xen/privcmd/compat_privcmd.c 2010-01-04 13:50:00.000000000 +0100 +++ head-2010-01-04/drivers/xen/privcmd/compat_privcmd.c 2010-01-04 15:13:52.000000000 +0100 @@ -96,6 +96,56 @@ int privcmd_ioctl_32(int fd, unsigned in #endif } break; + case IOCTL_PRIVCMD_MMAP_BATCH_32: { + struct privcmd_mmap_batch *p; + struct privcmd_mmap_batch_32 *p32; + struct privcmd_mmap_batch_32 n32; +#ifdef xen_pfn32_t + xen_pfn_t *__user arr; + xen_pfn32_t *__user arr32; + unsigned int i; +#endif + + p32 = compat_ptr(arg); + p = compat_alloc_user_space(sizeof(*p)); + if (copy_from_user(&n32, p32, sizeof(n32)) || + put_user(n32.num, &p->num) || + put_user(n32.dom, &p->dom) || + put_user(n32.addr, &p->addr) || + put_user(compat_ptr(n32.err), &p->err)) + return -EFAULT; +#ifdef xen_pfn32_t + arr = compat_alloc_user_space(n32.num * sizeof(*arr) + + sizeof(*p)); + arr32 = compat_ptr(n32.arr); + for (i = 0; i < n32.num; ++i) { + xen_pfn32_t mfn; + + if (get_user(mfn, arr32 + i) || put_user(mfn, arr + i)) + return -EFAULT; + } + + if (put_user(arr, &p->arr)) + return -EFAULT; +#else + if (put_user(compat_ptr(n32.arr), &p->arr)) + return -EFAULT; +#endif + + ret = sys_ioctl(fd, IOCTL_PRIVCMD_MMAP_BATCH, (unsigned long)p); + +#ifdef xen_pfn32_t + for (i = 0; !ret && i < n32.num; ++i) { + xen_pfn_t mfn; + + if (get_user(mfn, arr + i) || put_user(mfn, arr32 + i)) + ret = -EFAULT; + else if (mfn != (xen_pfn32_t)mfn) + ret = -ERANGE; + } +#endif + } + break; default: ret = -EINVAL; break; --- head-2010-01-04.orig/drivers/xen/privcmd/privcmd.c 2010-01-05 11:21:42.000000000 +0100 +++ head-2010-01-04/drivers/xen/privcmd/privcmd.c 2010-01-05 11:22:28.000000000 +0100 @@ -299,6 +299,108 @@ static long privcmd_ioctl(struct file *f mmapbatch_out: list_for_each_safe(l,l2,&pagelist) free_page((unsigned long)l); + } + break; + + case IOCTL_PRIVCMD_MMAP_BATCH: { + privcmd_mmap_batch_t m; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + const xen_pfn_t __user *p; + xen_pfn_t *mfn; + unsigned long addr, nr_pages; + unsigned int i, nr; + LIST_HEAD(pagelist); + struct list_head *l, *l2; + int *err, paged_out; + + if (!is_initial_xendomain()) + return -EPERM; + + if (copy_from_user(&m, udata, sizeof(m))) + return -EFAULT; + + nr_pages = m.num; + addr = m.addr; + if (m.num <= 0 || nr_pages > (ULONG_MAX >> PAGE_SHIFT) || + addr != m.addr || nr_pages > (-addr >> PAGE_SHIFT)) + return -EINVAL; + + p = m.arr; + for (i = 0; i < nr_pages; i += nr, p += nr) { + nr = min(nr_pages - i, MMAPBATCH_NR_PER_PAGE); + + ret = -ENOMEM; + l = (struct list_head *)__get_free_page(GFP_KERNEL); + if (l == NULL) + goto mmap_batch_out; + + INIT_LIST_HEAD(l); + list_add_tail(l, &pagelist); + + mfn = (void *)(l + 1); + ret = -EFAULT; + if (copy_from_user(mfn, p, nr * sizeof(*mfn))) + goto mmap_batch_out; + } + + down_write(&mm->mmap_sem); + + vma = find_vma(mm, addr); + ret = -EINVAL; + if (!vma || + addr < vma->vm_start || + addr + (nr_pages << PAGE_SHIFT) > vma->vm_end || + !enforce_singleshot_mapping(vma, addr, nr_pages)) { + up_write(&mm->mmap_sem); + goto mmap_batch_out; + } + + i = 0; + ret = 0; + paged_out = 0; + list_for_each(l, &pagelist) { + int rc; + + nr = i + min(nr_pages - i, MMAPBATCH_NR_PER_PAGE); + mfn = (void *)(l + 1); + err = (void *)(l + 1); + BUILD_BUG_ON(sizeof(*err) > sizeof(*mfn)); + + while (i < nr) { + rc = direct_remap_pfn_range(vma, addr & PAGE_MASK, + *mfn, PAGE_SIZE, + vma->vm_page_prot, m.dom); + if (rc < 0) { + if (rc == -ENOENT) + paged_out = 1; + ret++; + } else + BUG_ON(rc > 0); + *err++ = rc; + mfn++; i++; addr += PAGE_SIZE; + } + } + + up_write(&mm->mmap_sem); + + if (ret > 0) { + int __user *p = m.err; + + ret = paged_out ? -ENOENT : 0; + i = 0; + list_for_each(l, &pagelist) { + nr = min(nr_pages - i, MMAPBATCH_NR_PER_PAGE); + err = (void *)(l + 1); + if (copy_to_user(p, err, nr * sizeof(*err))) + ret = -EFAULT; + i += nr; p += nr; + } + } + + mmap_batch_out: + list_for_each_safe(l, l2, &pagelist) + free_page((unsigned long)l); #undef MMAPBATCH_NR_PER_PAGE } break; --- head-2010-01-04.orig/fs/compat_ioctl.c 2009-12-17 16:11:48.000000000 +0100 +++ head-2010-01-04/fs/compat_ioctl.c 2010-01-05 11:00:04.000000000 +0100 @@ -2937,6 +2937,7 @@ IGNORE_IOCTL(FBIOGCURSOR32) #ifdef CONFIG_XEN HANDLE_IOCTL(IOCTL_PRIVCMD_MMAP_32, privcmd_ioctl_32) HANDLE_IOCTL(IOCTL_PRIVCMD_MMAPBATCH_32, privcmd_ioctl_32) +HANDLE_IOCTL(IOCTL_PRIVCMD_MMAP_BATCH_32, privcmd_ioctl_32) COMPATIBLE_IOCTL(IOCTL_PRIVCMD_HYPERCALL) COMPATIBLE_IOCTL(IOCTL_EVTCHN_BIND_VIRQ) COMPATIBLE_IOCTL(IOCTL_EVTCHN_BIND_INTERDOMAIN) --- head-2010-01-04.orig/include/xen/compat_ioctl.h 2009-12-17 15:40:40.000000000 +0100 +++ head-2010-01-04/include/xen/compat_ioctl.h 2010-01-04 14:01:56.000000000 +0100 @@ -49,9 +49,27 @@ struct privcmd_mmapbatch_32 { #endif compat_uptr_t arr; /* array of mfns - top nibble set on err */ }; + +struct privcmd_mmap_batch_32 { + unsigned int num; /* number of pages to populate */ + domid_t dom; /* target domain */ +#if defined(CONFIG_X86) || defined(CONFIG_IA64) + union { /* virtual address */ + __u64 addr __packed; + __u32 va; + }; +#else + __u64 addr; /* virtual address */ +#endif + compat_uptr_t arr; /* array of mfns */ + compat_uptr_t err; /* array of error codes */ +}; + #define IOCTL_PRIVCMD_MMAP_32 \ _IOC(_IOC_NONE, ''P'', 2, sizeof(struct privcmd_mmap_32)) -#define IOCTL_PRIVCMD_MMAPBATCH_32 \ +#define IOCTL_PRIVCMD_MMAPBATCH_32 \ _IOC(_IOC_NONE, ''P'', 3, sizeof(struct privcmd_mmapbatch_32)) +#define IOCTL_PRIVCMD_MMAP_BATCH_32 \ + _IOC(_IOC_NONE, ''P'', 4, sizeof(struct privcmd_mmap_batch_32)) #endif /* __LINUX_XEN_COMPAT_H__ */ --- head-2010-01-04.orig/include/xen/public/privcmd.h 2009-12-18 10:38:09.000000000 +0100 +++ head-2010-01-04/include/xen/public/privcmd.h 2010-01-04 14:02:06.000000000 +0100 @@ -60,6 +60,14 @@ typedef struct privcmd_mmapbatch { xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */ } privcmd_mmapbatch_t; +typedef struct privcmd_mmap_batch { + unsigned int num; /* number of pages to populate */ + domid_t dom; /* target domain */ + __u64 addr; /* virtual address */ + const xen_pfn_t __user *arr; /* array of mfns */ + int __user *err; /* array of error codes */ +} privcmd_mmap_batch_t; + /* * @cmd: IOCTL_PRIVCMD_HYPERCALL * @arg: &privcmd_hypercall_t @@ -71,5 +79,7 @@ typedef struct privcmd_mmapbatch { _IOC(_IOC_NONE, ''P'', 2, sizeof(privcmd_mmap_t)) #define IOCTL_PRIVCMD_MMAPBATCH \ _IOC(_IOC_NONE, ''P'', 3, sizeof(privcmd_mmapbatch_t)) +#define IOCTL_PRIVCMD_MMAP_BATCH \ + _IOC(_IOC_NONE, ''P'', 4, sizeof(privcmd_mmap_batch_t)) #endif /* __LINUX_PUBLIC_PRIVCMD_H__ */ _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Ian Campbell
2010-Jan-11 15:12 UTC
Re: [Xen-devel] [PATCH, RFC 1/4] linux: add new (replacement) mmap-batch ioctl
On Mon, 2010-01-11 at 14:49 +0000, Jan Beulich wrote:> #define IOCTL_PRIVCMD_MMAPBATCH \ > _IOC(_IOC_NONE, ''P'', 3, sizeof(privcmd_mmapbatch_t)) > +#define IOCTL_PRIVCMD_MMAP_BATCH \ > + _IOC(_IOC_NONE, ''P'', 4, sizeof(privcmd_mmap_batch_t))Distinguishing the old vs new ioctl by only a _ in the middle seems a bit horrid. MMAPBATCH2 would at least be less prone to confusion... Why not take the opportunity to make the ioctl (and hypercall?) interface 32 vs 64 agnostic by unconditionally using 64 bit sizes? Ian. _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Jan Beulich
2010-Jan-11 15:42 UTC
Re: [Xen-devel] [PATCH, RFC 1/4] linux: add new (replacement) mmap-batch ioctl
>>> Ian Campbell <Ian.Campbell@citrix.com> 11.01.10 16:12 >>> >On Mon, 2010-01-11 at 14:49 +0000, Jan Beulich wrote: >> #define IOCTL_PRIVCMD_MMAPBATCH \ >> _IOC(_IOC_NONE, ''P'', 3, sizeof(privcmd_mmapbatch_t)) >> +#define IOCTL_PRIVCMD_MMAP_BATCH \ >> + _IOC(_IOC_NONE, ''P'', 4, sizeof(privcmd_mmap_batch_t)) > >Distinguishing the old vs new ioctl by only a _ in the middle seems a >bit horrid. MMAPBATCH2 would at least be less prone to confusion...Otoh it clearly tells that the two are very similar...>Why not take the opportunity to make the ioctl (and hypercall?) >interface 32 vs 64 agnostic by unconditionally using 64 bit sizes?That''s not common practice for structures containing pointers afaict, and wouldn''t be nice for the pointer type of ''arr'' (would double the array size on 32-bits for no reason, and would require extra checking anyway). Jan _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Keir Fraser
2010-Jan-11 19:24 UTC
Re: [Xen-devel] [PATCH, RFC 1/4] linux: add new (replacement) mmap-batch ioctl
On 11/01/2010 15:42, "Jan Beulich" <JBeulich@novell.com> wrote:>>> #define IOCTL_PRIVCMD_MMAPBATCH \ >>> _IOC(_IOC_NONE, ''P'', 3, sizeof(privcmd_mmapbatch_t)) >>> +#define IOCTL_PRIVCMD_MMAP_BATCH \ >>> + _IOC(_IOC_NONE, ''P'', 4, sizeof(privcmd_mmap_batch_t)) >> >> Distinguishing the old vs new ioctl by only a _ in the middle seems a >> bit horrid. MMAPBATCH2 would at least be less prone to confusion... > > Otoh it clearly tells that the two are very similar...As does MMAPBATCH2. I actually strongly suggest IOCTL_PRIVCMD_MMAPBATCH_V2, which makes the situation even more crystal clear. Distinguishing based on a single underscore, to make the new name ''nice'', is misplaced aestheticism. In particular, the tools end up referencing both names (for compatibility) and a clearer name makes the code more intelligible. -- Keir _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Jan Beulich
2010-Jan-12 07:45 UTC
Re: [Xen-devel] [PATCH, RFC 1/4] linux: add new (replacement) mmap-batch ioctl
>>> Keir Fraser <keir.fraser@eu.citrix.com> 11.01.10 20:24 >>> >On 11/01/2010 15:42, "Jan Beulich" <JBeulich@novell.com> wrote: > >>>> #define IOCTL_PRIVCMD_MMAPBATCH \ >>>> _IOC(_IOC_NONE, ''P'', 3, sizeof(privcmd_mmapbatch_t)) >>>> +#define IOCTL_PRIVCMD_MMAP_BATCH \ >>>> + _IOC(_IOC_NONE, ''P'', 4, sizeof(privcmd_mmap_batch_t)) >>> >>> Distinguishing the old vs new ioctl by only a _ in the middle seems a >>> bit horrid. MMAPBATCH2 would at least be less prone to confusion... >> >> Otoh it clearly tells that the two are very similar... > >As does MMAPBATCH2. I actually strongly suggest IOCTL_PRIVCMD_MMAPBATCH_V2, >which makes the situation even more crystal clear. Distinguishing based on a >single underscore, to make the new name ''nice'', is misplaced aestheticism. >In particular, the tools end up referencing both names (for compatibility) >and a clearer name makes the code more intelligible.Okay, I''ll change the name then. Before re-spinning the set, are there any other items that need fixing? Comments on the questions raised? And are there chances this will still make 4.0? If not, can we at least add the interface definitions as place holders (similar to e.g. tmem before 3.4), allowing consuming code to be prepared for the addition? Jan _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Keir Fraser
2010-Jan-12 07:57 UTC
Re: [Xen-devel] [PATCH, RFC 1/4] linux: add new (replacement) mmap-batch ioctl
On 12/01/2010 07:45, "Jan Beulich" <JBeulich@novell.com> wrote:>> As does MMAPBATCH2. I actually strongly suggest IOCTL_PRIVCMD_MMAPBATCH_V2, >> which makes the situation even more crystal clear. Distinguishing based on a >> single underscore, to make the new name ''nice'', is misplaced aestheticism. >> In particular, the tools end up referencing both names (for compatibility) >> and a clearer name makes the code more intelligible. > > Okay, I''ll change the name then. Before re-spinning the set, are there > any other items that need fixing? Comments on the questions raised?Looked okay to me. The only question I saw was regarding xc_get_pfn_type_xxx: I don''t think they are used outside libxc, so you can do whatever makes internal sense for libxc (and remove the decls from xenctrl.h).> And are there chances this will still make 4.0? If not, can we at least > add the interface definitions as place holders (similar to e.g. tmem > before 3.4), allowing consuming code to be prepared for the addition?It could go in for 4.0.0-rc2 I suppose. -- Keir _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Keir Fraser
2010-Jan-12 07:59 UTC
Re: [Xen-devel] [PATCH, RFC 1/4] linux: add new (replacement) mmap-batch ioctl
On 12/01/2010 07:57, "Keir Fraser" <keir.fraser@eu.citrix.com> wrote:>> Okay, I''ll change the name then. Before re-spinning the set, are there >> any other items that need fixing? Comments on the questions raised? > > Looked okay to me. The only question I saw was regarding > xc_get_pfn_type_xxx: I don''t think they are used outside libxc, so you can > do whatever makes internal sense for libxc (and remove the decls from > xenctrl.h).Oh, and libxc''s version number should definitely be bumped. -- Keir _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Jan Beulich
2010-Jan-12 08:13 UTC
Re: [Xen-devel] [PATCH, RFC 1/4] linux: add new (replacement) mmap-batch ioctl
>>> Keir Fraser <keir.fraser@eu.citrix.com> 12.01.10 08:57 >>> >Looked okay to me. The only question I saw was regarding >xc_get_pfn_type_xxx: I don''t think they are used outside libxc, so you can >do whatever makes internal sense for libxc (and remove the decls from >xenctrl.h).The other question (of wider scope) was that on the save format currently used: Using bits 28-31 of the PFNs to encode their types (effectively wasting bits 32-63 on 64-bit) isn''t forward compatible, and hence sooner or later will require a change. But of course there are other tools limitations (restricting guests to even smaller amounts of memory), but those are mostly internal to the tools (i.e. fixing them would not break compatibility). Jan _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Keir Fraser
2010-Jan-12 08:50 UTC
Re: [Xen-devel] [PATCH, RFC 1/4] linux: add new (replacement) mmap-batch ioctl
On 12/01/2010 08:13, "Jan Beulich" <JBeulich@novell.com> wrote:>>>> Keir Fraser <keir.fraser@eu.citrix.com> 12.01.10 08:57 >>> >> Looked okay to me. The only question I saw was regarding >> xc_get_pfn_type_xxx: I don''t think they are used outside libxc, so you can >> do whatever makes internal sense for libxc (and remove the decls from >> xenctrl.h). > > The other question (of wider scope) was that on the save format > currently used: Using bits 28-31 of the PFNs to encode their types > (effectively wasting bits 32-63 on 64-bit) isn''t forward compatible, > and hence sooner or later will require a change. But of course there > are other tools limitations (restricting guests to even smaller amounts > of memory), but those are mostly internal to the tools (i.e. fixing > them would not break compatibility).We''ll have to stick some kind of version-bump indicator in the save file, and change the way PFNs and their types are encoded. This is very doable, when the development window reopens. This is definitely a 4.1 feature -- it only prevents us saving 1TB+ guests. It''s quite a separate concern from our support for sparse-memory-map hosts. -- Keir _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel