Isaku Yamahata
2007-Aug-13 03:59 UTC
[Xen-devel] [PATCH 1/4] xencomm take 2: xen side varisous fixes and preparation for consolidation
# HG changeset patch
# User yamahata@valinux.co.jp
# Date 1186974651 -32400
# Node ID bb8eb757a79c0e209f973fecda9e1f13208f3c56
# Parent c362bcee8047d3d30b8c7655d26d8a8702371b6f
Various xencomm fixes for xencomm consolidation.
This patch fixes following issues.
- Xen/powerpc runs in real mode so that it uses maddr interchangably with
vaddr.
But it isn''t the case in xen/ia64. It is necessary to convert maddr
to vaddr
to access the page. maddr_to_virt() doesn''t convert on powerpc, so it
works
on both archtechture.
- Xencomm should check whether struct xencomm_desc itself (8 bytes)
doesn''t
cross page boundary. Otherwise a hostile guest kernel can pass such
a pointer that may across page boundary. Then xencomm accesses a unrelated
page.
- Xencomm should check whether struct xencomm_desc::address[] array crosses
page boundary. Otherwise xencomm may access unrelated pages.
- Xencomm should get_page()/put_page() after address conversion from paddr
to maddr because xen supports SMP and balloon driver.
Otherwise another vcpu may free the page at the same time.
Such a domain behaviour doesn''t make sense, however nothing prevents
it.
- Current implementation doesn''t allow struct xencomm_desc::address
array
to be more than single page. On IA64 it causes 64GB+ domain creation
failure. This patch generalizes xencomm to allow multipage
xencomm_desc::address array.
PATCHNAME: various_fix_xencomm
Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
diff -r c362bcee8047 -r bb8eb757a79c xen/common/xencomm.c
--- a/xen/common/xencomm.c Sun Aug 12 16:09:13 2007 +0100
+++ b/xen/common/xencomm.c Mon Aug 13 12:10:51 2007 +0900
@@ -17,6 +17,7 @@
*
* Authors: Hollis Blanchard <hollisb@us.ibm.com>
* Tristan Gingold <tristan.gingold@bull.net>
+ * Isaku Yamahata <yamahata@valinux.co.jp>
*/
#include <xen/config.h>
@@ -34,6 +35,94 @@ static int xencomm_debug = 1; /* extreme
#define xencomm_debug 0
#endif
+/* get_page() to prevent from another vcpu freeing the page */
+static int
+xencomm_paddr_to_vaddr(unsigned long paddr, unsigned long *vaddr,
+ struct page_info **page)
+{
+ unsigned long maddr = paddr_to_maddr(paddr);
+ if (maddr == 0)
+ return -EFAULT;
+
+ *vaddr = (unsigned long)maddr_to_virt(maddr);
+ if (*vaddr == 0)
+ return -EFAULT;
+
+ *page = virt_to_page(*vaddr);
+ if (get_page(*page, current->domain) == 0) {
+ if (page_get_owner(*page) != current->domain) {
+ /*
+ * This page might be a page granted by another domain, or
+ * this page is freed with decrease reservation hypercall at
+ * the same time.
+ */
+ gdprintk(XENLOG_WARNING,
+ "bad page is passed. paddr 0x%lx maddr 0x%lx\n",
+ paddr, maddr);
+ return -EFAULT;
+ }
+
+ /* Try again. */
+ return -EAGAIN;
+ }
+
+ return 0;
+}
+
+/* check if struct desc doesn''t cross page boundry */
+static int
+xencomm_desc_cross_page_boundary(unsigned long paddr)
+{
+ unsigned long offset = paddr & ~PAGE_MASK;
+ if (offset > PAGE_SIZE - sizeof(struct xencomm_desc))
+ return 1;
+ return 0;
+}
+
+static int
+xencomm_get_address(const void *handle, struct xencomm_desc *desc, int i,
+ unsigned long **address, struct page_info **page)
+{
+ if (i == 0)
+ *address = &desc->address[0];
+ else
+ (*address)++;
+
+ /* When crossing page boundary, machine address must be calculated. */
+ if (((unsigned long)*address & ~PAGE_MASK) == 0) {
+ unsigned long paddr = (unsigned long)
+ &(((const struct xencomm_desc*)handle)->address[i]);
+ put_page(*page);
+ return xencomm_paddr_to_vaddr(paddr, *address, page);
+ }
+ return 0;
+}
+
+static int
+xencomm_copy_chunk_from(unsigned long to, unsigned long paddr,
+ unsigned int len)
+{
+ unsigned long vaddr;
+ struct page_info *page;
+
+ while (1) {
+ int res;
+ res = xencomm_paddr_to_vaddr(paddr, &vaddr, &page);
+ if (res != 0) {
+ if (res == -EAGAIN)
+ continue; /* Try again. */
+ return res;
+ }
+ if (xencomm_debug)
+ printk("%lx[%d] -> %lx\n", vaddr, len, to);
+
+ memcpy((void *)to, (void *)vaddr, len);
+ put_page(page);
+ return 0;
+ }
+ /* NOTREACHED */
+}
+
static unsigned long
xencomm_inline_from_guest(void *to, const void *from, unsigned int n,
unsigned int skip)
@@ -44,17 +133,17 @@ xencomm_inline_from_guest(void *to, cons
while (n > 0) {
unsigned int chunksz;
- unsigned long src_maddr;
unsigned int bytes;
+ int res;
chunksz = PAGE_SIZE - (src_paddr % PAGE_SIZE);
bytes = min(chunksz, n);
- src_maddr = paddr_to_maddr(src_paddr);
- if (xencomm_debug)
- printk("%lx[%d] -> %lx\n", src_maddr, bytes, (unsigned
long)to);
- memcpy(to, (void *)src_maddr, bytes);
+ res = xencomm_copy_chunk_from((unsigned long)to, src_paddr, bytes);
+ if (res != 0)
+ return n;
+
src_paddr += bytes;
to += bytes;
n -= bytes;
@@ -81,6 +170,8 @@ xencomm_copy_from_guest(void *to, const
unsigned int skip)
{
struct xencomm_desc *desc;
+ struct page_info *page;
+ unsigned long *address;
unsigned int from_pos = 0;
unsigned int to_pos = 0;
unsigned int i = 0;
@@ -88,24 +179,30 @@ xencomm_copy_from_guest(void *to, const
if (xencomm_is_inline(from))
return xencomm_inline_from_guest(to, from, n, skip);
+ if (xencomm_desc_cross_page_boundary((unsigned long)from))
+ return n;
/* first we need to access the descriptor */
- desc = (struct xencomm_desc *)paddr_to_maddr((unsigned long)from);
- if (desc == NULL)
+ if (xencomm_paddr_to_vaddr((unsigned long)from,
+ (unsigned long*)&desc, &page))
return n;
if (desc->magic != XENCOMM_MAGIC) {
printk("%s: error: %p magic was 0x%x\n",
__func__, desc, desc->magic);
+ put_page(page);
return n;
}
/* iterate through the descriptor, copying up to a page at a time */
while ((to_pos < n) && (i < desc->nr_addrs)) {
- unsigned long src_paddr = desc->address[i];
+ unsigned long src_paddr;
unsigned int pgoffset;
unsigned int chunksz;
unsigned int chunk_skip;
+ if (xencomm_get_address(from, desc, i, &address, &page))
+ return n - to_pos;
+ src_paddr = *address;
if (src_paddr == XENCOMM_INVALID) {
i++;
continue;
@@ -120,17 +217,16 @@ xencomm_copy_from_guest(void *to, const
skip -= chunk_skip;
if (skip == 0 && chunksz > 0) {
- unsigned long src_maddr;
- unsigned long dest = (unsigned long)to + to_pos;
unsigned int bytes = min(chunksz, n - to_pos);
-
- src_maddr = paddr_to_maddr(src_paddr + chunk_skip);
- if (src_maddr == 0)
+ int res;
+
+ res = xencomm_copy_chunk_from((unsigned long)to + to_pos,
+ src_paddr + chunk_skip, bytes);
+ if (res != 0) {
+ put_page(page);
return n - to_pos;
-
- if (xencomm_debug)
- printk("%lx[%d] -> %lx\n", src_maddr, bytes,
dest);
- memcpy((void *)dest, (void *)src_maddr, bytes);
+ }
+
from_pos += bytes;
to_pos += bytes;
}
@@ -138,7 +234,33 @@ xencomm_copy_from_guest(void *to, const
i++;
}
+ put_page(page);
return n - to_pos;
+}
+
+static int
+xencomm_copy_chunk_to(unsigned long paddr, unsigned long from,
+ unsigned int len)
+{
+ unsigned long vaddr;
+ struct page_info *page;
+
+ while (1) {
+ int res;
+ res = xencomm_paddr_to_vaddr(paddr, &vaddr, &page);
+ if (res != 0) {
+ if (res == -EAGAIN)
+ continue; /* Try again. */
+ return res;
+ }
+ if (xencomm_debug)
+ printk("%lx[%d] -> %lx\n", from, len, vaddr);
+
+ memcpy((void *)vaddr, (void *)from, len);
+ put_page(page);
+ return 0;
+ }
+ /* NOTREACHED */
}
static unsigned long
@@ -151,17 +273,16 @@ xencomm_inline_to_guest(void *to, const
while (n > 0) {
unsigned int chunksz;
- unsigned long dest_maddr;
unsigned int bytes;
+ int res;
chunksz = PAGE_SIZE - (dest_paddr % PAGE_SIZE);
bytes = min(chunksz, n);
-
- dest_maddr = paddr_to_maddr(dest_paddr);
- if (xencomm_debug)
- printk("%lx[%d] -> %lx\n", (unsigned long)from, bytes,
dest_maddr);
- memcpy((void *)dest_maddr, (void *)from, bytes);
+ res = xencomm_copy_chunk_to(dest_paddr, (unsigned long)from, bytes);
+ if (res != 0)
+ return n;
+
dest_paddr += bytes;
from += bytes;
n -= bytes;
@@ -188,6 +309,8 @@ xencomm_copy_to_guest(void *to, const vo
unsigned int skip)
{
struct xencomm_desc *desc;
+ struct page_info *page;
+ unsigned long *address;
unsigned int from_pos = 0;
unsigned int to_pos = 0;
unsigned int i = 0;
@@ -195,23 +318,29 @@ xencomm_copy_to_guest(void *to, const vo
if (xencomm_is_inline(to))
return xencomm_inline_to_guest(to, from, n, skip);
+ if (xencomm_desc_cross_page_boundary((unsigned long)to))
+ return n;
/* first we need to access the descriptor */
- desc = (struct xencomm_desc *)paddr_to_maddr((unsigned long)to);
- if (desc == NULL)
+ if (xencomm_paddr_to_vaddr((unsigned long)to,
+ (unsigned long*)&desc, &page))
return n;
if (desc->magic != XENCOMM_MAGIC) {
printk("%s error: %p magic was 0x%x\n", __func__, desc,
desc->magic);
+ put_page(page);
return n;
}
/* iterate through the descriptor, copying up to a page at a time */
while ((from_pos < n) && (i < desc->nr_addrs)) {
- unsigned long dest_paddr = desc->address[i];
+ unsigned long dest_paddr;
unsigned int pgoffset;
unsigned int chunksz;
unsigned int chunk_skip;
+ if (xencomm_get_address(to, desc, i, &address, &page))
+ return n - from_pos;
+ dest_paddr = *address;
if (dest_paddr == XENCOMM_INVALID) {
i++;
continue;
@@ -226,17 +355,16 @@ xencomm_copy_to_guest(void *to, const vo
skip -= chunk_skip;
if (skip == 0 && chunksz > 0) {
- unsigned long dest_maddr;
- unsigned long source = (unsigned long)from + from_pos;
unsigned int bytes = min(chunksz, n - from_pos);
-
- dest_maddr = paddr_to_maddr(dest_paddr + chunk_skip);
- if (dest_maddr == 0)
- return -1;
-
- if (xencomm_debug)
- printk("%lx[%d] -> %lx\n", source, bytes,
dest_maddr);
- memcpy((void *)dest_maddr, (void *)source, bytes);
+ int res;
+
+ res = xencomm_copy_chunk_to(dest_paddr + chunk_skip,
+ (unsigned long)from + from_pos, bytes);
+ if (res != 0) {
+ put_page(page);
+ return n - from_pos;
+ }
+
from_pos += bytes;
to_pos += bytes;
}
@@ -244,6 +372,7 @@ xencomm_copy_to_guest(void *to, const vo
i++;
}
+ put_page(page);
return n - from_pos;
}
@@ -258,27 +387,40 @@ int xencomm_add_offset(void **handle, un
int xencomm_add_offset(void **handle, unsigned int bytes)
{
struct xencomm_desc *desc;
+ struct page_info *page;
+ unsigned long *address;
int i = 0;
if (xencomm_is_inline(*handle))
return xencomm_inline_add_offset(handle, bytes);
+ if (xencomm_desc_cross_page_boundary(*(unsigned long*)handle))
+ return -EINVAL;
/* first we need to access the descriptor */
- desc = (struct xencomm_desc *)paddr_to_maddr((unsigned long)*handle);
- if (desc == NULL)
- return -1;
+ if (xencomm_paddr_to_vaddr(*(unsigned long*)handle,
+ (unsigned long*)&desc, &page))
+ return -EFAULT;
if (desc->magic != XENCOMM_MAGIC) {
printk("%s error: %p magic was 0x%x\n", __func__, desc,
desc->magic);
- return -1;
+ put_page(page);
+ return -EINVAL;
}
/* iterate through the descriptor incrementing addresses */
while ((bytes > 0) && (i < desc->nr_addrs)) {
- unsigned long dest_paddr = desc->address[i];
+ unsigned long dest_paddr;
unsigned int pgoffset;
unsigned int chunksz;
unsigned int chunk_skip;
+
+ if (xencomm_get_address(*handle, desc, i, &address, &page))
+ return -EFAULT;
+ dest_paddr = *address;
+ if (dest_paddr == XENCOMM_INVALID) {
+ i++;
+ continue;
+ }
pgoffset = dest_paddr % PAGE_SIZE;
chunksz = PAGE_SIZE - pgoffset;
@@ -286,31 +428,45 @@ int xencomm_add_offset(void **handle, un
chunk_skip = min(chunksz, bytes);
if (chunk_skip == chunksz) {
/* exhausted this page */
- desc->address[i] = XENCOMM_INVALID;
+ *address = XENCOMM_INVALID;
} else {
- desc->address[i] += chunk_skip;
+ *address += chunk_skip;
}
bytes -= chunk_skip;
- }
+
+ i++;
+ }
+ put_page(page);
return 0;
}
int xencomm_handle_is_null(void *handle)
{
struct xencomm_desc *desc;
+ struct page_info *page;
+ unsigned long *address;
int i;
if (xencomm_is_inline(handle))
return xencomm_inline_addr(handle) == 0;
- desc = (struct xencomm_desc *)paddr_to_maddr((unsigned long)handle);
- if (desc == NULL)
+ if (xencomm_desc_cross_page_boundary((unsigned long)handle))
+ return 1; /* EINVAL */
+ if (xencomm_paddr_to_vaddr((unsigned long)handle,
+ (unsigned long*)&desc, &page))
return 1;
- for (i = 0; i < desc->nr_addrs; i++)
- if (desc->address[i] != XENCOMM_INVALID)
+ for (i = 0; i < desc->nr_addrs; i++) {
+ if (xencomm_get_address(handle, desc, i, &address, &page))
+ return 1; /* EFAULT */
+
+ if (*address != XENCOMM_INVALID) {
+ put_page(page);
return 0;
-
+ }
+ }
+
+ put_page(page);
return 1;
}
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel
Hollis Blanchard
2007-Aug-13 20:08 UTC
[Xen-devel] Re: [XenPPC] [PATCH 1/4] xencomm take 2: xen side varisous fixes and preparation for consolidation
On Mon, 2007-08-13 at 12:59 +0900, Isaku Yamahata wrote:> - Xencomm should get_page()/put_page() after address conversion from paddr > to maddr because xen supports SMP and balloon driver. > Otherwise another vcpu may free the page at the same time. > Such a domain behaviour doesn''t make sense, however nothing prevents it.Unfortunately my test system is currently down, so I can''t test this today. However, one initial comment: I really dislike the way get/put_page() are scattered through this code. Maybe every pair is balanced today, but it will be difficult to maintain, and especially to test all the error paths. I think this needs a more symmetrical API. Right now get_page() and put_page() are being done at multiple levels, and in xencomm_get_address() we''re calling put_page() only to call get_page() a moment later in xencomm_paddr_to_vaddr(). I don''t have a concrete proposal for simplifying this. Also, it looks like we''re calling put_page() on the ''desc'' page itself before we''re done with it, so that''s a bug.> +static int > +xencomm_paddr_to_vaddr(unsigned long paddr, unsigned long *vaddr, > + struct page_info **page)Since we can use page_to_vaddr(), I don''t think you need to pass ''vaddr'' here. That should simplify the code a little bit. By the way, this looks bogus:> > +static int > +xencomm_get_address(const void *handle, struct xencomm_desc *desc, > int i, > + unsigned long **address, struct page_info **page) > +{ > + if (i == 0) > + *address = &desc->address[0]; > + else > + (*address)++;Shouldn''t that be *address = &desc->address[i] ? I definitely agree that some of these fixes are needed (especially checking for the descriptor page overlap, and using get/put_page()). However, this code is difficult to follow already, and these patches are confusing *me* (and I wrote it! :) so I''m very nervous about increasing the complexity. Since the only issue you''ve identified is populate_physmap, and that has an easy workaround, I would prefer the easier solution. -- Hollis Blanchard IBM Linux Technology Center _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Isaku Yamahata
2007-Aug-14 01:39 UTC
Re: [Xen-devel] Re: [XenPPC] [PATCH 1/4] xencomm take 2: xen side varisous fixes and preparation for consolidation
Thank you for review. I will try to simplfy/clean up it. Probably I will split the patch into the consolidation part (maddr and vaddr), bug fix part (page boundary check and get_page()/put_page()), and new feature part(multipage support). BTW although I know you need to test it before ack, how do you like other patches (2/4 and 3/4)? I''d like to finish linux side xencomm consolidation at first so that I can focus on xen side xencomm. On Mon, Aug 13, 2007 at 03:08:58PM -0500, Hollis Blanchard wrote:> > +static int > > +xencomm_paddr_to_vaddr(unsigned long paddr, unsigned long *vaddr, > > + struct page_info **page) > > Since we can use page_to_vaddr(), I don''t think you need to pass ''vaddr'' > here. That should simplify the code a little bit.I see.> By the way, this looks bogus: > > > > +static int > > +xencomm_get_address(const void *handle, struct xencomm_desc *desc, > > int i, > > + unsigned long **address, struct page_info **page) > > +{ > > + if (i == 0) > > + *address = &desc->address[0]; > > + else > > + (*address)++; > > Shouldn''t that be *address = &desc->address[i] ?This is very confusing point. The array is paddr contiguous, but not maddr contiguous. So we can''t calculate it by simple offsetting.> I definitely agree that some of these fixes are needed (especially > checking for the descriptor page overlap, and using get/put_page()). > However, this code is difficult to follow already, and these patches are > confusing *me* (and I wrote it! :) so I''m very nervous about increasing > the complexity. > > Since the only issue you''ve identified is populate_physmap, and that has > an easy workaround, I would prefer the easier solution.Understood. I have to admit that the patch is complex. I hope splitting the patch will help. -- yamahata _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Hollis Blanchard
2007-Aug-15 13:50 UTC
Re: [Xen-devel] Re: [XenPPC] [PATCH 1/4] xencomm take 2: xen side varisous fixes and preparation for consolidation
On Tue, 2007-08-14 at 10:39 +0900, Isaku Yamahata wrote:> > BTW although I know you need to test it before ack, > how do you like other patches (2/4 and 3/4)? > I''d like to finish linux side xencomm consolidation at first so that > I can focus on xen side xencomm.BTW, I''ve been looking at the Xen patches first because those can go in before the Linux patches. The opposite of course isn''t true... -- Hollis Blanchard IBM Linux Technology Center _______________________________________________ Xen-ppc-devel mailing list Xen-ppc-devel@lists.xensource.com http://lists.xensource.com/xen-ppc-devel