George Dunlap
2012-Jun-08 11:54 UTC
[PATCH 0 of 4 RFC] Populate-on-demand: Check pages being returned by the balloon driver
Populate-on-demand: Check pages being returned by the balloon driver This patch series is the second result of my work last summer on decreasing fragmentation of superpages in a guests'' p2m when using populate-on-demand. This patch series is against 4.1; I''m posting it to get feedback on the viability of getting a ported version of this patch into 4.2. As with the previous series, the patces against 4.1 have been extensively in the XenServer testing framework and have been in use by XenServer customers for over 9 months now. But the p2m code has changed extensively in that time, so one could argue that the testing doesn''t give us the same degree of confidence in the patches against 4.2 as against 4.1. Looking at it now, there are a number of ugly bits -- "#if 0" and such; please overlook this for now, as it will all be cleaned up as part of the porting process. If this looks good, I''ll do the work of porting and testing it before posting it again.
George Dunlap
2012-Jun-08 11:54 UTC
[PATCH 1 of 4 RFC] xen, p2m: get_entry returns level of entry as well
Return the p2m level of the entry which filled this request. Intended to be used to see if pages returned by the balloon driver are part of a superpage, and reclaim them if so. Signed-off-by: George Dunlap <george.dunlap@eu.citrix.com> diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c --- a/xen/arch/x86/hvm/hvm.c +++ b/xen/arch/x86/hvm/hvm.c @@ -3644,6 +3644,7 @@ long do_hvm_op(unsigned long op, XEN_GUE p2m_type_t t; p2m_access_t ac; mfn_t mfn; + int l; /* Interface access to internal p2m accesses */ hvmmem_access_t memaccess[] = { @@ -3681,7 +3682,7 @@ long do_hvm_op(unsigned long op, XEN_GUE goto param_fail6; rc = -ESRCH; - mfn = p2m->get_entry(p2m, a.pfn, &t, &ac, p2m_query); + mfn = p2m->get_entry(p2m, a.pfn, &t, &ac, &l, p2m_query); if ( mfn_x(mfn) == INVALID_MFN ) goto param_fail6; diff --git a/xen/arch/x86/mm/hap/p2m-ept.c b/xen/arch/x86/mm/hap/p2m-ept.c --- a/xen/arch/x86/mm/hap/p2m-ept.c +++ b/xen/arch/x86/mm/hap/p2m-ept.c @@ -527,14 +527,14 @@ out: /* Read ept p2m entries */ static mfn_t ept_get_entry(struct p2m_domain *p2m, unsigned long gfn, p2m_type_t *t, p2m_access_t* a, - p2m_query_t q) + int *level, p2m_query_t q) { struct domain *d = p2m->domain; ept_entry_t *table = map_domain_page(ept_get_asr(d)); unsigned long gfn_remainder = gfn; ept_entry_t *ept_entry; u32 index; - int i; + int i=5; int ret = 0; mfn_t mfn = _mfn(INVALID_MFN); @@ -616,6 +616,7 @@ static mfn_t ept_get_entry(struct p2m_do } out: + *level=i+1; unmap_domain_page(table); return mfn; } @@ -709,9 +710,10 @@ out: static mfn_t ept_get_entry_current(struct p2m_domain *p2m, unsigned long gfn, p2m_type_t *t, p2m_access_t *a, + int *l, p2m_query_t q) { - return ept_get_entry(p2m, gfn, t, a, q); + return ept_get_entry(p2m, gfn, t, a, l, q); } /* diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c --- a/xen/arch/x86/mm/p2m.c +++ b/xen/arch/x86/mm/p2m.c @@ -1579,7 +1579,7 @@ out: static mfn_t p2m_gfn_to_mfn(struct p2m_domain *p2m, unsigned long gfn, p2m_type_t *t, p2m_access_t *a, - p2m_query_t q) + int *level, p2m_query_t q) { mfn_t mfn; paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT; @@ -1594,7 +1594,8 @@ p2m_gfn_to_mfn(struct p2m_domain *p2m, u * XXX we will return p2m_invalid for unmapped gfns */ *t = p2m_mmio_dm; /* Not implemented except with EPT */ - *a = p2m_access_rwx; + *a = p2m_access_rwx; + *level = 5; mfn = pagetable_get_mfn(p2m_get_pagetable(p2m)); @@ -1602,6 +1603,7 @@ p2m_gfn_to_mfn(struct p2m_domain *p2m, u /* This pfn is higher than the highest the p2m map currently holds */ return _mfn(INVALID_MFN); + *level = 4; #if CONFIG_PAGING_LEVELS >= 4 { l4_pgentry_t *l4e = map_domain_page(mfn_x(mfn)); @@ -1615,6 +1617,7 @@ p2m_gfn_to_mfn(struct p2m_domain *p2m, u unmap_domain_page(l4e); } #endif + *level=3; { l3_pgentry_t *l3e = map_domain_page(mfn_x(mfn)); #if CONFIG_PAGING_LEVELS == 3 @@ -1662,6 +1665,7 @@ pod_retry_l3: l2e = map_domain_page(mfn_x(mfn)); l2e += l2_table_offset(addr); + *level=2; pod_retry_l2: if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 ) @@ -1695,6 +1699,7 @@ pod_retry_l2: l1e = map_domain_page(mfn_x(mfn)); l1e += l1_table_offset(addr); + *level=1; pod_retry_l1: if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 ) { @@ -1723,7 +1728,7 @@ pod_retry_l1: /* Read the current domain''s p2m table (through the linear mapping). */ static mfn_t p2m_gfn_to_mfn_current(struct p2m_domain *p2m, unsigned long gfn, p2m_type_t *t, p2m_access_t *a, - p2m_query_t q) + int *level, p2m_query_t q) { mfn_t mfn = _mfn(INVALID_MFN); p2m_type_t p2mt = p2m_mmio_dm; @@ -1735,6 +1740,7 @@ static mfn_t p2m_gfn_to_mfn_current(stru /* Not currently implemented except for EPT */ *a = p2m_access_rwx; + *level=5; if ( gfn <= p2m->max_mapped_pfn ) { @@ -1749,6 +1755,7 @@ static mfn_t p2m_gfn_to_mfn_current(stru / sizeof(l1_pgentry_t)); #if CONFIG_PAGING_LEVELS >= 4 + *level=3; /* * Read & process L3 */ @@ -1802,6 +1809,7 @@ static mfn_t p2m_gfn_to_mfn_current(stru p2m_entry = &__linear_l1_table[l1_linear_offset(RO_MPT_VIRT_START) + l2_linear_offset(addr)]; + *level=2; pod_retry_l2: ret = __copy_from_user(&l2e, p2m_entry, @@ -1855,6 +1863,7 @@ static mfn_t p2m_gfn_to_mfn_current(stru /* Need to __copy_from_user because the p2m is sparse and this * part might not exist */ + *level=1; pod_retry_l1: p2m_entry = &phys_to_machine_mapping[gfn]; @@ -2069,6 +2078,7 @@ void p2m_teardown(struct p2m_domain *p2m unsigned long gfn; p2m_type_t t; p2m_access_t a; + int l; mfn_t mfn; #endif @@ -2077,7 +2087,7 @@ void p2m_teardown(struct p2m_domain *p2m #ifdef __x86_64__ for ( gfn=0; gfn < p2m->max_mapped_pfn; gfn++ ) { - mfn = p2m->get_entry(p2m, gfn, &t, &a, p2m_query); + mfn = p2m->get_entry(p2m, gfn, &t, &a, &l, p2m_query); if ( mfn_valid(mfn) && (t == p2m_ram_shared) ) BUG_ON(mem_sharing_unshare_page(p2m, gfn, MEM_SHARING_DESTROY_GFN)); } @@ -2375,6 +2385,7 @@ p2m_remove_page(struct p2m_domain *p2m, mfn_t mfn_return; p2m_type_t t; p2m_access_t a; + int l; if ( !paging_mode_translate(p2m->domain) ) { @@ -2390,7 +2401,7 @@ p2m_remove_page(struct p2m_domain *p2m, { for ( i = 0; i < (1UL << page_order); i++ ) { - mfn_return = p2m->get_entry(p2m, gfn + i, &t, &a, p2m_query); + mfn_return = p2m->get_entry(p2m, gfn + i, &t, &a, &l, p2m_query); if ( !p2m_is_grant(t) ) set_gpfn_from_mfn(mfn+i, INVALID_M2P_ENTRY); ASSERT( !p2m_is_valid(t) || mfn + i == mfn_x(mfn_return) ); @@ -3079,10 +3090,11 @@ void p2m_mem_access_check(unsigned long mfn_t mfn; p2m_type_t p2mt; p2m_access_t p2ma; + int p2ml; /* First, handle rx2rw conversion automatically */ p2m_lock(p2m); - mfn = p2m->get_entry(p2m, gfn, &p2mt, &p2ma, p2m_query); + mfn = p2m->get_entry(p2m, gfn, &p2mt, &p2ma, &p2ml, p2m_query); if ( access_w && p2ma == p2m_access_rx2rw ) { diff --git a/xen/include/asm-x86/p2m.h b/xen/include/asm-x86/p2m.h --- a/xen/include/asm-x86/p2m.h +++ b/xen/include/asm-x86/p2m.h @@ -216,11 +216,13 @@ struct p2m_domain { unsigned long gfn, p2m_type_t *p2mt, p2m_access_t *p2ma, + int *level, p2m_query_t q); mfn_t (*get_entry_current)(struct p2m_domain *p2m, unsigned long gfn, p2m_type_t *p2mt, p2m_access_t *p2ma, + int *level, p2m_query_t q); void (*change_entry_type_global)(struct p2m_domain *p2m, p2m_type_t ot, @@ -334,7 +336,8 @@ static inline mfn_t gfn_to_mfn_type_curr p2m_access_t *a, p2m_query_t q) { - return p2m->get_entry_current(p2m, gfn, t, a, q); + int l; + return p2m->get_entry_current(p2m, gfn, t, a, &l, q); } /* Read P2M table, mapping pages as we go. @@ -344,7 +347,8 @@ gfn_to_mfn_type_p2m(struct p2m_domain *p p2m_type_t *t, p2m_query_t q) { p2m_access_t a = 0; - return p2m->get_entry(p2m, gfn, t, &a, q); + int l; + return p2m->get_entry(p2m, gfn, t, &a, &l, q); }
George Dunlap
2012-Jun-08 11:54 UTC
[PATCH 2 of 4 RFC] xen, pod: Check zero-check pages returned by the balloon driver
When a gfn is passed back by the balloon driver to Xen, check to see if it''s in a superpage; and if so, check to see if that page is zeroed, and if so reclaim it. This patch significantly reduces superpage fragmentation when running on a high number of vcpus and significant memory ballooning. Signed-off-by: George Dunlap <george.dunlap@eu.citrix.com> diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c --- a/xen/arch/x86/mm/p2m.c +++ b/xen/arch/x86/mm/p2m.c @@ -74,6 +74,7 @@ boolean_param("hap_2mb", opt_hap_2mb); #define SUPERPAGE_PAGES (1UL << 9) #define superpage_aligned(_x) (((_x)&(SUPERPAGE_PAGES-1))==0) +#define order_aligned(_x,_o) (((_x)&((1<<(_o))-1))==0) static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn) { @@ -767,6 +768,9 @@ p2m_pod_offline_or_broken_replace(struct return; } +static int p2m_pod_zero_check_superpage(struct p2m_domain *p2m, unsigned long gfn); + + /* This function is needed for two reasons: * + To properly handle clearing of PoD entries * + To "steal back" memory being freed for the PoD cache, rather than @@ -774,6 +778,9 @@ p2m_pod_offline_or_broken_replace(struct * * Once both of these functions have been completed, we can return and * allow decrease_reservation() to handle everything else. + * + * Because the balloon driver races with the page scrubber, we also + * scan for zeroed superpages we can reclaim. */ int p2m_pod_decrease_reservation(struct domain *d, @@ -781,11 +788,16 @@ p2m_pod_decrease_reservation(struct doma unsigned int order) { int ret=0; - int i; + int i, entry_size=0; struct p2m_domain *p2m = p2m_get_hostp2m(d); int steal_for_cache = 0; - int pod = 0, nonpod = 0, ram = 0; + int nonpod = 0; +#if 0 + unsigned long gfn_aligned; + + int nonpod = 0, ram = 0; +#endif /* If we don''t have any outstanding PoD entries, let things take their @@ -794,7 +806,8 @@ p2m_pod_decrease_reservation(struct doma goto out; /* Figure out if we need to steal some freed memory for our cache */ - steal_for_cache = ( p2m->pod.entry_count > p2m->pod.count ); + if ( p2m->pod.entry_count > p2m->pod.count ) + steal_for_cache = ( p2m->pod.entry_count - p2m->pod.count ); p2m_lock(p2m); audit_p2m(p2m, 1); @@ -802,7 +815,7 @@ p2m_pod_decrease_reservation(struct doma if ( unlikely(d->is_dying) ) goto out_unlock; - /* See what''s in here. */ +#if 0 /* FIXME: Add contiguous; query for PSE entries? */ for ( i=0; i<(1<<order); i++) { @@ -876,12 +889,105 @@ p2m_pod_decrease_reservation(struct doma } } +#else +/* + * while(more pages to look at) + * - Find the next page unit + * - if req_decrease >= page_size + * or page is zero + * - Reclaim whole page + * - else + * - Reclaim req_decrease + * - break out of the loop + */ + /* FIXME: Add contiguous; query for PSE entries? */ + for ( i=0; i<(1<<order); i+=entry_size) + { + mfn_t mfn; + p2m_type_t t; + p2m_access_t a; + int l, entry_order; + + /* See what''s in here. */ + mfn = p2m->get_entry(p2m, gpfn + i, &t, &a, &l, p2m_query); + + /* NB: entry_size is used by the loop update; so if you change the + * level at which you''re working (i.e., decide to work w/ 4k + * pages instead of a superpage), you must change entry_size + * so that the loop logic works right. */ + entry_order = (l-1)*9; + entry_size = 1<<(entry_order); + + /* If this is in a superpage and we need ram, try to nab it */ + if ( steal_for_cache + && entry_order == 9 + && p2m_is_ram(t) + && p2m_pod_zero_check_superpage(p2m, gpfn & ~(SUPERPAGE_PAGES-1)) ) + { + /* Now we have a PoD entry to deal with; update + * steal_for_cache, and set entry_size to 0 so that we + * re-process the entry. */ + entry_size=0; + + /* Update steal_for_cache */ + if ( p2m->pod.entry_count > p2m->pod.count ) + steal_for_cache = ( p2m->pod.entry_count - p2m->pod.count ); + else + steal_for_cache = 0; + + continue; + } + + /* Switch to singleton pages if we don''t want to reclaim the + * whole page, or if the start is not SP aligned. Future + * iterations to this superpage frame will then be */ + if ( entry_order > 0 + && ( i + entry_size > (1<<order) + || !superpage_aligned(gpfn + i) ) ) + { + entry_order = 0; + entry_size = 1; + } + + if ( t == p2m_populate_on_demand ) + { + set_p2m_entry(p2m, gpfn+i, _mfn(INVALID_MFN), entry_order, p2m_invalid, p2m->default_access); + p2m->pod.entry_count-=(1<<entry_order); /* Lock: p2m */ + BUG_ON(p2m->pod.entry_count < 0); + } + else if( p2m_is_ram(t) ) + { + if ( steal_for_cache ) + { + struct page_info *page; + + ASSERT(mfn_valid(mfn)); + + page = mfn_to_page(mfn); + + set_p2m_entry(p2m, gpfn + i, _mfn(INVALID_MFN), entry_order, + p2m_invalid, p2m->default_access); + set_gpfn_from_mfn(mfn_x(mfn), INVALID_M2P_ENTRY); + + p2m_pod_cache_add(p2m, page, entry_order); + + /* Update steal_for_cache */ + if ( p2m->pod.entry_count > p2m->pod.count ) + steal_for_cache = ( p2m->pod.entry_count - p2m->pod.count ); + else + steal_for_cache = 0; + } + else + nonpod++; + } + } +#endif /* If there are no more non-PoD entries, tell decrease_reservation() that * there''s nothing left to do. */ if ( nonpod == 0 ) ret = 1; -out_entry_check: +//out_entry_check: /* If we''ve reduced our "liabilities" beyond our "assets", free some */ if ( p2m->pod.entry_count < p2m->pod.count ) { @@ -1040,6 +1146,8 @@ p2m_pod_zero_check_superpage(struct p2m_ p2m_pod_cache_add(p2m, mfn_to_page(mfn0), 9); p2m->pod.entry_count += SUPERPAGE_PAGES; + ret = SUPERPAGE_PAGES; + out_reset: if ( reset ) set_p2m_entry(p2m, gfn, mfn0, 9, type0, p2m->default_access);
George Dunlap
2012-Jun-08 11:54 UTC
[PATCH 3 of 4 RFC] imported patch CA-72761-libxc-error.patch
diff --git a/tools/libxc/xc_domain_restore.c b/tools/libxc/xc_domain_restore.c --- a/tools/libxc/xc_domain_restore.c +++ b/tools/libxc/xc_domain_restore.c @@ -87,8 +87,8 @@ static ssize_t rdexact(xc_interface *xch if ( (len == -1) && ((errno == EINTR) || (errno == EAGAIN)) ) continue; if ( len == 0 ) { - ERROR("0-length read"); - errno = 0; + errno = EPIPE; + return -1; } if ( len <= 0 ) { ERROR("read_exact_timed failed (read rc: %d, errno: %d, size %d, offset %d)", @@ -1253,7 +1253,7 @@ int xc_domain_restore(xc_interface *xch, if ( !ctx->completed ) { pagebuf.nr_physpages = pagebuf.nr_pages = 0; if ( pagebuf_get_one(xch, ctx, &pagebuf, io_fd, dom) < 0 ) { - DPERROR(tdom, "Error when reading batch"); + /* pagebuf_get_one already returned a proper error */ goto out; } }
George Dunlap
2012-Jun-08 11:54 UTC
[PATCH 4 of 4 RFC] imported patch pygrub-invalid-disk-catch.patch
diff --git a/tools/pygrub/src/pygrub b/tools/pygrub/src/pygrub --- a/tools/pygrub/src/pygrub +++ b/tools/pygrub/src/pygrub @@ -814,22 +814,27 @@ if __name__ == "__main__": # get list of offsets into file which start partitions part_offs = get_partition_offsets(file) + if len(part_offs) < 1: + raise RuntimeError, "Disk has no partitions" - fs = fsimage.open(file, part_offs[0], bootfsoptions) + try: + fs = fsimage.open(file, part_offs[0], bootfsoptions) - # We always boot the "default" kernel if it exists, rather than - # parsing the grub menu - initrd_path = None - if fs.file_exists("/xenkernel"): - incfg["kernel"] = "/xenkernel" - incfg["args"] = default_args - if fs.file_exists("/xeninitrd"): - incfg["ramdisk"] = "/xeninitrd" - elif fs.file_exists("/boot/xenkernel"): - incfg["kernel"] = "/boot/xenkernel" - incfg["args"] = default_args - if fs.file_exists("/boot/xeninitrd"): - incfg["ramdisk"] = "/boot/xeninitrd" + # We always boot the "default" kernel if it exists, rather than + # parsing the grub menu + initrd_path = None + if fs.file_exists("/xenkernel"): + incfg["kernel"] = "/xenkernel" + incfg["args"] = default_args + if fs.file_exists("/xeninitrd"): + incfg["ramdisk"] = "/xeninitrd" + elif fs.file_exists("/boot/xenkernel"): + incfg["kernel"] = "/boot/xenkernel" + incfg["args"] = default_args + if fs.file_exists("/boot/xeninitrd"): + incfg["ramdisk"] = "/boot/xeninitrd" + except: + raise RuntimeError, "Unable to find partition containing kernel" for offset in part_offs: try:
George Dunlap
2012-Jun-08 11:57 UTC
Re: [PATCH 3 of 4 RFC] imported patch CA-72761-libxc-error.patch
GAH... please ignore this particular patch, it''s an accident. On Fri, Jun 8, 2012 at 12:54 PM, George Dunlap <george.dunlap@eu.citrix.com> wrote:> diff --git a/tools/libxc/xc_domain_restore.c b/tools/libxc/xc_domain_restore.c > --- a/tools/libxc/xc_domain_restore.c > +++ b/tools/libxc/xc_domain_restore.c > @@ -87,8 +87,8 @@ static ssize_t rdexact(xc_interface *xch > if ( (len == -1) && ((errno == EINTR) || (errno == EAGAIN)) ) > continue; > if ( len == 0 ) { > - ERROR("0-length read"); > - errno = 0; > + errno = EPIPE; > + return -1; > } > if ( len <= 0 ) { > ERROR("read_exact_timed failed (read rc: %d, errno: %d, size %d, offset %d)", > @@ -1253,7 +1253,7 @@ int xc_domain_restore(xc_interface *xch, > if ( !ctx->completed ) { > pagebuf.nr_physpages = pagebuf.nr_pages = 0; > if ( pagebuf_get_one(xch, ctx, &pagebuf, io_fd, dom) < 0 ) { > - DPERROR(tdom, "Error when reading batch"); > + /* pagebuf_get_one already returned a proper error */ > goto out; > } > } > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xen.org > http://lists.xen.org/xen-devel
George Dunlap
2012-Jun-08 11:58 UTC
Re: [PATCH 4 of 4 RFC] imported patch pygrub-invalid-disk-catch.patch
GAH... please ignore this particular patch, it''s a mistake. On Fri, Jun 8, 2012 at 12:54 PM, George Dunlap <george.dunlap@eu.citrix.com> wrote:> diff --git a/tools/pygrub/src/pygrub b/tools/pygrub/src/pygrub > --- a/tools/pygrub/src/pygrub > +++ b/tools/pygrub/src/pygrub > @@ -814,22 +814,27 @@ if __name__ == "__main__": > > # get list of offsets into file which start partitions > part_offs = get_partition_offsets(file) > + if len(part_offs) < 1: > + raise RuntimeError, "Disk has no partitions" > > - fs = fsimage.open(file, part_offs[0], bootfsoptions) > + try: > + fs = fsimage.open(file, part_offs[0], bootfsoptions) > > - # We always boot the "default" kernel if it exists, rather than > - # parsing the grub menu > - initrd_path = None > - if fs.file_exists("/xenkernel"): > - incfg["kernel"] = "/xenkernel" > - incfg["args"] = default_args > - if fs.file_exists("/xeninitrd"): > - incfg["ramdisk"] = "/xeninitrd" > - elif fs.file_exists("/boot/xenkernel"): > - incfg["kernel"] = "/boot/xenkernel" > - incfg["args"] = default_args > - if fs.file_exists("/boot/xeninitrd"): > - incfg["ramdisk"] = "/boot/xeninitrd" > + # We always boot the "default" kernel if it exists, rather than > + # parsing the grub menu > + initrd_path = None > + if fs.file_exists("/xenkernel"): > + incfg["kernel"] = "/xenkernel" > + incfg["args"] = default_args > + if fs.file_exists("/xeninitrd"): > + incfg["ramdisk"] = "/xeninitrd" > + elif fs.file_exists("/boot/xenkernel"): > + incfg["kernel"] = "/boot/xenkernel" > + incfg["args"] = default_args > + if fs.file_exists("/boot/xeninitrd"): > + incfg["ramdisk"] = "/boot/xeninitrd" > + except: > + raise RuntimeError, "Unable to find partition containing kernel" > > for offset in part_offs: > try: > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xen.org > http://lists.xen.org/xen-devel
George Dunlap
2012-Jun-08 11:58 UTC
Re: [PATCH 0 of 4 RFC] Populate-on-demand: Check pages being returned by the balloon driver
Hmm, a mistake in my command line (and the lack of -n) sent two extraneous patches. Please look at just patches 1 and 2, and ignore 3 and 4. Thanks, -George On Fri, Jun 8, 2012 at 12:54 PM, George Dunlap <george.dunlap@eu.citrix.com> wrote:> Populate-on-demand: Check pages being returned by the balloon driver > > This patch series is the second result of my work last summer on > decreasing fragmentation of superpages in a guests'' p2m when using > populate-on-demand. > > This patch series is against 4.1; I''m posting it to get feedback on > the viability of getting a ported version of this patch into 4.2. > > As with the previous series, the patces against 4.1 have been > extensively in the XenServer testing framework and have been in use by > XenServer customers for over 9 months now. But the p2m code has > changed extensively in that time, so one could argue that the testing > doesn''t give us the same degree of confidence in the patches against > 4.2 as against 4.1. > > Looking at it now, there are a number of ugly bits -- "#if 0" and > such; please overlook this for now, as it will all be cleaned up as > part of the porting process. > > If this looks good, I''ll do the work of porting and testing it before > posting it again. > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xen.org > http://lists.xen.org/xen-devel
Tim Deegan
2012-Jun-14 10:14 UTC
Re: [PATCH 0 of 4 RFC] Populate-on-demand: Check pages being returned by the balloon driver
At 12:54 +0100 on 08 Jun (1339160090), George Dunlap wrote:> Populate-on-demand: Check pages being returned by the balloon driver > > This patch series is the second result of my work last summer on > decreasing fragmentation of superpages in a guests'' p2m when using > populate-on-demand. > > This patch series is against 4.1; I''m posting it to get feedback on > the viability of getting a ported version of this patch into 4.2. > > As with the previous series, the patces against 4.1 have been > extensively in the XenServer testing framework and have been in use by > XenServer customers for over 9 months now. But the p2m code has > changed extensively in that time, so one could argue that the testing > doesn''t give us the same degree of confidence in the patches against > 4.2 as against 4.1.I think there''s a reasonable chance of getting this in for 4.2 -- any errors should at least be isolated in the PoD parts. :)
Tim Deegan
2012-Jun-14 10:21 UTC
Re: [PATCH 1 of 4 RFC] xen, p2m: get_entry returns level of entry as well
At 12:54 +0100 on 08 Jun (1339160091), George Dunlap wrote:> Return the p2m level of the entry which filled this request. > Intended to be used to see if pages returned by the balloon > driver are part of a superpage, and reclaim them if so.This looks broadly correct, but it''s a bit invasive. If there''s any way to rework patch 2 not to need it that would be helpful. It looks like the main use is for detecting and removing superpage PoD entries; maybe that could be done with a sweep like you have in the non-PoD case? One niggle: returning level=5 on error is non-obvious, and it doesn''t look like your code in patch 2 uses that. Cheers, Tim.
Tim Deegan
2012-Jun-14 10:28 UTC
Re: [PATCH 2 of 4 RFC] xen, pod: Check zero-check pages returned by the balloon driver
At 12:54 +0100 on 08 Jun (1339160092), George Dunlap wrote:> + else if( p2m_is_ram(t) ) > + { > + if ( steal_for_cache ) > + { > + struct page_info *page; > + > + ASSERT(mfn_valid(mfn));I think this assertion is broken by the paging code, which adds paged-out pages to the RAM types. One fix is to undo that change and have p2m_is_ram() only be true for present pages, but I think for 4.2 it would be less disruptive to test here for the paged-out and paging types. Cc''ing Andres on the larger question -- do you think we can have p2m_is_ram() imply mfn_valid() again? I''m not sure whether it will make things cleaner (or at least move the burden of handling paged-out memory into the paging code) or add more boilerplate to paths where the paging will be handled after a check for p2m_os_ram(). What do you think? Cheers, Tim.
Andres Lagar-Cavilla
2012-Jun-14 12:57 UTC
Re: [PATCH 2 of 4 RFC] xen, pod: Check zero-check pages returned by the balloon driver
> At 12:54 +0100 on 08 Jun (1339160092), George Dunlap wrote: >> + else if( p2m_is_ram(t) ) >> + { >> + if ( steal_for_cache ) >> + { >> + struct page_info *page; >> + >> + ASSERT(mfn_valid(mfn)); > > I think this assertion is broken by the paging code, which adds > paged-out pages to the RAM types. One fix is to undo that change and > have p2m_is_ram() only be true for present pages, but I think for 4.2 it > would be less disruptive to test here for the paged-out and paging > types. > > Cc''ing Andres on the larger question -- do you think we can have > p2m_is_ram() imply mfn_valid() again? I''m not sure whether it will make > things cleaner (or at least move the burden of handling paged-out memory > into the paging code) or add more boilerplate to paths where the paging > will be handled after a check for p2m_os_ram(). What do you think? >Imho, pages in any of the states of the paging state machine, are ram. They are not mmio, grants or whatever. They just happen to be absent at the moment. This is not an immediately useful response because its corollary entails that PoD pages are also ram. There are a few possible ways around this. One problem is that p2m_ram_paging_in may or may not have a backing mfn, so we can''t just check the p2mt, and that is frankly annoying (plenty of wth conditionals in the p2m code relate to this). We could disambiguate with two states, e.g.: p2m_ram_paging_in_absent p2m_ram_paging_present p2m_is_paging_in(p2mt) returns true for both -> use in most cases where p2m_ram_paging_in is used right now p2m_is_ram_present(p2mt) returns true for p2m_ram_rw, log dirty, shared, paging_out and paging_in present -> used in cases like above p2m_is_ram(p2mt) includes ram_present and pod and paging_in_absent -> used in broader cases That looks like a fairly dangerous changeset. But the more comprehensive solution. The path of least resistance is obviously to just fix George''s current patches. btw, George, couple of #if 0''s and //''s on your "Check zero-check pages returned by the balloon driver" patch. Those are going out before tree inclusion, I assume? Thanks Andres ) Then most checks for p2m_ram_paging, s/> Cheers, > > Tim. >
Tim Deegan
2012-Jun-14 13:19 UTC
Re: [PATCH 2 of 4 RFC] xen, pod: Check zero-check pages returned by the balloon driver
At 05:57 -0700 on 14 Jun (1339653458), Andres Lagar-Cavilla wrote:> Imho, pages in any of the states of the paging state machine, are ram. > They are not mmio, grants or whatever. They just happen to be absent at > the moment. > > This is not an immediately useful response because its corollary entails > that PoD pages are also ram.I''m OK with that interpretation, as long as we decide it one way or the other and make sure that all users are correct. :)> There are a few possible ways around this. One problem is that > p2m_ram_paging_in may or may not have a backing mfn, so we can''t just > check the p2mt, and that is frankly annoying (plenty of wth conditionals > in the p2m code relate to this). We could disambiguate with two states, > e.g.: > p2m_ram_paging_in_absent > p2m_ram_paging_present > p2m_is_paging_in(p2mt) returns true for both -> use in most cases where > p2m_ram_paging_in is used right nowThat sounds like a good idea; for after 4.2 though.> p2m_is_ram_present(p2mt) returns true for p2m_ram_rw, log dirty, shared, > paging_out and paging_in present -> used in cases like above > p2m_is_ram(p2mt) includes ram_present and pod and paging_in_absent -> > used in broader cases > > That looks like a fairly dangerous changeset. But the more comprehensive > solution. The path of least resistance is obviously to just fix George''s > current patches.Agreed. I was really asking how you''d like to approach this after 4.2 has branched. Cheers, Tim.
Olaf Hering
2012-Jun-14 13:19 UTC
Re: [PATCH 2 of 4 RFC] xen, pod: Check zero-check pages returned by the balloon driver
On Thu, Jun 14, Tim Deegan wrote:> At 12:54 +0100 on 08 Jun (1339160092), George Dunlap wrote: > > + else if( p2m_is_ram(t) ) > > + { > > + if ( steal_for_cache ) > > + { > > + struct page_info *page; > > + > > + ASSERT(mfn_valid(mfn)); > > I think this assertion is broken by the paging code, which adds > paged-out pages to the RAM types. One fix is to undo that change and > have p2m_is_ram() only be true for present pages, but I think for 4.2 it > would be less disruptive to test here for the paged-out and paging > types.I assume the posted hunk is in PoD code, for 4.2 it should be fine because paging cant be enabled if PoD is already enabled. Olaf
Tim Deegan
2012-Jun-14 13:20 UTC
Re: [PATCH 2 of 4 RFC] xen, pod: Check zero-check pages returned by the balloon driver
At 15:19 +0200 on 14 Jun (1339687163), Olaf Hering wrote:> On Thu, Jun 14, Tim Deegan wrote: > > > At 12:54 +0100 on 08 Jun (1339160092), George Dunlap wrote: > > > + else if( p2m_is_ram(t) ) > > > + { > > > + if ( steal_for_cache ) > > > + { > > > + struct page_info *page; > > > + > > > + ASSERT(mfn_valid(mfn)); > > > > I think this assertion is broken by the paging code, which adds > > paged-out pages to the RAM types. One fix is to undo that change and > > have p2m_is_ram() only be true for present pages, but I think for 4.2 it > > would be less disruptive to test here for the paged-out and paging > > types. > > I assume the posted hunk is in PoD code, for 4.2 it should be fine > because paging cant be enabled if PoD is already enabled.Good point. OK, we can ignore it for now and sort it all out after 4.2 Cheers, Tim.
George Dunlap
2012-Jun-14 15:01 UTC
Re: [PATCH 1 of 4 RFC] xen, p2m: get_entry returns level of entry as well
On 14/06/12 11:21, Tim Deegan wrote:> At 12:54 +0100 on 08 Jun (1339160091), George Dunlap wrote: >> Return the p2m level of the entry which filled this request. >> Intended to be used to see if pages returned by the balloon >> driver are part of a superpage, and reclaim them if so. > This looks broadly correct, but it''s a bit invasive. If there''s any way > to rework patch 2 not to need it that would be helpful. It looks like > the main use is for detecting and removing superpage PoD entries; maybe > that could be done with a sweep like you have in the non-PoD case?You mean the non-balloon case? I forget why I thought that was a bad idea, exactly. Let me think about that.> One niggle: returning level=5 on error is non-obvious, and it doesn''t > look like your code in patch 2 uses that.Were you thinking -1 or something like that? -George
Tim Deegan
2012-Jun-14 15:31 UTC
Re: [PATCH 1 of 4 RFC] xen, p2m: get_entry returns level of entry as well
At 16:01 +0100 on 14 Jun (1339689663), George Dunlap wrote:> On 14/06/12 11:21, Tim Deegan wrote: > >At 12:54 +0100 on 08 Jun (1339160091), George Dunlap wrote: > >>Return the p2m level of the entry which filled this request. > >>Intended to be used to see if pages returned by the balloon > >>driver are part of a superpage, and reclaim them if so. > >This looks broadly correct, but it''s a bit invasive. If there''s any way > >to rework patch 2 not to need it that would be helpful. It looks like > >the main use is for detecting and removing superpage PoD entries; maybe > >that could be done with a sweep like you have in the non-PoD case? > You mean the non-balloon case? I forget why I thought that was a bad > idea, exactly. Let me think about that. > >One niggle: returning level=5 on error is non-obvious, and it doesn''t > >look like your code in patch 2 uses that. > Were you thinking -1 or something like that?Actually, looking at it again, and at the way the value is used by its caller, maybe 5 is OK (though it looks like it will return 6 in the EPT code). So assuming this patch doesn''t go away, I''d be OK with just a comment explaining it. Cheers, Tim.
George Dunlap
2012-Jun-14 16:39 UTC
Re: [PATCH 1 of 4 RFC] xen, p2m: get_entry returns level of entry as well
On Thu, Jun 14, 2012 at 4:01 PM, George Dunlap <george.dunlap@eu.citrix.com> wrote:> On 14/06/12 11:21, Tim Deegan wrote: >> >> At 12:54 +0100 on 08 Jun (1339160091), George Dunlap wrote: >>> >>> Return the p2m level of the entry which filled this request. >>> Intended to be used to see if pages returned by the balloon >>> driver are part of a superpage, and reclaim them if so. >> >> This looks broadly correct, but it''s a bit invasive. If there''s any way >> to rework patch 2 not to need it that would be helpful. It looks like >> the main use is for detecting and removing superpage PoD entries; maybe >> that could be done with a sweep like you have in the non-PoD case? > > You mean the non-balloon case? I forget why I thought that was a bad idea, > exactly. Let me think about that.I think I basically didn''t want to have to do the full check of each of the 512 individual entries of the p2m in the case that it wasn''t, in fact, a superpage; but I think in the common case it should either be a full superpage, or the check at the beginning of p2m_pod_zero_check_superpage() should fail out relatively early. So we could probably get away without it. Why don''t I re-write patch 2 of this series to not require patch 1, and include it in my other series. If there''s a performance gain to be had by avoiding the check, we can discuss that post-4.2. -George
Tim Deegan
2012-Jun-14 17:00 UTC
Re: [PATCH 1 of 4 RFC] xen, p2m: get_entry returns level of entry as well
At 17:39 +0100 on 14 Jun (1339695541), George Dunlap wrote:> On Thu, Jun 14, 2012 at 4:01 PM, George Dunlap > <george.dunlap@eu.citrix.com> wrote: > > On 14/06/12 11:21, Tim Deegan wrote: > >> This looks broadly correct, but it''s a bit invasive. If there''s any way > >> to rework patch 2 not to need it that would be helpful. It looks like > >> the main use is for detecting and removing superpage PoD entries; maybe > >> that could be done with a sweep like you have in the non-PoD case? > > > > You mean the non-balloon case? I forget why I thought that was a bad idea, > > exactly. Let me think about that. > > I think I basically didn''t want to have to do the full check of each > of the 512 individual entries of the p2m in the case that it wasn''t, > in fact, a superpage; but I think in the common case it should either > be a full superpage, or the check at the beginning of > p2m_pod_zero_check_superpage() should fail out relatively early. So > we could probably get away without it. > > Why don''t I re-write patch 2 of this series to not require patch 1, > and include it in my other series. If there''s a performance gain to > be had by avoiding the check, we can discuss that post-4.2.That sounds ideal. Thanks, Tim.