This is the latest version of a patch that adds hugepage support to the Xen hypervisor in a PV environment. It is against the latest xen-unstable tree on xenbits.xensource.com. I believe this version addresses the comments made about the previous version of the patch. Hugepage support must be enabled via the hypervisor command line option "allowhugepage". It assumes the guest guarantees that the hugepage is physically aligned and contiguous. There is currently no support in the tools for saving/restoring/migrating guests who use hugepages. Note to the maintainers: What else needs to be done for this patch to be considered for inclusion into the tree? Signed-off-by: Dave McCracken <dave.mccracken@oracle.com> ---- --- xen-unstable//./xen/include/asm-x86/mm.h 2008-10-02 14:23:17.000000000 -0500 +++ xen-hpage/./xen/include/asm-x86/mm.h 2008-11-04 08:24:35.000000000 -0600 @@ -240,6 +240,8 @@ pae_copy_root(struct vcpu *v, l3_pgentry int check_descriptor(const struct domain *, struct desc_struct *d); +extern int opt_allow_hugepage; + /****************************************************************************** * With shadow pagetables, the different kinds of address start * to get get confusing. 
--- xen-unstable//./xen/include/asm-x86/x86_32/page.h 2008-07-17 09:49:27.000000000 -0500 +++ xen-hpage/./xen/include/asm-x86/x86_32/page.h 2008-11-04 08:24:35.000000000 -0600 @@ -115,7 +115,7 @@ extern unsigned int PAGE_HYPERVISOR_NOCA #define BASE_DISALLOW_MASK (0xFFFFF198U & ~_PAGE_NX) #define L1_DISALLOW_MASK (BASE_DISALLOW_MASK | _PAGE_GNTTAB) -#define L2_DISALLOW_MASK (BASE_DISALLOW_MASK) +#define L2_DISALLOW_MASK (BASE_DISALLOW_MASK & ~_PAGE_PSE) #define L3_DISALLOW_MASK 0xFFFFF1FEU /* must-be-zero */ #endif /* __X86_32_PAGE_H__ */ --- xen-unstable//./xen/include/asm-x86/x86_64/page.h 2008-10-02 14:23:17.000000000 -0500 +++ xen-hpage/./xen/include/asm-x86/x86_64/page.h 2008-11-04 08:24:35.000000000 -0600 @@ -115,7 +115,7 @@ typedef l4_pgentry_t root_pgentry_t; #define BASE_DISALLOW_MASK (0xFF800198U & ~_PAGE_NX) #define L1_DISALLOW_MASK (BASE_DISALLOW_MASK | _PAGE_GNTTAB) -#define L2_DISALLOW_MASK (BASE_DISALLOW_MASK) +#define L2_DISALLOW_MASK (BASE_DISALLOW_MASK & ~_PAGE_PSE) #define L3_DISALLOW_MASK (BASE_DISALLOW_MASK) #define L4_DISALLOW_MASK (BASE_DISALLOW_MASK) --- xen-unstable//./xen/arch/x86/mm.c 2008-11-04 08:22:40.000000000 -0600 +++ xen-hpage/./xen/arch/x86/mm.c 2008-11-04 08:24:35.000000000 -0600 @@ -160,6 +160,9 @@ unsigned long total_pages; #define PAGE_CACHE_ATTRS (_PAGE_PAT|_PAGE_PCD|_PAGE_PWT) +int opt_allow_hugepage = 0; +boolean_param("allowhugepage", opt_allow_hugepage); + #define l1_disallow_mask(d) \ ((d != dom_io) && \ (rangeset_is_empty((d)->iomem_caps) && \ @@ -584,6 +587,26 @@ static int get_page_and_type_from_pagenr return rc; } +static int get_data_page(struct page_info *page, struct domain *d, int writeable) +{ + int rc; + + if ( writeable ) + rc = get_page_and_type(page, d, PGT_writable_page); + else + rc = get_page(page, d); + + return rc; +} + +static void put_data_page(struct page_info *page, int writeable) +{ + if ( writeable ) + put_page_and_type(page); + else + put_page(page); +} + /* * We allow root tables to map each 
other (a.k.a. linear page tables). It * needs some special care with reference counts and access permissions: @@ -656,6 +679,7 @@ get_page_from_l1e( struct vcpu *curr = current; struct domain *owner; int okay; + int writeable; if ( !(l1f & _PAGE_PRESENT) ) return 1; @@ -698,10 +722,9 @@ get_page_from_l1e( * contribute to writeable mapping refcounts. (This allows the * qemu-dm helper process in dom0 to map the domain''s memory without * messing up the count of "real" writable mappings.) */ - okay = (((l1f & _PAGE_RW) && - !(unlikely(paging_mode_external(d) && (d != curr->domain)))) - ? get_page_and_type(page, d, PGT_writable_page) - : get_page(page, d)); + writeable = (l1f & _PAGE_RW) && + !( unlikely(paging_mode_external(d) && (d != curr->domain)) ); + okay = get_data_page(page, d, writeable); if ( !okay ) { MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte @@ -759,11 +782,43 @@ get_page_from_l2e( MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK); return -EINVAL; } + if ( l2e_get_flags(l2e) & _PAGE_PSE ) + { + unsigned long mfn = l2e_get_pfn(l2e); + unsigned long m, me; + struct page_info *page = mfn_to_page(mfn); + int writeable; - rc = get_page_and_type_from_pagenr( - l2e_get_pfn(l2e), PGT_l1_page_table, d, 0); - if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) ) - rc = 0; + if ( !opt_allow_hugepage ) + return -EINVAL; + + writeable = l2e_get_flags(l2e) & _PAGE_RW; + + rc = get_data_page(page, d, writeable); + if ( unlikely(!rc) ) + return rc; + + for ( m = mfn+1, me = mfn + (L1_PAGETABLE_ENTRIES-1); m <= me; m++ ) + { + rc = get_data_page(mfn_to_page(m), d, writeable); + if ( unlikely(!rc) ) + { + for ( --m; m > mfn; --m ) + put_data_page(mfn_to_page(m), writeable); + put_data_page(page, writeable); + return 0; + } + } +#ifdef __x86_64__ + map_pages_to_xen((unsigned long)mfn_to_virt(mfn), mfn, L1_PAGETABLE_ENTRIES, + PAGE_HYPERVISOR | l2e_get_flags(l2e)); +#endif + } else { + rc = get_page_and_type_from_pagenr( 
+ l2e_get_pfn(l2e), PGT_l1_page_table, d, 0); + if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) ) + rc = 0; + } return rc; } @@ -955,7 +1010,18 @@ static int put_page_from_l2e(l2_pgentry_ if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) && (l2e_get_pfn(l2e) != pfn) ) { - put_page_and_type(l2e_get_page(l2e)); + if ( l2e_get_flags(l2e) & _PAGE_PSE ) + { + unsigned long mfn = l2e_get_pfn(l2e); + unsigned long m, me; + struct page_info *page = mfn_to_page(mfn); + int writeable = l2e_get_flags(l2e) & _PAGE_RW; + + for ( m = mfn+1, me = mfn + (L1_PAGETABLE_ENTRIES-1); m <= me; m++ ) + put_data_page(mfn_to_page(m), writeable); + put_data_page(page, writeable); + } else + put_page_and_type(l2e_get_page(l2e)); return 0; } return 1; --- xen-unstable//./xen/arch/x86/traps.c 2008-11-04 08:22:40.000000000 -0600 +++ xen-hpage/./xen/arch/x86/traps.c 2008-11-04 08:24:35.000000000 -0600 @@ -723,7 +723,8 @@ static void pv_cpuid(struct cpu_user_reg { /* Modify Feature Information. */ __clear_bit(X86_FEATURE_VME, &d); - __clear_bit(X86_FEATURE_PSE, &d); + if (!opt_allow_hugepage) + __clear_bit(X86_FEATURE_PSE, &d); __clear_bit(X86_FEATURE_PGE, &d); __clear_bit(X86_FEATURE_MCE, &d); __clear_bit(X86_FEATURE_MCA, &d); @@ -2002,9 +2003,12 @@ static int emulate_privileged_op(struct case 4: /* Read CR4 */ /* * Guests can read CR4 to see what features Xen has enabled. We - * therefore lie about PGE & PSE as they are unavailable to guests. + * therefore lie about PGE as it is unavailable to guests. + * Also disallow PSE if hugepages are not enabled. */ - *reg = read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE); + *reg = read_cr4() & ~(X86_CR4_PGE); + if (!opt_allow_hugepage) + *reg &= ~(X86_CR4_PSE); break; default: _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Keir Fraser
2008-Nov-05 10:59 UTC
Re: [Xen-devel] [PATCH 1/1] Xen PV support for hugepages
On 4/11/08 15:41, "dcm@mccr.org" <dcm@mccr.org> wrote:

> This is the latest version of a patch that adds hugepage support to the Xen > hypervisor in a PV environment. It is against the latest xen-unstable tree > on xenbits.xensource.com. I believe this version addresses the comments > made about the previous version of the patch. > > Hugepage support must be enabled via the hypervisor command line option > "allowhugepage". > > It assumes the guest guarantees that the hugepage is physically aligned and > contiguous. > > There is currently no support in the tools for saving/restoring/migrating > guests who use hugepages. > > Note to the maintainers: What else needs to be done for this patch to be > considered for inclusion into the tree?

I cleaned it up, fixed some return codes, and applied as c/s 18756. It may be worth Jan Beulich taking a quick look as the return-code conventions for the get/put functions in mm.c is now rather convoluted. I think I got it right but I'm not 100% sure. Indeed, Jan, it'd sure be nice to have some comments explaining what the <0,=0,>0 codes mean from the various get/put functions, and how they differ between the l1e variants and the others (I'm pretty sure they do differ). -- Keir _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Jan Beulich
2008-Nov-05 15:13 UTC
Re: [Xen-devel] [PATCH 1/1] Xen PV support for hugepages
>>> Keir Fraser <keir.fraser@eu.citrix.com> 05.11.08 11:59 >>>

>On 4/11/08 15:41, "dcm@mccr.org" <dcm@mccr.org> wrote: > >> This is the latest version of a patch that adds hugepage support to the Xen >> hypervisor in a PV environment. It is against the latest xen-unstable tree >> on xenbits.xensource.com. I believe this version addresses the comments >> made about the previous version of the patch. >> >> Hugepage support must be enabled via the hypervisor command line option >> "allowhugepage". >> >> It assumes the guest guarantees that the hugepage is physically aligned and >> contiguous. >> >> There is currently no support in the tools for saving/restoring/migrating >> guests who use hugepages. >> >> Note to the maintainers: What else needs to be done for this patch to be >> considered for inclusion into the tree? > >I cleaned it up, fixed some return codes, and applied as c/s 18756. It may >be worth Jan Beulich taking a quick look as the return-code conventions for >the get/put functions in mm.c is now rather convoluted. I think I got it >right but I'm not 100% sure. Indeed, Jan, it'd sure be nice to have some >comments explaining what the <0,=0,>0 codes mean from the various get/put >functions, and how they differ between the l1e variants and the others (I'm >pretty sure they do differ).

The return value handling seems correct, but the call to map_pages_to_xen() at the end of get_page_from_l2e() doesn't: Simply or-ing the guest-provided flags into PAGE_HYPERVISOR definitely isn't correct. The cache attributes must be extracted, converted to L1 notion, and then be or-ed in. Also, the special code to deal with LDT pages in put_page_from_l1e() should now also be required in put_page_from_l2e(). Perhaps that logic could be moved into put_data_page()? Further, with L2_DISALLOW_MASK now permitting _PAGE_PSE, the use site in mod_l2_entry() must certainly also be modified, i.e. the subsequent l2e_has_changed() would need to include _PAGE_PSE, and the case of _PAGE_RW toggling when _PAGE_PSE was and will be set needs to be taken care of. Next, adjust_guest_l2e() would apparently also need to be modified. Finally, no longer hiding X86_FEATURE_PSE from the guest seems rather risky: I'm pretty sure the pv-ops kernel relies on this flag being clear - when set, it would blindly (i.e. without ensuring the underlying memory is contiguous and suitably aligned) try to use 2Mb mappings for e.g. the 1:1 mapping. I would think this capability ought to be propagated by another means. Likewise I'm uncertain about letting X86_CR4_PSE shine through. And a general question: How is a trivial DomU going to be able to make use of this, without being permitted to allocate non-order-zero chunks of memory? Jan _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Dave McCracken
2008-Nov-06 13:44 UTC
Re: [Xen-devel] [PATCH 1/1] Xen PV support for hugepages
On Wednesday 05 November 2008, Jan Beulich wrote:

> The return value handling seems correct, but the call to map_pages_to_xen() > at the end of get_page_from_l2e() doesn't: Simply or-ing the guest-provided > flags into PAGE_HYPERVISOR definitely isn't correct. The cache attributes > must be extracted, converted to L1 notion, and then be or-ed in.

Oops. I thought I'd fixed that. Apparently I looked at it and got sidetracked. I'll fix.

> Also, the special code to deal with LDT pages in put_page_from_l1e() should > now also be required in put_page_from_l2e(). Perhaps that logic could be > moved into put_data_page()?

It looks to me like this is only done for PGT_seg_desc_page. I don't think any hugepage can be of this type. I can add it to put_data_page() if you think we need to be sure.

> Further, with L2_DISALLOW_MASK now permitting _PAGE_PSE, the use > site in mod_l2_entry() must certainly also be modified, i.e. the subsequent > l2e_has_changed() would need to include _PAGE_PSE, and the case of > _PAGE_RW toggling when _PAGE_PSE was and will be set needs to be > taken care of. > > Next, adjust_guest_l2e() would apparently also need to be modified.

I'll look more closely into those.

> Finally, no longer hiding X86_FEATURE_PSE from the guest seems rather > risky: I'm pretty sure the pv-ops kernel relies on this flag being clear - > when set, it would blindly (i.e. without ensuring the underlying memory is > contiguous and suitably aligned) try to use 2Mb mappings for e.g. the 1:1 > mapping. I would think this capability ought to be propagated by another > means. Likewise I'm uncertain about letting X86_CR4_PSE shine through. > > And a general question: How is a trivial DomU going to be able to make > use of this, without being permitted to allocate non-order-zero chunks of > memory?

This is the point of having hugepages a command line option. It should only be turned on if you intend to run guests who enforce the alignment rules. I'm working on ways to get guests to run in aligned memory. Dave McCracken Oracle Corp. _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Keir Fraser
2008-Nov-06 13:54 UTC
Re: [Xen-devel] [PATCH 1/1] Xen PV support for hugepages
On 6/11/08 13:44, "Dave McCracken" <dcm@mccr.org> wrote:

>> Also, the special code to deal with LDT pages in put_page_from_l1e() should >> now also be required in put_page_from_l2e(). Perhaps that logic could be >> moved into put_data_page()? > > It looks to me like this is only done for PGT_seg_desc_page. I don't think > any hugepage can be of this type. I can add it to put_data_page() if you > think we need to be sure.

The data pages mapped by a superpage mapping aren't special. One of them could certainly be PGT_seg_desc_page. Hence moving the logic into put_data_page() makes sense. -- Keir _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Jan Beulich
2008-Nov-06 14:04 UTC
Re: [Xen-devel] [PATCH 1/1] Xen PV support for hugepages
>>> Dave McCracken <dcm@mccr.org> 06.11.08 14:44 >>>

>> Finally, no longer hiding X86_FEATURE_PSE from the guest seems rather >> risky: I'm pretty sure the pv-ops kernel relies on this flag being clear - >> when set, it would blindly (i.e. without ensuring the underlying memory is >> contiguous and suitably aligned) try to use 2Mb mappings for e.g. the 1:1 >> mapping. I would think this capability ought to be propagated by another >> means. Likewise I'm uncertain about letting X86_CR4_PSE shine through. >> >> And a general question: How is a trivial DomU going to be able to make >> use of this, without being permitted to allocate non-order-zero chunks of >> memory? > >This is the point of having hugepages a command line option. It should only >be turned on if you intend to run guests who enforce the alignment rules.

You mean 'if you intend to run *only* guests ...', including dom0. Any guest unaware of the connection of X86_FEATURE_PSE and the need to create contiguous 2M chunks would fail, and any guest not having I/O memory assigned would never manage to create such chunks. Jan _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Dave McCracken
2008-Nov-07 14:39 UTC
Re: [Xen-devel] [PATCH 1/1] Xen PV support for hugepages
On Thursday 06 November 2008, Jan Beulich wrote:

> >This is the point of having hugepages a command line option. It should > > only be turned on if you intend to run guests who enforce the alignment > > rules. > > You mean 'if you intend to run *only* guests ...', including dom0. Any > guest unaware of the connection of X86_FEATURE_PSE and the need to create > contiguous 2M chunks would fail, and any guest not having I/O memory > assigned would never manage to create such chunks.

Hmm... Sounds like a per-domain flag would be a good solution here. I'll look into it. Dave McCracken _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Jeremy Fitzhardinge
2008-Nov-07 18:07 UTC
Re: [Xen-devel] [PATCH 1/1] Xen PV support for hugepages
Dave McCracken wrote:

> On Thursday 06 November 2008, Jan Beulich wrote: > >>> This is the point of having hugepages a command line option. It should >>> only be turned on if you intend to run guests who enforce the alignment >>> rules. >>> >> You mean 'if you intend to run *only* guests ...', including dom0. Any >> guest unaware of the connection of X86_FEATURE_PSE and the need to create >> contiguous 2M chunks would fail, and any guest not having I/O memory >> assigned would never manage to create such chunks. >> > > Hmm... Sounds like a per-domain flag would be a good solution here. I'll look > into it. >

I think a guest should explicitly enable large page support via something like vmassist. In other words: * keep PSE clear - you are not implementing something similar to PSE * export the ability to map large pages via a feature flag or something * require guests to explicitly enable large page support I'm sceptical about the value of this patch as it stands. We already have a vast number of operating modes and combinations, and I think we need to have a pretty high bar before adding any more. If this work arranged things so that we could just set PSE - and thereby make supporting it on the guest side much simpler - then I think the decision to include it would be easier. In practical terms, I think this means that we need to modify Xen and the domain builder to *always* allocate memory to guests in 2M contig chunks, and then sort out all the tricky details ;) In practice, however, we would lose a lot of the benefit of doing this we'd get from the native kernel, because all the RO pagetable mappings would chop up the linear and kernel maps so much that we'd not get much chance to use the 2M mappings. It would, however, make hugetlbfs work without much drama. J _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel