EPT: 1GB large page support.

Allocate a 1GB large page for EPT where possible. The patch also contains the
logic to split a large page into smaller ones (2M or 4K).

Signed-off-by: Dongxiao Xu <dongxiao.xu@intel.com>
Signed-off-by: Xiaohui Xin <xiaohui.xin@intel.com>
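For reference while reading the review below: EPT tables hold 512 entries, so
EPT_TABLE_ORDER is 9 and mapping orders step in multiples of 9. The sizes
follow from 2^order pages of 4K each:

    order 0                      ->  4K  (one leaf entry)
    order EPT_TABLE_ORDER   =  9 ->  2M  (512 * 4K)
    order 2*EPT_TABLE_ORDER = 18 ->  1G  (512 * 512 * 4K)

This is why "order == 18" below denotes a 1GB entry, and why masks such as
0x1FF = 2^9 - 1 appear in the 2M paths.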
At 03:20 +0000 on 23 Feb (1266895248), Xu, Dongxiao wrote:
> EPT: 1GB large page support.
>
> Alloc 1GB large page for EPT if possible. It also contains the logic to split
> large page into small ones (2M or 4k).
>
> Signed-off-by: Dongxiao Xu <dongxiao.xu@intel.com>
> Signed-off-by: Xiaohui Xin <xiaohui.xin@intel.com>

> diff -r af2a0ed32ad5 xen/arch/x86/mm/hap/p2m-ept.c
> --- a/xen/arch/x86/mm/hap/p2m-ept.c  Sat Feb 20 19:25:07 2010 +0800
> +++ b/xen/arch/x86/mm/hap/p2m-ept.c  Sat Feb 20 19:43:49 2010 +0800
> @@ -183,17 +239,21 @@ ept_set_entry(struct domain *d, unsigned
>      int i;
>      int rv = 0;
>      int ret = 0;
> +    int split_level = 0;
>      int walk_level = order / EPT_TABLE_ORDER;
>      int direct_mmio = (p2mt == p2m_mmio_direct);
>      uint8_t ipat = 0;
>      int need_modify_vtd_table = 1;
>
> -    /* We only support 4k and 2m pages now */
> -    BUG_ON(order && order != EPT_TABLE_ORDER);
> +    /* We support 4K, 2M, and 1G pages now */
> +    BUG_ON(order % EPT_TABLE_ORDER);

So order 27 allocations are OK? :)  Actually, now that I'm looking at this
BUG_ON, I can't see why it should be true even before this change.  Is it
enforced anywhere?
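As a minimal sketch of the tighter check Tim is hinting at, assuming only
4K/2M/1G orders are ever meant to be accepted (illustrative only, not the
patch's actual fix):

    /* Sketch only: reject order 27 (512GB) and other stray values that
     * "order % EPT_TABLE_ORDER == 0" would let through. */
    BUG_ON(order != 0 &&
           order != EPT_TABLE_ORDER &&       /* 2M */
           order != 2 * EPT_TABLE_ORDER);    /* 1G */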
>      if ( order != 0 )
>          if ( (gfn & ((1UL << order) - 1)) )
>              return 1;
> +
> +    if ( order == 18 )
> +        printk(KERN_INFO "Got 1GB EPT entry.\n");

Definitely not.

>      table = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
>
> @@ -208,15 +268,21 @@ ept_set_entry(struct domain *d, unsigned
>              break;
>      }
>
> -    /* If order == 9, we should never get SUPERPAGE or PoD.
> +    /* If order == 9 or 18, we should never get SUPERPAGE or PoD.

Why is that true?  Surely having introduced order-18 entries you can now get
SUPERPAGE when asking for an order-9 entry.

>      * If order == 0, we should only get POD if we have a POD superpage.
>      * If i > walk_level, we need to split the page; otherwise,
>      * just behave as normal. */
>     ASSERT(order == 0 || ret == GUEST_TABLE_NORMAL_PAGE);
>     ASSERT(ret != GUEST_TABLE_POD_PAGE || i != walk_level);
>
> +    if ( i && i == walk_level && i <= cpu_vmx_ept_super_page_level_limit
> +         && ret == GUEST_TABLE_NORMAL_PAGE )
> +        ret = GUEST_TABLE_SUPER_PAGE;
> +

That needs a comment explaining it.

>     index = gfn_remainder >> ( i ? (i * EPT_TABLE_ORDER): order);
>     offset = (gfn_remainder & ( ((1 << (i*EPT_TABLE_ORDER)) - 1)));
> +
> +    split_level = i;
>
>     ept_entry = table + index;
>
> @@ -240,7 +306,7 @@ ept_set_entry(struct domain *d, unsigned
>
>             if ( (ept_entry->avail1 == p2m_ram_logdirty)
>                  && (p2mt == p2m_ram_rw) )
> -                for ( i = 0; i < 512; i++ )
> +                for ( i = 0; i < (1UL << order); i++ )

Gosh.  What's the performance of *that* like on live migration?

>                     paging_mark_dirty(d, mfn_x(mfn) - offset + i);
>         }
>         else
> @@ -261,51 +327,22 @@ ept_set_entry(struct domain *d, unsigned
>         }
>         else
>         {
> -            /*
> -             * It's super page before, now set one of the 4k pages, so
> -             * we should split the 2m page to 4k pages now.
> -             */
> -            /* Pointers to / into new (split) middle-level table */
> -            ept_entry_t *split_table = NULL;
> -            ept_entry_t *split_ept_entry = NULL;
> -            /* Info about old (superpage) table */
> -            unsigned long super_mfn = ept_entry->mfn;
> -            p2m_type_t super_p2mt = ept_entry->avail1;
> -            /* The new l2 entry which we'll write after we've build the new l1 table */
> -            ept_entry_t l2_ept_entry;
> -
> -            /*
> -             * Allocate new page for new ept middle level entry which is
> -             * before a leaf super entry
> -             */
> -            if ( !ept_set_middle_entry(d, &l2_ept_entry) )
> -                goto out;
> -
> -            /* Split the super page before to 4k pages */
> -            split_table = map_domain_page(l2_ept_entry.mfn);
> -            offset = gfn & ((1 << EPT_TABLE_ORDER) - 1);
> -
> -            for ( i = 0; i < 512; i++ )
> -            {
> -                split_ept_entry = split_table + i;
> -                split_ept_entry->emt = epte_get_entry_emt(d, gfn - offset + i,
> -                                                          _mfn(super_mfn + i),
> -                                                          &ipat, direct_mmio);
> -                split_ept_entry->ipat = ipat;
> -                split_ept_entry->sp_avail = 0;
> -                /* Don't increment mfn if it's a PoD mfn */
> -                if ( super_p2mt != p2m_populate_on_demand )
> -                    split_ept_entry->mfn = super_mfn + i;
> -                else
> -                    split_ept_entry->mfn = super_mfn;
> -                split_ept_entry->avail1 = super_p2mt;
> -                split_ept_entry->avail2 = 0;
> -
> -                ept_p2m_type_to_flags(split_ept_entry, super_p2mt);
> -            }
> -
> -            /* Set the destinated 4k page as normal */
> -            split_ept_entry = split_table + offset;
> +            int num = order / EPT_TABLE_ORDER;
> +            int level;
> +            ept_entry_t *split_ept_entry;
> +
> +            if ( num >= cpu_vmx_ept_super_page_level_limit )
> +                num = cpu_vmx_ept_super_page_level_limit;
> +            for ( level = split_level; level > num ; level-- )
> +            {
> +                rv = ept_split_large_page(d, &table, &index, gfn, level);
> +                if ( !rv )
> +                    goto out;
> +            }
> +
> +            split_ept_entry = table + index;
> +            split_ept_entry->avail1 = p2mt;
> +            ept_p2m_type_to_flags(split_ept_entry, p2mt);
>             split_ept_entry->emt = epte_get_entry_emt(d, gfn, mfn, &ipat,
>                                                       direct_mmio);
>             split_ept_entry->ipat = ipat;
> @@ -314,12 +351,6 @@ ept_set_entry(struct domain *d, unsigned
>                 need_modify_vtd_table = 0;
>             else
>                 split_ept_entry->mfn = mfn_x(mfn);
> -
> -            split_ept_entry->avail1 = p2mt;
> -            ept_p2m_type_to_flags(split_ept_entry, p2mt);
> -
> -            unmap_domain_page(split_table);
> -            *ept_entry = l2_ept_entry;
>         }
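The new ept_split_large_page() itself is not visible in this hunk. For
reference, a minimal sketch of what a one-level split would look like,
modeled on the removed 2M-to-4K code above; the name split_one_level and its
exact interface are hypothetical (the real helper also updates the caller's
table/index), while ept_set_middle_entry(), map_domain_page(),
epte_get_entry_emt() and the ept_entry_t fields are taken from the code shown:

    /* Split one superpage entry into 512 entries one level down: a
     * level-2 (1G) entry becomes 512 level-1 (2M) entries, and a
     * level-1 (2M) entry becomes 512 4K leaves. */
    static int split_one_level(struct domain *d, ept_entry_t *ept_entry,
                               unsigned long gfn, int level)
    {
        ept_entry_t new_mid, *sub;
        /* gfns covered by each of the 512 new sub-entries */
        unsigned long step = 1UL << ((level - 1) * EPT_TABLE_ORDER);
        /* first gfn covered by the superpage being split */
        unsigned long base = gfn & ~((1UL << (level * EPT_TABLE_ORDER)) - 1);
        uint8_t ipat = 0;
        int i;

        /* Allocate the new middle-level table */
        if ( !ept_set_middle_entry(d, &new_mid) )
            return 0;

        sub = map_domain_page(new_mid.mfn);
        for ( i = 0; i < 512; i++ )
        {
            sub[i] = *ept_entry;            /* inherit type and flags */
            sub[i].sp_avail = (level > 1);  /* pieces of a 1G entry are
                                             * still 2M superpages */
            /* Don't step the mfn for PoD entries, as in the old code */
            if ( ept_entry->avail1 != p2m_populate_on_demand )
                sub[i].mfn = ept_entry->mfn + i * step;
            sub[i].emt = epte_get_entry_emt(d, base + i * step,
                                            _mfn(sub[i].mfn), &ipat,
                                            0 /* direct_mmio */);
            sub[i].ipat = ipat;
        }
        unmap_domain_page(sub);
        *ept_entry = new_mid;               /* install the new table */
        return 1;
    }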
>
>     /* Track the highest gfn for which we have ever had a valid mapping */
> @@ -336,7 +367,7 @@ out:
>     ept_sync_domain(d);
>
>     /* Now the p2m table is not shared with vt-d page table */
> -    if ( iommu_enabled && need_iommu(d) && need_modify_vtd_table )
> +    if ( rv && iommu_enabled && need_iommu(d) && need_modify_vtd_table )
>     {
>         if ( p2mt == p2m_ram_rw )
>         {
> @@ -459,7 +490,7 @@ out:
>  /* WARNING: Only caller doesn't care about PoD pages. So this function will
>   * always return 0 for PoD pages, not populate them. If that becomes necessary,
>   * pass a p2m_query_t type along to distinguish. */
> -static ept_entry_t ept_get_entry_content(struct domain *d, unsigned long gfn)
> +static ept_entry_t ept_get_entry_content(struct domain *d, unsigned long gfn, int *level)
>  {
>      ept_entry_t *table =
>          map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
> @@ -487,6 +518,7 @@ static ept_entry_t ept_get_entry_content
>      index = gfn_remainder >> (i * EPT_TABLE_ORDER);
>      ept_entry = table + index;
>      content = *ept_entry;
> +    *level = i;
>
>  out:
>      unmap_domain_page(table);
> @@ -579,7 +611,10 @@ void ept_change_entry_emt_with_range(str
>      p2m_lock(d->arch.p2m);
>      for ( gfn = start_gfn; gfn <= end_gfn; gfn++ )
>      {
> -        e = ept_get_entry_content(d, gfn);
> +        int level = 0;
> +        uint64_t trunk = 0;
> +
> +        e = ept_get_entry_content(d, gfn, &level);
>          if ( !p2m_has_emt(e.avail1) )
>              continue;
>
> @@ -588,25 +623,23 @@ void ept_change_entry_emt_with_range(str
>
>          if ( e.sp_avail )
>          {
> -            if ( !(gfn & ((1 << EPT_TABLE_ORDER) - 1)) &&
> -                 ((gfn + 0x1FF) <= end_gfn) )
> +            while ( level )
>              {
> -                /*
> -                 * gfn assigned with 2M, and the end covers more than 2m areas.
> -                 * Set emt for super page.
> -                 */
> -                order = EPT_TABLE_ORDER;
> -                if ( need_modify_ept_entry(d, gfn, mfn, e.ipat, e.emt, e.avail1) )
> -                    ept_set_entry(d, gfn, mfn, order, e.avail1);
> -                gfn += 0x1FF;
> -            }
> -            else
> -            {
> -                /* Change emt for partial entries of the 2m area. */
> -                if ( need_modify_ept_entry(d, gfn, mfn, e.ipat, e.emt, e.avail1) )
> -                    ept_set_entry(d, gfn, mfn, order, e.avail1);
> -                gfn = ((gfn >> EPT_TABLE_ORDER) << EPT_TABLE_ORDER) + 0x1FF;
> -            }
> +                trunk = (1UL << (level * EPT_TABLE_ORDER)) - 1;
> +                if ( !(gfn & trunk) && (gfn + trunk <= end_gfn) )
> +                {
> +                    /* gfn assigned with 2M, and the end covers more than 2m areas.

This code covers 2M and 1G superpages; please fix the comment too.

> +                     * Set emt for super page.
> +                     */
> +                    order = level * EPT_TABLE_ORDER;
> +                    if ( need_modify_ept_entry(d, gfn, mfn,
> +                                               e.ipat, e.emt, e.avail1) )
> +                        ept_set_entry(d, gfn, mfn, order, e.avail1);
> +                    gfn += trunk;
> +                    break;
> +                }
> +                level--;
> +            }
>          }
>          else /* gfn assigned with 4k */
>          {

Cheers,

Tim.

-- 
Tim Deegan <Tim.Deegan@citrix.com>
Principal Software Engineer, XenServer Engineering
Citrix Systems UK Ltd.  (Company #02937203, SL9 0BG)
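A worked example of the trunk arithmetic in the last hunk above (a reference
note, not part of the patch):

    level 1 (2M): trunk = (1UL << (1 * 9)) - 1 = 0x1FF    ->    512 gfns
    level 2 (1G): trunk = (1UL << (2 * 9)) - 1 = 0x3FFFF  -> 262144 gfns

    !(gfn & trunk)            /* gfn aligned to the superpage size */
    gfn + trunk <= end_gfn    /* whole superpage fits in the range */

After a whole superpage is handled, "gfn += trunk" plus the loop's gfn++
advances exactly one superpage. The same 2^18 = 262144 figure is what gives
Tim's live-migration question its bite: re-marking a single dirty 1G entry
means 262144 paging_mark_dirty() calls.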
Xu, Dongxiao
2010-Feb-24 04:44 UTC
RE: [Xen-devel][PATCH] Intel EPT 1GB large page support
Hi Tim,

Thanks for your careful review. I will change the patch according to your
comments in the next version.

Thanks!
Dongxiao