Jan Beulich
2008-Apr-22 13:01 UTC
[Xen-devel] [PATCH] x86: remove use of per-domain lock from page table entry handling
This is only a first step, the use of the domain lock in do_mmuext_op()
and do_set_gdt() still needs looking at, as do the assertions of the
lock being held in create_grant_{pte,va}_mapping().

This change results in a 5% performance improvement for kernel builds
on dual-socket quad-core systems (which is what I used for reference
for both 32- and 64-bit). Along with that, the amount of time reported
as spent in the kernel gets reduced by almost 25% (the fraction of time
spent in the kernel is generally reported significantly higher under
Xen than with a native kernel).

Signed-off-by: Jan Beulich <jbeulich@novell.com>

Index: 2008-04-15/xen/arch/x86/mm.c
===================================================================
--- 2008-04-15.orig/xen/arch/x86/mm.c 2008-04-15 08:48:15.000000000 +0200
+++ 2008-04-15/xen/arch/x86/mm.c 2008-04-15 16:21:12.000000000 +0200
@@ -1371,6 +1371,37 @@ static void free_l4_table(struct page_in
 
 #endif
 
+static void lock_page(struct page_info *page)
+{
+    unsigned long x, y;
+
+    y = page->u.inuse.type_info;
+    do {
+        for ( ; ; )
+        {
+            ASSERT(y & PGT_count_mask);
+            if (likely(y & PGT_validated) && likely(!(y & PGT_locked)))
+                break;
+            cpu_relax();
+            y = page->u.inuse.type_info;
+        }
+        x = y;
+    }
+    while ( (y = cmpxchg(&page->u.inuse.type_info, x, x | PGT_locked)) != x );
+}
+
+static void unlock_page(struct page_info *page)
+{
+    unsigned long x, y;
+
+    y = page->u.inuse.type_info;
+    do {
+        ASSERT(y & PGT_locked);
+        ASSERT(y & PGT_count_mask);
+        x = y;
+    }
+    while ( (y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_locked)) != x );
+}
 
 /* How to write an entry to the guest pagetables.
  * Returns 0 for failure (pointer not valid), 1 for success. */
@@ -1432,24 +1463,33 @@ static int mod_l1_entry(l1_pgentry_t *pl
     struct vcpu *curr = current;
     struct domain *d = curr->domain;
     unsigned long mfn;
+    struct page_info *l1pg = mfn_to_page(gl1mfn);
+    int rc = 1;
+
+    lock_page(l1pg);
 
     if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
-        return 0;
+        return unlock_page(l1pg), 0;
 
     if ( unlikely(paging_mode_refcounts(d)) )
-        return UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, preserve_ad);
+    {
+        rc = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, preserve_ad);
+        unlock_page(l1pg);
+        return rc;
+    }
 
     if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
     {
         /* Translate foreign guest addresses. */
         mfn = gmfn_to_mfn(FOREIGNDOM, l1e_get_pfn(nl1e));
         if ( unlikely(mfn == INVALID_MFN) )
-            return 0;
+            return unlock_page(l1pg), 0;
         ASSERT((mfn & ~(PADDR_MASK >> PAGE_SHIFT)) == 0);
         nl1e = l1e_from_pfn(mfn, l1e_get_flags(nl1e));
 
         if ( unlikely(l1e_get_flags(nl1e) & l1_disallow_mask(d)) )
         {
+            unlock_page(l1pg);
             MEM_LOG("Bad L1 flags %x",
                     l1e_get_flags(nl1e) & l1_disallow_mask(d));
             return 0;
@@ -1459,30 +1499,33 @@ static int mod_l1_entry(l1_pgentry_t *pl
         if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT) )
         {
             adjust_guest_l1e(nl1e, d);
-            return UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr,
-                                preserve_ad);
+            rc = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr,
+                              preserve_ad);
+            unlock_page(l1pg);
+            return rc;
         }
 
         if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
-            return 0;
+            return unlock_page(l1pg), 0;
 
         adjust_guest_l1e(nl1e, d);
         if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr,
                                     preserve_ad)) )
         {
-            put_page_from_l1e(nl1e, d);
-            return 0;
+            ol1e = nl1e;
+            rc = 0;
         }
     }
-    else
+    else if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr,
+                                     preserve_ad)) )
     {
-        if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr,
-                                    preserve_ad)) )
-            return 0;
+        unlock_page(l1pg);
+        return 0;
     }
 
+    unlock_page(l1pg);
     put_page_from_l1e(ol1e, d);
-    return 1;
+    return rc;
 }
@@ -1496,6 +1539,8 @@ static int mod_l2_entry(l2_pgentry_t *pl
     l2_pgentry_t ol2e;
     struct vcpu *curr = current;
     struct domain *d = curr->domain;
+    struct page_info *l2pg = mfn_to_page(pfn);
+    int rc = 1;
 
     if ( unlikely(!is_guest_l2_slot(d, type, pgentry_ptr_to_slot(pl2e))) )
     {
@@ -1503,13 +1548,16 @@ static int mod_l2_entry(l2_pgentry_t *pl
         return 0;
     }
 
+    lock_page(l2pg);
+
     if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
-        return 0;
+        return unlock_page(l2pg), 0;
 
     if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
     {
         if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
         {
+            unlock_page(l2pg);
             MEM_LOG("Bad L2 flags %x",
                     l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
             return 0;
@@ -1519,28 +1567,32 @@ static int mod_l2_entry(l2_pgentry_t *pl
         if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT) )
         {
             adjust_guest_l2e(nl2e, d);
-            return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, curr, preserve_ad);
+            rc = UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, curr, preserve_ad);
+            unlock_page(l2pg);
+            return rc;
        }
 
         if ( unlikely(!get_page_from_l2e(nl2e, pfn, d)) )
-            return 0;
+            return unlock_page(l2pg), 0;
 
         adjust_guest_l2e(nl2e, d);
         if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, curr,
                                     preserve_ad)) )
         {
-            put_page_from_l2e(nl2e, pfn);
-            return 0;
+            ol2e = nl2e;
+            rc = 0;
        }
     }
     else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, curr,
                                      preserve_ad)) )
     {
+        unlock_page(l2pg);
         return 0;
     }
 
+    unlock_page(l2pg);
     put_page_from_l2e(ol2e, pfn);
-    return 1;
+    return rc;
 }
 
 #if CONFIG_PAGING_LEVELS >= 3
@@ -1554,7 +1606,8 @@ static int mod_l3_entry(l3_pgentry_t *pl
     l3_pgentry_t ol3e;
     struct vcpu *curr = current;
     struct domain *d = curr->domain;
-    int okay;
+    struct page_info *l3pg = mfn_to_page(pfn);
+    int okay, rc = 1;
 
     if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
     {
@@ -1571,13 +1624,16 @@ static int mod_l3_entry(l3_pgentry_t *pl
         return 0;
 #endif
 
+    lock_page(l3pg);
+
     if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
-        return 0;
+        return unlock_page(l3pg), 0;
 
     if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
     {
         if ( unlikely(l3e_get_flags(nl3e) & l3_disallow_mask(d)) )
         {
+            unlock_page(l3pg);
             MEM_LOG("Bad L3 flags %x",
                     l3e_get_flags(nl3e) & l3_disallow_mask(d));
             return 0;
@@ -1587,23 +1643,26 @@ static int mod_l3_entry(l3_pgentry_t *pl
         if ( !l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT) )
         {
             adjust_guest_l3e(nl3e, d);
-            return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr, preserve_ad);
+            rc = UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr, preserve_ad);
+            unlock_page(l3pg);
+            return rc;
        }
 
         if ( unlikely(!get_page_from_l3e(nl3e, pfn, d)) )
-            return 0;
+            return unlock_page(l3pg), 0;
 
         adjust_guest_l3e(nl3e, d);
         if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr,
                                     preserve_ad)) )
        {
-            put_page_from_l3e(nl3e, pfn);
-            return 0;
+            ol3e = nl3e;
+            rc = 0;
        }
     }
     else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr,
                                      preserve_ad)) )
     {
+        unlock_page(l3pg);
         return 0;
     }
 
@@ -1612,8 +1671,9 @@ static int mod_l3_entry(l3_pgentry_t *pl
 
     pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e);
 
+    unlock_page(l3pg);
     put_page_from_l3e(ol3e, pfn);
-    return 1;
+    return rc;
 }
 
 #endif
@@ -1629,6 +1689,8 @@ static int mod_l4_entry(l4_pgentry_t *pl
     struct vcpu *curr = current;
     struct domain *d = curr->domain;
     l4_pgentry_t ol4e;
+    struct page_info *l4pg = mfn_to_page(pfn);
+    int rc = 1;
 
     if ( unlikely(!is_guest_l4_slot(d, pgentry_ptr_to_slot(pl4e))) )
     {
@@ -1636,13 +1698,16 @@ static int mod_l4_entry(l4_pgentry_t *pl
         return 0;
     }
 
+    lock_page(l4pg);
+
     if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
-        return 0;
+        return unlock_page(l4pg), 0;
 
     if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
     {
         if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
         {
+            unlock_page(l4pg);
             MEM_LOG("Bad L4 flags %x",
                     l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
             return 0;
@@ -1652,28 +1717,32 @@ static int mod_l4_entry(l4_pgentry_t *pl
         if ( !l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT) )
         {
             adjust_guest_l4e(nl4e, d);
-            return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr, preserve_ad);
+            rc = UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr, preserve_ad);
+            unlock_page(l4pg);
+            return rc;
        }
 
         if ( unlikely(!get_page_from_l4e(nl4e, pfn, d)) )
-            return 0;
+            return unlock_page(l4pg), 0;
 
         adjust_guest_l4e(nl4e, d);
         if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr,
                                     preserve_ad)) )
        {
-            put_page_from_l4e(nl4e, pfn);
-            return 0;
+            ol4e = nl4e;
+            rc = 0;
        }
     }
     else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr,
                                      preserve_ad)) )
     {
+        unlock_page(l4pg);
         return 0;
     }
 
+    unlock_page(l4pg);
     put_page_from_l4e(ol4e, pfn);
-    return 1;
+    return rc;
 }
 
 #endif
@@ -2492,8 +2561,6 @@ int do_mmu_update(
 
     domain_mmap_cache_init(&mapcache);
 
-    domain_lock(d);
-
     for ( i = 0; i < count; i++ )
     {
         if ( hypercall_preempt_check() )
@@ -2664,8 +2731,6 @@ int do_mmu_update(
 
     process_deferred_ops();
 
-    domain_unlock(d);
-
     domain_mmap_cache_destroy(&mapcache);
 
     perfc_add(num_page_updates, i);
@@ -2718,14 +2783,19 @@ static int create_grant_pte_mapping(
         goto failed;
     }
 
+    lock_page(page);
+
     ol1e = *(l1_pgentry_t *)va;
     if ( !UPDATE_ENTRY(l1, (l1_pgentry_t *)va, ol1e, nl1e, mfn, v, 0) )
     {
+        unlock_page(page);
         put_page_type(page);
         rc = GNTST_general_error;
         goto failed;
     }
 
+    unlock_page(page);
+
     if ( !paging_mode_refcounts(d) )
         put_page_from_l1e(ol1e, d);
@@ -2769,16 +2839,14 @@ static int destroy_grant_pte_mapping(
         goto failed;
     }
 
-    if ( __copy_from_user(&ol1e, (l1_pgentry_t *)va, sizeof(ol1e)) )
-    {
-        put_page_type(page);
-        rc = GNTST_general_error;
-        goto failed;
-    }
+    lock_page(page);
+
+    ol1e = *(l1_pgentry_t *)va;
 
     /* Check that the virtual address supplied is actually mapped to frame. */
     if ( unlikely((l1e_get_intpte(ol1e) >> PAGE_SHIFT) != frame) )
     {
+        unlock_page(page);
         MEM_LOG("PTE entry %lx for address %"PRIx64" doesn't match frame %lx",
                 (unsigned long)l1e_get_intpte(ol1e), addr, frame);
         put_page_type(page);
@@ -2793,12 +2861,14 @@ static int destroy_grant_pte_mapping(
                                 d->vcpu[0] /* Change if we go to per-vcpu shadows. */,
                                 0)) )
     {
+        unlock_page(page);
         MEM_LOG("Cannot delete PTE entry at %p", va);
         put_page_type(page);
         rc = GNTST_general_error;
         goto failed;
     }
 
+    unlock_page(page);
     put_page_type(page);
 
  failed:
@@ -2814,6 +2884,7 @@ static int create_grant_va_mapping(
     l1_pgentry_t *pl1e, ol1e;
     struct domain *d = v->domain;
     unsigned long gl1mfn;
+    struct page_info *l1pg;
     int okay;
 
     ASSERT(domain_is_locked(d));
@@ -2826,8 +2897,11 @@ static int create_grant_va_mapping(
         MEM_LOG("Could not find L1 PTE for address %lx", va);
         return GNTST_general_error;
     }
 
+    l1pg = mfn_to_page(gl1mfn);
+    lock_page(l1pg);
     ol1e = *pl1e;
     okay = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v, 0);
+    unlock_page(l1pg);
     guest_unmap_l1e(v, pl1e);
     pl1e = NULL;
@@ -2845,6 +2919,7 @@ static int replace_grant_va_mapping(
 {
     l1_pgentry_t *pl1e, ol1e;
     unsigned long gl1mfn;
+    struct page_info *l1pg;
     int rc = 0;
 
     pl1e = guest_map_l1e(v, addr, &gl1mfn);
@@ -2853,11 +2928,15 @@ static int replace_grant_va_mapping(
         MEM_LOG("Could not find L1 PTE for address %lx", addr);
         return GNTST_general_error;
     }
+
+    l1pg = mfn_to_page(gl1mfn);
+    lock_page(l1pg);
     ol1e = *pl1e;
 
     /* Check that the virtual address supplied is actually mapped to frame. */
     if ( unlikely(l1e_get_pfn(ol1e) != frame) )
     {
+        unlock_page(l1pg);
         MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
                 l1e_get_pfn(ol1e), addr, frame);
         rc = GNTST_general_error;
@@ -2867,11 +2946,14 @@ static int replace_grant_va_mapping(
     /* Delete pagetable entry. */
     if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v, 0)) )
     {
+        unlock_page(l1pg);
         MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
         rc = GNTST_general_error;
         goto out;
     }
 
+    unlock_page(l1pg);
+
  out:
     guest_unmap_l1e(v, pl1e);
     return rc;
@@ -2906,6 +2988,7 @@ int replace_grant_host_mapping(
     struct vcpu *curr = current;
     l1_pgentry_t *pl1e, ol1e;
     unsigned long gl1mfn;
+    struct page_info *l1pg;
     int rc;
 
     if ( flags & GNTMAP_contains_pte )
@@ -2927,16 +3010,21 @@ int replace_grant_host_mapping(
                 (unsigned long)new_addr);
         return GNTST_general_error;
     }
+
+    l1pg = mfn_to_page(gl1mfn);
+    lock_page(l1pg);
     ol1e = *pl1e;
 
     if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, l1e_empty(), gl1mfn, curr,
                                 0)) )
     {
+        unlock_page(l1pg);
         MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
         guest_unmap_l1e(curr, pl1e);
         return GNTST_general_error;
     }
 
+    unlock_page(l1pg);
     guest_unmap_l1e(curr, pl1e);
 
     rc = replace_grant_va_mapping(addr, frame, ol1e, curr);
@@ -3014,8 +3102,6 @@ int do_update_va_mapping(unsigned long v
     if ( rc )
         return rc;
 
-    domain_lock(d);
-
     pl1e = guest_map_l1e(v, va, &gl1mfn);
 
     if ( unlikely(!pl1e || !mod_l1_entry(pl1e, val, gl1mfn, 0)) )
@@ -3027,8 +3113,6 @@ int do_update_va_mapping(unsigned long v
 
     process_deferred_ops();
 
-    domain_unlock(d);
-
     switch ( flags & UVMF_FLUSHTYPE_MASK )
     {
     case UVMF_TLB_FLUSH:
@@ -3669,8 +3753,6 @@ int ptwr_do_page_fault(struct vcpu *v, u
     struct ptwr_emulate_ctxt ptwr_ctxt;
     int rc;
 
-    domain_lock(d);
-
     /* Attempt to read the PTE that maps the VA being accessed. */
     guest_get_eff_l1e(v, addr, &pte);
     page = l1e_get_page(pte);
@@ -3690,16 +3772,16 @@ int ptwr_do_page_fault(struct vcpu *v, u
     ptwr_ctxt.cr2 = addr;
     ptwr_ctxt.pte = pte;
 
+    lock_page(page);
     rc = x86_emulate(&ptwr_ctxt.ctxt, &ptwr_emulate_ops);
+    unlock_page(page);
     if ( rc == X86EMUL_UNHANDLEABLE )
         goto bail;
 
-    domain_unlock(d);
     perfc_incr(ptwr_emulations);
     return EXCRET_fault_fixed;
 
  bail:
-    domain_unlock(d);
     return 0;
 }
Index: 2008-04-15/xen/include/asm-x86/mm.h
===================================================================
--- 2008-04-15.orig/xen/include/asm-x86/mm.h 2008-04-15 08:48:06.000000000 +0200
+++ 2008-04-15/xen/include/asm-x86/mm.h 2008-04-15 15:16:03.000000000 +0200
@@ -82,9 +82,12 @@ struct page_info
  /* PAE only: is this an L2 page directory containing Xen-private mappings? */
 #define _PGT_pae_xen_l2     26
 #define PGT_pae_xen_l2      (1U<<_PGT_pae_xen_l2)
+ /* The page is currently locked for modification. */
+#define _PGT_locked         25
+#define PGT_locked          (1U<<_PGT_locked)
 
- /* 26-bit count of uses of this frame as its current type. */
-#define PGT_count_mask      ((1U<<26)-1)
+ /* 25-bit count of uses of this frame as its current type. */
+#define PGT_count_mask      ((1U<<25)-1)
 
 /* Cleared when the owning guest 'frees' this page. */
 #define _PGC_allocated      31
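The lock_page()/unlock_page() pair above only builds inside the hypervisor, so the following is a minimal, self-contained model of the same bit-lock protocol that can be compiled and run on its own. It is a sketch, not Xen code: GCC's __sync_val_compare_and_swap() stands in for Xen's cmpxchg(), the PGT_validated bit position is made up for the model, and the per-page type machinery is reduced to a single word; only the PGT_locked/PGT_count_mask layout mirrors the mm.h hunk.

#include <assert.h>
#include <stdio.h>

#define PGT_locked      (1UL << 25)          /* as in the mm.h hunk above   */
#define PGT_count_mask  ((1UL << 25) - 1)    /* as in the mm.h hunk above   */
#define PGT_validated   (1UL << 28)          /* position invented for model */

struct page_info { unsigned long type_info; };

static void lock_page(struct page_info *page)
{
    unsigned long x, y = page->type_info;

    do {
        /* Spin until the page is validated and not already locked. */
        while ( !(y & PGT_validated) || (y & PGT_locked) )
            y = page->type_info;
        x = y;
        /* Try to set the lock bit; retry if type_info changed under us. */
    } while ( (y = __sync_val_compare_and_swap(&page->type_info,
                                               x, x | PGT_locked)) != x );
}

static void unlock_page(struct page_info *page)
{
    unsigned long x, y = page->type_info;

    do {
        assert(y & PGT_locked);
        x = y;
        /* Clear the lock bit, again retrying on concurrent changes. */
    } while ( (y = __sync_val_compare_and_swap(&page->type_info,
                                               x, x & ~PGT_locked)) != x );
}

int main(void)
{
    struct page_info pg = { .type_info = PGT_validated | 1 /* one type ref */ };

    lock_page(&pg);
    /* ...read the old PTE, adjust refcounts, write the new PTE here... */
    unlock_page(&pg);
    printf("type_info = %#lx\n", pg.type_info);
    return 0;
}

Because the lock bit lives in the existing type_info word, no extra field is needed in struct page_info, and acquire and release each cost one locked cmpxchg.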
Keir Fraser
2008-Apr-22 13:30 UTC
Re: [Xen-devel] [PATCH] x86: remove use of per-domain lock from page table entry handling
On 22/4/08 14:01, "Jan Beulich" <jbeulich@novell.com> wrote:

> This is only a first step, the use of the domain lock in do_mmuext_op()
> and do_set_gdt() still needs looking at, as do the assertions of the
> lock being held in create_grant_{pte,va}_mapping().
>
> This change results in a 5% performance improvement for kernel builds
> on dual-socket quad-core systems (which is what I used for reference
> for both 32- and 64-bit). Along with that, the amount of time reported
> as spent in the kernel gets reduced by almost 25% (the fraction of time
> spent in the kernel is generally reported significantly higher under
> Xen than with a native kernel).

I'm pretty sure we don't need per-page locks.

 -- Keir
Jan Beulich
2008-Apr-22 13:39 UTC
Re: [Xen-devel] [PATCH] x86: remove use of per-domain lock from page table entry handling
>>> Keir Fraser <keir.fraser@eu.citrix.com> 22.04.08 15:30 >>>
>On 22/4/08 14:01, "Jan Beulich" <jbeulich@novell.com> wrote:
>
>> This is only a first step, the use of the domain lock in do_mmuext_op()
>> and do_set_gdt() still needs looking at, as do the assertions of the
>> lock being held in create_grant_{pte,va}_mapping().
>>
>> This change results in a 5% performance improvement for kernel builds
>> on dual-socket quad-core systems (which is what I used for reference
>> for both 32- and 64-bit). Along with that, the amount of time reported
>> as spent in the kernel gets reduced by almost 25% (the fraction of time
>> spent in the kernel is generally reported significantly higher under
>> Xen than with a native kernel).
>
>I'm pretty sure we don't need per-page locks.

If you want to make serialization by the caller part of the ABI - yes.
If not (and I think everything else would be wrong, as a malicious guest
could take advantage of that), then the window between reading of
the old entry and writing the new one must be protected against racing
updates, as otherwise the page type and reference counters could get
screwed up.

Jan
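To make the race described here concrete: suppose UPDATE_ENTRY() is a plain write, no lock is held anywhere, and two vCPUs update the same L1 slot, which currently holds entry O, to new entries N1 and N2. The interleaving below is an illustrative sketch only (O, N1 and N2 are hypothetical entries; the helper names are the ones used in mm.c):

/* vCPU A */ ol1e = *pl1e;                        /* reads O                   */
/* vCPU B */ ol1e = *pl1e;                        /* also reads O              */
/* vCPU A */ get_page_from_l1e(N1, FOREIGNDOM);   /* takes a reference for N1  */
/* vCPU A */ UPDATE_ENTRY(l1, pl1e, O, N1, ...);  /* slot now holds N1         */
/* vCPU A */ put_page_from_l1e(O, d);             /* drops O's reference       */
/* vCPU B */ get_page_from_l1e(N2, FOREIGNDOM);   /* takes a reference for N2  */
/* vCPU B */ UPDATE_ENTRY(l1, pl1e, O, N2, ...);  /* blindly overwrites N1     */
/* vCPU B */ put_page_from_l1e(O, d);             /* drops O's reference again */

O's type/reference counts end up decremented twice while the reference taken for N1 is never dropped - exactly the corruption being referred to. Either the per-page lock or a PTE write done as a cmpxchg that fails for vCPU B (because the slot no longer holds O) closes that window.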
Keir Fraser
2008-Apr-22 13:51 UTC
Re: [Xen-devel] [PATCH] x86: remove use of per-domain lock from page table entry handling
On 22/4/08 14:39, "Jan Beulich" <jbeulich@novell.com> wrote:

>> I'm pretty sure we don't need per-page locks.
>
> If you want to make serialization by the caller part of the ABI - yes.
> If not (and I think everything else would be wrong, as a malicious guest
> could take advantage of that), then the window between reading of
> the old entry and writing the new one must be protected against racing
> updates, as otherwise the page type and reference counters could get
> screwed up.

Ah, yes, my statement assumes we return to doing cmpxchg for all PTE
updates (and retry on failure). As to whether that is better than your
approach, I suppose it depends on whether you can amortise the locking
overheads across multiple PTE updates. Right now it appears you
lock/unlock on every PTE update, which is double the number of locked
cmpxchg operations compared with just doing the cmpxchg on the PTE
itself.

Bear in mind that all the page reference counting was designed in the
first place to be safe, assuming the PTE update itself is done as an
atomic read-modify-write operation. It's just that we changed the latter
to a simple write some time ago, because of course we observed that
that's a safe thing to do under the big lock.

 -- Keir
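A rough sketch of the alternative being described - doing the PTE write itself as a cmpxchg and retrying on failure, rather than taking a per-page lock around a plain write - might look like the following. It is illustrative only: uint64_t stands in for l1_pgentry_t, a GCC builtin stands in for Xen's cmpxchg(), and the reference-counting work is reduced to placeholder comments.

#include <stdint.h>
#include <stdbool.h>

typedef uint64_t pte_t;                     /* stand-in for l1_pgentry_t */

/* One update attempt: succeeds only if *pte still holds 'old'. */
static bool pte_cmpxchg(volatile pte_t *pte, pte_t old, pte_t new)
{
    return __sync_bool_compare_and_swap(pte, old, new);
}

/*
 * Retry loop replacing the big-lock/plain-write scheme: whenever the
 * cmpxchg loses a race, re-read the entry and redo the per-entry
 * bookkeeping against the fresh old value before trying again.
 */
static void set_pte(volatile pte_t *pte, pte_t new)
{
    pte_t old;

    for ( ; ; )
    {
        old = *pte;
        /* validate 'new' and take references for it (placeholder) */
        if ( pte_cmpxchg(pte, old, new) )
            break;
        /* lost the race: drop the references just taken, then retry (placeholder) */
    }
    /* success: drop the references that were held for 'old' (placeholder) */
}

On this scheme the locked operation happens once per successful update (plus once per retry), versus two per update for lock_page()/unlock_page() - the cost comparison made above - at the price of possibly having to redo the reference-counting work whenever a retry is needed.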