Up until now, p2m lookups (get_gfn*) in the x86 architecture were not
synchronized against concurrent updates. With the addition of the sharing
and paging subsystems, which can dynamically modify the p2m, the need for
synchronized lookups becomes more pressing. Without synchronized lookups,
we've encountered host crashes triggered by race conditions, particularly
when exercising the sharing subsystem.

In this patch series we make p2m lookups fully synchronized. We still use
a single per-domain p2m lock (profiling work, still to be done, may or may
not indicate the need for fine-grained locking).

We only enforce locking of p2m queries for hap-assisted domains. These are
the domains (in the case of EPT) that can exercise the paging and sharing
subsystems, and are thus most in need of synchronized lookups. While the
code *should* work for domains relying on shadow paging, it has not been
tested, hence we do not enable it for shadow domains at the moment.

Finally, the locking discipline in the PoD subsystem has been updated,
since PoD relied on some assumptions about the way the p2m lock is used.

Signed-off-by: Andres Lagar-Cavilla <andres@lagarcavilla.org>
Signed-off-by: Adin Scannell <adin@scannell.ca>

 xen/arch/x86/mm/hap/hap.c | 6 ++
 xen/arch/x86/mm/mm-locks.h | 40 ++++++++-----
 xen/arch/x86/mm/p2m.c | 21 ++++++-
 xen/include/asm-x86/p2m.h | 47 ++++++++++------
 xen/arch/x86/mm/mem_sharing.c | 2 -
 xen/arch/x86/mm/mm-locks.h | 4 +-
 xen/arch/x86/mm/p2m-ept.c | 33 +----------
 xen/arch/x86/mm/p2m-pod.c | 14 +++-
 xen/arch/x86/mm/p2m-pt.c | 45 ++------------
 xen/arch/x86/mm/p2m.c | 64 +++++++++++----------
 xen/arch/x86/mm/mm-locks.h | 10 +++
 xen/arch/x86/mm/p2m-pod.c | 112 +++++++++++++++++++++++---------------
 xen/arch/x86/mm/p2m-pt.c | 1 +
 xen/arch/x86/mm/p2m.c | 8 ++-
 xen/include/asm-x86/p2m.h | 27 ++------
 xen/arch/x86/hvm/emulate.c | 2 +-
 xen/arch/x86/hvm/hvm.c | 25 +++++--
 xen/arch/x86/mm.c | 10 +-
 xen/arch/x86/mm/guest_walk.c | 2 +-
 xen/arch/x86/mm/hap/guest_walk.c | 6 +-
 xen/arch/x86/mm/mem_access.c | 10 +++
 xen/arch/x86/mm/mem_event.c | 5 +
 xen/arch/x86/mm/p2m.c | 52 ++++++++++--------
 xen/common/grant_table.c | 2 +-
 xen/common/memory.c | 2 +-
 xen/include/asm-x86/mem_access.h | 1 +
 xen/include/asm-x86/mem_event.h | 3 +
 xen/include/asm-x86/p2m.h | 6 +-
 xen/arch/x86/mm/p2m.c | 4 +-
 29 files changed, 312 insertions(+), 252 deletions(-)
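As an illustration of the caller discipline this series establishes — every
get_gfn* lookup must eventually be paired with a put_gfn, because on hap
domains the lookup now takes the p2m lock and holds it until the put — the
sketch below shows the expected shape of a caller. It is not code from the
series; the wrapper function is hypothetical and only the get_gfn/put_gfn
accessors are real:

/* Illustrative sketch: get_gfn/put_gfn bracketing as required of callers.
 * "inspect_guest_frame" is a made-up caller, not code from the series. */
static int inspect_guest_frame(struct domain *d, unsigned long gfn)
{
    p2m_type_t t;
    mfn_t mfn;
    int rc = 0;

    /* Takes the (recursive) p2m lock for hap domains and holds it */
    mfn = get_gfn(d, gfn, &t);

    if ( !mfn_valid(mfn) )
        rc = -EINVAL;
    else if ( p2m_is_paging(t) )
        rc = -ENOENT;   /* frame is paged out; caller must retry later */
    /* ... otherwise the mfn/type are stable until the put_gfn below ... */

    /* Releases the p2m lock; forgetting this locks up the hypervisor */
    put_gfn(d, gfn);
    return rc;
}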
Andres Lagar-Cavilla
2012-Feb-01 19:56 UTC
[PATCH 1 of 5] Make p2m lookups fully synchronized wrt modifications
xen/arch/x86/mm/hap/hap.c | 6 +++++ xen/arch/x86/mm/mm-locks.h | 40 ++++++++++++++++++++++++-------------- xen/arch/x86/mm/p2m.c | 21 ++++++++++++++++++- xen/include/asm-x86/p2m.h | 47 ++++++++++++++++++++++++++++----------------- 4 files changed, 79 insertions(+), 35 deletions(-) We achieve this by locking/unlocking the global p2m_lock in get/put_gfn. The lock is always taken recursively, as there are many paths that do get_gfn, and later, another attempt at grabbing the p2m_lock The lock is not taken for shadow lookups, as the testing needed to rule out the possibility of locking inversions and deadlock has not yet been carried out for shadow-paging. This is tolerable as long as shadows do not gain support for paging or sharing. HAP (EPT) lookups and all modifications do take the lock. Signed-off-by: Andres Lagar-Cavilla <andres@lagarcavilla.org> diff -r 9a55109e4d7e -r 223512f9fb5b xen/arch/x86/mm/hap/hap.c --- a/xen/arch/x86/mm/hap/hap.c +++ b/xen/arch/x86/mm/hap/hap.c @@ -786,7 +786,12 @@ hap_paging_get_mode(struct vcpu *v) static void hap_update_paging_modes(struct vcpu *v) { struct domain *d = v->domain; + unsigned long cr3_gfn = v->arch.hvm_vcpu.guest_cr[3]; + p2m_type_t t; + /* We hold onto the cr3 as it may be modified later, and + * we need to respect lock ordering */ + (void)get_gfn(d, cr3_gfn, &t); paging_lock(d); v->arch.paging.mode = hap_paging_get_mode(v); @@ -803,6 +808,7 @@ static void hap_update_paging_modes(stru hap_update_cr3(v, 0); paging_unlock(d); + put_gfn(d, cr3_gfn); } #if CONFIG_PAGING_LEVELS == 3 diff -r 9a55109e4d7e -r 223512f9fb5b xen/arch/x86/mm/mm-locks.h --- a/xen/arch/x86/mm/mm-locks.h +++ b/xen/arch/x86/mm/mm-locks.h @@ -144,6 +144,31 @@ static inline void mm_enforce_order_unlo * * ************************************************************************/ +declare_mm_lock(nestedp2m) +#define nestedp2m_lock(d) mm_lock(nestedp2m, &(d)->arch.nested_p2m_lock) +#define nestedp2m_unlock(d) mm_unlock(&(d)->arch.nested_p2m_lock) + +/* P2M lock (per-p2m-table) + * + * This protects all queries and updates to the p2m table. + * + * A note about ordering: + * The order established here is enforced on all mutations of a p2m. + * For lookups, the order established here is enforced only for hap + * domains (1. shadow domains present a few nasty inversions; + * 2. shadow domains do not support paging and sharing, + * the main sources of dynamic p2m mutations) + * + * The lock is recursive as it is common for a code path to look up a gfn + * and later mutate it. + */ + +declare_mm_lock(p2m) +#define p2m_lock(p) mm_lock_recursive(p2m, &(p)->lock) +#define p2m_lock_recursive(p) mm_lock_recursive(p2m, &(p)->lock) +#define p2m_unlock(p) mm_unlock(&(p)->lock) +#define p2m_locked_by_me(p) mm_locked_by_me(&(p)->lock) + /* Sharing per page lock * * This is an external lock, not represented by an mm_lock_t. The memory @@ -167,21 +192,6 @@ declare_mm_order_constraint(per_page_sha * - setting the "cr3" field of any p2m table to a non-CR3_EADDR value. * (i.e. assigning a p2m table to be the shadow of that cr3 */ -declare_mm_lock(nestedp2m) -#define nestedp2m_lock(d) mm_lock(nestedp2m, &(d)->arch.nested_p2m_lock) -#define nestedp2m_unlock(d) mm_unlock(&(d)->arch.nested_p2m_lock) - -/* P2M lock (per-p2m-table) - * - * This protects all updates to the p2m table. Updates are expected to - * be safe against concurrent reads, which do *not* require the lock. 
*/ - -declare_mm_lock(p2m) -#define p2m_lock(p) mm_lock(p2m, &(p)->lock) -#define p2m_lock_recursive(p) mm_lock_recursive(p2m, &(p)->lock) -#define p2m_unlock(p) mm_unlock(&(p)->lock) -#define p2m_locked_by_me(p) mm_locked_by_me(&(p)->lock) - /* Page alloc lock (per-domain) * * This is an external lock, not represented by an mm_lock_t. However, diff -r 9a55109e4d7e -r 223512f9fb5b xen/arch/x86/mm/p2m.c --- a/xen/arch/x86/mm/p2m.c +++ b/xen/arch/x86/mm/p2m.c @@ -144,9 +144,9 @@ void p2m_change_entry_type_global(struct p2m_unlock(p2m); } -mfn_t get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn, +mfn_t __get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn, p2m_type_t *t, p2m_access_t *a, p2m_query_t q, - unsigned int *page_order) + unsigned int *page_order, bool_t locked) { mfn_t mfn; @@ -158,6 +158,11 @@ mfn_t get_gfn_type_access(struct p2m_dom return _mfn(gfn); } + /* For now only perform locking on hap domains */ + if ( locked && (hap_enabled(p2m->domain)) ) + /* Grab the lock here, don''t release until put_gfn */ + p2m_lock(p2m); + mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order); #ifdef __x86_64__ @@ -182,6 +187,18 @@ mfn_t get_gfn_type_access(struct p2m_dom return mfn; } +void __put_gfn(struct p2m_domain *p2m, unsigned long gfn) +{ + if ( !p2m || !paging_mode_translate(p2m->domain) + || !hap_enabled(p2m->domain) ) + /* Nothing to do in this case */ + return; + + ASSERT(p2m_locked_by_me(p2m)); + + p2m_unlock(p2m); +} + int set_p2m_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn, unsigned int page_order, p2m_type_t p2mt, p2m_access_t p2ma) { diff -r 9a55109e4d7e -r 223512f9fb5b xen/include/asm-x86/p2m.h --- a/xen/include/asm-x86/p2m.h +++ b/xen/include/asm-x86/p2m.h @@ -309,9 +309,14 @@ struct p2m_domain *p2m_get_p2m(struct vc #define p2m_get_pagetable(p2m) ((p2m)->phys_table) -/**** p2m query accessors. After calling any of the variants below, you - * need to call put_gfn(domain, gfn). If you don''t, you''ll lock the - * hypervisor. ****/ +/**** p2m query accessors. They lock p2m_lock, and thus serialize + * lookups wrt modifications. They _do not_ release the lock on exit. + * After calling any of the variants below, caller needs to use + * put_gfn. ****/ + +mfn_t __get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn, + p2m_type_t *t, p2m_access_t *a, p2m_query_t q, + unsigned int *page_order, bool_t locked); /* Read a particular P2M table, mapping pages as we go. Most callers * should _not_ call this directly; use the other get_gfn* functions @@ -320,9 +325,8 @@ struct p2m_domain *p2m_get_p2m(struct vc * If the lookup succeeds, the return value is != INVALID_MFN and * *page_order is filled in with the order of the superpage (if any) that * the entry was found in. */ -mfn_t get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn, - p2m_type_t *t, p2m_access_t *a, p2m_query_t q, - unsigned int *page_order); +#define get_gfn_type_access(p, g, t, a, q, o) \ + __get_gfn_type_access((p), (g), (t), (a), (q), (o), 1) /* General conversion function from gfn to mfn */ static inline mfn_t get_gfn_type(struct domain *d, @@ -353,21 +357,28 @@ static inline unsigned long get_gfn_unty return INVALID_MFN; } -/* This is a noop for now. */ -static inline void __put_gfn(struct p2m_domain *p2m, unsigned long gfn) -{ -} +/* Will release the p2m_lock for this gfn entry. */ +void __put_gfn(struct p2m_domain *p2m, unsigned long gfn); #define put_gfn(d, gfn) __put_gfn(p2m_get_hostp2m((d)), (gfn)) -/* These are identical for now. 
The intent is to have the caller not worry - * about put_gfn. To only be used in printk''s, crash situations, or to - * peek at a type. You''re not holding the p2m entry exclsively after calling - * this. */ -#define get_gfn_unlocked(d, g, t) get_gfn_type((d), (g), (t), p2m_alloc) -#define get_gfn_query_unlocked(d, g, t) get_gfn_type((d), (g), (t), p2m_query) -#define get_gfn_guest_unlocked(d, g, t) get_gfn_type((d), (g), (t), p2m_guest) -#define get_gfn_unshare_unlocked(d, g, t) get_gfn_type((d), (g), (t), p2m_unshare) +/* The intent of the "unlocked" accessor is to have the caller not worry about + * put_gfn. They apply to very specific situations: debug printk''s, dumps + * during a domain crash, or to peek at a p2m entry/type. Caller is not + * holding the p2m entry exclusively during or after calling this. + * + * Note that an unlocked accessor only makes sense for a "query" lookup. + * Any other type of query can cause a change in the p2m and may need to + * perform locking. + */ +static inline mfn_t get_gfn_query_unlocked(struct domain *d, + unsigned long gfn, + p2m_type_t *t) +{ + p2m_access_t a; + return __get_gfn_type_access(p2m_get_hostp2m(d), gfn, t, &a, + p2m_query, NULL, 0); +} /* General conversion function from mfn to gfn */ static inline unsigned long mfn_to_gfn(struct domain *d, mfn_t mfn)
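A minimal sketch of the recursion the commit message above describes — a
path that does get_gfn and later grabs the p2m_lock again — is shown below.
The wrapper function is hypothetical and not part of the patch;
p2m_change_type() merely stands in for any operation that internally takes
the p2m lock:

/* Illustrative sketch only (hypothetical caller). The outer get_gfn()
 * takes the p2m lock; p2m_change_type() takes it again internally,
 * which is safe because the lock is declared recursive. */
static void make_gfn_writable(struct domain *d, unsigned long gfn)
{
    p2m_type_t t;

    (void)get_gfn(d, gfn, &t);      /* first acquisition of the p2m lock */

    if ( t == p2m_ram_ro )
        (void)p2m_change_type(d, gfn, p2m_ram_ro, p2m_ram_rw); /* second */

    put_gfn(d, gfn);                /* releases the lock taken by get_gfn() */
}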
Andres Lagar-Cavilla
2012-Feb-01 19:56 UTC
[PATCH 2 of 5] Clean up locking now that p2m lookups are fully synchronized
xen/arch/x86/mm/mem_sharing.c | 2 - xen/arch/x86/mm/mm-locks.h | 4 ++- xen/arch/x86/mm/p2m-ept.c | 33 ++------------------- xen/arch/x86/mm/p2m-pod.c | 14 ++++++--- xen/arch/x86/mm/p2m-pt.c | 45 +++++------------------------ xen/arch/x86/mm/p2m.c | 64 ++++++++++++++++++++++-------------------- 6 files changed, 58 insertions(+), 104 deletions(-) With p2m lookups fully synchronized, many routines need not call p2m_lock any longer. Also, many routines can logically assert holding the p2m for a specific gfn. Signed-off-by: Andres Lagar-Cavilla <andres@lagarcavilla.org> diff -r 223512f9fb5b -r fb9c06df05f2 xen/arch/x86/mm/mem_sharing.c --- a/xen/arch/x86/mm/mem_sharing.c +++ b/xen/arch/x86/mm/mem_sharing.c @@ -882,9 +882,7 @@ int mem_sharing_add_to_physmap(struct do goto err_unlock; } - p2m_lock(p2m); ret = set_p2m_entry(p2m, cgfn, smfn, PAGE_ORDER_4K, p2m_ram_shared, a); - p2m_unlock(p2m); /* Tempted to turn this into an assert */ if ( !ret ) diff -r 223512f9fb5b -r fb9c06df05f2 xen/arch/x86/mm/mm-locks.h --- a/xen/arch/x86/mm/mm-locks.h +++ b/xen/arch/x86/mm/mm-locks.h @@ -165,9 +165,11 @@ declare_mm_lock(nestedp2m) declare_mm_lock(p2m) #define p2m_lock(p) mm_lock_recursive(p2m, &(p)->lock) -#define p2m_lock_recursive(p) mm_lock_recursive(p2m, &(p)->lock) +#define gfn_lock(p,g,o) mm_lock_recursive(p2m, &(p)->lock) #define p2m_unlock(p) mm_unlock(&(p)->lock) +#define gfn_unlock(p,g,o) mm_unlock(&(p)->lock) #define p2m_locked_by_me(p) mm_locked_by_me(&(p)->lock) +#define gfn_locked_by_me(p,g) mm_locked_by_me(&(p)->lock) /* Sharing per page lock * diff -r 223512f9fb5b -r fb9c06df05f2 xen/arch/x86/mm/p2m-ept.c --- a/xen/arch/x86/mm/p2m-ept.c +++ b/xen/arch/x86/mm/p2m-ept.c @@ -46,31 +46,6 @@ static inline bool_t is_epte_valid(ept_e return (e->epte != 0 && e->sa_p2mt != p2m_invalid); } -/* Non-ept "lock-and-check" wrapper */ -static int ept_pod_check_and_populate(struct p2m_domain *p2m, unsigned long gfn, - ept_entry_t *entry, int order, - p2m_query_t q) -{ - int r; - - /* This is called from the p2m lookups, which can happen with or - * without the lock hed. 
*/ - p2m_lock_recursive(p2m); - - /* Check to make sure this is still PoD */ - if ( entry->sa_p2mt != p2m_populate_on_demand ) - { - p2m_unlock(p2m); - return 0; - } - - r = p2m_pod_demand_populate(p2m, gfn, order, q); - - p2m_unlock(p2m); - - return r; -} - static void ept_p2m_type_to_flags(ept_entry_t *entry, p2m_type_t type, p2m_access_t access) { /* First apply type permissions */ @@ -551,8 +526,8 @@ static mfn_t ept_get_entry(struct p2m_do index = gfn_remainder >> ( i * EPT_TABLE_ORDER); ept_entry = table + index; - if ( !ept_pod_check_and_populate(p2m, gfn, - ept_entry, 9, q) ) + if ( !p2m_pod_demand_populate(p2m, gfn, + PAGE_ORDER_2M, q) ) goto retry; else goto out; @@ -574,8 +549,8 @@ static mfn_t ept_get_entry(struct p2m_do ASSERT(i == 0); - if ( ept_pod_check_and_populate(p2m, gfn, - ept_entry, 0, q) ) + if ( p2m_pod_demand_populate(p2m, gfn, + PAGE_ORDER_4K, q) ) goto out; } diff -r 223512f9fb5b -r fb9c06df05f2 xen/arch/x86/mm/p2m-pod.c --- a/xen/arch/x86/mm/p2m-pod.c +++ b/xen/arch/x86/mm/p2m-pod.c @@ -521,7 +521,7 @@ p2m_pod_decrease_reservation(struct doma /* Figure out if we need to steal some freed memory for our cache */ steal_for_cache = ( p2m->pod.entry_count > p2m->pod.count ); - p2m_lock(p2m); + gfn_lock(p2m, gpfn, order); if ( unlikely(d->is_dying) ) goto out_unlock; @@ -615,7 +615,7 @@ out_entry_check: } out_unlock: - p2m_unlock(p2m); + gfn_unlock(p2m, gpfn, order); out: return ret; @@ -964,7 +964,7 @@ p2m_pod_demand_populate(struct p2m_domai mfn_t mfn; int i; - ASSERT(p2m_locked_by_me(p2m)); + ASSERT(gfn_locked_by_me(p2m, gfn)); /* This check is done with the p2m lock held. This will make sure that * even if d->is_dying changes under our feet, p2m_pod_empty_cache() @@ -972,6 +972,7 @@ p2m_pod_demand_populate(struct p2m_domai if ( unlikely(d->is_dying) ) goto out_fail; + /* Because PoD does not have cache list for 1GB pages, it has to remap * 1GB region to 2MB chunks for a retry. */ if ( order == PAGE_ORDER_1G ) @@ -981,6 +982,9 @@ p2m_pod_demand_populate(struct p2m_domai * split 1GB into 512 2MB pages here. But We only do once here because * set_p2m_entry() should automatically shatter the 1GB page into * 512 2MB pages. The rest of 511 calls are unnecessary. + * + * NOTE: In a fine-grained p2m locking scenario this operation + * may need to promote its locking from gfn->1g superpage */ set_p2m_entry(p2m, gfn_aligned, _mfn(0), PAGE_ORDER_2M, p2m_populate_on_demand, p2m->default_access); @@ -1104,7 +1108,7 @@ guest_physmap_mark_populate_on_demand(st if ( rc != 0 ) return rc; - p2m_lock(p2m); + gfn_lock(p2m, gfn, order); P2M_DEBUG("mark pod gfn=%#lx\n", gfn); @@ -1138,7 +1142,7 @@ guest_physmap_mark_populate_on_demand(st BUG_ON(p2m->pod.entry_count < 0); } - p2m_unlock(p2m); + gfn_unlock(p2m, gfn, order); out: return rc; diff -r 223512f9fb5b -r fb9c06df05f2 xen/arch/x86/mm/p2m-pt.c --- a/xen/arch/x86/mm/p2m-pt.c +++ b/xen/arch/x86/mm/p2m-pt.c @@ -473,31 +473,6 @@ out: } -/* Non-ept "lock-and-check" wrapper */ -static int p2m_pod_check_and_populate(struct p2m_domain *p2m, unsigned long gfn, - l1_pgentry_t *p2m_entry, int order, - p2m_query_t q) -{ - int r; - - /* This is called from the p2m lookups, which can happen with or - * without the lock hed. 
*/ - p2m_lock_recursive(p2m); - - /* Check to make sure this is still PoD */ - if ( p2m_flags_to_type(l1e_get_flags(*p2m_entry)) != p2m_populate_on_demand ) - { - p2m_unlock(p2m); - return 0; - } - - r = p2m_pod_demand_populate(p2m, gfn, order, q); - - p2m_unlock(p2m); - - return r; -} - /* Read the current domain''s p2m table (through the linear mapping). */ static mfn_t p2m_gfn_to_mfn_current(struct p2m_domain *p2m, unsigned long gfn, p2m_type_t *t, @@ -540,8 +515,7 @@ pod_retry_l3: /* The read has succeeded, so we know that mapping exists */ if ( q != p2m_query ) { - if ( !p2m_pod_check_and_populate(p2m, gfn, - (l1_pgentry_t *) &l3e, PAGE_ORDER_1G, q) ) + if ( !p2m_pod_demand_populate(p2m, gfn, PAGE_ORDER_1G, q) ) goto pod_retry_l3; p2mt = p2m_invalid; gdprintk(XENLOG_ERR, "%s: Allocate 1GB failed!\n", __func__); @@ -593,8 +567,8 @@ pod_retry_l2: * exits at this point. */ if ( q != p2m_query ) { - if ( !p2m_pod_check_and_populate(p2m, gfn, - p2m_entry, 9, q) ) + if ( !p2m_pod_demand_populate(p2m, gfn, + PAGE_ORDER_2M, q) ) goto pod_retry_l2; /* Allocate failed. */ @@ -651,8 +625,8 @@ pod_retry_l1: * exits at this point. */ if ( q != p2m_query ) { - if ( !p2m_pod_check_and_populate(p2m, gfn, - (l1_pgentry_t *)p2m_entry, 0, q) ) + if ( !p2m_pod_demand_populate(p2m, gfn, + PAGE_ORDER_4K, q) ) goto pod_retry_l1; /* Allocate failed. */ @@ -742,8 +716,7 @@ pod_retry_l3: { if ( q != p2m_query ) { - if ( !p2m_pod_check_and_populate(p2m, gfn, - (l1_pgentry_t *) l3e, PAGE_ORDER_1G, q) ) + if ( !p2m_pod_demand_populate(p2m, gfn, PAGE_ORDER_1G, q) ) goto pod_retry_l3; gdprintk(XENLOG_ERR, "%s: Allocate 1GB failed!\n", __func__); } @@ -781,8 +754,7 @@ pod_retry_l2: if ( p2m_flags_to_type(l2e_get_flags(*l2e)) == p2m_populate_on_demand ) { if ( q != p2m_query ) { - if ( !p2m_pod_check_and_populate(p2m, gfn, - (l1_pgentry_t *)l2e, PAGE_ORDER_2M, q) ) + if ( !p2m_pod_demand_populate(p2m, gfn, PAGE_ORDER_2M, q) ) goto pod_retry_l2; } else *t = p2m_populate_on_demand; @@ -815,8 +787,7 @@ pod_retry_l1: if ( p2m_flags_to_type(l1e_get_flags(*l1e)) == p2m_populate_on_demand ) { if ( q != p2m_query ) { - if ( !p2m_pod_check_and_populate(p2m, gfn, - (l1_pgentry_t *)l1e, PAGE_ORDER_4K, q) ) + if ( !p2m_pod_demand_populate(p2m, gfn, PAGE_ORDER_4K, q) ) goto pod_retry_l1; } else *t = p2m_populate_on_demand; diff -r 223512f9fb5b -r fb9c06df05f2 xen/arch/x86/mm/p2m.c --- a/xen/arch/x86/mm/p2m.c +++ b/xen/arch/x86/mm/p2m.c @@ -161,7 +161,7 @@ mfn_t __get_gfn_type_access(struct p2m_d /* For now only perform locking on hap domains */ if ( locked && (hap_enabled(p2m->domain)) ) /* Grab the lock here, don''t release until put_gfn */ - p2m_lock(p2m); + gfn_lock(p2m, gfn, 0); mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order); @@ -194,9 +194,9 @@ void __put_gfn(struct p2m_domain *p2m, u /* Nothing to do in this case */ return; - ASSERT(p2m_locked_by_me(p2m)); + ASSERT(gfn_locked_by_me(p2m, gfn)); - p2m_unlock(p2m); + gfn_unlock(p2m, gfn, 0); } int set_p2m_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn, @@ -207,7 +207,7 @@ int set_p2m_entry(struct p2m_domain *p2m unsigned int order; int rc = 1; - ASSERT(p2m_locked_by_me(p2m)); + ASSERT(gfn_locked_by_me(p2m, gfn)); while ( todo ) { @@ -429,6 +429,7 @@ p2m_remove_page(struct p2m_domain *p2m, return; } + ASSERT(gfn_locked_by_me(p2m, gfn)); P2M_DEBUG("removing gfn=%#lx mfn=%#lx\n", gfn, mfn); if ( mfn_valid(_mfn(mfn)) ) @@ -449,9 +450,9 @@ guest_physmap_remove_page(struct domain unsigned long mfn, unsigned int page_order) { struct p2m_domain *p2m = 
p2m_get_hostp2m(d); - p2m_lock(p2m); + gfn_lock(p2m, gfn, page_order); p2m_remove_page(p2m, gfn, mfn, page_order); - p2m_unlock(p2m); + gfn_unlock(p2m, gfn, page_order); } int @@ -609,13 +610,13 @@ p2m_type_t p2m_change_type(struct domain BUG_ON(p2m_is_grant(ot) || p2m_is_grant(nt)); - p2m_lock(p2m); + gfn_lock(p2m, gfn, 0); mfn = p2m->get_entry(p2m, gfn, &pt, &a, p2m_query, NULL); if ( pt == ot ) set_p2m_entry(p2m, gfn, mfn, PAGE_ORDER_4K, nt, p2m->default_access); - p2m_unlock(p2m); + gfn_unlock(p2m, gfn, 0); return pt; } @@ -664,7 +665,7 @@ set_mmio_p2m_entry(struct domain *d, uns if ( !paging_mode_translate(d) ) return 0; - p2m_lock(p2m); + gfn_lock(p2m, gfn, 0); omfn = p2m->get_entry(p2m, gfn, &ot, &a, p2m_query, NULL); if ( p2m_is_grant(ot) ) { @@ -680,7 +681,7 @@ set_mmio_p2m_entry(struct domain *d, uns P2M_DEBUG("set mmio %lx %lx\n", gfn, mfn_x(mfn)); rc = set_p2m_entry(p2m, gfn, mfn, PAGE_ORDER_4K, p2m_mmio_direct, p2m->default_access); - p2m_unlock(p2m); + gfn_unlock(p2m, gfn, 0); if ( 0 == rc ) gdprintk(XENLOG_ERR, "set_mmio_p2m_entry: set_p2m_entry failed! mfn=%08lx\n", @@ -700,7 +701,7 @@ clear_mmio_p2m_entry(struct domain *d, u if ( !paging_mode_translate(d) ) return 0; - p2m_lock(p2m); + gfn_lock(p2m, gfn, 0); mfn = p2m->get_entry(p2m, gfn, &t, &a, p2m_query, NULL); /* Do not use mfn_valid() here as it will usually fail for MMIO pages. */ @@ -713,7 +714,7 @@ clear_mmio_p2m_entry(struct domain *d, u rc = set_p2m_entry(p2m, gfn, _mfn(INVALID_MFN), PAGE_ORDER_4K, p2m_invalid, p2m->default_access); out: - p2m_unlock(p2m); + gfn_unlock(p2m, gfn, 0); return rc; } @@ -730,7 +731,7 @@ set_shared_p2m_entry(struct domain *d, u if ( !paging_mode_translate(p2m->domain) ) return 0; - p2m_lock(p2m); + gfn_lock(p2m, gfn, 0); omfn = p2m->get_entry(p2m, gfn, &ot, &a, p2m_query, NULL); /* At the moment we only allow p2m change if gfn has already been made * sharable first */ @@ -742,7 +743,7 @@ set_shared_p2m_entry(struct domain *d, u P2M_DEBUG("set shared %lx %lx\n", gfn, mfn_x(mfn)); rc = set_p2m_entry(p2m, gfn, mfn, PAGE_ORDER_4K, p2m_ram_shared, p2m->default_access); - p2m_unlock(p2m); + gfn_unlock(p2m, gfn, 0); if ( 0 == rc ) gdprintk(XENLOG_ERR, "set_shared_p2m_entry: set_p2m_entry failed! 
mfn=%08lx\n", @@ -778,7 +779,7 @@ int p2m_mem_paging_nominate(struct domai mfn_t mfn; int ret; - p2m_lock(p2m); + gfn_lock(p2m, gfn, 0); mfn = p2m->get_entry(p2m, gfn, &p2mt, &a, p2m_query, NULL); @@ -810,7 +811,7 @@ int p2m_mem_paging_nominate(struct domai ret = 0; out: - p2m_unlock(p2m); + gfn_unlock(p2m, gfn, 0); return ret; } @@ -842,7 +843,7 @@ int p2m_mem_paging_evict(struct domain * struct p2m_domain *p2m = p2m_get_hostp2m(d); int ret = -EINVAL; - p2m_lock(p2m); + gfn_lock(p2m, gfn, 0); /* Get mfn */ mfn = p2m->get_entry(p2m, gfn, &p2mt, &a, p2m_query, NULL); @@ -887,7 +888,7 @@ int p2m_mem_paging_evict(struct domain * put_page(page); out: - p2m_unlock(p2m); + gfn_unlock(p2m, gfn, 0); return ret; } @@ -972,7 +973,7 @@ void p2m_mem_paging_populate(struct doma req.type = MEM_EVENT_TYPE_PAGING; /* Fix p2m mapping */ - p2m_lock(p2m); + gfn_lock(p2m, gfn, 0); mfn = p2m->get_entry(p2m, gfn, &p2mt, &a, p2m_query, NULL); /* Allow only nominated or evicted pages to enter page-in path */ if ( p2mt == p2m_ram_paging_out || p2mt == p2m_ram_paged ) @@ -983,7 +984,7 @@ void p2m_mem_paging_populate(struct doma set_p2m_entry(p2m, gfn, mfn, PAGE_ORDER_4K, p2m_ram_paging_in, a); } - p2m_unlock(p2m); + gfn_unlock(p2m, gfn, 0); /* Pause domain if request came from guest and gfn has paging type */ if ( p2m_is_paging(p2mt) && v->domain == d ) @@ -1034,7 +1035,7 @@ int p2m_mem_paging_prep(struct domain *d (!access_ok(user_ptr, PAGE_SIZE)) ) return -EINVAL; - p2m_lock(p2m); + gfn_lock(p2m, gfn, 0); mfn = p2m->get_entry(p2m, gfn, &p2mt, &a, p2m_query, NULL); @@ -1093,7 +1094,7 @@ int p2m_mem_paging_prep(struct domain *d ret = 0; out: - p2m_unlock(p2m); + gfn_unlock(p2m, gfn, 0); return ret; } @@ -1128,7 +1129,7 @@ void p2m_mem_paging_resume(struct domain /* Fix p2m entry if the page was not dropped */ if ( !(rsp.flags & MEM_EVENT_FLAG_DROP_PAGE) ) { - p2m_lock(p2m); + gfn_lock(p2m, rsp.gfn, 0); mfn = p2m->get_entry(p2m, rsp.gfn, &p2mt, &a, p2m_query, NULL); /* Allow only pages which were prepared properly, or pages which * were nominated but not evicted */ @@ -1139,7 +1140,7 @@ void p2m_mem_paging_resume(struct domain p2m_ram_rw, a); set_gpfn_from_mfn(mfn_x(mfn), rsp.gfn); } - p2m_unlock(p2m); + gfn_unlock(p2m, rsp.gfn, 0); } /* Unpause domain */ if ( rsp.flags & MEM_EVENT_FLAG_VCPU_PAUSED ) @@ -1160,13 +1161,13 @@ bool_t p2m_mem_access_check(unsigned lon p2m_access_t p2ma; /* First, handle rx2rw conversion automatically */ - p2m_lock(p2m); + gfn_lock(p2m, gfn, 0); mfn = p2m->get_entry(p2m, gfn, &p2mt, &p2ma, p2m_query, NULL); if ( access_w && p2ma == p2m_access_rx2rw ) { p2m->set_entry(p2m, gfn, mfn, PAGE_ORDER_4K, p2mt, p2m_access_rw); - p2m_unlock(p2m); + gfn_unlock(p2m, gfn, 0); return 1; } else if ( p2ma == p2m_access_n2rwx ) @@ -1174,7 +1175,7 @@ bool_t p2m_mem_access_check(unsigned lon ASSERT(access_w || access_r || access_x); p2m->set_entry(p2m, gfn, mfn, PAGE_ORDER_4K, p2mt, p2m_access_rwx); } - p2m_unlock(p2m); + gfn_unlock(p2m, gfn, 0); /* Otherwise, check if there is a memory event listener, and send the message along */ if ( mem_event_claim_slot(d, &d->mem_event->access) == -ENOSYS ) @@ -1193,9 +1194,9 @@ bool_t p2m_mem_access_check(unsigned lon if ( p2ma != p2m_access_n2rwx ) { /* A listener is not required, so clear the access restrictions */ - p2m_lock(p2m); + gfn_lock(p2m, gfn, 0); p2m->set_entry(p2m, gfn, mfn, PAGE_ORDER_4K, p2mt, p2m_access_rwx); - p2m_unlock(p2m); + gfn_unlock(p2m, gfn, 0); } return 1; } @@ -1326,7 +1327,10 @@ int p2m_get_mem_access(struct domain *d, return 0; } + 
gfn_lock(p2m, gfn, 0); mfn = p2m->get_entry(p2m, pfn, &t, &a, p2m_query, NULL); + gfn_unlock(p2m, gfn, 0); + if ( mfn_x(mfn) == INVALID_MFN ) return -ESRCH; @@ -1445,7 +1449,7 @@ p2m_get_nestedp2m(struct vcpu *v, uint64 nestedp2m_unlock(d); return p2m; } - p2m_unlock(p2m); + gfn_unlock(p2m, gfn, 0); } /* All p2m''s are or were in use. Take the least recent used one,
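The pattern this patch converges on for mutation paths is sketched below:
the path names the gfn it locks via gfn_lock/gfn_unlock (which currently
still map onto the single p2m lock). The wrapper function is hypothetical;
the accessors and set_p2m_entry() are the real ones used throughout the
patch:

/* Illustrative sketch: the shape of a p2m mutation path after this patch.
 * Naming the gfn keeps the code ready for fine-grained locking later. */
static int change_one_entry(struct p2m_domain *p2m, unsigned long gfn,
                            mfn_t mfn, p2m_type_t nt)
{
    p2m_access_t a;
    p2m_type_t ot;
    int rc;

    gfn_lock(p2m, gfn, 0);
    (void)p2m->get_entry(p2m, gfn, &ot, &a, p2m_query, NULL);
    rc = set_p2m_entry(p2m, gfn, mfn, PAGE_ORDER_4K, nt, p2m->default_access);
    gfn_unlock(p2m, gfn, 0);

    return rc;
}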
xen/arch/x86/mm/mm-locks.h | 10 ++++ xen/arch/x86/mm/p2m-pod.c | 112 ++++++++++++++++++++++++++------------------ xen/arch/x86/mm/p2m-pt.c | 1 + xen/arch/x86/mm/p2m.c | 8 ++- xen/include/asm-x86/p2m.h | 27 +++------- 5 files changed, 93 insertions(+), 65 deletions(-) The PoD layer has a comples locking discipline. It relies on the p2m being globally locked, and it also relies on the page alloc lock to protect some of its data structures. Replace this all by an explicit pod lock: per p2m, order enforced. Three consequences: - Critical sections in the pod code protected by the page alloc lock are now reduced to modifications of the domain page list. - When the p2m lock becomes fine-grained, there are no assumptions broken in the PoD layer. - The locking is easier to understand. Signed-off-by: Andres Lagar-Cavilla <andres@lagarcavilla.org> diff -r fb9c06df05f2 -r 56ceab0118cb xen/arch/x86/mm/mm-locks.h --- a/xen/arch/x86/mm/mm-locks.h +++ b/xen/arch/x86/mm/mm-locks.h @@ -194,6 +194,16 @@ declare_mm_order_constraint(per_page_sha * - setting the "cr3" field of any p2m table to a non-CR3_EADDR value. * (i.e. assigning a p2m table to be the shadow of that cr3 */ +/* PoD lock (per-p2m-table) + * + * Protects private PoD data structs: entry and cache + * counts, page lists, sweep parameters. */ + +declare_mm_lock(pod) +#define pod_lock(p) mm_lock(pod, &(p)->pod.lock) +#define pod_unlock(p) mm_unlock(&(p)->pod.lock) +#define pod_locked_by_me(p) mm_locked_by_me(&(p)->pod.lock) + /* Page alloc lock (per-domain) * * This is an external lock, not represented by an mm_lock_t. However, diff -r fb9c06df05f2 -r 56ceab0118cb xen/arch/x86/mm/p2m-pod.c --- a/xen/arch/x86/mm/p2m-pod.c +++ b/xen/arch/x86/mm/p2m-pod.c @@ -100,7 +100,7 @@ p2m_pod_cache_add(struct p2m_domain *p2m } #endif - ASSERT(p2m_locked_by_me(p2m)); + ASSERT(pod_locked_by_me(p2m)); /* * Pages from domain_alloc and returned by the balloon driver aren''t @@ -114,15 +114,20 @@ p2m_pod_cache_add(struct p2m_domain *p2m unmap_domain_page(b); } + /* First, take all pages off the domain list */ lock_page_alloc(p2m); - - /* First, take all pages off the domain list */ for(i=0; i < 1 << order ; i++) { p = page + i; page_list_del(p, &d->page_list); } + /* Ensure that the PoD cache has never been emptied. + * This may cause "zombie domains" since the page will never be freed. */ + BUG_ON( d->arch.relmem != RELMEM_not_started ); + + unlock_page_alloc(p2m); + /* Then add the first one to the appropriate populate-on-demand list */ switch(order) { @@ -138,25 +143,20 @@ p2m_pod_cache_add(struct p2m_domain *p2m BUG(); } - /* Ensure that the PoD cache has never been emptied. - * This may cause "zombie domains" since the page will never be freed. */ - BUG_ON( d->arch.relmem != RELMEM_not_started ); - - unlock_page_alloc(p2m); - return 0; } /* Get a page of size order from the populate-on-demand cache. Will break * down 2-meg pages into singleton pages automatically. Returns null if - * a superpage is requested and no superpages are available. Must be called - * with the d->page_lock held. */ + * a superpage is requested and no superpages are available. 
*/ static struct page_info * p2m_pod_cache_get(struct p2m_domain *p2m, unsigned long order) { struct page_info *p = NULL; int i; + ASSERT(pod_locked_by_me(p2m)); + if ( order == PAGE_ORDER_2M && page_list_empty(&p2m->pod.super) ) { return NULL; @@ -185,7 +185,7 @@ static struct page_info * p2m_pod_cache_ case PAGE_ORDER_2M: BUG_ON( page_list_empty(&p2m->pod.super) ); p = page_list_remove_head(&p2m->pod.super); - p2m->pod.count -= 1 << order; /* Lock: page_alloc */ + p2m->pod.count -= 1 << order; break; case PAGE_ORDER_4K: BUG_ON( page_list_empty(&p2m->pod.single) ); @@ -197,11 +197,13 @@ static struct page_info * p2m_pod_cache_ } /* Put the pages back on the domain page_list */ + lock_page_alloc(p2m); for ( i = 0 ; i < (1 << order); i++ ) { BUG_ON(page_get_owner(p + i) != p2m->domain); page_list_add_tail(p + i, &p2m->domain->page_list); } + unlock_page_alloc(p2m); return p; } @@ -213,6 +215,8 @@ p2m_pod_set_cache_target(struct p2m_doma struct domain *d = p2m->domain; int ret = 0; + ASSERT(pod_locked_by_me(p2m)); + /* Increasing the target */ while ( pod_target > p2m->pod.count ) { @@ -250,17 +254,13 @@ p2m_pod_set_cache_target(struct p2m_doma } /* Decreasing the target */ - /* We hold the p2m lock here, so we don''t need to worry about + /* We hold the pod lock here, so we don''t need to worry about * cache disappearing under our feet. */ while ( pod_target < p2m->pod.count ) { struct page_info * page; int order, i; - /* Grab the lock before checking that pod.super is empty, or the last - * entries may disappear before we grab the lock. */ - lock_page_alloc(p2m); - if ( (p2m->pod.count - pod_target) > SUPERPAGE_PAGES && !page_list_empty(&p2m->pod.super) ) order = PAGE_ORDER_2M; @@ -271,8 +271,6 @@ p2m_pod_set_cache_target(struct p2m_doma ASSERT(page != NULL); - unlock_page_alloc(p2m); - /* Then free them */ for ( i = 0 ; i < (1 << order) ; i++ ) { @@ -348,7 +346,7 @@ p2m_pod_set_mem_target(struct domain *d, int ret = 0; unsigned long populated; - p2m_lock(p2m); + pod_lock(p2m); /* P == B: Nothing to do. */ if ( p2m->pod.entry_count == 0 ) @@ -377,7 +375,7 @@ p2m_pod_set_mem_target(struct domain *d, ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/); out: - p2m_unlock(p2m); + pod_unlock(p2m); return ret; } @@ -390,7 +388,7 @@ p2m_pod_empty_cache(struct domain *d) /* After this barrier no new PoD activities can happen. 
*/ BUG_ON(!d->is_dying); - spin_barrier(&p2m->lock.lock); + spin_barrier(&p2m->pod.lock.lock); lock_page_alloc(p2m); @@ -431,7 +429,7 @@ p2m_pod_offline_or_broken_hit(struct pag if ( !(d = page_get_owner(p)) || !(p2m = p2m_get_hostp2m(d)) ) return 0; - lock_page_alloc(p2m); + pod_lock(p2m); bmfn = mfn_x(page_to_mfn(p)); page_list_for_each_safe(q, tmp, &p2m->pod.super) { @@ -462,12 +460,14 @@ p2m_pod_offline_or_broken_hit(struct pag } } - unlock_page_alloc(p2m); + pod_unlock(p2m); return 0; pod_hit: + lock_page_alloc(p2m); page_list_add_tail(p, &d->arch.relmem_list); unlock_page_alloc(p2m); + pod_unlock(p2m); return 1; } @@ -486,9 +486,9 @@ p2m_pod_offline_or_broken_replace(struct if ( unlikely(!p) ) return; - p2m_lock(p2m); + pod_lock(p2m); p2m_pod_cache_add(p2m, p, PAGE_ORDER_4K); - p2m_unlock(p2m); + pod_unlock(p2m); return; } @@ -511,18 +511,18 @@ p2m_pod_decrease_reservation(struct doma int steal_for_cache = 0; int pod = 0, nonpod = 0, ram = 0; - + + gfn_lock(p2m, gpfn, order); + pod_lock(p2m); /* If we don''t have any outstanding PoD entries, let things take their * course */ if ( p2m->pod.entry_count == 0 ) - goto out; + goto out_unlock; /* Figure out if we need to steal some freed memory for our cache */ steal_for_cache = ( p2m->pod.entry_count > p2m->pod.count ); - gfn_lock(p2m, gpfn, order); - if ( unlikely(d->is_dying) ) goto out_unlock; @@ -554,7 +554,7 @@ p2m_pod_decrease_reservation(struct doma /* All PoD: Mark the whole region invalid and tell caller * we''re done. */ set_p2m_entry(p2m, gpfn, _mfn(INVALID_MFN), order, p2m_invalid, p2m->default_access); - p2m->pod.entry_count-=(1<<order); /* Lock: p2m */ + p2m->pod.entry_count-=(1<<order); BUG_ON(p2m->pod.entry_count < 0); ret = 1; goto out_entry_check; @@ -578,7 +578,7 @@ p2m_pod_decrease_reservation(struct doma if ( t == p2m_populate_on_demand ) { set_p2m_entry(p2m, gpfn + i, _mfn(INVALID_MFN), 0, p2m_invalid, p2m->default_access); - p2m->pod.entry_count--; /* Lock: p2m */ + p2m->pod.entry_count--; BUG_ON(p2m->pod.entry_count < 0); pod--; } @@ -615,9 +615,8 @@ out_entry_check: } out_unlock: + pod_unlock(p2m); gfn_unlock(p2m, gpfn, order); - -out: return ret; } @@ -630,7 +629,8 @@ void p2m_pod_dump_data(struct domain *d) /* Search for all-zero superpages to be reclaimed as superpages for the - * PoD cache. Must be called w/ p2m lock held, page_alloc lock not held. */ + * PoD cache. Must be called w/ pod lock held, must lock the superpage + * in the p2m */ static int p2m_pod_zero_check_superpage(struct p2m_domain *p2m, unsigned long gfn) { @@ -642,6 +642,8 @@ p2m_pod_zero_check_superpage(struct p2m_ int max_ref = 1; struct domain *d = p2m->domain; + ASSERT(pod_locked_by_me(p2m)); + if ( !superpage_aligned(gfn) ) goto out; @@ -649,6 +651,10 @@ p2m_pod_zero_check_superpage(struct p2m_ if ( paging_mode_shadow(d) ) max_ref++; + /* NOTE: this is why we don''t enforce deadlock constraints between p2m + * and pod locks */ + gfn_lock(p2m, gfn, SUPERPAGE_ORDER); + /* Look up the mfns, checking to make sure they''re the same mfn * and aligned, and mapping them. */ for ( i=0; i<SUPERPAGE_PAGES; i++ ) @@ -761,6 +767,7 @@ out_reset: set_p2m_entry(p2m, gfn, mfn0, 9, type0, p2m->default_access); out: + gfn_unlock(p2m, gfn, SUPERPAGE_ORDER); return ret; } @@ -922,6 +929,12 @@ p2m_pod_emergency_sweep(struct p2m_domai limit = (start > POD_SWEEP_LIMIT) ? (start - POD_SWEEP_LIMIT) : 0; /* FIXME: Figure out how to avoid superpages */ + /* NOTE: Promote to globally locking the p2m. This will get complicated + * in a fine-grained scenario. 
Even if we''re to lock each gfn + * individually we must be careful about recursion limits and + * POD_SWEEP_STRIDE. This is why we don''t enforce deadlock constraints + * between p2m and pod locks */ + p2m_lock(p2m); for ( i=p2m->pod.reclaim_single; i > 0 ; i-- ) { p2m_access_t a; @@ -940,7 +953,7 @@ p2m_pod_emergency_sweep(struct p2m_domai /* Stop if we''re past our limit and we have found *something*. * * NB that this is a zero-sum game; we''re increasing our cache size - * by re-increasing our ''debt''. Since we hold the p2m lock, + * by re-increasing our ''debt''. Since we hold the pod lock, * (entry_count - count) must remain the same. */ if ( p2m->pod.count > 0 && i < limit ) break; @@ -949,6 +962,7 @@ p2m_pod_emergency_sweep(struct p2m_domai if ( j ) p2m_pod_zero_check(p2m, gfns, j); + p2m_unlock(p2m); p2m->pod.reclaim_single = i ? i - 1 : i; } @@ -965,8 +979,9 @@ p2m_pod_demand_populate(struct p2m_domai int i; ASSERT(gfn_locked_by_me(p2m, gfn)); + pod_lock(p2m); - /* This check is done with the p2m lock held. This will make sure that + /* This check is done with the pod lock held. This will make sure that * even if d->is_dying changes under our feet, p2m_pod_empty_cache() * won''t start until we''re done. */ if ( unlikely(d->is_dying) ) @@ -977,6 +992,7 @@ p2m_pod_demand_populate(struct p2m_domai * 1GB region to 2MB chunks for a retry. */ if ( order == PAGE_ORDER_1G ) { + pod_unlock(p2m); gfn_aligned = (gfn >> order) << order; /* Note that we are supposed to call set_p2m_entry() 512 times to * split 1GB into 512 2MB pages here. But We only do once here because @@ -1000,11 +1016,15 @@ p2m_pod_demand_populate(struct p2m_domai /* If we''re low, start a sweep */ if ( order == PAGE_ORDER_2M && page_list_empty(&p2m->pod.super) ) + /* Note that sweeps scan other ranges in the p2m. In an scenario + * in which p2m locks are order-enforced wrt pod lock and p2m + * locks are fine grained, this will result in deadlock panic */ p2m_pod_emergency_sweep_super(p2m); if ( page_list_empty(&p2m->pod.single) && ( ( order == PAGE_ORDER_4K ) || (order == PAGE_ORDER_2M && page_list_empty(&p2m->pod.super) ) ) ) + /* Same comment regarding deadlock applies */ p2m_pod_emergency_sweep(p2m); } @@ -1012,8 +1032,6 @@ p2m_pod_demand_populate(struct p2m_domai if ( q == p2m_guest && gfn > p2m->pod.max_guest ) p2m->pod.max_guest = gfn; - lock_page_alloc(p2m); - if ( p2m->pod.count == 0 ) goto out_of_memory; @@ -1026,8 +1044,6 @@ p2m_pod_demand_populate(struct p2m_domai BUG_ON((mfn_x(mfn) & ((1 << order)-1)) != 0); - unlock_page_alloc(p2m); - gfn_aligned = (gfn >> order) << order; set_p2m_entry(p2m, gfn_aligned, mfn, order, p2m_ram_rw, p2m->default_access); @@ -1038,7 +1054,7 @@ p2m_pod_demand_populate(struct p2m_domai paging_mark_dirty(d, mfn_x(mfn) + i); } - p2m->pod.entry_count -= (1 << order); /* Lock: p2m */ + p2m->pod.entry_count -= (1 << order); BUG_ON(p2m->pod.entry_count < 0); if ( tb_init_done ) @@ -1056,20 +1072,24 @@ p2m_pod_demand_populate(struct p2m_domai __trace_var(TRC_MEM_POD_POPULATE, 0, sizeof(t), &t); } + pod_unlock(p2m); return 0; out_of_memory: - unlock_page_alloc(p2m); + pod_unlock(p2m); printk("%s: Out of populate-on-demand memory! 
tot_pages %" PRIu32 " pod_entries %" PRIi32 "\n", __func__, d->tot_pages, p2m->pod.entry_count); domain_crash(d); out_fail: + pod_unlock(p2m); return -1; remap_and_retry: BUG_ON(order != PAGE_ORDER_2M); - unlock_page_alloc(p2m); + pod_unlock(p2m); /* Remap this 2-meg region in singleton chunks */ + /* NOTE: In a p2m fine-grained lock scenario this might + * need promoting the gfn lock from gfn->2M superpage */ gfn_aligned = (gfn>>order)<<order; for(i=0; i<(1<<order); i++) set_p2m_entry(p2m, gfn_aligned+i, _mfn(0), PAGE_ORDER_4K, @@ -1137,9 +1157,11 @@ guest_physmap_mark_populate_on_demand(st rc = -EINVAL; else { - p2m->pod.entry_count += 1 << order; /* Lock: p2m */ + pod_lock(p2m); + p2m->pod.entry_count += 1 << order; p2m->pod.entry_count -= pod_count; BUG_ON(p2m->pod.entry_count < 0); + pod_unlock(p2m); } gfn_unlock(p2m, gfn, order); diff -r fb9c06df05f2 -r 56ceab0118cb xen/arch/x86/mm/p2m-pt.c --- a/xen/arch/x86/mm/p2m-pt.c +++ b/xen/arch/x86/mm/p2m-pt.c @@ -954,6 +954,7 @@ long p2m_pt_audit_p2m(struct p2m_domain struct domain *d = p2m->domain; ASSERT(p2m_locked_by_me(p2m)); + ASSERT(pod_locked_by_me(p2m)); test_linear = ( (d == current->domain) && !pagetable_is_null(current->arch.monitor_table) ); diff -r fb9c06df05f2 -r 56ceab0118cb xen/arch/x86/mm/p2m.c --- a/xen/arch/x86/mm/p2m.c +++ b/xen/arch/x86/mm/p2m.c @@ -72,6 +72,7 @@ boolean_param("hap_2mb", opt_hap_2mb); static void p2m_initialise(struct domain *d, struct p2m_domain *p2m) { mm_lock_init(&p2m->lock); + mm_lock_init(&p2m->pod.lock); INIT_LIST_HEAD(&p2m->np2m_list); INIT_PAGE_LIST_HEAD(&p2m->pages); INIT_PAGE_LIST_HEAD(&p2m->pod.super); @@ -587,8 +588,10 @@ guest_physmap_add_entry(struct domain *d rc = -EINVAL; else { - p2m->pod.entry_count -= pod_count; /* Lock: p2m */ + pod_lock(p2m); + p2m->pod.entry_count -= pod_count; BUG_ON(p2m->pod.entry_count < 0); + pod_unlock(p2m); } } @@ -1372,6 +1375,7 @@ p2m_flush_table(struct p2m_domain *p2m) /* "Host" p2m tables can have shared entries &c that need a bit more * care when discarding them */ ASSERT(p2m_is_nestedp2m(p2m)); + /* Nested p2m''s do not do pod, hence the asserts (and no pod lock)*/ ASSERT(page_list_empty(&p2m->pod.super)); ASSERT(page_list_empty(&p2m->pod.single)); @@ -1529,6 +1533,7 @@ void audit_p2m(struct domain *d, P2M_PRINTK("p2m audit starts\n"); p2m_lock(p2m); + pod_lock(p2m); if (p2m->audit_p2m) pmbad = p2m->audit_p2m(p2m); @@ -1589,6 +1594,7 @@ void audit_p2m(struct domain *d, } spin_unlock(&d->page_alloc_lock); + pod_unlock(p2m); p2m_unlock(p2m); P2M_PRINTK("p2m audit complete\n"); diff -r fb9c06df05f2 -r 56ceab0118cb xen/include/asm-x86/p2m.h --- a/xen/include/asm-x86/p2m.h +++ b/xen/include/asm-x86/p2m.h @@ -261,25 +261,12 @@ struct p2m_domain { unsigned long max_mapped_pfn; /* Populate-on-demand variables - * NB on locking. {super,single,count} are - * covered by d->page_alloc_lock, since they''re almost always used in - * conjunction with that functionality. {entry_count} is covered by - * the domain p2m lock, since it''s almost always used in conjunction - * with changing the p2m tables. - * - * At this point, both locks are held in two places. In both, - * the order is [p2m,page_alloc]: - * + p2m_pod_decrease_reservation() calls p2m_pod_cache_add(), - * which grabs page_alloc - * + p2m_pod_demand_populate() grabs both; the p2m lock to avoid - * double-demand-populating of pages, the page_alloc lock to - * protect moving stuff from the PoD cache to the domain page list. - * - * We enforce this lock ordering through a construct in mm-locks.h. 
- * This demands, however, that we store the previous lock-ordering - * level in effect before grabbing the page_alloc lock. The unlock - * level is stored in the arch section of the domain struct. - */ + * All variables are protected with the pod lock. We cannot rely on + * the p2m lock if it''s turned into a fine-grained lock. + * We only use the domain page_alloc lock for additions and + * deletions to the domain''s page list. Because we use it nested + * within the PoD lock, we enforce it''s ordering (by remembering + * the unlock level in the arch_domain sub struct). */ struct { struct page_list_head super, /* List of superpages */ single; /* Non-super lists */ @@ -288,6 +275,8 @@ struct p2m_domain { unsigned reclaim_super; /* Last gpfn of a scan */ unsigned reclaim_single; /* Last gpfn of a scan */ unsigned max_guest; /* gpfn of max guest demand-populate */ + mm_lock_t lock; /* Locking of private pod structs, * + * not relying on the p2m lock. */ } pod; };
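The lock nesting that results from this patch — gfn (p2m) lock outermost,
the new pod lock inside it, and the domain's page_alloc lock only around
page-list manipulation — can be sketched as follows. The function is
hypothetical and elides the real work; only the lock helpers are taken from
the patch:

/* Illustrative sketch of the nesting order established by this patch. */
static void pod_locking_order_example(struct p2m_domain *p2m,
                                      unsigned long gfn)
{
    gfn_lock(p2m, gfn, 0);       /* 1. the p2m entry being worked on      */
    pod_lock(p2m);               /* 2. PoD counts and cache page lists    */

    if ( p2m->pod.count > 0 )
    {
        lock_page_alloc(p2m);    /* 3. only while moving pages between    */
        /* ... move a page between the PoD cache and d->page_list ...     */
        unlock_page_alloc(p2m);  /*    the PoD cache and the domain list  */
    }

    pod_unlock(p2m);
    gfn_unlock(p2m, gfn, 0);
}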
Andres Lagar-Cavilla
2012-Feb-01 19:56 UTC
[PATCH 4 of 5] Re-order calls to put_gfn() around wait queue invocations
xen/arch/x86/hvm/emulate.c | 2 +- xen/arch/x86/hvm/hvm.c | 25 +++++++++++++------ xen/arch/x86/mm.c | 10 +++--- xen/arch/x86/mm/guest_walk.c | 2 +- xen/arch/x86/mm/hap/guest_walk.c | 6 +--- xen/arch/x86/mm/mem_access.c | 10 +++++++ xen/arch/x86/mm/mem_event.c | 5 +++ xen/arch/x86/mm/p2m.c | 52 ++++++++++++++++++++++----------------- xen/common/grant_table.c | 2 +- xen/common/memory.c | 2 +- xen/include/asm-x86/mem_access.h | 1 + xen/include/asm-x86/mem_event.h | 3 ++ xen/include/asm-x86/p2m.h | 6 +++- 13 files changed, 80 insertions(+), 46 deletions(-) Since we use wait queues to handle potential ring congestion cases, code paths that try to generate a mem event while holding a gfn lock would go to sleep in non-preemptible mode. Most such code paths can be fixed by simply postponing event generation until locks are released. Signed-off-by: Adin Scannell <adin@scannell.ca> Signed-off-by: Andres Lagar-Cavilla <andres@lagarcavilla.org> diff -r 56ceab0118cb -r 211872232129 xen/arch/x86/hvm/emulate.c --- a/xen/arch/x86/hvm/emulate.c +++ b/xen/arch/x86/hvm/emulate.c @@ -67,8 +67,8 @@ static int hvmemul_do_io( ram_mfn = get_gfn_unshare(curr->domain, ram_gfn, &p2mt); if ( p2m_is_paging(p2mt) ) { + put_gfn(curr->domain, ram_gfn); p2m_mem_paging_populate(curr->domain, ram_gfn); - put_gfn(curr->domain, ram_gfn); return X86EMUL_RETRY; } if ( p2m_is_shared(p2mt) ) diff -r 56ceab0118cb -r 211872232129 xen/arch/x86/hvm/hvm.c --- a/xen/arch/x86/hvm/hvm.c +++ b/xen/arch/x86/hvm/hvm.c @@ -64,6 +64,7 @@ #include <public/version.h> #include <public/memory.h> #include <asm/mem_event.h> +#include <asm/mem_access.h> #include <public/mem_event.h> bool_t __read_mostly hvm_enabled; @@ -363,8 +364,8 @@ static int hvm_set_ioreq_page( } if ( p2m_is_paging(p2mt) ) { + put_gfn(d, gmfn); p2m_mem_paging_populate(d, gmfn); - put_gfn(d, gmfn); return -ENOENT; } if ( p2m_is_shared(p2mt) ) @@ -1195,7 +1196,8 @@ int hvm_hap_nested_page_fault(unsigned l mfn_t mfn; struct vcpu *v = current; struct p2m_domain *p2m; - int rc, fall_through = 0; + int rc, fall_through = 0, paged = 0; + mem_event_request_t *req_ptr = NULL; /* On Nested Virtualization, walk the guest page table. * If this succeeds, all is fine. 
@@ -1270,7 +1272,7 @@ int hvm_hap_nested_page_fault(unsigned l if ( violation ) { if ( p2m_mem_access_check(gpa, gla_valid, gla, access_r, - access_w, access_x) ) + access_w, access_x, &req_ptr) ) { fall_through = 1; } else { @@ -1297,7 +1299,7 @@ int hvm_hap_nested_page_fault(unsigned l #ifdef __x86_64__ /* Check if the page has been paged out */ if ( p2m_is_paged(p2mt) || (p2mt == p2m_ram_paging_out) ) - p2m_mem_paging_populate(v->domain, gfn); + paged = 1; /* Mem sharing: unshare the page and try again */ if ( access_w && (p2mt == p2m_ram_shared) ) @@ -1343,6 +1345,13 @@ int hvm_hap_nested_page_fault(unsigned l out_put_gfn: put_gfn(p2m->domain, gfn); + if ( paged ) + p2m_mem_paging_populate(v->domain, gfn); + if ( req_ptr ) + { + mem_access_send_req(v->domain, req_ptr); + xfree(req_ptr); + } return rc; } @@ -1849,8 +1858,8 @@ static void *__hvm_map_guest_frame(unsig } if ( p2m_is_paging(p2mt) ) { + put_gfn(d, gfn); p2m_mem_paging_populate(d, gfn); - put_gfn(d, gfn); return NULL; } @@ -2325,8 +2334,8 @@ static enum hvm_copy_result __hvm_copy( if ( p2m_is_paging(p2mt) ) { + put_gfn(curr->domain, gfn); p2m_mem_paging_populate(curr->domain, gfn); - put_gfn(curr->domain, gfn); return HVMCOPY_gfn_paged_out; } if ( p2m_is_shared(p2mt) ) @@ -3923,8 +3932,8 @@ long do_hvm_op(unsigned long op, XEN_GUE mfn_t mfn = get_gfn_unshare(d, pfn, &t); if ( p2m_is_paging(t) ) { + put_gfn(d, pfn); p2m_mem_paging_populate(d, pfn); - put_gfn(d, pfn); rc = -EINVAL; goto param_fail3; } @@ -4040,8 +4049,8 @@ long do_hvm_op(unsigned long op, XEN_GUE mfn = get_gfn_unshare(d, pfn, &t); if ( p2m_is_paging(t) ) { + put_gfn(d, pfn); p2m_mem_paging_populate(d, pfn); - put_gfn(d, pfn); rc = -EINVAL; goto param_fail4; } diff -r 56ceab0118cb -r 211872232129 xen/arch/x86/mm.c --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -3536,8 +3536,8 @@ int do_mmu_update( if ( p2m_is_paged(p2mt) ) { + put_gfn(pt_owner, gmfn); p2m_mem_paging_populate(pg_owner, gmfn); - put_gfn(pt_owner, gmfn); rc = -ENOENT; break; } @@ -3568,8 +3568,8 @@ int do_mmu_update( if ( p2m_is_paged(l1e_p2mt) ) { + put_gfn(pg_owner, l1egfn); p2m_mem_paging_populate(pg_owner, l1e_get_pfn(l1e)); - put_gfn(pg_owner, l1egfn); rc = -ENOENT; break; } @@ -3617,8 +3617,8 @@ int do_mmu_update( if ( p2m_is_paged(l2e_p2mt) ) { + put_gfn(pg_owner, l2egfn); p2m_mem_paging_populate(pg_owner, l2egfn); - put_gfn(pg_owner, l2egfn); rc = -ENOENT; break; } @@ -3652,8 +3652,8 @@ int do_mmu_update( if ( p2m_is_paged(l3e_p2mt) ) { + put_gfn(pg_owner, l3egfn); p2m_mem_paging_populate(pg_owner, l3egfn); - put_gfn(pg_owner, l3egfn); rc = -ENOENT; break; } @@ -3687,8 +3687,8 @@ int do_mmu_update( if ( p2m_is_paged(l4e_p2mt) ) { + put_gfn(pg_owner, l4egfn); p2m_mem_paging_populate(pg_owner, l4egfn); - put_gfn(pg_owner, l4egfn); rc = -ENOENT; break; } diff -r 56ceab0118cb -r 211872232129 xen/arch/x86/mm/guest_walk.c --- a/xen/arch/x86/mm/guest_walk.c +++ b/xen/arch/x86/mm/guest_walk.c @@ -102,8 +102,8 @@ static inline void *map_domain_gfn(struc if ( p2m_is_paging(*p2mt) ) { ASSERT(!p2m_is_nestedp2m(p2m)); + __put_gfn(p2m, gfn_x(gfn)); p2m_mem_paging_populate(p2m->domain, gfn_x(gfn)); - __put_gfn(p2m, gfn_x(gfn)); *rc = _PAGE_PAGED; return NULL; } diff -r 56ceab0118cb -r 211872232129 xen/arch/x86/mm/hap/guest_walk.c --- a/xen/arch/x86/mm/hap/guest_walk.c +++ b/xen/arch/x86/mm/hap/guest_walk.c @@ -64,10 +64,9 @@ unsigned long hap_p2m_ga_to_gfn(GUEST_PA if ( p2m_is_paging(p2mt) ) { ASSERT(!p2m_is_nestedp2m(p2m)); - p2m_mem_paging_populate(p2m->domain, cr3 >> PAGE_SHIFT); - pfec[0] = 
PFEC_page_paged; __put_gfn(p2m, top_gfn); + p2m_mem_paging_populate(p2m->domain, cr3 >> PAGE_SHIFT); return INVALID_GFN; } if ( p2m_is_shared(p2mt) ) @@ -101,10 +100,9 @@ unsigned long hap_p2m_ga_to_gfn(GUEST_PA if ( p2m_is_paging(p2mt) ) { ASSERT(!p2m_is_nestedp2m(p2m)); - p2m_mem_paging_populate(p2m->domain, gfn_x(gfn)); - pfec[0] = PFEC_page_paged; __put_gfn(p2m, gfn_x(gfn)); + p2m_mem_paging_populate(p2m->domain, gfn_x(gfn)); return INVALID_GFN; } if ( p2m_is_shared(p2mt) ) diff -r 56ceab0118cb -r 211872232129 xen/arch/x86/mm/mem_access.c --- a/xen/arch/x86/mm/mem_access.c +++ b/xen/arch/x86/mm/mem_access.c @@ -47,6 +47,16 @@ int mem_access_domctl(struct domain *d, return rc; } +int mem_access_send_req(struct domain *d, mem_event_request_t *req) +{ + int rc = mem_event_claim_slot(d, &d->mem_event->access); + if ( rc < 0 ) + return rc; + + mem_event_put_request(d, &d->mem_event->access, req); + + return 0; +} /* * Local variables: diff -r 56ceab0118cb -r 211872232129 xen/arch/x86/mm/mem_event.c --- a/xen/arch/x86/mm/mem_event.c +++ b/xen/arch/x86/mm/mem_event.c @@ -423,6 +423,11 @@ static int mem_event_wait_slot(struct me return rc; } +bool_t mem_event_check_ring(struct mem_event_domain *med) +{ + return (med->ring_page != NULL); +} + /* * Determines whether or not the current vCPU belongs to the target domain, * and calls the appropriate wait function. If it is a guest vCPU, then we diff -r 56ceab0118cb -r 211872232129 xen/arch/x86/mm/p2m.c --- a/xen/arch/x86/mm/p2m.c +++ b/xen/arch/x86/mm/p2m.c @@ -1152,17 +1152,18 @@ void p2m_mem_paging_resume(struct domain } bool_t p2m_mem_access_check(unsigned long gpa, bool_t gla_valid, unsigned long gla, - bool_t access_r, bool_t access_w, bool_t access_x) + bool_t access_r, bool_t access_w, bool_t access_x, + mem_event_request_t **req_ptr) { struct vcpu *v = current; - mem_event_request_t req; unsigned long gfn = gpa >> PAGE_SHIFT; struct domain *d = v->domain; struct p2m_domain* p2m = p2m_get_hostp2m(d); mfn_t mfn; p2m_type_t p2mt; p2m_access_t p2ma; - + mem_event_request_t *req; + /* First, handle rx2rw conversion automatically */ gfn_lock(p2m, gfn, 0); mfn = p2m->get_entry(p2m, gfn, &p2mt, &p2ma, p2m_query, NULL); @@ -1181,7 +1182,7 @@ bool_t p2m_mem_access_check(unsigned lon gfn_unlock(p2m, gfn, 0); /* Otherwise, check if there is a memory event listener, and send the message along */ - if ( mem_event_claim_slot(d, &d->mem_event->access) == -ENOSYS ) + if ( !mem_event_check_ring(&d->mem_event->access) || !req_ptr ) { /* No listener */ if ( p2m->access_required ) @@ -1205,29 +1206,34 @@ bool_t p2m_mem_access_check(unsigned lon } } - memset(&req, 0, sizeof(req)); - req.type = MEM_EVENT_TYPE_ACCESS; - req.reason = MEM_EVENT_REASON_VIOLATION; + *req_ptr = NULL; + req = xmalloc(mem_event_request_t); + if ( req ) + { + *req_ptr = req; + memset(req, 0, sizeof(req)); + req->type = MEM_EVENT_TYPE_ACCESS; + req->reason = MEM_EVENT_REASON_VIOLATION; + + /* Pause the current VCPU */ + if ( p2ma != p2m_access_n2rwx ) + req->flags |= MEM_EVENT_FLAG_VCPU_PAUSED; + + /* Send request to mem event */ + req->gfn = gfn; + req->offset = gpa & ((1 << PAGE_SHIFT) - 1); + req->gla_valid = gla_valid; + req->gla = gla; + req->access_r = access_r; + req->access_w = access_w; + req->access_x = access_x; + + req->vcpu_id = v->vcpu_id; + } /* Pause the current VCPU */ if ( p2ma != p2m_access_n2rwx ) - { vcpu_pause_nosync(v); - req.flags |= MEM_EVENT_FLAG_VCPU_PAUSED; - } - - /* Send request to mem event */ - req.gfn = gfn; - req.offset = gpa & ((1 << PAGE_SHIFT) - 1); - 
req.gla_valid = gla_valid; - req.gla = gla; - req.access_r = access_r; - req.access_w = access_w; - req.access_x = access_x; - - req.vcpu_id = v->vcpu_id; - - mem_event_put_request(d, &d->mem_event->access, &req); /* VCPU may be paused, return whether we promoted automatically */ return (p2ma == p2m_access_n2rwx); diff -r 56ceab0118cb -r 211872232129 xen/common/grant_table.c --- a/xen/common/grant_table.c +++ b/xen/common/grant_table.c @@ -164,8 +164,8 @@ static int __get_paged_frame(unsigned lo *frame = mfn_x(mfn); if ( p2m_is_paging(p2mt) ) { + put_gfn(rd, gfn); p2m_mem_paging_populate(rd, gfn); - put_gfn(rd, gfn); rc = GNTST_eagain; } } else { diff -r 56ceab0118cb -r 211872232129 xen/common/memory.c --- a/xen/common/memory.c +++ b/xen/common/memory.c @@ -166,8 +166,8 @@ int guest_remove_page(struct domain *d, if ( unlikely(p2m_is_paging(p2mt)) ) { guest_physmap_remove_page(d, gmfn, mfn, 0); + put_gfn(d, gmfn); p2m_mem_paging_drop_page(d, gmfn, p2mt); - put_gfn(d, gmfn); return 1; } #else diff -r 56ceab0118cb -r 211872232129 xen/include/asm-x86/mem_access.h --- a/xen/include/asm-x86/mem_access.h +++ b/xen/include/asm-x86/mem_access.h @@ -23,6 +23,7 @@ int mem_access_domctl(struct domain *d, xen_domctl_mem_event_op_t *mec, XEN_GUEST_HANDLE(void) u_domctl); +int mem_access_send_req(struct domain *d, mem_event_request_t *req); /* diff -r 56ceab0118cb -r 211872232129 xen/include/asm-x86/mem_event.h --- a/xen/include/asm-x86/mem_event.h +++ b/xen/include/asm-x86/mem_event.h @@ -24,6 +24,9 @@ #ifndef __MEM_EVENT_H__ #define __MEM_EVENT_H__ +/* Returns whether a ring has been set up */ +bool_t mem_event_check_ring(struct mem_event_domain *med); + /* Returns 0 on success, -ENOSYS if there is no ring, -EBUSY if there is no * available space. For success or -EBUSY, the vCPU may be left blocked * temporarily to ensure that the ring does not lose future events. In diff -r 56ceab0118cb -r 211872232129 xen/include/asm-x86/p2m.h --- a/xen/include/asm-x86/p2m.h +++ b/xen/include/asm-x86/p2m.h @@ -587,7 +587,8 @@ static inline void p2m_mem_paging_popula * the rw2rx conversion. Boolean return value indicates if access rights have * been promoted with no underlying vcpu pause. */ bool_t p2m_mem_access_check(unsigned long gpa, bool_t gla_valid, unsigned long gla, - bool_t access_r, bool_t access_w, bool_t access_x); + bool_t access_r, bool_t access_w, bool_t access_x, + mem_event_request_t **req_ptr); /* Resumes the running of the VCPU, restarting the last instruction */ void p2m_mem_access_resume(struct domain *d); @@ -604,7 +605,8 @@ int p2m_get_mem_access(struct domain *d, #else static inline bool_t p2m_mem_access_check(unsigned long gpa, bool_t gla_valid, unsigned long gla, bool_t access_r, - bool_t access_w, bool_t access_x) + bool_t access_w, bool_t access_x, + mem_event_request_t **req_ptr); { return 1; } static inline int p2m_set_mem_access(struct domain *d, unsigned long start_pfn,
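The heart of the change is a "build the request under the lock, deliver it
after all locks are dropped" pattern. A hypothetical caller of the reworked
p2m_mem_access_check() might look like the sketch below; only the two
helpers and the request type are taken from the patch:

/* Illustrative sketch: defer mem_access event delivery until no gfn
 * lock is held, so a congested ring's wait queue can safely block us. */
static void report_access_violation(struct domain *d, unsigned long gpa)
{
    mem_event_request_t *req = NULL;

    /* Takes and releases the gfn lock internally; fills in *req only
     * if a listener needs to be notified. */
    (void)p2m_mem_access_check(gpa, 0, 0, 1, 0, 0, &req);

    /* All locks dropped: delivering the event may now sleep on the
     * ring's wait queue without holding anything. */
    if ( req )
    {
        (void)mem_access_send_req(d, req);
        xfree(req);
    }
}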
Andres Lagar-Cavilla
2012-Feb-01 19:56 UTC
[PATCH 5 of 5] x86/mm: Revert changeset 24582:f6c33cfe7333
 xen/arch/x86/mm/p2m.c | 4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

With synchronized p2m lookups this is no longer needed, and we can lock
the p2m up-front.

Signed-off-by: Andres Lagar-Cavilla <andres@lagarcavilla.org>

diff -r 211872232129 -r a8ba1fc6e4b3 xen/arch/x86/mm/p2m.c
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -361,6 +361,8 @@ void p2m_teardown(struct p2m_domain *p2m
     if (p2m == NULL)
         return;

+    p2m_lock(p2m);
+
 #ifdef __x86_64__
     for ( gfn=0; gfn < p2m->max_mapped_pfn; gfn++ )
     {
@@ -374,8 +376,6 @@ void p2m_teardown(struct p2m_domain *p2m
     }
 #endif

-    p2m_lock(p2m);
-
     p2m->phys_table = pagetable_null();

     while ( (pg = page_list_remove_head(&p2m->pages)) )
Tim Deegan
2012-Feb-02 13:08 UTC
Re: [PATCH 1 of 5] Make p2m lookups fully synchronized wrt modifications
At 14:56 -0500 on 01 Feb (1328108165), Andres Lagar-Cavilla wrote:
> xen/arch/x86/mm/hap/hap.c | 6 +++++
> xen/arch/x86/mm/mm-locks.h | 40 ++++++++++++++++++++++++--------------
> xen/arch/x86/mm/p2m.c | 21 ++++++++++++++++++-
> xen/include/asm-x86/p2m.h | 47 ++++++++++++++++++++++++++++-----------------
> 4 files changed, 79 insertions(+), 35 deletions(-)
>
>
> We achieve this by locking/unlocking the global p2m_lock in get/put_gfn.
>
> The lock is always taken recursively, as there are many paths that
> do get_gfn, and later, another attempt at grabbing the p2m_lock
>
> The lock is not taken for shadow lookups, as the testing needed to rule out the
> possibility of locking inversions and deadlock has not yet been carried out for
> shadow-paging. This is tolerable as long as shadows do not gain support for
> paging or sharing.

Are you aware of any inversions or are you just suggesting there might
be some left?

> HAP (EPT) lookups and all modifications do take the lock.
>
> Signed-off-by: Andres Lagar-Cavilla <andres@lagarcavilla.org>
>
> diff -r 9a55109e4d7e -r 223512f9fb5b xen/arch/x86/mm/hap/hap.c
> --- a/xen/arch/x86/mm/hap/hap.c
> +++ b/xen/arch/x86/mm/hap/hap.c
> @@ -786,7 +786,12 @@ hap_paging_get_mode(struct vcpu *v)
>  static void hap_update_paging_modes(struct vcpu *v)
>  {
>      struct domain *d = v->domain;
> +    unsigned long cr3_gfn = v->arch.hvm_vcpu.guest_cr[3];
> +    p2m_type_t t;
>
> +    /* We hold onto the cr3 as it may be modified later, and
> +     * we need to respect lock ordering */
> +    (void)get_gfn(d, cr3_gfn, &t);

Don't you need to check for errors?

>      paging_lock(d);
>
>      v->arch.paging.mode = hap_paging_get_mode(v);
> @@ -803,6 +808,7 @@ static void hap_update_paging_modes(stru
>      hap_update_cr3(v, 0);
>
>      paging_unlock(d);
> +    put_gfn(d, cr3_gfn);
>  }
>
>  #if CONFIG_PAGING_LEVELS == 3
> diff -r 9a55109e4d7e -r 223512f9fb5b xen/arch/x86/mm/mm-locks.h
> --- a/xen/arch/x86/mm/mm-locks.h
> +++ b/xen/arch/x86/mm/mm-locks.h
> @@ -144,6 +144,31 @@ static inline void mm_enforce_order_unlo
>   *                                                                      *
>   ************************************************************************/
>
> +declare_mm_lock(nestedp2m)
> +#define nestedp2m_lock(d) mm_lock(nestedp2m, &(d)->arch.nested_p2m_lock)
> +#define nestedp2m_unlock(d) mm_unlock(&(d)->arch.nested_p2m_lock)
> +
> +/* P2M lock (per-p2m-table)
> + *
> + * This protects all queries and updates to the p2m table.
> + *
> + * A note about ordering:
> + *   The order established here is enforced on all mutations of a p2m.
> + *   For lookups, the order established here is enforced only for hap
> + *   domains (1. shadow domains present a few nasty inversions;
> + *            2. shadow domains do not support paging and sharing,
> + *               the main sources of dynamic p2m mutations)
> + *
> + * The lock is recursive as it is common for a code path to look up a gfn
> + * and later mutate it.
> + */
> +
> +declare_mm_lock(p2m)
> +#define p2m_lock(p) mm_lock_recursive(p2m, &(p)->lock)
> +#define p2m_lock_recursive(p) mm_lock_recursive(p2m, &(p)->lock)
> +#define p2m_unlock(p) mm_unlock(&(p)->lock)
> +#define p2m_locked_by_me(p) mm_locked_by_me(&(p)->lock)
> +
>  /* Sharing per page lock
>   *
>   * This is an external lock, not represented by an mm_lock_t. The memory

Since you've just reversed the locking order between this page-sharing
lock and the p2m lock, you need to update the comment that describes it.
Also, presumably, the code that it describes?

Cheers,

Tim.
Tim Deegan
2012-Feb-02 13:10 UTC
Re: [PATCH 2 of 5] Clean up locking now that p2m lookups are fully synchronized
At 14:56 -0500 on 01 Feb (1328108166), Andres Lagar-Cavilla wrote:
>  xen/arch/x86/mm/mem_sharing.c |   2 -
>  xen/arch/x86/mm/mm-locks.h    |   4 ++-
>  xen/arch/x86/mm/p2m-ept.c     |  33 ++--------------------
>  xen/arch/x86/mm/p2m-pod.c     |  14 ++++++---
>  xen/arch/x86/mm/p2m-pt.c      |  45 +++++------------------------
>  xen/arch/x86/mm/p2m.c         |  64 ++++++++++++++++++++++--------------------
>  6 files changed, 58 insertions(+), 104 deletions(-)
>
>
> With p2m lookups fully synchronized, many routines need not
> call p2m_lock any longer. Also, many routines can logically
> assert holding the p2m for a specific gfn.
>
> Signed-off-by: Andres Lagar-Cavilla <andres@lagarcavilla.org>

This looks OK to me, but I'd like an Ack from George as well for the
PoD changes.

Cheers,

Tim.
At 14:56 -0500 on 01 Feb (1328108167), Andres Lagar-Cavilla wrote:
>  xen/arch/x86/mm/mm-locks.h |   10 ++++
>  xen/arch/x86/mm/p2m-pod.c  |  112 ++++++++++++++++++++++++++------------------
>  xen/arch/x86/mm/p2m-pt.c   |    1 +
>  xen/arch/x86/mm/p2m.c      |    8 ++-
>  xen/include/asm-x86/p2m.h  |   27 +++-------
>  5 files changed, 93 insertions(+), 65 deletions(-)
>
>
> The PoD layer has a complex locking discipline. It relies on the
> p2m being globally locked, and it also relies on the page alloc
> lock to protect some of its data structures. Replace this all by an
> explicit pod lock: per p2m, order enforced.
>
> Three consequences:
>  - Critical sections in the pod code protected by the page alloc
>    lock are now reduced to modifications of the domain page list.
>  - When the p2m lock becomes fine-grained, there are no
>    assumptions broken in the PoD layer.
>  - The locking is easier to understand.
>
> Signed-off-by: Andres Lagar-Cavilla <andres@lagarcavilla.org>

This needs an Ack from George, too.  Also:

> @@ -922,6 +929,12 @@ p2m_pod_emergency_sweep(struct p2m_domai
>      limit = (start > POD_SWEEP_LIMIT) ? (start - POD_SWEEP_LIMIT) : 0;
>
>      /* FIXME: Figure out how to avoid superpages */
> +    /* NOTE: Promote to globally locking the p2m. This will get complicated
> +     * in a fine-grained scenario. Even if we're to lock each gfn
> +     * individually we must be careful about recursion limits and
> +     * POD_SWEEP_STRIDE. This is why we don't enforce deadlock constraints
> +     * between p2m and pod locks */
> +    p2m_lock(p2m);

That's a scary comment.  It looks to me as if the mm-locks.h mechanism
_does_ enforce those constraints - am I missing something?

Cheers,

Tim.
Tim Deegan
2012-Feb-02 13:26 UTC
Re: [PATCH 4 of 5] Re-order calls to put_gfn() around wait queue invocations
At 14:56 -0500 on 01 Feb (1328108168), Andres Lagar-Cavilla wrote:
>  xen/arch/x86/hvm/emulate.c       |   2 +-
>  xen/arch/x86/hvm/hvm.c           |  25 +++++++++++++------
>  xen/arch/x86/mm.c                |  10 +++---
>  xen/arch/x86/mm/guest_walk.c     |   2 +-
>  xen/arch/x86/mm/hap/guest_walk.c |   6 +---
>  xen/arch/x86/mm/mem_access.c     |  10 +++++++
>  xen/arch/x86/mm/mem_event.c      |   5 +++
>  xen/arch/x86/mm/p2m.c            |  52 ++++++++++++++++++++++-----------------
>  xen/common/grant_table.c         |   2 +-
>  xen/common/memory.c              |   2 +-
>  xen/include/asm-x86/mem_access.h |   1 +
>  xen/include/asm-x86/mem_event.h  |   3 ++
>  xen/include/asm-x86/p2m.h        |   6 +++-
>  13 files changed, 80 insertions(+), 46 deletions(-)
>
>
> Since we use wait queues to handle potential ring congestion cases,
> code paths that try to generate a mem event while holding a gfn lock
> would go to sleep in non-preemptible mode.
>
> Most such code paths can be fixed by simply postponing event generation until
> locks are released.
>
> Signed-off-by: Adin Scannell <adin@scannell.ca>
> Signed-off-by: Andres Lagar-Cavilla <andres@lagarcavilla.org>

Looks OK, once the preceding patches are sorted out.  If you add some
comments to p2m_mem_access_check() describing its new argument and what
the caller's supposed to do about it, you can put my Ack on this next
time.

Cheers,

Tim.
Tim Deegan
2012-Feb-02 13:27 UTC
Re: [PATCH 5 of 5] x86/mm: Revert changeset 24582:f6c33cfe7333
At 14:56 -0500 on 01 Feb (1328108169), Andres Lagar-Cavilla wrote:
>  xen/arch/x86/mm/p2m.c |  4 ++--
>  1 files changed, 2 insertions(+), 2 deletions(-)
>
>
> With synchronized p2m lookups this is no longer needed, and we can lock the p2m
> up-front.
>
> Signed-off-by: Andres Lagar-Cavilla <andres@lagarcavilla.org>

Acked-by: Tim Deegan <tjd-xen@phlegethon.org>

(once the earlier patches are in, of course!)

Cheers,

Tim.
Andres Lagar-Cavilla
2012-Feb-02 14:01 UTC
Re: [PATCH 1 of 5] Make p2m lookups fully synchronized wrt modifications
> At 14:56 -0500 on 01 Feb (1328108165), Andres Lagar-Cavilla wrote:
>>  xen/arch/x86/mm/hap/hap.c  |   6 +++++
>>  xen/arch/x86/mm/mm-locks.h |  40 ++++++++++++++++++++++++--------------
>>  xen/arch/x86/mm/p2m.c      |  21 ++++++++++++++++++-
>>  xen/include/asm-x86/p2m.h  |  47 ++++++++++++++++++++++++++++-----------------
>>  4 files changed, 79 insertions(+), 35 deletions(-)
>>
>>
>> We achieve this by locking/unlocking the global p2m_lock in get/put_gfn.
>>
>> The lock is always taken recursively, as there are many paths that
>> do get_gfn, and later, another attempt at grabbing the p2m_lock
>>
>> The lock is not taken for shadow lookups, as the testing needed to rule
>> out the possibility of locking inversions and deadlock has not yet been
>> carried out for shadow-paging. This is tolerable as long as shadows do
>> not gain support for paging or sharing.
>
> Are you aware of any inversions or are you just suggesting there might
> be some left?

I'm not aware of any inversions. A previously posted patch cleans up all
that I could see. But I have not tested shadows, so I don't feel
comfortable about enabling it just yet.

>
>> HAP (EPT) lookups and all modifications do take the lock.
>>
>> Signed-off-by: Andres Lagar-Cavilla <andres@lagarcavilla.org>
>>
>> diff -r 9a55109e4d7e -r 223512f9fb5b xen/arch/x86/mm/hap/hap.c
>> --- a/xen/arch/x86/mm/hap/hap.c
>> +++ b/xen/arch/x86/mm/hap/hap.c
>> @@ -786,7 +786,12 @@ hap_paging_get_mode(struct vcpu *v)
>>  static void hap_update_paging_modes(struct vcpu *v)
>>  {
>>      struct domain *d = v->domain;
>> +    unsigned long cr3_gfn = v->arch.hvm_vcpu.guest_cr[3];
>> +    p2m_type_t t;
>>
>> +    /* We hold onto the cr3 as it may be modified later, and
>> +     * we need to respect lock ordering */
>> +    (void)get_gfn(d, cr3_gfn, &t);
>
> Don't you need to check for errors?

Good to have, yes. As in, bail out if !p2m_is_ram || !mfn_valid.

>
>>      paging_lock(d);
>>
>>      v->arch.paging.mode = hap_paging_get_mode(v);
>> @@ -803,6 +808,7 @@ static void hap_update_paging_modes(stru
>>      hap_update_cr3(v, 0);
>>
>>      paging_unlock(d);
>> +    put_gfn(d, cr3_gfn);
>>  }
>>
>>  #if CONFIG_PAGING_LEVELS == 3
>> diff -r 9a55109e4d7e -r 223512f9fb5b xen/arch/x86/mm/mm-locks.h
>> --- a/xen/arch/x86/mm/mm-locks.h
>> +++ b/xen/arch/x86/mm/mm-locks.h
>> @@ -144,6 +144,31 @@ static inline void mm_enforce_order_unlo
>>   *                                                                      *
>>   ************************************************************************/
>>
>> +declare_mm_lock(nestedp2m)
>> +#define nestedp2m_lock(d)   mm_lock(nestedp2m, &(d)->arch.nested_p2m_lock)
>> +#define nestedp2m_unlock(d) mm_unlock(&(d)->arch.nested_p2m_lock)
>> +
>> +/* P2M lock (per-p2m-table)
>> + *
>> + * This protects all queries and updates to the p2m table.
>> + *
>> + * A note about ordering:
>> + *   The order established here is enforced on all mutations of a p2m.
>> + *   For lookups, the order established here is enforced only for hap
>> + *   domains (1. shadow domains present a few nasty inversions;
>> + *            2. shadow domains do not support paging and sharing,
>> + *               the main sources of dynamic p2m mutations)
>> + *
>> + * The lock is recursive as it is common for a code path to look up a gfn
>> + * and later mutate it.
>> + */
>> +
>> +declare_mm_lock(p2m)
>> +#define p2m_lock(p)           mm_lock_recursive(p2m, &(p)->lock)
>> +#define p2m_lock_recursive(p) mm_lock_recursive(p2m, &(p)->lock)
>> +#define p2m_unlock(p)         mm_unlock(&(p)->lock)
>> +#define p2m_locked_by_me(p)   mm_locked_by_me(&(p)->lock)
>> +
>>  /* Sharing per page lock
>>   *
>>   * This is an external lock, not represented by an mm_lock_t. The memory
>
> Since you've just reversed the locking order between this page-sharing
> lock and the p2m lock, you need to update the comment that describes
> it.  Also, presumably, the code that it describes?

The comment might be stale, certainly, will look into it.

The code is just fine, per testing. The next patch does a proper cleanup:
the "complicated condition" is grabbing a p2m lock nested inside a
page-sharing critical section. But thanks to this patch, the p2m lock will
have been taken before, so no race (hope that makes sense).

Bottom line is, correct with this patch, clean with next.

Thanks!
Andres

>
> Cheers,
>
> Tim.
>
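To make the suggested bail-out concrete, here is a minimal sketch of what the
error check around get_gfn() in hap_update_paging_modes() could look like. It
is an illustration only: the early-return policy and the exact placement are
assumptions, not the committed fix; get_gfn(), put_gfn(), p2m_is_ram() and
mfn_valid() are the names used in the series.

    /* Sketch: bail out of the paging-mode update if the cr3 gfn does not
     * map ordinary, valid RAM (e.g. paged out, shared, or empty). */
    static void hap_update_paging_modes(struct vcpu *v)
    {
        struct domain *d = v->domain;
        unsigned long cr3_gfn = v->arch.hvm_vcpu.guest_cr[3];
        p2m_type_t t;
        mfn_t cr3_mfn;

        /* Hold the gfn so the cr3 frame cannot change under our feet;
         * taken before paging_lock() to respect lock ordering. */
        cr3_mfn = get_gfn(d, cr3_gfn, &t);
        if ( !p2m_is_ram(t) || !mfn_valid(cr3_mfn) )
        {
            put_gfn(d, cr3_gfn);
            return;   /* whether to crash the domain instead is a policy choice */
        }

        paging_lock(d);
        /* ... update v->arch.paging.mode and cr3 as in the patch ... */
        paging_unlock(d);
        put_gfn(d, cr3_gfn);
    }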
Andres Lagar-Cavilla
2012-Feb-02 14:04 UTC
Re: [PATCH 3 of 5] Rework locking in the PoD layer
> At 14:56 -0500 on 01 Feb (1328108167), Andres Lagar-Cavilla wrote:
>>  xen/arch/x86/mm/mm-locks.h |   10 ++++
>>  xen/arch/x86/mm/p2m-pod.c  |  112 ++++++++++++++++++++++++++------------------
>>  xen/arch/x86/mm/p2m-pt.c   |    1 +
>>  xen/arch/x86/mm/p2m.c      |    8 ++-
>>  xen/include/asm-x86/p2m.h  |   27 +++-------
>>  5 files changed, 93 insertions(+), 65 deletions(-)
>>
>>
>> The PoD layer has a complex locking discipline. It relies on the
>> p2m being globally locked, and it also relies on the page alloc
>> lock to protect some of its data structures. Replace this all by an
>> explicit pod lock: per p2m, order enforced.
>>
>> Three consequences:
>>  - Critical sections in the pod code protected by the page alloc
>>    lock are now reduced to modifications of the domain page list.
>>  - When the p2m lock becomes fine-grained, there are no
>>    assumptions broken in the PoD layer.
>>  - The locking is easier to understand.
>>
>> Signed-off-by: Andres Lagar-Cavilla <andres@lagarcavilla.org>
>
> This needs an Ack from George, too.  Also:
>
>> @@ -922,6 +929,12 @@ p2m_pod_emergency_sweep(struct p2m_domai
>>      limit = (start > POD_SWEEP_LIMIT) ? (start - POD_SWEEP_LIMIT) : 0;
>>
>>      /* FIXME: Figure out how to avoid superpages */
>> +    /* NOTE: Promote to globally locking the p2m. This will get complicated
>> +     * in a fine-grained scenario. Even if we're to lock each gfn
>> +     * individually we must be careful about recursion limits and
>> +     * POD_SWEEP_STRIDE. This is why we don't enforce deadlock constraints
>> +     * between p2m and pod locks */
>> +    p2m_lock(p2m);
>
> That's a scary comment.  It looks to me as if the mm-locks.h mechanism
> _does_ enforce those constraints - am I missing something?

The problem is that the recurse count of a spinlock is not particularly
wide. So if you have a loop that does a lot of nested get_gfn*, you may
overflow.

The funny bit is that we do enforce ordering, so that part of the comment
is stale. Will update.

Andres

>
> Cheers,
>
> Tim.
>
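The recursion-limit worry can be illustrated with a toy recursive lock. The
narrow counter below is an assumption made purely for illustration; it is not
the actual Xen spinlock or mm-locks.h implementation.

    /* Toy recursive lock: the recursion counter is deliberately narrow. */
    struct toy_rlock {
        int owner;                  /* CPU holding the lock, -1 if free   */
        unsigned char recurse_cnt;  /* wraps after 255 nested acquisitions */
    };

    static void toy_rlock_acquire(struct toy_rlock *l, int cpu)
    {
        if ( l->owner != cpu )
        {
            /* ... spin on the underlying lock, then ... */
            l->owner = cpu;
        }
        /* A sweep that takes the (recursive) p2m lock once per gfn over a
         * long stride, without releasing in between, piles up recursion
         * here and can overflow the counter. */
        l->recurse_cnt++;
        BUG_ON(l->recurse_cnt == 0);    /* fires on wrap-around */
    }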
George Dunlap
2012-Feb-02 14:31 UTC
Re: [PATCH 2 of 5] Clean up locking now that p2m lookups are fully synchronized
On Wed, 2012-02-01 at 19:56 +0000, Andres Lagar-Cavilla wrote:
> @@ -1445,7 +1449,7 @@ p2m_get_nestedp2m(struct vcpu *v, uint64
>              nestedp2m_unlock(d);
>              return p2m;
>          }
> -        p2m_unlock(p2m);
> +        gfn_unlock(p2m, gfn, 0);
>      }

It looks like maybe you forgot to change the corresponding p2m_lock() to
gfn_lock(), here in p2m.c:p2m_get_nestedp2m()?

Other than that, I think the PoD stuff looks fine.  So regarding PoD
stuff:

Acked-by: George Dunlap <george.dunlap@eu.citrix.com>
On Wed, 2012-02-01 at 19:56 +0000, Andres Lagar-Cavilla wrote:> @@ -114,15 +114,20 @@ p2m_pod_cache_add(struct p2m_domain *p2m > unmap_domain_page(b); > } > > + /* First, take all pages off the domain list */ > lock_page_alloc(p2m); > - > - /* First, take all pages off the domain list */ > for(i=0; i < 1 << order ; i++) > { > p = page + i; > page_list_del(p, &d->page_list); > } > > + /* Ensure that the PoD cache has never been emptied. > + * This may cause "zombie domains" since the page will never be freed. */ > + BUG_ON( d->arch.relmem != RELMEM_not_started ); > + > + unlock_page_alloc(p2m); > +This assert should stay where it is. The idea is to verify after-the-fact that the page_list_add_tail()s *have not* raced with emptying the PoD cache. Having the assert before cannot guarantee that they *will not* race with emptying the PoD cache. Alternately, if we''re positive that condition can never happen again, we should just remove the BUG_ON(). If I recall correctly, I put this in here because something ended up calling cache_add after empty_cache(), potentially with the p2m lock already held; that''s why I put the BUG_ON() there to begin with.> @@ -922,6 +929,12 @@ p2m_pod_emergency_sweep(struct p2m_domai > limit = (start > POD_SWEEP_LIMIT) ? (start - POD_SWEEP_LIMIT) : 0; > > /* FIXME: Figure out how to avoid superpages */ > + /* NOTE: Promote to globally locking the p2m. This will get complicated > + * in a fine-grained scenario. Even if we''re to lock each gfn > + * individually we must be careful about recursion limits and > + * POD_SWEEP_STRIDE. This is why we don''t enforce deadlock constraints > + * between p2m and pod locks */ > + p2m_lock(p2m); > for ( i=p2m->pod.reclaim_single; i > 0 ; i-- ) > { > p2m_access_t a; > @@ -940,7 +953,7 @@ p2m_pod_emergency_sweep(struct p2m_domai > /* Stop if we''re past our limit and we have found *something*. > * > * NB that this is a zero-sum game; we''re increasing our cache size > - * by re-increasing our ''debt''. Since we hold the p2m lock, > + * by re-increasing our ''debt''. Since we hold the pod lock, > * (entry_count - count) must remain the same. */ > if ( p2m->pod.count > 0 && i < limit ) > break; > @@ -949,6 +962,7 @@ p2m_pod_emergency_sweep(struct p2m_domai > if ( j ) > p2m_pod_zero_check(p2m, gfns, j); > > + p2m_unlock(p2m); > p2m->pod.reclaim_single = i ? i - 1 : i; > > } > @@ -965,8 +979,9 @@ p2m_pod_demand_populate(struct p2m_domai > int i; > > ASSERT(gfn_locked_by_me(p2m, gfn)); > + pod_lock(p2m); > > - /* This check is done with the p2m lock held. This will make sure that > + /* This check is done with the pod lock held. This will make sure that > * even if d->is_dying changes under our feet, p2m_pod_empty_cache() > * won''t start until we''re done. */ > if ( unlikely(d->is_dying) ) > @@ -977,6 +992,7 @@ p2m_pod_demand_populate(struct p2m_domai > * 1GB region to 2MB chunks for a retry. */ > if ( order == PAGE_ORDER_1G ) > { > + pod_unlock(p2m); > gfn_aligned = (gfn >> order) << order; > /* Note that we are supposed to call set_p2m_entry() 512 times to > * split 1GB into 512 2MB pages here. But We only do once here because > @@ -1000,11 +1016,15 @@ p2m_pod_demand_populate(struct p2m_domai > > /* If we''re low, start a sweep */ > if ( order == PAGE_ORDER_2M && page_list_empty(&p2m->pod.super) ) > + /* Note that sweeps scan other ranges in the p2m. 
In an scenario > + * in which p2m locks are order-enforced wrt pod lock and p2m > + * locks are fine grained, this will result in deadlock panic */ > p2m_pod_emergency_sweep_super(p2m); > > if ( page_list_empty(&p2m->pod.single) && > ( ( order == PAGE_ORDER_4K ) > || (order == PAGE_ORDER_2M && page_list_empty(&p2m->pod.super) ) ) ) > + /* Same comment regarding deadlock applies */ > p2m_pod_emergency_sweep(p2m); > } >Regarding locking on emergency sweeps: I think it should suffice if we could do the equivalent of a spin_trylock() on each gpfn, and just move on (not checking that gfn) if it fails. What do you think? Other than that, I don''t see anything wrong with this locking at first glance; but it''s complicated enough that I don''t think I''ve quite grokked it yet. I''ll look at it again tomorrow and see if things are more clear. :-) -George> @@ -1012,8 +1032,6 @@ p2m_pod_demand_populate(struct p2m_domai > if ( q == p2m_guest && gfn > p2m->pod.max_guest ) > p2m->pod.max_guest = gfn; > > - lock_page_alloc(p2m); > - > if ( p2m->pod.count == 0 ) > goto out_of_memory; > > @@ -1026,8 +1044,6 @@ p2m_pod_demand_populate(struct p2m_domai > > BUG_ON((mfn_x(mfn) & ((1 << order)-1)) != 0); > > - unlock_page_alloc(p2m); > - > gfn_aligned = (gfn >> order) << order; > > set_p2m_entry(p2m, gfn_aligned, mfn, order, p2m_ram_rw, p2m->default_access); > @@ -1038,7 +1054,7 @@ p2m_pod_demand_populate(struct p2m_domai > paging_mark_dirty(d, mfn_x(mfn) + i); > } > > - p2m->pod.entry_count -= (1 << order); /* Lock: p2m */ > + p2m->pod.entry_count -= (1 << order); > BUG_ON(p2m->pod.entry_count < 0); > > if ( tb_init_done ) > @@ -1056,20 +1072,24 @@ p2m_pod_demand_populate(struct p2m_domai > __trace_var(TRC_MEM_POD_POPULATE, 0, sizeof(t), &t); > } > > + pod_unlock(p2m); > return 0; > out_of_memory: > - unlock_page_alloc(p2m); > + pod_unlock(p2m); > > printk("%s: Out of populate-on-demand memory! 
tot_pages %" PRIu32 " pod_entries %" PRIi32 "\n", > __func__, d->tot_pages, p2m->pod.entry_count); > domain_crash(d); > out_fail: > + pod_unlock(p2m); > return -1; > remap_and_retry: > BUG_ON(order != PAGE_ORDER_2M); > - unlock_page_alloc(p2m); > + pod_unlock(p2m); > > /* Remap this 2-meg region in singleton chunks */ > + /* NOTE: In a p2m fine-grained lock scenario this might > + * need promoting the gfn lock from gfn->2M superpage */ > gfn_aligned = (gfn>>order)<<order; > for(i=0; i<(1<<order); i++) > set_p2m_entry(p2m, gfn_aligned+i, _mfn(0), PAGE_ORDER_4K, > @@ -1137,9 +1157,11 @@ guest_physmap_mark_populate_on_demand(st > rc = -EINVAL; > else > { > - p2m->pod.entry_count += 1 << order; /* Lock: p2m */ > + pod_lock(p2m); > + p2m->pod.entry_count += 1 << order; > p2m->pod.entry_count -= pod_count; > BUG_ON(p2m->pod.entry_count < 0); > + pod_unlock(p2m); > } > > gfn_unlock(p2m, gfn, order); > diff -r fb9c06df05f2 -r 56ceab0118cb xen/arch/x86/mm/p2m-pt.c > --- a/xen/arch/x86/mm/p2m-pt.c > +++ b/xen/arch/x86/mm/p2m-pt.c > @@ -954,6 +954,7 @@ long p2m_pt_audit_p2m(struct p2m_domain > struct domain *d = p2m->domain; > > ASSERT(p2m_locked_by_me(p2m)); > + ASSERT(pod_locked_by_me(p2m)); > > test_linear = ( (d == current->domain) > && !pagetable_is_null(current->arch.monitor_table) ); > diff -r fb9c06df05f2 -r 56ceab0118cb xen/arch/x86/mm/p2m.c > --- a/xen/arch/x86/mm/p2m.c > +++ b/xen/arch/x86/mm/p2m.c > @@ -72,6 +72,7 @@ boolean_param("hap_2mb", opt_hap_2mb); > static void p2m_initialise(struct domain *d, struct p2m_domain *p2m) > { > mm_lock_init(&p2m->lock); > + mm_lock_init(&p2m->pod.lock); > INIT_LIST_HEAD(&p2m->np2m_list); > INIT_PAGE_LIST_HEAD(&p2m->pages); > INIT_PAGE_LIST_HEAD(&p2m->pod.super); > @@ -587,8 +588,10 @@ guest_physmap_add_entry(struct domain *d > rc = -EINVAL; > else > { > - p2m->pod.entry_count -= pod_count; /* Lock: p2m */ > + pod_lock(p2m); > + p2m->pod.entry_count -= pod_count; > BUG_ON(p2m->pod.entry_count < 0); > + pod_unlock(p2m); > } > } > > @@ -1372,6 +1375,7 @@ p2m_flush_table(struct p2m_domain *p2m) > /* "Host" p2m tables can have shared entries &c that need a bit more > * care when discarding them */ > ASSERT(p2m_is_nestedp2m(p2m)); > + /* Nested p2m''s do not do pod, hence the asserts (and no pod lock)*/ > ASSERT(page_list_empty(&p2m->pod.super)); > ASSERT(page_list_empty(&p2m->pod.single)); > > @@ -1529,6 +1533,7 @@ void audit_p2m(struct domain *d, > P2M_PRINTK("p2m audit starts\n"); > > p2m_lock(p2m); > + pod_lock(p2m); > > if (p2m->audit_p2m) > pmbad = p2m->audit_p2m(p2m); > @@ -1589,6 +1594,7 @@ void audit_p2m(struct domain *d, > } > spin_unlock(&d->page_alloc_lock); > > + pod_unlock(p2m); > p2m_unlock(p2m); > > P2M_PRINTK("p2m audit complete\n"); > diff -r fb9c06df05f2 -r 56ceab0118cb xen/include/asm-x86/p2m.h > --- a/xen/include/asm-x86/p2m.h > +++ b/xen/include/asm-x86/p2m.h > @@ -261,25 +261,12 @@ struct p2m_domain { > unsigned long max_mapped_pfn; > > /* Populate-on-demand variables > - * NB on locking. {super,single,count} are > - * covered by d->page_alloc_lock, since they''re almost always used in > - * conjunction with that functionality. {entry_count} is covered by > - * the domain p2m lock, since it''s almost always used in conjunction > - * with changing the p2m tables. > - * > - * At this point, both locks are held in two places. 
In both, > - * the order is [p2m,page_alloc]: > - * + p2m_pod_decrease_reservation() calls p2m_pod_cache_add(), > - * which grabs page_alloc > - * + p2m_pod_demand_populate() grabs both; the p2m lock to avoid > - * double-demand-populating of pages, the page_alloc lock to > - * protect moving stuff from the PoD cache to the domain page list. > - * > - * We enforce this lock ordering through a construct in mm-locks.h. > - * This demands, however, that we store the previous lock-ordering > - * level in effect before grabbing the page_alloc lock. The unlock > - * level is stored in the arch section of the domain struct. > - */ > + * All variables are protected with the pod lock. We cannot rely on > + * the p2m lock if it''s turned into a fine-grained lock. > + * We only use the domain page_alloc lock for additions and > + * deletions to the domain''s page list. Because we use it nested > + * within the PoD lock, we enforce it''s ordering (by remembering > + * the unlock level in the arch_domain sub struct). */ > struct { > struct page_list_head super, /* List of superpages */ > single; /* Non-super lists */ > @@ -288,6 +275,8 @@ struct p2m_domain { > unsigned reclaim_super; /* Last gpfn of a scan */ > unsigned reclaim_single; /* Last gpfn of a scan */ > unsigned max_guest; /* gpfn of max guest demand-populate */ > + mm_lock_t lock; /* Locking of private pod structs, * > + * not relying on the p2m lock. */ > } pod; > }; >
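A rough sketch of George's trylock suggestion for the emergency sweeps,
assuming a hypothetical gfn_trylock() that would only make sense once the p2m
lock becomes fine-grained; neither that function nor per-gfn granularity
exists in this series.

    /* Sketch: skip any gfn whose lock is contended instead of waiting. */
    static void pod_sweep_range(struct p2m_domain *p2m,
                                unsigned long start, unsigned long limit)
    {
        unsigned long gfn;

        for ( gfn = start; gfn > limit; gfn-- )
        {
            if ( !gfn_trylock(p2m, gfn, 0) )    /* hypothetical API */
                continue;                       /* just move on, as suggested */

            /* ... zero-check this gfn and reclaim it into the PoD cache ... */

            gfn_unlock(p2m, gfn, 0);
        }
    }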
Andres Lagar-Cavilla
2012-Feb-02 19:40 UTC
Re: [PATCH 2 of 5] Clean up locking now that p2m lookups are fully synchronized
> On Wed, 2012-02-01 at 19:56 +0000, Andres Lagar-Cavilla wrote:
>> @@ -1445,7 +1449,7 @@ p2m_get_nestedp2m(struct vcpu *v, uint64
>>              nestedp2m_unlock(d);
>>              return p2m;
>>          }
>> -        p2m_unlock(p2m);
>> +        gfn_unlock(p2m, gfn, 0);
>>      }
>
> It looks like maybe you forgot to change the corresponding p2m_lock() to
> gfn_lock(), here in p2m.c:p2m_get_nestedp2m()?

Good catch! (evil macro argument squashing!)

Thanks
Andres

>
> Other than that, I think the PoD stuff looks fine.  So regarding PoD
> stuff:
> Acked-by: George Dunlap <george.dunlap@eu.citrix.com>
>
>
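The "macro argument squashing" refers to the gfn-grained wrappers discarding
their gfn/order arguments while the p2m lock is still per-domain, which is why
a mismatched p2m_lock()/gfn_unlock() pair still pairs up at runtime. A sketch
of how such wrappers are typically defined; the exact text in mm-locks.h may
differ.

    /* With a single per-domain p2m lock, the per-gfn wrappers ignore gfn/order. */
    #define gfn_lock(p2m, gfn, order)       p2m_lock(p2m)
    #define gfn_unlock(p2m, gfn, order)     p2m_unlock(p2m)
    #define gfn_locked_by_me(p2m, gfn)      p2m_locked_by_me(p2m)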
Andres Lagar-Cavilla
2012-Feb-02 20:03 UTC
Re: [PATCH 3 of 5] Rework locking in the PoD layer
> On Wed, 2012-02-01 at 19:56 +0000, Andres Lagar-Cavilla wrote: >> @@ -114,15 +114,20 @@ p2m_pod_cache_add(struct p2m_domain *p2m >> unmap_domain_page(b); >> } >> >> + /* First, take all pages off the domain list */ >> lock_page_alloc(p2m); >> - >> - /* First, take all pages off the domain list */ >> for(i=0; i < 1 << order ; i++) >> { >> p = page + i; >> page_list_del(p, &d->page_list); >> } >> >> + /* Ensure that the PoD cache has never been emptied. >> + * This may cause "zombie domains" since the page will never be >> freed. */ >> + BUG_ON( d->arch.relmem != RELMEM_not_started ); >> + >> + unlock_page_alloc(p2m); >> + > > This assert should stay where it is. The idea is to verify > after-the-fact that the page_list_add_tail()s *have not* raced with > emptying the PoD cache. Having the assert before cannot guarantee that > they *will not* race with emptying the PoD cache. Alternately, if we''re > positive that condition can never happen again, we should just remove > the BUG_ON().As is, the code in cache_add mutually excludes p2m_pod_empty_cache until pod_lock is released by the caller. So, the page_list_add_tails()s cannot race with emptying the PoD cache. On further look, the BUG_ON can go. relmem moves out of _not_started only after p2m_pod_empty_cache has finished. So, transitively, we''re safe. Emptying the pod cache does the spin barrier. Question: should it not take the lock after the spin barrier, to be super safe? Thanks! Andres> > If I recall correctly, I put this in here because something ended up > calling cache_add after empty_cache(), potentially with the p2m lock > already held; that''s why I put the BUG_ON() there to begin with. > >> @@ -922,6 +929,12 @@ p2m_pod_emergency_sweep(struct p2m_domai >> limit = (start > POD_SWEEP_LIMIT) ? (start - POD_SWEEP_LIMIT) : 0; >> >> /* FIXME: Figure out how to avoid superpages */ >> + /* NOTE: Promote to globally locking the p2m. This will get >> complicated >> + * in a fine-grained scenario. Even if we''re to lock each gfn >> + * individually we must be careful about recursion limits and >> + * POD_SWEEP_STRIDE. This is why we don''t enforce deadlock >> constraints >> + * between p2m and pod locks */ >> + p2m_lock(p2m); >> for ( i=p2m->pod.reclaim_single; i > 0 ; i-- ) >> { >> p2m_access_t a; >> @@ -940,7 +953,7 @@ p2m_pod_emergency_sweep(struct p2m_domai >> /* Stop if we''re past our limit and we have found *something*. >> * >> * NB that this is a zero-sum game; we''re increasing our cache >> size >> - * by re-increasing our ''debt''. Since we hold the p2m lock, >> + * by re-increasing our ''debt''. Since we hold the pod lock, >> * (entry_count - count) must remain the same. */ >> if ( p2m->pod.count > 0 && i < limit ) >> break; >> @@ -949,6 +962,7 @@ p2m_pod_emergency_sweep(struct p2m_domai >> if ( j ) >> p2m_pod_zero_check(p2m, gfns, j); >> >> + p2m_unlock(p2m); >> p2m->pod.reclaim_single = i ? i - 1 : i; >> >> } >> @@ -965,8 +979,9 @@ p2m_pod_demand_populate(struct p2m_domai >> int i; >> >> ASSERT(gfn_locked_by_me(p2m, gfn)); >> + pod_lock(p2m); >> >> - /* This check is done with the p2m lock held. This will make sure >> that >> + /* This check is done with the pod lock held. This will make sure >> that >> * even if d->is_dying changes under our feet, >> p2m_pod_empty_cache() >> * won''t start until we''re done. */ >> if ( unlikely(d->is_dying) ) >> @@ -977,6 +992,7 @@ p2m_pod_demand_populate(struct p2m_domai >> * 1GB region to 2MB chunks for a retry. 
*/ >> if ( order == PAGE_ORDER_1G ) >> { >> + pod_unlock(p2m); >> gfn_aligned = (gfn >> order) << order; >> /* Note that we are supposed to call set_p2m_entry() 512 times >> to >> * split 1GB into 512 2MB pages here. But We only do once here >> because >> @@ -1000,11 +1016,15 @@ p2m_pod_demand_populate(struct p2m_domai >> >> /* If we''re low, start a sweep */ >> if ( order == PAGE_ORDER_2M && page_list_empty(&p2m->pod.super) >> ) >> + /* Note that sweeps scan other ranges in the p2m. In an >> scenario >> + * in which p2m locks are order-enforced wrt pod lock and >> p2m >> + * locks are fine grained, this will result in deadlock >> panic */ >> p2m_pod_emergency_sweep_super(p2m); >> >> if ( page_list_empty(&p2m->pod.single) && >> ( ( order == PAGE_ORDER_4K ) >> || (order == PAGE_ORDER_2M && >> page_list_empty(&p2m->pod.super) ) ) ) >> + /* Same comment regarding deadlock applies */ >> p2m_pod_emergency_sweep(p2m); >> } >> >You know what, these comments are *old*. I really need to cut down on the FUD factor :) As is, there is no risk of deadlock. With a (hypothetical as of today) fine-grained p2m locking approach, I want to leave a "be careful" reminder. And document your trylock suggestion, which I believe to be accurate. Thanks for the detailed look! Andres> Regarding locking on emergency sweeps: I think it should suffice if we > could do the equivalent of a spin_trylock() on each gpfn, and just move > on (not checking that gfn) if it fails. What do you think? > > Other than that, I don''t see anything wrong with this locking at first > glance; but it''s complicated enough that I don''t think I''ve quite > grokked it yet. I''ll look at it again tomorrow and see if things are > more clear. :-) > > -George > > >> @@ -1012,8 +1032,6 @@ p2m_pod_demand_populate(struct p2m_domai >> if ( q == p2m_guest && gfn > p2m->pod.max_guest ) >> p2m->pod.max_guest = gfn; >> >> - lock_page_alloc(p2m); >> - >> if ( p2m->pod.count == 0 ) >> goto out_of_memory; >> >> @@ -1026,8 +1044,6 @@ p2m_pod_demand_populate(struct p2m_domai >> >> BUG_ON((mfn_x(mfn) & ((1 << order)-1)) != 0); >> >> - unlock_page_alloc(p2m); >> - >> gfn_aligned = (gfn >> order) << order; >> >> set_p2m_entry(p2m, gfn_aligned, mfn, order, p2m_ram_rw, >> p2m->default_access); >> @@ -1038,7 +1054,7 @@ p2m_pod_demand_populate(struct p2m_domai >> paging_mark_dirty(d, mfn_x(mfn) + i); >> } >> >> - p2m->pod.entry_count -= (1 << order); /* Lock: p2m */ >> + p2m->pod.entry_count -= (1 << order); >> BUG_ON(p2m->pod.entry_count < 0); >> >> if ( tb_init_done ) >> @@ -1056,20 +1072,24 @@ p2m_pod_demand_populate(struct p2m_domai >> __trace_var(TRC_MEM_POD_POPULATE, 0, sizeof(t), &t); >> } >> >> + pod_unlock(p2m); >> return 0; >> out_of_memory: >> - unlock_page_alloc(p2m); >> + pod_unlock(p2m); >> >> printk("%s: Out of populate-on-demand memory! 
tot_pages %" PRIu32 " >> pod_entries %" PRIi32 "\n", >> __func__, d->tot_pages, p2m->pod.entry_count); >> domain_crash(d); >> out_fail: >> + pod_unlock(p2m); >> return -1; >> remap_and_retry: >> BUG_ON(order != PAGE_ORDER_2M); >> - unlock_page_alloc(p2m); >> + pod_unlock(p2m); >> >> /* Remap this 2-meg region in singleton chunks */ >> + /* NOTE: In a p2m fine-grained lock scenario this might >> + * need promoting the gfn lock from gfn->2M superpage */ >> gfn_aligned = (gfn>>order)<<order; >> for(i=0; i<(1<<order); i++) >> set_p2m_entry(p2m, gfn_aligned+i, _mfn(0), PAGE_ORDER_4K, >> @@ -1137,9 +1157,11 @@ guest_physmap_mark_populate_on_demand(st >> rc = -EINVAL; >> else >> { >> - p2m->pod.entry_count += 1 << order; /* Lock: p2m */ >> + pod_lock(p2m); >> + p2m->pod.entry_count += 1 << order; >> p2m->pod.entry_count -= pod_count; >> BUG_ON(p2m->pod.entry_count < 0); >> + pod_unlock(p2m); >> } >> >> gfn_unlock(p2m, gfn, order); >> diff -r fb9c06df05f2 -r 56ceab0118cb xen/arch/x86/mm/p2m-pt.c >> --- a/xen/arch/x86/mm/p2m-pt.c >> +++ b/xen/arch/x86/mm/p2m-pt.c >> @@ -954,6 +954,7 @@ long p2m_pt_audit_p2m(struct p2m_domain >> struct domain *d = p2m->domain; >> >> ASSERT(p2m_locked_by_me(p2m)); >> + ASSERT(pod_locked_by_me(p2m)); >> >> test_linear = ( (d == current->domain) >> && !pagetable_is_null(current->arch.monitor_table) >> ); >> diff -r fb9c06df05f2 -r 56ceab0118cb xen/arch/x86/mm/p2m.c >> --- a/xen/arch/x86/mm/p2m.c >> +++ b/xen/arch/x86/mm/p2m.c >> @@ -72,6 +72,7 @@ boolean_param("hap_2mb", opt_hap_2mb); >> static void p2m_initialise(struct domain *d, struct p2m_domain *p2m) >> { >> mm_lock_init(&p2m->lock); >> + mm_lock_init(&p2m->pod.lock); >> INIT_LIST_HEAD(&p2m->np2m_list); >> INIT_PAGE_LIST_HEAD(&p2m->pages); >> INIT_PAGE_LIST_HEAD(&p2m->pod.super); >> @@ -587,8 +588,10 @@ guest_physmap_add_entry(struct domain *d >> rc = -EINVAL; >> else >> { >> - p2m->pod.entry_count -= pod_count; /* Lock: p2m */ >> + pod_lock(p2m); >> + p2m->pod.entry_count -= pod_count; >> BUG_ON(p2m->pod.entry_count < 0); >> + pod_unlock(p2m); >> } >> } >> >> @@ -1372,6 +1375,7 @@ p2m_flush_table(struct p2m_domain *p2m) >> /* "Host" p2m tables can have shared entries &c that need a bit >> more >> * care when discarding them */ >> ASSERT(p2m_is_nestedp2m(p2m)); >> + /* Nested p2m''s do not do pod, hence the asserts (and no pod >> lock)*/ >> ASSERT(page_list_empty(&p2m->pod.super)); >> ASSERT(page_list_empty(&p2m->pod.single)); >> >> @@ -1529,6 +1533,7 @@ void audit_p2m(struct domain *d, >> P2M_PRINTK("p2m audit starts\n"); >> >> p2m_lock(p2m); >> + pod_lock(p2m); >> >> if (p2m->audit_p2m) >> pmbad = p2m->audit_p2m(p2m); >> @@ -1589,6 +1594,7 @@ void audit_p2m(struct domain *d, >> } >> spin_unlock(&d->page_alloc_lock); >> >> + pod_unlock(p2m); >> p2m_unlock(p2m); >> >> P2M_PRINTK("p2m audit complete\n"); >> diff -r fb9c06df05f2 -r 56ceab0118cb xen/include/asm-x86/p2m.h >> --- a/xen/include/asm-x86/p2m.h >> +++ b/xen/include/asm-x86/p2m.h >> @@ -261,25 +261,12 @@ struct p2m_domain { >> unsigned long max_mapped_pfn; >> >> /* Populate-on-demand variables >> - * NB on locking. {super,single,count} are >> - * covered by d->page_alloc_lock, since they''re almost always used >> in >> - * conjunction with that functionality. {entry_count} is covered >> by >> - * the domain p2m lock, since it''s almost always used in >> conjunction >> - * with changing the p2m tables. >> - * >> - * At this point, both locks are held in two places. 
In both, >> - * the order is [p2m,page_alloc]: >> - * + p2m_pod_decrease_reservation() calls p2m_pod_cache_add(), >> - * which grabs page_alloc >> - * + p2m_pod_demand_populate() grabs both; the p2m lock to avoid >> - * double-demand-populating of pages, the page_alloc lock to >> - * protect moving stuff from the PoD cache to the domain page >> list. >> - * >> - * We enforce this lock ordering through a construct in mm-locks.h. >> - * This demands, however, that we store the previous lock-ordering >> - * level in effect before grabbing the page_alloc lock. The unlock >> - * level is stored in the arch section of the domain struct. >> - */ >> + * All variables are protected with the pod lock. We cannot rely on >> + * the p2m lock if it''s turned into a fine-grained lock. >> + * We only use the domain page_alloc lock for additions and >> + * deletions to the domain''s page list. Because we use it nested >> + * within the PoD lock, we enforce it''s ordering (by remembering >> + * the unlock level in the arch_domain sub struct). */ >> struct { >> struct page_list_head super, /* List of superpages >> */ >> single; /* Non-super lists >> */ >> @@ -288,6 +275,8 @@ struct p2m_domain { >> unsigned reclaim_super; /* Last gpfn of a scan */ >> unsigned reclaim_single; /* Last gpfn of a scan */ >> unsigned max_guest; /* gpfn of max guest >> demand-populate */ >> + mm_lock_t lock; /* Locking of private pod >> structs, * >> + * not relying on the p2m lock. >> */ >> } pod; >> }; >> > > >