Jan Beulich
2008-Oct-30 12:27 UTC
[Xen-devel] [PATCH] x86: fix preemptable page type handling (v2)
- retain a page reference when PGT_partial is set on a page (and drop
  it when clearing that flag)
- don't drop a page reference never acquired when freeing the page type
  of a page where the allocation of the type got preempted (and never
  completed)
- don't acquire a page reference when allocating the page type of a
  page where freeing the type got preempted (and never completed, and
  hence didn't drop the respective reference)

Signed-off-by: Jan Beulich <jbeulich@novell.com>

Index: 2008-10-27/xen/arch/x86/domain.c
===================================================================
--- 2008-10-27.orig/xen/arch/x86/domain.c	2008-10-29 17:10:41.000000000 +0100
+++ 2008-10-27/xen/arch/x86/domain.c	2008-10-29 17:11:51.000000000 +0100
@@ -1683,18 +1683,24 @@ static int relinquish_memory(
                 break;
             case -EINTR:
                 page->u.inuse.type_info |= PGT_validated;
+                if ( x & PGT_partial )
+                    put_page(page);
                 put_page(page);
                 ret = -EAGAIN;
                 goto out;
             case -EAGAIN:
                 page->u.inuse.type_info |= PGT_partial;
-                put_page(page);
+                if ( x & PGT_partial )
+                    put_page(page);
                 goto out;
             default:
                 BUG();
             }
             if ( x & PGT_partial )
+            {
                 page->u.inuse.type_info--;
+                put_page(page);
+            }
             break;
         }
     }
Index: 2008-10-27/xen/arch/x86/mm.c
===================================================================
--- 2008-10-27.orig/xen/arch/x86/mm.c	2008-10-28 14:53:16.000000000 +0100
+++ 2008-10-27/xen/arch/x86/mm.c	2008-10-29 16:46:52.000000000 +0100
@@ -566,19 +566,21 @@ static int get_page_from_pagenr(unsigned
 static int get_page_and_type_from_pagenr(unsigned long page_nr,
                                          unsigned long type,
                                          struct domain *d,
+                                         int partial,
                                          int preemptible)
 {
     struct page_info *page = mfn_to_page(page_nr);
     int rc;
 
-    if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
+    if ( likely(partial >= 0) &&
+         unlikely(!get_page_from_pagenr(page_nr, d)) )
         return -EINVAL;
 
     rc = (preemptible ?
           get_page_type_preemptible(page, type) :
           (get_page_type(page, type) ? 0 : -EINVAL));
 
-    if ( rc )
+    if ( unlikely(rc) && partial >= 0 )
         put_page(page);
 
     return rc;
@@ -761,7 +763,7 @@ get_page_from_l2e(
     }
 
     rc = get_page_and_type_from_pagenr(
-        l2e_get_pfn(l2e), PGT_l1_page_table, d, 0);
+        l2e_get_pfn(l2e), PGT_l1_page_table, d, 0, 0);
     if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
         rc = 0;
 
@@ -772,7 +774,7 @@ get_page_from_l2e(
 define_get_linear_pagetable(l3);
 static int
 get_page_from_l3e(
-    l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int preemptible)
+    l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial, int preemptible)
 {
     int rc;
 
@@ -786,7 +788,7 @@ get_page_from_l3e(
     }
 
     rc = get_page_and_type_from_pagenr(
-        l3e_get_pfn(l3e), PGT_l2_page_table, d, preemptible);
+        l3e_get_pfn(l3e), PGT_l2_page_table, d, partial, preemptible);
     if ( unlikely(rc == -EINVAL) && get_l3_linear_pagetable(l3e, pfn, d) )
         rc = 0;
 
@@ -797,7 +799,7 @@ get_page_from_l3e(
 define_get_linear_pagetable(l4);
 static int
 get_page_from_l4e(
-    l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int preemptible)
+    l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial, int preemptible)
 {
     int rc;
 
@@ -811,7 +813,7 @@ get_page_from_l4e(
     }
 
     rc = get_page_and_type_from_pagenr(
-        l4e_get_pfn(l4e), PGT_l3_page_table, d, preemptible);
+        l4e_get_pfn(l4e), PGT_l3_page_table, d, partial, preemptible);
     if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) )
         rc = 0;
 
@@ -961,23 +963,32 @@ static int put_page_from_l2e(l2_pgentry_
     return 1;
 }
 
+static int __put_page_type(struct page_info *, int preemptible);
 
 static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
-                             int preemptible)
+                             int partial, int preemptible)
 {
     if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) &&
          (l3e_get_pfn(l3e) != pfn) )
+    {
+        if ( unlikely(partial > 0) )
+            return __put_page_type(l3e_get_page(l3e), preemptible);
         return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible);
+    }
     return 1;
 }
 
 #if CONFIG_PAGING_LEVELS >= 4
 static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
-                             int preemptible)
+                             int partial, int preemptible)
 {
     if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
          (l4e_get_pfn(l4e) != pfn) )
+    {
+        if ( unlikely(partial > 0) )
+            return __put_page_type(l4e_get_page(l4e), preemptible);
         return put_page_and_type_preemptible(l4e_get_page(l4e), preemptible);
+    }
     return 1;
 }
 #endif
@@ -1184,7 +1195,7 @@ static int alloc_l3_table(struct page_in
     unsigned long  pfn = page_to_mfn(page);
     l3_pgentry_t  *pl3e;
     unsigned int   i;
-    int            rc = 0;
+    int            rc = 0, partial = page->partial_pte;
 
 #if CONFIG_PAGING_LEVELS == 3
     /*
@@ -1213,7 +1224,8 @@ static int alloc_l3_table(struct page_in
     if ( is_pv_32on64_domain(d) )
         memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
 
-    for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES; i++ )
+    for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES;
+          i++, partial = 0 )
     {
         if ( is_pv_32bit_domain(d) && (i == 3) )
         {
@@ -1224,16 +1236,17 @@ static int alloc_l3_table(struct page_in
                 rc = get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
                                                    PGT_l2_page_table |
                                                    PGT_pae_xen_l2,
-                                                   d, preemptible);
+                                                   d, partial, preemptible);
         }
         else if ( !is_guest_l3_slot(i) ||
-                  (rc = get_page_from_l3e(pl3e[i], pfn, d, preemptible)) > 0 )
+                  (rc = get_page_from_l3e(pl3e[i], pfn, d,
+                                          partial, preemptible)) > 0 )
             continue;
 
         if ( rc == -EAGAIN )
         {
             page->nr_validated_ptes = i;
-            page->partial_pte = 1;
+            page->partial_pte = partial ?: 1;
         }
         else if ( rc == -EINTR && i )
         {
@@ -1257,7 +1270,7 @@ static int alloc_l3_table(struct page_in
             if ( !is_guest_l3_slot(i) )
                 continue;
             unadjust_guest_l3e(pl3e[i], d);
-            put_page_from_l3e(pl3e[i], pfn, 0);
+            put_page_from_l3e(pl3e[i], pfn, 0, 0);
         }
     }
 
@@ -1272,18 +1285,20 @@ static int alloc_l4_table(struct page_in
     unsigned long  pfn = page_to_mfn(page);
     l4_pgentry_t  *pl4e = page_to_virt(page);
     unsigned int   i;
-    int            rc = 0;
+    int            rc = 0, partial = page->partial_pte;
 
-    for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES; i++ )
+    for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES;
+          i++, partial = 0 )
     {
         if ( !is_guest_l4_slot(d, i) ||
-             (rc = get_page_from_l4e(pl4e[i], pfn, d, preemptible)) > 0 )
+             (rc = get_page_from_l4e(pl4e[i], pfn, d,
+                                     partial, preemptible)) > 0 )
             continue;
 
         if ( rc == -EAGAIN )
         {
             page->nr_validated_ptes = i;
-            page->partial_pte = 1;
+            page->partial_pte = partial ?: 1;
         }
         else if ( rc == -EINTR )
         {
@@ -1299,7 +1314,7 @@ static int alloc_l4_table(struct page_in
         MEM_LOG("Failure in alloc_l4_table: entry %d", i);
         while ( i-- > 0 )
             if ( is_guest_l4_slot(d, i) )
-                put_page_from_l4e(pl4e[i], pfn, 0);
+                put_page_from_l4e(pl4e[i], pfn, 0, 0);
     }
     if ( rc < 0 )
         return rc;
@@ -1377,19 +1392,20 @@ static int free_l3_table(struct page_inf
     struct domain *d = page_get_owner(page);
     unsigned long pfn = page_to_mfn(page);
     l3_pgentry_t *pl3e;
-    unsigned int   i = page->nr_validated_ptes - !page->partial_pte;
-    int rc = 0;
+    int rc = 0, partial = page->partial_pte;
+    unsigned int  i = page->nr_validated_ptes - !partial;
 
     pl3e = map_domain_page(pfn);
 
     do {
         if ( is_guest_l3_slot(i) )
         {
-            rc = put_page_from_l3e(pl3e[i], pfn, preemptible);
+            rc = put_page_from_l3e(pl3e[i], pfn, partial, preemptible);
+            if ( rc < 0 )
+                break;
+            partial = 0;
             if ( rc > 0 )
                 continue;
-            if ( rc )
-                break;
             unadjust_guest_l3e(pl3e[i], d);
         }
     } while ( i-- );
@@ -1399,7 +1415,7 @@ static int free_l3_table(struct page_inf
     if ( rc == -EAGAIN )
     {
         page->nr_validated_ptes = i;
-        page->partial_pte = 1;
+        page->partial_pte = partial ?: -1;
     }
     else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
     {
@@ -1416,18 +1432,21 @@ static int free_l4_table(struct page_inf
     struct domain *d = page_get_owner(page);
     unsigned long pfn = page_to_mfn(page);
     l4_pgentry_t *pl4e = page_to_virt(page);
-    unsigned int   i = page->nr_validated_ptes - !page->partial_pte;
-    int rc = 0;
+    int rc = 0, partial = page->partial_pte;
+    unsigned int  i = page->nr_validated_ptes - !partial;
 
     do {
         if ( is_guest_l4_slot(d, i) )
-            rc = put_page_from_l4e(pl4e[i], pfn, preemptible);
-    } while ( rc >= 0 && i-- );
+            rc = put_page_from_l4e(pl4e[i], pfn, partial, preemptible);
+        if ( rc < 0 )
+            break;
+        partial = 0;
+    } while ( i-- );
 
     if ( rc == -EAGAIN )
     {
         page->nr_validated_ptes = i;
-        page->partial_pte = 1;
+        page->partial_pte = partial ?: -1;
     }
     else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
     {
@@ -1703,7 +1722,7 @@ static int mod_l3_entry(l3_pgentry_t *pl
         return rc ? 0 : -EFAULT;
     }
 
-    rc = get_page_from_l3e(nl3e, pfn, d, preemptible);
+    rc = get_page_from_l3e(nl3e, pfn, d, 0, preemptible);
     if ( unlikely(rc < 0) )
         return page_unlock(l3pg), rc;
     rc = 0;
@@ -1732,7 +1751,7 @@ static int mod_l3_entry(l3_pgentry_t *pl
     }
 
     page_unlock(l3pg);
-    put_page_from_l3e(ol3e, pfn, 0);
+    put_page_from_l3e(ol3e, pfn, 0, 0);
     return rc;
 }
 
@@ -1781,7 +1800,7 @@ static int mod_l4_entry(l4_pgentry_t *pl
         return rc ? 0 : -EFAULT;
     }
 
-    rc = get_page_from_l4e(nl4e, pfn, d, preemptible);
+    rc = get_page_from_l4e(nl4e, pfn, d, 0, preemptible);
     if ( unlikely(rc < 0) )
         return page_unlock(l4pg), rc;
     rc = 0;
@@ -1802,7 +1821,7 @@ static int mod_l4_entry(l4_pgentry_t *pl
     }
 
     page_unlock(l4pg);
-    put_page_from_l4e(ol4e, pfn, 0);
+    put_page_from_l4e(ol4e, pfn, 0, 0);
     return rc;
 }
 
@@ -1866,6 +1885,10 @@ static int alloc_page_type(struct page_i
     struct domain *owner = page_get_owner(page);
     int rc;
 
+    /* Obtain an extra reference to retain if we set PGT_partial. */
+    if ( preemptible && !get_page(page, owner) )
+        return -EINVAL;
+
     /* A page table is dirtied when its type count becomes non-zero. */
     if ( likely(owner != NULL) )
         paging_mark_dirty(owner, page_to_mfn(page));
@@ -1900,8 +1923,13 @@ static int alloc_page_type(struct page_i
     if ( rc == -EAGAIN )
     {
         page->u.inuse.type_info |= PGT_partial;
+        return -EAGAIN;
     }
-    else if ( rc == -EINTR )
+
+    if ( preemptible )
+        put_page(page);
+
+    if ( rc == -EINTR )
     {
         ASSERT((page->u.inuse.type_info &
                 (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
@@ -2020,11 +2048,16 @@ static int __put_page_type_final(struct
     case -EAGAIN:
         wmb();
         page->u.inuse.type_info |= PGT_partial;
+        /* Must skip put_page() below. */
+        preemptible = 0;
         break;
     default:
         BUG();
     }
 
+    if ( preemptible )
+        put_page(page);
+
     return rc;
 }
 
@@ -2034,6 +2067,10 @@ static int __put_page_type(struct page_i
 {
     unsigned long nx, x, y = page->u.inuse.type_info;
 
+    /* Obtain an extra reference to retain if we set PGT_partial. */
+    if ( preemptible && !get_page(page, page_get_owner(page)) )
+        return -EINVAL;
+
     for ( ; ; )
     {
         x  = y;
@@ -2055,6 +2092,8 @@ static int __put_page_type(struct page_i
                 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info,
                                            x, nx)) != x) )
                     continue;
+                if ( x & PGT_partial )
+                    put_page(page);
                 /* We cleared the 'valid bit' so we do the clean up. */
                 return __put_page_type_final(page, x, preemptible);
             }
@@ -2075,9 +2114,16 @@ static int __put_page_type(struct page_i
             break;
 
         if ( preemptible && hypercall_preempt_check() )
+        {
+            if ( preemptible )
+                put_page(page);
             return -EINTR;
+        }
     }
 
+    if ( preemptible )
+        put_page(page);
+
     return 0;
 }
 
@@ -2181,7 +2227,11 @@ static int __get_page_type(struct page_i
         }
 
         if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
+        {
+            if ( (x & PGT_partial) && !(nx & PGT_partial) )
+                put_page(page);
             break;
+        }
 
         if ( preemptible && hypercall_preempt_check() )
             return -EINTR;
@@ -2290,7 +2340,7 @@ int new_guest_cr3(unsigned long mfn)
 #endif
     okay = paging_mode_refcounts(d)
         ? get_page_from_pagenr(mfn, d)
-        : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0);
+        : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0, 0);
     if ( unlikely(!okay) )
     {
         MEM_LOG("Error while installing new baseptr %lx", mfn);
@@ -2534,7 +2584,7 @@ int do_mmuext_op(
             if ( paging_mode_refcounts(FOREIGNDOM) )
                 break;
 
-            rc = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM, 1);
+            rc = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM, 0, 1);
             okay = !rc;
             if ( unlikely(!okay) )
             {
@@ -2615,7 +2665,7 @@ int do_mmuext_op(
                 okay = get_page_from_pagenr(mfn, d);
             else
                 okay = !get_page_and_type_from_pagenr(
-                    mfn, PGT_root_page_table, d, 0);
+                    mfn, PGT_root_page_table, d, 0, 0);
             if ( unlikely(!okay) )
             {
                 MEM_LOG("Error while installing new mfn %lx", mfn);
@@ -2722,7 +2772,7 @@ int do_mmuext_op(
             unsigned char *ptr;
 
             okay = !get_page_and_type_from_pagenr(mfn, PGT_writable_page,
-                                                  FOREIGNDOM, 0);
+                                                  FOREIGNDOM, 0, 0);
             if ( unlikely(!okay) )
             {
                 MEM_LOG("Error while clearing mfn %lx", mfn);
@@ -2755,7 +2805,7 @@ int do_mmuext_op(
             }
 
             okay = !get_page_and_type_from_pagenr(mfn, PGT_writable_page,
-                                                  FOREIGNDOM, 0);
+                                                  FOREIGNDOM, 0, 0);
             if ( unlikely(!okay) )
             {
                 put_page(mfn_to_page(src_mfn));
Index: 2008-10-27/xen/include/asm-x86/mm.h
===================================================================
--- 2008-10-27.orig/xen/include/asm-x86/mm.h	2008-10-29 17:09:06.000000000 +0100
+++ 2008-10-27/xen/include/asm-x86/mm.h	2008-10-30 13:25:44.000000000 +0100
@@ -61,12 +61,36 @@ struct page_info
         /*
          * When PGT_partial is true then this field is valid and indicates
          * that PTEs in the range [0, @nr_validated_ptes) have been validated.
-         * If @partial_pte is true then PTE at @nr_validated_ptes+1 has been
-         * partially validated.
+         * An extra page reference must be acquired (or not dropped) whenever
+         * PGT_partial gets set, and it must be dropped when the flag gets
+         * cleared. This is so that a get() leaving a page in partially
+         * validated state (where the caller would drop the reference acquired
+         * due to the getting of the type [apparently] failing [-EAGAIN])
+         * would not accidentally result in a page left with zero general
+         * reference count, but non-zero type reference count (possible when
+         * the partial get() is followed immediately by domain destruction).
+         * Likewise, the ownership of the single type reference for partially
+         * (in-)validated pages is tied to this flag, i.e. the instance
+         * setting the flag must not drop that reference, whereas the instance
+         * clearing it will have to.
+         *
+         * If @partial_pte is positive then PTE at @nr_validated_ptes+1 has
+         * been partially validated. This implies that the general reference
+         * to the page (acquired from get_page_from_lNe()) would be dropped
+         * (again due to the apparent failure) and hence must be re-acquired
+         * when resuming the validation, but must not be dropped when picking
+         * up the page for invalidation.
+         *
+         * If @partial_pte is negative then PTE at @nr_validated_ptes+1 has
+         * been partially invalidated. This is basically the opposite case of
+         * above, i.e. the general reference to the page was not dropped in
+         * put_page_from_lNe() (due to the apparent failure), and hence it
+         * must be dropped when the put operation is resumed (and completes),
+         * but it must not be acquired if picking up the page for validation.
          */
         struct {
             u16 nr_validated_ptes;
-            bool_t partial_pte;
+            s8 partial_pte;
         };
 
     /*
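One construct in the patch that may be unfamiliar: `partial ?: 1` (and
`partial ?: -1`) uses GCC's conditional expression with the middle operand
omitted — `a ?: b` yields `a` when `a` is non-zero, else `b`, evaluating `a`
only once. A minimal standalone illustration (not Xen code, just the
operator's semantics):

    #include <assert.h>

    int main(void)
    {
        /* GCC extension: x ?: y is equivalent to x ? x : y, with x
         * evaluated only once. The patch uses "partial ?: 1" and
         * "partial ?: -1" to preserve an existing partial_pte state. */
        int partial = -1;
        assert((partial ?: 1) == -1);  /* non-zero: kept as-is */

        partial = 0;
        assert((partial ?: 1) == 1);   /* zero: fall back to 1 */

        return 0;
    }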
Keir Fraser
2008-Oct-30 14:46 UTC
Re: [Xen-devel] [PATCH] x86: fix preemptable page type handling (v2)
Why does this patch not do what the new comment says and directly get_page()
when PGT_partial is set and put_page() when PGT_partial is cleared? Doing
get_page() across all __put_page() operations and then skipping the
put_page() in some cases, for example, seems inefficient and also makes the
code less clear. Why would you do it that way?

 -- Keir

On 30/10/08 12:27, "Jan Beulich" <jbeulich@novell.com> wrote:

> - retain a page reference when PGT_partial is set on a page (and drop
>   it when clearing that flag)
> - don't drop a page reference never acquired when freeing the page type
>   of a page where the allocation of the type got preempted (and never
>   completed)
> - don't acquire a page reference when allocating the page type of a
>   page where freeing the type got preempted (and never completed, and
>   hence didn't drop the respective reference)
>
> Signed-off-by: Jan Beulich <jbeulich@novell.com>
>
> [full patch quoted above snipped]
Jan Beulich
2008-Oct-30 14:56 UTC
Re: [Xen-devel] [PATCH] x86: fix preemptable page type handling (v2)
>>> Keir Fraser <keir.fraser@eu.citrix.com> 30.10.08 15:46 >>>
>Why does this patch not do what the new comment says and directly get_page()
>when PGT_partial is set and put_page() when PGT_partial is cleared? Doing

Hmm, the description was meant to tell the net effect, not the way it's
implemented.

>get_page() across all __put_page() operations and then skipping the
>put_page() in some cases, for example, seems inefficient and also makes the
>code less clear. Why would you do it that way?

Because I can't handle a failure of get_page() (due to count overflow)
once I'm past the put()s.

Jan
Keir Fraser
2008-Oct-30 15:00 UTC
Re: [Xen-devel] [PATCH] x86: fix preemptable page type handling (v2)
On 30/10/08 14:56, "Jan Beulich" <jbeulich@novell.com> wrote:

>> get_page() across all __put_page() operations and then skipping the
>> put_page() in some cases, for example, seems inefficient and also makes the
>> code less clear. Why would you do it that way?
>
> Because I can't handle a failure of get_page() (due to count overflow)
> once I'm past the put()s.

We could make get_page() always leave a few available references, and then
define a new get_page() variant which:
 A. Does not check the owner
 B. Knows that it is incrementing a non-zero count
 C. Is able to use the few references never claimed by get_page().

This would be basically a small cmpxchg() loop. Since it's only executed
before setting PGT_partial we know that only one such special reference is
needed per page and we could just make get_page() leave headroom of one
reference.

Simpler code and faster code?

 -- Keir
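For concreteness, such a variant might look roughly like the sketch below.
This is only an illustration, not what was necessarily committed: the name
get_page_light and the BUG_ON policy are assumptions, it relies on Xen's
struct page_info, PGC_count_mask and cmpxchg() definitions, and it presumes
the regular get_page() is changed to refuse handing out the final count
(leaving the one unit of headroom this function may consume):

    /* Sketch: a get_page() variant per Keir's A/B/C above. It never
     * checks page ownership (A) and must only be used on a page whose
     * general count is already non-zero (B); because regular get_page()
     * leaves one count of headroom, the increment here cannot overflow
     * in a way that needs handling (C), so the function cannot fail. */
    static void get_page_light(struct page_info *page)
    {
        unsigned long x, nx, y = page->count_info;

        do {
            x  = y;
            nx = x + 1;
            BUG_ON(!(x & PGC_count_mask));  /* Not allowed on a zero count. */
            BUG_ON(!(nx & PGC_count_mask)); /* Overflow is disallowed. */
            y = cmpxchg(&page->count_info, x, nx);
        } while ( unlikely(y != x) );
    }

Since only one such reference is ever needed per page (taken just before
setting PGT_partial), reserving a single count in get_page() suffices.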
Jan Beulich
2008-Oct-30 17:03 UTC
Re: [Xen-devel] [PATCH] x86: fix preemptable page type handling (v2)
>>> Keir Fraser <keir.fraser@eu.citrix.com> 30.10.08 16:00 >>>
>We could make get_page() always leave a few available references, and then
>define a new get_page() variant which:
> A. Does not check the owner
> B. Knows that it is incrementing a non-zero count
> C. Is able to use the few references never claimed by get_page().
>
>This would be basically a small cmpxchg() loop. Since it's only executed
>before setting PGT_partial we know that only one such special reference is
>needed per page and we could just make get_page() leave headroom of one
>reference.
>
>Simpler code and faster code?

Yes - admittedly I had expected you to dislike a special casing approach
like this...

Jan
Keir Fraser
2008-Oct-30 17:27 UTC
Re: [Xen-devel] [PATCH] x86: fix preemptable page type handling (v2)
On 30/10/08 17:03, "Jan Beulich" <jbeulich@novell.com> wrote:

>> This would be basically a small cmpxchg() loop. Since it's only executed
>> before setting PGT_partial we know that only one such special reference is
>> needed per page and we could just make get_page() leave headroom of one
>> reference.
>>
>> Simpler code and faster code?
>
> Yes - admittedly I had expected you to dislike a special casing approach
> like this...

I think it will overall make the code clearer.

 -- Keir
Roger Cruz
2008-Oct-30 21:19 UTC
[Xen-devel] Use of the "has_shutdown_code" code in v3.2
We're using the V3.2 of the hypervisor code and, while tracking down a
suspend/resume bug, I came across the following code that caused me to ask
myself whether "has_shutdown_code" should also be cleared when the domain
is resumed. This hypercall will not set a new shutdown reason if the domain
already has the flag set to 1. So, if a domain is suspended and then
resumed, the flag would remain set.

    case SCHEDOP_shutdown_code:
    {
        struct sched_shutdown sched_shutdown;

        ret = -EFAULT;
        if ( copy_from_guest(&sched_shutdown, arg, 1) )
            break;

        ret = 0;
        TRACE_3D(TRC_SCHED_SHUTDOWN_CODE,
                 current->domain->domain_id, current->vcpu_id,
                 sched_shutdown.reason);

        spin_lock(&current->domain->shutdown_lock);
        if ( !current->domain->has_shutdown_code )
        {
            current->domain->shutdown_code = (u8)sched_shutdown.reason;
            current->domain->has_shutdown_code = 1;
        }

So I asked myself, should the domain_resume routine also clear that flag?

    void domain_resume(struct domain *d)
    {
        struct vcpu *v;

        /*
         * Some code paths assume that shutdown status does not get reset
         * under their feet (e.g., some assertions make this assumption).
         */
        domain_pause(d);

        spin_lock(&d->shutdown_lock);

        d->is_shutting_down = d->is_shut_down = 0;
            <---- should it also include: "d->has_shutdown_code = 0"?

        for_each_vcpu ( d, v )
        {
            if ( v->paused_for_shutdown )
                vcpu_unpause(v);
            v->paused_for_shutdown = 0;
        }

        spin_unlock(&d->shutdown_lock);

        domain_unpause(d);
    }
Keir Fraser
2008-Oct-30 22:02 UTC
Re: [Xen-devel] Use of the "has_shutdown_code" code in v3.2
On 30/10/08 21:19, "Roger Cruz" <rcruz@marathontechnologies.com> wrote:

> We're using the V3.2 of the hypervisor code and while tracking down a
> suspend/resume bug, I came across the following code that caused me to
> ask myself whether the "has_shutdown_code" should also be cleared when
> the domain is resumed.

You're musing on a flag which has never been part of mainline Xen. Looks
like you're right though -- a tree which has the has_shutdown_code flag
should clear it on domain resume.

 -- Keir
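For concreteness, the fix being agreed on here amounts to one added line in
the domain_resume() that Roger quotes. A sketch against such a 3.2-era tree
(the flag is out-of-mainline, so the exact surrounding code may differ):

    /* Sketch only: domain_resume() with the out-of-mainline
     * has_shutdown_code flag also reset, per the discussion above. */
    void domain_resume(struct domain *d)
    {
        struct vcpu *v;

        /*
         * Some code paths assume that shutdown status does not get reset
         * under their feet (e.g., some assertions make this assumption).
         */
        domain_pause(d);

        spin_lock(&d->shutdown_lock);

        d->is_shutting_down = d->is_shut_down = 0;
        d->has_shutdown_code = 0;  /* let a later SCHEDOP_shutdown_code
                                      record a fresh reason after resume */

        for_each_vcpu ( d, v )
        {
            if ( v->paused_for_shutdown )
                vcpu_unpause(v);
            v->paused_for_shutdown = 0;
        }

        spin_unlock(&d->shutdown_lock);

        domain_unpause(d);
    }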