Hi Jan, I redid construct_dom0 to how you had wanted, temporarily running PVH also on dom0 page tables, and removing the PVH special copy function. Kindly take a look. Other surrouding patches to boot PVH in dom0 mode are mostly unchanged, so just leaving them out. thanks Mukesh
Mukesh Rathor
2013-Sep-25 21:03 UTC
[RFC 0 PATCH 1/3] PVH dom0: create domctl_memory_mapping() function
In this preparatory patch XEN_DOMCTL_memory_mapping code is put into a function so it can be called later for PVH from construct_dom0. There is no change in it''s functionality. The function is made non-static in the construct_dom0 patch. Note, iomem_access_permitted is not moved to the function because current doesn''t point to dom0 in construct_dom0. Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com> --- xen/arch/x86/domctl.c | 122 ++++++++++++++++++++++++++---------------------- 1 files changed, 66 insertions(+), 56 deletions(-) diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c index e75918a..fe7ca00 100644 --- a/xen/arch/x86/domctl.c +++ b/xen/arch/x86/domctl.c @@ -46,6 +46,71 @@ static int gdbsx_guest_mem_io( return (iop->remain ? -EFAULT : 0); } +static long domctl_memory_mapping(struct domain *d, unsigned long gfn, + unsigned long mfn, unsigned long nr_mfns, + bool_t add) +{ + unsigned long i; + long ret; + + if ( (mfn + nr_mfns - 1) < mfn || /* wrap? */ + ((mfn | (mfn + nr_mfns - 1)) >> (paddr_bits - PAGE_SHIFT)) || + (gfn + nr_mfns - 1) < gfn ) /* wrap? */ + return -EINVAL; + + ret = xsm_iomem_mapping(XSM_HOOK, d, mfn, mfn + nr_mfns - 1, add); + if ( ret ) + return ret; + + if ( add ) + { + printk(XENLOG_G_INFO + "memory_map:add: dom%d gfn=%lx mfn=%lx nr=%lx\n", + d->domain_id, gfn, mfn, nr_mfns); + + ret = iomem_permit_access(d, mfn, mfn + nr_mfns - 1); + if ( !ret && paging_mode_translate(d) ) + { + for ( i = 0; !ret && i < nr_mfns; i++ ) + if ( !set_mmio_p2m_entry(d, gfn + i, _mfn(mfn + i)) ) + ret = -EIO; + if ( ret ) + { + printk(XENLOG_G_WARNING + "memory_map:fail: dom%d gfn=%lx mfn=%lx\n", + d->domain_id, gfn + i, mfn + i); + while ( i-- ) + clear_mmio_p2m_entry(d, gfn + i); + if ( iomem_deny_access(d, mfn, mfn + nr_mfns - 1) && + is_hardware_domain(current->domain) ) + printk(XENLOG_ERR + "memory_map: failed to deny dom%d access to [%lx,%lx]\n", + d->domain_id, mfn, mfn + nr_mfns - 1); + } + } + } + else + { + printk(XENLOG_G_INFO + "memory_map:remove: dom%d gfn=%lx mfn=%lx nr=%lx\n", + d->domain_id, gfn, mfn, nr_mfns); + + if ( paging_mode_translate(d) ) + for ( i = 0; i < nr_mfns; i++ ) + add |= !clear_mmio_p2m_entry(d, gfn + i); + ret = iomem_deny_access(d, mfn, mfn + nr_mfns - 1); + if ( !ret && add ) + ret = -EIO; + if ( ret && is_hardware_domain(current->domain) ) + printk(XENLOG_ERR + "memory_map: error %ld %s dom%d access to [%lx,%lx]\n", + ret, add ? "removing" : "denying", d->domain_id, + mfn, mfn + nr_mfns - 1); + } + + return ret; +} + long arch_do_domctl( struct xen_domctl *domctl, struct domain *d, XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) @@ -625,67 +690,12 @@ long arch_do_domctl( unsigned long mfn = domctl->u.memory_mapping.first_mfn; unsigned long nr_mfns = domctl->u.memory_mapping.nr_mfns; int add = domctl->u.memory_mapping.add_mapping; - unsigned long i; - - ret = -EINVAL; - if ( (mfn + nr_mfns - 1) < mfn || /* wrap? */ - ((mfn | (mfn + nr_mfns - 1)) >> (paddr_bits - PAGE_SHIFT)) || - (gfn + nr_mfns - 1) < gfn ) /* wrap? */ - break; ret = -EPERM; if ( !iomem_access_permitted(current->domain, mfn, mfn + nr_mfns - 1) ) break; - ret = xsm_iomem_mapping(XSM_HOOK, d, mfn, mfn + nr_mfns - 1, add); - if ( ret ) - break; - - if ( add ) - { - printk(XENLOG_G_INFO - "memory_map:add: dom%d gfn=%lx mfn=%lx nr=%lx\n", - d->domain_id, gfn, mfn, nr_mfns); - - ret = iomem_permit_access(d, mfn, mfn + nr_mfns - 1); - if ( !ret && paging_mode_translate(d) ) - { - for ( i = 0; !ret && i < nr_mfns; i++ ) - if ( !set_mmio_p2m_entry(d, gfn + i, _mfn(mfn + i)) ) - ret = -EIO; - if ( ret ) - { - printk(XENLOG_G_WARNING - "memory_map:fail: dom%d gfn=%lx mfn=%lx\n", - d->domain_id, gfn + i, mfn + i); - while ( i-- ) - clear_mmio_p2m_entry(d, gfn + i); - if ( iomem_deny_access(d, mfn, mfn + nr_mfns - 1) && - is_hardware_domain(current->domain) ) - printk(XENLOG_ERR - "memory_map: failed to deny dom%d access to [%lx,%lx]\n", - d->domain_id, mfn, mfn + nr_mfns - 1); - } - } - } - else - { - printk(XENLOG_G_INFO - "memory_map:remove: dom%d gfn=%lx mfn=%lx nr=%lx\n", - d->domain_id, gfn, mfn, nr_mfns); - - if ( paging_mode_translate(d) ) - for ( i = 0; i < nr_mfns; i++ ) - add |= !clear_mmio_p2m_entry(d, gfn + i); - ret = iomem_deny_access(d, mfn, mfn + nr_mfns - 1); - if ( !ret && add ) - ret = -EIO; - if ( ret && is_hardware_domain(current->domain) ) - printk(XENLOG_ERR - "memory_map: error %ld %s dom%d access to [%lx,%lx]\n", - ret, add ? "removing" : "denying", d->domain_id, - mfn, mfn + nr_mfns - 1); - } + ret = domctl_memory_mapping(d, gfn, mfn, nr_mfns, add); } break; -- 1.7.2.3
Mukesh Rathor
2013-Sep-25 21:03 UTC
[RFC 0 PATCH 2/3] PVH dom0: move some pv specific code to static functions
In this preparatory patch also, some pv specific code is carved out into static functions. No functionality change. Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com> --- xen/arch/x86/domain_build.c | 358 +++++++++++++++++++++++------------------- 1 files changed, 196 insertions(+), 162 deletions(-) diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c index 232adf8..5125aa2 100644 --- a/xen/arch/x86/domain_build.c +++ b/xen/arch/x86/domain_build.c @@ -307,6 +307,197 @@ static void __init process_dom0_ioports_disable(void) } } +static __init void mark_pv_pt_pages_rdonly(struct domain *d, + l4_pgentry_t *l4start, + unsigned long vpt_start, + unsigned long nr_pt_pages) +{ + unsigned long count; + struct page_info *page; + l4_pgentry_t *l4tab; + l3_pgentry_t *l3tab, *l3start; + l2_pgentry_t *l2tab, *l2start; + l1_pgentry_t *l1tab, *l1start; + + /* Pages that are part of page tables must be read only. */ + l4tab = l4start + l4_table_offset(vpt_start); + l3start = l3tab = l4e_to_l3e(*l4tab); + l3tab += l3_table_offset(vpt_start); + l2start = l2tab = l3e_to_l2e(*l3tab); + l2tab += l2_table_offset(vpt_start); + l1start = l1tab = l2e_to_l1e(*l2tab); + l1tab += l1_table_offset(vpt_start); + for ( count = 0; count < nr_pt_pages; count++ ) + { + l1e_remove_flags(*l1tab, _PAGE_RW); + page = mfn_to_page(l1e_get_pfn(*l1tab)); + + /* Read-only mapping + PGC_allocated + page-table page. */ + page->count_info = PGC_allocated | 3; + page->u.inuse.type_info |= PGT_validated | 1; + + /* Top-level p.t. is pinned. */ + if ( (page->u.inuse.type_info & PGT_type_mask) =+ (!is_pv_32on64_domain(d) ? + PGT_l4_page_table : PGT_l3_page_table) ) + { + page->count_info += 1; + page->u.inuse.type_info += 1 | PGT_pinned; + } + + /* Iterate. */ + if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) ) + { + if ( !((unsigned long)++l2tab & (PAGE_SIZE - 1)) ) + { + if ( !((unsigned long)++l3tab & (PAGE_SIZE - 1)) ) + l3start = l3tab = l4e_to_l3e(*++l4tab); + l2start = l2tab = l3e_to_l2e(*l3tab); + } + l1start = l1tab = l2e_to_l1e(*l2tab); + } + } +} + +static __init void setup_pv_p2m_table( + struct domain *d, struct vcpu *v, struct elf_dom_parms *parms, + unsigned long v_start, unsigned long vphysmap_start, + unsigned long vphysmap_end, unsigned long v_end, unsigned long nr_pages) +{ + struct page_info *page = NULL; + l4_pgentry_t *l4tab = NULL, *l4start = NULL; + l3_pgentry_t *l3tab = NULL; + l2_pgentry_t *l2tab = NULL; + l1_pgentry_t *l1tab = NULL; + + l4start = map_domain_page(pagetable_get_pfn(v->arch.guest_table)); + l3tab = NULL; + l2tab = NULL; + l1tab = NULL; + + /* Set up the phys->machine table if not part of the initial mapping. */ + if ( parms->p2m_base != UNSET_ADDR ) + { + unsigned long va = vphysmap_start; + + if ( v_start <= vphysmap_end && vphysmap_start <= v_end ) + panic("DOM0 P->M table overlaps initial mapping"); + + while ( va < vphysmap_end ) + { + if ( d->tot_pages + ((round_pgup(vphysmap_end) - va) + >> PAGE_SHIFT) + 3 > nr_pages ) + panic("Dom0 allocation too small for initial P->M table.\n"); + + if ( l1tab ) + { + unmap_domain_page(l1tab); + l1tab = NULL; + } + if ( l2tab ) + { + unmap_domain_page(l2tab); + l2tab = NULL; + } + if ( l3tab ) + { + unmap_domain_page(l3tab); + l3tab = NULL; + } + l4tab = l4start + l4_table_offset(va); + if ( !l4e_get_intpte(*l4tab) ) + { + page = alloc_domheap_page(d, 0); + if ( !page ) + break; + /* No mapping, PGC_allocated + page-table page. */ + page->count_info = PGC_allocated | 2; + page->u.inuse.type_info + PGT_l3_page_table | PGT_validated | 1; + l3tab = __map_domain_page(page); + clear_page(l3tab); + *l4tab = l4e_from_page(page, L4_PROT); + } else + l3tab = map_domain_page(l4e_get_pfn(*l4tab)); + l3tab += l3_table_offset(va); + if ( !l3e_get_intpte(*l3tab) ) + { + if ( cpu_has_page1gb && + !(va & ((1UL << L3_PAGETABLE_SHIFT) - 1)) && + vphysmap_end >= va + (1UL << L3_PAGETABLE_SHIFT) && + (page = alloc_domheap_pages(d, + L3_PAGETABLE_SHIFT - + PAGE_SHIFT, + 0)) != NULL ) + { + *l3tab = l3e_from_page(page, + L1_PROT|_PAGE_DIRTY|_PAGE_PSE); + va += 1UL << L3_PAGETABLE_SHIFT; + continue; + } + if ( (page = alloc_domheap_page(d, 0)) == NULL ) + break; + /* No mapping, PGC_allocated + page-table page. */ + page->count_info = PGC_allocated | 2; + page->u.inuse.type_info + PGT_l2_page_table | PGT_validated | 1; + l2tab = __map_domain_page(page); + clear_page(l2tab); + *l3tab = l3e_from_page(page, L3_PROT); + } + else + l2tab = map_domain_page(l3e_get_pfn(*l3tab)); + l2tab += l2_table_offset(va); + if ( !l2e_get_intpte(*l2tab) ) + { + if ( !(va & ((1UL << L2_PAGETABLE_SHIFT) - 1)) && + vphysmap_end >= va + (1UL << L2_PAGETABLE_SHIFT) && + (page = alloc_domheap_pages(d, + L2_PAGETABLE_SHIFT - + PAGE_SHIFT, + 0)) != NULL ) + { + *l2tab = l2e_from_page(page, + L1_PROT|_PAGE_DIRTY|_PAGE_PSE); + if ( opt_allow_superpage ) + get_superpage(page_to_mfn(page), d); + va += 1UL << L2_PAGETABLE_SHIFT; + continue; + } + if ( (page = alloc_domheap_page(d, 0)) == NULL ) + break; + /* No mapping, PGC_allocated + page-table page. */ + page->count_info = PGC_allocated | 2; + page->u.inuse.type_info + PGT_l1_page_table | PGT_validated | 1; + l1tab = __map_domain_page(page); + clear_page(l1tab); + *l2tab = l2e_from_page(page, L2_PROT); + } + else + l1tab = map_domain_page(l2e_get_pfn(*l2tab)); + l1tab += l1_table_offset(va); + BUG_ON(l1e_get_intpte(*l1tab)); + page = alloc_domheap_page(d, 0); + if ( !page ) + break; + *l1tab = l1e_from_page(page, L1_PROT|_PAGE_DIRTY); + va += PAGE_SIZE; + va &= PAGE_MASK; + } + if ( !page ) + panic("Not enough RAM for DOM0 P->M table.\n"); + } + + if ( l1tab ) + unmap_domain_page(l1tab); + if ( l2tab ) + unmap_domain_page(l2tab); + if ( l3tab ) + unmap_domain_page(l3tab); + unmap_domain_page(l4start); +} + int __init construct_dom0( struct domain *d, const module_t *image, unsigned long image_headroom, @@ -705,44 +896,8 @@ int __init construct_dom0( COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*l2tab)); } - /* Pages that are part of page tables must be read only. */ - l4tab = l4start + l4_table_offset(vpt_start); - l3start = l3tab = l4e_to_l3e(*l4tab); - l3tab += l3_table_offset(vpt_start); - l2start = l2tab = l3e_to_l2e(*l3tab); - l2tab += l2_table_offset(vpt_start); - l1start = l1tab = l2e_to_l1e(*l2tab); - l1tab += l1_table_offset(vpt_start); - for ( count = 0; count < nr_pt_pages; count++ ) - { - l1e_remove_flags(*l1tab, _PAGE_RW); - page = mfn_to_page(l1e_get_pfn(*l1tab)); - - /* Read-only mapping + PGC_allocated + page-table page. */ - page->count_info = PGC_allocated | 3; - page->u.inuse.type_info |= PGT_validated | 1; - - /* Top-level p.t. is pinned. */ - if ( (page->u.inuse.type_info & PGT_type_mask) =- (!is_pv_32on64_domain(d) ? - PGT_l4_page_table : PGT_l3_page_table) ) - { - page->count_info += 1; - page->u.inuse.type_info += 1 | PGT_pinned; - } - - /* Iterate. */ - if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) ) - { - if ( !((unsigned long)++l2tab & (PAGE_SIZE - 1)) ) - { - if ( !((unsigned long)++l3tab & (PAGE_SIZE - 1)) ) - l3start = l3tab = l4e_to_l3e(*++l4tab); - l2start = l2tab = l3e_to_l2e(*l3tab); - } - l1start = l1tab = l2e_to_l1e(*l2tab); - } - } + if ( is_pv_domain(d) ) + mark_pv_pt_pages_rdonly(d, l4start, vpt_start, nr_pt_pages); /* Mask all upcalls... */ for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ ) @@ -814,131 +969,10 @@ int __init construct_dom0( elf_64bit(&elf) ? 64 : 32, parms.pae ? "p" : ""); count = d->tot_pages; - l4start = map_domain_page(pagetable_get_pfn(v->arch.guest_table)); - l3tab = NULL; - l2tab = NULL; - l1tab = NULL; - /* Set up the phys->machine table if not part of the initial mapping. */ - if ( parms.p2m_base != UNSET_ADDR ) - { - unsigned long va = vphysmap_start; - if ( v_start <= vphysmap_end && vphysmap_start <= v_end ) - panic("DOM0 P->M table overlaps initial mapping"); - - while ( va < vphysmap_end ) - { - if ( d->tot_pages + ((round_pgup(vphysmap_end) - va) - >> PAGE_SHIFT) + 3 > nr_pages ) - panic("Dom0 allocation too small for initial P->M table.\n"); - - if ( l1tab ) - { - unmap_domain_page(l1tab); - l1tab = NULL; - } - if ( l2tab ) - { - unmap_domain_page(l2tab); - l2tab = NULL; - } - if ( l3tab ) - { - unmap_domain_page(l3tab); - l3tab = NULL; - } - l4tab = l4start + l4_table_offset(va); - if ( !l4e_get_intpte(*l4tab) ) - { - page = alloc_domheap_page(d, 0); - if ( !page ) - break; - /* No mapping, PGC_allocated + page-table page. */ - page->count_info = PGC_allocated | 2; - page->u.inuse.type_info - PGT_l3_page_table | PGT_validated | 1; - l3tab = __map_domain_page(page); - clear_page(l3tab); - *l4tab = l4e_from_page(page, L4_PROT); - } else - l3tab = map_domain_page(l4e_get_pfn(*l4tab)); - l3tab += l3_table_offset(va); - if ( !l3e_get_intpte(*l3tab) ) - { - if ( cpu_has_page1gb && - !(va & ((1UL << L3_PAGETABLE_SHIFT) - 1)) && - vphysmap_end >= va + (1UL << L3_PAGETABLE_SHIFT) && - (page = alloc_domheap_pages(d, - L3_PAGETABLE_SHIFT - - PAGE_SHIFT, - 0)) != NULL ) - { - *l3tab = l3e_from_page(page, - L1_PROT|_PAGE_DIRTY|_PAGE_PSE); - va += 1UL << L3_PAGETABLE_SHIFT; - continue; - } - if ( (page = alloc_domheap_page(d, 0)) == NULL ) - break; - /* No mapping, PGC_allocated + page-table page. */ - page->count_info = PGC_allocated | 2; - page->u.inuse.type_info - PGT_l2_page_table | PGT_validated | 1; - l2tab = __map_domain_page(page); - clear_page(l2tab); - *l3tab = l3e_from_page(page, L3_PROT); - } - else - l2tab = map_domain_page(l3e_get_pfn(*l3tab)); - l2tab += l2_table_offset(va); - if ( !l2e_get_intpte(*l2tab) ) - { - if ( !(va & ((1UL << L2_PAGETABLE_SHIFT) - 1)) && - vphysmap_end >= va + (1UL << L2_PAGETABLE_SHIFT) && - (page = alloc_domheap_pages(d, - L2_PAGETABLE_SHIFT - - PAGE_SHIFT, - 0)) != NULL ) - { - *l2tab = l2e_from_page(page, - L1_PROT|_PAGE_DIRTY|_PAGE_PSE); - if ( opt_allow_superpage ) - get_superpage(page_to_mfn(page), d); - va += 1UL << L2_PAGETABLE_SHIFT; - continue; - } - if ( (page = alloc_domheap_page(d, 0)) == NULL ) - break; - /* No mapping, PGC_allocated + page-table page. */ - page->count_info = PGC_allocated | 2; - page->u.inuse.type_info - PGT_l1_page_table | PGT_validated | 1; - l1tab = __map_domain_page(page); - clear_page(l1tab); - *l2tab = l2e_from_page(page, L2_PROT); - } - else - l1tab = map_domain_page(l2e_get_pfn(*l2tab)); - l1tab += l1_table_offset(va); - BUG_ON(l1e_get_intpte(*l1tab)); - page = alloc_domheap_page(d, 0); - if ( !page ) - break; - *l1tab = l1e_from_page(page, L1_PROT|_PAGE_DIRTY); - va += PAGE_SIZE; - va &= PAGE_MASK; - } - if ( !page ) - panic("Not enough RAM for DOM0 P->M table.\n"); - } - - if ( l1tab ) - unmap_domain_page(l1tab); - if ( l2tab ) - unmap_domain_page(l2tab); - if ( l3tab ) - unmap_domain_page(l3tab); - unmap_domain_page(l4start); + if ( is_pv_domain(d) ) + setup_pv_p2m_table(d, v, &parms, v_start, vphysmap_start, + vphysmap_end, v_end, nr_pages); /* Write the phys->machine and machine->phys table entries. */ for ( pfn = 0; pfn < count; pfn++ ) -- 1.7.2.3
This patch changes construct_dom0 to boot in PVH mode. Changes need to support it are also included here. Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com> --- xen/arch/x86/domain_build.c | 231 +++++++++++++++++++++++++++++++++++++++---- xen/arch/x86/domctl.c | 2 +- xen/arch/x86/mm/hap/hap.c | 14 +++ xen/include/asm-x86/hap.h | 1 + xen/include/xen/domain.h | 3 + 5 files changed, 231 insertions(+), 20 deletions(-) diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c index 5125aa2..3fd2b6c 100644 --- a/xen/arch/x86/domain_build.c +++ b/xen/arch/x86/domain_build.c @@ -35,6 +35,7 @@ #include <asm/setup.h> #include <asm/bzimage.h> /* for bzimage_parse */ #include <asm/io_apic.h> +#include <asm/hap.h> #include <public/version.h> @@ -307,6 +308,136 @@ static void __init process_dom0_ioports_disable(void) } } +/* + * Set the 1:1 map for all non-RAM regions for dom 0. Thus, dom0 will have + * the entire io region mapped in the EPT/NPT. + * + * PVH FIXME: The following doesn''t map MMIO ranges when they sit above the + * highest E820 covered address. + */ +static __init void pvh_map_all_iomem(struct domain *d) +{ + unsigned long start_pfn, end_pfn, end = 0, start = 0; + const struct e820entry *entry; + unsigned int i, nump; + int rc; + + for ( i = 0, entry = e820.map; i < e820.nr_map; i++, entry++ ) + { + end = entry->addr + entry->size; + + if ( entry->type == E820_RAM || entry->type == E820_UNUSABLE || + i == e820.nr_map - 1 ) + { + start_pfn = PFN_DOWN(start); + end_pfn = PFN_UP(end); + + if ( entry->type == E820_RAM || entry->type == E820_UNUSABLE ) + end_pfn = PFN_UP(entry->addr); + + if ( start_pfn < end_pfn ) + { + nump = end_pfn - start_pfn; + /* Add pages to the mapping */ + rc = domctl_memory_mapping(d, start_pfn, start_pfn, nump, 1); + BUG_ON(rc); + } + start = end; + } + } + + /* If the e820 ended under 4GB, we must map the remaining space upto 4GB */ + if ( end < GB(4) ) + { + start_pfn = PFN_UP(end); + end_pfn = (GB(4)) >> PAGE_SHIFT; + nump = end_pfn - start_pfn; + rc = domctl_memory_mapping(d, start_pfn, start_pfn, nump, 1); + BUG_ON(rc); + } +} + +static __init void dom0_update_physmap(struct domain *d, unsigned long pfn, + unsigned long mfn, unsigned long vphysmap_s) +{ + if ( is_pvh_domain(d) ) + { + int rc = guest_physmap_add_page(d, pfn, mfn, 0); + BUG_ON(rc); + return; + } + if ( !is_pv_32on64_domain(d) ) + ((unsigned long *)vphysmap_s)[pfn] = mfn; + else + ((unsigned int *)vphysmap_s)[pfn] = mfn; + + set_gpfn_from_mfn(mfn, pfn); +} + +static __init void pvh_fixup_page_tables_for_hap(struct vcpu *v, + unsigned long v_start) +{ + int i, j, k; + l4_pgentry_t *l4tab = NULL, *l4start = NULL; + l3_pgentry_t *l3tab = NULL; + l2_pgentry_t *l2tab = NULL; + l1_pgentry_t *l1tab = NULL; + l4_pgentry_t sav_guest_l4; + unsigned long cr3_pfn; + + ASSERT(paging_mode_enabled(v->domain)); + + l4start = map_domain_page(pagetable_get_pfn(v->arch.guest_table)); + l4tab = l4start + l4_table_offset(v_start); + sav_guest_l4 = *l4tab; + + /* Give guest a clean slate to start with */ + clear_page(l4start); + *l4tab = sav_guest_l4; + BUG_ON(!l4e_get_pfn(sav_guest_l4)); + + l3tab = map_l3t_from_l4e(*l4tab); + for (i=0; i < PAGE_SIZE / sizeof(l3_pgentry_t); i++, l3tab++) + { + if ( l3e_get_pfn(*l3tab) == 0 ) + continue; + + l2tab = map_l2t_from_l3e(*l3tab); + for (j=0; j < PAGE_SIZE / sizeof(l2_pgentry_t); j++, l2tab++) + { + if ( l2e_get_pfn(*l2tab) == 0 ) + continue; + + l1tab = map_l1t_from_l2e(*l2tab); + + for (k=0; k < PAGE_SIZE / sizeof(l2_pgentry_t); k++, l1tab++) + { + if ( l1e_get_pfn(*l1tab) == 0 ) + continue; + + *l1tab = l1e_from_pfn(get_gpfn_from_mfn(l1e_get_pfn(*l1tab)), + l1e_get_flags(*l1tab)); + } + *l2tab = l2e_from_pfn(get_gpfn_from_mfn(l2e_get_pfn(*l2tab)), + l2e_get_flags(*l2tab)); + } + *l3tab = l3e_from_pfn(get_gpfn_from_mfn(l3e_get_pfn(*l3tab)), + l3e_get_flags(*l3tab)); + } + *l4tab = l4e_from_pfn(get_gpfn_from_mfn(l4e_get_pfn(*l4tab)), + l4e_get_flags(*l4tab)); + + cr3_pfn = get_gpfn_from_mfn(paddr_to_pfn(v->arch.cr3)); + v->arch.hvm_vcpu.guest_cr[3] = pfn_to_paddr(cr3_pfn); + + /* + * now we update the paging modes (hap_update_paging_modes). This will + * create monitor_table for us, update v->arch.cr3, and vmcs.cr3. + */ + paging_update_paging_modes(v); + +} + static __init void mark_pv_pt_pages_rdonly(struct domain *d, l4_pgentry_t *l4start, unsigned long vpt_start, @@ -513,7 +644,7 @@ int __init construct_dom0( unsigned long alloc_spfn; unsigned long alloc_epfn; unsigned long initrd_pfn = -1, initrd_mfn = 0; - unsigned long count; + unsigned long count, shared_info_paddr = 0; struct page_info *page = NULL; start_info_t *si; struct vcpu *v = d->vcpu[0]; @@ -526,6 +657,7 @@ int __init construct_dom0( l3_pgentry_t *l3tab = NULL, *l3start = NULL; l2_pgentry_t *l2tab = NULL, *l2start = NULL; l1_pgentry_t *l1tab = NULL, *l1start = NULL; + u32 save_pvh_pg_mode = 0; /* * This fully describes the memory layout of the initial domain. All @@ -603,12 +735,20 @@ int __init construct_dom0( goto out; } - if ( parms.elf_notes[XEN_ELFNOTE_SUPPORTED_FEATURES].type != XEN_ENT_NONE && - !test_bit(XENFEAT_dom0, parms.f_supported) ) + if ( parms.elf_notes[XEN_ELFNOTE_SUPPORTED_FEATURES].type != XEN_ENT_NONE ) { - printk("Kernel does not support Dom0 operation\n"); - rc = -EINVAL; - goto out; + if ( !test_bit(XENFEAT_dom0, parms.f_supported) ) + { + printk("Kernel does not support Dom0 operation\n"); + rc = -EINVAL; + goto out; + } + if ( is_pvh_domain(d) && + !test_bit(XENFEAT_hvm_callback_vector, parms.f_supported) ) + { + printk("Kernel does not support PVH mode\n"); + return -EINVAL; + } } if ( compat32 ) @@ -673,6 +813,14 @@ int __init construct_dom0( vstartinfo_end = (vstartinfo_start + sizeof(struct start_info) + sizeof(struct dom0_vga_console_info)); + + if ( is_pvh_domain(d) ) + { + /* note, following is paddr and not maddr */ + shared_info_paddr = round_pgup(vstartinfo_end) - v_start; + vstartinfo_end += PAGE_SIZE; + } + vpt_start = round_pgup(vstartinfo_end); for ( nr_pt_pages = 2; ; nr_pt_pages++ ) { @@ -868,6 +1016,9 @@ int __init construct_dom0( L1_PROT : COMPAT_L1_PROT)); l1tab++; + if ( is_pvh_domain(d) ) + continue; + page = mfn_to_page(mfn); if ( (page->u.inuse.type_info == 0) && !get_page_and_type(page, d, PGT_writable_page) ) @@ -912,6 +1063,16 @@ int __init construct_dom0( (void)alloc_vcpu(d, i, cpu); } + /* + * pvh: we temporarily disable paging mode so that we can build cr3 needed + * to run on dom0''s page tables. + */ + if ( is_pvh_domain(d) ) + { + save_pvh_pg_mode = d->arch.paging.mode; + d->arch.paging.mode = 0; + } + /* Set up CR3 value for write_ptbase */ if ( paging_mode_enabled(d) ) paging_update_paging_modes(v); @@ -974,6 +1135,17 @@ int __init construct_dom0( setup_pv_p2m_table(d, v, &parms, v_start, vphysmap_start, vphysmap_end, v_end, nr_pages); + if ( is_pvh_domain(d) ) + { + hap_set_pvh_alloc_for_dom0(d, nr_pages); + + /* + * We enable paging mode again so guest_physmap_add_page will do the + * right thing for us. + */ + d->arch.paging.mode = save_pvh_pg_mode; + } + /* Write the phys->machine and machine->phys table entries. */ for ( pfn = 0; pfn < count; pfn++ ) { @@ -990,11 +1162,7 @@ int __init construct_dom0( if ( pfn > REVERSE_START && (vinitrd_start || pfn < initrd_pfn) ) mfn = alloc_epfn - (pfn - REVERSE_START); #endif - if ( !is_pv_32on64_domain(d) ) - ((unsigned long *)vphysmap_start)[pfn] = mfn; - else - ((unsigned int *)vphysmap_start)[pfn] = mfn; - set_gpfn_from_mfn(mfn, pfn); + dom0_update_physmap(d, pfn, mfn, vphysmap_start); if (!(pfn & 0xfffff)) process_pending_softirqs(); } @@ -1010,8 +1178,8 @@ int __init construct_dom0( if ( !page->u.inuse.type_info && !get_page_and_type(page, d, PGT_writable_page) ) BUG(); - ((unsigned long *)vphysmap_start)[pfn] = mfn; - set_gpfn_from_mfn(mfn, pfn); + + dom0_update_physmap(d, pfn, mfn, vphysmap_start); ++pfn; if (!(pfn & 0xfffff)) process_pending_softirqs(); @@ -1031,11 +1199,7 @@ int __init construct_dom0( #ifndef NDEBUG #define pfn (nr_pages - 1 - (pfn - (alloc_epfn - alloc_spfn))) #endif - if ( !is_pv_32on64_domain(d) ) - ((unsigned long *)vphysmap_start)[pfn] = mfn; - else - ((unsigned int *)vphysmap_start)[pfn] = mfn; - set_gpfn_from_mfn(mfn, pfn); + dom0_update_physmap(d, pfn, mfn, vphysmap_start); #undef pfn page++; pfn++; if (!(pfn & 0xfffff)) @@ -1059,6 +1223,15 @@ int __init construct_dom0( si->console.dom0.info_size = sizeof(struct dom0_vga_console_info); } + /* + * PVH: We need to update si->shared_info while we are on dom0 page tables, + * but need to defer the p2m update until after we have fixed up the + * page tables for PVH so that the m2p for the si pte entry returns + * correct pfn. + */ + if ( is_pvh_domain(d) ) + si->shared_info = shared_info_paddr; + if ( is_pv_32on64_domain(d) ) xlat_start_info(si, XLAT_start_info_console_dom0); @@ -1089,11 +1262,18 @@ int __init construct_dom0( regs->eip = parms.virt_entry; regs->esp = vstack_end; regs->esi = vstartinfo_start; - regs->eflags = X86_EFLAGS_IF; + regs->eflags = X86_EFLAGS_IF | 0x2; if ( opt_dom0_shadow ) + { + if ( is_pvh_domain(d) ) + { + printk("Invalid option dom0_shadow for PVH\n"); + return -EINVAL; + } if ( paging_enable(d, PG_SH_enable) == 0 ) paging_update_paging_modes(v); + } if ( supervisor_mode_kernel ) { @@ -1183,6 +1363,19 @@ int __init construct_dom0( printk(" Xen warning: dom0 kernel broken ELF: %s\n", elf_check_broken(&elf)); + if ( is_pvh_domain(d) ) + { + /* finally, fixup the page table, replacing mfns with pfns */ + pvh_fixup_page_tables_for_hap(v, v_start); + + /* the pt has correct pfn for si, now update the mfn in the p2m */ + mfn = virt_to_mfn(d->shared_info); + pfn = shared_info_paddr >> PAGE_SHIFT; + dom0_update_physmap(d, pfn, mfn, 0); + + pvh_map_all_iomem(d); + } + iommu_dom0_init(dom0); return 0; diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c index fe7ca00..998827c 100644 --- a/xen/arch/x86/domctl.c +++ b/xen/arch/x86/domctl.c @@ -46,7 +46,7 @@ static int gdbsx_guest_mem_io( return (iop->remain ? -EFAULT : 0); } -static long domctl_memory_mapping(struct domain *d, unsigned long gfn, +long domctl_memory_mapping(struct domain *d, unsigned long gfn, unsigned long mfn, unsigned long nr_mfns, bool_t add) { diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c index bff05d9..2f5a48b 100644 --- a/xen/arch/x86/mm/hap/hap.c +++ b/xen/arch/x86/mm/hap/hap.c @@ -580,6 +580,20 @@ int hap_domctl(struct domain *d, xen_domctl_shadow_op_t *sc, } } +void hap_set_pvh_alloc_for_dom0(struct domain *d, unsigned long num_pages) +{ + int rc; + unsigned long memkb = num_pages * (PAGE_SIZE / 1024); + + /* Copied from: libxl_get_required_shadow_memory() */ + memkb = 4 * (256 * d->max_vcpus + 2 * (memkb / 1024)); + num_pages = ((memkb+1023)/1024) << (20 - PAGE_SHIFT); + paging_lock(d); + rc = hap_set_allocation(d, num_pages, NULL); + paging_unlock(d); + BUG_ON(rc); +} + static const struct paging_mode hap_paging_real_mode; static const struct paging_mode hap_paging_protected_mode; static const struct paging_mode hap_paging_pae_mode; diff --git a/xen/include/asm-x86/hap.h b/xen/include/asm-x86/hap.h index e03f983..aab8558 100644 --- a/xen/include/asm-x86/hap.h +++ b/xen/include/asm-x86/hap.h @@ -63,6 +63,7 @@ int hap_track_dirty_vram(struct domain *d, XEN_GUEST_HANDLE_64(uint8) dirty_bitmap); extern const struct paging_mode *hap_paging_get_mode(struct vcpu *); +void hap_set_pvh_alloc_for_dom0(struct domain *d, unsigned long num_pages); #endif /* XEN_HAP_H */ diff --git a/xen/include/xen/domain.h b/xen/include/xen/domain.h index a057069..6436bab 100644 --- a/xen/include/xen/domain.h +++ b/xen/include/xen/domain.h @@ -89,4 +89,7 @@ extern unsigned int xen_processor_pmbits; extern bool_t opt_dom0_vcpus_pin; +extern long domctl_memory_mapping(struct domain *d, unsigned long gfn, + unsigned long mfn, unsigned long nr_mfns, bool_t add_map); + #endif /* __XEN_DOMAIN_H__ */ -- 1.7.2.3
Jan Beulich
2013-Sep-26 07:03 UTC
Re: [RFC 0 PATCH 1/3] PVH dom0: create domctl_memory_mapping() function
>>> On 25.09.13 at 23:03, Mukesh Rathor <mukesh.rathor@oracle.com> wrote: > +static long domctl_memory_mapping(struct domain *d, unsigned long gfn, > + unsigned long mfn, unsigned long nr_mfns, > + bool_t add) > +{ > + unsigned long i; > + long ret; > + > + if ( (mfn + nr_mfns - 1) < mfn || /* wrap? */ > + ((mfn | (mfn + nr_mfns - 1)) >> (paddr_bits - PAGE_SHIFT)) || > + (gfn + nr_mfns - 1) < gfn ) /* wrap? */ > + return -EINVAL; > + > + ret = xsm_iomem_mapping(XSM_HOOK, d, mfn, mfn + nr_mfns - 1, add); > + if ( ret ) > + return ret;Sigh - I said this more than once before: The order of the checks in the original code _is_ relevant. You can''t move the first and third checks here, but leave the second one there. The way you use it in the later patch, the function is mis-named, and the checks aren''t needed for that other case anyway (maybe the XSM one ought to be done, but that''s the last check, so can be moved here if necessary). Hence just leave them in the caller (and thus retain their proper order). Jan
Jan Beulich
2013-Sep-26 07:21 UTC
Re: [RFC 0 PATCH 2/3] PVH dom0: move some pv specific code to static functions
>>> On 25.09.13 at 23:03, Mukesh Rathor <mukesh.rathor@oracle.com> wrote: > +static __init void setup_pv_p2m_table(Let''s call this setup_pv_physmap() or some such; avoid including p2m in the name in particular.> + struct domain *d, struct vcpu *v, struct elf_dom_parms *parms, > + unsigned long v_start, unsigned long vphysmap_start, > + unsigned long vphysmap_end, unsigned long v_end, unsigned long nr_pages)Parameters could be grouped a little more logically.> +{ > + struct page_info *page = NULL; > + l4_pgentry_t *l4tab = NULL, *l4start = NULL; > + l3_pgentry_t *l3tab = NULL; > + l2_pgentry_t *l2tab = NULL; > + l1_pgentry_t *l1tab = NULL; > + > + l4start = map_domain_page(pagetable_get_pfn(v->arch.guest_table));This appears to be the only use of "v", so it''d be more logical to pass pagetable_get_pfn(v->arch.guest_table) or v->arch.guest_table. Also, with l4start being initialized here, there''s no point in setting it to NULL in its declaration.> + l3tab = NULL; > + l2tab = NULL; > + l1tab = NULL;Their declarations above have initializers, so what''s the point of these?> + > + /* Set up the phys->machine table if not part of the initial mapping. */ > + if ( parms->p2m_base != UNSET_ADDR )This check could now be easily left at the call site, thus reducing indentation of the entire body below. Which would perhaps allow not passing parms to this function at all.> + { > + unsigned long va = vphysmap_start;Now that you would no longer clobber vphysmap_start (in the caller), there''s no need for a separate variable here.> + > + if ( v_start <= vphysmap_end && vphysmap_start <= v_end ) > + panic("DOM0 P->M table overlaps initial mapping"); > + > + while ( va < vphysmap_end ) > + { > + if ( d->tot_pages + ((round_pgup(vphysmap_end) - va) > + >> PAGE_SHIFT) + 3 > nr_pages ) > + panic("Dom0 allocation too small for initial P->M table.\n"); > + > + if ( l1tab ) > + { > + unmap_domain_page(l1tab); > + l1tab = NULL; > + } > + if ( l2tab ) > + { > + unmap_domain_page(l2tab); > + l2tab = NULL; > + } > + if ( l3tab ) > + { > + unmap_domain_page(l3tab); > + l3tab = NULL; > + } > + l4tab = l4start + l4_table_offset(va);The way this l4tab gets used here (and l[123]tab below) makes these names bogus - they should be pl[1234]e instead. The original uses were just attributed to the desire of re-using existing variables, which is mute with this stuff getting moved into its own function. This also applies to the similarly named variables in mark_pv_pt_pages_rdonly(). Jan
>>> On 25.09.13 at 23:03, Mukesh Rathor <mukesh.rathor@oracle.com> wrote: > @@ -307,6 +308,136 @@ static void __init process_dom0_ioports_disable(void) > } > } > > +/* > + * Set the 1:1 map for all non-RAM regions for dom 0. Thus, dom0 will have > + * the entire io region mapped in the EPT/NPT. > + * > + * PVH FIXME: The following doesn''t map MMIO ranges when they sit above the > + * highest E820 covered address.This absolutely needs fixing before this can go in.> + */ > +static __init void pvh_map_all_iomem(struct domain *d) > +{ > + unsigned long start_pfn, end_pfn, end = 0, start = 0; > + const struct e820entry *entry; > + unsigned int i, nump; > + int rc; > + > + for ( i = 0, entry = e820.map; i < e820.nr_map; i++, entry++ ) > + { > + end = entry->addr + entry->size; > + > + if ( entry->type == E820_RAM || entry->type == E820_UNUSABLE ||The inclusion of E820_UNUSABLE here clearly needs an explanation (ideally in the shape of a code comment).> + i == e820.nr_map - 1 ) > + { > + start_pfn = PFN_DOWN(start); > + end_pfn = PFN_UP(end); > + > + if ( entry->type == E820_RAM || entry->type == E820_UNUSABLE ) > + end_pfn = PFN_UP(entry->addr);Better coded as if/else construct (making more obvious what the intention here is - it took me quite some time to see that what you''re doing here is correct, because the general case really is what sits in the if() body, and the exception is what gets done before the if(), which is counter intuitive).> + > + if ( start_pfn < end_pfn ) > + { > + nump = end_pfn - start_pfn; > + /* Add pages to the mapping */ > + rc = domctl_memory_mapping(d, start_pfn, start_pfn, nump, 1); > + BUG_ON(rc); > + } > + start = end; > + } > + } > + > + /* If the e820 ended under 4GB, we must map the remaining space upto 4GB */This is arbitrary (related to the FIXME above).> +static __init void pvh_fixup_page_tables_for_hap(struct vcpu *v, > + unsigned long v_start) > +{ > + int i, j, k; > + l4_pgentry_t *l4tab = NULL, *l4start = NULL; > + l3_pgentry_t *l3tab = NULL; > + l2_pgentry_t *l2tab = NULL; > + l1_pgentry_t *l1tab = NULL; > + l4_pgentry_t sav_guest_l4; > + unsigned long cr3_pfn; > + > + ASSERT(paging_mode_enabled(v->domain)); > + > + l4start = map_domain_page(pagetable_get_pfn(v->arch.guest_table)); > + l4tab = l4start + l4_table_offset(v_start);Many of the comments on patch 2 apply here too.> + sav_guest_l4 = *l4tab; > + > + /* Give guest a clean slate to start with */ > + clear_page(l4start); > + *l4tab = sav_guest_l4; > + BUG_ON(!l4e_get_pfn(sav_guest_l4));This assumes that the initial mapping is covered by a single L4 entry. While true for Linux, this is not generic, and hence needs fixing (the problem continues further down).> + > + l3tab = map_l3t_from_l4e(*l4tab); > + for (i=0; i < PAGE_SIZE / sizeof(l3_pgentry_t); i++, l3tab++)Coding style (not going to make further notes on violations).> + { > + if ( l3e_get_pfn(*l3tab) == 0 )Is that really a good check? Shouldn''t that rather look at _PAGE_PRESENT?> + continue; > + > + l2tab = map_l2t_from_l3e(*l3tab); > + for (j=0; j < PAGE_SIZE / sizeof(l2_pgentry_t); j++, l2tab++) > + { > + if ( l2e_get_pfn(*l2tab) == 0 ) > + continue; > + > + l1tab = map_l1t_from_l2e(*l2tab); > + > + for (k=0; k < PAGE_SIZE / sizeof(l2_pgentry_t); k++, l1tab++)l2_pgentry_t? You be better off using sizeof(*l1tab) here anyway (and similarly in the other loops).> @@ -513,7 +644,7 @@ int __init construct_dom0( > unsigned long alloc_spfn; > unsigned long alloc_epfn; > unsigned long initrd_pfn = -1, initrd_mfn = 0; > - unsigned long count; > + unsigned long count, shared_info_paddr = 0;Even if benign for x86-64, please used paddr_t for physical addresses.> @@ -603,12 +735,20 @@ int __init construct_dom0( > goto out; > } > > - if ( parms.elf_notes[XEN_ELFNOTE_SUPPORTED_FEATURES].type != XEN_ENT_NONE && > - !test_bit(XENFEAT_dom0, parms.f_supported) ) > + if ( parms.elf_notes[XEN_ELFNOTE_SUPPORTED_FEATURES].type != XEN_ENT_NONE ) > { > - printk("Kernel does not support Dom0 operation\n"); > - rc = -EINVAL; > - goto out; > + if ( !test_bit(XENFEAT_dom0, parms.f_supported) ) > + { > + printk("Kernel does not support Dom0 operation\n"); > + rc = -EINVAL; > + goto out; > + } > + if ( is_pvh_domain(d) && > + !test_bit(XENFEAT_hvm_callback_vector, parms.f_supported) ) > + { > + printk("Kernel does not support PVH mode\n"); > + return -EINVAL;Inconsistent: Above you see "goto out", and here you use "return".> @@ -673,6 +813,14 @@ int __init construct_dom0( > vstartinfo_end = (vstartinfo_start + > sizeof(struct start_info) + > sizeof(struct dom0_vga_console_info)); > + > + if ( is_pvh_domain(d) ) > + { > + /* note, following is paddr and not maddr */ > + shared_info_paddr = round_pgup(vstartinfo_end) - v_start;Hardly worth the comment considering the variable name.> @@ -868,6 +1016,9 @@ int __init construct_dom0( > L1_PROT : COMPAT_L1_PROT)); > l1tab++; > > + if ( is_pvh_domain(d) ) > + continue; > + > page = mfn_to_page(mfn); > if ( (page->u.inuse.type_info == 0) && > !get_page_and_type(page, d, PGT_writable_page) )So why is the remaining part of this loop not applicable to PVH?> @@ -912,6 +1063,16 @@ int __init construct_dom0( > (void)alloc_vcpu(d, i, cpu); > } > > + /* > + * pvh: we temporarily disable paging mode so that we can build cr3 needed > + * to run on dom0''s page tables. > + */ > + if ( is_pvh_domain(d) ) > + { > + save_pvh_pg_mode = d->arch.paging.mode; > + d->arch.paging.mode = 0; > + }Does this really need to be in a conditional?> @@ -1089,11 +1262,18 @@ int __init construct_dom0( > regs->eip = parms.virt_entry; > regs->esp = vstack_end; > regs->esi = vstartinfo_start; > - regs->eflags = X86_EFLAGS_IF; > + regs->eflags = X86_EFLAGS_IF | 0x2;Unrelated change?> +void hap_set_pvh_alloc_for_dom0(struct domain *d, unsigned long num_pages)__init Jan
Mukesh Rathor
2013-Sep-26 23:32 UTC
Re: [RFC 0 PATCH 2/3] PVH dom0: move some pv specific code to static functions
On Thu, 26 Sep 2013 08:21:37 +0100 "Jan Beulich" <JBeulich@suse.com> wrote:> >>> On 25.09.13 at 23:03, Mukesh Rathor <mukesh.rathor@oracle.com> > >>> wrote: > > +static __init void setup_pv_p2m_table( >...> > + if ( l3tab ) > > + { > > + unmap_domain_page(l3tab); > > + l3tab = NULL; > > + } > > + l4tab = l4start + l4_table_offset(va); > > The way this l4tab gets used here (and l[123]tab below) makes > these names bogus - they should be pl[1234]e instead. The > original uses were just attributed to the desire of re-using > existing variables, which is mute with this stuff getting moved > into its own function. > > This also applies to the similarly named variables in > mark_pv_pt_pages_rdonly().Ok, all done. Also, noticed the exhisting code for mark_pv_pt_pages_rdonly unnecessarily sets l[321]start in two places. thanks mukesh
Mukesh Rathor
2013-Sep-27 00:17 UTC
Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
On Thu, 26 Sep 2013 09:02:41 +0100 "Jan Beulich" <JBeulich@suse.com> wrote:> >>> On 25.09.13 at 23:03, Mukesh Rathor <mukesh.rathor@oracle.com> > >>> wrote: > > @@ -307,6 +308,136 @@ static void __init > > process_dom0_ioports_disable(void) } > > } > > > > +/* > > + * Set the 1:1 map for all non-RAM regions for dom 0. Thus, dom0 > > will have > > + * the entire io region mapped in the EPT/NPT. > > + * > > + * PVH FIXME: The following doesn''t map MMIO ranges when they sit > > above the > > + * highest E820 covered address. > > This absolutely needs fixing before this can go in.Any suggestions on how to fix it? Mapping all the way to end could result in a huge hap table.> > + const struct e820entry *entry; > > + unsigned int i, nump; > > + int rc; > > + > > + for ( i = 0, entry = e820.map; i < e820.nr_map; i++, entry++ ) > > + { > > + end = entry->addr + entry->size; > > + > > + if ( entry->type == E820_RAM || entry->type => > E820_UNUSABLE || > > The inclusion of E820_UNUSABLE here clearly needs an explanation > (ideally in the shape of a code comment). > > > + i == e820.nr_map - 1 ) > > + { > > + start_pfn = PFN_DOWN(start); > > + end_pfn = PFN_UP(end); > > + > > + if ( entry->type == E820_RAM || entry->type => > E820_UNUSABLE ) > > + end_pfn = PFN_UP(entry->addr); > > Better coded as if/else construct (making more obvious what the > intention here is - it took me quite some time to see that what > you''re doing here is correct, because the general case really is > what sits in the if() body, and the exception is what gets done > before the if(), which is counter intuitive).Based on xen_set_identity_and_release linux function. Took me a while to figure that one too. So, I tried simplifying it, but in the end came back to this. Not sure where I would the else. thanks Mukesh
Mukesh Rathor
2013-Sep-27 01:55 UTC
Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
On Thu, 26 Sep 2013 09:02:41 +0100 "Jan Beulich" <JBeulich@suse.com> wrote:> >>> On 25.09.13 at 23:03, Mukesh Rathor <mukesh.rathor@oracle.com> > >>> wrote: > > @@ -307,6 +308,136 @@ static void __init.....>> > > + sav_guest_l4 = *l4tab; > > + > > + /* Give guest a clean slate to start with */ > > + clear_page(l4start); > > + *l4tab = sav_guest_l4; > > + BUG_ON(!l4e_get_pfn(sav_guest_l4)); > > This assumes that the initial mapping is covered by a single L4 > entry. While true for Linux, this is not generic, and hence needs > fixing (the problem continues further down).True. How about I just create the opposite of init_guest_l4_table, ie, clear_guest_l4_table and just clear out xen entries?> > > + { > > + if ( l3e_get_pfn(*l3tab) == 0 ) > > Is that really a good check? Shouldn''t that rather look at > _PAGE_PRESENT?Ok, I will change that.> Hardly worth the comment considering the variable name. > > > @@ -868,6 +1016,9 @@ int __init construct_dom0( > > L1_PROT : COMPAT_L1_PROT)); > > l1tab++; > > > > + if ( is_pvh_domain(d) ) > > + continue; > > + > > page = mfn_to_page(mfn); > > if ( (page->u.inuse.type_info == 0) && > > !get_page_and_type(page, d, PGT_writable_page) ) > > So why is the remaining part of this loop not applicable to PVH?My bad, looks like it should be. I''ll remove it. BTW, looking at it again I realized we don''t really need to set type_info to PGT* for PVH, but it''s harmless I guess. Should I just leave it or condition them for PV only?> > (void)alloc_vcpu(d, i, cpu); > > } > > > > + /* > > + * pvh: we temporarily disable paging mode so that we can > > build cr3 needed > > + * to run on dom0''s page tables. > > + */ > > + if ( is_pvh_domain(d) ) > > + { > > + save_pvh_pg_mode = d->arch.paging.mode; > > + d->arch.paging.mode = 0; > > + } > > Does this really need to be in a conditional?Not really, wasn''t sure what you''d prefer. I''ll remove conditions.> > @@ -1089,11 +1262,18 @@ int __init construct_dom0( > > regs->eip = parms.virt_entry; > > regs->esp = vstack_end; > > regs->esi = vstartinfo_start; > > - regs->eflags = X86_EFLAGS_IF; > > + regs->eflags = X86_EFLAGS_IF | 0x2; > > Unrelated change?Nop, we need to make sure the resvd bit is set in eflags otherwise it won''t vmenter (invalid guest state). Should be harmless for PV, right? Not sure where it does it for PV before actually scheduling it.. thanks Mukesh
>>> On 27.09.13 at 02:17, Mukesh Rathor <mukesh.rathor@oracle.com> wrote: > On Thu, 26 Sep 2013 09:02:41 +0100 "Jan Beulich" <JBeulich@suse.com> wrote: >> >>> On 25.09.13 at 23:03, Mukesh Rathor <mukesh.rathor@oracle.com> wrote: >> > +/* >> > + * Set the 1:1 map for all non-RAM regions for dom 0. Thus, dom0 will have >> > + * the entire io region mapped in the EPT/NPT. >> > + * >> > + * PVH FIXME: The following doesn''t map MMIO ranges when they sit above the >> > + * highest E820 covered address. >> >> This absolutely needs fixing before this can go in. > > Any suggestions on how to fix it? Mapping all the way to end could > result in a huge hap table.You''ll probably need a call down from Dom0 telling you where it finds/puts MMIO resources. Or perhaps that could be mapped in on demand from the EPT fault handler (since these regions shouldn''t be subject to DMA, and hence IOMMU faults shouldn''t occur - perhaps that''s even a reason to not share page tables at least in dom0-strict mode)?>> > + const struct e820entry *entry; >> > + unsigned int i, nump; >> > + int rc; >> > + >> > + for ( i = 0, entry = e820.map; i < e820.nr_map; i++, entry++ ) >> > + { >> > + end = entry->addr + entry->size; >> > + >> > + if ( entry->type == E820_RAM || entry->type == E820_UNUSABLE || >> >> The inclusion of E820_UNUSABLE here clearly needs an explanation >> (ideally in the shape of a code comment). >> >> > + i == e820.nr_map - 1 ) >> > + { >> > + start_pfn = PFN_DOWN(start); >> > + end_pfn = PFN_UP(end); >> > + >> > + if ( entry->type == E820_RAM || entry->type == E820_UNUSABLE ) >> > + end_pfn = PFN_UP(entry->addr); >> >> Better coded as if/else construct (making more obvious what the >> intention here is - it took me quite some time to see that what >> you''re doing here is correct, because the general case really is >> what sits in the if() body, and the exception is what gets done >> before the if(), which is counter intuitive). > > Based on xen_set_identity_and_release linux function. Took me a while > to figure that one too. So, I tried simplifying it, but in the end > came back to this. Not sure where I would the else.if ( entry->type != E820_RAM && entry->type != E820_UNUSABLE ) end_pfn = PFN_UP(end); else end_pfn = PFN_UP(entry->addr); Jan
>>> On 27.09.13 at 03:55, Mukesh Rathor <mukesh.rathor@oracle.com> wrote: > On Thu, 26 Sep 2013 09:02:41 +0100 "Jan Beulich" <JBeulich@suse.com> wrote: >> >>> On 25.09.13 at 23:03, Mukesh Rathor <mukesh.rathor@oracle.com> wrote: >> > + sav_guest_l4 = *l4tab; >> > + >> > + /* Give guest a clean slate to start with */ >> > + clear_page(l4start); >> > + *l4tab = sav_guest_l4; >> > + BUG_ON(!l4e_get_pfn(sav_guest_l4)); >> >> This assumes that the initial mapping is covered by a single L4 >> entry. While true for Linux, this is not generic, and hence needs >> fixing (the problem continues further down). > > True. How about I just create the opposite of init_guest_l4_table, > ie, clear_guest_l4_table and just clear out xen entries?I''d prefer you clearing everything below and above the range you need mapped, instead of using clear_page().>> > @@ -868,6 +1016,9 @@ int __init construct_dom0( >> > L1_PROT : COMPAT_L1_PROT)); >> > l1tab++; >> > >> > + if ( is_pvh_domain(d) ) >> > + continue; >> > + >> > page = mfn_to_page(mfn); >> > if ( (page->u.inuse.type_info == 0) && >> > !get_page_and_type(page, d, PGT_writable_page) ) >> >> So why is the remaining part of this loop not applicable to PVH? > > My bad, looks like it should be. I''ll remove it. BTW, looking at it again > I realized we don''t really need to set type_info to PGT* for PVH, but it''s > harmless I guess. Should I just leave it or condition them for PV only?No, I''m pretty certain you want them marked writable. If nothing else then for forward compatibility with an eventual change needing to mark certain pages R/O. But I could also imaging the page sharing code to look at this attribute of a page (but I say this without knowing that code at all).>> > @@ -1089,11 +1262,18 @@ int __init construct_dom0( >> > regs->eip = parms.virt_entry; >> > regs->esp = vstack_end; >> > regs->esi = vstartinfo_start; >> > - regs->eflags = X86_EFLAGS_IF; >> > + regs->eflags = X86_EFLAGS_IF | 0x2; >> >> Unrelated change? > > Nop, we need to make sure the resvd bit is set in eflags otherwise > it won''t vmenter (invalid guest state). Should be harmless for PV, > right? Not sure where it does it for PV before actually scheduling it..PV doesn''t set this anywhere - the hardware doesn''t allow the flag to be cleared (writes are ignored). If VMENTER is picky about this, the GUEST_RFLAGS write at the end of vmx_vmenter_helper() should be doing this instead of having to do it here (and obviously in some other place for DomU creation). Jan
Mukesh Rathor
2013-Sep-27 23:03 UTC
Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
On Fri, 27 Sep 2013 08:01:16 +0100 "Jan Beulich" <JBeulich@suse.com> wrote:> >>> On 27.09.13 at 03:55, Mukesh Rathor <mukesh.rathor@oracle.com> > >>> wrote: > > On Thu, 26 Sep 2013 09:02:41 +0100 "Jan Beulich" > > <JBeulich@suse.com> wrote: > >> > @@ -868,6 +1016,9 @@ int __init construct_dom0( > >> > L1_PROT : COMPAT_L1_PROT)); > >> > l1tab++; > >> > > >> > + if ( is_pvh_domain(d) ) > >> > + continue; > >> > + > >> > page = mfn_to_page(mfn); > >> > if ( (page->u.inuse.type_info == 0) && > >> > !get_page_and_type(page, d, PGT_writable_page) ) > >> > >> So why is the remaining part of this loop not applicable to PVH? > > > > My bad, looks like it should be. I''ll remove it. BTW, looking at it > > again I realized we don''t really need to set type_info to PGT* for > > PVH, but it''s harmless I guess. Should I just leave it or condition > > them for PV only? > > No, I''m pretty certain you want them marked writable. If nothing > else then for forward compatibility with an eventual change > needing to mark certain pages R/O. But I could also imaging the > page sharing code to look at this attribute of a page (but I say > this without knowing that code at all).Sorry, I meant PGT_l*_page_table settings for type_info. Yes, we do need the PGT_writable_page settings.> >> > @@ -1089,11 +1262,18 @@ int __init construct_dom0( > >> > regs->eip = parms.virt_entry; > >> > regs->esp = vstack_end; > >> > regs->esi = vstartinfo_start; > >> > - regs->eflags = X86_EFLAGS_IF; > >> > + regs->eflags = X86_EFLAGS_IF | 0x2; > >> > >> Unrelated change? > > > > Nop, we need to make sure the resvd bit is set in eflags otherwise > > it won''t vmenter (invalid guest state). Should be harmless for PV, > > right? Not sure where it does it for PV before actually scheduling > > it.. > > PV doesn''t set this anywhere - the hardware doesn''t allow the > flag to be cleared (writes are ignored). If VMENTER is picky > about this, the GUEST_RFLAGS write at the end of > vmx_vmenter_helper() should be doing this instead of having to > do it here (and obviously in some other place for DomU creation).For domU we set it in arch_set_info_guest. vmx_vmenter_helper gets called on every vmentry, we just need this setting once. So I think this is the best place. Do you want me to if it: regs->eflags = X86_EFLAGS_IF; if ( pvh ) regs->eflags |= 0x2. thanks Mukesh
>>> On 28.09.13 at 01:03, Mukesh Rathor <mukesh.rathor@oracle.com> wrote: > On Fri, 27 Sep 2013 08:01:16 +0100 > "Jan Beulich" <JBeulich@suse.com> wrote: > >> >>> On 27.09.13 at 03:55, Mukesh Rathor <mukesh.rathor@oracle.com> >> >>> wrote: >> > On Thu, 26 Sep 2013 09:02:41 +0100 "Jan Beulich" >> > <JBeulich@suse.com> wrote: >> >> > @@ -868,6 +1016,9 @@ int __init construct_dom0( >> >> > L1_PROT : COMPAT_L1_PROT)); >> >> > l1tab++; >> >> > >> >> > + if ( is_pvh_domain(d) ) >> >> > + continue; >> >> > + >> >> > page = mfn_to_page(mfn); >> >> > if ( (page->u.inuse.type_info == 0) && >> >> > !get_page_and_type(page, d, PGT_writable_page) ) >> >> >> >> So why is the remaining part of this loop not applicable to PVH? >> > >> > My bad, looks like it should be. I''ll remove it. BTW, looking at it >> > again I realized we don''t really need to set type_info to PGT* for >> > PVH, but it''s harmless I guess. Should I just leave it or condition >> > them for PV only? >> >> No, I''m pretty certain you want them marked writable. If nothing >> else then for forward compatibility with an eventual change >> needing to mark certain pages R/O. But I could also imaging the >> page sharing code to look at this attribute of a page (but I say >> this without knowing that code at all). > > Sorry, I meant PGT_l*_page_table settings for type_info. Yes, we do > need the PGT_writable_page settings. > >> >> > @@ -1089,11 +1262,18 @@ int __init construct_dom0( >> >> > regs->eip = parms.virt_entry; >> >> > regs->esp = vstack_end; >> >> > regs->esi = vstartinfo_start; >> >> > - regs->eflags = X86_EFLAGS_IF; >> >> > + regs->eflags = X86_EFLAGS_IF | 0x2; >> >> >> >> Unrelated change? >> > >> > Nop, we need to make sure the resvd bit is set in eflags otherwise >> > it won''t vmenter (invalid guest state). Should be harmless for PV, >> > right? Not sure where it does it for PV before actually scheduling >> > it.. >> >> PV doesn''t set this anywhere - the hardware doesn''t allow the >> flag to be cleared (writes are ignored). If VMENTER is picky >> about this, the GUEST_RFLAGS write at the end of >> vmx_vmenter_helper() should be doing this instead of having to >> do it here (and obviously in some other place for DomU creation). > > For domU we set it in arch_set_info_guest.Which is bogus too. 15910:ec3b23d8d544 ("hvm: Always keep canonical copy of RIP/RSP/RFLAGS in guest_cpu_user_regs()") did this adjustment without really explaining why it can''t be done centrally in just the two places copying regs->eflags into the VMCS/VMCB spot.> vmx_vmenter_helper gets > called on every vmentry, we just need this setting once.Would a debugger update guest state via arch_set_info_guest()? I doubt it. It would imo be a desirable up front cleanup patch to move this bogus thing out of arch_set_info_guest() into vmx_vmenter_helper() (and whatever SVM equivalent, should SVM too be incapable of dealing with the flag being clear). See how e.g. hvm_load_cpu_ctxt() already sets the flag? It''s really like being done almost at random... The only place where it gets legitimately enforced outside of the vmx_vmenter_helper() is in the x86 emulator code. And if we''d have such a cleanup patch, doing away with the literal 2 in favor of a proper symbolic (e.g. X86_EFLAGS_MBS) should probably be done at once.> So I think this is the best place. Do you want me to if it: > > regs->eflags = X86_EFLAGS_IF; > if ( pvh ) > regs->eflags |= 0x2.No, that would be pointless. Jan
Mukesh Rathor
2013-Oct-03 00:53 UTC
Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
On Fri, 27 Sep 2013 07:54:39 +0100 "Jan Beulich" <JBeulich@suse.com> wrote:> >>> On 27.09.13 at 02:17, Mukesh Rathor <mukesh.rathor@oracle.com> > >>> wrote: > > On Thu, 26 Sep 2013 09:02:41 +0100 "Jan Beulich" > > <JBeulich@suse.com> wrote: > >> >>> On 25.09.13 at 23:03, Mukesh Rathor <mukesh.rathor@oracle.com> > >> >>> wrote: > >> > +/* > >> > + * Set the 1:1 map for all non-RAM regions for dom 0. Thus, > >> > dom0 will have > >> > + * the entire io region mapped in the EPT/NPT. > >> > + * > >> > + * PVH FIXME: The following doesn''t map MMIO ranges when they > >> > sit above the > >> > + * highest E820 covered address. > >> > >> This absolutely needs fixing before this can go in. > > > > Any suggestions on how to fix it? Mapping all the way to end could > > result in a huge hap table. > > You''ll probably need a call down from Dom0 telling you where it > finds/puts MMIO resources. Or perhaps that could be mapped > in on demand from the EPT fault handler (since these regions > shouldn''t be subject to DMA, and hence IOMMU faults shouldn''t > occur - perhaps that''s even a reason to not share page tables > at least in dom0-strict mode)?Thinking about mapping in on demand from the EPT fault handler, how would I know if the access beyond last e820 entry is genuine and not a faulty pte in a buggy guest? Could I consult the mmconfig table (?) or the ACPI table in xen? Any pointers would be helpful... my knowledge runs out quickly here. FWIW, at present pv-ops linux doesn''t allow any mmio access beyond the last e820 entry. So, we''d need a fix there too. In my very orig patch, I was updating all IO mappings on demand by putting hook in linux native_pte_update if it was _PAGE_BIT_IOMAP. Another possibility would be do that for any mappings above the last e820 entry. What do you think? For testing purposes, do you have reference for hardware? I don''t see any here with such configuration. thanks mukesh
>>> On 03.10.13 at 02:53, Mukesh Rathor <mukesh.rathor@oracle.com> wrote: > On Fri, 27 Sep 2013 07:54:39 +0100 > "Jan Beulich" <JBeulich@suse.com> wrote: > >> >>> On 27.09.13 at 02:17, Mukesh Rathor <mukesh.rathor@oracle.com> >> >>> wrote: >> > On Thu, 26 Sep 2013 09:02:41 +0100 "Jan Beulich" >> > <JBeulich@suse.com> wrote: >> >> >>> On 25.09.13 at 23:03, Mukesh Rathor <mukesh.rathor@oracle.com> >> >> >>> wrote: >> >> > +/* >> >> > + * Set the 1:1 map for all non-RAM regions for dom 0. Thus, >> >> > dom0 will have >> >> > + * the entire io region mapped in the EPT/NPT. >> >> > + * >> >> > + * PVH FIXME: The following doesn''t map MMIO ranges when they >> >> > sit above the >> >> > + * highest E820 covered address. >> >> >> >> This absolutely needs fixing before this can go in. >> > >> > Any suggestions on how to fix it? Mapping all the way to end could >> > result in a huge hap table. >> >> You''ll probably need a call down from Dom0 telling you where it >> finds/puts MMIO resources. Or perhaps that could be mapped >> in on demand from the EPT fault handler (since these regions >> shouldn''t be subject to DMA, and hence IOMMU faults shouldn''t >> occur - perhaps that''s even a reason to not share page tables >> at least in dom0-strict mode)? > > Thinking about mapping in on demand from the EPT fault handler, how > would I know if the access beyond last e820 entry is genuine and not > a faulty pte in a buggy guest? Could I consult the mmconfig table (?) > or the ACPI table in xen? Any pointers would be helpful... my > knowledge runs out quickly here.You''d have to inspect all the BARs of the devices the domain owns. Hence the thought of having Dom0 tell you about those resource assignments.> FWIW, at present pv-ops linux doesn''t allow any mmio access beyond > the last e820 entry. So, we''d need a fix there too. In my very orig > patch, I was updating all IO mappings on demand by putting hook > in linux native_pte_update if it was _PAGE_BIT_IOMAP. Another > possibility would be do that for any mappings above the last > e820 entry. What do you think?Special casing IOMAP page table creation might be an option, but has the downside of allowing kernel bugs to propagate into Xen''s view of the world.> For testing purposes, do you have reference for hardware? I don''t see > any here with such configuration.Nothing specific, but I know that SR-IOV virtual functions easily cause kernels to run out of MMIO space below 4G (namely when the hole is only around 1Gb or even less), and Intel must have knowledge of graphics cards having so huge a frame buffer that it can only be mapped above 4G. Jan
Konrad Rzeszutek Wilk
2013-Oct-04 13:35 UTC
Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
On Fri, Oct 04, 2013 at 07:53:20AM +0100, Jan Beulich wrote:> >>> On 03.10.13 at 02:53, Mukesh Rathor <mukesh.rathor@oracle.com> wrote: > > On Fri, 27 Sep 2013 07:54:39 +0100 > > "Jan Beulich" <JBeulich@suse.com> wrote: > > > >> >>> On 27.09.13 at 02:17, Mukesh Rathor <mukesh.rathor@oracle.com> > >> >>> wrote: > >> > On Thu, 26 Sep 2013 09:02:41 +0100 "Jan Beulich" > >> > <JBeulich@suse.com> wrote: > >> >> >>> On 25.09.13 at 23:03, Mukesh Rathor <mukesh.rathor@oracle.com> > >> >> >>> wrote: > >> >> > +/* > >> >> > + * Set the 1:1 map for all non-RAM regions for dom 0. Thus, > >> >> > dom0 will have > >> >> > + * the entire io region mapped in the EPT/NPT. > >> >> > + * > >> >> > + * PVH FIXME: The following doesn''t map MMIO ranges when they > >> >> > sit above the > >> >> > + * highest E820 covered address. > >> >> > >> >> This absolutely needs fixing before this can go in. > >> > > >> > Any suggestions on how to fix it? Mapping all the way to end could > >> > result in a huge hap table. > >> > >> You''ll probably need a call down from Dom0 telling you where it > >> finds/puts MMIO resources. Or perhaps that could be mapped > >> in on demand from the EPT fault handler (since these regions > >> shouldn''t be subject to DMA, and hence IOMMU faults shouldn''t > >> occur - perhaps that''s even a reason to not share page tables > >> at least in dom0-strict mode)? > > > > Thinking about mapping in on demand from the EPT fault handler, how > > would I know if the access beyond last e820 entry is genuine and not > > a faulty pte in a buggy guest? Could I consult the mmconfig table (?) > > or the ACPI table in xen? Any pointers would be helpful... my > > knowledge runs out quickly here. > > You''d have to inspect all the BARs of the devices the domain owns. > Hence the thought of having Dom0 tell you about those resource > assignments.Doesn''t that happen via PHYSDEVOP_pci_device_add hypercalls?> > > FWIW, at present pv-ops linux doesn''t allow any mmio access beyond > > the last e820 entry. So, we''d need a fix there too. In my very orig > > patch, I was updating all IO mappings on demand by putting hook > > in linux native_pte_update if it was _PAGE_BIT_IOMAP. Another > > possibility would be do that for any mappings above the last > > e820 entry. What do you think? > > Special casing IOMAP page table creation might be an option, but > has the downside of allowing kernel bugs to propagate into Xen''s > view of the world. > > > For testing purposes, do you have reference for hardware? I don''t see > > any here with such configuration. > > Nothing specific, but I know that SR-IOV virtual functions easily > cause kernels to run out of MMIO space below 4G (namely when > the hole is only around 1Gb or even less), and Intel must have > knowledge of graphics cards having so huge a frame buffer that > it can only be mapped above 4G.Right, but the BIOS Writers Guide and docs all talk setting the MCFG up for that. Granted the MCFG (or was the ACPI spec?) says that the MCFG regions do not have to be defined in the E820. You pointed out also that the MCFG entries might come out from the ACPI DSDT. Which I think all comes back to dom0 parsing this and providing this sort of information back to the hypervisor?
>>> On 04.10.13 at 15:35, Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote: > On Fri, Oct 04, 2013 at 07:53:20AM +0100, Jan Beulich wrote: >> >>> On 03.10.13 at 02:53, Mukesh Rathor <mukesh.rathor@oracle.com> wrote: >> > On Fri, 27 Sep 2013 07:54:39 +0100 >> > "Jan Beulich" <JBeulich@suse.com> wrote: >> > >> >> >>> On 27.09.13 at 02:17, Mukesh Rathor <mukesh.rathor@oracle.com> >> >> >>> wrote: >> >> > On Thu, 26 Sep 2013 09:02:41 +0100 "Jan Beulich" >> >> > <JBeulich@suse.com> wrote: >> >> >> >>> On 25.09.13 at 23:03, Mukesh Rathor <mukesh.rathor@oracle.com> >> >> >> >>> wrote: >> >> >> > +/* >> >> >> > + * Set the 1:1 map for all non-RAM regions for dom 0. Thus, >> >> >> > dom0 will have >> >> >> > + * the entire io region mapped in the EPT/NPT. >> >> >> > + * >> >> >> > + * PVH FIXME: The following doesn''t map MMIO ranges when they >> >> >> > sit above the >> >> >> > + * highest E820 covered address. >> >> >> >> >> >> This absolutely needs fixing before this can go in. >> >> > >> >> > Any suggestions on how to fix it? Mapping all the way to end could >> >> > result in a huge hap table. >> >> >> >> You''ll probably need a call down from Dom0 telling you where it >> >> finds/puts MMIO resources. Or perhaps that could be mapped >> >> in on demand from the EPT fault handler (since these regions >> >> shouldn''t be subject to DMA, and hence IOMMU faults shouldn''t >> >> occur - perhaps that''s even a reason to not share page tables >> >> at least in dom0-strict mode)? >> > >> > Thinking about mapping in on demand from the EPT fault handler, how >> > would I know if the access beyond last e820 entry is genuine and not >> > a faulty pte in a buggy guest? Could I consult the mmconfig table (?) >> > or the ACPI table in xen? Any pointers would be helpful... my >> > knowledge runs out quickly here. >> >> You''d have to inspect all the BARs of the devices the domain owns. >> Hence the thought of having Dom0 tell you about those resource >> assignments. > > Doesn''t that happen via PHYSDEVOP_pci_device_add hypercalls?That may (and I think does) happen before resource assignment.>> > FWIW, at present pv-ops linux doesn''t allow any mmio access beyond >> > the last e820 entry. So, we''d need a fix there too. In my very orig >> > patch, I was updating all IO mappings on demand by putting hook >> > in linux native_pte_update if it was _PAGE_BIT_IOMAP. Another >> > possibility would be do that for any mappings above the last >> > e820 entry. What do you think? >> >> Special casing IOMAP page table creation might be an option, but >> has the downside of allowing kernel bugs to propagate into Xen''s >> view of the world. >> >> > For testing purposes, do you have reference for hardware? I don''t see >> > any here with such configuration. >> >> Nothing specific, but I know that SR-IOV virtual functions easily >> cause kernels to run out of MMIO space below 4G (namely when >> the hole is only around 1Gb or even less), and Intel must have >> knowledge of graphics cards having so huge a frame buffer that >> it can only be mapped above 4G. > > Right, but the BIOS Writers Guide and docs all talk setting the MCFG > up for that. Granted the MCFG (or was the ACPI spec?) says that the > MCFG regions do not have to be defined in the E820.What do MCFG regions have to do with device MMIO ones?> You pointed out also that the MCFG entries might come out from > the ACPI DSDT. Which I think all comes back to dom0 parsing this and > providing this sort of information back to the hypervisor?For the MCFG, yes. But not for individual BARs of devices. Jan
Konrad Rzeszutek Wilk
2013-Oct-04 16:02 UTC
Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
On Fri, Oct 04, 2013 at 03:05:57PM +0100, Jan Beulich wrote:> >>> On 04.10.13 at 15:35, Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote: > > On Fri, Oct 04, 2013 at 07:53:20AM +0100, Jan Beulich wrote: > >> >>> On 03.10.13 at 02:53, Mukesh Rathor <mukesh.rathor@oracle.com> wrote: > >> > On Fri, 27 Sep 2013 07:54:39 +0100 > >> > "Jan Beulich" <JBeulich@suse.com> wrote: > >> > > >> >> >>> On 27.09.13 at 02:17, Mukesh Rathor <mukesh.rathor@oracle.com> > >> >> >>> wrote: > >> >> > On Thu, 26 Sep 2013 09:02:41 +0100 "Jan Beulich" > >> >> > <JBeulich@suse.com> wrote: > >> >> >> >>> On 25.09.13 at 23:03, Mukesh Rathor <mukesh.rathor@oracle.com> > >> >> >> >>> wrote: > >> >> >> > +/* > >> >> >> > + * Set the 1:1 map for all non-RAM regions for dom 0. Thus, > >> >> >> > dom0 will have > >> >> >> > + * the entire io region mapped in the EPT/NPT. > >> >> >> > + * > >> >> >> > + * PVH FIXME: The following doesn''t map MMIO ranges when they > >> >> >> > sit above the > >> >> >> > + * highest E820 covered address. > >> >> >> > >> >> >> This absolutely needs fixing before this can go in. > >> >> > > >> >> > Any suggestions on how to fix it? Mapping all the way to end could > >> >> > result in a huge hap table. > >> >> > >> >> You''ll probably need a call down from Dom0 telling you where it > >> >> finds/puts MMIO resources. Or perhaps that could be mapped > >> >> in on demand from the EPT fault handler (since these regions > >> >> shouldn''t be subject to DMA, and hence IOMMU faults shouldn''t > >> >> occur - perhaps that''s even a reason to not share page tables > >> >> at least in dom0-strict mode)? > >> > > >> > Thinking about mapping in on demand from the EPT fault handler, how > >> > would I know if the access beyond last e820 entry is genuine and not > >> > a faulty pte in a buggy guest? Could I consult the mmconfig table (?) > >> > or the ACPI table in xen? Any pointers would be helpful... my > >> > knowledge runs out quickly here. > >> > >> You''d have to inspect all the BARs of the devices the domain owns. > >> Hence the thought of having Dom0 tell you about those resource > >> assignments. > > > > Doesn''t that happen via PHYSDEVOP_pci_device_add hypercalls? > > That may (and I think does) happen before resource assignment. > > >> > FWIW, at present pv-ops linux doesn''t allow any mmio access beyond > >> > the last e820 entry. So, we''d need a fix there too. In my very orig > >> > patch, I was updating all IO mappings on demand by putting hook > >> > in linux native_pte_update if it was _PAGE_BIT_IOMAP. Another > >> > possibility would be do that for any mappings above the last > >> > e820 entry. What do you think? > >> > >> Special casing IOMAP page table creation might be an option, but > >> has the downside of allowing kernel bugs to propagate into Xen''s > >> view of the world. > >> > >> > For testing purposes, do you have reference for hardware? I don''t see > >> > any here with such configuration. > >> > >> Nothing specific, but I know that SR-IOV virtual functions easily > >> cause kernels to run out of MMIO space below 4G (namely when > >> the hole is only around 1Gb or even less), and Intel must have > >> knowledge of graphics cards having so huge a frame buffer that > >> it can only be mapped above 4G. > > > > Right, but the BIOS Writers Guide and docs all talk setting the MCFG > > up for that. Granted the MCFG (or was the ACPI spec?) says that the > > MCFG regions do not have to be defined in the E820. > > What do MCFG regions have to do with device MMIO ones?Actually - nothing at all. I somehow was under the impression that MCFG and MMIO regions would be in the same memory area (as in MCFG follows the end of MMIO region). But of course nothing would be that simple.> > > You pointed out also that the MCFG entries might come out from > > the ACPI DSDT. Which I think all comes back to dom0 parsing this and > > providing this sort of information back to the hypervisor? > > For the MCFG, yes. But not for individual BARs of devices.So back to hooking up a new hypercall in the PCI subsystem when resource assigment has been completed? And also if the PCI subsystem decides to re-write the resource addresses to odd locations. Can''t one also trap for the configuration changes on the PCI devices and extract the physical locations then?> > Jan >
>>> On 04.10.13 at 18:02, Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote: > So back to hooking up a new hypercall in the PCI subsystem when > resource assigment has been completed? And also if the PCI subsystem > decides to re-write the resource addresses to odd locations. > > Can''t one also trap for the configuration changes on the PCI > devices and extract the physical locations then?Yes, of course we could be snooping the CFG writes, but that''s as simple as it sounds only for the port CF8 based accesses. For MCFG based accesses it would mean we''d have to write protect the whole MCFG range, and use emulation there. Not very pretty, but doable. Jan
Konrad Rzeszutek Wilk
2013-Oct-04 20:59 UTC
Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
On Fri, Oct 04, 2013 at 05:07:59PM +0100, Jan Beulich wrote:> >>> On 04.10.13 at 18:02, Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote: > > So back to hooking up a new hypercall in the PCI subsystem when > > resource assigment has been completed? And also if the PCI subsystem > > decides to re-write the resource addresses to odd locations. > > > > Can''t one also trap for the configuration changes on the PCI > > devices and extract the physical locations then? > > Yes, of course we could be snooping the CFG writes, but that''s > as simple as it sounds only for the port CF8 based accesses. For > MCFG based accesses it would mean we''d have to write protect > the whole MCFG range, and use emulation there. Not very > pretty, but doable.Hypervisor call is then a more appropiate if it can be done (Mukesh says that v0 of the patch had something like that in so he will try to recreate it) and then as a fallback we could do the emulation.> > Jan >
Mukesh Rathor
2013-Oct-05 01:06 UTC
Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
On Fri, 4 Oct 2013 16:59:53 -0400 Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:> On Fri, Oct 04, 2013 at 05:07:59PM +0100, Jan Beulich wrote: > > >>> On 04.10.13 at 18:02, Konrad Rzeszutek Wilk > > >>> <konrad.wilk@oracle.com> wrote: > > > So back to hooking up a new hypercall in the PCI subsystem when > > > resource assigment has been completed? And also if the PCI > > > subsystem decides to re-write the resource addresses to odd > > > locations. > > > > > > Can''t one also trap for the configuration changes on the PCI > > > devices and extract the physical locations then? > > > > Yes, of course we could be snooping the CFG writes, but that''s > > as simple as it sounds only for the port CF8 based accesses. For > > MCFG based accesses it would mean we''d have to write protect > > the whole MCFG range, and use emulation there. Not very > > pretty, but doable. > > Hypervisor call is then a more appropiate if it can be done (Mukesh > says that v0 of the patch had something like that in so he will try > to recreate it) and then as a fallback we could do the emulation.Right, looks like it has to be guest driven. xen can provide the facility via the PHYSDEVOP_map_iomem I had in V0. For linux, we will do such mappings above the highest e820 entry via the ioremap path. As a result of this, we don''t need to change pvh_map_all_iomem() and it will continue to map upto the highest e820 entry. I will change the comment: * PVH FIXME: The following doesn''t map MMIO ranges when they sit above the * highest E820 covered address. to * Note: we map the ranges upto 4GB or the last e820 entry, whichever is * higher. Any ranges beyond are mapped by the guest via * PHYSDEVOP_map_iomem. thanks Mukesh diff -r 774a211e6b8f -r 2c728d96f876 xen/arch/x86/physdev.c --- a/xen/arch/x86/physdev.c Wed Jan 30 12:30:34 2013 -0800 +++ b/xen/arch/x86/physdev.c Mon Jan 28 15:30:45 2013 -0800 @@ -740,6 +740,25 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H break; } + case PHYSDEVOP_map_iomem : { + + struct physdev_map_iomem iomem; + struct domain *d = current->domain; + + ret = -EPERM; + if ( !IS_PRIV(d) || !is_pvh_domain(d)) + break; + d = rcu_lock_current_domain(); + + ret = -EFAULT; + if ( copy_from_guest(&iomem, arg, 1) != 0 ) + break; + + ret = domctl_memory_mapping(d, iomem.first_gfn, iomem.first_mfn, + iomem.nr_mfns, iomem.add_mapping); + break; + } + default: ret = -ENOSYS; break; diff -r 774a211e6b8f -r 2c728d96f876 xen/include/public/physdev.h --- a/xen/include/public/physdev.h Wed Jan 30 12:30:34 2013 -0800 +++ b/xen/include/public/physdev.h Mon Jan 28 15:30:45 2013 -0800 @@ -330,6 +330,20 @@ struct physdev_dbgp_op { typedef struct physdev_dbgp_op physdev_dbgp_op_t; DEFINE_XEN_GUEST_HANDLE(physdev_dbgp_op_t); + +/* Map given gfns to mfns where mfns are part of IO space. */ +#define PHYSDEVOP_map_iomem 30 +struct physdev_map_iomem { + /* IN */ + uint64_t first_gfn; + uint64_t first_mfn; + uint32_t nr_mfns; + uint32_t add_mapping; /* 1 == add mapping; 0 == unmap */ + +}; +typedef struct physdev_map_iomem physdev_map_iomem_t; +DEFINE_XEN_GUEST_HANDLE(physdev_map_iomem_t); + /* * Notify that some PIRQ-bound event channels have been unmasked. * ** This command is obsolete since interface version 0x00030202 and is **
>>> On 05.10.13 at 03:06, Mukesh Rathor <mukesh.rathor@oracle.com> wrote: > On Fri, 4 Oct 2013 16:59:53 -0400 > Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote: >> On Fri, Oct 04, 2013 at 05:07:59PM +0100, Jan Beulich wrote: >> > >>> On 04.10.13 at 18:02, Konrad Rzeszutek Wilk >> > >>> <konrad.wilk@oracle.com> wrote: >> > > Can''t one also trap for the configuration changes on the PCI >> > > devices and extract the physical locations then? >> > >> > Yes, of course we could be snooping the CFG writes, but that''s >> > as simple as it sounds only for the port CF8 based accesses. For >> > MCFG based accesses it would mean we''d have to write protect >> > the whole MCFG range, and use emulation there. Not very >> > pretty, but doable. >> >> Hypervisor call is then a more appropiate if it can be done (Mukesh >> says that v0 of the patch had something like that in so he will try >> to recreate it) and then as a fallback we could do the emulation. > > Right, looks like it has to be guest driven. xen can provide the > facility via the PHYSDEVOP_map_iomem I had in V0. For linux, we will do > such mappings above the highest e820 entry via the ioremap path.As I tried to make clear before - this is wrong both conceptually and technically: Conceptually because there''s no need for a driver to ioremap() all MMIO ranges in order for a device to make use of them. And technically because you would allow bad arguments passed to ioremap() propagate into bad IOMMU mappings, while the IOMMU is precisely there to catch bad accesses (even for Dom0, at least in dom0-strict mode). Jan
Mukesh Rathor
2013-Oct-08 00:52 UTC
Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
On Mon, 30 Sep 2013 07:56:30 +0100 "Jan Beulich" <JBeulich@suse.com> wrote:> >>> On 28.09.13 at 01:03, Mukesh Rathor <mukesh.rathor@oracle.com> > >>> wrote: > > On Fri, 27 Sep 2013 08:01:16 +0100 > > "Jan Beulich" <JBeulich@suse.com> wrote:.......> >> >> > @@ -1089,11 +1262,18 @@ int __init construct_dom0( > >> >> > regs->eip = parms.virt_entry; > >> >> > regs->esp = vstack_end; > >> >> > regs->esi = vstartinfo_start; > >> >> > - regs->eflags = X86_EFLAGS_IF; > >> >> > + regs->eflags = X86_EFLAGS_IF | 0x2; > >> >> > >> >> Unrelated change? > >> > > >> > Nop, we need to make sure the resvd bit is set in eflags > >> > otherwise it won''t vmenter (invalid guest state). Should be > >> > harmless for PV, right? Not sure where it does it for PV before > >> > actually scheduling it.. > >> > >> PV doesn''t set this anywhere - the hardware doesn''t allow the > >> flag to be cleared (writes are ignored). If VMENTER is picky > >> about this, the GUEST_RFLAGS write at the end of > >> vmx_vmenter_helper() should be doing this instead of having to > >> do it here (and obviously in some other place for DomU creation). > > > > For domU we set it in arch_set_info_guest. > > Which is bogus too. 15910:ec3b23d8d544 ("hvm: Always keep > canonical copy of RIP/RSP/RFLAGS in guest_cpu_user_regs()") did > this adjustment without really explaining why it can''t be done > centrally in just the two places copying regs->eflags into the > VMCS/VMCB spot.I beg to differ.... such nit picking is equally bogus IMHO. The bit needs to be set once, putting it in vmx_vmenter_helper adds an unnecessary slowdown IMO.> > vmx_vmenter_helper gets > > called on every vmentry, we just need this setting once. > > Would a debugger update guest state via arch_set_info_guest()? > I doubt it. It would imo be a desirable up front cleanup patch to > move this bogus thing out of arch_set_info_guest() into > vmx_vmenter_helper() (and whatever SVM equivalent, should > SVM too be incapable of dealing with the flag being clear). See > how e.g. hvm_load_cpu_ctxt() already sets the flag? It''s really > like being done almost at random...The debugger would always read eflags, muck with only the bits it needs to, leaving the resvd bit as is, then send it down.> The only place where it gets legitimately enforced outside of > the vmx_vmenter_helper() is in the x86 emulator code. > > And if we''d have such a cleanup patch, doing away with the literal > 2 in favor of a proper symbolic (e.g. X86_EFLAGS_MBS) should > probably be done at once.Having X86_EFLAGS_MBS makes sense.> > So I think this is the best place. Do you want me to if it: > > > > regs->eflags = X86_EFLAGS_IF; > > if ( pvh ) > > regs->eflags |= 0x2. > > No, that would be pointless.Mukesh
Mukesh Rathor
2013-Oct-08 00:58 UTC
Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
On Fri, 04 Oct 2013 07:53:20 +0100 "Jan Beulich" <JBeulich@suse.com> wrote:> >>> On 03.10.13 at 02:53, Mukesh Rathor <mukesh.rathor@oracle.com> > >>> wrote: > > On Fri, 27 Sep 2013 07:54:39 +0100 > > "Jan Beulich" <JBeulich@suse.com> wrote: > > > >> >>> On 27.09.13 at 02:17, Mukesh Rathor <mukesh.rathor@oracle.com> > >> >>> wrote: > >> > On Thu, 26 Sep 2013 09:02:41 +0100 "Jan Beulich" > >> > <JBeulich@suse.com> wrote: > >> >> >>> On 25.09.13 at 23:03, Mukesh Rathor > >> >> >>> <mukesh.rathor@oracle.com> wrote: > >> >> > +/* > >> >> > + * Set the 1:1 map for all non-RAM regions for dom 0. Thus, > >> >> > dom0 will have > >> >> > + * the entire io region mapped in the EPT/NPT. > >> >> > + * > >> >> > + * PVH FIXME: The following doesn''t map MMIO ranges when they > >> >> > sit above the > >> >> > + * highest E820 covered address. > >> >> > >> >> This absolutely needs fixing before this can go in. > >> > > >> > Any suggestions on how to fix it? Mapping all the way to end > >> > could result in a huge hap table. > >> > >> You''ll probably need a call down from Dom0 telling you where it > >> finds/puts MMIO resources. Or perhaps that could be mapped > >> in on demand from the EPT fault handler (since these regions > >> shouldn''t be subject to DMA, and hence IOMMU faults shouldn''t > >> occur - perhaps that''s even a reason to not share page tables > >> at least in dom0-strict mode)? > > > > Thinking about mapping in on demand from the EPT fault handler, how > > would I know if the access beyond last e820 entry is genuine and > > not a faulty pte in a buggy guest? Could I consult the mmconfig > > table (?) or the ACPI table in xen? Any pointers would be > > helpful... my knowledge runs out quickly here. > > You''d have to inspect all the BARs of the devices the domain owns. > Hence the thought of having Dom0 tell you about those resource > assignments. > > > FWIW, at present pv-ops linux doesn''t allow any mmio access beyond > > the last e820 entry. So, we''d need a fix there too. In my very orig > > patch, I was updating all IO mappings on demand by putting hook > > in linux native_pte_update if it was _PAGE_BIT_IOMAP. Another > > possibility would be do that for any mappings above the last > > e820 entry. What do you think? > > Special casing IOMAP page table creation might be an option, but > has the downside of allowing kernel bugs to propagate into Xen''s > view of the world. > > > For testing purposes, do you have reference for hardware? I don''t > > see any here with such configuration. > > Nothing specific, but I know that SR-IOV virtual functions easily > cause kernels to run out of MMIO space below 4G (namely when > the hole is only around 1Gb or even less), and Intel must have > knowledge of graphics cards having so huge a frame buffer that > it can only be mapped above 4G.In that case, I don''t see why this is a MUST for the patch. Combined with the fact that at present pv-ops doesn''t even allow for mapping above the last e820 entry, I think this can be left FIXME/bug-fix for near future that can be done relatively quickly by an expert in that area if learning that takes me a long time. thanks Mukesh
>>> On 08.10.13 at 02:52, Mukesh Rathor <mukesh.rathor@oracle.com> wrote: > On Mon, 30 Sep 2013 07:56:30 +0100 > "Jan Beulich" <JBeulich@suse.com> wrote: > >> >>> On 28.09.13 at 01:03, Mukesh Rathor <mukesh.rathor@oracle.com> >> >>> wrote: >> > On Fri, 27 Sep 2013 08:01:16 +0100 >> > "Jan Beulich" <JBeulich@suse.com> wrote: > ....... >> >> >> > @@ -1089,11 +1262,18 @@ int __init construct_dom0( >> >> >> > regs->eip = parms.virt_entry; >> >> >> > regs->esp = vstack_end; >> >> >> > regs->esi = vstartinfo_start; >> >> >> > - regs->eflags = X86_EFLAGS_IF; >> >> >> > + regs->eflags = X86_EFLAGS_IF | 0x2; >> >> >> >> >> >> Unrelated change? >> >> > >> >> > Nop, we need to make sure the resvd bit is set in eflags >> >> > otherwise it won''t vmenter (invalid guest state). Should be >> >> > harmless for PV, right? Not sure where it does it for PV before >> >> > actually scheduling it.. >> >> >> >> PV doesn''t set this anywhere - the hardware doesn''t allow the >> >> flag to be cleared (writes are ignored). If VMENTER is picky >> >> about this, the GUEST_RFLAGS write at the end of >> >> vmx_vmenter_helper() should be doing this instead of having to >> >> do it here (and obviously in some other place for DomU creation). >> > >> > For domU we set it in arch_set_info_guest. >> >> Which is bogus too. 15910:ec3b23d8d544 ("hvm: Always keep >> canonical copy of RIP/RSP/RFLAGS in guest_cpu_user_regs()") did >> this adjustment without really explaining why it can''t be done >> centrally in just the two places copying regs->eflags into the >> VMCS/VMCB spot. > > I beg to differ.... such nit picking is equally bogus IMHO. The > bit needs to be set once, putting it in vmx_vmenter_helper adds an > unnecessary slowdown IMO.An "or" being a measurable slowdown?>> > vmx_vmenter_helper gets >> > called on every vmentry, we just need this setting once. >> >> Would a debugger update guest state via arch_set_info_guest()? >> I doubt it. It would imo be a desirable up front cleanup patch to >> move this bogus thing out of arch_set_info_guest() into >> vmx_vmenter_helper() (and whatever SVM equivalent, should >> SVM too be incapable of dealing with the flag being clear). See >> how e.g. hvm_load_cpu_ctxt() already sets the flag? It''s really >> like being done almost at random... > > The debugger would always read eflags, muck with only > the bits it needs to, leaving the resvd bit as is, then send it down.So you''d expect every debugger to be responsible for setting this bit? Pretty odd a requirement, when it can be done centrally in a single place, covering all cases. Jan
>>> On 08.10.13 at 02:58, Mukesh Rathor <mukesh.rathor@oracle.com> wrote: > On Fri, 04 Oct 2013 07:53:20 +0100 > "Jan Beulich" <JBeulich@suse.com> wrote: >> Nothing specific, but I know that SR-IOV virtual functions easily >> cause kernels to run out of MMIO space below 4G (namely when >> the hole is only around 1Gb or even less), and Intel must have >> knowledge of graphics cards having so huge a frame buffer that >> it can only be mapped above 4G. > > In that case, I don''t see why this is a MUST for the patch. Combined > with the fact that at present pv-ops doesn''t even allow for mapping > above the last e820 entry, I think this can be left FIXME/bug-fix for > near future that can be done relatively quickly by an expert in that > area if learning that takes me a long time.Okay, so it seems you''re back in Linux-only mode. I''m asking for there not being fundamental holes. Dealing with the majority of the "fixme"s you have already sprinkled around the code has at least a clear path associated with it. Here, however, we''re not even certain how to properly deal with the problem. And just to be clear - _any_ of the "fixme"s we intend to allow to go in with this series is a compromise already, with - afaict - no precedent. The more you ask for, the more likely it''ll become for at least me to start re-thinking about this. Over the past years I think we made good progress in morphing Xen from an experimental project to a stable, enterprise ready one. I''m not eager to see us get pushed back significantly on that road. Jan
>>> On 08.10.13 at 09:51, "Jan Beulich" <JBeulich@suse.com> wrote: > And just to be clear - _any_ of the "fixme"s we intend to allow to > go in with this series is a compromise already, with - afaict - no > precedent. The more you ask for, the more likely it''ll become for > at least me to start re-thinking about this. Over the past years I > think we made good progress in morphing Xen from an > experimental project to a stable, enterprise ready one. I''m not > eager to see us get pushed back significantly on that road.Considering the thread we''re in - perhaps before adding Dom0 support with more hacks/fixme-s it would be more appropriate to eliminate all the ones in the current pending series? Question to both of you: What''s the status of this anyway? If I''m not mistaken there was (on IRC) talk of a regression the latest series caused for HVM guests? Was this already sorted out? Is there going to be an updated series getting us reasonably close to finally get this in? Jan
George Dunlap
2013-Oct-08 09:39 UTC
Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
On 10/08/2013 09:03 AM, Jan Beulich wrote:>>>> On 08.10.13 at 09:51, "Jan Beulich" <JBeulich@suse.com> wrote: >> And just to be clear - _any_ of the "fixme"s we intend to allow to >> go in with this series is a compromise already, with - afaict - no >> precedent. The more you ask for, the more likely it''ll become for >> at least me to start re-thinking about this. Over the past years I >> think we made good progress in morphing Xen from an >> experimental project to a stable, enterprise ready one. I''m not >> eager to see us get pushed back significantly on that road. > > Considering the thread we''re in - perhaps before adding Dom0 > support with more hacks/fixme-s it would be more appropriate > to eliminate all the ones in the current pending series? > > Question to both of you: What''s the status of this anyway? If > I''m not mistaken there was (on IRC) talk of a regression the > latest series caused for HVM guests? Was this already sorted > out? Is there going to be an updated series getting us > reasonably close to finally get this in?Right now I''ve had to put PVH on the back burner: I''ve got a talk for LinuxCon EU to prepare, and two for XenSummit. There are two bugs that have been identified -- the HVM crash due to a mistake in one of the "code motion" patches, and the bug with setting VMXE in the guest CR. Tim has also noticed another functional change in the code-motion patch (which cannot, I think, be causing the HVM crash). If it were just a question of cleaning up those bits, I could probably have another draft posted sometime this week. But if we''re stepping back and looking at whether this is the right approach, or whether something like Tim has suggested -- basically making PVH to be HVM minus qemu plus a handful of hypercalls, and most of the changes in the domain builder rather than in Xen -- that will take a bit longer, particularly because it would probably mean me having to understand and modify the Linux side of things as well. At this point I''m not really sure what the best approach is going forward. -George
>>> On 08.10.13 at 11:39, George Dunlap <george.dunlap@eu.citrix.com> wrote: > If it were just a question of cleaning up those bits, I could probably > have another draft posted sometime this week. But if we''re stepping > back and looking at whether this is the right approach, or whether > something like Tim has suggested -- basically making PVH to be HVM minus > qemu plus a handful of hypercalls, and most of the changes in the domain > builder rather than in Xen -- that will take a bit longer, particularly > because it would probably mean me having to understand and modify the > Linux side of things as well. At this point I''m not really sure what > the best approach is going forward.Indeed, this all sounds more like being in need of a discussion on the summit then (which however would very likely mean that this won''t make 4.4). Jan
George Dunlap
2013-Oct-08 10:01 UTC
Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
On 10/08/2013 10:57 AM, Jan Beulich wrote:>>>> On 08.10.13 at 11:39, George Dunlap <george.dunlap@eu.citrix.com> wrote: >> If it were just a question of cleaning up those bits, I could probably >> have another draft posted sometime this week. But if we''re stepping >> back and looking at whether this is the right approach, or whether >> something like Tim has suggested -- basically making PVH to be HVM minus >> qemu plus a handful of hypercalls, and most of the changes in the domain >> builder rather than in Xen -- that will take a bit longer, particularly >> because it would probably mean me having to understand and modify the >> Linux side of things as well. At this point I''m not really sure what >> the best approach is going forward. > > Indeed, this all sounds more like being in need of a discussion on > the summit then (which however would very likely mean that this > won''t make 4.4).Hmm, which will make my "PVH technical deep-dive" talk a bit challenging to write. :-) Lars, any suggestions? I could just back out the talk (which would reduce my talk-writing load a bit), or I could try to talk about things in general and the issues we''ve been discussing. -George
George, there are two options. Option 1: Withdraw the talk. Option 2: Don''t withdraw the talk * Have a discussion on Wednesday * Cover the current design and highlight the issues and possible solutions. * Have a Bof afterwards to get more community involvement. BoF slot 6, which is on the next day is still up for grabs. We could also see whether maybe Stefano Panella would swap his BoF slot such that the discussion could immediately follow your talk. Option 2 potentially has the advantage of getting more people involved in PVH going forward. Option 1 feels a little closed. Regards Lars -----Original Message----- From: George Dunlap [mailto:george.dunlap@eu.citrix.com] Sent: 08 October 2013 11:01 To: Jan Beulich Cc: keir.xen@gmail.com; xen-devel; Mukesh Rathor; Lars Kurth Subject: Re: [Xen-devel] [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes On 10/08/2013 10:57 AM, Jan Beulich wrote:>>>> On 08.10.13 at 11:39, George Dunlap <george.dunlap@eu.citrix.com> wrote: >> If it were just a question of cleaning up those bits, I could >> probably have another draft posted sometime this week. But if we''re >> stepping back and looking at whether this is the right approach, or >> whether something like Tim has suggested -- basically making PVH to >> be HVM minus qemu plus a handful of hypercalls, and most of the >> changes in the domain builder rather than in Xen -- that will take a >> bit longer, particularly because it would probably mean me having to >> understand and modify the Linux side of things as well. At this >> point I''m not really sure what the best approach is going forward. > > Indeed, this all sounds more like being in need of a discussion on the > summit then (which however would very likely mean that this won''t make > 4.4).Hmm, which will make my "PVH technical deep-dive" talk a bit challenging to write. :-) Lars, any suggestions? I could just back out the talk (which would reduce my talk-writing load a bit), or I could try to talk about things in general and the issues we''ve been discussing. -George
Konrad Rzeszutek Wilk
2013-Oct-08 12:30 UTC
Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
George Dunlap <george.dunlap@eu.citrix.com> wrote:>On 10/08/2013 09:03 AM, Jan Beulich wrote: >>>>> On 08.10.13 at 09:51, "Jan Beulich" <JBeulich@suse.com> wrote: >>> And just to be clear - _any_ of the "fixme"s we intend to allow to >>> go in with this series is a compromise already, with - afaict - no >>> precedent. The more you ask for, the more likely it''ll become for >>> at least me to start re-thinking about this. Over the past years I >>> think we made good progress in morphing Xen from an >>> experimental project to a stable, enterprise ready one. I''m not >>> eager to see us get pushed back significantly on that road. >> >> Considering the thread we''re in - perhaps before adding Dom0 >> support with more hacks/fixme-s it would be more appropriate >> to eliminate all the ones in the current pending series? >> >> Question to both of you: What''s the status of this anyway? If >> I''m not mistaken there was (on IRC) talk of a regression the >> latest series caused for HVM guests? Was this already sorted >> out? Is there going to be an updated series getting us >> reasonably close to finally get this in? > >Right now I''ve had to put PVH on the back burner: I''ve got a talk for >LinuxCon EU to prepare, and two for XenSummit. > >There are two bugs that have been identified -- the HVM crash due to a >mistake in one of the "code motion" patches, and the bug with setting >VMXE in the guest CR. Tim has also noticed another functional change >in >the code-motion patch (which cannot, I think, be causing the HVM >crash).These issues sprang up with your patches, not with the ones that Mukesh posted ? Which did get tested for regression and passed with flying colours? That means there is a working baseline. Would that help? Mukesh do you have any ideas what might be amiss?> >If it were just a question of cleaning up those bits, I could probably >have another draft posted sometime this week. But if we''re stepping >back and looking at whether this is the right approach, or whether >something like Tim has suggested -- basically making PVH to be HVM >minus >qemu plus a handful of hypercalls, and most of the changes in the >domain >builder rather than in Xen -- that will take a bit longer, particularly > >because it would probably mean me having to understand and modify the >Linux side of things as well. At this point I''m not really sure what >the best approach is going forward.Arrg. I am not really sure how to express myself here but from the start Mukesh has asking for assistance and review of ideas and design of this and gotten it and acted on it. Now after two years of going this path folks are suggesting a new design? Sadly I won''t be able to be on Wed (family matters) call but my opinion is that we should keep on heading this way. If we want HVM with less (so no QEMU) that could be a separate mode that would work in parallel with PVH and be a separate project. They should be able to coexist right?> > -George > >_______________________________________________ >Xen-devel mailing list >Xen-devel@lists.xen.org >http://lists.xen.org/xen-devel
George Dunlap
2013-Oct-09 13:02 UTC
Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
On 10/08/2013 01:30 PM, Konrad Rzeszutek Wilk wrote:> George Dunlap <george.dunlap@eu.citrix.com> wrote: >> On 10/08/2013 09:03 AM, Jan Beulich wrote: >>>>>> On 08.10.13 at 09:51, "Jan Beulich" <JBeulich@suse.com> wrote: >>>> And just to be clear - _any_ of the "fixme"s we intend to allow to >>>> go in with this series is a compromise already, with - afaict - no >>>> precedent. The more you ask for, the more likely it''ll become for >>>> at least me to start re-thinking about this. Over the past years I >>>> think we made good progress in morphing Xen from an >>>> experimental project to a stable, enterprise ready one. I''m not >>>> eager to see us get pushed back significantly on that road. >>> >>> Considering the thread we''re in - perhaps before adding Dom0 >>> support with more hacks/fixme-s it would be more appropriate >>> to eliminate all the ones in the current pending series? >>> >>> Question to both of you: What''s the status of this anyway? If >>> I''m not mistaken there was (on IRC) talk of a regression the >>> latest series caused for HVM guests? Was this already sorted >>> out? Is there going to be an updated series getting us >>> reasonably close to finally get this in? >> >> Right now I''ve had to put PVH on the back burner: I''ve got a talk for >> LinuxCon EU to prepare, and two for XenSummit. >> >> There are two bugs that have been identified -- the HVM crash due to a >> mistake in one of the "code motion" patches, and the bug with setting >> VMXE in the guest CR. Tim has also noticed another functional change >> in >> the code-motion patch (which cannot, I think, be causing the HVM >> crash). > > These issues sprang up with your patches, not with the ones that Mukesh posted ? Which did get tested for regression and passed with flying colours? That means there is a working baseline. Would that help? > Mukesh do you have any ideas what might be amiss?The issues with the code motion patch are almost certainly mine. Andy Cooper ran one of Mukesh''s versions through one of the XenRT tests (probably a nightly) and it came up fine. I just need to go through and figure out what I messed up. The issue with the cr4 turned up because of the work Roger Pau Monne has done porting FreeBSD to PVH. The core issue here is that the guest CR4 has VMXE set even though nested virt is disabled; so when the kernel does "read, set/clear bit, write", the resulting write also has the VMXE bit set, which causes the HVM handler to crash the guest. The core issue, that the VMXE bit is set even though nested virt is disabled, was probably there in Mukesh''s series. I''m not sure if his handler would crash in a similar way, but it probably did; in any case, there would have been problems when it came to live migration. But that''s a one-line fix that Roger has already tested.> >> >> If it were just a question of cleaning up those bits, I could probably >> have another draft posted sometime this week. But if we''re stepping >> back and looking at whether this is the right approach, or whether >> something like Tim has suggested -- basically making PVH to be HVM >> minus >> qemu plus a handful of hypercalls, and most of the changes in the >> domain >> builder rather than in Xen -- that will take a bit longer, particularly >> >> because it would probably mean me having to understand and modify the >> Linux side of things as well. At this point I''m not really sure what >> the best approach is going forward. > > Arrg. I am not really sure how to express myself here but from the start Mukesh has asking for assistance and review of ideas and design of this and gotten it and acted on it. Now after two years of going this path folks are suggesting a new design? > > Sadly I won''t be able to be on Wed (family matters) call but my opinion is that we should keep on heading this way. If we want HVM with less (so no QEMU) that could be a separate mode that would work in parallel with PVH and be a separate project. They should be able to coexist right?So to begin with, the series I posted, from a guest perspective, is already close to what Tim described. The main thing a PVH guest has that an HVM guest doesn''t are: - Starts in 64-bit mode - A few more hypercalls - PV CPUID and IO - More state set when bringing up a cpu The main difference between PV and HVM CPUID is the special-casing for dom0, which would presumably have to be extended in order to run a PVH dom0 in any case; and the IO is either the same, or again has special cases for driver domains and dom0, and would need to be implemented one way or another for PVH guests as well. Most of what Tim seems do be advocating are things that should be able to be changed "behind the scenes": having the domain builder put the guest in long mode + paging, &c. So it might be worth just checking this version in (once it gets fixed up), and look at refactoring things at a later date. -George
Andrew Cooper
2013-Oct-09 13:13 UTC
Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
On 09/10/13 14:02, George Dunlap wrote:> On 10/08/2013 01:30 PM, Konrad Rzeszutek Wilk wrote: >> >> These issues sprang up with your patches, not with the ones that >> Mukesh posted ? Which did get tested for regression and passed with >> flying colours? That means there is a working baseline. Would that help? >> Mukesh do you have any ideas what might be amiss? > > The issues with the code motion patch are almost certainly mine. Andy > Cooper ran one of Mukesh''s versions through one of the XenRT tests > (probably a nightly) and it came up fine. I just need to go through > and figure out what I messed up.I ran Mukesh''s v10 series through a XenServer BST, which confirmed no glaring function regressions in PV and HVM domains. ~Andrew
George Dunlap
2013-Oct-09 13:16 UTC
Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
On 09/10/13 14:13, Andrew Cooper wrote:> On 09/10/13 14:02, George Dunlap wrote: >> On 10/08/2013 01:30 PM, Konrad Rzeszutek Wilk wrote: >>> These issues sprang up with your patches, not with the ones that >>> Mukesh posted ? Which did get tested for regression and passed with >>> flying colours? That means there is a working baseline. Would that help? >>> Mukesh do you have any ideas what might be amiss? >> The issues with the code motion patch are almost certainly mine. Andy >> Cooper ran one of Mukesh''s versions through one of the XenRT tests >> (probably a nightly) and it came up fine. I just need to go through >> and figure out what I messed up. > I ran Mukesh''s v10 series through a XenServer BST, which confirmed no > glaring function regressions in PV and HVM domains.I''m not sure anyone outside the XenServer team knows what a "BST" might entail. :-) Is that "Basic Smoke Test"? "Build Sanity Test"? "Big Stress Test"? -George
Andrew Cooper
2013-Oct-09 14:37 UTC
Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
On 09/10/13 14:16, George Dunlap wrote:> On 09/10/13 14:13, Andrew Cooper wrote: >> On 09/10/13 14:02, George Dunlap wrote: >>> On 10/08/2013 01:30 PM, Konrad Rzeszutek Wilk wrote: >>>> These issues sprang up with your patches, not with the ones that >>>> Mukesh posted ? Which did get tested for regression and passed with >>>> flying colours? That means there is a working baseline. Would that >>>> help? >>>> Mukesh do you have any ideas what might be amiss? >>> The issues with the code motion patch are almost certainly mine. Andy >>> Cooper ran one of Mukesh''s versions through one of the XenRT tests >>> (probably a nightly) and it came up fine. I just need to go through >>> and figure out what I messed up. >> I ran Mukesh''s v10 series through a XenServer BST, which confirmed no >> glaring function regressions in PV and HVM domains. > > I''m not sure anyone outside the XenServer team knows what a "BST" > might entail. :-) Is that "Basic Smoke Test"? "Build Sanity Test"? > "Big Stress Test"? > > -GeorgeFor those interested, XenRT is the test system for XenServer, and has several categories. The first is the BVT - "Basic Verification Test". This does install, and running a single test from each of the primary areas (e.g. single PV vm, single HVM vm, one migrate, one vlan, etc). This test is performed automatically on all code submission. The next is the BST - "Branch Safety Test". This is a substantially larger set of basic functional tests. It is 5-8 hours to run and is run as-and-when a developer decides they have put in a large chunk of work and want rather more testing than a BVT, but don''t want to commit the resources of a nightly on something which might crash and burn. The third is the Nightly test. This is 2.5K tests taking ~16h and ~150 machines, which is a fairly comprehensive test of XenServer. Beyond that there are sets of tests run less often, including the nightly and weekend stress tests. ~Andrew
At 08:30 -0400 on 08 Oct (1381221032), Konrad Rzeszutek Wilk wrote:> George Dunlap <george.dunlap@eu.citrix.com> wrote: > >If it were just a question of cleaning up those bits, I could probably > >have another draft posted sometime this week. But if we''re stepping > >back and looking at whether this is the right approach, or whether > >something like Tim has suggested -- basically making PVH to be HVM > >minus > >qemu plus a handful of hypercalls, and most of the changes in the > >domain > >builder rather than in Xen -- that will take a bit longer, particularly > > > >because it would probably mean me having to understand and modify the > >Linux side of things as well. At this point I''m not really sure what > >the best approach is going forward. > > Arrg. I am not really sure how to express myself here but from the > start Mukesh has asking for assistance and review of ideas and design > of this and gotten it and acted on it. Now after two years of going > this path folks are suggesting a new design?Sorry for not making the suggestion sooner -- it honestly hadn''t occurred to me. I had read a number of revisions of the PVH Xen patches (and many discussions of what the new type of guests should be called) before thinking that there didn''t need to be a new kind of guest at all. FWIW, I don''t think this would be a complete redesign. AFAICT the guest kernel changes would stay as they are, and most of the toolstack changes too. Some of the Xen changes would stay (implementation of setvcpuinfo, for example) and some would just go away. Anyway, given that I can''t offer to help with the coding (I have basically no time between now and XenSummit) I don''t want to stand in the way of this. If it goes in in something like the current form I''ll try and work out a follow-up series that does it the other way. Cheers, Tim.
Mukesh Rathor
2013-Oct-09 21:59 UTC
Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
On Tue, 08 Oct 2013 08:43:43 +0100 "Jan Beulich" <JBeulich@suse.com> wrote:> >>> On 08.10.13 at 02:52, Mukesh Rathor <mukesh.rathor@oracle.com> > >>> wrote: > > On Mon, 30 Sep 2013 07:56:30 +0100 > > "Jan Beulich" <JBeulich@suse.com> wrote: > > > >> >>> On 28.09.13 at 01:03, Mukesh Rathor <mukesh.rathor@oracle.com> > >> >>> wrote: > >> > On Fri, 27 Sep 2013 08:01:16 +0100 > >> > "Jan Beulich" <JBeulich@suse.com> wrote: > > ....... > >> >> >> > @@ -1089,11 +1262,18 @@ int __init construct_dom0( > >> >> >> > regs->eip = parms.virt_entry; > >> >> >> > regs->esp = vstack_end; > >> >> >> > regs->esi = vstartinfo_start; > >> >> >> > - regs->eflags = X86_EFLAGS_IF; > >> >> >> > + regs->eflags = X86_EFLAGS_IF | 0x2; > >> >> >> > >> >> >> Unrelated change? > >> >> > > >> >> > Nop, we need to make sure the resvd bit is set in eflags > >> >> > otherwise it won''t vmenter (invalid guest state). Should be > >> >> > harmless for PV, right? Not sure where it does it for PV > >> >> > before actually scheduling it.. > >> >> > >> >> PV doesn''t set this anywhere - the hardware doesn''t allow the > >> >> flag to be cleared (writes are ignored). If VMENTER is picky > >> >> about this, the GUEST_RFLAGS write at the end of > >> >> vmx_vmenter_helper() should be doing this instead of having to > >> >> do it here (and obviously in some other place for DomU > >> >> creation). > >> > > >> > For domU we set it in arch_set_info_guest. > >> > >> Which is bogus too. 15910:ec3b23d8d544 ("hvm: Always keep > >> canonical copy of RIP/RSP/RFLAGS in guest_cpu_user_regs()") did > >> this adjustment without really explaining why it can''t be done > >> centrally in just the two places copying regs->eflags into the > >> VMCS/VMCB spot. > > > > I beg to differ.... such nit picking is equally bogus IMHO. The > > bit needs to be set once, putting it in vmx_vmenter_helper adds an > > unnecessary slowdown IMO. > > An "or" being a measurable slowdown?Well, unnecessary dirtying of cache line. But, I''ll make the change just to be done with this.> >> > vmx_vmenter_helper gets > >> > called on every vmentry, we just need this setting once. > >> > >> Would a debugger update guest state via arch_set_info_guest()? > >> I doubt it. It would imo be a desirable up front cleanup patch to > >> move this bogus thing out of arch_set_info_guest() into > >> vmx_vmenter_helper() (and whatever SVM equivalent, should > >> SVM too be incapable of dealing with the flag being clear). See > >> how e.g. hvm_load_cpu_ctxt() already sets the flag? It''s really > >> like being done almost at random... > > > > The debugger would always read eflags, muck with only > > the bits it needs to, leaving the resvd bit as is, then send it > > down. > > So you''d expect every debugger to be responsible for setting this > bit? Pretty odd a requirement, when it can be done centrally in a > single place, covering all cases.Like I said, the debuggers are expected to read existing eflags, change the bits (leaving resvd bit set). But, ir-relevant now after the above change. Mukesh
Mukesh Rathor
2013-Oct-09 22:31 UTC
Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
On Wed, 9 Oct 2013 18:50:40 +0100 Tim Deegan <tim@xen.org> wrote:> At 08:30 -0400 on 08 Oct (1381221032), Konrad Rzeszutek Wilk wrote: > > George Dunlap <george.dunlap@eu.citrix.com> wrote: > > >If it were just a question of cleaning up those bits, I could > > >probably have another draft posted sometime this week. But if > > >we''re stepping back and looking at whether this is the right > > >approach, or whether something like Tim has suggested -- basically > > >making PVH to be HVM minus > > >qemu plus a handful of hypercalls, and most of the changes in the > > >domain > > >builder rather than in Xen -- that will take a bit longer, > > >particularly > > > > > >because it would probably mean me having to understand and modify > > >the Linux side of things as well. At this point I''m not really > > >sure what the best approach is going forward. > > > > Arrg. I am not really sure how to express myself here but from the > > start Mukesh has asking for assistance and review of ideas and > > design of this and gotten it and acted on it. Now after two years > > of going this path folks are suggesting a new design? > > Sorry for not making the suggestion sooner -- it honestly hadn''t > occurred to me. I had read a number of revisions of the PVH Xen > patches (and many discussions of what the new type of guests should > be called) before thinking that there didn''t need to be a new kind of > guest at all.The orig series didn''t have a new guest type, but I was sorta compelled to create one. From guest perspective, it''s in PV mode with auto translate. It appears this is now losing sight of the orig goal. The main problem being solved was 64bit PV syscall overhead. The best way to solve the sys call overhead was to run in HVM container. I discussed briefly with Ian P and then Keir on the sidelines of xen summit. Then I did code walk with Steffano and Ian C at hackathon last year. Looked like we were all on same page. Still, if something could be done better, I''m in favor. But, lets get something done asap so as to not get left behind. Everyone will have a different opinion on the best approach, and unless some compromises are made things can just drag on.....> FWIW, I don''t think this would be a complete redesign. AFAICT the > guest kernel changes would stay as they are, and most of the > toolstack changes too. Some of the Xen changes would stay > (implementation of setvcpuinfo, for example) and some would just go > away.Good idea, but keep in mind that dom0 will need most of the xen changes for coming in PVH mode. So, there might not be enough to make it worthwhile. thanks Mukesh