Finally, the hardest. Mostly modify construct_dom0() to boot PV dom0 in PVH mode. Introduce, opt_dom0pvh, which when specified in the command line, causes dom0 to boot in PVH mode. Change in V2: - Map the entire IO region upfront in the P2M for PVH dom0. Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com> --- xen/arch/x86/domain_build.c | 241 +++++++++++++++++++++++++++++++++---------- xen/arch/x86/mm/hap/hap.c | 17 +++- xen/arch/x86/setup.c | 10 ++- xen/include/asm-x86/hap.h | 1 + 4 files changed, 212 insertions(+), 57 deletions(-) diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c index 8c5b27a..72aa70b 100644 --- a/xen/arch/x86/domain_build.c +++ b/xen/arch/x86/domain_build.c @@ -35,6 +35,8 @@ #include <asm/setup.h> #include <asm/bzimage.h> /* for bzimage_parse */ #include <asm/io_apic.h> +#include <asm/hap.h> +#include <asm/debugger.h> #include <public/version.h> @@ -307,6 +309,65 @@ static void __init process_dom0_ioports_disable(void) } } +/* + * Set the 1:1 map for all non-RAM regions for dom 0. Thus, dom0 will have + * the entire io region mapped in the EPT/NPT. + */ +static __init void pvh_map_all_iomem(struct domain *d) +{ + unsigned long start = 0; + const struct e820entry *entry; + int rc, i, nump; + + for (i = 0, entry = e820.map; i < e820.nr_map; i++, entry++) { + unsigned long end = entry->addr + entry->size; + + if (entry->type == E820_RAM || i == e820.nr_map - 1) { + unsigned long start_pfn = PFN_DOWN(start); + unsigned long end_pfn = PFN_UP(end); + + if (entry->type == E820_RAM) + end_pfn = PFN_UP(entry->addr); + + if (start_pfn < end_pfn) { + nump = end_pfn - start_pfn + 1; + rc = domctl_memory_mapping(d, start_pfn, start_pfn, nump, 1); + BUG_ON(rc); + } + start = end; + } + } +} + +static __init void dom0_update_physmap(struct domain *d, unsigned long pfn, + unsigned long mfn, unsigned long vphysmap_s) +{ + if ( is_pvh_domain(d) ) { + int rc = guest_physmap_add_page(d, pfn, mfn, 0); + BUG_ON(rc); + return; + } + if ( !is_pv_32on64_domain(d) ) + ((unsigned long *)vphysmap_s)[pfn] = mfn; + else + ((unsigned int *)vphysmap_s)[pfn] = mfn; + + set_gpfn_from_mfn(mfn, pfn); +} + +static __init void copy_pvh(char *dest, char *src, int bytes) +{ + /* raw_copy_to_guest() -> copy_to_user_hvm -> __hvm_copy needs curr + * to point to the hvm/pvh vcpu. Hence for PVH dom0 we can''t use that. + * So we just use dbg_rw_mem(). + */ + int rem = dbg_rw_mem((dbgva_t)dest, (unsigned char *)src, bytes, 0, 1, 0); + if (rem) { + printk("PVH: Failed to copy to dom0. 
len:%d rem:%d\n", bytes, rem); + BUG(); + } +} + int __init construct_dom0( struct domain *d, const module_t *image, unsigned long image_headroom, @@ -314,6 +375,7 @@ int __init construct_dom0( void *(*bootstrap_map)(const module_t *), char *cmdline) { + char *si_buf=NULL, *tmp_buf=NULL; int i, cpu, rc, compatible, compat32, order, machine; struct cpu_user_regs *regs; unsigned long pfn, mfn; @@ -322,7 +384,7 @@ int __init construct_dom0( unsigned long alloc_spfn; unsigned long alloc_epfn; unsigned long initrd_pfn = -1, initrd_mfn = 0; - unsigned long count; + unsigned long count, shared_info_pfn_addr = 0; struct page_info *page = NULL; start_info_t *si; struct vcpu *v = d->vcpu[0]; @@ -416,6 +478,13 @@ int __init construct_dom0( { printk("Kernel does not support Dom0 operation\n"); return -EINVAL; + + if ( is_pvh_domain(d) && + !test_bit(XENFEAT_hvm_callback_vector, parms.f_supported) ) + { + printk("Kernel does not support PVH mode\n"); + return -EINVAL; + } } if ( compat32 ) @@ -480,6 +549,12 @@ int __init construct_dom0( vstartinfo_end = (vstartinfo_start + sizeof(struct start_info) + sizeof(struct dom0_vga_console_info)); + + if ( is_pvh_domain(d) ) { + shared_info_pfn_addr = round_pgup(vstartinfo_end) - v_start; + vstartinfo_end += PAGE_SIZE; + } + vpt_start = round_pgup(vstartinfo_end); for ( nr_pt_pages = 2; ; nr_pt_pages++ ) { @@ -621,16 +696,26 @@ int __init construct_dom0( maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l3_page_table; l3start = __va(mpt_alloc); mpt_alloc += PAGE_SIZE; } - clear_page(l4tab); - init_guest_l4_table(l4tab, d); - v->arch.guest_table = pagetable_from_paddr(__pa(l4start)); - if ( is_pv_32on64_domain(d) ) - v->arch.guest_table_user = v->arch.guest_table; + if ( is_pvh_domain(d) ) + { + v->arch.guest_table = pagetable_from_paddr(vpt_start - v_start); + pfn = 0; + } else { + clear_page(l4tab); + init_guest_l4_table(l4tab, d); + v->arch.guest_table = pagetable_from_paddr(__pa(l4start)); + if ( is_pv_32on64_domain(d) ) + v->arch.guest_table_user = v->arch.guest_table; + pfn = alloc_spfn; + } l4tab += l4_table_offset(v_start); - pfn = alloc_spfn; for ( count = 0; count < ((v_end-v_start)>>PAGE_SHIFT); count++ ) { + /* initrd chunk''s mfns are separate, so we need to adjust for them */ + signed long pvh_adj = is_pvh_domain(d) ? 
+ (PFN_UP(initrd_len) - alloc_spfn)<<PAGE_SHIFT : 0; + if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) ) { maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l1_page_table; @@ -657,16 +742,17 @@ int __init construct_dom0( clear_page(l3tab); if ( count == 0 ) l3tab += l3_table_offset(v_start); - *l4tab = l4e_from_paddr(__pa(l3start), L4_PROT); + *l4tab = l4e_from_paddr(__pa(l3start) + pvh_adj, L4_PROT); l4tab++; } - *l3tab = l3e_from_paddr(__pa(l2start), L3_PROT); + *l3tab = l3e_from_paddr(__pa(l2start) + pvh_adj, L3_PROT); l3tab++; } - *l2tab = l2e_from_paddr(__pa(l1start), L2_PROT); + *l2tab = l2e_from_paddr(__pa(l1start) + pvh_adj, L2_PROT); l2tab++; } - if ( count < initrd_pfn || count >= initrd_pfn + PFN_UP(initrd_len) ) + if ( is_pvh_domain(d) || + count < initrd_pfn || count >= initrd_pfn + PFN_UP(initrd_len) ) mfn = pfn++; else mfn = initrd_mfn++; @@ -674,6 +760,9 @@ int __init construct_dom0( L1_PROT : COMPAT_L1_PROT)); l1tab++; + if ( is_pvh_domain(d) ) + continue; + page = mfn_to_page(mfn); if ( (page->u.inuse.type_info == 0) && !get_page_and_type(page, d, PGT_writable_page) ) @@ -702,6 +791,9 @@ int __init construct_dom0( COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*l2tab)); } + if ( is_pvh_domain(d) ) + goto pvh_skip_pt_rdonly; + /* Pages that are part of page tables must be read only. */ l4tab = l4start + l4_table_offset(vpt_start); l3start = l3tab = l4e_to_l3e(*l4tab); @@ -741,6 +833,8 @@ int __init construct_dom0( } } +pvh_skip_pt_rdonly: + /* Mask all upcalls... */ for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ ) shared_info(d, vcpu_info[i].evtchn_upcall_mask) = 1; @@ -754,6 +848,11 @@ int __init construct_dom0( (void)alloc_vcpu(d, i, cpu); } + if ( is_pvh_domain(d) ) + { + v->arch.cr3 = v->arch.hvm_vcpu.guest_cr[3] + (pagetable_get_pfn(v->arch.guest_table)) << PAGE_SHIFT; + } /* Set up CR3 value for write_ptbase */ if ( paging_mode_enabled(d) ) paging_update_paging_modes(v); @@ -764,35 +863,16 @@ int __init construct_dom0( write_ptbase(v); mapcache_override_current(v); - /* Copy the OS image and free temporary buffer. */ - elf.dest = (void*)vkern_start; - rc = elf_load_binary(&elf, 0); - if ( rc < 0 ) - { - printk("Failed to load the kernel binary\n"); - return rc; - } - bootstrap_map(NULL); - - if ( UNSET_ADDR != parms.virt_hypercall ) - { - if ( (parms.virt_hypercall < v_start) || - (parms.virt_hypercall >= v_end) ) - { - mapcache_override_current(NULL); - write_ptbase(current); - printk("Invalid HYPERCALL_PAGE field in ELF notes.\n"); - return -1; + /* Set up start info area. */ + if ( is_pvh_domain(d) ) { + if ( (si_buf=xmalloc_bytes(PAGE_SIZE)) == NULL) { + printk("PVH: xmalloc failed to alloc %ld bytes.\n", PAGE_SIZE); + return -ENOMEM; } - hypercall_page_initialise( - d, (void *)(unsigned long)parms.virt_hypercall); - } - - /* Free temporary buffers. */ - discard_initial_images(); + si = (start_info_t *)si_buf; + } else + si = (start_info_t *)vstartinfo_start; - /* Set up start info area. */ - si = (start_info_t *)vstartinfo_start; clear_page(si); si->nr_pages = nr_pages; @@ -814,7 +894,7 @@ int __init construct_dom0( l2tab = NULL; l1tab = NULL; /* Set up the phys->machine table if not part of the initial mapping. */ - if ( parms.p2m_base != UNSET_ADDR ) + if ( parms.p2m_base != UNSET_ADDR && !is_pvh_domain(d) ) { unsigned long va = vphysmap_start; @@ -935,6 +1015,9 @@ int __init construct_dom0( unmap_domain_page(l3tab); unmap_domain_page(l4start); + if (is_pvh_domain(d) ) + hap_set_pvh_alloc_for_dom0(d, nr_pages); + /* Write the phys->machine and machine->phys table entries. 
*/ for ( pfn = 0; pfn < count; pfn++ ) { @@ -951,11 +1034,8 @@ int __init construct_dom0( if ( pfn > REVERSE_START && (vinitrd_start || pfn < initrd_pfn) ) mfn = alloc_epfn - (pfn - REVERSE_START); #endif - if ( !is_pv_32on64_domain(d) ) - ((unsigned long *)vphysmap_start)[pfn] = mfn; - else - ((unsigned int *)vphysmap_start)[pfn] = mfn; - set_gpfn_from_mfn(mfn, pfn); + dom0_update_physmap(d, pfn, mfn, vphysmap_start); + if (!(pfn & 0xfffff)) process_pending_softirqs(); } @@ -971,8 +1051,8 @@ int __init construct_dom0( if ( !page->u.inuse.type_info && !get_page_and_type(page, d, PGT_writable_page) ) BUG(); - ((unsigned long *)vphysmap_start)[pfn] = mfn; - set_gpfn_from_mfn(mfn, pfn); + + dom0_update_physmap(d, pfn, mfn, vphysmap_start); ++pfn; if (!(pfn & 0xfffff)) process_pending_softirqs(); @@ -992,11 +1072,7 @@ int __init construct_dom0( #ifndef NDEBUG #define pfn (nr_pages - 1 - (pfn - (alloc_epfn - alloc_spfn))) #endif - if ( !is_pv_32on64_domain(d) ) - ((unsigned long *)vphysmap_start)[pfn] = mfn; - else - ((unsigned int *)vphysmap_start)[pfn] = mfn; - set_gpfn_from_mfn(mfn, pfn); + dom0_update_physmap(d, pfn, mfn, vphysmap_start); #undef pfn page++; pfn++; if (!(pfn & 0xfffff)) @@ -1004,6 +1080,47 @@ int __init construct_dom0( } } + /* Copy the OS image and free temporary buffer. */ + elf.dest = (void*)vkern_start; + rc = elf_load_binary(&elf, is_pvh_domain(d) ); + if ( rc < 0 ) + { + printk("Failed to load the kernel binary\n"); + return rc; + } + bootstrap_map(NULL); + + if ( UNSET_ADDR != parms.virt_hypercall ) + { + void *addr; + + if ( is_pvh_domain(d) ) { + if ( (tmp_buf=xzalloc_bytes(PAGE_SIZE)) == NULL ) { + printk("xzalloc failed for tmp_buf. %ld bytes.\n", PAGE_SIZE); + return -ENOMEM; + } + addr = tmp_buf; + } else + addr = (void *)parms.virt_hypercall; + + if ( (parms.virt_hypercall < v_start) || + (parms.virt_hypercall >= v_end) ) + { + write_ptbase(current); + printk("Invalid HYPERCALL_PAGE field in ELF notes.\n"); + return -1; + } + hypercall_page_initialise(d, addr); + + if ( is_pvh_domain(d) ) { + copy_pvh((void *)parms.virt_hypercall, tmp_buf, PAGE_SIZE); + xfree(tmp_buf); + } + } + + /* Free temporary buffers. 
*/ + discard_initial_images(); + if ( initrd_len != 0 ) { si->mod_start = vinitrd_start ?: initrd_pfn; @@ -1019,6 +1136,15 @@ int __init construct_dom0( si->console.dom0.info_off = sizeof(struct start_info); si->console.dom0.info_size = sizeof(struct dom0_vga_console_info); } + if ( is_pvh_domain(d) ) { + unsigned long mfn = virt_to_mfn(d->shared_info); + unsigned long pfn = shared_info_pfn_addr>>PAGE_SHIFT; + si->shared_info = shared_info_pfn_addr; + dom0_update_physmap(d, pfn, mfn, 0); + + copy_pvh((char *)vstartinfo_start, si_buf, PAGE_SIZE); + xfree(si_buf); + } if ( is_pv_32on64_domain(d) ) xlat_start_info(si, XLAT_start_info_console_dom0); @@ -1050,12 +1176,16 @@ int __init construct_dom0( regs->eip = parms.virt_entry; regs->esp = vstack_end; regs->esi = vstartinfo_start; - regs->eflags = X86_EFLAGS_IF; + regs->eflags = X86_EFLAGS_IF | 0x2; - if ( opt_dom0_shadow ) + if ( opt_dom0_shadow ) { + if ( is_pvh_domain(d) ) { + printk("Invalid option dom0_shadow for PVH\n"); + return -EINVAL; + } if ( paging_enable(d, PG_SH_enable) == 0 ) paging_update_paging_modes(v); - + } if ( supervisor_mode_kernel ) { v->arch.pv_vcpu.kernel_ss &= ~3; @@ -1132,6 +1262,9 @@ int __init construct_dom0( BUG_ON(rc != 0); + if ( is_pvh_domain(d) ) + pvh_map_all_iomem(d); + iommu_dom0_init(dom0); return 0; diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c index 055833d..d3d5697 100644 --- a/xen/arch/x86/mm/hap/hap.c +++ b/xen/arch/x86/mm/hap/hap.c @@ -574,6 +574,20 @@ int hap_domctl(struct domain *d, xen_domctl_shadow_op_t *sc, } } +/* Resize hap table. Copied from: libxl_get_required_shadow_memory() */ +void hap_set_pvh_alloc_for_dom0(struct domain *d, unsigned long num_pages) +{ + int rc; + unsigned long memkb = num_pages * (PAGE_SIZE / 1024); + + memkb = 4 * (256 * d->max_vcpus + 2 * (memkb / 1024)); + num_pages = ((memkb+1023)/1024) << (20 - PAGE_SHIFT); + paging_lock(d); + rc = hap_set_allocation(d, num_pages, NULL); + paging_unlock(d); + BUG_ON(rc); +} + static const struct paging_mode hap_paging_real_mode; static const struct paging_mode hap_paging_protected_mode; static const struct paging_mode hap_paging_pae_mode; @@ -633,7 +647,8 @@ static void hap_update_cr3(struct vcpu *v, int do_locking) const struct paging_mode * hap_paging_get_mode(struct vcpu *v) { - return !hvm_paging_enabled(v) ? &hap_paging_real_mode : + return is_pvh_vcpu(v) ? &hap_paging_long_mode : + !hvm_paging_enabled(v) ? &hap_paging_real_mode : hvm_long_mode_enabled(v) ? &hap_paging_long_mode : hvm_pae_enabled(v) ? &hap_paging_pae_mode : &hap_paging_protected_mode; diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c index 43301a5..f307f24 100644 --- a/xen/arch/x86/setup.c +++ b/xen/arch/x86/setup.c @@ -60,6 +60,10 @@ integer_param("maxcpus", max_cpus); static bool_t __initdata disable_smep; invbool_param("smep", disable_smep); +/* Boot dom0 in PVH mode */ +static bool_t __initdata opt_dom0pvh; +boolean_param("dom0pvh", opt_dom0pvh); + /* **** Linux config option: propagated to domain0. */ /* "acpi=off": Sisables both ACPI table parsing and interpreter. */ /* "acpi=force": Override the disable blacklist. 
*/ @@ -545,7 +549,7 @@ void __init __start_xen(unsigned long mbi_p) { char *memmap_type = NULL; char *cmdline, *kextra, *loader; - unsigned int initrdidx; + unsigned int initrdidx, domcr_flags = 0; multiboot_info_t *mbi = __va(mbi_p); module_t *mod = (module_t *)__va(mbi->mods_addr); unsigned long nr_pages, modules_headroom, *module_map; @@ -1314,7 +1318,9 @@ void __init __start_xen(unsigned long mbi_p) panic("Could not protect TXT memory regions\n"); /* Create initial domain 0. */ - dom0 = domain_create(0, DOMCRF_s3_integrity, 0); + domcr_flags = (opt_dom0pvh ? DOMCRF_pvh | DOMCRF_hap : 0); + domcr_flags |= DOMCRF_s3_integrity; + dom0 = domain_create(0, domcr_flags, 0); if ( IS_ERR(dom0) || (alloc_dom0_vcpu0() == NULL) ) panic("Error creating domain 0\n"); diff --git a/xen/include/asm-x86/hap.h b/xen/include/asm-x86/hap.h index e03f983..aab8558 100644 --- a/xen/include/asm-x86/hap.h +++ b/xen/include/asm-x86/hap.h @@ -63,6 +63,7 @@ int hap_track_dirty_vram(struct domain *d, XEN_GUEST_HANDLE_64(uint8) dirty_bitmap); extern const struct paging_mode *hap_paging_get_mode(struct vcpu *); +void hap_set_pvh_alloc_for_dom0(struct domain *d, unsigned long num_pages); #endif /* XEN_HAP_H */ -- 1.7.2.3
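With the boolean_param() introduced in setup.c, selecting the new mode should just be a
matter of adding "dom0pvh" to the hypervisor command line; a purely illustrative GRUB
entry (kernel/initrd file names are made up, not taken from this thread):

    multiboot /boot/xen.gz dom0pvh dom0_mem=2048M console=com1,vga com1=115200,8n1
    module /boot/vmlinuz-pvh root=/dev/xvda1 ro console=hvc0
    module /boot/initrd-pvh.img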
>>> On 16.03.13 at 02:06, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> @@ -307,6 +309,65 @@ static void __init process_dom0_ioports_disable(void)
>      }
>  }
>
> +/*
> + * Set the 1:1 map for all non-RAM regions for dom 0. Thus, dom0 will have
> + * the entire io region mapped in the EPT/NPT.
> + */
> +static __init void pvh_map_all_iomem(struct domain *d)
> +{
> +    unsigned long start = 0;
> +    const struct e820entry *entry;
> +    int rc, i, nump;
> +
> +    for (i = 0, entry = e820.map; i < e820.nr_map; i++, entry++) {
> +        unsigned long end = entry->addr + entry->size;
> +
> +        if (entry->type == E820_RAM || i == e820.nr_map - 1) {
> +            unsigned long start_pfn = PFN_DOWN(start);
> +            unsigned long end_pfn = PFN_UP(end);
> +
> +            if (entry->type == E820_RAM)
> +                end_pfn = PFN_UP(entry->addr);
> +
> +            if (start_pfn < end_pfn) {
> +                nump = end_pfn - start_pfn + 1;
> +                rc = domctl_memory_mapping(d, start_pfn, start_pfn, nump, 1);
> +                BUG_ON(rc);
> +            }
> +            start = end;
> +        }
> +    }

At least E820_UNUSABLE must be excluded here.

And as you're mapping the holes only - how do you deal with
the MMIO range past end of RAM? And perhaps even more
important - how do you deal with the split between RAM and
MMIO not being at the end of currently populated RAM, but
at the end of possible hotpluggable regions?

> +static __init void copy_pvh(char *dest, char *src, int bytes)
> +{
> +    /* raw_copy_to_guest() -> copy_to_user_hvm -> __hvm_copy needs curr
> +     * to point to the hvm/pvh vcpu. Hence for PVH dom0 we can't use that.
> +     * So we just use dbg_rw_mem().
> +     */
> +    int rem = dbg_rw_mem((dbgva_t)dest, (unsigned char *)src, bytes, 0, 1, 0);

Same comment as before: This is not acceptable for a submission
of a patch intended to be committed (i.e. non-RFC). You should
have worked out a suitable solution to this before posting.

> @@ -416,6 +478,13 @@ int __init construct_dom0(
>      {
>          printk("Kernel does not support Dom0 operation\n");
>          return -EINVAL;
> +
> +        if ( is_pvh_domain(d) &&
> +             !test_bit(XENFEAT_hvm_callback_vector, parms.f_supported) )
> +        {
> +            printk("Kernel does not support PVH mode\n");
> +            return -EINVAL;
> +        }

Adding dead code (after a return).

> @@ -621,16 +696,26 @@ int __init construct_dom0(
>          maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l3_page_table;
>          l3start = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
>      }
> -    clear_page(l4tab);
> -    init_guest_l4_table(l4tab, d);
> -    v->arch.guest_table = pagetable_from_paddr(__pa(l4start));
> -    if ( is_pv_32on64_domain(d) )
> -        v->arch.guest_table_user = v->arch.guest_table;
> +    if ( is_pvh_domain(d) )
> +    {
> +        v->arch.guest_table = pagetable_from_paddr(vpt_start - v_start);

Am I understanding right that you're making v->arch.guest_table
store a guest physical address rather than a host physical one? Can
that really be done consistently across all uses of this field?

Jan
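A sketch of the E820_UNUSABLE point, for illustration only and not part of the posted
patch: end the current 1:1 range at any region dom0 must not be given as MMIO, i.e.
treat unusable entries the same way as RAM. Identifiers follow the code quoted above;
nump is taken as the half-open range here. The MMIO-above-RAM and hotplug questions
are not addressed by this.

static __init void pvh_map_all_iomem(struct domain *d)
{
    unsigned long start = 0;
    const struct e820entry *entry;
    unsigned int i;
    int rc;

    for ( i = 0, entry = e820.map; i < e820.nr_map; i++, entry++ )
    {
        unsigned long end = entry->addr + entry->size;
        /* Regions that must not be identity-mapped as MMIO. */
        int excluded = (entry->type == E820_RAM ||
                        entry->type == E820_UNUSABLE);

        if ( excluded || i == e820.nr_map - 1 )
        {
            unsigned long start_pfn = PFN_DOWN(start);
            /* Stop just below an excluded region, otherwise map to its end. */
            unsigned long end_pfn = excluded ? PFN_UP(entry->addr) : PFN_UP(end);

            if ( start_pfn < end_pfn )
            {
                rc = domctl_memory_mapping(d, start_pfn, start_pfn,
                                           end_pfn - start_pfn, 1 /* add */);
                BUG_ON(rc);
            }
            start = end;
        }
    }
}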
Konrad Rzeszutek Wilk
2013-Mar-18 20:06 UTC
Re: [PATCH 17/18 V2]: PVH xen: PVH dom0 creation...
On Fri, Mar 15, 2013 at 06:06:45PM -0700, Mukesh Rathor wrote:> Finally, the hardest. Mostly modify construct_dom0() to boot PV dom0 in > PVH mode. Introduce, opt_dom0pvh, which when specified in the command > line, causes dom0 to boot in PVH mode. > > Change in V2: > - Map the entire IO region upfront in the P2M for PVH dom0. > > Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com> > --- > xen/arch/x86/domain_build.c | 241 +++++++++++++++++++++++++++++++++---------- > xen/arch/x86/mm/hap/hap.c | 17 +++- > xen/arch/x86/setup.c | 10 ++- > xen/include/asm-x86/hap.h | 1 + > 4 files changed, 212 insertions(+), 57 deletions(-) > > diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c > index 8c5b27a..72aa70b 100644 > --- a/xen/arch/x86/domain_build.c > +++ b/xen/arch/x86/domain_build.c > @@ -35,6 +35,8 @@ > #include <asm/setup.h> > #include <asm/bzimage.h> /* for bzimage_parse */ > #include <asm/io_apic.h> > +#include <asm/hap.h> > +#include <asm/debugger.h> > > #include <public/version.h> > > @@ -307,6 +309,65 @@ static void __init process_dom0_ioports_disable(void) > } > } > > +/* > + * Set the 1:1 map for all non-RAM regions for dom 0. Thus, dom0 will have > + * the entire io region mapped in the EPT/NPT. > + */ > +static __init void pvh_map_all_iomem(struct domain *d) > +{ > + unsigned long start = 0; > + const struct e820entry *entry; > + int rc, i, nump;unsigned int for ''nump''. Could it be called ''pfns'' ?> + > + for (i = 0, entry = e820.map; i < e820.nr_map; i++, entry++) { > + unsigned long end = entry->addr + entry->size; > + > + if (entry->type == E820_RAM || i == e820.nr_map - 1) { > + unsigned long start_pfn = PFN_DOWN(start); > + unsigned long end_pfn = PFN_UP(end); > + > + if (entry->type == E820_RAM) > + end_pfn = PFN_UP(entry->addr); > + > + if (start_pfn < end_pfn) { > + nump = end_pfn - start_pfn + 1; > + rc = domctl_memory_mapping(d, start_pfn, start_pfn, nump, 1);Probably want a comment for the ''1'' by saying: /* adding */.> + BUG_ON(rc);It would help to have a bit more details. Perhaps if (rc) { printk(XENLOG_ERR "1-1 mapping on %lx -> %lx for %ld failed with %d!\n", start_pfn, end_pfn, pfns, rc); BUG_ON(rc);> + } > + start = end; > + } > + } > +} > + > +static __init void dom0_update_physmap(struct domain *d, unsigned long pfn, > + unsigned long mfn, unsigned long vphysmap_s) > +{ > + if ( is_pvh_domain(d) ) { > + int rc = guest_physmap_add_page(d, pfn, mfn, 0);A comment about the 0 would be good. And you can collapse this in: BUG_ON( guest_physmap_add_page(d, pfn, mfn, 0 /* remove *) );> + BUG_ON(rc); > + return; > + } > + if ( !is_pv_32on64_domain(d) ) > + ((unsigned long *)vphysmap_s)[pfn] = mfn; > + else > + ((unsigned int *)vphysmap_s)[pfn] = mfn; > + > + set_gpfn_from_mfn(mfn, pfn); > +} > + > +static __init void copy_pvh(char *dest, char *src, int bytes) > +{ > + /* raw_copy_to_guest() -> copy_to_user_hvm -> __hvm_copy needs curr > + * to point to the hvm/pvh vcpu. Hence for PVH dom0 we can''t use that. > + * So we just use dbg_rw_mem(). > + */ > + int rem = dbg_rw_mem((dbgva_t)dest, (unsigned char *)src, bytes, 0, 1, 0); > + if (rem) { > + printk("PVH: Failed to copy to dom0. len:%d rem:%d\n", bytes, rem); > + BUG();Wait a minute? This is debug code but you use for copying?! 
Why not __copy_to_user?> + } > +} > + > int __init construct_dom0( > struct domain *d, > const module_t *image, unsigned long image_headroom, > @@ -314,6 +375,7 @@ int __init construct_dom0( > void *(*bootstrap_map)(const module_t *), > char *cmdline) > { > + char *si_buf=NULL, *tmp_buf=NULL; > int i, cpu, rc, compatible, compat32, order, machine; > struct cpu_user_regs *regs; > unsigned long pfn, mfn; > @@ -322,7 +384,7 @@ int __init construct_dom0( > unsigned long alloc_spfn; > unsigned long alloc_epfn; > unsigned long initrd_pfn = -1, initrd_mfn = 0; > - unsigned long count; > + unsigned long count, shared_info_pfn_addr = 0; > struct page_info *page = NULL; > start_info_t *si; > struct vcpu *v = d->vcpu[0]; > @@ -416,6 +478,13 @@ int __init construct_dom0( > { > printk("Kernel does not support Dom0 operation\n"); > return -EINVAL;You are adding it right above the return - so it will never reach this.> + > + if ( is_pvh_domain(d) && > + !test_bit(XENFEAT_hvm_callback_vector, parms.f_supported) ) > + { > + printk("Kernel does not support PVH mode\n"); > + return -EINVAL; > + } > } > > if ( compat32 ) > @@ -480,6 +549,12 @@ int __init construct_dom0(So .. what about (which is right above the vstartinfo_end) 474 vphysmap_end = vphysmap_start + (nr_pages * (!is_pv_32on64_domain(d) ? 475 sizeof(unsigned long) : 476 sizeof(unsigned int))); That is the code the figures out how big the P2M array should be. Shouldn''t that be just vphysmap_end = vphysmap_start? Or are doing some cunning work and setting the XEN_ELFNOTE_INIT_P2M to point to some wacky address and later on in this code checking for the base (params.p2m_base) and if so, then not creating the neccessary space for the P2M array?> vstartinfo_end = (vstartinfo_start + > sizeof(struct start_info) + > sizeof(struct dom0_vga_console_info)); > + > + if ( is_pvh_domain(d) ) { > + shared_info_pfn_addr = round_pgup(vstartinfo_end) - v_start;Huh. I don''t think that is the PFN. Rather it is the physical address offset. Perhaps it should be called ''shared_info_gaddr'' ? Looking at the code above, this means it is past the kernel, past the ramdisk, past the P2M array, and past the ''struct start_info'' and ''dom0_vga_console_info''. Which means that in include/public/xen.h also needs this: * 1. The domain is started within contiguous virtual-memory region. * 2. The contiguous region ends on an aligned 4MB boundary. * 3. This the order of bootstrap elements in the initial virtual region: * a. relocated kernel image * b. initial ram disk [mod_start, mod_len] * c. list of allocated page frames [mfn_list, nr_pages] * (unless relocated due to XEN_ELFNOTE_INIT_P2M) * d. start_info_t structure [register ESI (x86)] ==> d1. And struct shared_info_t structure [shared_info] if autotranslated guest. * e. bootstrap page tables [pt_base and CR3 (x86)] * f. 
bootstrap stack [register ESP (x86)] Which looks to match what the toolstack does.> + vstartinfo_end += PAGE_SIZE; > + } > + > vpt_start = round_pgup(vstartinfo_end); > for ( nr_pt_pages = 2; ; nr_pt_pages++ ) > { > @@ -621,16 +696,26 @@ int __init construct_dom0( > maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l3_page_table; > l3start = __va(mpt_alloc); mpt_alloc += PAGE_SIZE; > } > - clear_page(l4tab); > - init_guest_l4_table(l4tab, d); > - v->arch.guest_table = pagetable_from_paddr(__pa(l4start)); > - if ( is_pv_32on64_domain(d) ) > - v->arch.guest_table_user = v->arch.guest_table; > + if ( is_pvh_domain(d) ) > + { > + v->arch.guest_table = pagetable_from_paddr(vpt_start - v_start);Couldn''t we use mpt_alloc? It should point to the same location? So with this code we still allocated one page for the L4: 616 page = alloc_domheap_page(NULL, 0); Do you think it makes sense to free it? You are not using it - you are using the mpt_alloc as your L4. And you did not clean it either. Perhaps also have clear_page(mpt_alloc);> + pfn = 0;OK, so instead of starting the alloc_spfn which is the starting MFN for the page that covers the kernel, initrd, P2M array and the three various PAGE_SIZE structs, you start at zero. I think this warrants a comment. I believe this is b/c it does not matter to Xen - you are just creating a nice initial page-tables that the guest can use and the MFNs in it - have to be PFNs as it is running in auto-translated mode right? In which case you could also give this extra comment: /* Since the guest is going to run in auto-translated * mode and none of the pagiging is done via PV MMU calls * we create the page-tables from GPFN=0 up to boostrap. * TODO: Instead of using 4KB L4 we could use 2MB or so * similar to how the P2M array can if parms.p2m_base is set */ which begs a question. You are doing it from PFN 0 up to d->tot_pages and also include in this region the P2M array. Why?> + } else { > + clear_page(l4tab); > + init_guest_l4_table(l4tab, d); > + v->arch.guest_table = pagetable_from_paddr(__pa(l4start)); > + if ( is_pv_32on64_domain(d) ) > + v->arch.guest_table_user = v->arch.guest_table; > + pfn = alloc_spfn; > + } > > l4tab += l4_table_offset(v_start); > - pfn = alloc_spfn; > for ( count = 0; count < ((v_end-v_start)>>PAGE_SHIFT); count++ ) > { > + /* initrd chunk''s mfns are separate, so we need to adjust for them */Perhaps you can say: /* initrd chunk (see the ''alloc_domheap_pages'' calls above) is allocated from a seperate MFN chunk. Hence we need to adjust for that. */> + signed long pvh_adj = is_pvh_domain(d) ?Just ''long'' and perhaps ''pvh_offset''? Or ''initrd_offset''?> + (PFN_UP(initrd_len) - alloc_spfn)<<PAGE_SHIFT : 0;Oh gosh, what a mix. So alloc_spfn is actually the MFN of the memory allocated for the kernel. The PFN_UP(initrd_len) gives the length of the initrd in pfns. And you are using ''GPFN''s as ''pfn'' here. And the code ''if (count < initrd_pfn)'' is shortcircuited - so we will only put in ''pfn 0->d_tot_pages'' So a 50MB initrd is 12800 pfns. Wait. Why are you subtracting that value from the MFN of the memory that will be used for the kernel (say it is at 0x100 MFN for fun)? I am not sure I understand that. 
Could you explain it to me please?> + > if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) ) > { > maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l1_page_table; > @@ -657,16 +742,17 @@ int __init construct_dom0( > clear_page(l3tab); > if ( count == 0 ) > l3tab += l3_table_offset(v_start); > - *l4tab = l4e_from_paddr(__pa(l3start), L4_PROT); > + *l4tab = l4e_from_paddr(__pa(l3start) + pvh_adj, L4_PROT); > l4tab++; > } > - *l3tab = l3e_from_paddr(__pa(l2start), L3_PROT); > + *l3tab = l3e_from_paddr(__pa(l2start) + pvh_adj, L3_PROT); > l3tab++; > } > - *l2tab = l2e_from_paddr(__pa(l1start), L2_PROT); > + *l2tab = l2e_from_paddr(__pa(l1start) + pvh_adj, L2_PROT); > l2tab++; > } > - if ( count < initrd_pfn || count >= initrd_pfn + PFN_UP(initrd_len) ) > + if ( is_pvh_domain(d) || > + count < initrd_pfn || count >= initrd_pfn + PFN_UP(initrd_len) ) > mfn = pfn++; > else > mfn = initrd_mfn++; > @@ -674,6 +760,9 @@ int __init construct_dom0( > L1_PROT : COMPAT_L1_PROT)); > l1tab++; > > + if ( is_pvh_domain(d) ) > + continue; > + > page = mfn_to_page(mfn); > if ( (page->u.inuse.type_info == 0) && > !get_page_and_type(page, d, PGT_writable_page) ) > @@ -702,6 +791,9 @@ int __init construct_dom0( > COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*l2tab)); > } > > + if ( is_pvh_domain(d) ) > + goto pvh_skip_pt_rdonly; > + > /* Pages that are part of page tables must be read only. */ > l4tab = l4start + l4_table_offset(vpt_start); > l3start = l3tab = l4e_to_l3e(*l4tab); > @@ -741,6 +833,8 @@ int __init construct_dom0( > } > } > > +pvh_skip_pt_rdonly: > +Hm, so the pagetable creation for the initial page-tables virtual addresses has been eliminated. That means the initial calculation of ''order'' (alloc_domheap_pages) and the big loop of: 484 for ( nr_pt_pages = 2; ; nr_pt_pages++ ) 485 { calculates the wrong amount right? This needs a bit of mulling over. Since most of the Xen PV MMU code is disabled that means (from a Linux kernel perspective), that we do bootup with init_level4_pgt[272] -> level3_ident_pgt[0] level2_ident_pgt [0-511] [copy from the L2 that we created above.] init_level4_pgt[511] -> level3_kernel_pgt [510] -> level2_kernel_pgt [0-511] [copy from the L2 that we created above.] Since this is a whole-sale copy and only created page-tables from [0 -> end of ''shared info'' structure] and did create any page-table entries for the L4, L3, L2, L1, L1, L1.... page-tables. So any __va or __ka will work. And cleanup_highmap takes care of ripping out any entries from _brk_limit up to max_pfn_mapped (end of ramdisk). OK, so far so good. Wait, not it is not good - max_pfn_mapped is up to: xen_start_info->mfn_list. That is the ramdisk. But that means that __ka for the shared_struct and dom0_vga_console and start_info are <p2m array + xen_start_info->mfn_list> in. And cleanup_highmap would rip that out. Ah, we do have a bunch of revectoring: __va(xen_start_info->shared_info); and xen_start_info = (struct start_info *)__va(__pa(xen_start_info)); in the initial boot paths. So that is OK. OK. The xen_pagetable_init uses level2_kernel_pgt (so in Linux __ka space) to walk and clear up the PMD entries. And since we do not use the __ka to update the pagetables, but switch over to using __va as fast as possible this does look to work. But this needs to be documented. And we probably should not allocate that extra space for L1 entries that we are never going to use.> /* Mask all upcalls... 
*/ > for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ ) > shared_info(d, vcpu_info[i].evtchn_upcall_mask) = 1; > @@ -754,6 +848,11 @@ int __init construct_dom0( > (void)alloc_vcpu(d, i, cpu); > } > > + if ( is_pvh_domain(d) ) > + { > + v->arch.cr3 = v->arch.hvm_vcpu.guest_cr[3] > + (pagetable_get_pfn(v->arch.guest_table)) << PAGE_SHIFT; > + } > /* Set up CR3 value for write_ptbase */ > if ( paging_mode_enabled(d) ) > paging_update_paging_modes(v); > @@ -764,35 +863,16 @@ int __init construct_dom0( > write_ptbase(v); > mapcache_override_current(v); > > - /* Copy the OS image and free temporary buffer. */ > - elf.dest = (void*)vkern_start; > - rc = elf_load_binary(&elf, 0); > - if ( rc < 0 ) > - { > - printk("Failed to load the kernel binary\n"); > - return rc; > - } > - bootstrap_map(NULL); > - > - if ( UNSET_ADDR != parms.virt_hypercall ) > - { > - if ( (parms.virt_hypercall < v_start) || > - (parms.virt_hypercall >= v_end) ) > - { > - mapcache_override_current(NULL); > - write_ptbase(current); > - printk("Invalid HYPERCALL_PAGE field in ELF notes.\n"); > - return -1;Wait, you just ripped out this code? Why?!> + /* Set up start info area. */ > + if ( is_pvh_domain(d) ) { > + if ( (si_buf=xmalloc_bytes(PAGE_SIZE)) == NULL) {Odd style.> + printk("PVH: xmalloc failed to alloc %ld bytes.\n", PAGE_SIZE); > + return -ENOMEM; > } > - hypercall_page_initialise( > - d, (void *)(unsigned long)parms.virt_hypercall); > - } > - > - /* Free temporary buffers. */ > - discard_initial_images(); > + si = (start_info_t *)si_buf; > + } else > + si = (start_info_t *)vstartinfo_start;Why do we allocate a page for this? Isn''t the allocation for how many pages are need for this space already taken care of in: 479 vstartinfo_start = round_pgup(vphysmap_end); 480 vstartinfo_end = (vstartinfo_start + 481 sizeof(struct start_info) + 482 sizeof(struct dom0_vga_console_info)); 483 vpt_start = round_pgup(vstartinfo_end); and then 517 page = alloc_domheap_pages(d, order, 0); allocates the appropiate amount?> > - /* Set up start info area. */ > - si = (start_info_t *)vstartinfo_start; > clear_page(si); > si->nr_pages = nr_pages; > > @@ -814,7 +894,7 @@ int __init construct_dom0( > l2tab = NULL; > l1tab = NULL; > /* Set up the phys->machine table if not part of the initial mapping. */ > - if ( parms.p2m_base != UNSET_ADDR ) > + if ( parms.p2m_base != UNSET_ADDR && !is_pvh_domain(d) )AHA! I knew it! You could also do something like this at the start of the code: params.p2m_base = 0xdeadbeef; Or something better? 0xdeadf00d ? And then in here check if we use 0xdeadbeef and we skip over setting up the page-table entries for the P2M?> { > unsigned long va = vphysmap_start; > > @@ -935,6 +1015,9 @@ int __init construct_dom0( > unmap_domain_page(l3tab); > unmap_domain_page(l4start); > > + if (is_pvh_domain(d) ) > + hap_set_pvh_alloc_for_dom0(d, nr_pages); > + > /* Write the phys->machine and machine->phys table entries. 
*/ > for ( pfn = 0; pfn < count; pfn++ ) > { > @@ -951,11 +1034,8 @@ int __init construct_dom0( > if ( pfn > REVERSE_START && (vinitrd_start || pfn < initrd_pfn) ) > mfn = alloc_epfn - (pfn - REVERSE_START); > #endif > - if ( !is_pv_32on64_domain(d) ) > - ((unsigned long *)vphysmap_start)[pfn] = mfn; > - else > - ((unsigned int *)vphysmap_start)[pfn] = mfn; > - set_gpfn_from_mfn(mfn, pfn); > + dom0_update_physmap(d, pfn, mfn, vphysmap_start); > + > if (!(pfn & 0xfffff)) > process_pending_softirqs(); > } > @@ -971,8 +1051,8 @@ int __init construct_dom0( > if ( !page->u.inuse.type_info && > !get_page_and_type(page, d, PGT_writable_page) ) > BUG(); > - ((unsigned long *)vphysmap_start)[pfn] = mfn; > - set_gpfn_from_mfn(mfn, pfn); > + > + dom0_update_physmap(d, pfn, mfn, vphysmap_start); > ++pfn; > if (!(pfn & 0xfffff)) > process_pending_softirqs(); > @@ -992,11 +1072,7 @@ int __init construct_dom0( > #ifndef NDEBUG > #define pfn (nr_pages - 1 - (pfn - (alloc_epfn - alloc_spfn))) > #endif > - if ( !is_pv_32on64_domain(d) ) > - ((unsigned long *)vphysmap_start)[pfn] = mfn; > - else > - ((unsigned int *)vphysmap_start)[pfn] = mfn; > - set_gpfn_from_mfn(mfn, pfn); > + dom0_update_physmap(d, pfn, mfn, vphysmap_start); > #undef pfn > page++; pfn++; > if (!(pfn & 0xfffff)) > @@ -1004,6 +1080,47 @@ int __init construct_dom0( > } > } > > + /* Copy the OS image and free temporary buffer. */ > + elf.dest = (void*)vkern_start; > + rc = elf_load_binary(&elf, is_pvh_domain(d) ); > + if ( rc < 0 ) > + { > + printk("Failed to load the kernel binary\n"); > + return rc; > + } > + bootstrap_map(NULL); > + > + if ( UNSET_ADDR != parms.virt_hypercall ) > + { > + void *addr; > + > + if ( is_pvh_domain(d) ) { > + if ( (tmp_buf=xzalloc_bytes(PAGE_SIZE)) == NULL ) { > + printk("xzalloc failed for tmp_buf. %ld bytes.\n", PAGE_SIZE); > + return -ENOMEM; > + } > + addr = tmp_buf; > + } else > + addr = (void *)parms.virt_hypercall; > + > + if ( (parms.virt_hypercall < v_start) || > + (parms.virt_hypercall >= v_end) ) > + { > + write_ptbase(current); > + printk("Invalid HYPERCALL_PAGE field in ELF notes.\n"); > + return -1; > + } > + hypercall_page_initialise(d, addr); > + > + if ( is_pvh_domain(d) ) { > + copy_pvh((void *)parms.virt_hypercall, tmp_buf, PAGE_SIZE); > + xfree(tmp_buf); > + } > + }Wait? Why the move of the code here?> + > + /* Free temporary buffers. */ > + discard_initial_images(); > + > if ( initrd_len != 0 ) > { > si->mod_start = vinitrd_start ?: initrd_pfn; > @@ -1019,6 +1136,15 @@ int __init construct_dom0( > si->console.dom0.info_off = sizeof(struct start_info); > si->console.dom0.info_size = sizeof(struct dom0_vga_console_info); > } > + if ( is_pvh_domain(d) ) { > + unsigned long mfn = virt_to_mfn(d->shared_info); > + unsigned long pfn = shared_info_pfn_addr>>PAGE_SHIFT; > + si->shared_info = shared_info_pfn_addr;So that is not an PFN. That is the guest physical address I reckon? Ah, which matches (kind of) the comment: 725 unsigned long shared_info; /* MACHINE address of shared info struct. */> + dom0_update_physmap(d, pfn, mfn, 0);, 0 /* ignored. */> + > + copy_pvh((char *)vstartinfo_start, si_buf, PAGE_SIZE); > + xfree(si_buf);Oh. Why the copy? 
Why not just operate directly on the vstartinfo_start virtual address?> + } > > if ( is_pv_32on64_domain(d) ) > xlat_start_info(si, XLAT_start_info_console_dom0); > @@ -1050,12 +1176,16 @@ int __init construct_dom0( > regs->eip = parms.virt_entry; > regs->esp = vstack_end; > regs->esi = vstartinfo_start; > - regs->eflags = X86_EFLAGS_IF; > + regs->eflags = X86_EFLAGS_IF | 0x2;Ahem!> > - if ( opt_dom0_shadow ) > + if ( opt_dom0_shadow ) { > + if ( is_pvh_domain(d) ) { > + printk("Invalid option dom0_shadow for PVH\n"); > + return -EINVAL; > + } > if ( paging_enable(d, PG_SH_enable) == 0 ) > paging_update_paging_modes(v); > - > + } > if ( supervisor_mode_kernel ) > { > v->arch.pv_vcpu.kernel_ss &= ~3; > @@ -1132,6 +1262,9 @@ int __init construct_dom0( > > BUG_ON(rc != 0); > > + if ( is_pvh_domain(d) ) > + pvh_map_all_iomem(d); > + > iommu_dom0_init(dom0); > > return 0; > diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c > index 055833d..d3d5697 100644 > --- a/xen/arch/x86/mm/hap/hap.c > +++ b/xen/arch/x86/mm/hap/hap.c > @@ -574,6 +574,20 @@ int hap_domctl(struct domain *d, xen_domctl_shadow_op_t *sc, > } > } > > +/* Resize hap table. Copied from: libxl_get_required_shadow_memory() */ > +void hap_set_pvh_alloc_for_dom0(struct domain *d, unsigned long num_pages) > +{ > + int rc; > + unsigned long memkb = num_pages * (PAGE_SIZE / 1024); > + > + memkb = 4 * (256 * d->max_vcpus + 2 * (memkb / 1024));Could you include an explanation behind this formula please?> + num_pages = ((memkb+1023)/1024) << (20 - PAGE_SHIFT);num_pages = memkb >> 2 ?> + paging_lock(d); > + rc = hap_set_allocation(d, num_pages, NULL); > + paging_unlock(d); > + BUG_ON(rc); > +} > + > static const struct paging_mode hap_paging_real_mode; > static const struct paging_mode hap_paging_protected_mode; > static const struct paging_mode hap_paging_pae_mode; > @@ -633,7 +647,8 @@ static void hap_update_cr3(struct vcpu *v, int do_locking) > const struct paging_mode * > hap_paging_get_mode(struct vcpu *v) > { > - return !hvm_paging_enabled(v) ? &hap_paging_real_mode : > + return is_pvh_vcpu(v) ? &hap_paging_long_mode : > + !hvm_paging_enabled(v) ? &hap_paging_real_mode : > hvm_long_mode_enabled(v) ? &hap_paging_long_mode : > hvm_pae_enabled(v) ? &hap_paging_pae_mode : > &hap_paging_protected_mode; > diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c > index 43301a5..f307f24 100644 > --- a/xen/arch/x86/setup.c > +++ b/xen/arch/x86/setup.c > @@ -60,6 +60,10 @@ integer_param("maxcpus", max_cpus); > static bool_t __initdata disable_smep; > invbool_param("smep", disable_smep); > > +/* Boot dom0 in PVH mode */ > +static bool_t __initdata opt_dom0pvh; > +boolean_param("dom0pvh", opt_dom0pvh); > + > /* **** Linux config option: propagated to domain0. */ > /* "acpi=off": Sisables both ACPI table parsing and interpreter. */ > /* "acpi=force": Override the disable blacklist. */ > @@ -545,7 +549,7 @@ void __init __start_xen(unsigned long mbi_p) > { > char *memmap_type = NULL; > char *cmdline, *kextra, *loader; > - unsigned int initrdidx; > + unsigned int initrdidx, domcr_flags = 0; > multiboot_info_t *mbi = __va(mbi_p); > module_t *mod = (module_t *)__va(mbi->mods_addr); > unsigned long nr_pages, modules_headroom, *module_map; > @@ -1314,7 +1318,9 @@ void __init __start_xen(unsigned long mbi_p) > panic("Could not protect TXT memory regions\n"); > > /* Create initial domain 0. */ > - dom0 = domain_create(0, DOMCRF_s3_integrity, 0); > + domcr_flags = (opt_dom0pvh ? 
DOMCRF_pvh | DOMCRF_hap : 0); > + domcr_flags |= DOMCRF_s3_integrity; > + dom0 = domain_create(0, domcr_flags, 0); > if ( IS_ERR(dom0) || (alloc_dom0_vcpu0() == NULL) ) > panic("Error creating domain 0\n"); > > diff --git a/xen/include/asm-x86/hap.h b/xen/include/asm-x86/hap.h > index e03f983..aab8558 100644 > --- a/xen/include/asm-x86/hap.h > +++ b/xen/include/asm-x86/hap.h > @@ -63,6 +63,7 @@ int hap_track_dirty_vram(struct domain *d, > XEN_GUEST_HANDLE_64(uint8) dirty_bitmap); > > extern const struct paging_mode *hap_paging_get_mode(struct vcpu *); > +void hap_set_pvh_alloc_for_dom0(struct domain *d, unsigned long num_pages); > > #endif /* XEN_HAP_H */ > > -- > 1.7.2.3 > > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xen.org > http://lists.xen.org/xen-devel >
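For reference, working the hap_set_pvh_alloc_for_dom0() formula Konrad asks about
through a hypothetical 4GB dom0 with 4 vcpus (numbers illustrative only):

    /*
     * num_pages = 4GB / 4KB                       = 1048576 pages
     * memkb     = 1048576 * (4096/1024)           = 4194304 kB
     * memkb     = 4 * (256*4 + 2*(4194304/1024))
     *           = 4 * (1024 + 8192)               = 36864 kB
     * num_pages = ((36864 + 1023)/1024) << (20 - 12)
     *           = 36 << 8                         = 9216 pages (36 MB)
     *
     * i.e. roughly 1 MB of HAP (P2M/EPT) pool per vcpu plus 1 MB per
     * 128 MB of dom0 RAM.  Since one page is 4 kB, "memkb >> 2" gives
     * the same page count, modulo the rounding to whole megabytes.
     */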
>>> On 16.03.13 at 02:06, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> Finally, the hardest. Mostly modify construct_dom0() to boot PV dom0 in
> PVH mode. Introduce, opt_dom0pvh, which when specified in the command
> line, causes dom0 to boot in PVH mode.

Now that Konrad mentioned that PVH is intended for 64-bit guests
only, I fail to see where this restriction is being enforced in this
patch. And it would certainly have been worthwhile to state that
more prominently, even more so since Konrad keeps telling people
that this patch set is expected to initiate a deprecation phase for
the PV MMU interfaces in Linux (which then would mean no 32-bit
PV guest support at all anymore).

Jan
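The restriction would presumably amount to a one-line check next to the existing
feature tests in construct_dom0(); a hypothetical sketch, not part of the posted patch:

    /* PVH dom0 is (for now) 64-bit only; compat32 is computed earlier in
     * construct_dom0() from the kernel's ELF/XEN_ELFNOTE data. */
    if ( is_pvh_domain(d) && compat32 )
    {
        printk("32-bit kernels are not supported for PVH dom0\n");
        return -EINVAL;
    }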
Konrad Rzeszutek Wilk
2013-Mar-19 13:32 UTC
Re: [PATCH 17/18 V2]: PVH xen: PVH dom0 creation...
On Tue, Mar 19, 2013 at 09:27:34AM +0000, Jan Beulich wrote:
> >>> On 16.03.13 at 02:06, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> > Finally, the hardest. Mostly modify construct_dom0() to boot PV dom0 in
> > PVH mode. Introduce, opt_dom0pvh, which when specified in the command
> > line, causes dom0 to boot in PVH mode.
>
> Now that Konrad mentioned that PVH is intended for 64-bit guests
> only, I fail to see where this restriction is being enforced in this
> patch. And it would certainly have been worthwhile to state that
> more prominently, even more so since Konrad keeps telling people
> that this patch set is expected to initiate a deprecation phase for
> the PV MMU interfaces in Linux (which then would mean no 32-bit
> PV guest support at all anymore).

It was not in my mind for the first stage of this work, though I have to
admit that the 32-bit part completely vanished from my mind. You are
right - we then need 32-bit support, otherwise we are screwed for the X
years over which the deprecation phase of the upstream PV MMU interface
runs. I was not expecting this set of patches to be the "OK, the
deprecation timer starts ticking once the patch goes into Xen" moment -
rather, that should start when folks agree it is the right time.
On Mon, 18 Mar 2013 16:06:02 -0400
Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:

> On Fri, Mar 15, 2013 at 06:06:45PM -0700, Mukesh Rathor wrote:
> > +static __init void copy_pvh(char *dest, char *src, int bytes)
> > +{
> > +    /* raw_copy_to_guest() -> copy_to_user_hvm -> __hvm_copy needs curr
> > +     * to point to the hvm/pvh vcpu. Hence for PVH dom0 we can't use that.
> > +     * So we just use dbg_rw_mem().
> > +     */
> > +    int rem = dbg_rw_mem((dbgva_t)dest, (unsigned char *)src, bytes, 0, 1, 0);
> > +    if (rem) {
> > +        printk("PVH: Failed to copy to dom0. len:%d rem:%d\n", bytes, rem);
> > +        BUG();
>
> Wait a minute? This is debug code but you use for copying?! Why not
> __copy_to_user?

No, it's not debug code. I wrote the function to work for all guests
under all conditions, but since it was only used by the debugger I
called it dbg_*. Looking at alternatives.
On Tue, 19 Mar 2013 09:27:34 +0000
"Jan Beulich" <JBeulich@suse.com> wrote:

> >>> On 16.03.13 at 02:06, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> > Finally, the hardest. Mostly modify construct_dom0() to boot PV
> > dom0 in PVH mode. Introduce, opt_dom0pvh, which when specified in
> > the command line, causes dom0 to boot in PVH mode.
>
> Now that Konrad mentioned that PVH is intended for 64-bit guests
> only, I fail to see where this restriction is being enforced in this
> patch. And it would certainly have been worthwhile to state that

Yup. I forgot to mention that in the cover letter, and forgot to add a
check for it. It will be in the next patch for sure. Right now there is
no code in Linux for 32-bit PVH, btw.

> more prominently, even more so since Konrad keeps telling people
> that this patch set is expected to initiate a deprecation phase for
> the PV MMU interfaces in Linux (which then would mean no 32-bit
> PV guest support at all anymore).

Well, we definitely intend to head that way, and it is most feasible to
do this big piece of work in steps. We are by no means done when this
phase I patch is checked in; I've already outlined phases II, III, etc.
Having a baseline working in Xen will allow me to progress faster, and
allow others to contribute. I think I should leave PVH disabled until we
have reached a satisfactory point, and we can enable it in the future.

thanks
Mukesh
On Mon, 18 Mar 2013 13:01:23 +0000
"Jan Beulich" <JBeulich@suse.com> wrote:

> >>> On 16.03.13 at 02:06, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> > +static __init void copy_pvh(char *dest, char *src, int bytes)
> > +{
> > +    /* raw_copy_to_guest() -> copy_to_user_hvm -> __hvm_copy needs curr
> > +     * to point to the hvm/pvh vcpu. Hence for PVH dom0 we can't use that.
> > +     * So we just use dbg_rw_mem().
> > +     */
> > +    int rem = dbg_rw_mem((dbgva_t)dest, (unsigned char *)src, bytes, 0, 1, 0);
>
> Same comment as before: This is not acceptable for a submission
> of a patch intended to be committed (i.e. non-RFC). You should
> have worked out a suitable solution to this before posting.

Well, the previous patch was an RFC and I didn't hear anything on this
point, so I (mis)understood it to be acceptable. OK, I will just
implement a proper copy_pvh() here.
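One possible shape for a copy_pvh() that does not go through dbg_rw_mem() is to walk
the guest p2m and map each destination frame directly. This is only a sketch - the
name, the error handling and the assumption that the caller passes a guest physical
offset (dest_vaddr - v_start for the layout set up in this patch) are all made up here:

static __init int copy_to_pvh_phys(struct domain *d, paddr_t gpa,
                                   const char *src, unsigned int len)
{
    while ( len )
    {
        unsigned long gfn = gpa >> PAGE_SHIFT;
        unsigned int off = gpa & (PAGE_SIZE - 1);
        unsigned int chunk = PAGE_SIZE - off;
        p2m_type_t p2mt;
        mfn_t mfn = get_gfn(d, gfn, &p2mt);   /* look up and hold the gfn */
        char *va;

        if ( chunk > len )
            chunk = len;

        if ( !mfn_valid(mfn_x(mfn)) )
        {
            put_gfn(d, gfn);
            return -EINVAL;
        }

        /* Map the backing frame in Xen and copy this page's worth. */
        va = map_domain_page(mfn_x(mfn));
        memcpy(va + off, src, chunk);
        unmap_domain_page(va);
        put_gfn(d, gfn);

        gpa += chunk;
        src += chunk;
        len -= chunk;
    }
    return 0;
}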
On Mon, 18 Mar 2013 13:01:23 +0000
"Jan Beulich" <JBeulich@suse.com> wrote:

> >>> On 16.03.13 at 02:06, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> > @@ -307,6 +309,65 @@ static void __init process_dom0_ioports_disable(void)
> >  }
> >  }
> >
> > +/*
> > + * Set the 1:1 map for all non-RAM regions for dom 0. Thus, dom0 will have
> > + * the entire io region mapped in the EPT/NPT.
> > + */
> > +static __init void pvh_map_all_iomem(struct domain *d)
> > +{
> > +    unsigned long start = 0;
> > +    const struct e820entry *entry;
> > +    int rc, i, nump;
> > +
> > +    for (i = 0, entry = e820.map; i < e820.nr_map; i++, entry++) {
> > +        unsigned long end = entry->addr + entry->size;
> > +
> > +        if (entry->type == E820_RAM || i == e820.nr_map - 1) {
> > +            unsigned long start_pfn = PFN_DOWN(start);
> > +            unsigned long end_pfn = PFN_UP(end);
> > +
> > +            if (entry->type == E820_RAM)
> > +                end_pfn = PFN_UP(entry->addr);
> > +
> > +            if (start_pfn < end_pfn) {
> > +                nump = end_pfn - start_pfn + 1;
> > +                rc = domctl_memory_mapping(d, start_pfn, start_pfn, nump, 1);
> > +                BUG_ON(rc);
> > +            }
> > +            start = end;
> > +        }
> > +    }
>
> At least E820_UNUSABLE must be excluded here.
>
> And as you're mapping the holes only - how do you deal with
> the MMIO range past end of RAM? And perhaps even more
> important - how do you deal with the split between RAM and
> MMIO not being at the end of currently populated RAM, but
> at the end of possible hotpluggable regions.

Right now, in phase I, there is no support for hotplug. Are there any
other cases of this? Can you please give an example if so?
>>> On 29.03.13 at 01:32, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> On Mon, 18 Mar 2013 13:01:23 +0000
> "Jan Beulich" <JBeulich@suse.com> wrote:
>
>> >>> On 16.03.13 at 02:06, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
>> > @@ -307,6 +309,65 @@ static void __init process_dom0_ioports_disable(void)
>> >  }
>> >  }
>> >
>> > +/*
>> > + * Set the 1:1 map for all non-RAM regions for dom 0. Thus, dom0 will have
>> > + * the entire io region mapped in the EPT/NPT.
>> > + */
>> > +static __init void pvh_map_all_iomem(struct domain *d)
>> > +{
>> > +    unsigned long start = 0;
>> > +    const struct e820entry *entry;
>> > +    int rc, i, nump;
>> > +
>> > +    for (i = 0, entry = e820.map; i < e820.nr_map; i++, entry++) {
>> > +        unsigned long end = entry->addr + entry->size;
>> > +
>> > +        if (entry->type == E820_RAM || i == e820.nr_map - 1) {
>> > +            unsigned long start_pfn = PFN_DOWN(start);
>> > +            unsigned long end_pfn = PFN_UP(end);
>> > +
>> > +            if (entry->type == E820_RAM)
>> > +                end_pfn = PFN_UP(entry->addr);
>> > +
>> > +            if (start_pfn < end_pfn) {
>> > +                nump = end_pfn - start_pfn + 1;
>> > +                rc = domctl_memory_mapping(d, start_pfn, start_pfn, nump, 1);
>> > +                BUG_ON(rc);
>> > +            }
>> > +            start = end;
>> > +        }
>> > +    }
>>
>> At least E820_UNUSABLE must be excluded here.
>>
>> And as you're mapping the holes only - how do you deal with
>> the MMIO range past end of RAM? And perhaps even more
>> important - how do you deal with the split between RAM and
>> MMIO not being at the end of currently populated RAM, but
>> at the end of possible hotpluggable regions.
>
> Right now, in phase I, there is no support for hotplug. Are there any
> other cases of this? Can you please give an example if so?

On a system with just 2Gb (and no hotplug) the code as I read it
stops mapping at the 2Gb boundary, i.e. all MMIO regions (up to
4Gb as well as beyond) would remain unmapped.

Jan
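One blunt way to cover the 2Gb case above would be to keep the 1:1 mapping going from
the end of the e820 map up to the 4GB boundary, so the PCI hole below 4GB is always
present. A hypothetical helper, sketch only (it still leaves MMIO above 4GB, e.g.
64-bit BARs, to be dealt with separately):

/* Called after the e820 walk in pvh_map_all_iomem(): identity-map
 * everything from the end of the e820 map up to 4GB, so the PCI hole
 * below 4GB is covered even on small-memory machines. */
static __init void pvh_map_mmio_below_4gb(struct domain *d)
{
    unsigned long last_pfn = PFN_UP(e820.map[e820.nr_map - 1].addr +
                                    e820.map[e820.nr_map - 1].size);
    unsigned long end_pfn = 1UL << (32 - PAGE_SHIFT);   /* 4GB boundary */

    if ( last_pfn < end_pfn )
        BUG_ON(domctl_memory_mapping(d, last_pfn, last_pfn,
                                     end_pfn - last_pfn, 1 /* add */));
}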