Finally, the hardest part. This mostly modifies construct_dom0() so that a PV dom0 can be booted in PVH mode. Introduce opt_dom0pvh (the "dom0pvh" command-line option) which, when specified, causes dom0 to boot in PVH mode. Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com> diff -r c65051a66d7d -r b66895f53279 xen/arch/x86/domain_build.c --- a/xen/arch/x86/domain_build.c Fri Jan 11 16:51:30 2013 -0800 +++ b/xen/arch/x86/domain_build.c Fri Jan 11 16:56:48 2013 -0800 @@ -35,6 +35,8 @@ #include <asm/setup.h> #include <asm/bzimage.h> /* for bzimage_parse */ #include <asm/io_apic.h> +#include <asm/hap.h> +#include <asm/debugger.h> #include <public/version.h> @@ -307,6 +309,36 @@ static void __init process_dom0_ioports_ } } +static noinline __init void dom0_update_physmap(struct domain *d, unsigned long pfn, + unsigned long mfn, unsigned long vphysmap_s) +{ + if ( is_pvh_domain(d) ) { + int rc = guest_physmap_add_page(d, pfn, mfn, 0); + BUG_ON(rc); /* for now while PVH feature is experimental */ + return; + } + if ( !is_pv_32on64_domain(d) ) + ((unsigned long *)vphysmap_s)[pfn] = mfn; + else + ((unsigned int *)vphysmap_s)[pfn] = mfn; + + set_gpfn_from_mfn(mfn, pfn); +} + +static __init void noinline copy_pvh(char *dest, char *src, int bytes) +{ + /* + * NOTE: raw_copy_to_guest() -> copy_to_user_hvm -> __hvm_copy needs curr + * to point to the hvm/pvh vcpu. Hence for PVH dom0 we can't use that. + * So we just use dbg_rw_mem(). + */ + int rem = dbg_rw_mem((dbgva_t)dest, (unsigned char *)src, bytes, 0, 1, 0); + if (rem) { + printk("Failed to copy to dom0.
len:%d rem:%d\n", bytes, rem); + domain_crash_synchronous(); + } +} + int __init construct_dom0( struct domain *d, const module_t *image, unsigned long image_headroom, @@ -314,6 +346,7 @@ int __init construct_dom0( void *(*bootstrap_map)(const module_t *), char *cmdline) { + char *si_buf=NULL, *tmp_buf=NULL; int i, cpu, rc, compatible, compat32, order, machine; struct cpu_user_regs *regs; unsigned long pfn, mfn; @@ -322,7 +355,7 @@ int __init construct_dom0( unsigned long alloc_spfn; unsigned long alloc_epfn; unsigned long initrd_pfn = -1, initrd_mfn = 0; - unsigned long count; + unsigned long count, shared_info_pfn_addr = 0; struct page_info *page = NULL; start_info_t *si; struct vcpu *v = d->vcpu[0]; @@ -416,6 +449,13 @@ int __init construct_dom0( { printk("Kernel does not support Dom0 operation\n"); return -EINVAL; + + if ( is_pvh_domain(d) && + !test_bit(XENFEAT_hvm_callback_vector, parms.f_supported) ) + { + printk("Kernel does not support PVH mode\n"); + return -EINVAL; + } } if ( compat32 ) @@ -480,6 +520,12 @@ int __init construct_dom0( vstartinfo_end = (vstartinfo_start + sizeof(struct start_info) + sizeof(struct dom0_vga_console_info)); + + if ( is_pvh_domain(d) ) { + shared_info_pfn_addr = round_pgup(vstartinfo_end) - v_start; + vstartinfo_end += PAGE_SIZE; + } + vpt_start = round_pgup(vstartinfo_end); for ( nr_pt_pages = 2; ; nr_pt_pages++ ) { @@ -621,20 +667,30 @@ int __init construct_dom0( maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l3_page_table; l3start = __va(mpt_alloc); mpt_alloc += PAGE_SIZE; } - copy_page(l4tab, idle_pg_table); - l4tab[0] = l4e_empty(); /* zap trampoline mapping */ - l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] = - l4e_from_paddr(__pa(l4start), __PAGE_HYPERVISOR); - l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] = - l4e_from_paddr(__pa(d->arch.mm_perdomain_l3), __PAGE_HYPERVISOR); - v->arch.guest_table = pagetable_from_paddr(__pa(l4start)); - if ( is_pv_32on64_domain(d) ) - v->arch.guest_table_user = v->arch.guest_table; + if (
is_pvh_domain(d) ) + { + v->arch.guest_table = pagetable_from_paddr(vpt_start - v_start); + pfn = 0; + } else { + copy_page(l4tab, idle_pg_table); + l4tab[0] = l4e_empty(); /* zap trampoline mapping */ + l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] = + l4e_from_paddr(__pa(l4start), __PAGE_HYPERVISOR); + l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] = + l4e_from_paddr(__pa(d->arch.mm_perdomain_l3), __PAGE_HYPERVISOR); + v->arch.guest_table = pagetable_from_paddr(__pa(l4start)); + if ( is_pv_32on64_domain(d) ) + v->arch.guest_table_user = v->arch.guest_table; + pfn = alloc_spfn; + } l4tab += l4_table_offset(v_start); - pfn = alloc_spfn; for ( count = 0; count < ((v_end-v_start)>>PAGE_SHIFT); count++ ) { + /* initrd chunk's mfns are separate, so we need to adjust for them */ + signed long pvh_adj = is_pvh_domain(d) ? + (-alloc_spfn + PFN_UP(initrd_len))<<PAGE_SHIFT : 0; + if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) ) { maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l1_page_table; @@ -661,16 +717,17 @@ int __init construct_dom0( clear_page(l3tab); if ( count == 0 ) l3tab += l3_table_offset(v_start); - *l4tab = l4e_from_paddr(__pa(l3start), L4_PROT); + *l4tab = l4e_from_paddr(__pa(l3start) + pvh_adj, L4_PROT); l4tab++; } - *l3tab = l3e_from_paddr(__pa(l2start), L3_PROT); + *l3tab = l3e_from_paddr(__pa(l2start) + pvh_adj, L3_PROT); l3tab++; } - *l2tab = l2e_from_paddr(__pa(l1start), L2_PROT); + *l2tab = l2e_from_paddr(__pa(l1start) + pvh_adj, L2_PROT); l2tab++; } - if ( count < initrd_pfn || count >= initrd_pfn + PFN_UP(initrd_len) ) + if ( is_pvh_domain(d) || + count < initrd_pfn || count >= initrd_pfn + PFN_UP(initrd_len) ) mfn = pfn++; else mfn = initrd_mfn++; @@ -678,6 +735,9 @@ int __init construct_dom0( L1_PROT : COMPAT_L1_PROT)); l1tab++; + if ( is_pvh_domain(d) ) + continue; + page = mfn_to_page(mfn); if ( (page->u.inuse.type_info == 0) && !get_page_and_type(page, d, PGT_writable_page) ) @@ -706,6 +766,9 @@ int __init construct_dom0(
COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*l2tab)); } + if ( is_pvh_domain(d) ) + goto skip_ptrdonly_for_pvh; + /* Pages that are part of page tables must be read only. */ l4tab = l4start + l4_table_offset(vpt_start); l3start = l3tab = l4e_to_l3e(*l4tab); @@ -745,6 +808,7 @@ int __init construct_dom0( } } +skip_ptrdonly_for_pvh: /* Mask all upcalls... */ for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ ) shared_info(d, vcpu_info[i].evtchn_upcall_mask) = 1; @@ -758,6 +822,11 @@ int __init construct_dom0( (void)alloc_vcpu(d, i, cpu); } + if ( is_pvh_domain(d) ) + { + v->arch.cr3 = v->arch.hvm_vcpu.guest_cr[3] = + (pagetable_get_pfn(v->arch.guest_table)) << PAGE_SHIFT; + } /* Set up CR3 value for write_ptbase */ if ( paging_mode_enabled(d) ) paging_update_paging_modes(v); @@ -767,34 +836,16 @@ int __init construct_dom0( /* We run on dom0's page tables for the final part of the build process. */ write_ptbase(v); - /* Copy the OS image and free temporary buffer. */ - elf.dest = (void*)vkern_start; - rc = elf_load_binary(&elf, 0); - if ( rc < 0 ) - { - printk("Failed to load the kernel binary\n"); - return rc; - } - bootstrap_map(NULL); + /* Set up start info area. */ + if ( is_pvh_domain(d) ) { + if ( (si_buf=xmalloc_bytes(PAGE_SIZE)) == NULL) { + printk("xmalloc failed to alloc %ld bytes.\n", PAGE_SIZE); + return -ENOMEM; + } + si = (start_info_t *)si_buf; + } else + si = (start_info_t *)vstartinfo_start; - if ( UNSET_ADDR != parms.virt_hypercall ) - { - if ( (parms.virt_hypercall < v_start) || - (parms.virt_hypercall >= v_end) ) - { - write_ptbase(current); - printk("Invalid HYPERCALL_PAGE field in ELF notes.\n"); - return -1; - } - hypercall_page_initialise( - d, (void *)(unsigned long)parms.virt_hypercall); - } - - /* Free temporary buffers. */ - discard_initial_images(); - - /* Set up start info area.
*/ - si = (start_info_t *)vstartinfo_start; clear_page(si); si->nr_pages = nr_pages; @@ -812,7 +863,7 @@ int __init construct_dom0( count = d->tot_pages; /* Set up the phys->machine table if not part of the initial mapping. */ - if ( parms.p2m_base != UNSET_ADDR ) + if ( parms.p2m_base != UNSET_ADDR && !is_pvh_domain(d) ) { unsigned long va = vphysmap_start; @@ -911,6 +962,9 @@ int __init construct_dom0( panic("Not enough RAM for DOM0 P->M table.\n"); } + if (is_pvh_domain(d) ) + hap_set_pvh_alloc_for_dom0(d, nr_pages); + /* Write the phys->machine and machine->phys table entries. */ for ( pfn = 0; pfn < count; pfn++ ) { @@ -927,11 +981,8 @@ int __init construct_dom0( if ( pfn > REVERSE_START && (vinitrd_start || pfn < initrd_pfn) ) mfn = alloc_epfn - (pfn - REVERSE_START); #endif - if ( !is_pv_32on64_domain(d) ) - ((unsigned long *)vphysmap_start)[pfn] = mfn; - else - ((unsigned int *)vphysmap_start)[pfn] = mfn; - set_gpfn_from_mfn(mfn, pfn); + dom0_update_physmap(d, pfn, mfn, vphysmap_start); + if (!(pfn & 0xfffff)) process_pending_softirqs(); } @@ -947,8 +998,8 @@ int __init construct_dom0( if ( !page->u.inuse.type_info && !get_page_and_type(page, d, PGT_writable_page) ) BUG(); - ((unsigned long *)vphysmap_start)[pfn] = mfn; - set_gpfn_from_mfn(mfn, pfn); + + dom0_update_physmap(d, pfn, mfn, vphysmap_start); ++pfn; if (!(pfn & 0xfffff)) process_pending_softirqs(); @@ -968,11 +1019,7 @@ int __init construct_dom0( #ifndef NDEBUG #define pfn (nr_pages - 1 - (pfn - (alloc_epfn - alloc_spfn))) #endif - if ( !is_pv_32on64_domain(d) ) - ((unsigned long *)vphysmap_start)[pfn] = mfn; - else - ((unsigned int *)vphysmap_start)[pfn] = mfn; - set_gpfn_from_mfn(mfn, pfn); + dom0_update_physmap(d, pfn, mfn, vphysmap_start); #undef pfn page++; pfn++; if (!(pfn & 0xfffff)) @@ -980,6 +1027,47 @@ int __init construct_dom0( } } + /* Copy the OS image and free temporary buffer. 
*/ + elf.dest = (void*)vkern_start; + rc = elf_load_binary(&elf, is_pvh_domain(d) ); + if ( rc < 0 ) + { + printk("Failed to load the kernel binary\n"); + return rc; + } + bootstrap_map(NULL); + + if ( UNSET_ADDR != parms.virt_hypercall ) + { + void *addr; + + if ( is_pvh_domain(d) ) { + if ( (tmp_buf=xzalloc_bytes(PAGE_SIZE)) == NULL ) { + printk("xzalloc failed for tmp_buf. %ld bytes.\n", PAGE_SIZE); + return -ENOMEM; + } + addr = tmp_buf; + } else + addr = (void *)parms.virt_hypercall; + + if ( (parms.virt_hypercall < v_start) || + (parms.virt_hypercall >= v_end) ) + { + write_ptbase(current); + printk("Invalid HYPERCALL_PAGE field in ELF notes.\n"); + return -1; + } + hypercall_page_initialise(d, addr); + + if ( is_pvh_domain(d) ) { + copy_pvh((void *)parms.virt_hypercall, tmp_buf, PAGE_SIZE); + xfree(tmp_buf); + } + } + + /* Free temporary buffers. */ + discard_initial_images(); + if ( initrd_len != 0 ) { si->mod_start = vinitrd_start ?: initrd_pfn; @@ -995,6 +1083,15 @@ int __init construct_dom0( si->console.dom0.info_off = sizeof(struct start_info); si->console.dom0.info_size = sizeof(struct dom0_vga_console_info); } + if ( is_pvh_domain(d) ) { + unsigned long mfn = virt_to_mfn(d->shared_info); + unsigned long pfn = shared_info_pfn_addr>>PAGE_SHIFT; + si->shared_info = shared_info_pfn_addr; + dom0_update_physmap(d, pfn, mfn, 0); + + copy_pvh((char *)vstartinfo_start, si_buf, PAGE_SIZE); + xfree(si_buf); + } if ( is_pv_32on64_domain(d) ) xlat_start_info(si, XLAT_start_info_console_dom0); @@ -1025,12 +1122,16 @@ int __init construct_dom0( regs->eip = parms.virt_entry; regs->esp = vstack_end; regs->esi = vstartinfo_start; - regs->eflags = X86_EFLAGS_IF; + regs->eflags = X86_EFLAGS_IF | 0x2; - if ( opt_dom0_shadow ) + if ( opt_dom0_shadow ) { + if ( is_pvh_domain(d) ) { + printk("Invalid option dom0_shadow for PVH\n"); + return -EINVAL; + } if ( paging_enable(d, PG_SH_enable) == 0 ) paging_update_paging_modes(v); - + } if ( supervisor_mode_kernel ) { 
v->arch.pv_vcpu.kernel_ss &= ~3; diff -r c65051a66d7d -r b66895f53279 xen/arch/x86/mm/hap/hap.c --- a/xen/arch/x86/mm/hap/hap.c Fri Jan 11 16:51:30 2013 -0800 +++ b/xen/arch/x86/mm/hap/hap.c Fri Jan 11 16:56:48 2013 -0800 @@ -600,6 +600,20 @@ int hap_domctl(struct domain *d, xen_dom } } +/* Resize hap table. Copied from: libxl_get_required_shadow_memory() */ +void hap_set_pvh_alloc_for_dom0(struct domain *d, unsigned long num_pages) +{ + int rc; + unsigned long memkb = num_pages * (PAGE_SIZE / 1024); + + memkb = 4 * (256 * d->max_vcpus + 2 * (memkb / 1024)); + num_pages = ((memkb+1023)/1024) << (20 - PAGE_SHIFT); + paging_lock(d); + rc = hap_set_allocation(d, num_pages, NULL); + paging_unlock(d); + BUG_ON(rc); +} + static const struct paging_mode hap_paging_real_mode; static const struct paging_mode hap_paging_protected_mode; static const struct paging_mode hap_paging_pae_mode; @@ -659,7 +673,8 @@ static void hap_update_cr3(struct vcpu * const struct paging_mode * hap_paging_get_mode(struct vcpu *v) { - return !hvm_paging_enabled(v) ? &hap_paging_real_mode : + return is_pvh_vcpu(v) ? &hap_paging_long_mode : + !hvm_paging_enabled(v) ? &hap_paging_real_mode : hvm_long_mode_enabled(v) ? &hap_paging_long_mode : hvm_pae_enabled(v) ? &hap_paging_pae_mode : &hap_paging_protected_mode; diff -r c65051a66d7d -r b66895f53279 xen/arch/x86/setup.c --- a/xen/arch/x86/setup.c Fri Jan 11 16:51:30 2013 -0800 +++ b/xen/arch/x86/setup.c Fri Jan 11 16:56:48 2013 -0800 @@ -59,6 +59,10 @@ integer_param("maxcpus", max_cpus); static bool_t __initdata disable_smep; invbool_param("smep", disable_smep); +/* Boot dom0 in PVH mode */ +static bool_t __initdata opt_dom0pvh; +boolean_param("dom0pvh", opt_dom0pvh); + /* **** Linux config option: propagated to domain0. */ /* "acpi=off": Sisables both ACPI table parsing and interpreter. */ /* "acpi=force": Override the disable blacklist. 
*/ @@ -535,7 +539,7 @@ void __init __start_xen(unsigned long mb { char *memmap_type = NULL; char *cmdline, *kextra, *loader; - unsigned int initrdidx; + unsigned int initrdidx, domcr_flags = 0; multiboot_info_t *mbi = __va(mbi_p); module_t *mod = (module_t *)__va(mbi->mods_addr); unsigned long nr_pages, modules_headroom, *module_map; @@ -1248,7 +1252,9 @@ void __init __start_xen(unsigned long mb panic("Could not protect TXT memory regions\n"); /* Create initial domain 0. */ - dom0 = domain_create(0, DOMCRF_s3_integrity, 0); + domcr_flags = (opt_dom0pvh ? DOMCRF_pvh | DOMCRF_hap : 0); + domcr_flags |= DOMCRF_s3_integrity; + dom0 = domain_create(0, domcr_flags, 0); if ( IS_ERR(dom0) || (alloc_dom0_vcpu0() == NULL) ) panic("Error creating domain 0\n"); diff -r c65051a66d7d -r b66895f53279 xen/include/asm-x86/hap.h --- a/xen/include/asm-x86/hap.h Fri Jan 11 16:51:30 2013 -0800 +++ b/xen/include/asm-x86/hap.h Fri Jan 11 16:56:48 2013 -0800 @@ -63,6 +63,7 @@ int hap_track_dirty_vram(struct domain XEN_GUEST_HANDLE_64(uint8) dirty_bitmap); extern const struct paging_mode *hap_paging_get_mode(struct vcpu *); +void hap_set_pvh_alloc_for_dom0(struct domain *d, unsigned long num_pages); #endif /* XEN_HAP_H */