Mukesh Rathor
2013-Mar-19 22:33 UTC
[PATCH 01/18 V2 RESEND] PVH xen: turn gdt_frames/gdt_ents into union
NOTE: This is a resend of V2 using git send-email. PVH only needs gdtaddr and gdtsz, so a union is created. There is no functional code change in this patch. Changes in V2: - Add __XEN_INTERFACE_VERSION__ Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com> --- tools/libxc/xc_domain_restore.c | 8 ++++---- tools/libxc/xc_domain_save.c | 6 +++--- xen/arch/x86/domain.c | 12 ++++++------ xen/arch/x86/domctl.c | 12 ++++++------ xen/include/public/arch-x86/xen.h | 13 +++++++++++++ 5 files changed, 32 insertions(+), 19 deletions(-) diff --git a/tools/libxc/xc_domain_restore.c b/tools/libxc/xc_domain_restore.c index a15f86a..9a22e2a 100644 --- a/tools/libxc/xc_domain_restore.c +++ b/tools/libxc/xc_domain_restore.c @@ -2020,15 +2020,15 @@ int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom, munmap(start_info, PAGE_SIZE); } /* Uncanonicalise each GDT frame number. */ - if ( GET_FIELD(ctxt, gdt_ents) > 8192 ) + if ( GET_FIELD(ctxt, u.pv.gdt_ents) > 8192 ) { ERROR("GDT entry count out of range"); goto out; } - for ( j = 0; (512*j) < GET_FIELD(ctxt, gdt_ents); j++ ) + for ( j = 0; (512*j) < GET_FIELD(ctxt, u.pv.gdt_ents); j++ ) { - pfn = GET_FIELD(ctxt, gdt_frames[j]); + pfn = GET_FIELD(ctxt, u.pv.gdt_frames[j]); if ( (pfn >= dinfo->p2m_size) || (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB) ) { @@ -2036,7 +2036,7 @@ int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom, j, (unsigned long)pfn); goto out; } - SET_FIELD(ctxt, gdt_frames[j], ctx->p2m[pfn]); + SET_FIELD(ctxt, u.pv.gdt_frames[j], ctx->p2m[pfn]); } /* Uncanonicalise the page table base pointer. */ pfn = UNFOLD_CR3(GET_FIELD(ctxt, ctrlreg[3])); diff --git a/tools/libxc/xc_domain_save.c b/tools/libxc/xc_domain_save.c index ff76626..4ec5e7e 100644 --- a/tools/libxc/xc_domain_save.c +++ b/tools/libxc/xc_domain_save.c @@ -1900,15 +1900,15 @@ int xc_domain_save(xc_interface *xch, int io_fd, uint32_t dom, uint32_t max_iter } /* Canonicalise each GDT frame number. */ - for ( j = 0; (512*j) < GET_FIELD(&ctxt, gdt_ents); j++ ) + for ( j = 0; (512*j) < GET_FIELD(&ctxt, u.pv.gdt_ents); j++ ) { - mfn = GET_FIELD(&ctxt, gdt_frames[j]); + mfn = GET_FIELD(&ctxt, u.pv.gdt_frames[j]); if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) ) { ERROR("GDT frame is not in range of pseudophys map"); goto out; } - SET_FIELD(&ctxt, gdt_frames[j], mfn_to_pfn(mfn)); + SET_FIELD(&ctxt, u.pv.gdt_frames[j], mfn_to_pfn(mfn)); } /* Canonicalise the page table base pointer. 
*/ diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c index 8d30d08..ea1381c 100644 --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -780,8 +780,8 @@ int arch_set_info_guest( } for ( i = 0; i < ARRAY_SIZE(v->arch.pv_vcpu.gdt_frames); ++i ) - fail |= v->arch.pv_vcpu.gdt_frames[i] != c(gdt_frames[i]); - fail |= v->arch.pv_vcpu.gdt_ents != c(gdt_ents); + fail |= v->arch.pv_vcpu.gdt_frames[i] != c(u.pv.gdt_frames[i]); + fail |= v->arch.pv_vcpu.gdt_ents != c(u.pv.gdt_ents); fail |= v->arch.pv_vcpu.ldt_base != c(ldt_base); fail |= v->arch.pv_vcpu.ldt_ents != c(ldt_ents); @@ -830,17 +830,17 @@ int arch_set_info_guest( d->vm_assist = c(vm_assist); if ( !compat ) - rc = (int)set_gdt(v, c.nat->gdt_frames, c.nat->gdt_ents); + rc = (int)set_gdt(v, c.nat->u.pv.gdt_frames, c.nat->u.pv.gdt_ents); else { unsigned long gdt_frames[ARRAY_SIZE(v->arch.pv_vcpu.gdt_frames)]; - unsigned int n = (c.cmp->gdt_ents + 511) / 512; + unsigned int n = (c.cmp->u.pv.gdt_ents + 511) / 512; if ( n > ARRAY_SIZE(v->arch.pv_vcpu.gdt_frames) ) return -EINVAL; for ( i = 0; i < n; ++i ) - gdt_frames[i] = c.cmp->gdt_frames[i]; - rc = (int)set_gdt(v, gdt_frames, c.cmp->gdt_ents); + gdt_frames[i] = c.cmp->u.pv.gdt_frames[i]; + rc = (int)set_gdt(v, gdt_frames, c.cmp->u.pv.gdt_ents); } if ( rc != 0 ) return rc; diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c index a196e2a..31937e0 100644 --- a/xen/arch/x86/domctl.c +++ b/xen/arch/x86/domctl.c @@ -1305,12 +1305,12 @@ void arch_get_info_guest(struct vcpu *v, vcpu_guest_context_u c) c(ldt_base = v->arch.pv_vcpu.ldt_base); c(ldt_ents = v->arch.pv_vcpu.ldt_ents); for ( i = 0; i < ARRAY_SIZE(v->arch.pv_vcpu.gdt_frames); ++i ) - c(gdt_frames[i] = v->arch.pv_vcpu.gdt_frames[i]); - BUILD_BUG_ON(ARRAY_SIZE(c.nat->gdt_frames) !- ARRAY_SIZE(c.cmp->gdt_frames)); - for ( ; i < ARRAY_SIZE(c.nat->gdt_frames); ++i ) - c(gdt_frames[i] = 0); - c(gdt_ents = v->arch.pv_vcpu.gdt_ents); + c(u.pv.gdt_frames[i] = v->arch.pv_vcpu.gdt_frames[i]); + BUILD_BUG_ON(ARRAY_SIZE(c.nat->u.pv.gdt_frames) !+ ARRAY_SIZE(c.cmp->u.pv.gdt_frames)); + for ( ; i < ARRAY_SIZE(c.nat->u.pv.gdt_frames); ++i ) + c(u.pv.gdt_frames[i] = 0); + c(u.pv.gdt_ents = v->arch.pv_vcpu.gdt_ents); c(kernel_ss = v->arch.pv_vcpu.kernel_ss); c(kernel_sp = v->arch.pv_vcpu.kernel_sp); for ( i = 0; i < ARRAY_SIZE(v->arch.pv_vcpu.ctrlreg); ++i ) diff --git a/xen/include/public/arch-x86/xen.h b/xen/include/public/arch-x86/xen.h index b7f6a51..ea72532 100644 --- a/xen/include/public/arch-x86/xen.h +++ b/xen/include/public/arch-x86/xen.h @@ -170,7 +170,20 @@ struct vcpu_guest_context { struct cpu_user_regs user_regs; /* User-level CPU registers */ struct trap_info trap_ctxt[256]; /* Virtual IDT */ unsigned long ldt_base, ldt_ents; /* LDT (linear address, # ents) */ +#if __XEN_INTERFACE_VERSION__ < 0x00040300 unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */ +#else + union { + struct { + /* GDT (machine frames, # ents) */ + unsigned long gdt_frames[16], gdt_ents; + } pv; + struct { + /* PVH: GDTR addr and size */ + unsigned long gdtaddr, gdtsz; + } pvh; + } u; +#endif unsigned long kernel_ss, kernel_sp; /* Virtual TSS (only SS1/SP1) */ /* NB. User pagetable on x86/64 is placed in ctrlreg[1]. */ unsigned long ctrlreg[8]; /* CR0-CR7 (control registers) */ -- 1.7.2.3
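For illustration, with __XEN_INTERFACE_VERSION__ >= 0x00040300 the GDT fields are now reached through the union: u.pv for classic PV, u.pvh for PVH. The sketch below shows the two cases side by side; fill_gdt_fields() is a hypothetical helper, not part of the patch:

    /* Hypothetical helper: populate the GDT portion of a vcpu context. */
    static void fill_gdt_fields(struct vcpu_guest_context *ctxt, int is_pvh,
                                const unsigned long *frames, unsigned long nr_ents,
                                unsigned long gdtaddr, unsigned long gdtsz)
    {
        if ( !is_pvh )
        {
            unsigned int i;

            /* nr_ents is capped at 8192, i.e. at most 16 frames. */
            for ( i = 0; i < (nr_ents + 511) / 512; i++ )
                ctxt->u.pv.gdt_frames[i] = frames[i];  /* machine frames */
            ctxt->u.pv.gdt_ents = nr_ents;             /* # of entries */
        }
        else
        {
            ctxt->u.pvh.gdtaddr = gdtaddr;             /* GDTR base */
            ctxt->u.pvh.gdtsz   = gdtsz;               /* GDTR size */
        }
    }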
Mukesh Rathor
2013-Mar-19 22:33 UTC
[PATCH 02/18 V2 RESEND]: PVH xen: add XENMEM_add_to_physmap_range
In this patch we add a new function xenmem_add_to_physmap_range(), and change xenmem_add_to_physmap_once parameters so it can be called from xenmem_add_to_physmap_range. There is no PVH specific change here. Changes in V2: - Do not break parameter so xenmem_add_to_physmap_once() but pass in struct xen_add_to_physmap. Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com> --- xen/arch/x86/mm.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 files changed, 79 insertions(+), 3 deletions(-) diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c index d00d9a2..6603752 100644 --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -4268,7 +4268,8 @@ static int handle_iomem_range(unsigned long s, unsigned long e, void *p) static int xenmem_add_to_physmap_once( struct domain *d, - const struct xen_add_to_physmap *xatp) + const struct xen_add_to_physmap *xatp, + domid_t foreign_domid) { struct page_info *page = NULL; unsigned long gfn = 0; /* gcc ... */ @@ -4395,7 +4396,7 @@ static int xenmem_add_to_physmap(struct domain *d, start_xatp = *xatp; while ( xatp->size > 0 ) { - rc = xenmem_add_to_physmap_once(d, xatp); + rc = xenmem_add_to_physmap_once(d, xatp, -1); if ( rc < 0 ) return rc; @@ -4421,7 +4422,52 @@ static int xenmem_add_to_physmap(struct domain *d, return rc; } - return xenmem_add_to_physmap_once(d, xatp); + return xenmem_add_to_physmap_once(d, xatp, -1); +} + +static noinline int xenmem_add_to_physmap_range(struct domain *d, + struct xen_add_to_physmap_range *xatpr) +{ + int rc; + + /* Process entries in reverse order to allow continuations */ + while ( xatpr->size > 0 ) + { + xen_ulong_t idx; + xen_pfn_t gpfn; + struct xen_add_to_physmap xatp; + + rc = copy_from_guest_offset(&idx, xatpr->idxs, xatpr->size-1, 1); + if ( rc < 0 ) + goto out; + + rc = copy_from_guest_offset(&gpfn, xatpr->gpfns, xatpr->size-1, 1); + if ( rc < 0 ) + goto out; + + xatp.space = xatpr->space; + xatp.idx = idx; + xatp.gpfn = gpfn; + rc = xenmem_add_to_physmap_once(d, &xatp, xatpr->foreign_domid); + + if (rc) + goto out; + + xatpr->size--; + + /* Check for continuation if it''s not the last interation */ + if ( xatpr->size > 0 && hypercall_preempt_check() ) + { + rc = -EAGAIN; + goto out; + } + } + + rc = 0; + +out: + return rc; + } long arch_memory_op(int op, XEN_GUEST_HANDLE_PARAM(void) arg) @@ -4438,6 +4484,10 @@ long arch_memory_op(int op, XEN_GUEST_HANDLE_PARAM(void) arg) if ( copy_from_guest(&xatp, arg, 1) ) return -EFAULT; + /* This one is only supported for add_to_physmap_range */ + if ( xatp.space == XENMAPSPACE_gmfn_foreign ) + return -EINVAL; + d = rcu_lock_domain_by_any_id(xatp.domid); if ( d == NULL ) return -ESRCH; @@ -4465,6 +4515,32 @@ long arch_memory_op(int op, XEN_GUEST_HANDLE_PARAM(void) arg) return rc; } + case XENMEM_add_to_physmap_range: + { + struct xen_add_to_physmap_range xatpr; + struct domain *d; + + if ( copy_from_guest(&xatpr, arg, 1) ) + return -EFAULT; + + rc = rcu_lock_target_domain_by_id(xatpr.domid, &d); + if ( rc != 0 ) + return rc; + + rc = xenmem_add_to_physmap_range(d, &xatpr); + + rcu_unlock_domain(d); + + if ( rc && copy_to_guest(arg, &xatpr, 1) ) + rc = -EFAULT; + + if ( rc == -EAGAIN ) + rc = hypercall_create_continuation( + __HYPERVISOR_memory_op, "ih", op, arg); + + return rc; + } + case XENMEM_set_memory_map: { struct xen_foreign_memory_map fmap; -- 1.7.2.3
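For illustration, a caller would batch mappings roughly as below. The buffers and guest-handle setup are abbreviated, and the snippet is a sketch of an in-guest caller rather than code from this series:

    /* Sketch: map nr frames of domain fdom into the caller's physmap.
     * idxs[] / gpfns[] are caller-provided xen_ulong_t / xen_pfn_t arrays. */
    struct xen_add_to_physmap_range xatpr = {
        .domid         = DOMID_SELF,
        .space         = XENMAPSPACE_gmfn_foreign,
        .foreign_domid = fdom,
        .size          = nr,
    };

    set_xen_guest_handle(xatpr.idxs, idxs);
    set_xen_guest_handle(xatpr.gpfns, gpfns);

    rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatpr);
    /* Entries are consumed from the tail; on preemption the hypervisor
     * shrinks xatpr.size, copies it back, and resumes the hypercall via
     * hypercall_create_continuation(), so the caller sees a single call. */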
Mukesh Rathor
2013-Mar-19 22:33 UTC
[PATCH 03/18 V2 RESEND]: PVH xen: create domctl_memory_mapping() function
In this patch, XEN_DOMCTL_memory_mapping code is put into a function so it can be shared later for PVH. There is no change in it''s functionality. Changes in V2: - Remove PHYSDEVOP_map_iomem sub hypercall, and the code supporting it as the IO region is mapped transparently now. Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com> --- xen/arch/x86/domctl.c | 119 ++++++++++++++++++++++++--------------------- xen/include/xen/domain.h | 2 + 2 files changed, 65 insertions(+), 56 deletions(-) diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c index 31937e0..ecc8240 100644 --- a/xen/arch/x86/domctl.c +++ b/xen/arch/x86/domctl.c @@ -46,6 +46,68 @@ static int gdbsx_guest_mem_io( return (iop->remain ? -EFAULT : 0); } +long domctl_memory_mapping(struct domain *d, unsigned long gfn, + unsigned long mfn, unsigned long nr_mfns, + int add_map) +{ + int i; + long ret = -EINVAL; + + if ( (mfn + nr_mfns - 1) < mfn || /* wrap? */ + ((mfn | (mfn + nr_mfns - 1)) >> (paddr_bits - PAGE_SHIFT)) || + (gfn + nr_mfns - 1) < gfn ) /* wrap? */ + return ret; + + ret = xsm_iomem_permission(XSM_HOOK, d, mfn, mfn + nr_mfns - 1, add_map); + if ( ret ) + return ret; + + if ( add_map ) + { + printk(XENLOG_G_INFO + "memory_map:add: dom%d gfn=%lx mfn=%lx nr=%lx\n", + d->domain_id, gfn, mfn, nr_mfns); + + ret = iomem_permit_access(d, mfn, mfn + nr_mfns - 1); + if ( !ret && paging_mode_translate(d) ) + { + for ( i = 0; !ret && i < nr_mfns; i++ ) + if ( !set_mmio_p2m_entry(d, gfn + i, _mfn(mfn + i)) ) + ret = -EIO; + if ( ret ) + { + printk(XENLOG_G_WARNING + "memory_map:fail: dom%d gfn=%lx mfn=%lx\n", + d->domain_id, gfn + i, mfn + i); + while ( i-- ) + clear_mmio_p2m_entry(d, gfn + i); + if ( iomem_deny_access(d, mfn, mfn + nr_mfns - 1) && + IS_PRIV(current->domain) ) + printk(XENLOG_ERR + "memory_map: failed to deny dom%d access to [%lx,%lx]\n", + d->domain_id, mfn, mfn + nr_mfns - 1); + } + } + } else { + printk(XENLOG_G_INFO + "memory_map:remove: dom%d gfn=%lx mfn=%lx nr=%lx\n", + d->domain_id, gfn, mfn, nr_mfns); + + if ( paging_mode_translate(d) ) + for ( i = 0; i < nr_mfns; i++ ) + add_map |= !clear_mmio_p2m_entry(d, gfn + i); + ret = iomem_deny_access(d, mfn, mfn + nr_mfns - 1); + if ( !ret && add_map ) + ret = -EIO; + if ( ret && IS_PRIV(current->domain) ) + printk(XENLOG_ERR + "memory_map: error %ld %s dom%d access to [%lx,%lx]\n", + ret, add_map ? "removing" : "denying", d->domain_id, + mfn, mfn + nr_mfns - 1); + } + return ret; +} + long arch_do_domctl( struct xen_domctl *domctl, struct domain *d, XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) @@ -628,68 +690,13 @@ long arch_do_domctl( unsigned long mfn = domctl->u.memory_mapping.first_mfn; unsigned long nr_mfns = domctl->u.memory_mapping.nr_mfns; int add = domctl->u.memory_mapping.add_mapping; - unsigned long i; - - ret = -EINVAL; - if ( (mfn + nr_mfns - 1) < mfn || /* wrap? */ - ((mfn | (mfn + nr_mfns - 1)) >> (paddr_bits - PAGE_SHIFT)) || - (gfn + nr_mfns - 1) < gfn ) /* wrap? 
*/ - break; ret = -EPERM; if ( !IS_PRIV(current->domain) && !iomem_access_permitted(current->domain, mfn, mfn + nr_mfns - 1) ) break; - ret = xsm_iomem_mapping(XSM_HOOK, d, mfn, mfn + nr_mfns - 1, add); - if ( ret ) - break; - - if ( add ) - { - printk(XENLOG_G_INFO - "memory_map:add: dom%d gfn=%lx mfn=%lx nr=%lx\n", - d->domain_id, gfn, mfn, nr_mfns); - - ret = iomem_permit_access(d, mfn, mfn + nr_mfns - 1); - if ( !ret && paging_mode_translate(d) ) - { - for ( i = 0; !ret && i < nr_mfns; i++ ) - if ( !set_mmio_p2m_entry(d, gfn + i, _mfn(mfn + i)) ) - ret = -EIO; - if ( ret ) - { - printk(XENLOG_G_WARNING - "memory_map:fail: dom%d gfn=%lx mfn=%lx\n", - d->domain_id, gfn + i, mfn + i); - while ( i-- ) - clear_mmio_p2m_entry(d, gfn + i); - if ( iomem_deny_access(d, mfn, mfn + nr_mfns - 1) && - IS_PRIV(current->domain) ) - printk(XENLOG_ERR - "memory_map: failed to deny dom%d access to [%lx,%lx]\n", - d->domain_id, mfn, mfn + nr_mfns - 1); - } - } - } - else - { - printk(XENLOG_G_INFO - "memory_map:remove: dom%d gfn=%lx mfn=%lx nr=%lx\n", - d->domain_id, gfn, mfn, nr_mfns); - - if ( paging_mode_translate(d) ) - for ( i = 0; i < nr_mfns; i++ ) - add |= !clear_mmio_p2m_entry(d, gfn + i); - ret = iomem_deny_access(d, mfn, mfn + nr_mfns - 1); - if ( !ret && add ) - ret = -EIO; - if ( ret && IS_PRIV(current->domain) ) - printk(XENLOG_ERR - "memory_map: error %ld %s dom%d access to [%lx,%lx]\n", - ret, add ? "removing" : "denying", d->domain_id, - mfn, mfn + nr_mfns - 1); - } + ret = domctl_memory_mapping(d, gfn, mfn, nr_mfns, add); } break; diff --git a/xen/include/xen/domain.h b/xen/include/xen/domain.h index d4ac50f..a7b4c34 100644 --- a/xen/include/xen/domain.h +++ b/xen/include/xen/domain.h @@ -86,4 +86,6 @@ extern unsigned int xen_processor_pmbits; extern bool_t opt_dom0_vcpus_pin; +extern long domctl_memory_mapping(struct domain *d, unsigned long gfn, + unsigned long mfn, unsigned long nr_mfns, int add_map); #endif /* __XEN_DOMAIN_H__ */ -- 1.7.2.3
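As an illustration of the refactoring, a later PVH path can now call the exported helper directly; the wrapper below is hypothetical:

    /* Hypothetical caller: identity-map an MMIO range (gfn == mfn) into a
     * translated guest using the newly exported helper. */
    static long pvh_map_mmio_range(struct domain *d, unsigned long start_pfn,
                                   unsigned long nr_pfns)
    {
        return domctl_memory_mapping(d, start_pfn, start_pfn, nr_pfns, 1);
    }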
Mukesh Rathor
2013-Mar-19 22:33 UTC
[PATCH 04/18 V2 RESEND]: PVH xen: add params to read_segment_register
In this patch, read_segment_register macro is changed to take vcpu and regs parameters so it can check if it''s PVH guest (change in upcoming patches). No functionality change. Also, make emulate_privileged_op() public for later while changing this file. Changes in V2: None Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com> --- xen/arch/x86/domain.c | 8 ++++---- xen/arch/x86/traps.c | 28 ++++++++++++++-------------- xen/arch/x86/x86_64/traps.c | 16 ++++++++-------- xen/include/asm-x86/system.h | 2 +- xen/include/asm-x86/traps.h | 1 + 5 files changed, 28 insertions(+), 27 deletions(-) diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c index ea1381c..e9549e0 100644 --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -1340,10 +1340,10 @@ static void save_segments(struct vcpu *v) struct cpu_user_regs *regs = &v->arch.user_regs; unsigned int dirty_segment_mask = 0; - regs->ds = read_segment_register(ds); - regs->es = read_segment_register(es); - regs->fs = read_segment_register(fs); - regs->gs = read_segment_register(gs); + regs->ds = read_segment_register(v, regs, ds); + regs->es = read_segment_register(v, regs, es); + regs->fs = read_segment_register(v, regs, fs); + regs->gs = read_segment_register(v, regs, gs); if ( regs->ds ) dirty_segment_mask |= DIRTY_DS; diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c index d36eddd..ab3e814 100644 --- a/xen/arch/x86/traps.c +++ b/xen/arch/x86/traps.c @@ -1823,7 +1823,7 @@ static inline uint64_t guest_misc_enable(uint64_t val) } \ (eip) += sizeof(_x); _x; }) -#define read_sreg(regs, sr) read_segment_register(sr) +#define read_sreg(vcpu, regs, sr) read_segment_register(vcpu, regs, sr) static int is_cpufreq_controller(struct domain *d) { @@ -1833,7 +1833,7 @@ static int is_cpufreq_controller(struct domain *d) #include "x86_64/mmconfig.h" -static int emulate_privileged_op(struct cpu_user_regs *regs) +int emulate_privileged_op(struct cpu_user_regs *regs) { struct vcpu *v = current; unsigned long *reg, eip = regs->eip; @@ -1869,7 +1869,7 @@ static int emulate_privileged_op(struct cpu_user_regs *regs) goto fail; /* emulating only opcodes not allowing SS to be default */ - data_sel = read_sreg(regs, ds); + data_sel = read_sreg(v, regs, ds); /* Legacy prefixes. 
*/ for ( i = 0; i < 8; i++, rex == opcode || (rex = 0) ) @@ -1887,17 +1887,17 @@ static int emulate_privileged_op(struct cpu_user_regs *regs) data_sel = regs->cs; continue; case 0x3e: /* DS override */ - data_sel = read_sreg(regs, ds); + data_sel = read_sreg(v, regs, ds); continue; case 0x26: /* ES override */ - data_sel = read_sreg(regs, es); + data_sel = read_sreg(v, regs, es); continue; case 0x64: /* FS override */ - data_sel = read_sreg(regs, fs); + data_sel = read_sreg(v, regs, fs); lm_ovr = lm_seg_fs; continue; case 0x65: /* GS override */ - data_sel = read_sreg(regs, gs); + data_sel = read_sreg(v, regs, gs); lm_ovr = lm_seg_gs; continue; case 0x36: /* SS override */ @@ -1944,7 +1944,7 @@ static int emulate_privileged_op(struct cpu_user_regs *regs) if ( !(opcode & 2) ) { - data_sel = read_sreg(regs, es); + data_sel = read_sreg(v, regs, es); lm_ovr = lm_seg_none; } @@ -2677,22 +2677,22 @@ static void emulate_gate_op(struct cpu_user_regs *regs) ASSERT(opnd_sel); continue; case 0x3e: /* DS override */ - opnd_sel = read_sreg(regs, ds); + opnd_sel = read_sreg(v, regs, ds); if ( !opnd_sel ) opnd_sel = dpl; continue; case 0x26: /* ES override */ - opnd_sel = read_sreg(regs, es); + opnd_sel = read_sreg(v, regs, es); if ( !opnd_sel ) opnd_sel = dpl; continue; case 0x64: /* FS override */ - opnd_sel = read_sreg(regs, fs); + opnd_sel = read_sreg(v, regs, fs); if ( !opnd_sel ) opnd_sel = dpl; continue; case 0x65: /* GS override */ - opnd_sel = read_sreg(regs, gs); + opnd_sel = read_sreg(v, regs, gs); if ( !opnd_sel ) opnd_sel = dpl; continue; @@ -2745,7 +2745,7 @@ static void emulate_gate_op(struct cpu_user_regs *regs) switch ( modrm & 7 ) { default: - opnd_sel = read_sreg(regs, ds); + opnd_sel = read_sreg(v, regs, ds); break; case 4: case 5: opnd_sel = regs->ss; @@ -2773,7 +2773,7 @@ static void emulate_gate_op(struct cpu_user_regs *regs) break; } if ( !opnd_sel ) - opnd_sel = read_sreg(regs, ds); + opnd_sel = read_sreg(v, regs, ds); switch ( modrm & 7 ) { case 0: case 2: case 4: diff --git a/xen/arch/x86/x86_64/traps.c b/xen/arch/x86/x86_64/traps.c index eec919a..d2f7209 100644 --- a/xen/arch/x86/x86_64/traps.c +++ b/xen/arch/x86/x86_64/traps.c @@ -122,10 +122,10 @@ void show_registers(struct cpu_user_regs *regs) fault_crs[0] = read_cr0(); fault_crs[3] = read_cr3(); fault_crs[4] = read_cr4(); - fault_regs.ds = read_segment_register(ds); - fault_regs.es = read_segment_register(es); - fault_regs.fs = read_segment_register(fs); - fault_regs.gs = read_segment_register(gs); + fault_regs.ds = read_segment_register(v, regs, ds); + fault_regs.es = read_segment_register(v, regs, es); + fault_regs.fs = read_segment_register(v, regs, fs); + fault_regs.gs = read_segment_register(v, regs, gs); } print_xen_info(); @@ -240,10 +240,10 @@ void do_double_fault(struct cpu_user_regs *regs) crs[2] = read_cr2(); crs[3] = read_cr3(); crs[4] = read_cr4(); - regs->ds = read_segment_register(ds); - regs->es = read_segment_register(es); - regs->fs = read_segment_register(fs); - regs->gs = read_segment_register(gs); + regs->ds = read_segment_register(current, regs, ds); + regs->es = read_segment_register(current, regs, es); + regs->fs = read_segment_register(current, regs, fs); + regs->gs = read_segment_register(current, regs, gs); printk("CPU: %d\n", cpu); _show_registers(regs, crs, CTXT_hypervisor, NULL); diff --git a/xen/include/asm-x86/system.h b/xen/include/asm-x86/system.h index b0876d6..d8dc6f2 100644 --- a/xen/include/asm-x86/system.h +++ b/xen/include/asm-x86/system.h @@ -4,7 +4,7 @@ #include <xen/lib.h> 
#include <asm/bitops.h> -#define read_segment_register(name) \ +#define read_segment_register(vcpu, regs, name) \ ({ u16 __sel; \ asm volatile ( "movw %%" STR(name) ",%0" : "=r" (__sel) ); \ __sel; \ diff --git a/xen/include/asm-x86/traps.h b/xen/include/asm-x86/traps.h index 82cbcee..202e3be 100644 --- a/xen/include/asm-x86/traps.h +++ b/xen/include/asm-x86/traps.h @@ -49,4 +49,5 @@ extern int guest_has_trap_callback(struct domain *d, uint16_t vcpuid, extern int send_guest_trap(struct domain *d, uint16_t vcpuid, unsigned int trap_nr); +int emulate_privileged_op(struct cpu_user_regs *regs); #endif /* ASM_TRAP_H */ -- 1.7.2.3
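For illustration, a converted call site now reads as below; the extra arguments are unused until a later patch in this series makes the macro branch on the vcpu type:

    /* Selector reads now go through the vcpu/regs-aware macro. */
    u16 sel = read_segment_register(current, guest_cpu_user_regs(), ds);

Passing the vcpu explicitly, rather than reading current inside the macro, matters in save_segments(), where current has already been switched to the incoming vcpu.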
Mukesh Rathor
2013-Mar-19 22:33 UTC
[PATCH 05/18 V2 RESEND]: PVH xen: more preparatory changes for PVH
This is also a preparotary patch for PVH. In this patch, following functions are made non-static: vmx_fpu_enter(), get_instruction_length(), update_guest_eip(), vmx_dr_access(), vmx_do_extint(), pv_cpuid(), and emulate_forced_invalid_op(). There is no functionality change. Changes in V2: - prepend vmx_ to get_instruction_length and update_guest_eip. - Do not export/use vmr(). Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com> --- xen/arch/x86/hvm/vmx/vmx.c | 74 +++++++++++++++--------------------- xen/arch/x86/hvm/vmx/vvmx.c | 2 +- xen/arch/x86/traps.c | 4 +- xen/include/asm-x86/hvm/vmx/vmcs.h | 1 + xen/include/asm-x86/hvm/vmx/vmx.h | 16 +++++++- xen/include/asm-x86/processor.h | 2 + 6 files changed, 52 insertions(+), 47 deletions(-) diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c index 04dbefb..e64980f 100644 --- a/xen/arch/x86/hvm/vmx/vmx.c +++ b/xen/arch/x86/hvm/vmx/vmx.c @@ -574,7 +574,7 @@ static int vmx_load_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt) return 0; } -static void vmx_fpu_enter(struct vcpu *v) +void vmx_fpu_enter(struct vcpu *v) { vcpu_restore_fpu_lazy(v); v->arch.hvm_vmx.exception_bitmap &= ~(1u << TRAP_no_device); @@ -1526,24 +1526,12 @@ struct hvm_function_table * __init start_vmx(void) return &vmx_function_table; } -/* - * Not all cases receive valid value in the VM-exit instruction length field. - * Callers must know what they''re doing! - */ -static int get_instruction_length(void) -{ - int len; - len = __vmread(VM_EXIT_INSTRUCTION_LEN); /* Safe: callers audited */ - BUG_ON((len < 1) || (len > 15)); - return len; -} - -void update_guest_eip(void) +void vmx_update_guest_eip(void) { struct cpu_user_regs *regs = guest_cpu_user_regs(); unsigned long x; - regs->eip += get_instruction_length(); /* Safe: callers audited */ + regs->eip += vmx_get_instruction_length(); /* Safe: callers audited */ regs->eflags &= ~X86_EFLAGS_RF; x = __vmread(GUEST_INTERRUPTIBILITY_INFO); @@ -1616,8 +1604,8 @@ static void vmx_do_cpuid(struct cpu_user_regs *regs) regs->edx = edx; } -static void vmx_dr_access(unsigned long exit_qualification, - struct cpu_user_regs *regs) +void vmx_dr_access(unsigned long exit_qualification, + struct cpu_user_regs *regs) { struct vcpu *v = current; @@ -2037,7 +2025,7 @@ gp_fault: return X86EMUL_EXCEPTION; } -static void vmx_do_extint(struct cpu_user_regs *regs) +void vmx_do_extint(struct cpu_user_regs *regs) { unsigned int vector; @@ -2221,7 +2209,7 @@ static int vmx_handle_eoi_write(void) if ( (((exit_qualification >> 12) & 0xf) == 1) && ((exit_qualification & 0xfff) == APIC_EOI) ) { - update_guest_eip(); /* Safe: APIC data write */ + vmx_update_guest_eip(); /* Safe: APIC data write */ vlapic_EOI_set(vcpu_vlapic(current)); HVMTRACE_0D(VLAPIC); return 1; @@ -2434,7 +2422,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs) HVMTRACE_1D(TRAP, vector); if ( v->domain->debugger_attached ) { - update_guest_eip(); /* Safe: INT3 */ + vmx_update_guest_eip(); /* Safe: INT3 */ current->arch.gdbsx_vcpu_event = TRAP_int3; domain_pause_for_debugger(); break; @@ -2542,7 +2530,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs) */ inst_len = ((source != 3) || /* CALL, IRET, or JMP? */ (idtv_info & (1u<<10))) /* IntrType > 3? */ - ? get_instruction_length() /* Safe: SDM 3B 23.2.4 */ : 0; + ? 
vmx_get_instruction_length() /* Safe: SDM 3B 23.2.4 */ : 0; if ( (source == 3) && (idtv_info & INTR_INFO_DELIVER_CODE_MASK) ) ecode = __vmread(IDT_VECTORING_ERROR_CODE); regs->eip += inst_len; @@ -2550,15 +2538,15 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs) break; } case EXIT_REASON_CPUID: - update_guest_eip(); /* Safe: CPUID */ + vmx_update_guest_eip(); /* Safe: CPUID */ vmx_do_cpuid(regs); break; case EXIT_REASON_HLT: - update_guest_eip(); /* Safe: HLT */ + vmx_update_guest_eip(); /* Safe: HLT */ hvm_hlt(regs->eflags); break; case EXIT_REASON_INVLPG: - update_guest_eip(); /* Safe: INVLPG */ + vmx_update_guest_eip(); /* Safe: INVLPG */ exit_qualification = __vmread(EXIT_QUALIFICATION); vmx_invlpg_intercept(exit_qualification); break; @@ -2566,7 +2554,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs) regs->ecx = hvm_msr_tsc_aux(v); /* fall through */ case EXIT_REASON_RDTSC: - update_guest_eip(); /* Safe: RDTSC, RDTSCP */ + vmx_update_guest_eip(); /* Safe: RDTSC, RDTSCP */ hvm_rdtsc_intercept(regs); break; case EXIT_REASON_VMCALL: @@ -2576,7 +2564,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs) rc = hvm_do_hypercall(regs); if ( rc != HVM_HCALL_preempted ) { - update_guest_eip(); /* Safe: VMCALL */ + vmx_update_guest_eip(); /* Safe: VMCALL */ if ( rc == HVM_HCALL_invalidate ) send_invalidate_req(); } @@ -2586,7 +2574,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs) { exit_qualification = __vmread(EXIT_QUALIFICATION); if ( vmx_cr_access(exit_qualification) == X86EMUL_OKAY ) - update_guest_eip(); /* Safe: MOV Cn, LMSW, CLTS */ + vmx_update_guest_eip(); /* Safe: MOV Cn, LMSW, CLTS */ break; } case EXIT_REASON_DR_ACCESS: @@ -2600,7 +2588,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs) { regs->eax = (uint32_t)msr_content; regs->edx = (uint32_t)(msr_content >> 32); - update_guest_eip(); /* Safe: RDMSR */ + vmx_update_guest_eip(); /* Safe: RDMSR */ } break; } @@ -2609,63 +2597,63 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs) uint64_t msr_content; msr_content = ((uint64_t)regs->edx << 32) | (uint32_t)regs->eax; if ( hvm_msr_write_intercept(regs->ecx, msr_content) == X86EMUL_OKAY ) - update_guest_eip(); /* Safe: WRMSR */ + vmx_update_guest_eip(); /* Safe: WRMSR */ break; } case EXIT_REASON_VMXOFF: if ( nvmx_handle_vmxoff(regs) == X86EMUL_OKAY ) - update_guest_eip(); + vmx_update_guest_eip(); break; case EXIT_REASON_VMXON: if ( nvmx_handle_vmxon(regs) == X86EMUL_OKAY ) - update_guest_eip(); + vmx_update_guest_eip(); break; case EXIT_REASON_VMCLEAR: if ( nvmx_handle_vmclear(regs) == X86EMUL_OKAY ) - update_guest_eip(); + vmx_update_guest_eip(); break; case EXIT_REASON_VMPTRLD: if ( nvmx_handle_vmptrld(regs) == X86EMUL_OKAY ) - update_guest_eip(); + vmx_update_guest_eip(); break; case EXIT_REASON_VMPTRST: if ( nvmx_handle_vmptrst(regs) == X86EMUL_OKAY ) - update_guest_eip(); + vmx_update_guest_eip(); break; case EXIT_REASON_VMREAD: if ( nvmx_handle_vmread(regs) == X86EMUL_OKAY ) - update_guest_eip(); + vmx_update_guest_eip(); break; case EXIT_REASON_VMWRITE: if ( nvmx_handle_vmwrite(regs) == X86EMUL_OKAY ) - update_guest_eip(); + vmx_update_guest_eip(); break; case EXIT_REASON_VMLAUNCH: if ( nvmx_handle_vmlaunch(regs) == X86EMUL_OKAY ) - update_guest_eip(); + vmx_update_guest_eip(); break; case EXIT_REASON_VMRESUME: if ( nvmx_handle_vmresume(regs) == X86EMUL_OKAY ) - update_guest_eip(); + vmx_update_guest_eip(); break; case EXIT_REASON_INVEPT: if ( nvmx_handle_invept(regs) == X86EMUL_OKAY ) - update_guest_eip(); + vmx_update_guest_eip(); break; case 
EXIT_REASON_INVVPID: if ( nvmx_handle_invvpid(regs) == X86EMUL_OKAY ) - update_guest_eip(); + vmx_update_guest_eip(); break; case EXIT_REASON_MWAIT_INSTRUCTION: @@ -2713,14 +2701,14 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs) int bytes = (exit_qualification & 0x07) + 1; int dir = (exit_qualification & 0x08) ? IOREQ_READ : IOREQ_WRITE; if ( handle_pio(port, bytes, dir) ) - update_guest_eip(); /* Safe: IN, OUT */ + vmx_update_guest_eip(); /* Safe: IN, OUT */ } break; case EXIT_REASON_INVD: case EXIT_REASON_WBINVD: { - update_guest_eip(); /* Safe: INVD, WBINVD */ + vmx_update_guest_eip(); /* Safe: INVD, WBINVD */ vmx_wbinvd_intercept(); break; } @@ -2753,7 +2741,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs) { u64 new_bv = (((u64)regs->edx) << 32) | regs->eax; if ( hvm_handle_xsetbv(new_bv) == 0 ) - update_guest_eip(); /* Safe: XSETBV */ + vmx_update_guest_eip(); /* Safe: XSETBV */ break; } diff --git a/xen/arch/x86/hvm/vmx/vvmx.c b/xen/arch/x86/hvm/vmx/vvmx.c index bb7688f..225de9f 100644 --- a/xen/arch/x86/hvm/vmx/vvmx.c +++ b/xen/arch/x86/hvm/vmx/vvmx.c @@ -2136,7 +2136,7 @@ int nvmx_n2_vmexit_handler(struct cpu_user_regs *regs, tsc += __get_vvmcs(nvcpu->nv_vvmcx, TSC_OFFSET); regs->eax = (uint32_t)tsc; regs->edx = (uint32_t)(tsc >> 32); - update_guest_eip(); + vmx_update_guest_eip(); return 1; } diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c index ab3e814..ab54f82 100644 --- a/xen/arch/x86/traps.c +++ b/xen/arch/x86/traps.c @@ -728,7 +728,7 @@ int cpuid_hypervisor_leaves( uint32_t idx, uint32_t sub_idx, return 1; } -static void pv_cpuid(struct cpu_user_regs *regs) +void pv_cpuid(struct cpu_user_regs *regs) { uint32_t a, b, c, d; @@ -905,7 +905,7 @@ static int emulate_invalid_rdtscp(struct cpu_user_regs *regs) return EXCRET_fault_fixed; } -static int emulate_forced_invalid_op(struct cpu_user_regs *regs) +unsigned long emulate_forced_invalid_op(struct cpu_user_regs *regs) { char sig[5], instr[2]; unsigned long eip, rc; diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h index 37e6734..11b09ef 100644 --- a/xen/include/asm-x86/hvm/vmx/vmcs.h +++ b/xen/include/asm-x86/hvm/vmx/vmcs.h @@ -461,6 +461,7 @@ void vmx_vmcs_switch(struct vmcs_struct *from, struct vmcs_struct *to); void vmx_set_eoi_exit_bitmap(struct vcpu *v, u8 vector); void vmx_clear_eoi_exit_bitmap(struct vcpu *v, u8 vector); int vmx_check_msr_bitmap(unsigned long *msr_bitmap, u32 msr, int access_type); +void vmx_fpu_enter(struct vcpu *v); void virtual_vmcs_enter(void *vvmcs); void virtual_vmcs_exit(void *vvmcs); u64 virtual_vmcs_vmread(void *vvmcs, u32 vmcs_encoding); diff --git a/xen/include/asm-x86/hvm/vmx/vmx.h b/xen/include/asm-x86/hvm/vmx/vmx.h index d4d6feb..a742e16 100644 --- a/xen/include/asm-x86/hvm/vmx/vmx.h +++ b/xen/include/asm-x86/hvm/vmx/vmx.h @@ -420,6 +420,18 @@ static inline int __vmxon(u64 addr) return rc; } +/* + * Not all cases receive valid value in the VM-exit instruction length field. + * Callers must know what they''re doing! 
+ */ +static inline int vmx_get_instruction_length(void) +{ + int len; + len = __vmread(VM_EXIT_INSTRUCTION_LEN); /* Safe: callers audited */ + BUG_ON((len < 1) || (len > 15)); + return len; +} + void vmx_get_segment_register(struct vcpu *, enum x86_segment, struct segment_register *); void vmx_inject_extint(int trap); @@ -431,7 +443,9 @@ void ept_p2m_uninit(struct p2m_domain *p2m); void ept_walk_table(struct domain *d, unsigned long gfn); void setup_ept_dump(void); -void update_guest_eip(void); +void vmx_update_guest_eip(void); +void vmx_dr_access(unsigned long exit_qualification,struct cpu_user_regs *regs); +void vmx_do_extint(struct cpu_user_regs *regs); int alloc_p2m_hap_data(struct p2m_domain *p2m); void free_p2m_hap_data(struct p2m_domain *p2m); diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h index 5cdacc7..096cdc9 100644 --- a/xen/include/asm-x86/processor.h +++ b/xen/include/asm-x86/processor.h @@ -566,6 +566,8 @@ void microcode_set_module(unsigned int); int microcode_update(XEN_GUEST_HANDLE_PARAM(const_void), unsigned long len); int microcode_resume_cpu(int cpu); +void pv_cpuid(struct cpu_user_regs *regs); +unsigned long emulate_forced_invalid_op(struct cpu_user_regs *regs); #endif /* !__ASSEMBLY__ */ #endif /* __ASM_X86_PROCESSOR_H */ -- 1.7.2.3
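To illustrate why these symbols become non-static, a PVH exit handler added later in the series can call them directly; the fragment below is a sketch and pvh_vmexit_cpuid() is a made-up name:

    /* Sketch: handle a CPUID vmexit for a PVH vcpu using the exported
     * helpers (PV-style CPUID filtering, then advance the guest RIP). */
    static void pvh_vmexit_cpuid(struct cpu_user_regs *regs)
    {
        pv_cpuid(regs);
        vmx_update_guest_eip();
    }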
Mukesh Rathor
2013-Mar-19 22:33 UTC
[PATCH 06/18 V2 RESEND]: PVH xen: Introduce PVH guest type
This patch introduces the concept of pvh guest. There also other basic changes like creating macros to check for pvh vcpu/domain, and creating new macros to see if it''s pvh/hvm domain/vcpu. Also, modify copy macros to test for pvh. Lastly, we introduce that PVH uses HVM style event delivery. Chagnes in V2: - make is_pvh/is_hvm enum instead of adding is_pvh as a new flag. - fix indentation and spacing in guest_kernel_mode macro. - add debug only BUG() in GUEST_KERNEL_RPL macro as it should no longer be called in any PVH paths. Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com> --- xen/arch/x86/debug.c | 2 +- xen/arch/x86/domain.c | 7 +++++++ xen/common/domain.c | 2 +- xen/include/asm-x86/desc.h | 5 +++++ xen/include/asm-x86/domain.h | 9 ++++----- xen/include/asm-x86/event.h | 2 +- xen/include/asm-x86/guest_access.h | 12 ++++++------ xen/include/asm-x86/x86_64/regs.h | 9 +++++---- xen/include/xen/sched.h | 17 ++++++++++++++--- 9 files changed, 44 insertions(+), 21 deletions(-) diff --git a/xen/arch/x86/debug.c b/xen/arch/x86/debug.c index e67473e..502edbc 100644 --- a/xen/arch/x86/debug.c +++ b/xen/arch/x86/debug.c @@ -158,7 +158,7 @@ dbg_rw_guest_mem(dbgva_t addr, dbgbyte_t *buf, int len, struct domain *dp, pagecnt = min_t(long, PAGE_SIZE - (addr & ~PAGE_MASK), len); - mfn = (dp->is_hvm + mfn = (is_hvm_domain(dp) ? dbg_hvm_va2mfn(addr, dp, toaddr, &gfn) : dbg_pv_va2mfn(addr, dp, pgd3)); if ( mfn == INVALID_MFN ) diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c index e9549e0..768c19d 100644 --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -649,6 +649,13 @@ int arch_set_info_guest( unsigned int i; int rc = 0, compat; + /* This removed when all patches are checked in */ + if ( is_pvh_vcpu(v) ) + { + printk("PVH: You don''t have the correct xen version for PVH\n"); + return -EINVAL; + } + /* The context is a compat-mode one if the target domain is compat-mode; * we expect the tools to DTRT even in compat-mode callers. */ compat = is_pv_32on64_domain(d); diff --git a/xen/common/domain.c b/xen/common/domain.c index 64ee29d..b6f10b7 100644 --- a/xen/common/domain.c +++ b/xen/common/domain.c @@ -231,7 +231,7 @@ struct domain *domain_create( goto fail; if ( domcr_flags & DOMCRF_hvm ) - d->is_hvm = 1; + d->guest_type = hvm_guest; if ( domid == 0 ) { diff --git a/xen/include/asm-x86/desc.h b/xen/include/asm-x86/desc.h index 354b889..4dca0a3 100644 --- a/xen/include/asm-x86/desc.h +++ b/xen/include/asm-x86/desc.h @@ -38,7 +38,12 @@ #ifndef __ASSEMBLY__ +#ifndef NDEBUG +#define GUEST_KERNEL_RPL(d) (is_pvh_domain(d) ? ({ BUG(); 0; }) : \ + is_pv_32bit_domain(d) ? 1 : 3) +#else #define GUEST_KERNEL_RPL(d) (is_pv_32bit_domain(d) ? 1 : 3) +#endif /* Fix up the RPL of a guest segment selector. 
*/ #define __fixup_guest_selector(d, sel) \ diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h index 97e09ca..ecb3058 100644 --- a/xen/include/asm-x86/domain.h +++ b/xen/include/asm-x86/domain.h @@ -16,7 +16,7 @@ #define is_pv_32on64_domain(d) (is_pv_32bit_domain(d)) #define is_pv_32on64_vcpu(v) (is_pv_32on64_domain((v)->domain)) -#define is_hvm_pv_evtchn_domain(d) (is_hvm_domain(d) && \ +#define is_hvm_pv_evtchn_domain(d) (is_hvm_or_pvh_domain(d) && \ d->arch.hvm_domain.irq.callback_via_type == HVMIRQ_callback_vector) #define is_hvm_pv_evtchn_vcpu(v) (is_hvm_pv_evtchn_domain(v->domain)) @@ -254,10 +254,9 @@ struct arch_domain struct list_head pdev_list; - union { - struct pv_domain pv_domain; - struct hvm_domain hvm_domain; - }; + /* PVH : pvh uses fields from both pv and hvm, so separate the union */ + struct pv_domain pv_domain; + struct hvm_domain hvm_domain; struct paging_domain paging; struct p2m_domain *p2m; diff --git a/xen/include/asm-x86/event.h b/xen/include/asm-x86/event.h index 06057c7..9187606 100644 --- a/xen/include/asm-x86/event.h +++ b/xen/include/asm-x86/event.h @@ -18,7 +18,7 @@ int hvm_local_events_need_delivery(struct vcpu *v); static inline int local_events_need_delivery(void) { struct vcpu *v = current; - return (is_hvm_vcpu(v) ? hvm_local_events_need_delivery(v) : + return (is_hvm_or_pvh_vcpu(v) ? hvm_local_events_need_delivery(v) : (vcpu_info(v, evtchn_upcall_pending) && !vcpu_info(v, evtchn_upcall_mask))); } diff --git a/xen/include/asm-x86/guest_access.h b/xen/include/asm-x86/guest_access.h index ca700c9..1839fa4 100644 --- a/xen/include/asm-x86/guest_access.h +++ b/xen/include/asm-x86/guest_access.h @@ -14,27 +14,27 @@ /* Raw access functions: no type checking. */ #define raw_copy_to_guest(dst, src, len) \ - (is_hvm_vcpu(current) ? \ + (is_hvm_or_pvh_vcpu(current) ? \ copy_to_user_hvm((dst), (src), (len)) : \ copy_to_user((dst), (src), (len))) #define raw_copy_from_guest(dst, src, len) \ - (is_hvm_vcpu(current) ? \ + (is_hvm_or_pvh_vcpu(current) ? \ copy_from_user_hvm((dst), (src), (len)) : \ copy_from_user((dst), (src), (len))) #define raw_clear_guest(dst, len) \ - (is_hvm_vcpu(current) ? \ + (is_hvm_or_pvh_vcpu(current) ? \ clear_user_hvm((dst), (len)) : \ clear_user((dst), (len))) #define __raw_copy_to_guest(dst, src, len) \ - (is_hvm_vcpu(current) ? \ + (is_hvm_or_pvh_vcpu(current) ? \ copy_to_user_hvm((dst), (src), (len)) : \ __copy_to_user((dst), (src), (len))) #define __raw_copy_from_guest(dst, src, len) \ - (is_hvm_vcpu(current) ? \ + (is_hvm_or_pvh_vcpu(current) ? \ copy_from_user_hvm((dst), (src), (len)) : \ __copy_from_user((dst), (src), (len))) #define __raw_clear_guest(dst, len) \ - (is_hvm_vcpu(current) ? \ + (is_hvm_or_pvh_vcpu(current) ? \ clear_user_hvm((dst), (len)) : \ clear_user((dst), (len))) diff --git a/xen/include/asm-x86/x86_64/regs.h b/xen/include/asm-x86/x86_64/regs.h index 3cdc702..bb475cf 100644 --- a/xen/include/asm-x86/x86_64/regs.h +++ b/xen/include/asm-x86/x86_64/regs.h @@ -10,10 +10,11 @@ #define ring_2(r) (((r)->cs & 3) == 2) #define ring_3(r) (((r)->cs & 3) == 3) -#define guest_kernel_mode(v, r) \ - (!is_pv_32bit_vcpu(v) ? \ - (ring_3(r) && ((v)->arch.flags & TF_kernel_mode)) : \ - (ring_1(r))) +#define guest_kernel_mode(v, r) \ + (is_pvh_vcpu(v) ? ({ ASSERT(v == current); ring_0(r); }) : \ + (!is_pv_32bit_vcpu(v) ? \ + (ring_3(r) && ((v)->arch.flags & TF_kernel_mode)) : \ + (ring_1(r)))) #define permit_softint(dpl, v, r) \ ((dpl) >= (guest_kernel_mode(v, r) ? 
1 : 3)) diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h index 569e76e..079daff 100644 --- a/xen/include/xen/sched.h +++ b/xen/include/xen/sched.h @@ -230,6 +230,9 @@ struct mem_event_per_domain struct mem_event_domain access; }; +/* PVH is a PV guest running in HVM container. While is_hvm is false for it, + * it uses many of the HVM data structs. + */ struct domain { domid_t domain_id; @@ -277,8 +280,8 @@ struct domain struct rangeset *iomem_caps; struct rangeset *irq_caps; - /* Is this an HVM guest? */ - bool_t is_hvm; + /* !is_pvh && !is_hvm ==> PV, else PVH or HVM */ + enum {hvm_guest=1, pvh_guest} guest_type; #ifdef HAS_PASSTHROUGH /* Does this guest need iommu mappings? */ bool_t need_iommu; @@ -450,6 +453,10 @@ struct domain *domain_create( /* DOMCRF_oos_off: dont use out-of-sync optimization for shadow page tables */ #define _DOMCRF_oos_off 4 #define DOMCRF_oos_off (1U<<_DOMCRF_oos_off) + /* DOMCRF_pvh: Create PV domain in HVM container */ +#define _DOMCRF_pvh 5 +#define DOMCRF_pvh (1U<<_DOMCRF_pvh) + /* * rcu_lock_domain_by_id() is more efficient than get_domain_by_id(). @@ -718,10 +725,14 @@ void watchdog_domain_destroy(struct domain *d); #define VM_ASSIST(_d,_t) (test_bit((_t), &(_d)->vm_assist)) -#define is_hvm_domain(d) ((d)->is_hvm) +#define is_hvm_domain(d) ((d)->guest_type == hvm_guest) #define is_hvm_vcpu(v) (is_hvm_domain(v->domain)) +#define is_pvh_domain(d) ((d)->guest_type == pvh_guest) +#define is_pvh_vcpu(v) (is_pvh_domain(v->domain)) #define is_pinned_vcpu(v) ((v)->domain->is_pinned || \ cpumask_weight((v)->cpu_affinity) == 1) +#define is_hvm_or_pvh_domain(d) (is_hvm_domain(d) || is_pvh_domain(d)) +#define is_hvm_or_pvh_vcpu(v) (is_hvm_or_pvh_domain(v->domain)) #ifdef HAS_PASSTHROUGH #define need_iommu(d) ((d)->need_iommu) #else -- 1.7.2.3
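For illustration, the new predicates let common code distinguish the three guest types without touching guest_type directly; guest_type_name() below is a made-up example:

    /* Sketch: classify a domain with the new macros. */
    static const char *guest_type_name(const struct domain *d)
    {
        if ( is_pvh_domain(d) )
            return "PVH";   /* PV guest running in an HVM container */
        if ( is_hvm_domain(d) )
            return "HVM";
        return "PV";        /* !is_hvm && !is_pvh */
    }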
Mukesh Rathor
2013-Mar-19 22:33 UTC
[PATCH 07/18 V2 RESEND]: PVH xen: tools changes to create PVH domain
This patch contains tools changes for PVH. For now, only one mode is supported/tested: dom0> losetup /dev/loop1 guest.img dom0> In vm.cfg file: disk = [''phy:/dev/loop1,xvda,w''] Chnages in V2: None Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com> --- tools/debugger/gdbsx/xg/xg_main.c | 4 +++- tools/libxc/xc_dom.h | 1 + tools/libxc/xc_dom_x86.c | 7 ++++--- tools/libxl/libxl_create.c | 2 ++ tools/libxl/libxl_dom.c | 18 +++++++++++++++++- tools/libxl/libxl_types.idl | 2 ++ tools/libxl/libxl_x86.c | 4 +++- tools/libxl/xl_cmdimpl.c | 11 +++++++++++ tools/xenstore/xenstored_domain.c | 14 ++++++++------ xen/include/public/domctl.h | 3 +++ 10 files changed, 54 insertions(+), 12 deletions(-) diff --git a/tools/debugger/gdbsx/xg/xg_main.c b/tools/debugger/gdbsx/xg/xg_main.c index 64c7484..5736b86 100644 --- a/tools/debugger/gdbsx/xg/xg_main.c +++ b/tools/debugger/gdbsx/xg/xg_main.c @@ -81,6 +81,7 @@ int xgtrc_on = 0; struct xen_domctl domctl; /* just use a global domctl */ static int _hvm_guest; /* hvm guest? 32bit HVMs have 64bit context */ +static int _pvh_guest; /* PV guest in HVM container */ static domid_t _dom_id; /* guest domid */ static int _max_vcpu_id; /* thus max_vcpu_id+1 VCPUs */ static int _dom0_fd; /* fd of /dev/privcmd */ @@ -309,6 +310,7 @@ xg_attach(int domid, int guest_bitness) _max_vcpu_id = domctl.u.getdomaininfo.max_vcpu_id; _hvm_guest = (domctl.u.getdomaininfo.flags & XEN_DOMINF_hvm_guest); + _pvh_guest = (domctl.u.getdomaininfo.flags & XEN_DOMINF_pvh_guest); return _max_vcpu_id; } @@ -369,7 +371,7 @@ _change_TF(vcpuid_t which_vcpu, int guest_bitness, int setit) int sz = sizeof(anyc); /* first try the MTF for hvm guest. otherwise do manually */ - if (_hvm_guest) { + if (_hvm_guest || _pvh_guest) { domctl.u.debug_op.vcpu = which_vcpu; domctl.u.debug_op.op = setit ? XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON : XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_OFF; diff --git a/tools/libxc/xc_dom.h b/tools/libxc/xc_dom.h index 779b9d4..e8a5260 100644 --- a/tools/libxc/xc_dom.h +++ b/tools/libxc/xc_dom.h @@ -130,6 +130,7 @@ struct xc_dom_image { domid_t console_domid; domid_t xenstore_domid; xen_pfn_t shared_info_mfn; + int domcr_is_pvh; xc_interface *xch; domid_t guest_domid; diff --git a/tools/libxc/xc_dom_x86.c b/tools/libxc/xc_dom_x86.c index eb9ac07..ca1bc95 100644 --- a/tools/libxc/xc_dom_x86.c +++ b/tools/libxc/xc_dom_x86.c @@ -355,7 +355,8 @@ static int setup_pgtables_x86_64(struct xc_dom_image *dom) pgpfn = (addr - dom->parms.virt_base) >> PAGE_SHIFT_X86; l1tab[l1off] pfn_to_paddr(xc_dom_p2m_guest(dom, pgpfn)) | L1_PROT; - if ( (addr >= dom->pgtables_seg.vstart) && + if ( (!dom->domcr_is_pvh) && + (addr >= dom->pgtables_seg.vstart) && (addr < dom->pgtables_seg.vend) ) l1tab[l1off] &= ~_PAGE_RW; /* page tables are r/o */ if ( l1off == (L1_PAGETABLE_ENTRIES_X86_64 - 1) ) @@ -672,7 +673,7 @@ int arch_setup_meminit(struct xc_dom_image *dom) rc = x86_compat(dom->xch, dom->guest_domid, dom->guest_type); if ( rc ) return rc; - if ( xc_dom_feature_translated(dom) ) + if ( xc_dom_feature_translated(dom) && !dom->domcr_is_pvh ) { dom->shadow_enabled = 1; rc = x86_shadow(dom->xch, dom->guest_domid); @@ -786,7 +787,7 @@ int arch_setup_bootlate(struct xc_dom_image *dom) } /* Map grant table frames into guest physmap. 
*/ - for ( i = 0; ; i++ ) + for ( i = 0; !dom->domcr_is_pvh; i++ ) { rc = xc_domain_add_to_physmap(dom->xch, dom->guest_domid, XENMAPSPACE_grant_table, diff --git a/tools/libxl/libxl_create.c b/tools/libxl/libxl_create.c index efeebf2..7f96dbd 100644 --- a/tools/libxl/libxl_create.c +++ b/tools/libxl/libxl_create.c @@ -405,6 +405,8 @@ int libxl__domain_make(libxl__gc *gc, libxl_domain_create_info *info, flags |= XEN_DOMCTL_CDF_hvm_guest; flags |= libxl_defbool_val(info->hap) ? XEN_DOMCTL_CDF_hap : 0; flags |= libxl_defbool_val(info->oos) ? 0 : XEN_DOMCTL_CDF_oos_off; + } else if ( libxl_defbool_val(info->ci_pvh) ) { + flags |= XEN_DOMCTL_CDF_hap; } *domid = -1; diff --git a/tools/libxl/libxl_dom.c b/tools/libxl/libxl_dom.c index de555ee..4b23cf4 100644 --- a/tools/libxl/libxl_dom.c +++ b/tools/libxl/libxl_dom.c @@ -322,9 +322,23 @@ int libxl__build_pv(libxl__gc *gc, uint32_t domid, struct xc_dom_image *dom; int ret; int flags = 0; + int is_pvh = libxl_defbool_val(info->bi_pvh); xc_dom_loginit(ctx->xch); + if (is_pvh) { + char *pv_feats = "writable_descriptor_tables|auto_translated_physmap" + "|supervisor_mode_kernel|hvm_callback_vector"; + + if (info->u.pv.features && info->u.pv.features[0] != ''\0'') + { + LOG(ERROR, "Didn''t expect info->u.pv.features to contain string\n"); + LOG(ERROR, "String: %s\n", info->u.pv.features); + return ERROR_FAIL; + } + info->u.pv.features = strdup(pv_feats); + } + dom = xc_dom_allocate(ctx->xch, state->pv_cmdline, info->u.pv.features); if (!dom) { LOGE(ERROR, "xc_dom_allocate failed"); @@ -363,6 +377,7 @@ int libxl__build_pv(libxl__gc *gc, uint32_t domid, } dom->flags = flags; + dom->domcr_is_pvh = is_pvh; dom->console_evtchn = state->console_port; dom->console_domid = state->console_domid; dom->xenstore_evtchn = state->store_port; @@ -392,7 +407,8 @@ int libxl__build_pv(libxl__gc *gc, uint32_t domid, LOGE(ERROR, "xc_dom_boot_image failed"); goto out; } - if ( (ret = xc_dom_gnttab_init(dom)) != 0 ) { + /* PVH sets up its own grant during boot via hvm mechanisms */ + if ( !is_pvh && (ret = xc_dom_gnttab_init(dom)) != 0 ) { LOGE(ERROR, "xc_dom_gnttab_init failed"); goto out; } diff --git a/tools/libxl/libxl_types.idl b/tools/libxl/libxl_types.idl index 5b080ed..ae11309 100644 --- a/tools/libxl/libxl_types.idl +++ b/tools/libxl/libxl_types.idl @@ -244,6 +244,7 @@ libxl_domain_create_info = Struct("domain_create_info",[ ("platformdata", libxl_key_value_list), ("poolid", uint32), ("run_hotplug_scripts",libxl_defbool), + ("ci_pvh", libxl_defbool), ], dir=DIR_IN) MemKB = UInt(64, init_val = "LIBXL_MEMKB_DEFAULT") @@ -343,6 +344,7 @@ libxl_domain_build_info = Struct("domain_build_info",[ ])), ("invalid", Struct(None, [])), ], keyvar_init_val = "LIBXL_DOMAIN_TYPE_INVALID")), + ("bi_pvh", libxl_defbool), ], dir=DIR_IN ) diff --git a/tools/libxl/libxl_x86.c b/tools/libxl/libxl_x86.c index a17f6ae..3caba5c 100644 --- a/tools/libxl/libxl_x86.c +++ b/tools/libxl/libxl_x86.c @@ -290,7 +290,9 @@ int libxl__arch_domain_create(libxl__gc *gc, libxl_domain_config *d_config, if (rtc_timeoffset) xc_domain_set_time_offset(ctx->xch, domid, rtc_timeoffset); - if (d_config->b_info.type == LIBXL_DOMAIN_TYPE_HVM) { + if (d_config->b_info.type == LIBXL_DOMAIN_TYPE_HVM || + libxl_defbool_val(d_config->b_info.bi_pvh)) { + unsigned long shadow; shadow = (d_config->b_info.shadow_memkb + 1023) / 1024; xc_shadow_control(ctx->xch, domid, XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION, NULL, 0, &shadow, 0, NULL); diff --git a/tools/libxl/xl_cmdimpl.c b/tools/libxl/xl_cmdimpl.c index a98705e..788aa4a 
100644 --- a/tools/libxl/xl_cmdimpl.c +++ b/tools/libxl/xl_cmdimpl.c @@ -633,8 +633,18 @@ static void parse_config_data(const char *config_source, !strncmp(buf, "hvm", strlen(buf))) c_info->type = LIBXL_DOMAIN_TYPE_HVM; + libxl_defbool_setdefault(&c_info->ci_pvh, false); + libxl_defbool_setdefault(&c_info->hap, false); + xlu_cfg_get_defbool(config, "pvh", &c_info->ci_pvh, 0); xlu_cfg_get_defbool(config, "hap", &c_info->hap, 0); + if (libxl_defbool_val(c_info->ci_pvh) && + !libxl_defbool_val(c_info->hap)) { + + fprintf(stderr, "hap is required for PVH domain\n"); + exit(1); + } + if (xlu_cfg_replace_string (config, "name", &c_info->name, 0)) { fprintf(stderr, "Domain name must be specified.\n"); exit(1); @@ -939,6 +949,7 @@ static void parse_config_data(const char *config_source, b_info->u.pv.cmdline = cmdline; xlu_cfg_replace_string (config, "ramdisk", &b_info->u.pv.ramdisk, 0); + libxl_defbool_set(&b_info->bi_pvh, libxl_defbool_val(c_info->ci_pvh)); break; } default: diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c index bf83d58..6b7b986 100644 --- a/tools/xenstore/xenstored_domain.c +++ b/tools/xenstore/xenstored_domain.c @@ -168,13 +168,15 @@ static int readchn(struct connection *conn, void *data, unsigned int len) static void *map_interface(domid_t domid, unsigned long mfn) { if (*xcg_handle != NULL) { - /* this is the preferred method */ - return xc_gnttab_map_grant_ref(*xcg_handle, domid, + void *addr; + /* this is the preferred method */ + addr = xc_gnttab_map_grant_ref(*xcg_handle, domid, GNTTAB_RESERVED_XENSTORE, PROT_READ|PROT_WRITE); - } else { - return xc_map_foreign_range(*xc_handle, domid, - getpagesize(), PROT_READ|PROT_WRITE, mfn); - } + if (addr) + return addr; + } + return xc_map_foreign_range(*xc_handle, domid, + getpagesize(), PROT_READ|PROT_WRITE, mfn); } static void unmap_interface(void *interface) diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h index 113b8dc..a6241ef 100644 --- a/xen/include/public/domctl.h +++ b/xen/include/public/domctl.h @@ -89,6 +89,9 @@ struct xen_domctl_getdomaininfo { /* Being debugged. */ #define _XEN_DOMINF_debugged 6 #define XEN_DOMINF_debugged (1U<<_XEN_DOMINF_debugged) + /* domain is PVH */ +#define _XEN_DOMINF_pvh_guest 7 +#define XEN_DOMINF_pvh_guest (1U<<_XEN_DOMINF_pvh_guest) /* XEN_DOMINF_shutdown guest-supplied code. */ #define XEN_DOMINF_shutdownmask 255 #define XEN_DOMINF_shutdownshift 16 -- 1.7.2.3
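Extending the usage example above, a guest configuration for the one supported mode might look as follows; the kernel path, sizes and names are placeholders, while the pvh and hap lines correspond to the new xl option (hap is mandatory, as the xl_cmdimpl.c hunk enforces):

    # Illustrative vm.cfg for a PVH guest (paths and names are examples)
    name   = "pvh-guest"
    kernel = "/boot/vmlinuz-pvh"
    memory = 1024
    vcpus  = 2
    pvh    = 1                          # new option added by this patch
    hap    = 1                          # required when pvh = 1
    disk   = ['phy:/dev/loop1,xvda,w']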
Mukesh Rathor
2013-Mar-19 22:33 UTC
[PATCH 08/18 V2 RESEND]: PVH xen: domain creation code changes
This patch contains changes to arch/x86/domain.c to allow for a PVH domain. Also, since PVH uses lot of HVM data structs and code paths, in hvm_vcpu {} a sub struct to store PVH specific info is created. Right now it only has one field, but it can grow over time. Changes in V2: - changes to read_segment_register() moved to this patch. - The other comment was to create NULL functions for pvh_set_vcpu_info and pvh_read_descriptor which are implemented in later patch, but since I disable PVH creation until all patches are checked in, it is not needed. But it helps breaking down of patches. Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com> --- xen/arch/x86/domain.c | 99 +++++++++++++++++++++++++++------------- xen/include/asm-x86/hvm/hvm.h | 18 +++++++ xen/include/asm-x86/hvm/vcpu.h | 9 ++++ xen/include/asm-x86/system.h | 8 +++- 4 files changed, 101 insertions(+), 33 deletions(-) diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c index 768c19d..5b5444f 100644 --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -387,8 +387,11 @@ int vcpu_initialise(struct vcpu *v) vmce_init_vcpu(v); - if ( is_hvm_domain(d) ) + if ( is_hvm_or_pvh_domain(d) ) { + if ( is_pvh_domain(d) ) + v->arch.hvm_vcpu.hvm_pvh.vcpu_info_mfn = INVALID_MFN; + rc = hvm_vcpu_initialise(v); goto done; } @@ -455,7 +458,7 @@ void vcpu_destroy(struct vcpu *v) vcpu_destroy_fpu(v); - if ( is_hvm_vcpu(v) ) + if ( is_hvm_or_pvh_vcpu(v) ) hvm_vcpu_destroy(v); else xfree(v->arch.pv_vcpu.trap_ctxt); @@ -467,7 +470,7 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags) int rc = -ENOMEM; d->arch.hvm_domain.hap_enabled - is_hvm_domain(d) && + is_hvm_or_pvh_domain(d) && hvm_funcs.hap_supported && (domcr_flags & DOMCRF_hap); d->arch.hvm_domain.mem_sharing_enabled = 0; @@ -515,7 +518,7 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags) mapcache_domain_init(d); HYPERVISOR_COMPAT_VIRT_START(d) - is_hvm_domain(d) ? ~0u : __HYPERVISOR_COMPAT_VIRT_START; + is_hvm_or_pvh_domain(d) ? ~0u : __HYPERVISOR_COMPAT_VIRT_START; if ( (rc = paging_domain_init(d, domcr_flags)) != 0 ) goto fail; @@ -557,7 +560,7 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags) goto fail; } - if ( is_hvm_domain(d) ) + if ( is_hvm_or_pvh_domain(d) ) { if ( (rc = hvm_domain_initialise(d)) != 0 ) { @@ -569,9 +572,9 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags) { /* 64-bit PV guest by default. */ d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0; - - spin_lock_init(&d->arch.pv_domain.e820_lock); } + if ( !is_hvm_domain(d) ) + spin_lock_init(&d->arch.pv_domain.e820_lock); /* initialize default tsc behavior in case tools don''t */ tsc_set_info(d, TSC_MODE_DEFAULT, 0UL, 0, 0); @@ -593,9 +596,10 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags) void arch_domain_destroy(struct domain *d) { - if ( is_hvm_domain(d) ) + if ( is_hvm_or_pvh_domain(d) ) hvm_domain_destroy(d); - else + + if ( !is_hvm_domain(d) ) xfree(d->arch.pv_domain.e820); free_domain_pirqs(d); @@ -663,7 +667,7 @@ int arch_set_info_guest( #define c(fld) (compat ? 
(c.cmp->fld) : (c.nat->fld)) flags = c(flags); - if ( !is_hvm_vcpu(v) ) + if ( !is_hvm_or_pvh_vcpu(v) ) { if ( !compat ) { @@ -716,7 +720,7 @@ int arch_set_info_guest( v->fpu_initialised = !!(flags & VGCF_I387_VALID); v->arch.flags &= ~TF_kernel_mode; - if ( (flags & VGCF_in_kernel) || is_hvm_vcpu(v)/*???*/ ) + if ( (flags & VGCF_in_kernel) || is_hvm_or_pvh_vcpu(v)/*???*/ ) v->arch.flags |= TF_kernel_mode; v->arch.vgc_flags = flags; @@ -727,7 +731,7 @@ int arch_set_info_guest( if ( !compat ) { memcpy(&v->arch.user_regs, &c.nat->user_regs, sizeof(c.nat->user_regs)); - if ( !is_hvm_vcpu(v) ) + if ( !is_hvm_or_pvh_vcpu(v) ) memcpy(v->arch.pv_vcpu.trap_ctxt, c.nat->trap_ctxt, sizeof(c.nat->trap_ctxt)); } @@ -743,10 +747,13 @@ int arch_set_info_guest( v->arch.user_regs.eflags |= 2; - if ( is_hvm_vcpu(v) ) + if ( is_hvm_or_pvh_vcpu(v) ) { hvm_set_info_guest(v); - goto out; + if ( is_hvm_vcpu(v) || v->is_initialised ) + goto out; + else + goto pvh_skip_pv_stuff; } init_int80_direct_trap(v); @@ -755,7 +762,8 @@ int arch_set_info_guest( v->arch.pv_vcpu.iopl = (v->arch.user_regs.eflags >> 12) & 3; v->arch.user_regs.eflags &= ~X86_EFLAGS_IOPL; - /* Ensure real hardware interrupts are enabled. */ + /* Ensure real hardware interrupts are enabled. Note: PVH may not have + * IDT set on all vcpus so don''t enable IF for it yet. */ v->arch.user_regs.eflags |= X86_EFLAGS_IF; if ( !v->is_initialised ) @@ -852,6 +860,7 @@ int arch_set_info_guest( if ( rc != 0 ) return rc; +pvh_skip_pv_stuff: if ( !compat ) { cr3_gfn = xen_cr3_to_pfn(c.nat->ctrlreg[3]); @@ -859,19 +868,26 @@ int arch_set_info_guest( if ( !cr3_page ) { - destroy_gdt(v); + if ( !is_pvh_vcpu(v) ) + destroy_gdt(v); return -EINVAL; } if ( !paging_mode_refcounts(d) && !get_page_type(cr3_page, PGT_base_page_table) ) { put_page(cr3_page); - destroy_gdt(v); + if ( !is_pvh_vcpu(v) ) + destroy_gdt(v); return -EINVAL; } + if ( is_pvh_vcpu(v) ) { + v->arch.cr3 = page_to_mfn(cr3_page); + v->arch.hvm_vcpu.guest_cr[3] = c.nat->ctrlreg[3]; + } + v->arch.guest_table = pagetable_from_page(cr3_page); - if ( c.nat->ctrlreg[1] ) + if ( c.nat->ctrlreg[1] && !is_pvh_vcpu(v) ) { cr3_gfn = xen_cr3_to_pfn(c.nat->ctrlreg[1]); cr3_page = get_page_from_gfn(d, cr3_gfn, NULL, P2M_ALLOC); @@ -896,7 +912,8 @@ int arch_set_info_guest( } else if ( !(flags & VGCF_in_kernel) ) { - destroy_gdt(v); + if ( !is_pvh_vcpu(v) ) + destroy_gdt(v); return -EINVAL; } } @@ -938,6 +955,13 @@ int arch_set_info_guest( update_cr3(v); + if ( is_pvh_vcpu(v) ) + { + /* guest is bringing up non-boot SMP vcpu */ + if ( (rc=hvm_pvh_set_vcpu_info(v, c.nat)) != 0 ) + return rc; + } + out: if ( flags & VGCF_online ) clear_bit(_VPF_down, &v->pause_flags); @@ -968,16 +992,21 @@ void arch_vcpu_reset(struct vcpu *v) static void unmap_vcpu_info(struct vcpu *v) { - unsigned long mfn; + unsigned long mfn, *mfnp; + + if ( is_pvh_vcpu(v) ) + mfnp = &v->arch.hvm_vcpu.hvm_pvh.vcpu_info_mfn; + else + mfnp = &v->arch.pv_vcpu.vcpu_info_mfn; - if ( v->arch.pv_vcpu.vcpu_info_mfn == INVALID_MFN ) + mfn = *mfnp; + if ( mfn == INVALID_MFN ) return; - mfn = v->arch.pv_vcpu.vcpu_info_mfn; unmap_domain_page_global(v->vcpu_info); v->vcpu_info = &dummy_vcpu_info; - v->arch.pv_vcpu.vcpu_info_mfn = INVALID_MFN; + *mfnp = INVALID_MFN; put_page_and_type(mfn_to_page(mfn)); } @@ -996,11 +1025,17 @@ map_vcpu_info(struct vcpu *v, unsigned long gfn, unsigned offset) vcpu_info_t *new_info; struct page_info *page; int i; + unsigned long *mfnp; + + if ( is_pvh_vcpu(v) ) + mfnp = &v->arch.hvm_vcpu.hvm_pvh.vcpu_info_mfn; + else + mfnp = 
&v->arch.pv_vcpu.vcpu_info_mfn; if ( offset > (PAGE_SIZE - sizeof(vcpu_info_t)) ) return -EINVAL; - if ( v->arch.pv_vcpu.vcpu_info_mfn != INVALID_MFN ) + if ( *mfnp != INVALID_MFN ) return -EINVAL; /* Run this command on yourself or on other offline VCPUS. */ @@ -1037,7 +1072,7 @@ map_vcpu_info(struct vcpu *v, unsigned long gfn, unsigned offset) } v->vcpu_info = new_info; - v->arch.pv_vcpu.vcpu_info_mfn = page_to_mfn(page); + *mfnp = page_to_mfn(page); /* Set new vcpu_info pointer /before/ setting pending flags. */ wmb(); @@ -1443,7 +1478,7 @@ static void update_runstate_area(struct vcpu *v) static inline int need_full_gdt(struct vcpu *v) { - return (!is_hvm_vcpu(v) && !is_idle_vcpu(v)); + return (!is_hvm_or_pvh_vcpu(v) && !is_idle_vcpu(v)); } static void __context_switch(void) @@ -1571,7 +1606,7 @@ void context_switch(struct vcpu *prev, struct vcpu *next) /* Re-enable interrupts before restoring state which may fault. */ local_irq_enable(); - if ( !is_hvm_vcpu(next) ) + if ( !is_hvm_or_pvh_vcpu(next) ) { load_LDT(next); load_segments(next); @@ -1690,12 +1725,12 @@ unsigned long hypercall_create_continuation( regs->eax = op; /* Ensure the hypercall trap instruction is re-executed. */ - if ( !is_hvm_vcpu(current) ) + if ( !is_hvm_or_pvh_vcpu(current) ) regs->eip -= 2; /* re-execute ''syscall'' / ''int $xx'' */ else current->arch.hvm_vcpu.hcall_preempted = 1; - if ( !is_hvm_vcpu(current) ? + if ( !is_hvm_or_pvh_vcpu(current) ? !is_pv_32on64_vcpu(current) : (hvm_guest_x86_mode(current) == 8) ) { @@ -2011,7 +2046,7 @@ int domain_relinquish_resources(struct domain *d) for_each_vcpu ( d, v ) vcpu_destroy_pagetables(v); - if ( !is_hvm_domain(d) ) + if ( !is_hvm_or_pvh_domain(d) ) { for_each_vcpu ( d, v ) { @@ -2086,7 +2121,7 @@ int domain_relinquish_resources(struct domain *d) BUG(); } - if ( is_hvm_domain(d) ) + if ( is_hvm_or_pvh_domain(d) ) hvm_domain_relinquish_resources(d); return 0; @@ -2167,7 +2202,7 @@ void vcpu_mark_events_pending(struct vcpu *v) if ( already_pending ) return; - if ( is_hvm_vcpu(v) ) + if ( is_hvm_or_pvh_vcpu(v) ) hvm_assert_evtchn_irq(v); else vcpu_kick(v); diff --git a/xen/include/asm-x86/hvm/hvm.h b/xen/include/asm-x86/hvm/hvm.h index 2fa2ea5..31aa04f 100644 --- a/xen/include/asm-x86/hvm/hvm.h +++ b/xen/include/asm-x86/hvm/hvm.h @@ -190,6 +190,11 @@ struct hvm_function_table { paddr_t *L1_gpa, unsigned int *page_order, uint8_t *p2m_acc, bool_t access_r, bool_t access_w, bool_t access_x); + /* PVH functions */ + int (*pvh_set_vcpu_info)(struct vcpu *v, struct vcpu_guest_context *ctxtp); + int (*pvh_read_descriptor)(unsigned int sel, const struct vcpu *v, + const struct cpu_user_regs *regs, unsigned long *base, + unsigned long *limit, unsigned int *ar); }; extern struct hvm_function_table hvm_funcs; @@ -323,6 +328,19 @@ static inline unsigned long hvm_get_shadow_gs_base(struct vcpu *v) return hvm_funcs.get_shadow_gs_base(v); } +static inline int hvm_pvh_set_vcpu_info(struct vcpu *v, + struct vcpu_guest_context *ctxtp) +{ + return hvm_funcs.pvh_set_vcpu_info(v, ctxtp); +} + +static inline int hvm_pvh_read_descriptor(unsigned int sel, + const struct vcpu *v, const struct cpu_user_regs *regs, + unsigned long *base, unsigned long *limit, unsigned int *ar) +{ + return hvm_funcs.pvh_read_descriptor(sel, v, regs, base, limit, ar); +} + #define is_viridian_domain(_d) \ (is_hvm_domain(_d) && ((_d)->arch.hvm_domain.params[HVM_PARAM_VIRIDIAN])) diff --git a/xen/include/asm-x86/hvm/vcpu.h b/xen/include/asm-x86/hvm/vcpu.h index e8b8cd7..2725a62 100644 --- 
a/xen/include/asm-x86/hvm/vcpu.h +++ b/xen/include/asm-x86/hvm/vcpu.h @@ -104,6 +104,13 @@ struct nestedvcpu { #define vcpu_nestedhvm(v) ((v)->arch.hvm_vcpu.nvcpu) +/* add any PVH specific fields here */ +struct pvh_hvm_vcpu_ext +{ + /* Guest-specified relocation of vcpu_info. */ + unsigned long vcpu_info_mfn; +}; + struct hvm_vcpu { /* Guest control-register and EFER values, just as the guest sees them. */ unsigned long guest_cr[5]; @@ -170,6 +177,8 @@ struct hvm_vcpu { struct hvm_trap inject_trap; struct viridian_vcpu viridian; + + struct pvh_hvm_vcpu_ext hvm_pvh; }; #endif /* __ASM_X86_HVM_VCPU_H__ */ diff --git a/xen/include/asm-x86/system.h b/xen/include/asm-x86/system.h index d8dc6f2..5681806 100644 --- a/xen/include/asm-x86/system.h +++ b/xen/include/asm-x86/system.h @@ -4,9 +4,15 @@ #include <xen/lib.h> #include <asm/bitops.h> +/* We need vcpu because during context switch, going from pure PV to PVH, + * in save_segments(), current has been updated to next, and no longer pointing + * to the pure PV. Note: for PVH, we update regs->selectors on each vmexit */ #define read_segment_register(vcpu, regs, name) \ ({ u16 __sel; \ - asm volatile ( "movw %%" STR(name) ",%0" : "=r" (__sel) ); \ + if (is_pvh_vcpu(vcpu)) \ + __sel = regs->name; \ + else \ + asm volatile ( "movw %%" STR(name) ",%0" : "=r" (__sel) ); \ __sel; \ }) -- 1.7.2.3
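For reference, the map_vcpu_info()/unmap_vcpu_info() paths changed above are reached through VCPUOP_register_vcpu_info. A minimal guest-side sketch of that registration follows; HYPERVISOR_vcpu_op(), virt_to_gfn() and offset_in_page() are assumed guest wrappers/helpers (not defined by this series), and only the struct layout comes from the public interface in xen/include/public/vcpu.h.

/* Sketch only: register a per-vcpu vcpu_info page.  For a PVH vcpu the
 * frame ends up cached in v->arch.hvm_vcpu.hvm_pvh.vcpu_info_mfn rather
 * than in the pv_vcpu structure, as per the hunk above. */
static int register_vcpu_info_page(int cpu, struct vcpu_info *vi)
{
    struct vcpu_register_vcpu_info info = {
        .mfn    = virt_to_gfn(vi),      /* frame containing vcpu_info      */
        .offset = offset_in_page(vi),   /* must leave room for vcpu_info_t */
    };

    return HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info);
}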
Mukesh Rathor
2013-Mar-19 22:33 UTC
[PATCH 09/18 V2 RESEND]: PVH xen: create PVH vmcs, and initialization
This patch mainly contains code to create a VMCS for PVH guest, and HVM specific vcpu/domain creation code. Changes in V2: - Avoid call to hvm_do_resume() at call site rather than return in it. - Return for PVH vmx_do_resume prior to intel debugger stuff. Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com> --- xen/arch/x86/hvm/hvm.c | 90 ++++++++++++++- xen/arch/x86/hvm/vmx/vmcs.c | 266 ++++++++++++++++++++++++++++++++++++++++++- xen/arch/x86/hvm/vmx/vmx.c | 34 ++++++ 3 files changed, 383 insertions(+), 7 deletions(-) diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c index ea7adf6..18889ad 100644 --- a/xen/arch/x86/hvm/hvm.c +++ b/xen/arch/x86/hvm/hvm.c @@ -510,6 +510,29 @@ static int hvm_print_line( return X86EMUL_OKAY; } +static int hvm_pvh_dom_initialise(struct domain *d) +{ + int rc; + + if (!d->arch.hvm_domain.hap_enabled) + return -EINVAL; + + spin_lock_init(&d->arch.hvm_domain.irq_lock); + hvm_init_guest_time(d); + + hvm_init_cacheattr_region_list(d); + + if ( (rc=paging_enable(d, PG_refcounts|PG_translate|PG_external)) != 0 ) + goto fail1; + + if ( (rc = hvm_funcs.domain_initialise(d)) == 0 ) + return 0; + +fail1: + hvm_destroy_cacheattr_region_list(d); + return rc; +} + int hvm_domain_initialise(struct domain *d) { int rc; @@ -520,6 +543,8 @@ int hvm_domain_initialise(struct domain *d) "on a non-VT/AMDV platform.\n"); return -EINVAL; } + if ( is_pvh_domain(d) ) + return hvm_pvh_dom_initialise(d); spin_lock_init(&d->arch.hvm_domain.pbuf_lock); spin_lock_init(&d->arch.hvm_domain.irq_lock); @@ -584,6 +609,11 @@ int hvm_domain_initialise(struct domain *d) void hvm_domain_relinquish_resources(struct domain *d) { + if ( is_pvh_domain(d) ) + { + pit_deinit(d); + return; + } if ( hvm_funcs.nhvm_domain_relinquish_resources ) hvm_funcs.nhvm_domain_relinquish_resources(d); @@ -609,10 +639,14 @@ void hvm_domain_relinquish_resources(struct domain *d) void hvm_domain_destroy(struct domain *d) { hvm_funcs.domain_destroy(d); + hvm_destroy_cacheattr_region_list(d); + + if ( is_pvh_domain(d) ) + return; + rtc_deinit(d); stdvga_deinit(d); vioapic_deinit(d); - hvm_destroy_cacheattr_region_list(d); } static int hvm_save_tsc_adjust(struct domain *d, hvm_domain_context_t *h) @@ -1066,14 +1100,47 @@ static int __init __hvm_register_CPU_XSAVE_save_and_restore(void) } __initcall(__hvm_register_CPU_XSAVE_save_and_restore); +static int hvm_pvh_vcpu_initialise(struct vcpu *v) +{ + int rc; + + if ( (rc = hvm_funcs.vcpu_initialise(v)) != 0 ) + return rc; + + softirq_tasklet_init( &v->arch.hvm_vcpu.assert_evtchn_irq_tasklet, + (void(*)(unsigned long))hvm_assert_evtchn_irq, + (unsigned long)v ); + + v->arch.hvm_vcpu.hcall_64bit = 1; + v->arch.hvm_vcpu.hvm_pvh.vcpu_info_mfn = INVALID_MFN; + v->arch.user_regs.eflags = 2; + v->arch.hvm_vcpu.inject_trap.vector = -1; + + if ( (rc=hvm_vcpu_cacheattr_init(v)) != 0 ) { + hvm_funcs.vcpu_destroy(v); + return rc; + } + + /* during domain shutdown: pvh_vmx_vmexit_handler->emulate_privileged_op + * -> guest_io_read -> pv_pit_handler -> handle_speaker_io -> _spin_lock + * so we call pit_init to initialize the spin lock */ + if ( v->vcpu_id == 0 ) + pit_init(v, cpu_khz); + + return 0; +} + int hvm_vcpu_initialise(struct vcpu *v) { int rc; struct domain *d = v->domain; - domid_t dm_domid = d->arch.hvm_domain.params[HVM_PARAM_DM_DOMAIN]; + domid_t dm_domid; hvm_asid_flush_vcpu(v); + if ( is_pvh_vcpu(v) ) + return hvm_pvh_vcpu_initialise(v); + if ( (rc = vlapic_init(v)) != 0 ) goto fail1; @@ -1084,6 +1151,8 @@ int hvm_vcpu_initialise(struct vcpu *v) && (rc = 
nestedhvm_vcpu_initialise(v)) < 0 ) goto fail3; + dm_domid = d->arch.hvm_domain.params[HVM_PARAM_DM_DOMAIN]; + /* Create ioreq event channel. */ rc = alloc_unbound_xen_event_channel(v, dm_domid, NULL); if ( rc < 0 ) @@ -1163,7 +1232,10 @@ void hvm_vcpu_destroy(struct vcpu *v) tasklet_kill(&v->arch.hvm_vcpu.assert_evtchn_irq_tasklet); hvm_vcpu_cacheattr_destroy(v); - vlapic_destroy(v); + + if ( !is_pvh_vcpu(v) ) + vlapic_destroy(v); + hvm_funcs.vcpu_destroy(v); /* Event channel is already freed by evtchn_destroy(). */ @@ -4514,6 +4586,8 @@ static int hvm_memory_event_traps(long p, uint32_t reason, void hvm_memory_event_cr0(unsigned long value, unsigned long old) { + if ( is_pvh_vcpu(current) ) + return; hvm_memory_event_traps(current->domain->arch.hvm_domain .params[HVM_PARAM_MEMORY_EVENT_CR0], MEM_EVENT_REASON_CR0, @@ -4522,6 +4596,8 @@ void hvm_memory_event_cr0(unsigned long value, unsigned long old) void hvm_memory_event_cr3(unsigned long value, unsigned long old) { + if ( is_pvh_vcpu(current) ) + return; hvm_memory_event_traps(current->domain->arch.hvm_domain .params[HVM_PARAM_MEMORY_EVENT_CR3], MEM_EVENT_REASON_CR3, @@ -4530,6 +4606,8 @@ void hvm_memory_event_cr3(unsigned long value, unsigned long old) void hvm_memory_event_cr4(unsigned long value, unsigned long old) { + if ( is_pvh_vcpu(current) ) + return; hvm_memory_event_traps(current->domain->arch.hvm_domain .params[HVM_PARAM_MEMORY_EVENT_CR4], MEM_EVENT_REASON_CR4, @@ -4538,6 +4616,8 @@ void hvm_memory_event_cr4(unsigned long value, unsigned long old) void hvm_memory_event_msr(unsigned long msr, unsigned long value) { + if ( is_pvh_vcpu(current) ) + return; hvm_memory_event_traps(current->domain->arch.hvm_domain .params[HVM_PARAM_MEMORY_EVENT_MSR], MEM_EVENT_REASON_MSR, @@ -4550,6 +4630,8 @@ int hvm_memory_event_int3(unsigned long gla) unsigned long gfn; gfn = paging_gva_to_gfn(current, gla, &pfec); + if ( is_pvh_vcpu(current) ) + return 0; return hvm_memory_event_traps(current->domain->arch.hvm_domain .params[HVM_PARAM_MEMORY_EVENT_INT3], MEM_EVENT_REASON_INT3, @@ -4562,6 +4644,8 @@ int hvm_memory_event_single_step(unsigned long gla) unsigned long gfn; gfn = paging_gva_to_gfn(current, gla, &pfec); + if ( is_pvh_vcpu(current) ) + return 0; return hvm_memory_event_traps(current->domain->arch.hvm_domain .params[HVM_PARAM_MEMORY_EVENT_SINGLE_STEP], MEM_EVENT_REASON_SINGLESTEP, diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c index 9926ffb..b0bea9c 100644 --- a/xen/arch/x86/hvm/vmx/vmcs.c +++ b/xen/arch/x86/hvm/vmx/vmcs.c @@ -624,7 +624,7 @@ void vmx_vmcs_exit(struct vcpu *v) { /* Don''t confuse vmx_do_resume (for @v or @current!) */ vmx_clear_vmcs(v); - if ( is_hvm_vcpu(current) ) + if ( is_hvm_or_pvh_vcpu(current) ) vmx_load_vmcs(current); spin_unlock(&v->arch.hvm_vmx.vmcs_lock); @@ -815,6 +815,253 @@ void virtual_vmcs_vmwrite(void *vvmcs, u32 vmcs_encoding, u64 val) virtual_vmcs_exit(vvmcs); } +static int pvh_construct_vmcs(struct vcpu *v) +{ + uint16_t sysenter_cs; + unsigned long sysenter_eip; + struct domain *d = v->domain; + struct p2m_domain *p2m = p2m_get_hostp2m(d); + struct ept_data *ept = &p2m->ept; + u32 vmexit_ctl = vmx_vmexit_control; + u32 vmentry_ctl = vmx_vmentry_control; + u64 required, tmpval = -1; + + if ( !paging_mode_hap(d) ) + { + printk("ERROR: HAP is required to run PV in HVM container\n"); + return -EINVAL; + } + + /* VMCS controls. 
*/ + vmx_pin_based_exec_control &= ~PIN_BASED_VIRTUAL_NMIS; + __vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_control); + + v->arch.hvm_vmx.exec_control = vmx_cpu_based_exec_control; + + /* if rdtsc exiting is turned on and it goes thru emulate_privileged_op, + * then pv_vcpu.ctrlreg must be added to pvh struct */ + v->arch.hvm_vmx.exec_control &= ~CPU_BASED_RDTSC_EXITING; + v->arch.hvm_vmx.exec_control &= ~CPU_BASED_USE_TSC_OFFSETING; + + v->arch.hvm_vmx.exec_control &= ~(CPU_BASED_INVLPG_EXITING | + CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING); + v->arch.hvm_vmx.exec_control |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; + v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG; + v->arch.hvm_vmx.exec_control |= CPU_BASED_ACTIVATE_MSR_BITMAP; + v->arch.hvm_vmx.exec_control &= ~CPU_BASED_TPR_SHADOW; + v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; + + __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control); + + /* I/O access bitmap. */ + __vmwrite(IO_BITMAP_A, virt_to_maddr((char *)hvm_io_bitmap + 0)); + __vmwrite(IO_BITMAP_B, virt_to_maddr((char *)hvm_io_bitmap + PAGE_SIZE)); + + /* MSR access bitmap. */ + if ( cpu_has_vmx_msr_bitmap ) + { + unsigned long *msr_bitmap = alloc_xenheap_page(); + int msr_type = MSR_TYPE_R | MSR_TYPE_W; + + if ( msr_bitmap == NULL ) + return -ENOMEM; + + memset(msr_bitmap, ~0, PAGE_SIZE); + v->arch.hvm_vmx.msr_bitmap = msr_bitmap; + __vmwrite(MSR_BITMAP, virt_to_maddr(msr_bitmap)); + + vmx_disable_intercept_for_msr(v, MSR_FS_BASE, msr_type); + vmx_disable_intercept_for_msr(v, MSR_GS_BASE, msr_type); + vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_CS, msr_type); + vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_ESP, msr_type); + vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_EIP, msr_type); + vmx_disable_intercept_for_msr(v, MSR_SHADOW_GS_BASE, msr_type); + + /* pure hvm doesn''t do this. safe? see: long_mode_do_msr_write() */ +#if 0 + vmx_disable_intercept_for_msr(v, MSR_STAR); + vmx_disable_intercept_for_msr(v, MSR_LSTAR); + vmx_disable_intercept_for_msr(v, MSR_CSTAR); + vmx_disable_intercept_for_msr(v, MSR_SYSCALL_MASK); +#endif + } else { + printk("PVH: CPU does NOT have msr bitmap\n"); + return -EINVAL; + } + + if ( !cpu_has_vmx_vpid ) { + printk("PVH: At present VPID support is required to run PVH\n"); + return -EINVAL; + } + + v->arch.hvm_vmx.secondary_exec_control = vmx_secondary_exec_control; + + if ( cpu_has_vmx_secondary_exec_control ) { + v->arch.hvm_vmx.secondary_exec_control &= ~0x4FF; /* turn off all */ + v->arch.hvm_vmx.secondary_exec_control |= + SECONDARY_EXEC_PAUSE_LOOP_EXITING; + v->arch.hvm_vmx.secondary_exec_control |= SECONDARY_EXEC_ENABLE_VPID; + + v->arch.hvm_vmx.secondary_exec_control |= SECONDARY_EXEC_ENABLE_EPT; + __vmwrite(SECONDARY_VM_EXEC_CONTROL, + v->arch.hvm_vmx.secondary_exec_control); + } else { + printk("PVH: NO Secondary Exec control\n"); + return -EINVAL; + } + + __vmwrite(VM_EXIT_CONTROLS, vmexit_ctl); + + #define VM_ENTRY_LOAD_DEBUG_CTLS 0x4 + #define VM_ENTRY_LOAD_EFER 0x8000 + vmentry_ctl &= ~VM_ENTRY_LOAD_DEBUG_CTLS; + vmentry_ctl &= ~VM_ENTRY_LOAD_EFER; + vmentry_ctl &= ~VM_ENTRY_SMM; + vmentry_ctl &= ~VM_ENTRY_DEACT_DUAL_MONITOR; + vmentry_ctl |= VM_ENTRY_IA32E_MODE; + __vmwrite(VM_ENTRY_CONTROLS, vmentry_ctl); + + /* MSR intercepts. */ + __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, 0); + __vmwrite(VM_EXIT_MSR_LOAD_COUNT, 0); + __vmwrite(VM_EXIT_MSR_STORE_COUNT, 0); + + /* Host data selectors. 
*/ + __vmwrite(HOST_SS_SELECTOR, __HYPERVISOR_DS); + __vmwrite(HOST_DS_SELECTOR, __HYPERVISOR_DS); + __vmwrite(HOST_ES_SELECTOR, __HYPERVISOR_DS); + __vmwrite(HOST_FS_SELECTOR, 0); + __vmwrite(HOST_GS_SELECTOR, 0); + __vmwrite(HOST_FS_BASE, 0); + __vmwrite(HOST_GS_BASE, 0); + + vmx_set_host_env(v); + + /* Host control registers. */ + v->arch.hvm_vmx.host_cr0 = read_cr0() | X86_CR0_TS; + __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0); + __vmwrite(HOST_CR4, mmu_cr4_features|(cpu_has_xsave ? X86_CR4_OSXSAVE : 0)); + + /* Host CS:RIP. */ + __vmwrite(HOST_CS_SELECTOR, __HYPERVISOR_CS); + __vmwrite(HOST_RIP, (unsigned long)vmx_asm_vmexit_handler); + + /* Host SYSENTER CS:RIP. */ + rdmsrl(MSR_IA32_SYSENTER_CS, sysenter_cs); + __vmwrite(HOST_SYSENTER_CS, sysenter_cs); + rdmsrl(MSR_IA32_SYSENTER_EIP, sysenter_eip); + __vmwrite(HOST_SYSENTER_EIP, sysenter_eip); + + __vmwrite(VM_ENTRY_INTR_INFO, 0); + + __vmwrite(CR3_TARGET_COUNT, 0); + + __vmwrite(GUEST_ACTIVITY_STATE, 0); + + /* Set default guest context values here. Some of these are then overwritten + * in vmx_pvh_set_vcpu_info() by guest itself during vcpu bringup */ + __vmwrite(GUEST_CS_BASE, 0); + __vmwrite(GUEST_CS_LIMIT, ~0u); + __vmwrite(GUEST_CS_AR_BYTES, 0xa09b); /* CS.L == 1 */ + __vmwrite(GUEST_CS_SELECTOR, 0x10); + + __vmwrite(GUEST_DS_BASE, 0); + __vmwrite(GUEST_DS_LIMIT, ~0u); + __vmwrite(GUEST_DS_AR_BYTES, 0xc093); + __vmwrite(GUEST_DS_SELECTOR, 0x18); + + __vmwrite(GUEST_SS_BASE, 0); /* use same seg as DS */ + __vmwrite(GUEST_SS_LIMIT, ~0u); + __vmwrite(GUEST_SS_AR_BYTES, 0xc093); + __vmwrite(GUEST_SS_SELECTOR, 0x18); + + __vmwrite(GUEST_ES_SELECTOR, 0); + __vmwrite(GUEST_FS_SELECTOR, 0); + __vmwrite(GUEST_GS_SELECTOR, 0); + + /* Guest segment bases. */ + __vmwrite(GUEST_ES_BASE, 0); + __vmwrite(GUEST_FS_BASE, 0); + __vmwrite(GUEST_GS_BASE, 0); + + /* Guest segment limits. */ + __vmwrite(GUEST_ES_LIMIT, ~0u); + __vmwrite(GUEST_FS_LIMIT, ~0u); + __vmwrite(GUEST_GS_LIMIT, ~0u); + + /* Guest segment AR bytes. */ + __vmwrite(GUEST_ES_AR_BYTES, 0xc093); /* read/write, accessed */ + __vmwrite(GUEST_FS_AR_BYTES, 0xc093); + __vmwrite(GUEST_GS_AR_BYTES, 0xc093); + + /* Guest IDT. */ + __vmwrite(GUEST_GDTR_BASE, 0); + __vmwrite(GUEST_GDTR_LIMIT, 0); + + /* Guest LDT. */ + __vmwrite(GUEST_LDTR_AR_BYTES, 0x82); /* LDT */ + __vmwrite(GUEST_LDTR_SELECTOR, 0); + __vmwrite(GUEST_LDTR_BASE, 0); + __vmwrite(GUEST_LDTR_LIMIT, 0); + + /* Guest TSS. 
*/ + __vmwrite(GUEST_TR_AR_BYTES, 0x8b); /* 32-bit TSS (busy) */ + __vmwrite(GUEST_TR_BASE, 0); + __vmwrite(GUEST_TR_LIMIT, 0xff); + + __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0); + __vmwrite(GUEST_DR7, 0); + __vmwrite(VMCS_LINK_POINTER, ~0UL); + + __vmwrite(PAGE_FAULT_ERROR_CODE_MASK, 0); + __vmwrite(PAGE_FAULT_ERROR_CODE_MATCH, 0); + + v->arch.hvm_vmx.exception_bitmap = + HVM_TRAP_MASK | (1 << TRAP_debug) | + (1U << TRAP_int3) | (1U << TRAP_no_device); + __vmwrite(EXCEPTION_BITMAP, v->arch.hvm_vmx.exception_bitmap); + + __vmwrite(TSC_OFFSET, 0); + + /* Set WP bit so rdonly pages are not written from CPL 0 */ + tmpval = X86_CR0_PG | X86_CR0_NE | X86_CR0_PE | X86_CR0_WP; + __vmwrite(GUEST_CR0, tmpval); + __vmwrite(CR0_READ_SHADOW, tmpval); + v->arch.hvm_vcpu.hw_cr[0] = v->arch.hvm_vcpu.guest_cr[0] = tmpval; + + tmpval = real_cr4_to_pv_guest_cr4(mmu_cr4_features); + required = X86_CR4_PAE | X86_CR4_VMXE | X86_CR4_OSFXSR; + if ( (tmpval & required) != required ) + { + printk("PVH: required CR4 features not available:%lx\n", required); + return -EINVAL; + } + __vmwrite(GUEST_CR4, tmpval); + __vmwrite(CR4_READ_SHADOW, tmpval); + v->arch.hvm_vcpu.guest_cr[4] = tmpval; + + __vmwrite(CR0_GUEST_HOST_MASK, ~0UL); + __vmwrite(CR4_GUEST_HOST_MASK, ~0UL); + + v->arch.hvm_vmx.vmx_realmode = 0; + + ept->asr = pagetable_get_pfn(p2m_get_pagetable(p2m)); + __vmwrite(EPT_POINTER, ept_get_eptp(ept)); + + if ( cpu_has_vmx_pat ) + { + u64 host_pat, guest_pat; + + rdmsrl(MSR_IA32_CR_PAT, host_pat); + guest_pat = MSR_IA32_CR_PAT_RESET; + + __vmwrite(HOST_PAT, host_pat); + __vmwrite(GUEST_PAT, guest_pat); + } + return 0; +} + static int construct_vmcs(struct vcpu *v) { struct domain *d = v->domain; @@ -825,6 +1072,12 @@ static int construct_vmcs(struct vcpu *v) vmx_vmcs_enter(v); + if ( is_pvh_vcpu(v) ) { + int rc = pvh_construct_vmcs(v); + vmx_vmcs_exit(v); + return rc; + } + /* VMCS controls. 
*/ __vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_control); @@ -1259,8 +1512,10 @@ void vmx_do_resume(struct vcpu *v) vmx_clear_vmcs(v); vmx_load_vmcs(v); - hvm_migrate_timers(v); - hvm_migrate_pirqs(v); + if ( !is_pvh_vcpu(v) ) { + hvm_migrate_timers(v); + hvm_migrate_pirqs(v); + } vmx_set_host_env(v); /* * Both n1 VMCS and n2 VMCS need to update the host environment after @@ -1272,6 +1527,9 @@ void vmx_do_resume(struct vcpu *v) hvm_asid_flush_vcpu(v); } + if ( is_pvh_vcpu(v) ) + reset_stack_and_jump(vmx_asm_do_vmentry); + debug_state = v->domain->debugger_attached || v->domain->arch.hvm_domain.params[HVM_PARAM_MEMORY_EVENT_INT3] || v->domain->arch.hvm_domain.params[HVM_PARAM_MEMORY_EVENT_SINGLE_STEP]; @@ -1455,7 +1713,7 @@ static void vmcs_dump(unsigned char ch) for_each_domain ( d ) { - if ( !is_hvm_domain(d) ) + if ( !is_hvm_or_pvh_domain(d) ) continue; printk("\n>>> Domain %d <<<\n", d->domain_id); for_each_vcpu ( d, v ) diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c index e64980f..194c87b 100644 --- a/xen/arch/x86/hvm/vmx/vmx.c +++ b/xen/arch/x86/hvm/vmx/vmx.c @@ -79,6 +79,9 @@ static int vmx_domain_initialise(struct domain *d) { int rc; + if ( is_pvh_domain(d) ) + return 0; + if ( (rc = vmx_alloc_vlapic_mapping(d)) != 0 ) return rc; @@ -87,6 +90,9 @@ static int vmx_domain_initialise(struct domain *d) static void vmx_domain_destroy(struct domain *d) { + if ( is_pvh_domain(d) ) + return; + vmx_free_vlapic_mapping(d); } @@ -110,6 +116,12 @@ static int vmx_vcpu_initialise(struct vcpu *v) vpmu_initialise(v); + if (is_pvh_vcpu(v) ) + { + /* this for hvm_long_mode_enabled(v) */ + v->arch.hvm_vcpu.guest_efer = EFER_SCE | EFER_LMA | EFER_LME; + return 0; + } vmx_install_vlapic_mapping(v); /* %eax == 1 signals full real-mode support to the guest loader. */ @@ -1033,6 +1045,23 @@ static void vmx_update_host_cr3(struct vcpu *v) vmx_vmcs_exit(v); } +static void vmx_update_pvh_cr(struct vcpu *v, unsigned int cr) +{ + vmx_vmcs_enter(v); + switch ( cr ) + { + case 3: + __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.guest_cr[3]); + hvm_asid_flush_vcpu(v); + break; + + default: + printk("PVH: d%d v%d unexpected cr%d update at rip:%lx\n", + v->domain->domain_id, v->vcpu_id, cr, __vmread(GUEST_RIP)); + } + vmx_vmcs_exit(v); +} + void vmx_update_debug_state(struct vcpu *v) { unsigned long mask; @@ -1052,6 +1081,11 @@ void vmx_update_debug_state(struct vcpu *v) static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr) { + if ( is_pvh_vcpu(v) ) { + vmx_update_pvh_cr(v, cr); + return; + } + vmx_vmcs_enter(v); switch ( cr ) -- 1.7.2.3
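The GUEST_*_AR_BYTES constants written in pvh_construct_vmcs() above (0xa09b, 0xc093, 0x8b, 0x82) are packed in the VMX segment access-rights layout described in the Intel SDM. The standalone decoder below is only for reference, to make the magic numbers readable; it is not part of the patch.

#include <stdio.h>

/* Decode a VMX segment access-rights value (SDM bit layout assumed). */
static void decode_ar(unsigned int ar)
{
    printf("ar=%#06x: type=%#x S=%u DPL=%u P=%u AVL=%u L=%u D/B=%u G=%u unusable=%u\n",
           ar,
           ar & 0xf,          /* segment type                 */
           (ar >> 4) & 1,     /* S: 1 = code/data, 0 = system */
           (ar >> 5) & 3,     /* DPL                          */
           (ar >> 7) & 1,     /* present                      */
           (ar >> 12) & 1,    /* AVL                          */
           (ar >> 13) & 1,    /* L: 64-bit code segment       */
           (ar >> 14) & 1,    /* D/B                          */
           (ar >> 15) & 1,    /* granularity                  */
           (ar >> 16) & 1);   /* unusable                     */
}

int main(void)
{
    decode_ar(0xa09b);  /* CS: accessed code, DPL0, present, L=1, G=1            */
    decode_ar(0xc093);  /* DS/SS/ES/FS/GS: accessed r/w data, DPL0, D/B=1, G=1   */
    decode_ar(0x8b);    /* TR: busy 32-bit TSS, present                          */
    decode_ar(0x82);    /* LDTR: LDT, present                                    */
    return 0;
}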
Mukesh Rathor
2013-Mar-19 22:33 UTC
[PATCH 10/18 V2 RESEND]: PVH xen: introduce vmx_pvh.c and pvh.c
The heart of this patch is vmx exit handler for PVH guest. It is nicely isolated in a separate module as preferred by most of us. A call to it is added to vmx_pvh_vmexit_handler(). Changes in V2: - Move non VMX generic code to arch/x86/hvm/pvh.c - Remove get_gpr_ptr() and use existing decode_register() instead. - Defer call to pvh vmx exit handler until interrupts are enabled. So the caller vmx_pvh_vmexit_handler() handles the NMI/EXT-INT/TRIPLE_FAULT now. - Fix the CPUID (wrongly) clearing bit 24. No need to do this now, set the correct feature bits in CR4 during vmcs creation. - Fix few hard tabs. Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com> --- xen/arch/x86/hvm/Makefile | 3 +- xen/arch/x86/hvm/pvh.c | 220 ++++++++++++++ xen/arch/x86/hvm/vmx/Makefile | 1 + xen/arch/x86/hvm/vmx/vmx.c | 7 + xen/arch/x86/hvm/vmx/vmx_pvh.c | 587 +++++++++++++++++++++++++++++++++++++ xen/include/asm-x86/hvm/vmx/vmx.h | 7 +- xen/include/asm-x86/pvh.h | 6 + 7 files changed, 829 insertions(+), 2 deletions(-) create mode 100644 xen/arch/x86/hvm/pvh.c create mode 100644 xen/arch/x86/hvm/vmx/vmx_pvh.c create mode 100644 xen/include/asm-x86/pvh.h diff --git a/xen/arch/x86/hvm/Makefile b/xen/arch/x86/hvm/Makefile index eea5555..65ff9f3 100644 --- a/xen/arch/x86/hvm/Makefile +++ b/xen/arch/x86/hvm/Makefile @@ -22,4 +22,5 @@ obj-y += vlapic.o obj-y += vmsi.o obj-y += vpic.o obj-y += vpt.o -obj-y += vpmu.o \ No newline at end of file +obj-y += vpmu.o +obj-y += pvh.o diff --git a/xen/arch/x86/hvm/pvh.c b/xen/arch/x86/hvm/pvh.c new file mode 100644 index 0000000..c12c4b7 --- /dev/null +++ b/xen/arch/x86/hvm/pvh.c @@ -0,0 +1,220 @@ +/* + * Copyright (C) 2013, Mukesh Rathor, Oracle Corp. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include <xen/hypercall.h> +#include <xen/guest_access.h> +#include <asm/p2m.h> +#include <asm/traps.h> +#include <asm/hvm/vmx/vmx.h> +#include <public/sched.h> + +static int pvh_grant_table_op( + unsigned int cmd, XEN_GUEST_HANDLE(void) uop, unsigned int count) +{ + switch (cmd) + { + case GNTTABOP_map_grant_ref: + case GNTTABOP_unmap_grant_ref: + case GNTTABOP_setup_table: + case GNTTABOP_copy: + case GNTTABOP_query_size: + case GNTTABOP_set_version: + return do_grant_table_op(cmd, uop, count); + } + return -ENOSYS; +} + +static long pvh_vcpu_op(int cmd, int vcpuid, XEN_GUEST_HANDLE(void) arg) +{ + long rc = -ENOSYS; + + switch ( cmd ) + { + case VCPUOP_register_runstate_memory_area: + case VCPUOP_get_runstate_info: + case VCPUOP_set_periodic_timer: + case VCPUOP_stop_periodic_timer: + case VCPUOP_set_singleshot_timer: + case VCPUOP_stop_singleshot_timer: + case VCPUOP_is_up: + case VCPUOP_up: + case VCPUOP_initialise: + rc = do_vcpu_op(cmd, vcpuid, arg); + + /* pvh boot vcpu setting context for bringing up smp vcpu */ + if (cmd == VCPUOP_initialise) + vmx_vmcs_enter(current); + } + return rc; +} + +static long pvh_physdev_op(int cmd, XEN_GUEST_HANDLE(void) arg) +{ + switch ( cmd ) + { + case PHYSDEVOP_map_pirq: + case PHYSDEVOP_unmap_pirq: + case PHYSDEVOP_eoi: + case PHYSDEVOP_irq_status_query: + case PHYSDEVOP_get_free_pirq: + return do_physdev_op(cmd, arg); + + default: + if ( IS_PRIV(current->domain) ) + return do_physdev_op(cmd, arg); + } + return -ENOSYS; +} + +static long do_pvh_hvm_op(unsigned long op, XEN_GUEST_HANDLE(void) arg) +{ + long rc = -EINVAL; + struct xen_hvm_param harg; + struct domain *d; + + if ( copy_from_guest(&harg, arg, 1) ) + return -EFAULT; + + rc = rcu_lock_target_domain_by_id(harg.domid, &d); + if ( rc != 0 ) + return rc; + + if (is_hvm_domain(d)) { + /* pvh dom0 is building an hvm guest */ + rcu_unlock_domain(d); + return do_hvm_op(op, arg); + } + + rc = -ENOSYS; + if (op == HVMOP_set_param) { + if (harg.index == HVM_PARAM_CALLBACK_IRQ) { + struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq; + uint64_t via = harg.value; + uint8_t via_type = (uint8_t)(via >> 56) + 1; + + if (via_type == HVMIRQ_callback_vector) { + hvm_irq->callback_via_type = HVMIRQ_callback_vector; + hvm_irq->callback_via.vector = (uint8_t)via; + rc = 0; + } + } + } + rcu_unlock_domain(d); + return rc; +} + +typedef unsigned long pvh_hypercall_t( + unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, + unsigned long); + +int hcall_a[NR_hypercalls]; + +static pvh_hypercall_t *pvh_hypercall64_table[NR_hypercalls] = { + [__HYPERVISOR_platform_op] = (pvh_hypercall_t *)do_platform_op, + [__HYPERVISOR_memory_op] = (pvh_hypercall_t *)do_memory_op, + /* [__HYPERVISOR_set_timer_op] = (pvh_hypercall_t *)do_set_timer_op, */ + [__HYPERVISOR_xen_version] = (pvh_hypercall_t *)do_xen_version, + [__HYPERVISOR_console_io] = (pvh_hypercall_t *)do_console_io, + [__HYPERVISOR_grant_table_op] = (pvh_hypercall_t *)pvh_grant_table_op, + [__HYPERVISOR_vcpu_op] = (pvh_hypercall_t *)pvh_vcpu_op, + [__HYPERVISOR_mmuext_op] = (pvh_hypercall_t *)do_mmuext_op, + [__HYPERVISOR_xsm_op] = (pvh_hypercall_t *)do_xsm_op, + [__HYPERVISOR_sched_op] = (pvh_hypercall_t *)do_sched_op, + [__HYPERVISOR_event_channel_op]= (pvh_hypercall_t *)do_event_channel_op, + [__HYPERVISOR_physdev_op] = (pvh_hypercall_t *)pvh_physdev_op, + [__HYPERVISOR_hvm_op] = (pvh_hypercall_t *)do_pvh_hvm_op, + [__HYPERVISOR_sysctl] = (pvh_hypercall_t *)do_sysctl, + [__HYPERVISOR_domctl] = (pvh_hypercall_t *)do_domctl +}; + +/* 
fixme: Do we need to worry about this and slow things down in this path? */ +static int pvh_long_mode_enabled(void) +{ + /* A 64bit linux guest should always run in this mode with CS.L selecting + * either 64bit mode or 32bit compat mode */ + return 1; +} + +/* Check if hypercall is valid + * Returns: 0 if hcall is not valid with eax set to the errno to ret to guest + */ +static int hcall_valid(struct cpu_user_regs *regs) +{ + struct segment_register sreg; + + if (!pvh_long_mode_enabled()) + { + gdprintk(XENLOG_ERR, "PVH Error: Expected long mode set\n"); + return 1; + } + hvm_get_segment_register(current, x86_seg_ss, &sreg); + if ( unlikely(sreg.attr.fields.dpl == 3) ) + { + regs->eax = -EPERM; + return 0; + } + + /* domU''s are not allowed following hcalls */ + if ( !IS_PRIV(current->domain) && + (regs->eax == __HYPERVISOR_xsm_op || + regs->eax == __HYPERVISOR_platform_op || + regs->eax == __HYPERVISOR_domctl) ) { /* for privcmd mmap */ + + regs->eax = -EPERM; + return 0; + } + return 1; +} + +int pvh_do_hypercall(struct cpu_user_regs *regs) +{ + uint32_t hnum = regs->eax; + + if ( hnum >= NR_hypercalls || pvh_hypercall64_table[hnum] == NULL ) + { + gdprintk(XENLOG_WARNING, "PVH: Unimplemented HCALL:%d. Returning " + "-ENOSYS. domid:%d IP:%lx SP:%lx\n", + hnum, current->domain->domain_id, regs->rip, regs->rsp); + regs->eax = -ENOSYS; + vmx_update_guest_eip(); + return HVM_HCALL_completed; + } + + if ( regs->eax == __HYPERVISOR_sched_op && regs->rdi == SCHEDOP_shutdown ) + { + regs->eax = -ENOSYS; + vmx_update_guest_eip(); + + /* PVH fixme: show_guest_stack() from domain crash causes PF */ + domain_crash_synchronous(); + return HVM_HCALL_completed; + } + + if ( !hcall_valid(regs) ) + return HVM_HCALL_completed; + + current->arch.hvm_vcpu.hcall_preempted = 0; + regs->rax = pvh_hypercall64_table[hnum](regs->rdi, regs->rsi, regs->rdx, + regs->r10, regs->r8, regs->r9); + + if ( current->arch.hvm_vcpu.hcall_preempted ) + return HVM_HCALL_preempted; + + return HVM_HCALL_completed; +} + diff --git a/xen/arch/x86/hvm/vmx/Makefile b/xen/arch/x86/hvm/vmx/Makefile index 373b3d9..8b71dae 100644 --- a/xen/arch/x86/hvm/vmx/Makefile +++ b/xen/arch/x86/hvm/vmx/Makefile @@ -5,3 +5,4 @@ obj-y += vmcs.o obj-y += vmx.o obj-y += vpmu_core2.o obj-y += vvmx.o +obj-y += vmx_pvh.o diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c index 194c87b..5503fc9 100644 --- a/xen/arch/x86/hvm/vmx/vmx.c +++ b/xen/arch/x86/hvm/vmx/vmx.c @@ -1529,6 +1529,8 @@ static struct hvm_function_table __read_mostly vmx_function_table = { .virtual_intr_delivery_enabled = vmx_virtual_intr_delivery_enabled, .process_isr = vmx_process_isr, .nhvm_hap_walk_L1_p2m = nvmx_hap_walk_L1_p2m, + .pvh_set_vcpu_info = vmx_pvh_set_vcpu_info, + .pvh_read_descriptor = vmx_pvh_read_descriptor, }; struct hvm_function_table * __init start_vmx(void) @@ -2364,6 +2366,11 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs) if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) ) return vmx_failed_vmentry(exit_reason, regs); + if ( is_pvh_vcpu(v) ) { + vmx_pvh_vmexit_handler(regs); + return; + } + if ( v->arch.hvm_vmx.vmx_realmode ) { /* Put RFLAGS back the way the guest wants it */ diff --git a/xen/arch/x86/hvm/vmx/vmx_pvh.c b/xen/arch/x86/hvm/vmx/vmx_pvh.c new file mode 100644 index 0000000..14ca0f6 --- /dev/null +++ b/xen/arch/x86/hvm/vmx/vmx_pvh.c @@ -0,0 +1,587 @@ +/* + * Copyright (C) 2013, Mukesh Rathor, Oracle Corp. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <xen/hypercall.h> +#include <xen/guest_access.h> +#include <asm/p2m.h> +#include <asm/traps.h> +#include <asm/hvm/vmx/vmx.h> +#include <public/sched.h> +#include <asm/pvh.h> + +volatile int pvhdbg=0; +#define dbgp1(...) {(pvhdbg==1) ? printk(__VA_ARGS__):0;} +#define dbgp2(...) {(pvhdbg==2) ? printk(__VA_ARGS__):0;} + + +static void read_vmcs_selectors(struct cpu_user_regs *regs) +{ + regs->cs = __vmread(GUEST_CS_SELECTOR); + regs->ss = __vmread(GUEST_SS_SELECTOR); + regs->ds = __vmread(GUEST_DS_SELECTOR); + regs->es = __vmread(GUEST_ES_SELECTOR); + regs->gs = __vmread(GUEST_GS_SELECTOR); + regs->fs = __vmread(GUEST_FS_SELECTOR); +} + +/* returns : 0 success */ +static int vmxit_msr_read(struct cpu_user_regs *regs) +{ + int rc=1; + + u64 msr_content = 0; + switch (regs->ecx) + { + case MSR_IA32_MISC_ENABLE: + { + rdmsrl(MSR_IA32_MISC_ENABLE, msr_content); + msr_content |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL | + MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL; + break; + } + default: + { + /* fixme: see hvm_msr_read_intercept() */ + rdmsrl(regs->ecx, msr_content); + break; + } + } + regs->eax = (uint32_t)msr_content; + regs->edx = (uint32_t)(msr_content >> 32); + vmx_update_guest_eip(); + rc = 0; + + dbgp1("msr read c:%lx a:%lx d:%lx RIP:%lx RSP:%lx\n", regs->ecx, regs->eax, + regs->edx, regs->rip, regs->rsp); + return rc; +} + +/* returns : 0 success */ +static int vmxit_msr_write(struct cpu_user_regs *regs) +{ + uint64_t msr_content = (uint32_t)regs->eax | ((uint64_t)regs->edx << 32); + int rc=1; + + dbgp1("PVH: msr write:0x%lx. eax:0x%lx edx:0x%lx\n", regs->ecx, + regs->eax,regs->edx); + + if ( hvm_msr_write_intercept(regs->ecx, msr_content) == X86EMUL_OKAY ) { + vmx_update_guest_eip(); + rc = 0; + } + return rc; +} + +/* Returns: rc == 0: handled the MTF vmexit */ +static int vmxit_mtf(struct cpu_user_regs *regs) +{ + struct vcpu *vp = current; + int rc=1, ss=vp->arch.hvm_vcpu.single_step; + + vp->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG; + __vmwrite(CPU_BASED_VM_EXEC_CONTROL, vp->arch.hvm_vmx.exec_control); + vp->arch.hvm_vcpu.single_step = 0; + + if ( vp->domain->debugger_attached && ss ) { + domain_pause_for_debugger(); + rc = 0; + } + return rc; +} + +static int vmxit_int3(struct cpu_user_regs *regs) +{ + int ilen = vmx_get_instruction_length(); + struct vcpu *vp = current; + struct hvm_trap trap_info = { + .vector = TRAP_int3, + .type = X86_EVENTTYPE_SW_EXCEPTION, + .error_code = HVM_DELIVER_NO_ERROR_CODE, + .insn_len = ilen + }; + + regs->eip += ilen; + + /* gdbsx or another debugger. 
Never pause dom0 */ + if ( vp->domain->domain_id != 0 && guest_kernel_mode(vp, regs) ) + { + dbgp1("[%d]PVH: domain pause for debugger\n", smp_processor_id()); + current->arch.gdbsx_vcpu_event = TRAP_int3; + domain_pause_for_debugger(); + return 0; + } + + regs->eip -= ilen; + hvm_inject_trap(&trap_info); + + return 0; +} + +static int vmxit_invalid_op(struct cpu_user_regs *regs) +{ + ulong addr=0; + + if ( guest_kernel_mode(current, regs) || + (addr = emulate_forced_invalid_op(regs)) == 0 ) + { + hvm_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE); + return 0; + } + + if (addr != EXCRET_fault_fixed) + hvm_inject_page_fault(0, addr); + + return 0; +} + +/* Returns: rc == 0: handled the exception/NMI */ +static int vmxit_exception(struct cpu_user_regs *regs) +{ + unsigned int vector = (__vmread(VM_EXIT_INTR_INFO)) & INTR_INFO_VECTOR_MASK; + int rc=1; + struct vcpu *vp = current; + + dbgp2(" EXCPT: vec:%d cs:%lx r.IP:%lx\n", vector, + __vmread(GUEST_CS_SELECTOR), regs->eip); + + if (vector == TRAP_debug) { + unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION); + write_debugreg(6, exit_qualification | 0xffff0ff0); + rc = 0; + + /* gdbsx or another debugger */ + if ( vp->domain->domain_id != 0 && /* never pause dom0 */ + guest_kernel_mode(vp, regs) && vp->domain->debugger_attached ) + { + domain_pause_for_debugger(); + } else { + hvm_inject_hw_exception(TRAP_debug, HVM_DELIVER_NO_ERROR_CODE); + } + } + if (vector == TRAP_int3) { + rc = vmxit_int3(regs); + + } else if (vector == TRAP_invalid_op) { + rc = vmxit_invalid_op(regs); + + } else if (vector == TRAP_no_device) { + hvm_funcs.fpu_dirty_intercept(); /* calls vmx_fpu_dirty_intercept */ + rc = 0; + + } else if (vector == TRAP_gp_fault) { + regs->error_code = __vmread(VM_EXIT_INTR_ERROR_CODE); + /* hvm_inject_hw_exception(TRAP_gp_fault, regs->error_code); */ + rc = 1; + + } else if (vector == TRAP_page_fault) { + printk("PVH: Unexpected vector page_fault. IP:%lx\n", regs->eip); + rc = 1; + } + if (rc) + printk("PVH: Unhandled trap vector:%d. IP:%lx\n", vector, regs->eip); + + return rc; +} + +static int vmxit_invlpg(void) +{ + ulong vaddr = __vmread(EXIT_QUALIFICATION); + + vmx_update_guest_eip(); + vpid_sync_vcpu_gva(current, vaddr); + return 0; +} + +static int vmxit_vmcall(struct cpu_user_regs *regs) +{ + if ( pvh_do_hypercall(regs) != HVM_HCALL_preempted) + vmx_update_guest_eip(); + + return 0;; +} + +/* Returns: rc == 0: success */ +static int access_cr0(struct cpu_user_regs *regs, uint acc_typ, + uint64_t *regp) +{ + struct vcpu *vp = current; + + if (acc_typ == VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR ) + { + unsigned long new_cr0 = *regp; + unsigned long old_cr0 = __vmread(GUEST_CR0); + + dbgp2("PVH:writing to CR0. RIP:%lx val:0x%lx\n", regs->rip, *regp); + if ( (u32)new_cr0 != new_cr0 ) + { + HVM_DBG_LOG(DBG_LEVEL_1, + "Guest setting upper 32 bits in CR0: %lx", new_cr0); + return 1; + } + + new_cr0 &= ~HVM_CR0_GUEST_RESERVED_BITS; + /* ET is reserved and should be always be 1. */ + new_cr0 |= X86_CR0_ET; + + /* pvh cannot change to real mode */ + if ( (new_cr0 & (X86_CR0_PE|X86_CR0_PG)) != (X86_CR0_PG|X86_CR0_PE) ) { + printk("PVH attempting to turn off PE/PG. 
CR0:%lx\n", new_cr0); + return 1; + } + /* TS going from 1 to 0 */ + if ( (old_cr0 & X86_CR0_TS) && ((new_cr0 & X86_CR0_TS)==0) ) + vmx_fpu_enter(vp); + + vp->arch.hvm_vcpu.hw_cr[0] = vp->arch.hvm_vcpu.guest_cr[0] = new_cr0; + __vmwrite(GUEST_CR0, new_cr0); + __vmwrite(CR0_READ_SHADOW, new_cr0); + } else { + *regp = __vmread(GUEST_CR0); + } + return 0; +} + +/* Returns: rc == 0: success */ +static int access_cr4(struct cpu_user_regs *regs, uint acc_typ, + uint64_t *regp) +{ + if (acc_typ == VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR ) + { + u64 old_cr4 = __vmread(GUEST_CR4); + + if ( (old_cr4 ^ (*regp)) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) ) + vpid_sync_all(); + + /* pvh_verify_cr4_wr(*regp)); */ + __vmwrite(GUEST_CR4, *regp); + } else { + *regp = __vmread(GUEST_CR4); + } + return 0; +} + +/* Returns: rc == 0: success */ +static int vmxit_cr_access(struct cpu_user_regs *regs) +{ + unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION); + uint acc_typ = VMX_CONTROL_REG_ACCESS_TYPE(exit_qualification); + int cr, rc = 1; + + switch ( acc_typ ) + { + case VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR: + case VMX_CONTROL_REG_ACCESS_TYPE_MOV_FROM_CR: + { + uint gpr = VMX_CONTROL_REG_ACCESS_GPR(exit_qualification); + uint64_t *regp = decode_register(gpr, regs, 0); + cr = VMX_CONTROL_REG_ACCESS_NUM(exit_qualification); + + if (regp == NULL) + break; + + /* pl don''t embed switch statements */ + if (cr == 0) + rc = access_cr0(regs, acc_typ, regp); + else if (cr == 3) { + printk("PVH: d%d: unexpected cr3 access vmexit. rip:%lx\n", + current->domain->domain_id, regs->rip); + domain_crash_synchronous(); + } else if (cr == 4) + rc = access_cr4(regs, acc_typ, regp); + + if (rc == 0) + vmx_update_guest_eip(); + break; + } + case VMX_CONTROL_REG_ACCESS_TYPE_CLTS: + { + struct vcpu *vp = current; + unsigned long cr0 = vp->arch.hvm_vcpu.guest_cr[0] & ~X86_CR0_TS; + vp->arch.hvm_vcpu.hw_cr[0] = vp->arch.hvm_vcpu.guest_cr[0] = cr0; + vmx_fpu_enter(vp); + __vmwrite(GUEST_CR0, cr0); + __vmwrite(CR0_READ_SHADOW, cr0); + vmx_update_guest_eip(); + rc = 0; + } + } + return rc; +} + +/* NOTE: a PVH sets IOPL natively by setting bits in the eflags and not by + * hypercalls used by a PV */ +static int vmxit_io_instr(struct cpu_user_regs *regs) +{ + int curr_lvl; + int requested = (regs->rflags >> 12) & 3; + + read_vmcs_selectors(regs); + curr_lvl = regs->cs & 3; + + if (requested >= curr_lvl && emulate_privileged_op(regs)) + return 0; + + hvm_inject_hw_exception(TRAP_gp_fault, regs->error_code); + return 0; +} + +static int pvh_ept_handle_violation(unsigned long qualification, + paddr_t gpa, struct cpu_user_regs *regs) +{ + unsigned long gla, gfn = gpa >> PAGE_SHIFT; + p2m_type_t p2mt; + mfn_t mfn = get_gfn_query_unlocked(current->domain, gfn, &p2mt); + + gdprintk(XENLOG_ERR, "Dom:%d EPT violation %#lx (%c%c%c/%c%c%c), " + "gpa %#"PRIpaddr", mfn %#lx, type %i. IP:0x%lx RSP:0x%lx\n", + current->domain->domain_id, qualification, + (qualification & EPT_READ_VIOLATION) ? ''r'' : ''-'', + (qualification & EPT_WRITE_VIOLATION) ? ''w'' : ''-'', + (qualification & EPT_EXEC_VIOLATION) ? ''x'' : ''-'', + (qualification & EPT_EFFECTIVE_READ) ? ''r'' : ''-'', + (qualification & EPT_EFFECTIVE_WRITE) ? ''w'' : ''-'', + (qualification & EPT_EFFECTIVE_EXEC) ? 
''x'' : ''-'', + gpa, mfn_x(mfn), p2mt, regs->rip, regs->rsp); + + ept_walk_table(current->domain, gfn); + + if ( qualification & EPT_GLA_VALID ) + { + gla = __vmread(GUEST_LINEAR_ADDRESS); + gdprintk(XENLOG_ERR, " --- GLA %#lx\n", gla); + } + + hvm_inject_hw_exception(TRAP_gp_fault, 0); + return 0; +} + +static void pvh_user_cpuid(struct cpu_user_regs *regs) +{ + unsigned int eax, ebx, ecx, edx; + + asm volatile ( "cpuid" + : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) + : "0" (regs->eax), "2" (regs->rcx) ); + + regs->rax = eax; regs->rbx = ebx; regs->rcx = ecx; regs->rdx = edx; +} + +/* + * Main exit handler for PVH case. Called from vmx_vmexit_handler(). + * Note: in vmx_asm_vmexit_handler, rip/rsp/eflags are updated in regs{} + */ +void vmx_pvh_vmexit_handler(struct cpu_user_regs *regs) +{ + unsigned long exit_qualification; + unsigned int exit_reason = __vmread(VM_EXIT_REASON); + int rc=0, ccpu = smp_processor_id(); + struct vcpu *vp = current; + + dbgp1("PVH:[%d]left VMCS exitreas:%d RIP:%lx RSP:%lx EFLAGS:%lx CR0:%lx\n", + ccpu, exit_reason, regs->rip, regs->rsp, regs->rflags, + __vmread(GUEST_CR0)); + + /* for guest_kernel_mode() */ + regs->cs = __vmread(GUEST_CS_SELECTOR); + + switch ( (uint16_t)exit_reason ) + { + case EXIT_REASON_EXCEPTION_NMI: /* 0 */ + rc = vmxit_exception(regs); + break; + + case EXIT_REASON_EXTERNAL_INTERRUPT: /* 1 */ + case EXIT_REASON_MCE_DURING_VMENTRY: /* 41 */ + break; /* handled in vmx_vmexit_handler() */ + + case EXIT_REASON_TRIPLE_FAULT: /* 2 */ + { + printk("PVH:Triple Flt:[%d] RIP:%lx RSP:%lx EFLAGS:%lx CR3:%lx\n", + ccpu, regs->rip, regs->rsp, regs->rflags, + __vmread(GUEST_CR3)); + + rc = 1; + break; + } + case EXIT_REASON_PENDING_VIRT_INTR: /* 7 */ + { + struct vcpu *v = current; + + /* Disable the interrupt window. 
*/ + v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; + __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control); + break; + } + + case EXIT_REASON_CPUID: /* 10 */ + { + if ( guest_kernel_mode(vp, regs) ) { + pv_cpuid(regs); + } else + pvh_user_cpuid(regs); + + vmx_update_guest_eip(); + dbgp2("cpuid:%ld RIP:%lx\n", regs->eax, regs->rip); + break; + } + + case EXIT_REASON_HLT: /* 12 */ + { + vmx_update_guest_eip(); + hvm_hlt(regs->eflags); + break; + } + + case EXIT_REASON_INVLPG: /* 14 */ + rc = vmxit_invlpg(); + break; + + case EXIT_REASON_RDTSC: /* 16 */ + rc = 1; + break; + + case EXIT_REASON_VMCALL: /* 18 */ + rc = vmxit_vmcall(regs); + break; + + case EXIT_REASON_CR_ACCESS: /* 28 */ + rc = vmxit_cr_access(regs); + break; + + case EXIT_REASON_DR_ACCESS: /* 29 */ + { + exit_qualification = __vmread(EXIT_QUALIFICATION); + vmx_dr_access(exit_qualification, regs); + break; + } + + case EXIT_REASON_IO_INSTRUCTION: + vmxit_io_instr(regs); + break; + + case EXIT_REASON_MSR_READ: /* 31 */ + rc = vmxit_msr_read(regs); + break; + + case EXIT_REASON_MSR_WRITE: /* 32 */ + rc = vmxit_msr_write(regs); + break; + + case EXIT_REASON_MONITOR_TRAP_FLAG: /* 37 */ + rc = vmxit_mtf(regs); + break; + + case EXIT_REASON_EPT_VIOLATION: + { + paddr_t gpa = __vmread(GUEST_PHYSICAL_ADDRESS); + exit_qualification = __vmread(EXIT_QUALIFICATION); + rc = pvh_ept_handle_violation(exit_qualification, gpa, regs); + break; + } + default: + rc = 1; + printk("PVH: Unexpected exit reason:%d 0x%x\n", exit_reason, + exit_reason); + } + if (rc) { + exit_qualification = __vmread(EXIT_QUALIFICATION); + printk("PVH: [%d] exit_reas:%d 0x%x qual:%ld 0x%lx cr0:0x%016lx\n", + ccpu, exit_reason, exit_reason, exit_qualification, + exit_qualification, __vmread(GUEST_CR0)); + printk("PVH: [%d] RIP:%lx RSP:%lx\n", ccpu, regs->rip, regs->rsp); + domain_crash_synchronous(); + } +} + +/* + * Sets info for non boot vcpu. VCPU 0 context is set by library. + * We use this for nonboot vcpu in which case the call comes from the + * kernel cpu_initialize_context(). 
+ */ +int vmx_pvh_set_vcpu_info(struct vcpu *v, struct vcpu_guest_context *ctxtp) +{ + if (v->vcpu_id == 0) + return 0; + + vmx_vmcs_enter(v); + __vmwrite(GUEST_GDTR_BASE, ctxtp->u.pvh.gdtaddr); + __vmwrite(GUEST_GDTR_LIMIT, ctxtp->u.pvh.gdtsz); + __vmwrite(GUEST_GS_BASE, ctxtp->gs_base_user); + + __vmwrite(GUEST_CS_SELECTOR, ctxtp->user_regs.cs); + __vmwrite(GUEST_DS_SELECTOR, ctxtp->user_regs.ds); + __vmwrite(GUEST_ES_SELECTOR, ctxtp->user_regs.es); + __vmwrite(GUEST_SS_SELECTOR, ctxtp->user_regs.ss); + __vmwrite(GUEST_GS_SELECTOR, ctxtp->user_regs.gs); + + if ( vmx_add_guest_msr(MSR_SHADOW_GS_BASE) ) + return -EINVAL; + + vmx_write_guest_msr(MSR_SHADOW_GS_BASE, ctxtp->gs_base_kernel); + + vmx_vmcs_exit(v); + return 0; +} + +int vmx_pvh_read_descriptor(unsigned int sel, const struct vcpu *v, + const struct cpu_user_regs *regs, + unsigned long *base, unsigned long *limit, + unsigned int *ar) +{ + unsigned int tmp_ar = 0; + BUG_ON(v!=current); + BUG_ON(!is_pvh_vcpu(v)); + + if (sel == (unsigned int)regs->cs) { + *base = __vmread(GUEST_CS_BASE); + *limit = __vmread(GUEST_CS_LIMIT); + tmp_ar = __vmread(GUEST_CS_AR_BYTES); + } else if (sel == (unsigned int)regs->ds) { + *base = __vmread(GUEST_DS_BASE); + *limit = __vmread(GUEST_DS_LIMIT); + tmp_ar = __vmread(GUEST_DS_AR_BYTES); + } else if (sel == (unsigned int)regs->ss) { + *base = __vmread(GUEST_SS_BASE); + *limit = __vmread(GUEST_SS_LIMIT); + tmp_ar = __vmread(GUEST_SS_AR_BYTES); + } else if (sel == (unsigned int)regs->gs) { + *base = __vmread(GUEST_GS_BASE); + *limit = __vmread(GUEST_GS_LIMIT); + tmp_ar = __vmread(GUEST_GS_AR_BYTES); + } else if (sel == (unsigned int)regs->fs) { + *base = __vmread(GUEST_FS_BASE); + *limit = __vmread(GUEST_FS_LIMIT); + tmp_ar = __vmread(GUEST_FS_AR_BYTES); + } else if (sel == (unsigned int)regs->es) { + *base = __vmread(GUEST_ES_BASE); + *limit = __vmread(GUEST_ES_LIMIT); + tmp_ar = __vmread(GUEST_ES_AR_BYTES); + } else { + printk("Unmatched segment selector:%d\n", sel); + return 0; + } + + if (tmp_ar & X86_SEG_AR_CS_LM_ACTIVE) { /* x86 mess!! 
*/ + *base = 0UL; + *limit = ~0UL; + } + /* Fixup ar so that it looks the same as in native mode */ + *ar = (tmp_ar << 8); + return 1; +} + diff --git a/xen/include/asm-x86/hvm/vmx/vmx.h b/xen/include/asm-x86/hvm/vmx/vmx.h index a742e16..5679e8d 100644 --- a/xen/include/asm-x86/hvm/vmx/vmx.h +++ b/xen/include/asm-x86/hvm/vmx/vmx.h @@ -189,6 +189,7 @@ void vmx_update_secondary_exec_control(struct vcpu *v); * Access Rights */ #define X86_SEG_AR_SEG_TYPE 0xf /* 3:0, segment type */ +#define X86_SEG_AR_SEG_TYPE_CODE (1u << 3) /* code (vs data) segment */ #define X86_SEG_AR_DESC_TYPE (1u << 4) /* 4, descriptor type */ #define X86_SEG_AR_DPL 0x60 /* 6:5, descriptor privilege level */ #define X86_SEG_AR_SEG_PRESENT (1u << 7) /* 7, segment present */ @@ -442,10 +443,14 @@ void ept_p2m_uninit(struct p2m_domain *p2m); void ept_walk_table(struct domain *d, unsigned long gfn); void setup_ept_dump(void); - void vmx_update_guest_eip(void); void vmx_dr_access(unsigned long exit_qualification,struct cpu_user_regs *regs); void vmx_do_extint(struct cpu_user_regs *regs); +void vmx_pvh_vmexit_handler(struct cpu_user_regs *regs); +int vmx_pvh_set_vcpu_info(struct vcpu *v, struct vcpu_guest_context *ctxtp); +int vmx_pvh_read_descriptor(unsigned int sel, const struct vcpu *v, + const struct cpu_user_regs *regs, unsigned long *base, + unsigned long *limit, unsigned int *ar); int alloc_p2m_hap_data(struct p2m_domain *p2m); void free_p2m_hap_data(struct p2m_domain *p2m); diff --git a/xen/include/asm-x86/pvh.h b/xen/include/asm-x86/pvh.h new file mode 100644 index 0000000..73e59d3 --- /dev/null +++ b/xen/include/asm-x86/pvh.h @@ -0,0 +1,6 @@ +#ifndef __ASM_X86_PVH_H__ +#define __ASM_X86_PVH_H__ + +int pvh_do_hypercall(struct cpu_user_regs *regs); + +#endif /* __ASM_X86_PVH_H__ */ -- 1.7.2.3
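The pvh_do_hypercall() dispatcher in pvh.c above is entered from the EXIT_REASON_VMCALL case of vmx_pvh_vmexit_handler(), with the hypercall number in rax and arguments in rdi/rsi/rdx/r10/r8/r9 (the regular 64-bit hypercall ABI). The sketch below shows the guest-side form for a two-argument call; a real guest would normally go through the hypercall page rather than issuing vmcall directly, so treat it purely as illustration.

/* Sketch: 64-bit PVH hypercall with two arguments via vmcall. */
static inline long pvh_hypercall2(unsigned int nr,
                                  unsigned long a1, unsigned long a2)
{
    long ret;

    asm volatile ( "vmcall"
                   : "=a" (ret)
                   : "a" ((unsigned long)nr), "D" (a1), "S" (a2)
                   : "memory" );
    return ret;
}

/* e.g. pvh_hypercall2(__HYPERVISOR_xen_version, XENVER_version, 0); */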
Mukesh Rathor
2013-Mar-19 22:33 UTC
[PATCH 11/18 V2 RESEND]: PVH xen: some misc changes like mtrr, intr, msi.
Changes in irq.c as PVH doesn''t use vlapic emulation. In mtrr we add assert and set MTRR_TYPEs for PVH. Changes in V2: - Some cleanup of redundant code. - time.c: Honor no rdtsc exiting for PVH by setting vtsc to 0 in time.c Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com> --- xen/arch/x86/hvm/irq.c | 3 +++ xen/arch/x86/hvm/mtrr.c | 10 ++++++++++ xen/arch/x86/hvm/vmx/intr.c | 7 ++++--- xen/arch/x86/msi.c | 9 +++++++-- xen/arch/x86/time.c | 2 +- 5 files changed, 25 insertions(+), 6 deletions(-) diff --git a/xen/arch/x86/hvm/irq.c b/xen/arch/x86/hvm/irq.c index 9eae5de..92fb245 100644 --- a/xen/arch/x86/hvm/irq.c +++ b/xen/arch/x86/hvm/irq.c @@ -405,6 +405,9 @@ struct hvm_intack hvm_vcpu_has_pending_irq(struct vcpu *v) && vcpu_info(v, evtchn_upcall_pending) ) return hvm_intack_vector(plat->irq.callback_via.vector); + if ( is_pvh_vcpu(v) ) + return hvm_intack_none; + if ( vlapic_accept_pic_intr(v) && plat->vpic[0].int_output ) return hvm_intack_pic(0); diff --git a/xen/arch/x86/hvm/mtrr.c b/xen/arch/x86/hvm/mtrr.c index ef51a8d..8057e88 100644 --- a/xen/arch/x86/hvm/mtrr.c +++ b/xen/arch/x86/hvm/mtrr.c @@ -578,6 +578,9 @@ int32_t hvm_set_mem_pinned_cacheattr( { struct hvm_mem_pinned_cacheattr_range *range; + /* PVH note: The guest writes to MSR_IA32_CR_PAT natively */ + ASSERT( !is_pvh_domain(d) ); + if ( !((type == PAT_TYPE_UNCACHABLE) || (type == PAT_TYPE_WRCOMB) || (type == PAT_TYPE_WRTHROUGH) || @@ -693,6 +696,13 @@ uint8_t epte_get_entry_emt(struct domain *d, unsigned long gfn, mfn_t mfn, ((d->vcpu == NULL) || ((v = d->vcpu[0]) == NULL)) ) return MTRR_TYPE_WRBACK; + /* PVH: fixme/help: do I have this correct? */ + if ( is_pvh_domain(d) ) { + if (direct_mmio) + return MTRR_TYPE_UNCACHABLE; + return MTRR_TYPE_WRBACK; + } + if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_IDENT_PT] ) return MTRR_TYPE_WRBACK; diff --git a/xen/arch/x86/hvm/vmx/intr.c b/xen/arch/x86/hvm/vmx/intr.c index e376f3c..b94f9d5 100644 --- a/xen/arch/x86/hvm/vmx/intr.c +++ b/xen/arch/x86/hvm/vmx/intr.c @@ -219,15 +219,16 @@ void vmx_intr_assist(void) return; } - /* Crank the handle on interrupt state. */ - pt_vector = pt_update_irq(v); + if ( !is_pvh_vcpu(v) ) + /* Crank the handle on interrupt state. */ + pt_vector = pt_update_irq(v); do { intack = hvm_vcpu_has_pending_irq(v); if ( likely(intack.source == hvm_intsrc_none) ) goto out; - if ( unlikely(nvmx_intr_intercept(v, intack)) ) + if ( !is_pvh_vcpu(v) && unlikely(nvmx_intr_intercept(v, intack)) ) goto out; intblk = hvm_interrupt_blocked(v, intack); diff --git a/xen/arch/x86/msi.c b/xen/arch/x86/msi.c index 8804306..0fe3108 100644 --- a/xen/arch/x86/msi.c +++ b/xen/arch/x86/msi.c @@ -787,10 +787,15 @@ static int msix_capability_init(struct pci_dev *dev, if ( !dev->msix_used_entries ) { - if ( rangeset_add_range(mmio_ro_ranges, dev->msix_table.first, + /* PVH: this is temporary only until linux msi.c is fixed. See xen-devel + * thread: "[PVH]: Help: msi.c". 
+ */ + if ( !is_pvh_domain(dev->domain) && + rangeset_add_range(mmio_ro_ranges, dev->msix_table.first, dev->msix_table.last) ) WARN(); - if ( rangeset_add_range(mmio_ro_ranges, dev->msix_pba.first, + if ( !is_pvh_domain(dev->domain) && + rangeset_add_range(mmio_ro_ranges, dev->msix_pba.first, dev->msix_pba.last) ) WARN(); diff --git a/xen/arch/x86/time.c b/xen/arch/x86/time.c index 56bffdb..eaa1989 100644 --- a/xen/arch/x86/time.c +++ b/xen/arch/x86/time.c @@ -1879,7 +1879,7 @@ void tsc_set_info(struct domain *d, uint32_t tsc_mode, uint64_t elapsed_nsec, uint32_t gtsc_khz, uint32_t incarnation) { - if ( is_idle_domain(d) || (d->domain_id == 0) ) + if ( is_idle_domain(d) || (d->domain_id == 0) || is_pvh_domain(d) ) { d->arch.vtsc = 0; return; -- 1.7.2.3
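The hvm_vcpu_has_pending_irq() change above means event-channel notifications for PVH arrive only through the guest-registered callback vector. A sketch of how a guest registers that vector follows; the encoding (type 2 in bits 63:56, vector in bits 7:0) matches the HVM_PARAM_CALLBACK_IRQ handling added for PVH in the previous patch, while HYPERVISOR_hvm_op() is an assumed guest wrapper.

/* Sketch: ask Xen to deliver event-channel upcalls on a fixed vector. */
static int register_callback_vector(uint8_t vector)
{
    struct xen_hvm_param hp = {
        .domid = DOMID_SELF,
        .index = HVM_PARAM_CALLBACK_IRQ,
        .value = ((uint64_t)2 << 56) | vector,   /* type 2 == per-vcpu vector */
    };

    return HYPERVISOR_hvm_op(HVMOP_set_param, &hp);
}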
Mukesh Rathor
2013-Mar-19 22:33 UTC
[PATCH 12/18 V2 RESEND]: PVH xen: support invalid op, return PVH features etc...
The biggest change in this patch is in traps.c to allow forced invalid op for PVH guest. Also, enable hypercall page init for PVH guest also. Finally, set guest type to PVH if PV with HAP is created. Changes in V2: - Fix emulate_forced_invalid_op() to use proper copy function, and inject PF in case it fails. - remove extraneous PVH check in STI/CLI ops en emulate_privileged_op(). - Make assert a debug ASSERT in show_registers(). - debug.c: keep get_gfn() locked and move put_gfn closer to it. Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com> --- xen/arch/x86/debug.c | 9 ++++----- xen/arch/x86/traps.c | 43 +++++++++++++++++++++++++++++++++++++------ xen/arch/x86/x86_64/traps.c | 5 +++-- xen/common/domain.c | 9 +++++++++ xen/common/domctl.c | 4 ++++ xen/common/kernel.c | 6 +++++- 6 files changed, 62 insertions(+), 14 deletions(-) diff --git a/xen/arch/x86/debug.c b/xen/arch/x86/debug.c index 502edbc..abe538f 100644 --- a/xen/arch/x86/debug.c +++ b/xen/arch/x86/debug.c @@ -59,7 +59,9 @@ dbg_hvm_va2mfn(dbgva_t vaddr, struct domain *dp, int toaddr, return INVALID_MFN; } - mfn = mfn_x(get_gfn(dp, *gfn, &gfntype)); + mfn = mfn_x(get_gfn_query(dp, *gfn, &gfntype)); + put_gfn(dp, *gfn); + if ( p2m_is_readonly(gfntype) && toaddr ) { DBGP2("kdb:p2m_is_readonly: gfntype:%x\n", gfntype); @@ -158,7 +160,7 @@ dbg_rw_guest_mem(dbgva_t addr, dbgbyte_t *buf, int len, struct domain *dp, pagecnt = min_t(long, PAGE_SIZE - (addr & ~PAGE_MASK), len); - mfn = (is_hvm_domain(dp) + mfn = (is_hvm_or_pvh_domain(dp) ? dbg_hvm_va2mfn(addr, dp, toaddr, &gfn) : dbg_pv_va2mfn(addr, dp, pgd3)); if ( mfn == INVALID_MFN ) @@ -178,9 +180,6 @@ dbg_rw_guest_mem(dbgva_t addr, dbgbyte_t *buf, int len, struct domain *dp, } unmap_domain_page(va); - if ( gfn != INVALID_GFN ) - put_gfn(dp, gfn); - addr += pagecnt; buf += pagecnt; len -= pagecnt; diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c index ab54f82..14656c1 100644 --- a/xen/arch/x86/traps.c +++ b/xen/arch/x86/traps.c @@ -459,6 +459,10 @@ static void instruction_done( struct cpu_user_regs *regs, unsigned long eip, unsigned int bpmatch) { regs->eip = eip; + + if ( is_pvh_vcpu(current) ) + return; + regs->eflags &= ~X86_EFLAGS_RF; if ( bpmatch || (regs->eflags & X86_EFLAGS_TF) ) { @@ -475,6 +479,10 @@ static unsigned int check_guest_io_breakpoint(struct vcpu *v, unsigned int width, i, match = 0; unsigned long start; + if ( is_pvh_vcpu(v) ) { + /* for pvh, ctrlreg field is not implemented/used unless we need to */ + return 0; + } if ( !(v->arch.debugreg[5]) || !(v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE) ) return 0; @@ -908,14 +916,18 @@ static int emulate_invalid_rdtscp(struct cpu_user_regs *regs) unsigned long emulate_forced_invalid_op(struct cpu_user_regs *regs) { char sig[5], instr[2]; - unsigned long eip, rc; + unsigned long eip, rc, addr; eip = regs->eip; /* Check for forced emulation signature: ud2 ; .ascii "xen". */ - if ( (rc = copy_from_user(sig, (char *)eip, sizeof(sig))) != 0 ) + if ( (rc = raw_copy_from_guest(sig, (char *)eip, sizeof(sig))) != 0 ) { - propagate_page_fault(eip + sizeof(sig) - rc, 0); + addr = eip + sizeof(sig) - rc; + if ( is_pvh_vcpu(current) ) + return addr; + + propagate_page_fault(addr, 0); return EXCRET_fault_fixed; } if ( memcmp(sig, "\xf\xbxen", sizeof(sig)) ) @@ -923,9 +935,13 @@ unsigned long emulate_forced_invalid_op(struct cpu_user_regs *regs) eip += sizeof(sig); /* We only emulate CPUID. 
*/ - if ( ( rc = copy_from_user(instr, (char *)eip, sizeof(instr))) != 0 ) + if ( ( rc = raw_copy_from_guest(instr, (char *)eip, sizeof(instr))) != 0 ) { - propagate_page_fault(eip + sizeof(instr) - rc, 0); + addr = eip + sizeof(instr) - rc; + if ( is_pvh_vcpu(current) ) + return addr; + + propagate_page_fault(addr, 0); return EXCRET_fault_fixed; } if ( memcmp(instr, "\xf\xa2", sizeof(instr)) ) @@ -1068,6 +1084,10 @@ void propagate_page_fault(unsigned long addr, u16 error_code) struct vcpu *v = current; struct trap_bounce *tb = &v->arch.pv_vcpu.trap_bounce; + /* PVH should not get here. ctrlreg is not implemented amongst other + * things */ + ASSERT( !is_pvh_vcpu(v) ); + v->arch.pv_vcpu.ctrlreg[2] = addr; arch_set_cr2(v, addr); @@ -1453,6 +1473,9 @@ static int read_descriptor(unsigned int sel, { struct desc_struct desc; + if ( is_pvh_vcpu(v) ) + return hvm_pvh_read_descriptor(sel, v, regs, base, limit, ar); + if ( !vm86_mode(regs) ) { if ( sel < 4) @@ -1571,6 +1594,11 @@ static int guest_io_okay( int user_mode = !(v->arch.flags & TF_kernel_mode); #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v) + /* for PVH we check this in vmexit for EXIT_REASON_IO_INSTRUCTION + * and so don''t need to check again here. */ + if (is_pvh_vcpu(v)) + return 1; + if ( !vm86_mode(regs) && (v->arch.pv_vcpu.iopl >= (guest_kernel_mode(v, regs) ? 1 : 3)) ) return 1; @@ -1816,7 +1844,7 @@ static inline uint64_t guest_misc_enable(uint64_t val) _ptr = (unsigned int)_ptr; \ if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) ) \ goto fail; \ - if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 ) \ + if ( (_rc = raw_copy_from_guest(&_x, (type *)_ptr, sizeof(_x))) != 0 ) \ { \ propagate_page_fault(_ptr + sizeof(_x) - _rc, 0); \ goto skip; \ @@ -3245,6 +3273,9 @@ void do_device_not_available(struct cpu_user_regs *regs) BUG_ON(!guest_mode(regs)); + /* PVH should not get here. ctrlreg is not implemented */ + ASSERT( !is_pvh_vcpu(curr) ); + vcpu_restore_fpu_lazy(curr); if ( curr->arch.pv_vcpu.ctrlreg[0] & X86_CR0_TS ) diff --git a/xen/arch/x86/x86_64/traps.c b/xen/arch/x86/x86_64/traps.c index d2f7209..47ec2ff 100644 --- a/xen/arch/x86/x86_64/traps.c +++ b/xen/arch/x86/x86_64/traps.c @@ -147,7 +147,7 @@ void vcpu_show_registers(const struct vcpu *v) unsigned long crs[8]; /* No need to handle HVM for now. */ - if ( is_hvm_vcpu(v) ) + if ( is_hvm_or_pvh_vcpu(v) ) return; crs[0] = v->arch.pv_vcpu.ctrlreg[0]; @@ -440,6 +440,7 @@ static long register_guest_callback(struct callback_register *reg) long ret = 0; struct vcpu *v = current; + ASSERT( !is_pvh_vcpu(v) ); if ( !is_canonical_address(reg->address) ) return -EINVAL; @@ -620,7 +621,7 @@ static void hypercall_page_initialise_ring3_kernel(void *hypercall_page) void hypercall_page_initialise(struct domain *d, void *hypercall_page) { memset(hypercall_page, 0xCC, PAGE_SIZE); - if ( is_hvm_domain(d) ) + if ( is_hvm_or_pvh_domain(d) ) hvm_hypercall_page_initialise(d, hypercall_page); else if ( !is_pv_32bit_domain(d) ) hypercall_page_initialise_ring3_kernel(hypercall_page); diff --git a/xen/common/domain.c b/xen/common/domain.c index b6f10b7..aac6699 100644 --- a/xen/common/domain.c +++ b/xen/common/domain.c @@ -232,6 +232,15 @@ struct domain *domain_create( if ( domcr_flags & DOMCRF_hvm ) d->guest_type = hvm_guest; + else if ( domcr_flags & DOMCRF_pvh ) { + d->guest_type = pvh_guest; + if ( !(domcr_flags & DOMCRF_hap) ) { + printk("PVH guest must have HAP on\n"); + goto fail; + } else + printk("PVH guest. Please note it is experimental. 
domid:%d\n", + domid); + } if ( domid == 0 ) { diff --git a/xen/common/domctl.c b/xen/common/domctl.c index c98e99c..ab615f1 100644 --- a/xen/common/domctl.c +++ b/xen/common/domctl.c @@ -149,6 +149,8 @@ void getdomaininfo(struct domain *d, struct xen_domctl_getdomaininfo *info) if ( is_hvm_domain(d) ) info->flags |= XEN_DOMINF_hvm_guest; + else if ( is_pvh_domain(d) ) + info->flags |= XEN_DOMINF_pvh_guest; xsm_security_domaininfo(d, info); @@ -400,6 +402,8 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) domcr_flags = 0; if ( op->u.createdomain.flags & XEN_DOMCTL_CDF_hvm_guest ) domcr_flags |= DOMCRF_hvm; + else if ( op->u.createdomain.flags & XEN_DOMCTL_CDF_hap ) + domcr_flags |= DOMCRF_pvh; /* PV with HAP is a PVH guest */ if ( op->u.createdomain.flags & XEN_DOMCTL_CDF_hap ) domcr_flags |= DOMCRF_hap; if ( op->u.createdomain.flags & XEN_DOMCTL_CDF_s3_integrity ) diff --git a/xen/common/kernel.c b/xen/common/kernel.c index 72fb905..3bba758 100644 --- a/xen/common/kernel.c +++ b/xen/common/kernel.c @@ -289,7 +289,11 @@ DO(xen_version)(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg) if ( current->domain == dom0 ) fi.submap |= 1U << XENFEAT_dom0; #ifdef CONFIG_X86 - if ( !is_hvm_vcpu(current) ) + if ( is_pvh_vcpu(current) ) + fi.submap |= (1U << XENFEAT_hvm_safe_pvclock) | + (1U << XENFEAT_supervisor_mode_kernel) | + (1U << XENFEAT_hvm_callback_vector); + else if ( !is_hvm_vcpu(current) ) fi.submap |= (1U << XENFEAT_mmu_pt_update_preserve_ad) | (1U << XENFEAT_highmem_assist) | (1U << XENFEAT_gnttab_map_avail_bits); -- 1.7.2.3
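The signature matched by emulate_forced_invalid_op() above is the usual forced-emulation prefix: ud2 (0x0f 0x0b), the ASCII bytes "xen", then the real instruction, of which only cpuid (0x0f 0xa2) is emulated. For reference, a guest issues it as below (equivalent in spirit to the Linux XEN_EMULATE_PREFIX-based cpuid); this is illustration only, not part of the patch.

/* Sketch: forced-emulation CPUID as recognised by the hunk above. */
static inline void xen_forced_cpuid(unsigned int *eax, unsigned int *ebx,
                                    unsigned int *ecx, unsigned int *edx)
{
    asm volatile ( ".byte 0x0f,0x0b; .ascii \"xen\"; cpuid"
                   : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
                   : "0" (*eax), "2" (*ecx) );
}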
Mukesh Rathor
2013-Mar-19 22:33 UTC
[PATCH 13/18 V2 RESEND]: PVH xen: introduce p2m_map_foreign
In this patch, I introduce a new type p2m_map_foreign for pages that a dom0 maps from foreign domains its creating. Also, add set_foreign_p2m_entry() to map p2m_map_foreign type pages. Other misc changes related to p2m. Changes in V2: - Make guest_physmap_add_entry() same for PVH in terms of overwriting old entry. - In set_foreign_p2m_entry() do locked get_gfn and not unlocked. - Replace ASSERT with return -EINVAL in do_physdev_op. - Remove unnecessary check for PVH in do_physdev_op(). Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com> --- xen/arch/x86/domctl.c | 19 +++++++++++++------ xen/arch/x86/mm/p2m-ept.c | 3 ++- xen/arch/x86/mm/p2m-pt.c | 3 ++- xen/arch/x86/mm/p2m.c | 30 +++++++++++++++++++++++++++++- xen/arch/x86/physdev.c | 8 ++++++++ xen/include/asm-x86/p2m.h | 4 ++++ 6 files changed, 58 insertions(+), 9 deletions(-) diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c index ecc8240..da49d6d 100644 --- a/xen/arch/x86/domctl.c +++ b/xen/arch/x86/domctl.c @@ -64,9 +64,10 @@ long domctl_memory_mapping(struct domain *d, unsigned long gfn, if ( add_map ) { - printk(XENLOG_G_INFO - "memory_map:add: dom%d gfn=%lx mfn=%lx nr=%lx\n", - d->domain_id, gfn, mfn, nr_mfns); + if ( !is_pvh_domain(d) ) /* PVH maps lots and lots */ + printk(XENLOG_G_INFO + "memory_map:add: dom%d gfn=%lx mfn=%lx nr=%lx\n", + d->domain_id, gfn, mfn, nr_mfns); ret = iomem_permit_access(d, mfn, mfn + nr_mfns - 1); if ( !ret && paging_mode_translate(d) ) @@ -89,9 +90,10 @@ long domctl_memory_mapping(struct domain *d, unsigned long gfn, } } } else { - printk(XENLOG_G_INFO - "memory_map:remove: dom%d gfn=%lx mfn=%lx nr=%lx\n", - d->domain_id, gfn, mfn, nr_mfns); + if ( !is_pvh_domain(d) ) /* PVH unmaps lots and lots */ + printk(XENLOG_G_INFO + "memory_map:remove: dom%d gfn=%lx mfn=%lx nr=%lx\n", + d->domain_id, gfn, mfn, nr_mfns); if ( paging_mode_translate(d) ) for ( i = 0; i < nr_mfns; i++ ) @@ -1307,6 +1309,11 @@ void arch_get_info_guest(struct vcpu *v, vcpu_guest_context_u c) c.nat->gs_base_kernel = hvm_get_shadow_gs_base(v); } } + else if ( is_pvh_vcpu(v) ) + { + /* fixme: punt it to phase II */ + printk("PVH: fixme: arch_get_info_guest()\n"); + } else { c(ldt_base = v->arch.pv_vcpu.ldt_base); diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c index a2d1591..38ea9ec 100644 --- a/xen/arch/x86/mm/p2m-ept.c +++ b/xen/arch/x86/mm/p2m-ept.c @@ -75,6 +75,7 @@ static void ept_p2m_type_to_flags(ept_entry_t *entry, p2m_type_t type, p2m_acces entry->w = 0; break; case p2m_grant_map_rw: + case p2m_map_foreign: entry->r = entry->w = 1; entry->x = 0; break; @@ -430,7 +431,7 @@ ept_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn, } /* Track the highest gfn for which we have ever had a valid mapping */ - if ( p2mt != p2m_invalid && + if ( p2mt != p2m_invalid && p2mt != p2m_mmio_dm && (gfn + (1UL << order) - 1 > p2m->max_mapped_pfn) ) p2m->max_mapped_pfn = gfn + (1UL << order) - 1; diff --git a/xen/arch/x86/mm/p2m-pt.c b/xen/arch/x86/mm/p2m-pt.c index 302b621..3f46418 100644 --- a/xen/arch/x86/mm/p2m-pt.c +++ b/xen/arch/x86/mm/p2m-pt.c @@ -89,6 +89,7 @@ static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn) case p2m_ram_rw: return flags | P2M_BASE_FLAGS | _PAGE_RW; case p2m_grant_map_rw: + case p2m_map_foreign: return flags | P2M_BASE_FLAGS | _PAGE_RW | _PAGE_NX_BIT; case p2m_mmio_direct: if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn_x(mfn)) ) @@ -429,7 +430,7 @@ p2m_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn, } /* Track the highest gfn for which we 
have ever had a valid mapping */ - if ( p2mt != p2m_invalid + if ( p2mt != p2m_invalid && p2mt != p2m_mmio_dm && (gfn + (1UL << page_order) - 1 > p2m->max_mapped_pfn) ) p2m->max_mapped_pfn = gfn + (1UL << page_order) - 1; diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c index 4837de3..6888cf1 100644 --- a/xen/arch/x86/mm/p2m.c +++ b/xen/arch/x86/mm/p2m.c @@ -523,7 +523,7 @@ p2m_remove_page(struct p2m_domain *p2m, unsigned long gfn, unsigned long mfn, for ( i = 0; i < (1UL << page_order); i++ ) { mfn_return = p2m->get_entry(p2m, gfn + i, &t, &a, 0, NULL); - if ( !p2m_is_grant(t) && !p2m_is_shared(t) ) + if ( !p2m_is_grant(t) && !p2m_is_shared(t) && !p2m_is_foreign(t) ) set_gpfn_from_mfn(mfn+i, INVALID_M2P_ENTRY); ASSERT( !p2m_is_valid(t) || mfn + i == mfn_x(mfn_return) ); } @@ -754,7 +754,35 @@ void p2m_change_type_range(struct domain *d, p2m_unlock(p2m); } +/* Returns: True for success. 0 for failure */ +int set_foreign_p2m_entry(struct domain *dp, unsigned long gfn, mfn_t mfn) +{ + int rc = 0; + p2m_type_t ot; + mfn_t omfn; + struct p2m_domain *p2m = p2m_get_hostp2m(dp); + + if ( !paging_mode_translate(dp) ) + return 0; + + omfn = get_gfn_query(dp, gfn, &ot); + if (mfn_valid(omfn)) { + gdprintk(XENLOG_ERR, "Already mapped mfn %lx at gfn:%lx\n", + mfn_x(omfn), gfn); + set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY); + } + put_gfn(dp, gfn); + P2M_DEBUG("set foreign %lx %lx\n", gfn, mfn_x(mfn)); + p2m_lock(p2m); + rc = set_p2m_entry(p2m, gfn, mfn, 0, p2m_map_foreign, p2m->default_access); + p2m_unlock(p2m); + if ( rc == 0 ) + gdprintk(XENLOG_ERR, + "set_foreign_p2m_entry: set_p2m_entry failed! gfn:%lx mfn=%08lx\n", + gfn, mfn_x(get_gfn_query(dp, gfn, &ot))); + return rc; +} int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn) diff --git a/xen/arch/x86/physdev.c b/xen/arch/x86/physdev.c index 876ac9d..ca66c1c 100644 --- a/xen/arch/x86/physdev.c +++ b/xen/arch/x86/physdev.c @@ -475,6 +475,12 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg) case PHYSDEVOP_set_iopl: { struct physdev_set_iopl set_iopl; + + if ( is_pvh_vcpu(current) ) { + ret = -EINVAL; + break; + } + ret = -EFAULT; if ( copy_from_guest(&set_iopl, arg, 1) != 0 ) break; @@ -488,6 +494,8 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg) case PHYSDEVOP_set_iobitmap: { struct physdev_set_iobitmap set_iobitmap; + + ASSERT( !is_pvh_vcpu(current) ); ret = -EFAULT; if ( copy_from_guest(&set_iobitmap, arg, 1) != 0 ) break; diff --git a/xen/include/asm-x86/p2m.h b/xen/include/asm-x86/p2m.h index 43583b2..b76dc33 100644 --- a/xen/include/asm-x86/p2m.h +++ b/xen/include/asm-x86/p2m.h @@ -70,6 +70,7 @@ typedef enum { p2m_ram_paging_in = 11, /* Memory that is being paged in */ p2m_ram_shared = 12, /* Shared or sharable memory */ p2m_ram_broken = 13, /* Broken page, access cause domain crash */ + p2m_map_foreign = 14, /* ram pages from foreign domain */ } p2m_type_t; /* @@ -180,6 +181,7 @@ typedef unsigned int p2m_query_t; #define p2m_is_sharable(_t) (p2m_to_mask(_t) & P2M_SHARABLE_TYPES) #define p2m_is_shared(_t) (p2m_to_mask(_t) & P2M_SHARED_TYPES) #define p2m_is_broken(_t) (p2m_to_mask(_t) & P2M_BROKEN_TYPES) +#define p2m_is_foreign(_t) (p2m_to_mask(_t) & p2m_to_mask(p2m_map_foreign)) /* Per-p2m-table state */ struct p2m_domain { @@ -510,6 +512,8 @@ p2m_type_t p2m_change_type(struct domain *d, unsigned long gfn, int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn); int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn); +/* Set foreign mfn in the current 
guest''s p2m table (for pvh dom0) */ +int set_foreign_p2m_entry(struct domain *domp, unsigned long gfn, mfn_t mfn); /* * Populate-on-demand -- 1.7.2.3
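A minimal usage sketch of the new helper and predicate, assuming the caller already holds whatever reference it needs on the foreign page (the wrapper name is hypothetical):

static void set_and_check_foreign(struct domain *d, unsigned long gfn, mfn_t mfn)
{
    p2m_type_t t;

    /* set_foreign_p2m_entry() returns non-zero on success, 0 on failure. */
    if ( !set_foreign_p2m_entry(d, gfn, mfn) )
    {
        gdprintk(XENLOG_ERR, "foreign p2m entry for gfn %lx failed\n", gfn);
        return;
    }

    /* The slot is now recorded with the new p2m_map_foreign type. */
    get_gfn_query_unlocked(d, gfn, &t);
    ASSERT(p2m_is_foreign(t));
}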
Mukesh Rathor
2013-Mar-19 22:33 UTC
[PATCH 14/18 V2 RESEND]: PVH xen: add xenmem_add_foreign_to_pmap()
In this patch, a new function, xenmem_add_foreign_to_pmap(), is added to map pages from foreign guest into current dom0 for domU creation. Also, allow XENMEM_remove_from_physmap to remove p2m_map_foreign pages. Note, in this path, we must release the refcount that was taken during the map phase. Changes in V2: - Move the XENMEM_remove_from_physmap changes here instead of prev patch - Move grant changes from this to one of the next patches. - In xenmem_add_foreign_to_pmap(), do locked get_gfn - Fail the mappings for qemu mapping pages for memory not there. Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com> --- xen/arch/x86/mm.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++-- xen/common/memory.c | 44 +++++++++++++++++++++++++++--- 2 files changed, 110 insertions(+), 8 deletions(-) diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c index 6603752..dbac811 100644 --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -2656,7 +2656,7 @@ static struct domain *get_pg_owner(domid_t domid) goto out; } - if ( unlikely(paging_mode_translate(curr)) ) + if ( !is_pvh_domain(curr) && unlikely(paging_mode_translate(curr)) ) { MEM_LOG("Cannot mix foreign mappings with translated domains"); goto out; @@ -4192,7 +4192,7 @@ long do_update_descriptor(u64 pa, u64 desc) page = get_page_from_gfn(dom, gmfn, NULL, P2M_ALLOC); if ( (((unsigned int)pa % sizeof(struct desc_struct)) != 0) || !page || - !check_descriptor(dom, &d) ) + (!is_pvh_domain(dom) && !check_descriptor(dom, &d)) ) { if ( page ) put_page(page); @@ -4266,6 +4266,66 @@ static int handle_iomem_range(unsigned long s, unsigned long e, void *p) return 0; } +/* + * Add frames from foreign domain to current domain''s physmap. Similar to + * XENMAPSPACE_gmfn but the frame is foreign being mapped into current, + * and is not removed from foreign domain. + * Usage: libxl on pvh dom0 creating a guest and doing privcmd_ioctl_mmap. + * Side Effect: the mfn for fgfn will be refcounted so it is not lost + * while mapped here. The refcnt is released in do_memory_op() + * via XENMEM_remove_from_physmap. + * Returns: 0 ==> success + */ +static int xenmem_add_foreign_to_pmap(domid_t foreign_domid, + unsigned long fgfn, unsigned long gpfn) +{ + p2m_type_t p2mt, p2mt_prev; + int rc = -EINVAL; + unsigned long prev_mfn, mfn = 0; + struct domain *fdom, *currd = current->domain; + + if ( (fdom = get_pg_owner(foreign_domid)) == NULL ) + return -EPERM; + + mfn = mfn_x(get_gfn_query(fdom, fgfn, &p2mt)); + if ( !mfn_valid(mfn) || !p2m_is_valid(p2mt) ) + goto out_rc; + + if ( !get_page_from_pagenr(mfn, fdom) ) + goto out_rc; + + /* Remove previously mapped page if it is present. */ + prev_mfn = mfn_x(get_gfn(currd, gpfn, &p2mt_prev)); + if ( mfn_valid(prev_mfn) ) + { + if ( is_xen_heap_mfn(prev_mfn) ) + /* Xen heap frames are simply unhooked from this phys slot */ + guest_physmap_remove_page(currd, gpfn, prev_mfn, 0); + else + /* Normal domain memory is freed, to avoid leaking memory. */ + guest_remove_page(currd, gpfn); + } + put_gfn(currd, gpfn); + + /* Create the new mapping. Can''t use guest_physmap_add_page() because it + * will update the m2p table which will result in mfn -> gpfn of dom0 + * and not fgfn of domU. + */ + if ( set_foreign_p2m_entry(currd, gpfn, _mfn(mfn)) == 0 ) { + + printk("guest_physmap_add_page failed. 
gpfn:%lx mfn:%lx fgfn:%lx\n", + gpfn, mfn, fgfn); + put_page(mfn_to_page(mfn)); + goto out_rc; + } + rc = 0; + +out_rc: + put_gfn(fdom, fgfn); + put_pg_owner(fdom); + return rc; +} + static int xenmem_add_to_physmap_once( struct domain *d, const struct xen_add_to_physmap *xatp, @@ -4328,6 +4388,14 @@ static int xenmem_add_to_physmap_once( page = mfn_to_page(mfn); break; } + + case XENMAPSPACE_gmfn_foreign: + { + rc = xenmem_add_foreign_to_pmap(foreign_domid, xatp->idx, + xatp->gpfn); + return rc; + } + default: break; } @@ -4425,7 +4493,7 @@ static int xenmem_add_to_physmap(struct domain *d, return xenmem_add_to_physmap_once(d, xatp, -1); } -static noinline int xenmem_add_to_physmap_range(struct domain *d, +static int xenmem_add_to_physmap_range(struct domain *d, struct xen_add_to_physmap_range *xatpr) { int rc; diff --git a/xen/common/memory.c b/xen/common/memory.c index 68501d1..91a56b6 100644 --- a/xen/common/memory.c +++ b/xen/common/memory.c @@ -675,9 +675,12 @@ long do_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg) case XENMEM_remove_from_physmap: { + unsigned long argmfn, foreign_mfn = INVALID_MFN; struct xen_remove_from_physmap xrfp; struct page_info *page; - struct domain *d; + struct domain *d, *foreign_dom = NULL; + p2m_type_t p2mt, tp; + int valid_pvh_pg, is_curr_pvh = is_pvh_vcpu(current); if ( copy_from_guest(&xrfp, arg, 1) ) return -EFAULT; @@ -695,14 +698,45 @@ long do_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg) domain_lock(d); - page = get_page_from_gfn(d, xrfp.gpfn, NULL, P2M_ALLOC); - if ( page ) + /* PVH note: if PVH, the gfn could be mapped to a mfn from foreign + * domain by the user space tool during domain creation. We need to + * check for that, free it up from the p2m, and release refcnt on it. + * In such a case, page would be NULL. */ + + page = get_page_from_gfn(d, xrfp.gpfn, &p2mt, P2M_ALLOC); + valid_pvh_pg = is_curr_pvh && + (p2m_is_mmio(p2mt) || p2m_is_foreign(p2mt)); + + if ( page || valid_pvh_pg) { - guest_physmap_remove_page(d, xrfp.gpfn, page_to_mfn(page), 0); - put_page(page); + argmfn = page ? page_to_mfn(page) : INVALID_MFN; + + if ( is_curr_pvh && p2m_is_foreign(p2mt) ) + { + foreign_mfn = mfn_x(get_gfn_query_unlocked(d, xrfp.gpfn, &tp)); + foreign_dom = page_get_owner(mfn_to_page(foreign_mfn)); + ASSERT(p2m_is_mmio(tp) || p2m_is_foreign(tp)); + } + + guest_physmap_remove_page(d, xrfp.gpfn, argmfn, 0); + if (page) + put_page(page); + + /* if pages were mapped from foreign domain via + * xenmem_add_foreign_to_pmap(), we must drop a refcnt here */ + if ( is_curr_pvh && p2m_is_foreign(p2mt) ) + { + ASSERT( d != foreign_dom ); + put_page(mfn_to_page(foreign_mfn)); + } } else + { + if ( is_curr_pvh ) + gdprintk(XENLOG_WARNING, "%s: Domain:%u gmfn:%lx invalid\n", + __func__, current->domain->domain_id, xrfp.gpfn); rc = -ENOENT; + } domain_unlock(d); -- 1.7.2.3
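The refcounting rule in the description (the reference taken at map time is dropped when the entry is removed) lines up as one sketch; this is purely illustrative, written as if it sat next to the static helper in mm.c, and does not introduce a new code path:

static int foreign_map_lifecycle_example(struct domain *currd, domid_t fdomid,
                                         unsigned long fgfn, unsigned long gpfn)
{
    p2m_type_t t;
    unsigned long mfn;
    /* Map phase (XENMAPSPACE_gmfn_foreign): get_page_from_pagenr() inside
     * takes the reference on the foreign page. */
    int rc = xenmem_add_foreign_to_pmap(fdomid, fgfn, gpfn);

    if ( rc )
        return rc;

    /* Remove phase: what XENMEM_remove_from_physmap now does for a
     * p2m_map_foreign entry - clear the p2m slot, then drop that reference. */
    mfn = mfn_x(get_gfn_query_unlocked(currd, gpfn, &t));
    ASSERT(p2m_is_foreign(t));
    guest_physmap_remove_page(currd, gpfn, mfn, 0);
    put_page(mfn_to_page(mfn));

    return 0;
}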
Mukesh Rathor
2013-Mar-19 22:33 UTC
[PATCH 15/18 V2 RESEND]: PVH xen: grant related changes
In this patch, we make pvh be acomodated in the newly created domain_page.c file. Also, in grant, we replenish the frame in the EPT so we don''t leave a hole in it. Change in V2: - None. domain_page.c changes newer in this changeset. - grant changes moved here in a separate patch. Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com> --- xen/arch/x86/domain_page.c | 11 ++++++----- xen/arch/x86/mm.c | 23 +++++++++++++++++++++-- xen/common/grant_table.c | 4 ++-- 3 files changed, 29 insertions(+), 9 deletions(-) diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c index 7421e03..be41304 100644 --- a/xen/arch/x86/domain_page.c +++ b/xen/arch/x86/domain_page.c @@ -34,7 +34,8 @@ static inline struct vcpu *mapcache_current_vcpu(void) * then it means we are running on the idle domain''s page table and must * therefore use its mapcache. */ - if ( unlikely(pagetable_is_null(v->arch.guest_table)) && !is_hvm_vcpu(v) ) + if ( unlikely(pagetable_is_null(v->arch.guest_table)) && + !is_hvm_or_pvh_vcpu(v) ) { /* If we really are idling, perform lazy context switch now. */ if ( (v = idle_vcpu[smp_processor_id()]) == current ) @@ -71,7 +72,7 @@ void *map_domain_page(unsigned long mfn) #endif v = mapcache_current_vcpu(); - if ( !v || is_hvm_vcpu(v) ) + if ( !v || is_hvm_or_pvh_vcpu(v) ) return mfn_to_virt(mfn); dcache = &v->domain->arch.pv_domain.mapcache; @@ -175,7 +176,7 @@ void unmap_domain_page(const void *ptr) ASSERT(va >= MAPCACHE_VIRT_START && va < MAPCACHE_VIRT_END); v = mapcache_current_vcpu(); - ASSERT(v && !is_hvm_vcpu(v)); + ASSERT(v && !is_hvm_or_pvh_vcpu(v)); dcache = &v->domain->arch.pv_domain.mapcache; ASSERT(dcache->inuse); @@ -242,7 +243,7 @@ int mapcache_domain_init(struct domain *d) struct mapcache_domain *dcache = &d->arch.pv_domain.mapcache; unsigned int bitmap_pages; - if ( is_hvm_domain(d) || is_idle_domain(d) ) + if ( is_hvm_or_pvh_domain(d) || is_idle_domain(d) ) return 0; #ifdef NDEBUG @@ -273,7 +274,7 @@ int mapcache_vcpu_init(struct vcpu *v) unsigned int ents = d->max_vcpus * MAPCACHE_VCPU_ENTRIES; unsigned int nr = PFN_UP(BITS_TO_LONGS(ents) * sizeof(long)); - if ( is_hvm_vcpu(v) || !dcache->inuse ) + if ( is_hvm_or_pvh_vcpu(v) || !dcache->inuse ) return 0; if ( ents > dcache->entries ) diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c index dbac811..64d0853 100644 --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -3780,16 +3780,35 @@ static int replace_grant_p2m_mapping( old_mfn = get_gfn(d, gfn, &type); if ( !p2m_is_grant(type) || mfn_x(old_mfn) != frame ) { - put_gfn(d, gfn); gdprintk(XENLOG_WARNING, "replace_grant_p2m_mapping: old mapping invalid (type %d, mfn %lx, frame %lx)\n", type, mfn_x(old_mfn), frame); - return GNTST_general_error; + goto out_err; } guest_physmap_remove_page(d, gfn, frame, PAGE_ORDER_4K); + /* PVH: Because we free the existing mfn in XENMEM_add_to_physmap during + * map, we undo that here so the guest P2M (EPT/NPT) is consistent */ + if ( is_pvh_domain(d) ) { + struct page_info *page = alloc_domheap_page(d, 0); + + if ( page == NULL ) { + gdprintk(XENLOG_ERR, "domid:%d Unable to alloc domheap page\n", + d->domain_id); + goto out_err; + } + if ( guest_physmap_add_page(d, gfn, page_to_mfn(page), 0) != 0 ) { + gdprintk(XENLOG_ERR, "Unable to add mfn to replace grant\n"); + goto out_err; + } + } + put_gfn(d, gfn); return GNTST_okay; + +out_err: + put_gfn(d, gfn); + return GNTST_general_error; } int replace_grant_host_mapping( diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c index 3f97328..84ce267 100644 --- 
a/xen/common/grant_table.c +++ b/xen/common/grant_table.c @@ -721,7 +721,7 @@ __gnttab_map_grant_ref( double_gt_lock(lgt, rgt); - if ( !is_hvm_domain(ld) && need_iommu(ld) ) + if ( !is_hvm_or_pvh_domain(ld) && need_iommu(ld) ) { unsigned int wrc, rdc; int err = 0; @@ -932,7 +932,7 @@ __gnttab_unmap_common( act->pin -= GNTPIN_hstw_inc; } - if ( !is_hvm_domain(ld) && need_iommu(ld) ) + if ( !is_hvm_or_pvh_domain(ld) && need_iommu(ld) ) { unsigned int wrc, rdc; int err = 0; -- 1.7.2.3
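The is_hvm_or_pvh_*() and is_pvh_*() predicates used in this and the preceding patches come from an earlier patch in the series that is not shown here; a plausible minimal definition, stated purely as an assumption, would be:

/* Assumed shape of the helpers introduced earlier in this series. */
#define is_pvh_domain(d)         ((d)->guest_type == pvh_guest)
#define is_pvh_vcpu(v)           is_pvh_domain((v)->domain)
#define is_hvm_or_pvh_domain(d)  (is_hvm_domain(d) || is_pvh_domain(d))
#define is_hvm_or_pvh_vcpu(v)    is_hvm_or_pvh_domain((v)->domain)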
Mukesh Rathor
2013-Mar-19 22:33 UTC
[PATCH 16/18 V2 RESEND]: PVH xen: elf changes to prep for dom0 PVH.
This patch prepares for dom0 PVH by making some changes in the elf code; add a new parameter to indicate PVH dom0 and use different copy function for PVH. Also, add check in iommu.c to check for iommu enabled for dom0 PVH. Changes in V2: None Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com> --- xen/arch/x86/domain_build.c | 2 +- xen/common/libelf/libelf-loader.c | 51 ++++++++++++++++++++++++++++++++++--- xen/drivers/passthrough/iommu.c | 18 +++++++++++- xen/include/xen/libelf.h | 3 +- 4 files changed, 66 insertions(+), 8 deletions(-) diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c index c8f435d..8c5b27a 100644 --- a/xen/arch/x86/domain_build.c +++ b/xen/arch/x86/domain_build.c @@ -766,7 +766,7 @@ int __init construct_dom0( /* Copy the OS image and free temporary buffer. */ elf.dest = (void*)vkern_start; - rc = elf_load_binary(&elf); + rc = elf_load_binary(&elf, 0); if ( rc < 0 ) { printk("Failed to load the kernel binary\n"); diff --git a/xen/common/libelf/libelf-loader.c b/xen/common/libelf/libelf-loader.c index 3cf9c59..d732f75 100644 --- a/xen/common/libelf/libelf-loader.c +++ b/xen/common/libelf/libelf-loader.c @@ -17,6 +17,10 @@ */ #include "libelf-private.h" +#ifdef __XEN__ +#include <public/xen.h> +#include <asm/debugger.h> +#endif /* ------------------------------------------------------------------------ */ @@ -108,7 +112,8 @@ void elf_set_log(struct elf_binary *elf, elf_log_callback *log_callback, elf->verbose = verbose; } -static int elf_load_image(void *dst, const void *src, uint64_t filesz, uint64_t memsz) +static int elf_load_image(void *dst, const void *src, uint64_t filesz, + uint64_t memsz, int not_used) { memcpy(dst, src, filesz); memset(dst + filesz, 0, memsz - filesz); @@ -122,11 +127,34 @@ void elf_set_verbose(struct elf_binary *elf) elf->verbose = 1; } -static int elf_load_image(void *dst, const void *src, uint64_t filesz, uint64_t memsz) +static int elf_load_image(void *dst, const void *src, uint64_t filesz, + uint64_t memsz, int is_pvh_dom0) { int rc; if ( filesz > ULONG_MAX || memsz > ULONG_MAX ) return -1; + + /* raw_copy_to_guest -> copy_to_user_hvm -> __hvm_copy needs curr to + * point to the hvm/pvh vcpu. Hence for PVH dom0 we can''t use that. For now + * just use dbg_rw_mem(). */ + if ( is_pvh_dom0 ) + { + int j, rem; + rem = dbg_rw_mem((dbgva_t)dst, (dbgbyte_t *)src, (int)filesz, 0, 1, 0); + if ( rem ) { + printk("Failed to copy elf binary. len:%ld rem:%d\n", filesz, rem); + return -1; + } + for (j=0; j < memsz - filesz; j++) { + unsigned char zero=0; + rem = dbg_rw_mem((dbgva_t)(dst+filesz+j), &zero, 1, 0, 1, 0); + if (rem) { + printk("Failed to copy to:%p rem:%d\n", dst+filesz+j, rem); + return -1; + } + } + return 0; + } rc = raw_copy_to_guest(dst, src, filesz); if ( rc != 0 ) return -1; @@ -260,7 +288,9 @@ void elf_parse_binary(struct elf_binary *elf) __FUNCTION__, elf->pstart, elf->pend); } -int elf_load_binary(struct elf_binary *elf) +/* This function called from the libraries when building guests, and also for + * dom0 from construct_dom0(). 
*/ +static int _elf_load_binary(struct elf_binary *elf, int is_pvh_dom0) { const elf_phdr *phdr; uint64_t i, count, paddr, offset, filesz, memsz; @@ -279,7 +309,8 @@ int elf_load_binary(struct elf_binary *elf) dest = elf_get_ptr(elf, paddr); elf_msg(elf, "%s: phdr %" PRIu64 " at 0x%p -> 0x%p\n", __func__, i, dest, dest + filesz); - if ( elf_load_image(dest, elf->image + offset, filesz, memsz) != 0 ) + if ( elf_load_image(dest, elf->image + offset, filesz, memsz, + is_pvh_dom0) != 0 ) return -1; } @@ -287,6 +318,18 @@ int elf_load_binary(struct elf_binary *elf) return 0; } +#ifdef __XEN__ +int elf_load_binary(struct elf_binary *elf, int is_pvh_dom0) +{ + return _elf_load_binary(elf, is_pvh_dom0); +} +#else +int elf_load_binary(struct elf_binary *elf) +{ + return _elf_load_binary(elf, 0); +} +#endif + void *elf_get_ptr(struct elf_binary *elf, unsigned long addr) { return elf->dest + addr - elf->pstart; diff --git a/xen/drivers/passthrough/iommu.c b/xen/drivers/passthrough/iommu.c index c1d3c12..9954e07 100644 --- a/xen/drivers/passthrough/iommu.c +++ b/xen/drivers/passthrough/iommu.c @@ -125,15 +125,25 @@ int iommu_domain_init(struct domain *d) return hd->platform_ops->init(d); } +static inline void check_dom0_pvh_reqs(struct domain *d) +{ + if (!iommu_enabled || iommu_passthrough) + panic("For pvh dom0, iommu must be enabled, dom0-passthrough must " + "not be enabled \n"); +} + void __init iommu_dom0_init(struct domain *d) { struct hvm_iommu *hd = domain_hvm_iommu(d); + if ( is_pvh_domain(d) ) + check_dom0_pvh_reqs(d); + if ( !iommu_enabled ) return; register_keyhandler(''o'', &iommu_p2m_table); - d->need_iommu = !!iommu_dom0_strict; + d->need_iommu = is_pvh_domain(d) || !!iommu_dom0_strict; if ( need_iommu(d) ) { struct page_info *page; @@ -146,7 +156,11 @@ void __init iommu_dom0_init(struct domain *d) ((page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page) ) mapping |= IOMMUF_writable; - hd->platform_ops->map_page(d, mfn, mfn, mapping); + if ( is_pvh_domain(d) ) { + unsigned long gfn = mfn_to_gfn(d, _mfn(mfn)); + hd->platform_ops->map_page(d, gfn, mfn, mapping); + } else + hd->platform_ops->map_page(d, mfn, mfn, mapping); if ( !(i++ & 0xfffff) ) process_pending_softirqs(); } diff --git a/xen/include/xen/libelf.h b/xen/include/xen/libelf.h index 218bb18..2dc2bdb 100644 --- a/xen/include/xen/libelf.h +++ b/xen/include/xen/libelf.h @@ -192,13 +192,14 @@ int elf_phdr_is_loadable(struct elf_binary *elf, const elf_phdr * phdr); int elf_init(struct elf_binary *elf, const char *image, size_t size); #ifdef __XEN__ void elf_set_verbose(struct elf_binary *elf); +int elf_load_binary(struct elf_binary *elf, int is_pvh_dom0); #else void elf_set_log(struct elf_binary *elf, elf_log_callback*, void *log_caller_pointer, int verbose); +int elf_load_binary(struct elf_binary *elf); #endif void elf_parse_binary(struct elf_binary *elf); -int elf_load_binary(struct elf_binary *elf); void *elf_get_ptr(struct elf_binary *elf, unsigned long addr); uint64_t elf_lookup_addr(struct elf_binary *elf, const char *symbol); -- 1.7.2.3
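With the #ifdef __XEN__ split above, out-of-hypervisor users keep the old one-argument prototype while hypervisor callers pass the new flag; the two call sites then look like this (the hypervisor one matches the construct_dom0() change in the next patch, the tools one is illustrative):

#ifdef __XEN__
    /* Hypervisor: tell the loader whether this is a PVH dom0. */
    rc = elf_load_binary(&elf, is_pvh_domain(d));
#else
    /* Tools (libxc and friends): prototype and behaviour unchanged. */
    rc = elf_load_binary(&elf);
#endif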
Mukesh Rathor
2013-Mar-19 22:33 UTC
[PATCH 17/18 V2 RESEND]: PVH xen: PVH dom0 creation...
Finally, the hardest. Mostly modify construct_dom0() to boot PV dom0 in PVH mode. Introduce, opt_dom0pvh, which when specified in the command line, causes dom0 to boot in PVH mode. Change in V2: - Map the entire IO region upfront in the P2M for PVH dom0. Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com> --- xen/arch/x86/domain_build.c | 241 +++++++++++++++++++++++++++++++++---------- xen/arch/x86/mm/hap/hap.c | 17 +++- xen/arch/x86/setup.c | 10 ++- xen/include/asm-x86/hap.h | 1 + 4 files changed, 212 insertions(+), 57 deletions(-) diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c index 8c5b27a..72aa70b 100644 --- a/xen/arch/x86/domain_build.c +++ b/xen/arch/x86/domain_build.c @@ -35,6 +35,8 @@ #include <asm/setup.h> #include <asm/bzimage.h> /* for bzimage_parse */ #include <asm/io_apic.h> +#include <asm/hap.h> +#include <asm/debugger.h> #include <public/version.h> @@ -307,6 +309,65 @@ static void __init process_dom0_ioports_disable(void) } } +/* + * Set the 1:1 map for all non-RAM regions for dom 0. Thus, dom0 will have + * the entire io region mapped in the EPT/NPT. + */ +static __init void pvh_map_all_iomem(struct domain *d) +{ + unsigned long start = 0; + const struct e820entry *entry; + int rc, i, nump; + + for (i = 0, entry = e820.map; i < e820.nr_map; i++, entry++) { + unsigned long end = entry->addr + entry->size; + + if (entry->type == E820_RAM || i == e820.nr_map - 1) { + unsigned long start_pfn = PFN_DOWN(start); + unsigned long end_pfn = PFN_UP(end); + + if (entry->type == E820_RAM) + end_pfn = PFN_UP(entry->addr); + + if (start_pfn < end_pfn) { + nump = end_pfn - start_pfn + 1; + rc = domctl_memory_mapping(d, start_pfn, start_pfn, nump, 1); + BUG_ON(rc); + } + start = end; + } + } +} + +static __init void dom0_update_physmap(struct domain *d, unsigned long pfn, + unsigned long mfn, unsigned long vphysmap_s) +{ + if ( is_pvh_domain(d) ) { + int rc = guest_physmap_add_page(d, pfn, mfn, 0); + BUG_ON(rc); + return; + } + if ( !is_pv_32on64_domain(d) ) + ((unsigned long *)vphysmap_s)[pfn] = mfn; + else + ((unsigned int *)vphysmap_s)[pfn] = mfn; + + set_gpfn_from_mfn(mfn, pfn); +} + +static __init void copy_pvh(char *dest, char *src, int bytes) +{ + /* raw_copy_to_guest() -> copy_to_user_hvm -> __hvm_copy needs curr + * to point to the hvm/pvh vcpu. Hence for PVH dom0 we can''t use that. + * So we just use dbg_rw_mem(). + */ + int rem = dbg_rw_mem((dbgva_t)dest, (unsigned char *)src, bytes, 0, 1, 0); + if (rem) { + printk("PVH: Failed to copy to dom0. 
len:%d rem:%d\n", bytes, rem); + BUG(); + } +} + int __init construct_dom0( struct domain *d, const module_t *image, unsigned long image_headroom, @@ -314,6 +375,7 @@ int __init construct_dom0( void *(*bootstrap_map)(const module_t *), char *cmdline) { + char *si_buf=NULL, *tmp_buf=NULL; int i, cpu, rc, compatible, compat32, order, machine; struct cpu_user_regs *regs; unsigned long pfn, mfn; @@ -322,7 +384,7 @@ int __init construct_dom0( unsigned long alloc_spfn; unsigned long alloc_epfn; unsigned long initrd_pfn = -1, initrd_mfn = 0; - unsigned long count; + unsigned long count, shared_info_pfn_addr = 0; struct page_info *page = NULL; start_info_t *si; struct vcpu *v = d->vcpu[0]; @@ -416,6 +478,13 @@ int __init construct_dom0( { printk("Kernel does not support Dom0 operation\n"); return -EINVAL; + + if ( is_pvh_domain(d) && + !test_bit(XENFEAT_hvm_callback_vector, parms.f_supported) ) + { + printk("Kernel does not support PVH mode\n"); + return -EINVAL; + } } if ( compat32 ) @@ -480,6 +549,12 @@ int __init construct_dom0( vstartinfo_end = (vstartinfo_start + sizeof(struct start_info) + sizeof(struct dom0_vga_console_info)); + + if ( is_pvh_domain(d) ) { + shared_info_pfn_addr = round_pgup(vstartinfo_end) - v_start; + vstartinfo_end += PAGE_SIZE; + } + vpt_start = round_pgup(vstartinfo_end); for ( nr_pt_pages = 2; ; nr_pt_pages++ ) { @@ -621,16 +696,26 @@ int __init construct_dom0( maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l3_page_table; l3start = __va(mpt_alloc); mpt_alloc += PAGE_SIZE; } - clear_page(l4tab); - init_guest_l4_table(l4tab, d); - v->arch.guest_table = pagetable_from_paddr(__pa(l4start)); - if ( is_pv_32on64_domain(d) ) - v->arch.guest_table_user = v->arch.guest_table; + if ( is_pvh_domain(d) ) + { + v->arch.guest_table = pagetable_from_paddr(vpt_start - v_start); + pfn = 0; + } else { + clear_page(l4tab); + init_guest_l4_table(l4tab, d); + v->arch.guest_table = pagetable_from_paddr(__pa(l4start)); + if ( is_pv_32on64_domain(d) ) + v->arch.guest_table_user = v->arch.guest_table; + pfn = alloc_spfn; + } l4tab += l4_table_offset(v_start); - pfn = alloc_spfn; for ( count = 0; count < ((v_end-v_start)>>PAGE_SHIFT); count++ ) { + /* initrd chunk''s mfns are separate, so we need to adjust for them */ + signed long pvh_adj = is_pvh_domain(d) ? 
+ (PFN_UP(initrd_len) - alloc_spfn)<<PAGE_SHIFT : 0; + if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) ) { maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l1_page_table; @@ -657,16 +742,17 @@ int __init construct_dom0( clear_page(l3tab); if ( count == 0 ) l3tab += l3_table_offset(v_start); - *l4tab = l4e_from_paddr(__pa(l3start), L4_PROT); + *l4tab = l4e_from_paddr(__pa(l3start) + pvh_adj, L4_PROT); l4tab++; } - *l3tab = l3e_from_paddr(__pa(l2start), L3_PROT); + *l3tab = l3e_from_paddr(__pa(l2start) + pvh_adj, L3_PROT); l3tab++; } - *l2tab = l2e_from_paddr(__pa(l1start), L2_PROT); + *l2tab = l2e_from_paddr(__pa(l1start) + pvh_adj, L2_PROT); l2tab++; } - if ( count < initrd_pfn || count >= initrd_pfn + PFN_UP(initrd_len) ) + if ( is_pvh_domain(d) || + count < initrd_pfn || count >= initrd_pfn + PFN_UP(initrd_len) ) mfn = pfn++; else mfn = initrd_mfn++; @@ -674,6 +760,9 @@ int __init construct_dom0( L1_PROT : COMPAT_L1_PROT)); l1tab++; + if ( is_pvh_domain(d) ) + continue; + page = mfn_to_page(mfn); if ( (page->u.inuse.type_info == 0) && !get_page_and_type(page, d, PGT_writable_page) ) @@ -702,6 +791,9 @@ int __init construct_dom0( COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*l2tab)); } + if ( is_pvh_domain(d) ) + goto pvh_skip_pt_rdonly; + /* Pages that are part of page tables must be read only. */ l4tab = l4start + l4_table_offset(vpt_start); l3start = l3tab = l4e_to_l3e(*l4tab); @@ -741,6 +833,8 @@ int __init construct_dom0( } } +pvh_skip_pt_rdonly: + /* Mask all upcalls... */ for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ ) shared_info(d, vcpu_info[i].evtchn_upcall_mask) = 1; @@ -754,6 +848,11 @@ int __init construct_dom0( (void)alloc_vcpu(d, i, cpu); } + if ( is_pvh_domain(d) ) + { + v->arch.cr3 = v->arch.hvm_vcpu.guest_cr[3] + (pagetable_get_pfn(v->arch.guest_table)) << PAGE_SHIFT; + } /* Set up CR3 value for write_ptbase */ if ( paging_mode_enabled(d) ) paging_update_paging_modes(v); @@ -764,35 +863,16 @@ int __init construct_dom0( write_ptbase(v); mapcache_override_current(v); - /* Copy the OS image and free temporary buffer. */ - elf.dest = (void*)vkern_start; - rc = elf_load_binary(&elf, 0); - if ( rc < 0 ) - { - printk("Failed to load the kernel binary\n"); - return rc; - } - bootstrap_map(NULL); - - if ( UNSET_ADDR != parms.virt_hypercall ) - { - if ( (parms.virt_hypercall < v_start) || - (parms.virt_hypercall >= v_end) ) - { - mapcache_override_current(NULL); - write_ptbase(current); - printk("Invalid HYPERCALL_PAGE field in ELF notes.\n"); - return -1; + /* Set up start info area. */ + if ( is_pvh_domain(d) ) { + if ( (si_buf=xmalloc_bytes(PAGE_SIZE)) == NULL) { + printk("PVH: xmalloc failed to alloc %ld bytes.\n", PAGE_SIZE); + return -ENOMEM; } - hypercall_page_initialise( - d, (void *)(unsigned long)parms.virt_hypercall); - } - - /* Free temporary buffers. */ - discard_initial_images(); + si = (start_info_t *)si_buf; + } else + si = (start_info_t *)vstartinfo_start; - /* Set up start info area. */ - si = (start_info_t *)vstartinfo_start; clear_page(si); si->nr_pages = nr_pages; @@ -814,7 +894,7 @@ int __init construct_dom0( l2tab = NULL; l1tab = NULL; /* Set up the phys->machine table if not part of the initial mapping. */ - if ( parms.p2m_base != UNSET_ADDR ) + if ( parms.p2m_base != UNSET_ADDR && !is_pvh_domain(d) ) { unsigned long va = vphysmap_start; @@ -935,6 +1015,9 @@ int __init construct_dom0( unmap_domain_page(l3tab); unmap_domain_page(l4start); + if (is_pvh_domain(d) ) + hap_set_pvh_alloc_for_dom0(d, nr_pages); + /* Write the phys->machine and machine->phys table entries. 
*/ for ( pfn = 0; pfn < count; pfn++ ) { @@ -951,11 +1034,8 @@ int __init construct_dom0( if ( pfn > REVERSE_START && (vinitrd_start || pfn < initrd_pfn) ) mfn = alloc_epfn - (pfn - REVERSE_START); #endif - if ( !is_pv_32on64_domain(d) ) - ((unsigned long *)vphysmap_start)[pfn] = mfn; - else - ((unsigned int *)vphysmap_start)[pfn] = mfn; - set_gpfn_from_mfn(mfn, pfn); + dom0_update_physmap(d, pfn, mfn, vphysmap_start); + if (!(pfn & 0xfffff)) process_pending_softirqs(); } @@ -971,8 +1051,8 @@ int __init construct_dom0( if ( !page->u.inuse.type_info && !get_page_and_type(page, d, PGT_writable_page) ) BUG(); - ((unsigned long *)vphysmap_start)[pfn] = mfn; - set_gpfn_from_mfn(mfn, pfn); + + dom0_update_physmap(d, pfn, mfn, vphysmap_start); ++pfn; if (!(pfn & 0xfffff)) process_pending_softirqs(); @@ -992,11 +1072,7 @@ int __init construct_dom0( #ifndef NDEBUG #define pfn (nr_pages - 1 - (pfn - (alloc_epfn - alloc_spfn))) #endif - if ( !is_pv_32on64_domain(d) ) - ((unsigned long *)vphysmap_start)[pfn] = mfn; - else - ((unsigned int *)vphysmap_start)[pfn] = mfn; - set_gpfn_from_mfn(mfn, pfn); + dom0_update_physmap(d, pfn, mfn, vphysmap_start); #undef pfn page++; pfn++; if (!(pfn & 0xfffff)) @@ -1004,6 +1080,47 @@ int __init construct_dom0( } } + /* Copy the OS image and free temporary buffer. */ + elf.dest = (void*)vkern_start; + rc = elf_load_binary(&elf, is_pvh_domain(d) ); + if ( rc < 0 ) + { + printk("Failed to load the kernel binary\n"); + return rc; + } + bootstrap_map(NULL); + + if ( UNSET_ADDR != parms.virt_hypercall ) + { + void *addr; + + if ( is_pvh_domain(d) ) { + if ( (tmp_buf=xzalloc_bytes(PAGE_SIZE)) == NULL ) { + printk("xzalloc failed for tmp_buf. %ld bytes.\n", PAGE_SIZE); + return -ENOMEM; + } + addr = tmp_buf; + } else + addr = (void *)parms.virt_hypercall; + + if ( (parms.virt_hypercall < v_start) || + (parms.virt_hypercall >= v_end) ) + { + write_ptbase(current); + printk("Invalid HYPERCALL_PAGE field in ELF notes.\n"); + return -1; + } + hypercall_page_initialise(d, addr); + + if ( is_pvh_domain(d) ) { + copy_pvh((void *)parms.virt_hypercall, tmp_buf, PAGE_SIZE); + xfree(tmp_buf); + } + } + + /* Free temporary buffers. 
*/ + discard_initial_images(); + if ( initrd_len != 0 ) { si->mod_start = vinitrd_start ?: initrd_pfn; @@ -1019,6 +1136,15 @@ int __init construct_dom0( si->console.dom0.info_off = sizeof(struct start_info); si->console.dom0.info_size = sizeof(struct dom0_vga_console_info); } + if ( is_pvh_domain(d) ) { + unsigned long mfn = virt_to_mfn(d->shared_info); + unsigned long pfn = shared_info_pfn_addr>>PAGE_SHIFT; + si->shared_info = shared_info_pfn_addr; + dom0_update_physmap(d, pfn, mfn, 0); + + copy_pvh((char *)vstartinfo_start, si_buf, PAGE_SIZE); + xfree(si_buf); + } if ( is_pv_32on64_domain(d) ) xlat_start_info(si, XLAT_start_info_console_dom0); @@ -1050,12 +1176,16 @@ int __init construct_dom0( regs->eip = parms.virt_entry; regs->esp = vstack_end; regs->esi = vstartinfo_start; - regs->eflags = X86_EFLAGS_IF; + regs->eflags = X86_EFLAGS_IF | 0x2; - if ( opt_dom0_shadow ) + if ( opt_dom0_shadow ) { + if ( is_pvh_domain(d) ) { + printk("Invalid option dom0_shadow for PVH\n"); + return -EINVAL; + } if ( paging_enable(d, PG_SH_enable) == 0 ) paging_update_paging_modes(v); - + } if ( supervisor_mode_kernel ) { v->arch.pv_vcpu.kernel_ss &= ~3; @@ -1132,6 +1262,9 @@ int __init construct_dom0( BUG_ON(rc != 0); + if ( is_pvh_domain(d) ) + pvh_map_all_iomem(d); + iommu_dom0_init(dom0); return 0; diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c index 055833d..d3d5697 100644 --- a/xen/arch/x86/mm/hap/hap.c +++ b/xen/arch/x86/mm/hap/hap.c @@ -574,6 +574,20 @@ int hap_domctl(struct domain *d, xen_domctl_shadow_op_t *sc, } } +/* Resize hap table. Copied from: libxl_get_required_shadow_memory() */ +void hap_set_pvh_alloc_for_dom0(struct domain *d, unsigned long num_pages) +{ + int rc; + unsigned long memkb = num_pages * (PAGE_SIZE / 1024); + + memkb = 4 * (256 * d->max_vcpus + 2 * (memkb / 1024)); + num_pages = ((memkb+1023)/1024) << (20 - PAGE_SHIFT); + paging_lock(d); + rc = hap_set_allocation(d, num_pages, NULL); + paging_unlock(d); + BUG_ON(rc); +} + static const struct paging_mode hap_paging_real_mode; static const struct paging_mode hap_paging_protected_mode; static const struct paging_mode hap_paging_pae_mode; @@ -633,7 +647,8 @@ static void hap_update_cr3(struct vcpu *v, int do_locking) const struct paging_mode * hap_paging_get_mode(struct vcpu *v) { - return !hvm_paging_enabled(v) ? &hap_paging_real_mode : + return is_pvh_vcpu(v) ? &hap_paging_long_mode : + !hvm_paging_enabled(v) ? &hap_paging_real_mode : hvm_long_mode_enabled(v) ? &hap_paging_long_mode : hvm_pae_enabled(v) ? &hap_paging_pae_mode : &hap_paging_protected_mode; diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c index 43301a5..f307f24 100644 --- a/xen/arch/x86/setup.c +++ b/xen/arch/x86/setup.c @@ -60,6 +60,10 @@ integer_param("maxcpus", max_cpus); static bool_t __initdata disable_smep; invbool_param("smep", disable_smep); +/* Boot dom0 in PVH mode */ +static bool_t __initdata opt_dom0pvh; +boolean_param("dom0pvh", opt_dom0pvh); + /* **** Linux config option: propagated to domain0. */ /* "acpi=off": Sisables both ACPI table parsing and interpreter. */ /* "acpi=force": Override the disable blacklist. 
*/ @@ -545,7 +549,7 @@ void __init __start_xen(unsigned long mbi_p) { char *memmap_type = NULL; char *cmdline, *kextra, *loader; - unsigned int initrdidx; + unsigned int initrdidx, domcr_flags = 0; multiboot_info_t *mbi = __va(mbi_p); module_t *mod = (module_t *)__va(mbi->mods_addr); unsigned long nr_pages, modules_headroom, *module_map; @@ -1314,7 +1318,9 @@ void __init __start_xen(unsigned long mbi_p) panic("Could not protect TXT memory regions\n"); /* Create initial domain 0. */ - dom0 = domain_create(0, DOMCRF_s3_integrity, 0); + domcr_flags = (opt_dom0pvh ? DOMCRF_pvh | DOMCRF_hap : 0); + domcr_flags |= DOMCRF_s3_integrity; + dom0 = domain_create(0, domcr_flags, 0); if ( IS_ERR(dom0) || (alloc_dom0_vcpu0() == NULL) ) panic("Error creating domain 0\n"); diff --git a/xen/include/asm-x86/hap.h b/xen/include/asm-x86/hap.h index e03f983..aab8558 100644 --- a/xen/include/asm-x86/hap.h +++ b/xen/include/asm-x86/hap.h @@ -63,6 +63,7 @@ int hap_track_dirty_vram(struct domain *d, XEN_GUEST_HANDLE_64(uint8) dirty_bitmap); extern const struct paging_mode *hap_paging_get_mode(struct vcpu *); +void hap_set_pvh_alloc_for_dom0(struct domain *d, unsigned long num_pages); #endif /* XEN_HAP_H */ -- 1.7.2.3
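Since opt_dom0pvh is a standard boolean parameter, booting dom0 in PVH mode only needs "dom0pvh" (or "dom0pvh=1") appended to the hypervisor command line; an illustrative grub-legacy stanza, with the paths and remaining options as placeholders:

kernel /boot/xen.gz dom0pvh dom0_mem=1024M console=vga
module /boot/vmlinuz-3.9 root=/dev/sda1 ro console=hvc0
module /boot/initrd.img-3.9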
Mark completion of patches by enabling PVH domain creation. Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com> --- xen/arch/x86/domain.c | 7 ------- 1 files changed, 0 insertions(+), 7 deletions(-) diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c index 5b5444f..09cdd46 100644 --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -653,13 +653,6 @@ int arch_set_info_guest( unsigned int i; int rc = 0, compat; - /* This removed when all patches are checked in */ - if ( is_pvh_vcpu(v) ) - { - printk("PVH: You don''t have the correct xen version for PVH\n"); - return -EINVAL; - } - /* The context is a compat-mode one if the target domain is compat-mode; * we expect the tools to DTRT even in compat-mode callers. */ compat = is_pv_32on64_domain(d); -- 1.7.2.3