Mukesh Rathor
2012-Aug-16 01:02 UTC
[RFC PATCH 3/8]: PVH: memory manager and paging related changes
---
 arch/x86/xen/mmu.c              |  179 ++++++++++++++++++++++++++++++++++++---
 arch/x86/xen/mmu.h              |    2 +
 include/xen/interface/memory.h  |   27 ++++++-
 include/xen/interface/physdev.h |   10 ++
 include/xen/xen-ops.h           |    7 ++
 5 files changed, 211 insertions(+), 14 deletions(-)

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index b65a761..44a6477 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -330,6 +330,38 @@ static void xen_set_pte(pte_t *ptep, pte_t pteval)
 	__xen_set_pte(ptep, pteval);
 }

+/* This for PV guest in hvm container */
+void xen_set_clr_mmio_pvh_pte(unsigned long pfn, unsigned long mfn,
+			      int nr_mfns, int add_mapping)
+{
+	int rc;
+	struct physdev_map_iomem iomem;
+
+	iomem.first_gfn = pfn;
+	iomem.first_mfn = mfn;
+	iomem.nr_mfns = nr_mfns;
+	iomem.add_mapping = add_mapping;
+
+	rc = HYPERVISOR_physdev_op(PHYSDEVOP_pvh_map_iomem, &iomem);
+	BUG_ON(rc);
+}
+
+/* This for PV guest in hvm container.
+ * We need this because during boot early_ioremap path eventually calls
+ * set_pte that maps io space. Also, ACPI pages are not mapped into to the
+ * EPT during dom0 creation. The pages are mapped initially here from
+ * kernel_physical_mapping_init() then later the memtype is changed. */
+static void xen_dom0pvh_set_pte(pte_t *ptep, pte_t pteval)
+{
+	native_set_pte(ptep, pteval);
+}
+
+static void xen_dom0pvh_set_pte_at(struct mm_struct *mm, unsigned long addr,
+				   pte_t *ptep, pte_t pteval)
+{
+	native_set_pte(ptep, pteval);
+}
+
 static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
 		    pte_t *ptep, pte_t pteval)
 {
@@ -1197,6 +1229,10 @@ static void xen_post_allocator_init(void);
 static void __init xen_pagetable_setup_done(pgd_t *base)
 {
 	xen_setup_shared_info();
+
+	if (xen_pvh_domain())
+		return;
+
 	xen_post_allocator_init();
 }

@@ -1652,6 +1688,10 @@ static void set_page_prot(void *addr, pgprot_t prot)
 	unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
 	pte_t pte = pfn_pte(pfn, prot);

+	/* for PVH, page tables are native. */
+	if (xen_pvh_domain())
+		return;
+
 	if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
 		BUG();
 }
@@ -1745,6 +1785,7 @@ static void convert_pfn_mfn(void *v)
  * but that's enough to get __va working.  We need to fill in the rest
  * of the physical mapping once some sort of allocator has been set
  * up.
+ * NOTE: for PVH, the page tables are native with HAP required.
  */
 pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
					   unsigned long max_pfn)
@@ -1761,10 +1802,12 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
 	/* Zap identity mapping */
 	init_level4_pgt[0] = __pgd(0);

-	/* Pre-constructed entries are in pfn, so convert to mfn */
-	convert_pfn_mfn(init_level4_pgt);
-	convert_pfn_mfn(level3_ident_pgt);
-	convert_pfn_mfn(level3_kernel_pgt);
+	if (!xen_pvh_domain()) {
+		/* Pre-constructed entries are in pfn, so convert to mfn */
+		convert_pfn_mfn(init_level4_pgt);
+		convert_pfn_mfn(level3_ident_pgt);
+		convert_pfn_mfn(level3_kernel_pgt);
+	}

 	l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
 	l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
@@ -1787,12 +1830,14 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
 	set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
 	set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);

-	/* Pin down new L4 */
-	pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
-			  PFN_DOWN(__pa_symbol(init_level4_pgt)));
+	if (!xen_pvh_domain()) {
+		/* Pin down new L4 */
+		pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
+				  PFN_DOWN(__pa_symbol(init_level4_pgt)));

-	/* Unpin Xen-provided one */
-	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
+		/* Unpin Xen-provided one */
+		pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
+	}

 	/* Switch over */
 	pgd = init_level4_pgt;
@@ -1802,9 +1847,13 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
 	 * structure to attach it to, so make sure we just set kernel
 	 * pgd.
 	 */
-	xen_mc_batch();
-	__xen_write_cr3(true, __pa(pgd));
-	xen_mc_issue(PARAVIRT_LAZY_CPU);
+	if (xen_pvh_domain()) {
+		native_write_cr3(__pa(pgd));
+	} else {
+		xen_mc_batch();
+		__xen_write_cr3(true, __pa(pgd));
+		xen_mc_issue(PARAVIRT_LAZY_CPU);
+	}

 	memblock_reserve(__pa(xen_start_info->pt_base),
 			 xen_start_info->nr_pt_frames * PAGE_SIZE);
@@ -2067,9 +2116,21 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {

 void __init xen_init_mmu_ops(void)
 {
+	x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
+
+	if (xen_pvh_domain()) {
+		pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others;
+
+		/* set_pte* for PCI devices to map iomem. */
+		if (xen_initial_domain()) {
+			pv_mmu_ops.set_pte = xen_dom0pvh_set_pte;
+			pv_mmu_ops.set_pte_at = xen_dom0pvh_set_pte_at;
+		}
+		return;
+	}
+
 	x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve;
 	x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
-	x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
 	pv_mmu_ops = xen_mmu_ops;

 	memset(dummy_mapping, 0xff, PAGE_SIZE);
@@ -2305,6 +2366,93 @@ void __init xen_hvm_init_mmu_ops(void)
 }
 #endif

+/* Map foreign gmfn, fgmfn, to local pfn, lpfn. This for the user space
+ * creating new guest on PVH dom0 and needs to map domU pages. Called from
+ * exported function, so no need to export this.
+ */
+static int pvh_add_to_xen_p2m(unsigned long lpfn, unsigned long fgmfn,
+			      unsigned int domid)
+{
+	int rc;
+	struct xen_add_to_physmap pmb = {.foreign_domid = domid};
+
+	pmb.gpfn = lpfn;
+	pmb.idx = fgmfn;
+	pmb.space = XENMAPSPACE_gmfn_foreign;
+	rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &pmb);
+	if (rc) {
+		pr_warn("Failed to map pfn to mfn rc:%d pfn:%lx mfn:%lx\n",
+			rc, lpfn, fgmfn);
+		return 1;
+	}
+	return 0;
+}
+
+/* Unmap an entry from xen p2m table */
+int pvh_rem_xen_p2m(unsigned long spfn, int count)
+{
+	struct xen_remove_from_physmap xrp;
+	int i, rc;
+
+	for (i=0; i < count; i++) {
+		xrp.domid = DOMID_SELF;
+		xrp.gpfn = spfn+i;
+		rc = HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp);
+		if (rc) {
+			pr_warn("Failed to unmap pfn:%lx rc:%d done:%d\n",
+				spfn+i, rc, i);
+			return 1;
+		}
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pvh_rem_xen_p2m);
+
+struct pvh_remap_data {
+	unsigned long fgmfn;		/* foreign domain's gmfn */
+	pgprot_t prot;
+	domid_t  domid;
+	struct vm_area_struct *vma;
+};
+
+static int pvh_map_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr,
+			  void *data)
+{
+	struct pvh_remap_data *pvhp = data;
+	struct xen_pvh_sav_pfn_info *savp = pvhp->vma->vm_private_data;
+	unsigned long pfn = page_to_pfn(savp->sp_paga[savp->sp_next_todo++]);
+	pte_t pteval = pte_mkspecial(pfn_pte(pfn, pvhp->prot));
+
+	native_set_pte(ptep, pteval);
+	if (pvh_add_to_xen_p2m(pfn, pvhp->fgmfn, pvhp->domid))
+		return -EFAULT;
+
+	return 0;
+}
+
+/* The only caller at moment passes one gmfn at a time.
+ * PVH TBD/FIXME: expand this in future to honor batch requests.
+ */
+static int pvh_remap_gmfn_range(struct vm_area_struct *vma,
+				unsigned long addr, unsigned long mfn, int nr,
+				pgprot_t prot, unsigned domid)
+{
+	int err;
+	struct pvh_remap_data pvhdata;
+
+	if (nr > 1)
+		return -EINVAL;
+
+	pvhdata.fgmfn = mfn;
+	pvhdata.prot = prot;
+	pvhdata.domid = domid;
+	pvhdata.vma = vma;
+	err = apply_to_page_range(vma->vm_mm, addr, nr << PAGE_SHIFT,
+				  pvh_map_pte_fn, &pvhdata);
+	flush_tlb_all();
+	return err;
+}
+
 #define REMAP_BATCH_SIZE 16

 struct remap_data {
@@ -2342,6 +2490,11 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
 	BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_RESERVED | VM_IO)) ==
				(VM_PFNMAP | VM_RESERVED | VM_IO)));

+	if (xen_pvh_domain()) {
+		/* We need to update the local page tables and the xen HAP */
+		return pvh_remap_gmfn_range(vma, addr, mfn, nr, prot, domid);
+	}
+
 	rmd.mfn = mfn;
 	rmd.prot = prot;

diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 73809bb..6d0bb56 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -23,4 +23,6 @@ unsigned long xen_read_cr2_direct(void);

 extern void xen_init_mmu_ops(void);
 extern void xen_hvm_init_mmu_ops(void);
+extern void xen_set_clr_mmio_pvh_pte(unsigned long pfn, unsigned long mfn,
+				     int nr_mfns, int add_mapping);
 #endif	/* _XEN_MMU_H */
diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h
index eac3ce1..1b213b1 100644
--- a/include/xen/interface/memory.h
+++ b/include/xen/interface/memory.h
@@ -163,10 +163,19 @@ struct xen_add_to_physmap {
     /* Which domain to change the mapping for. */
     domid_t domid;

+    /* Number of pages to go through for gmfn_range */
+    uint16_t size;
+
     /* Source mapping space. */
 #define XENMAPSPACE_shared_info 0 /* shared info page */
 #define XENMAPSPACE_grant_table 1 /* grant table page */
-    unsigned int space;
+#define XENMAPSPACE_gmfn         2 /* GMFN */
+#define XENMAPSPACE_gmfn_range   3 /* GMFN range */
+#define XENMAPSPACE_gmfn_foreign 4 /* GMFN from another guest */
+    uint16_t space;
+    domid_t foreign_domid; /* IFF XENMAPSPACE_gmfn_foreign */
+
+#define XENMAPIDX_grant_table_status 0x80000000

     /* Index into source mapping space. */
     unsigned long idx;
@@ -234,4 +243,20 @@ DEFINE_GUEST_HANDLE_STRUCT(xen_memory_map);
  * during a driver critical region.
  */
 extern spinlock_t xen_reservation_lock;
+
+/*
+ * Unmaps the page appearing at a particular GPFN from the specified guest's
+ * pseudophysical address space.
+ * arg == addr of xen_remove_from_physmap_t.
+ */
+#define XENMEM_remove_from_physmap    15
+struct xen_remove_from_physmap {
+    /* Which domain to change the mapping for. */
+    domid_t domid;
+
+    /* GPFN of the current mapping of the page. */
+    unsigned long gpfn;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_remove_from_physmap);
+
 #endif /* __XEN_PUBLIC_MEMORY_H__ */
diff --git a/include/xen/interface/physdev.h b/include/xen/interface/physdev.h
index 9ce788d..80f792e 100644
--- a/include/xen/interface/physdev.h
+++ b/include/xen/interface/physdev.h
@@ -258,6 +258,16 @@ struct physdev_pci_device {
 	uint8_t devfn;
 };

+#define PHYSDEVOP_pvh_map_iomem    29
+struct physdev_map_iomem {
+	/* IN */
+	uint64_t first_gfn;
+	uint64_t first_mfn;
+	uint32_t nr_mfns;
+	uint32_t add_mapping;	/* 1 == add mapping; 0 == unmap */
+
+};
+
 /*
  * Notify that some PIRQ-bound event channels have been unmasked.
  * ** This command is obsolete since interface version 0x00030202 and is **
diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
index 6a198e4..fa595e1 100644
--- a/include/xen/xen-ops.h
+++ b/include/xen/xen-ops.h
@@ -29,4 +29,11 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
 			       unsigned long mfn, int nr,
 			       pgprot_t prot, unsigned domid);

+struct xen_pvh_sav_pfn_info {
+	struct page **sp_paga;	/* save pfn (info) page array */
+	int sp_num_pgs;
+	int sp_next_todo;
+};
+extern int pvh_rem_xen_p2m(unsigned long spfn, int count);
+
 #endif /* INCLUDE_XEN_OPS_H */
-- 
1.7.2.3
Stefano Stabellini
2012-Aug-16 14:10 UTC
Re: [RFC PATCH 3/8]: PVH: memory manager and paging related changes
On Thu, 16 Aug 2012, Mukesh Rathor wrote:
> [...]
>
>  void __init xen_init_mmu_ops(void)
>  {
> +	x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
> +
> +	if (xen_pvh_domain()) {
> +		pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others;
> +
> +		/* set_pte* for PCI devices to map iomem. */
> +		if (xen_initial_domain()) {
> +			pv_mmu_ops.set_pte = xen_dom0pvh_set_pte;
> +			pv_mmu_ops.set_pte_at = xen_dom0pvh_set_pte_at;
> +		}
> +		return;
> +	}

Considering that the implementation of xen_dom0pvh_set_pte is
native_set_pte, can't we just leave it to the default, that is,
native_set_pte?
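For illustration, a minimal sketch (untested) of the simplification being
suggested; it assumes the stock pv_mmu_ops template already carries
native_set_pte/native_set_pte_at as its defaults, so only the hooks that
genuinely differ from native are overridden:

	void __init xen_init_mmu_ops(void)
	{
		x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;

		if (xen_pvh_domain()) {
			/* Page tables are native; only remote TLB flushes
			 * need Xen's help.  set_pte/set_pte_at keep the
			 * native_set_pte* defaults already present in
			 * pv_mmu_ops, so the identity wrappers go away. */
			pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others;
			return;
		}

		x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve;
		x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
		pv_mmu_ops = xen_mmu_ops;

		memset(dummy_mapping, 0xff, PAGE_SIZE);
	}

With this shape, xen_dom0pvh_set_pte, xen_dom0pvh_set_pte_at and the
dom0-only hook assignments disappear entirely.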
> [...]
>
> diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h
> index eac3ce1..1b213b1 100644
> --- a/include/xen/interface/memory.h
> +++ b/include/xen/interface/memory.h
> @@ -163,10 +163,19 @@ struct xen_add_to_physmap {
>      /* Which domain to change the mapping for. */
>      domid_t domid;
>
> +    /* Number of pages to go through for gmfn_range */
> +    uint16_t size;
> +
>      /* Source mapping space. */
>  #define XENMAPSPACE_shared_info 0 /* shared info page */
>  #define XENMAPSPACE_grant_table 1 /* grant table page */
> -    unsigned int space;
> +#define XENMAPSPACE_gmfn         2 /* GMFN */
> +#define XENMAPSPACE_gmfn_range   3 /* GMFN range */
> +#define XENMAPSPACE_gmfn_foreign 4 /* GMFN from another guest */
> +    uint16_t space;
> +    domid_t foreign_domid; /* IFF XENMAPSPACE_gmfn_foreign */
> +
> +#define XENMAPIDX_grant_table_status 0x80000000

As you have seen, I have a very similar patch in my series.

> [...]
Ian Campbell
2012-Aug-16 14:15 UTC
Re: [RFC PATCH 3/8]: PVH: memory manager and paging related changes
On Thu, 2012-08-16 at 15:10 +0100, Stefano Stabellini wrote:
> On Thu, 16 Aug 2012, Mukesh Rathor wrote:
> [...]

Please can you trim your quotes? Getting a repeat of hundreds of lines
of quote for just a couple of lines of new content makes it quite hard
to spot the actual content.

Ian.
Ian Campbell
2012-Aug-17 09:26 UTC
Re: [RFC PATCH 3/8]: PVH: memory manager and paging related changes
On Thu, 2012-08-16 at 02:02 +0100, Mukesh Rathor wrote:
> ---
>  arch/x86/xen/mmu.c              |  179 ++++++++++++++++++++++++++++++++++++---
>  arch/x86/xen/mmu.h              |    2 +
>  include/xen/interface/memory.h  |   27 ++++++-
>  include/xen/interface/physdev.h |   10 ++
>  include/xen/xen-ops.h           |    7 ++
>  5 files changed, 211 insertions(+), 14 deletions(-)
>
> diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
> index b65a761..44a6477 100644
> --- a/arch/x86/xen/mmu.c
> +++ b/arch/x86/xen/mmu.c
> @@ -330,6 +330,38 @@ static void xen_set_pte(pte_t *ptep, pte_t pteval)
>  	__xen_set_pte(ptep, pteval);
>  }
>
> +/* This for PV guest in hvm container */

Shall I stop mentioning uses of "HVM" in the context of PVH? ;-)

> +void xen_set_clr_mmio_pvh_pte(unsigned long pfn, unsigned long mfn,
> +			      int nr_mfns, int add_mapping)
> +{
> +	int rc;
> +	struct physdev_map_iomem iomem;
> +
> +	iomem.first_gfn = pfn;
> +	iomem.first_mfn = mfn;
> +	iomem.nr_mfns = nr_mfns;
> +	iomem.add_mapping = add_mapping;
> +
> +	rc = HYPERVISOR_physdev_op(PHYSDEVOP_pvh_map_iomem, &iomem);
> +	BUG_ON(rc);

It's purely a matter of taste but we would tend to write

	if (HYPERVISOR_foo(...))
		BUG();
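For illustration, the hunk above rewritten in that style might look like
this (a sketch only; the designated-initializer form is a separate
tidy-up, not something the review asked for):

	void xen_set_clr_mmio_pvh_pte(unsigned long pfn, unsigned long mfn,
				      int nr_mfns, int add_mapping)
	{
		struct physdev_map_iomem iomem = {
			.first_gfn   = pfn,
			.first_mfn   = mfn,
			.nr_mfns     = nr_mfns,
			.add_mapping = add_mapping,
		};

		/* Mapping iomem must not fail; crash loudly if it does. */
		if (HYPERVISOR_physdev_op(PHYSDEVOP_pvh_map_iomem, &iomem))
			BUG();
	}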
> +}
> +
> +/* This for PV guest in hvm container.
> + * We need this because during boot early_ioremap path eventually calls
> + * set_pte that maps io space. Also, ACPI pages are not mapped into to the
> + * EPT during dom0 creation. The pages are mapped initially here from
> + * kernel_physical_mapping_init() then later the memtype is changed. */
> +static void xen_dom0pvh_set_pte(pte_t *ptep, pte_t pteval)
> +{
> +	native_set_pte(ptep, pteval);
> +}
> +
> +static void xen_dom0pvh_set_pte_at(struct mm_struct *mm, unsigned long addr,
> +				   pte_t *ptep, pte_t pteval)
> +{
> +	native_set_pte(ptep, pteval);
> +}

I think Stefano already asked why not just use native_* as the hook.

> +
>  static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
>  		    pte_t *ptep, pte_t pteval)
>  {
> @@ -1197,6 +1229,10 @@ static void xen_post_allocator_init(void);
>  static void __init xen_pagetable_setup_done(pgd_t *base)
>  {
>  	xen_setup_shared_info();
> +
> +	if (xen_pvh_domain())
> +		return;
> +
>  	xen_post_allocator_init();
>  }
>
> @@ -1652,6 +1688,10 @@ static void set_page_prot(void *addr, pgprot_t prot)
>  	unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
>  	pte_t pte = pfn_pte(pfn, prot);
>
> +	/* for PVH, page tables are native. */
> +	if (xen_pvh_domain())
> +		return;
> +
>  	if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
>  		BUG();
>  }
> @@ -1745,6 +1785,7 @@ static void convert_pfn_mfn(void *v)
>  * but that's enough to get __va working.  We need to fill in the rest
>  * of the physical mapping once some sort of allocator has been set
>  * up.
> + * NOTE: for PVH, the page tables are native with HAP required.

OOI does this mean shadow doesn't work?

> @@ -1761,10 +1802,12 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
>  	/* Zap identity mapping */
>  	init_level4_pgt[0] = __pgd(0);
>
> -	/* Pre-constructed entries are in pfn, so convert to mfn */
> -	convert_pfn_mfn(init_level4_pgt);
> -	convert_pfn_mfn(level3_ident_pgt);
> -	convert_pfn_mfn(level3_kernel_pgt);
> +	if (!xen_pvh_domain()) {

Does this test really mean if (XENFEAT_auto_translated_physmap)?
And maybe it fits more logically in convert_pfn_mfn itself?
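For illustration, a sketch of what folding the test into the helper could
look like; the loop body is the existing convert_pfn_mfn(), and the early
return assumes an auto-translated (PVH) guest always advertises
XENFEAT_auto_translated_physmap:

	static void convert_pfn_mfn(void *v)
	{
		pte_t *pte = v;
		int i;

		/* Auto-translated guests run on native page tables whose
		 * entries already hold pfns: nothing to convert. */
		if (xen_feature(XENFEAT_auto_translated_physmap))
			return;

		/* All levels are converted the same way, so just treat them
		   as ptes. */
		for (i = 0; i < PTRS_PER_PTE; i++)
			pte[i] = xen_make_pte(pte[i].pte);
	}

The callers could then drop their xen_pvh_domain() guards and call the
helper unconditionally.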
> +		/* Pre-constructed entries are in pfn, so convert to mfn */
> +		convert_pfn_mfn(init_level4_pgt);
> +		convert_pfn_mfn(level3_ident_pgt);
> +		convert_pfn_mfn(level3_kernel_pgt);
> +	}
>
>  	l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
>  	l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
> @@ -1787,12 +1830,14 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
>  	set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
>  	set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
>
> -	/* Pin down new L4 */
> -	pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
> -			  PFN_DOWN(__pa_symbol(init_level4_pgt)));
> +	if (!xen_pvh_domain()) {

XENFEAT_writeable_page_tables? And inside pin_pagetable_pfn perhaps?

> +		/* Pin down new L4 */
> +		pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
> +				  PFN_DOWN(__pa_symbol(init_level4_pgt)));
>
> -	/* Unpin Xen-provided one */
> -	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
> +		/* Unpin Xen-provided one */
> +		pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
> +	}
>
>  	/* Switch over */
>  	pgd = init_level4_pgt;
> @@ -1802,9 +1847,13 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
>  	 * structure to attach it to, so make sure we just set kernel
>  	 * pgd.
>  	 */
> -	xen_mc_batch();
> -	__xen_write_cr3(true, __pa(pgd));
> -	xen_mc_issue(PARAVIRT_LAZY_CPU);

Not your issue but I wonder why we do this weird looking singleton batch?

> +	if (xen_pvh_domain()) {
> +		native_write_cr3(__pa(pgd));
> +	} else {
> +		xen_mc_batch();
> +		__xen_write_cr3(true, __pa(pgd));
> +		xen_mc_issue(PARAVIRT_LAZY_CPU);
> +	}
>
>  	memblock_reserve(__pa(xen_start_info->pt_base),
>  			 xen_start_info->nr_pt_frames * PAGE_SIZE);
> @@ -2067,9 +2116,21 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
>
>  void __init xen_init_mmu_ops(void)
>  {
> +	x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;

I'd be tempted to leave this below, where it fits in alongside the
others, and duplicate it in the if as necessary.

Given that the two cases are basically non-overlapping perhaps you want
xen_init_(pv,pvh)_mmu_ops as separate hooks? Looking back the same
might apply to the pagetable_setup_done hook?

> +
> +	if (xen_pvh_domain()) {
> +		pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others;
> +
> +		/* set_pte* for PCI devices to map iomem. */
> +		if (xen_initial_domain()) {
> +			pv_mmu_ops.set_pte = xen_dom0pvh_set_pte;
> +			pv_mmu_ops.set_pte_at = xen_dom0pvh_set_pte_at;

Is this wrong for domU or have you just not tried/implemented
passthrough support yet?

> +		}
> +		return;
> +	}
> +
>  	x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve;
>  	x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
> -	x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
>  	pv_mmu_ops = xen_mmu_ops;
>
>  	memset(dummy_mapping, 0xff, PAGE_SIZE);
> @@ -2305,6 +2366,93 @@ void __init xen_hvm_init_mmu_ops(void)
>  }
>  #endif
>
> +/* Map foreign gmfn, fgmfn, to local pfn, lpfn. This for the user space
> + * creating new guest on PVH dom0 and needs to map domU pages. Called from
> + * exported function, so no need to export this.
> + */
> +static int pvh_add_to_xen_p2m(unsigned long lpfn, unsigned long fgmfn,
> +			      unsigned int domid)
> +{
> +	int rc;
> +	struct xen_add_to_physmap pmb = {.foreign_domid = domid};

I'm not sure but I think CodingStyle would want spaces inside the {}s.

What is the b in pmb? Phys Map B??? (every other user of this
interface says xatp, FWIW)

> +
> +	pmb.gpfn = lpfn;
> +	pmb.idx = fgmfn;
> +	pmb.space = XENMAPSPACE_gmfn_foreign;
> +	rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &pmb);
> +	if (rc) {
> +		pr_warn("Failed to map pfn to mfn rc:%d pfn:%lx mfn:%lx\n",
> +			rc, lpfn, fgmfn);
> +		return 1;
> +	}
> +	return 0;
> +}
> +
> +/* Unmap an entry from xen p2m table */

Actually it is @count entries, starting at spfn. That seems obvious
enough from the prototype not to be worth saying though.

> +int pvh_rem_xen_p2m(unsigned long spfn, int count)
> +{
> +	struct xen_remove_from_physmap xrp;
> +	int i, rc;
> +
> +	for (i=0; i < count; i++) {
> +		xrp.domid = DOMID_SELF;
> +		xrp.gpfn = spfn+i;
> +		rc = HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp);
> +		if (rc) {
> +			pr_warn("Failed to unmap pfn:%lx rc:%d done:%d\n",
> +				spfn+i, rc, i);
> +			return 1;
> +		}
> +	}
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(pvh_rem_xen_p2m);

What is the external/modular user of this?

I guess this is why you noted that pvh_add_to_xen_p2m didn't need
exporting, which struck me as unnecessary at the time.

pvh_add_to_xen_p2m seems to have an exported wrapper, why not rem
too? e.g. xen_unmap_domain_mfn_range?
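For illustration, a sketch of the kind of exported wrapper being
suggested here; the name and signature are hypothetical, and it simply
defers to the existing helper:

	/* Counterpart to xen_remap_domain_mfn_range() for the
	 * auto-translated case: tear the p2m entries back down.
	 * Note pvh_rem_xen_p2m() currently returns 1 on failure,
	 * not a -ve errno, so callers would treat this as boolean. */
	int xen_unmap_domain_mfn_range(unsigned long spfn, int count)
	{
		if (!xen_pvh_domain())
			return -EINVAL;

		return pvh_rem_xen_p2m(spfn, count);
	}
	EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range);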
> +
> +struct pvh_remap_data {
> +	unsigned long fgmfn;		/* foreign domain's gmfn */
> +	pgprot_t prot;
> +	domid_t  domid;
> +	struct vm_area_struct *vma;
> +};
> +
> +static int pvh_map_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr,
> +			  void *data)
> +{
> +	struct pvh_remap_data *pvhp = data;
> +	struct xen_pvh_sav_pfn_info *savp = pvhp->vma->vm_private_data;
> +	unsigned long pfn = page_to_pfn(savp->sp_paga[savp->sp_next_todo++]);
> +	pte_t pteval = pte_mkspecial(pfn_pte(pfn, pvhp->prot));
> +
> +	native_set_pte(ptep, pteval);
> +	if (pvh_add_to_xen_p2m(pfn, pvhp->fgmfn, pvhp->domid))
> +		return -EFAULT;

Is there a little window here where we've set up the page table entry
but the p2m entry behind it is uninitialised?

I suppose even if an interrupt occurred we can rely on the virtual
address not having been "exposed" anywhere yet and therefore there is
no chance of anyone dereferencing it. But is there any harm in just
flipping the ordering here?

Why EFAULT? Seems a bit random; I think HYPERVISOR_memory_op returns a
-ve errno which you could propagate.

> diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h
> index eac3ce1..1b213b1 100644
> --- a/include/xen/interface/memory.h
> +++ b/include/xen/interface/memory.h
> @@ -163,10 +163,19 @@ struct xen_add_to_physmap {
>      /* Which domain to change the mapping for. */
>      domid_t domid;
>
> +    /* Number of pages to go through for gmfn_range */
> +    uint16_t size;
> +
>      /* Source mapping space. */
>  #define XENMAPSPACE_shared_info 0 /* shared info page */
>  #define XENMAPSPACE_grant_table 1 /* grant table page */
> -    unsigned int space;
> +#define XENMAPSPACE_gmfn         2 /* GMFN */
> +#define XENMAPSPACE_gmfn_range   3 /* GMFN range */
> +#define XENMAPSPACE_gmfn_foreign 4 /* GMFN from another guest */
> +    uint16_t space;
> +    domid_t foreign_domid; /* IFF XENMAPSPACE_gmfn_foreign */
> +
> +#define XENMAPIDX_grant_table_status 0x80000000

Even if we don't get the full PVH implementation in the hypervisor
right away I think we should at least update the canonical interface
definition in the Xen tree (xen/include/public) first. (The same goes
for the other interface definitions here.)

Ian.

Mukesh Rathor
2012-Aug-17 23:40 UTC
Re: [RFC PATCH 3/8]: PVH: memory manager and paging related changes
On Thu, 16 Aug 2012 15:10:03 +0100
Stefano Stabellini <stefano.stabellini@eu.citrix.com> wrote:

> >  void __init xen_init_mmu_ops(void)
> >  {
> > +	x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
> > +
> > +	if (xen_pvh_domain()) {
> > +		pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others;
> > +
> > +		/* set_pte* for PCI devices to map iomem. */
> > +		if (xen_initial_domain()) {
> > +			pv_mmu_ops.set_pte = xen_dom0pvh_set_pte;
> > +			pv_mmu_ops.set_pte_at = xen_dom0pvh_set_pte_at;
> > +		}
> > +		return;
> > +	}
>
> Considering that the implementation of xen_dom0pvh_set_pte is
> native_set_pte, can't we just leave it to the default, that is,
> native_set_pte?

We can. I had more code in the function that I removed when I mapped
the entire IO space up front. Then, I had a lot of debug code there.
I can change it to native now.
Mukesh Rathor
2012-Aug-17 23:49 UTC
Re: [RFC PATCH 3/8]: PVH: memory manager and paging related changes
On Fri, 17 Aug 2012 10:26:15 +0100
Ian Campbell <Ian.Campbell@citrix.com> wrote:

> On Thu, 2012-08-16 at 02:02 +0100, Mukesh Rathor wrote:
> > +
> >  	if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
> >  		BUG();
> >  }
> > @@ -1745,6 +1785,7 @@ static void convert_pfn_mfn(void *v)
> >  * but that's enough to get __va working.  We need to fill in the rest
> >  * of the physical mapping once some sort of allocator has been set
> >  * up.
> > + * NOTE: for PVH, the page tables are native with HAP required.
>
> OOI does this mean shadow doesn't work?

Most likely not. Will need to explore that. Later.

> > +
> > +	if (xen_pvh_domain()) {
> > +		pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others;
> > +
> > +		/* set_pte* for PCI devices to map iomem. */
> > +		if (xen_initial_domain()) {
> > +			pv_mmu_ops.set_pte = xen_dom0pvh_set_pte;
> > +			pv_mmu_ops.set_pte_at = xen_dom0pvh_set_pte_at;
>
> Is this wrong for domU or have you just not tried/implemented
> passthrough support yet?

No, passthrough is phase II/III. Too big a project for one person to do
in a single shot as it is ;)..

> > + * exported function, so no need to export this.
> > + */
> > +static int pvh_add_to_xen_p2m(unsigned long lpfn, unsigned long fgmfn,
> > +			      unsigned int domid)
> > +{
> > +	int rc;
> > +	struct xen_add_to_physmap pmb = {.foreign_domid = domid};
>
> I'm not sure but I think CodingStyle would want spaces inside the {}s.
>
> What is the b in pmb? Phys Map B??? (every other user of this
> interface says xatp, FWIW)

Because originally it was a batch function but got changed after the
code review earlier. I will rename it.

> > +		rc = HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp);
> > +		if (rc) {
> > +			pr_warn("Failed to unmap pfn:%lx rc:%d done:%d\n",
> > +				spfn+i, rc, i);
> > +			return 1;
> > +		}
> > +	}
> > +	return 0;
> > +}
> > +EXPORT_SYMBOL_GPL(pvh_rem_xen_p2m);
>
> What is the external/modular user of this?

privcmd.

> I guess this is why you noted that pvh_add_to_xen_p2m didn't need
> exporting, which struck me as unnecessary at the time.
>
> pvh_add_to_xen_p2m seems to have an exported wrapper, why not rem
> too? e.g. xen_unmap_domain_mfn_range?

I believe unmapping happens thru native code. So, we just need to
remove the entries from the EPT. Ok, I mean, HAP.

> > +
> > +	native_set_pte(ptep, pteval);
> > +	if (pvh_add_to_xen_p2m(pfn, pvhp->fgmfn, pvhp->domid))
> > +		return -EFAULT;
>
> Is there a little window here where we've set up the page table entry
> but the p2m entry behind it is uninitialised?
>
> I suppose even if an interrupt occurred we can rely on the virtual
> address not having been "exposed" anywhere yet and therefore there is
> no chance of anyone dereferencing it. But is there any harm in just
> flipping the ordering here?

Should be ok flipping the order.
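For illustration, a sketch of the reordered callback; it also folds in
the errno propagation Ian raises next, so it assumes pvh_add_to_xen_p2m()
is changed to return the hypercall's negative errno rather than 1:

	static int pvh_map_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr,
				  void *data)
	{
		struct pvh_remap_data *pvhp = data;
		struct xen_pvh_sav_pfn_info *savp = pvhp->vma->vm_private_data;
		unsigned long pfn = page_to_pfn(savp->sp_paga[savp->sp_next_todo++]);
		pte_t pteval = pte_mkspecial(pfn_pte(pfn, pvhp->prot));
		int rc;

		/* Fill the p2m first so the pte never points at an
		 * uninitialised translation. */
		rc = pvh_add_to_xen_p2m(pfn, pvhp->fgmfn, pvhp->domid);
		if (rc)
			return rc;	/* propagate Xen's -ve errno */

		native_set_pte(ptep, pteval);
		return 0;
	}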
> Why EFAULT? Seems a bit random; I think HYPERVISOR_memory_op returns a
> -ve errno which you could propagate.

Ok, sounds good.

Ian Campbell
2012-Sep-14 16:20 UTC
Re: [RFC PATCH 3/8]: PVH: memory manager and paging related changes
On Thu, 2012-08-16 at 02:02 +0100, Mukesh Rathor wrote:
> +/* Map foreign gmfn, fgmfn, to local pfn, lpfn. This for the user space
> + * creating new guest on PVH dom0 and needs to map domU pages. Called from
> + * exported function, so no need to export this.
> + */
> +static int pvh_add_to_xen_p2m(unsigned long lpfn, unsigned long fgmfn,
> +			      unsigned int domid)
> +{
> +	int rc;
> +	struct xen_add_to_physmap pmb = {.foreign_domid = domid};
> +
> +	pmb.gpfn = lpfn;
> +	pmb.idx = fgmfn;
> +	pmb.space = XENMAPSPACE_gmfn_foreign;

You've not initialised pmb.domid here. I think you want/need to set it
to DOMID_SELF and have the h/v side behave accordingly.

> +	rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &pmb);
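For illustration, a sketch of an initializer covering all the fields at
once, including the missing domid; it also adopts the xatp name suggested
earlier in the thread:

	struct xen_add_to_physmap xatp = {
		.domid         = DOMID_SELF,	/* map into our own physmap */
		.foreign_domid = domid,		/* pages come from domid */
		.space         = XENMAPSPACE_gmfn_foreign,
		.idx           = fgmfn,
		.gpfn          = lpfn,
	};

The designated initializer zeroes the remaining fields (such as size),
which the non-range XENMAPSPACE_gmfn_foreign path should ignore anyway.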