--- arch/x86/xen/mmu.c | 180 +++++++++++++++++++++++++++++++++++++++++++++++-- arch/x86/xen/mmu.h | 2 + include/xen/xen-ops.h | 12 +++- 3 files changed, 188 insertions(+), 6 deletions(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index b65a761..9b5a5ef 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -73,6 +73,7 @@ #include <xen/interface/version.h> #include <xen/interface/memory.h> #include <xen/hvc-console.h> +#include <xen/balloon.h> #include "multicalls.h" #include "mmu.h" @@ -330,6 +331,26 @@ static void xen_set_pte(pte_t *ptep, pte_t pteval) __xen_set_pte(ptep, pteval); } +void xen_set_clr_mmio_pvh_pte(unsigned long pfn, unsigned long mfn, + int nr_mfns, int add_mapping) +{ + struct physdev_map_iomem iomem; + + iomem.first_gfn = pfn; + iomem.first_mfn = mfn; + iomem.nr_mfns = nr_mfns; + iomem.add_mapping = add_mapping; + + if (HYPERVISOR_physdev_op(PHYSDEVOP_pvh_map_iomem, &iomem)) + BUG(); +} + +static void xen_dom0pvh_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval) +{ + native_set_pte(ptep, pteval); +} + static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { @@ -1197,6 +1218,10 @@ static void xen_post_allocator_init(void); static void __init xen_pagetable_setup_done(pgd_t *base) { xen_setup_shared_info(); + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + xen_post_allocator_init(); } @@ -1455,6 +1480,10 @@ static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) { struct mmuext_op op; + + if (xen_feature(XENFEAT_writable_page_tables)) + return; + op.cmd = cmd; op.arg1.mfn = pfn_to_mfn(pfn); if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) @@ -1652,6 +1681,10 @@ static void set_page_prot(void *addr, pgprot_t prot) unsigned long pfn = __pa(addr) >> PAGE_SHIFT; pte_t pte = pfn_pte(pfn, prot); + /* recall for PVH, page tables are native. */ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0)) BUG(); } @@ -1729,6 +1762,9 @@ static void convert_pfn_mfn(void *v) pte_t *pte = v; int i; + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + /* All levels are converted the same way, so just treat them as ptes. */ for (i = 0; i < PTRS_PER_PTE; i++) @@ -1745,6 +1781,7 @@ static void convert_pfn_mfn(void *v) * but that''s enough to get __va working. We need to fill in the rest * of the physical mapping once some sort of allocator has been set * up. + * NOTE: for PVH, the page tables are native. */ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) @@ -1802,9 +1839,13 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, * structure to attach it to, so make sure we just set kernel * pgd. */ - xen_mc_batch(); - __xen_write_cr3(true, __pa(pgd)); - xen_mc_issue(PARAVIRT_LAZY_CPU); + if (xen_feature(XENFEAT_writable_page_tables)) { + native_write_cr3(__pa(pgd)); + } else { + xen_mc_batch(); + __xen_write_cr3(true, __pa(pgd)); + xen_mc_issue(PARAVIRT_LAZY_CPU); + } memblock_reserve(__pa(xen_start_info->pt_base), xen_start_info->nr_pt_frames * PAGE_SIZE); @@ -2067,9 +2108,21 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { void __init xen_init_mmu_ops(void) { + x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; + + if (xen_feature(XENFEAT_auto_translated_physmap)) { + pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others; + + /* set_pte* for PCI devices to map iomem. 
*/ + if (xen_initial_domain()) { + pv_mmu_ops.set_pte = native_set_pte; + pv_mmu_ops.set_pte_at = xen_dom0pvh_set_pte_at; + } + return; + } + x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve; x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start; - x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; pv_mmu_ops = xen_mmu_ops; memset(dummy_mapping, 0xff, PAGE_SIZE); @@ -2305,6 +2358,92 @@ void __init xen_hvm_init_mmu_ops(void) } #endif +/* Map foreign gmfn, fgmfn, to local pfn, lpfn. This for the user space + * creating new guest on PVH dom0 and needs to map domU pages. + */ +static int pvh_add_to_xen_p2m(unsigned long lpfn, unsigned long fgmfn, + unsigned int domid) +{ + int rc; + struct xen_add_to_physmap xatp = { .u.foreign_domid = domid }; + + xatp.gpfn = lpfn; + xatp.idx = fgmfn; + xatp.domid = DOMID_SELF; + xatp.space = XENMAPSPACE_gmfn_foreign; + rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp); + if (rc) + pr_warn("d0: Failed to map pfn to mfn rc:%d pfn:%lx mfn:%lx\n", + rc, lpfn, fgmfn); + return rc; +} + +int pvh_rem_xen_p2m(unsigned long spfn, int count) +{ + struct xen_remove_from_physmap xrp; + int i, rc; + + for (i=0; i < count; i++) { + xrp.domid = DOMID_SELF; + xrp.gpfn = spfn+i; + rc = HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp); + if (rc) { + pr_warn("Failed to unmap pfn:%lx rc:%d done:%d\n", + spfn+i, rc, i); + return 1; + } + } + return 0; +} +EXPORT_SYMBOL_GPL(pvh_rem_xen_p2m); + +struct pvh_remap_data { + unsigned long fgmfn; /* foreign domain''s gmfn */ + pgprot_t prot; + domid_t domid; + struct xen_pvh_pfn_info *pvhinfop; +}; + +static int pvh_map_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr, + void *data) +{ + int rc; + struct pvh_remap_data *remapp = data; + struct xen_pvh_pfn_info *pvhp = remapp->pvhinfop; + unsigned long pfn = page_to_pfn(pvhp->pi_paga[pvhp->pi_next_todo++]); + pte_t pteval = pte_mkspecial(pfn_pte(pfn, remapp->prot)); + + if ((rc=pvh_add_to_xen_p2m(pfn, remapp->fgmfn, remapp->domid))) + return rc; + native_set_pte(ptep, pteval); + + return 0; +} + +/* The only caller at moment passes one gmfn at a time. + * PVH TBD/FIXME: expand this in future to honor batch requests. 
+ */ +static int pvh_remap_gmfn_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long mfn, int nr, + pgprot_t prot, unsigned domid, + struct xen_pvh_pfn_info *pvhp) +{ + int err; + struct pvh_remap_data pvhdata; + + if (nr > 1) + return -EINVAL; + + pvhdata.fgmfn = mfn; + pvhdata.prot = prot; + pvhdata.domid = domid; + pvhdata.pvhinfop = pvhp; + err = apply_to_page_range(vma->vm_mm, addr, nr << PAGE_SHIFT, + pvh_map_pte_fn, &pvhdata); + flush_tlb_all(); + return err; +} + #define REMAP_BATCH_SIZE 16 struct remap_data { @@ -2329,7 +2468,9 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, int xen_remap_domain_mfn_range(struct vm_area_struct *vma, unsigned long addr, unsigned long mfn, int nr, - pgprot_t prot, unsigned domid) + pgprot_t prot, unsigned domid, + struct xen_pvh_pfn_info *pvhp) + { struct remap_data rmd; struct mmu_update mmu_update[REMAP_BATCH_SIZE]; @@ -2342,6 +2483,12 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma, BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_RESERVED | VM_IO)) = (VM_PFNMAP | VM_RESERVED | VM_IO))); + if (xen_feature(XENFEAT_auto_translated_physmap)) { + /* We need to update the local page tables and the xen HAP */ + return pvh_remap_gmfn_range(vma, addr, mfn, nr, prot, domid, + pvhp); + } + rmd.mfn = mfn; rmd.prot = prot; @@ -2371,3 +2518,26 @@ out: return err; } EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); + +/* Returns: Number of pages unmapped */ +int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, + struct xen_pvh_pfn_info *pvhp) +{ + int count = 0; + + if (!pvhp || !xen_feature(XENFEAT_auto_translated_physmap)) + return 0; + + while (pvhp->pi_next_todo--) { + unsigned long pfn; + + /* the mmu has already cleaned up the process mmu resources at + * this point (lookup_address will return NULL). */ + pfn = page_to_pfn(pvhp->pi_paga[pvhp->pi_next_todo]); + pvh_rem_xen_p2m(pfn, 1); + count++; + } + flush_tlb_all(); + return count; +} +EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range); diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index 73809bb..6d0bb56 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -23,4 +23,6 @@ unsigned long xen_read_cr2_direct(void); extern void xen_init_mmu_ops(void); extern void xen_hvm_init_mmu_ops(void); +extern void xen_set_clr_mmio_pvh_pte(unsigned long pfn, unsigned long mfn, + int nr_mfns, int add_mapping); #endif /* _XEN_MMU_H */ diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index 6a198e4..6c5ad83 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -24,9 +24,19 @@ int xen_create_contiguous_region(unsigned long vstart, unsigned int order, void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order); struct vm_area_struct; +struct xen_pvh_pfn_info; int xen_remap_domain_mfn_range(struct vm_area_struct *vma, unsigned long addr, unsigned long mfn, int nr, - pgprot_t prot, unsigned domid); + pgprot_t prot, unsigned domid, + struct xen_pvh_pfn_info *pvhp); +int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, + struct xen_pvh_pfn_info *pvhp); + +struct xen_pvh_pfn_info { + struct page **pi_paga; /* pfn info page array */ + int pi_num_pgs; + int pi_next_todo; +}; #endif /* INCLUDE_XEN_OPS_H */ -- 1.7.2.3
There are few code style issues on this patch, I suggest you run it through scripts/checkpatch.pl, it should be able to catch all these errors. On Fri, 21 Sep 2012, Mukesh Rathor wrote:> --- > arch/x86/xen/mmu.c | 180 +++++++++++++++++++++++++++++++++++++++++++++++-- > arch/x86/xen/mmu.h | 2 + > include/xen/xen-ops.h | 12 +++- > 3 files changed, 188 insertions(+), 6 deletions(-) > > diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c > index b65a761..9b5a5ef 100644 > --- a/arch/x86/xen/mmu.c > +++ b/arch/x86/xen/mmu.c > @@ -73,6 +73,7 @@ > #include <xen/interface/version.h> > #include <xen/interface/memory.h> > #include <xen/hvc-console.h> > +#include <xen/balloon.h> > > #include "multicalls.h" > #include "mmu.h" > @@ -330,6 +331,26 @@ static void xen_set_pte(pte_t *ptep, pte_t pteval) > __xen_set_pte(ptep, pteval); > } > > +void xen_set_clr_mmio_pvh_pte(unsigned long pfn, unsigned long mfn, > + int nr_mfns, int add_mapping) > +{ > + struct physdev_map_iomem iomem; > + > + iomem.first_gfn = pfn; > + iomem.first_mfn = mfn; > + iomem.nr_mfns = nr_mfns; > + iomem.add_mapping = add_mapping; > + > + if (HYPERVISOR_physdev_op(PHYSDEVOP_pvh_map_iomem, &iomem)) > + BUG(); > +} > + > +static void xen_dom0pvh_set_pte_at(struct mm_struct *mm, unsigned long addr, > + pte_t *ptep, pte_t pteval) > +{ > + native_set_pte(ptep, pteval); > +}can''t you just set set_pte_at to native_set_pte?> static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, > pte_t *ptep, pte_t pteval) > { > @@ -1197,6 +1218,10 @@ static void xen_post_allocator_init(void); > static void __init xen_pagetable_setup_done(pgd_t *base) > { > xen_setup_shared_info(); > + > + if (xen_feature(XENFEAT_auto_translated_physmap)) > + return; > + > xen_post_allocator_init(); > }Can we move the if..return at the beginning of xen_post_allocator_init? It would make it easier to understand what it is for. Or maybe we could have xen_setup_shared_info take a pgd_t *base parameter so that you can set pagetable_setup_done to xen_setup_shared_info directly on pvh.> @@ -1455,6 +1480,10 @@ static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) > static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) > { > struct mmuext_op op; > + > + if (xen_feature(XENFEAT_writable_page_tables)) > + return; > + > op.cmd = cmd; > op.arg1.mfn = pfn_to_mfn(pfn); > if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))given the very limited set of mmu pvops that we initialize on pvh, I cannot find who would actually call pin_pagetable_pfn on pvh.> @@ -1652,6 +1681,10 @@ static void set_page_prot(void *addr, pgprot_t prot) > unsigned long pfn = __pa(addr) >> PAGE_SHIFT; > pte_t pte = pfn_pte(pfn, prot); > > + /* recall for PVH, page tables are native. */ > + if (xen_feature(XENFEAT_auto_translated_physmap)) > + return; > + > if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0)) > BUG(); > } > @@ -1729,6 +1762,9 @@ static void convert_pfn_mfn(void *v) > pte_t *pte = v; > int i; > > + if (xen_feature(XENFEAT_auto_translated_physmap)) > + return; > + > /* All levels are converted the same way, so just treat them > as ptes. */ > for (i = 0; i < PTRS_PER_PTE; i++) > @@ -1745,6 +1781,7 @@ static void convert_pfn_mfn(void *v) > * but that''s enough to get __va working. We need to fill in the rest > * of the physical mapping once some sort of allocator has been set > * up. > + * NOTE: for PVH, the page tables are native. 
> */ > pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, > unsigned long max_pfn) > @@ -1802,9 +1839,13 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, > * structure to attach it to, so make sure we just set kernel > * pgd. > */ > - xen_mc_batch(); > - __xen_write_cr3(true, __pa(pgd)); > - xen_mc_issue(PARAVIRT_LAZY_CPU); > + if (xen_feature(XENFEAT_writable_page_tables)) { > + native_write_cr3(__pa(pgd)); > + } else { > + xen_mc_batch(); > + __xen_write_cr3(true, __pa(pgd)); > + xen_mc_issue(PARAVIRT_LAZY_CPU); > + } > > memblock_reserve(__pa(xen_start_info->pt_base), > xen_start_info->nr_pt_frames * PAGE_SIZE); > @@ -2067,9 +2108,21 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { > > void __init xen_init_mmu_ops(void) > { > + x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; > + > + if (xen_feature(XENFEAT_auto_translated_physmap)) { > + pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others; > + > + /* set_pte* for PCI devices to map iomem. */ > + if (xen_initial_domain()) { > + pv_mmu_ops.set_pte = native_set_pte; > + pv_mmu_ops.set_pte_at = xen_dom0pvh_set_pte_at; > + } > + return; > + } > + > x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve; > x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start; > - x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; > pv_mmu_ops = xen_mmu_ops; > > memset(dummy_mapping, 0xff, PAGE_SIZE);I wonder whether it would make sense to have a xen_pvh_init_mmu_ops, given that they end up being so different...> @@ -2305,6 +2358,92 @@ void __init xen_hvm_init_mmu_ops(void) > } > #endif > > +/* Map foreign gmfn, fgmfn, to local pfn, lpfn. This for the user space > + * creating new guest on PVH dom0 and needs to map domU pages. > + */ > +static int pvh_add_to_xen_p2m(unsigned long lpfn, unsigned long fgmfn, > + unsigned int domid) > +{ > + int rc; > + struct xen_add_to_physmap xatp = { .u.foreign_domid = domid }; > + > + xatp.gpfn = lpfn; > + xatp.idx = fgmfn; > + xatp.domid = DOMID_SELF; > + xatp.space = XENMAPSPACE_gmfn_foreign; > + rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp); > + if (rc) > + pr_warn("d0: Failed to map pfn to mfn rc:%d pfn:%lx mfn:%lx\n", > + rc, lpfn, fgmfn); > + return rc; > +} > + > +int pvh_rem_xen_p2m(unsigned long spfn, int count) > +{ > + struct xen_remove_from_physmap xrp; > + int i, rc; > + > + for (i=0; i < count; i++) { > + xrp.domid = DOMID_SELF; > + xrp.gpfn = spfn+i; > + rc = HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp); > + if (rc) { > + pr_warn("Failed to unmap pfn:%lx rc:%d done:%d\n", > + spfn+i, rc, i); > + return 1; > + } > + } > + return 0; > +} > +EXPORT_SYMBOL_GPL(pvh_rem_xen_p2m); > + > +struct pvh_remap_data { > + unsigned long fgmfn; /* foreign domain''s gmfn */ > + pgprot_t prot; > + domid_t domid; > + struct xen_pvh_pfn_info *pvhinfop; > +}; > + > +static int pvh_map_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr, > + void *data) > +{ > + int rc; > + struct pvh_remap_data *remapp = data; > + struct xen_pvh_pfn_info *pvhp = remapp->pvhinfop; > + unsigned long pfn = page_to_pfn(pvhp->pi_paga[pvhp->pi_next_todo++]); > + pte_t pteval = pte_mkspecial(pfn_pte(pfn, remapp->prot)); > + > + if ((rc=pvh_add_to_xen_p2m(pfn, remapp->fgmfn, remapp->domid))) > + return rc; > + native_set_pte(ptep, pteval);Do we actually need the pte to be "special"? I would think that being in the guest p2m, it is actually quite a normal page.> + return 0; > +} > + > +/* The only caller at moment passes one gmfn at a time. 
> + * PVH TBD/FIXME: expand this in future to honor batch requests. > + */ > +static int pvh_remap_gmfn_range(struct vm_area_struct *vma, > + unsigned long addr, unsigned long mfn, int nr, > + pgprot_t prot, unsigned domid, > + struct xen_pvh_pfn_info *pvhp) > +{ > + int err; > + struct pvh_remap_data pvhdata; > + > + if (nr > 1) > + return -EINVAL; > + > + pvhdata.fgmfn = mfn; > + pvhdata.prot = prot; > + pvhdata.domid = domid; > + pvhdata.pvhinfop = pvhp; > + err = apply_to_page_range(vma->vm_mm, addr, nr << PAGE_SHIFT, > + pvh_map_pte_fn, &pvhdata); > + flush_tlb_all();Maybe we can use one of the flush_tlb_range varieties instead of a flush_tlb_all?> + return err; > +} > + > #define REMAP_BATCH_SIZE 16 > > struct remap_data { > @@ -2329,7 +2468,9 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, > int xen_remap_domain_mfn_range(struct vm_area_struct *vma, > unsigned long addr, > unsigned long mfn, int nr, > - pgprot_t prot, unsigned domid) > + pgprot_t prot, unsigned domid, > + struct xen_pvh_pfn_info *pvhp) > + > { > struct remap_data rmd; > struct mmu_update mmu_update[REMAP_BATCH_SIZE]; > @@ -2342,6 +2483,12 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma, > BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_RESERVED | VM_IO)) => (VM_PFNMAP | VM_RESERVED | VM_IO))); > > + if (xen_feature(XENFEAT_auto_translated_physmap)) { > + /* We need to update the local page tables and the xen HAP */ > + return pvh_remap_gmfn_range(vma, addr, mfn, nr, prot, domid, > + pvhp); > + } > + > rmd.mfn = mfn; > rmd.prot = prot; > > @@ -2371,3 +2518,26 @@ out: > return err; > } > EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); > + > +/* Returns: Number of pages unmapped */ > +int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, > + struct xen_pvh_pfn_info *pvhp) > +{ > + int count = 0; > + > + if (!pvhp || !xen_feature(XENFEAT_auto_translated_physmap)) > + return 0; > + > + while (pvhp->pi_next_todo--) { > + unsigned long pfn; > + > + /* the mmu has already cleaned up the process mmu resources at > + * this point (lookup_address will return NULL). */ > + pfn = page_to_pfn(pvhp->pi_paga[pvhp->pi_next_todo]); > + pvh_rem_xen_p2m(pfn, 1); > + count++; > + } > + flush_tlb_all(); > + return count;Who is going to remove the corresponding mapping from the vma? 
Also we might be able to replace the flush_tlb_all with a flush_tlb_range.> +EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range); > diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h > index 73809bb..6d0bb56 100644 > --- a/arch/x86/xen/mmu.h > +++ b/arch/x86/xen/mmu.h > @@ -23,4 +23,6 @@ unsigned long xen_read_cr2_direct(void); > > extern void xen_init_mmu_ops(void); > extern void xen_hvm_init_mmu_ops(void); > +extern void xen_set_clr_mmio_pvh_pte(unsigned long pfn, unsigned long mfn, > + int nr_mfns, int add_mapping); > #endif /* _XEN_MMU_H */ > diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h > index 6a198e4..6c5ad83 100644 > --- a/include/xen/xen-ops.h > +++ b/include/xen/xen-ops.h > @@ -24,9 +24,19 @@ int xen_create_contiguous_region(unsigned long vstart, unsigned int order, > void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order); > > struct vm_area_struct; > +struct xen_pvh_pfn_info; > int xen_remap_domain_mfn_range(struct vm_area_struct *vma, > unsigned long addr, > unsigned long mfn, int nr, > - pgprot_t prot, unsigned domid); > + pgprot_t prot, unsigned domid, > + struct xen_pvh_pfn_info *pvhp); > +int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, > + struct xen_pvh_pfn_info *pvhp); > + > +struct xen_pvh_pfn_info { > + struct page **pi_paga; /* pfn info page array */ > + int pi_num_pgs; > + int pi_next_todo; > +}; > > #endif /* INCLUDE_XEN_OPS_H */ > -- > 1.7.2.3 >
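On the set_pte_at point above: the wrapper looks redundant, since x86 already provides native_set_pte_at(), which simply calls native_set_pte(). A minimal sketch of the simplified PVH branch (this is the suggestion spelled out, not the posted code):

	if (xen_feature(XENFEAT_auto_translated_physmap)) {
		pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others;

		/* PVH page tables are native; dom0 only needs set_pte*
		 * so that PCI iomem can be mapped. */
		if (xen_initial_domain()) {
			pv_mmu_ops.set_pte = native_set_pte;
			pv_mmu_ops.set_pte_at = native_set_pte_at;
		}
		return;
	}

With that, xen_dom0pvh_set_pte_at() can be dropped entirely.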
On Mon, 2012-09-24 at 12:55 +0100, Stefano Stabellini wrote:> There are few code style issues on this patch, I suggest you run it > through scripts/checkpatch.pl, it should be able to catch all these > errors.It would also be nice to starting having some changelogs entries for these patches for the next posting. There''s a lot of complex stuff going on here.> > @@ -2371,3 +2518,26 @@ out: > > return err; > > } > > EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); > > + > > +/* Returns: Number of pages unmapped */ > > +int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, > > + struct xen_pvh_pfn_info *pvhp) > > +{ > > + int count = 0; > > + > > + if (!pvhp || !xen_feature(XENFEAT_auto_translated_physmap)) > > + return 0; > > + > > + while (pvhp->pi_next_todo--) { > > + unsigned long pfn; > > + > > + /* the mmu has already cleaned up the process mmu resources at > > + * this point (lookup_address will return NULL). */ > > + pfn = page_to_pfn(pvhp->pi_paga[pvhp->pi_next_todo]); > > + pvh_rem_xen_p2m(pfn, 1); > > + count++; > > + } > > + flush_tlb_all(); > > + return count; > > Who is going to remove the corresponding mapping from the vma? > Also we might be able to replace the flush_tlb_all with a > flush_tlb_range.I''m not convinced that a guest level TLB flush is either necessary or sufficient here. What we are doing is removing entries from the P2M which means that we need to do the appropriate HAP flush in the hypervisor, which must necessarily invalidate any stage 1 mappings which this flush might also touch (i.e. the HAP flush must be a super set of this flush). Without the HAP flush in the hypervisor you risk guests being able to see old p2m mappings via the TLB entries which is a security issue AFAICT. Ian.
On Mon, 24 Sep 2012, Ian Campbell wrote:> On Mon, 2012-09-24 at 12:55 +0100, Stefano Stabellini wrote: > > There are few code style issues on this patch, I suggest you run it > > through scripts/checkpatch.pl, it should be able to catch all these > > errors. > > It would also be nice to starting having some changelogs entries for > these patches for the next posting. There''s a lot of complex stuff going > on here. > > > > @@ -2371,3 +2518,26 @@ out: > > > return err; > > > } > > > EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); > > > + > > > +/* Returns: Number of pages unmapped */ > > > +int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, > > > + struct xen_pvh_pfn_info *pvhp) > > > +{ > > > + int count = 0; > > > + > > > + if (!pvhp || !xen_feature(XENFEAT_auto_translated_physmap)) > > > + return 0; > > > + > > > + while (pvhp->pi_next_todo--) { > > > + unsigned long pfn; > > > + > > > + /* the mmu has already cleaned up the process mmu resources at > > > + * this point (lookup_address will return NULL). */ > > > + pfn = page_to_pfn(pvhp->pi_paga[pvhp->pi_next_todo]); > > > + pvh_rem_xen_p2m(pfn, 1); > > > + count++; > > > + } > > > + flush_tlb_all(); > > > + return count; > > > > Who is going to remove the corresponding mapping from the vma? > > Also we might be able to replace the flush_tlb_all with a > > flush_tlb_range. > > I''m not convinced that a guest level TLB flush is either necessary or > sufficient here. What we are doing is removing entries from the P2M > which means that we need to do the appropriate HAP flush in the > hypervisor, which must necessarily invalidate any stage 1 mappings which > this flush might also touch (i.e. the HAP flush must be a super set of > this flush). > > Without the HAP flush in the hypervisor you risk guests being able to > see old p2m mappings via the TLB entries which is a security issue > AFAICT.Yes, you are right, we need a flush in the hypervisor to flush the EPT. It could probably live in the implementation of XENMEM_add_to_physmap. This one should be just for the vma mappings, so in the case of xen_unmap_domain_mfn_range is unnecessary (given that it is not removing the vma mappings).
On Mon, Sep 24, 2012 at 12:55:31PM +0100, Stefano Stabellini wrote:> There are few code style issues on this patch, I suggest you run it > through scripts/checkpatch.pl, it should be able to catch all these > errors. > > On Fri, 21 Sep 2012, Mukesh Rathor wrote: > > --- > > arch/x86/xen/mmu.c | 180 +++++++++++++++++++++++++++++++++++++++++++++++-- > > arch/x86/xen/mmu.h | 2 + > > include/xen/xen-ops.h | 12 +++- > > 3 files changed, 188 insertions(+), 6 deletions(-) > > > > diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c > > index b65a761..9b5a5ef 100644 > > --- a/arch/x86/xen/mmu.c > > +++ b/arch/x86/xen/mmu.c > > @@ -73,6 +73,7 @@ > > #include <xen/interface/version.h> > > #include <xen/interface/memory.h> > > #include <xen/hvc-console.h> > > +#include <xen/balloon.h> > > > > #include "multicalls.h" > > #include "mmu.h" > > @@ -330,6 +331,26 @@ static void xen_set_pte(pte_t *ptep, pte_t pteval) > > __xen_set_pte(ptep, pteval); > > } > > > > +void xen_set_clr_mmio_pvh_pte(unsigned long pfn, unsigned long mfn, > > + int nr_mfns, int add_mapping) > > +{ > > + struct physdev_map_iomem iomem; > > + > > + iomem.first_gfn = pfn; > > + iomem.first_mfn = mfn; > > + iomem.nr_mfns = nr_mfns; > > + iomem.add_mapping = add_mapping; > > + > > + if (HYPERVISOR_physdev_op(PHYSDEVOP_pvh_map_iomem, &iomem)) > > + BUG(); > > +} > > + > > +static void xen_dom0pvh_set_pte_at(struct mm_struct *mm, unsigned long addr, > > + pte_t *ptep, pte_t pteval) > > +{ > > + native_set_pte(ptep, pteval); > > +} > > can''t you just set set_pte_at to native_set_pte? > > > > static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, > > pte_t *ptep, pte_t pteval) > > { > > @@ -1197,6 +1218,10 @@ static void xen_post_allocator_init(void); > > static void __init xen_pagetable_setup_done(pgd_t *base) > > { > > xen_setup_shared_info(); > > + > > + if (xen_feature(XENFEAT_auto_translated_physmap)) > > + return; > > + > > xen_post_allocator_init(); > > } > > Can we move the if..return at the beginning of xen_post_allocator_init? > It would make it easier to understand what it is for. > Or maybe we could have xen_setup_shared_info take a pgd_t *base > parameter so that you can set pagetable_setup_done to > xen_setup_shared_info directly on pvh.It actually ends up nicely aligning with my patches since I end up using ''if (xen_feature(..)''. But you can''t see that since these patches are not rebased on that - which is OK.> > > > @@ -1455,6 +1480,10 @@ static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) > > static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) > > { > > struct mmuext_op op; > > + > > + if (xen_feature(XENFEAT_writable_page_tables)) > > + return; > > + > > op.cmd = cmd; > > op.arg1.mfn = pfn_to_mfn(pfn); > > if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) > > given the very limited set of mmu pvops that we initialize on pvh, I > cannot find who would actually call pin_pagetable_pfn on pvh. > > > > @@ -1652,6 +1681,10 @@ static void set_page_prot(void *addr, pgprot_t prot) > > unsigned long pfn = __pa(addr) >> PAGE_SHIFT; > > pte_t pte = pfn_pte(pfn, prot); > > > > + /* recall for PVH, page tables are native. 
*/ > > + if (xen_feature(XENFEAT_auto_translated_physmap)) > > + return; > > + > > if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0)) > > BUG(); > > } > > @@ -1729,6 +1762,9 @@ static void convert_pfn_mfn(void *v) > > pte_t *pte = v; > > int i; > > > > + if (xen_feature(XENFEAT_auto_translated_physmap)) > > + return; > > + > > /* All levels are converted the same way, so just treat them > > as ptes. */ > > for (i = 0; i < PTRS_PER_PTE; i++) > > @@ -1745,6 +1781,7 @@ static void convert_pfn_mfn(void *v) > > * but that''s enough to get __va working. We need to fill in the rest > > * of the physical mapping once some sort of allocator has been set > > * up. > > + * NOTE: for PVH, the page tables are native. > > */ > > pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, > > unsigned long max_pfn) > > @@ -1802,9 +1839,13 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, > > * structure to attach it to, so make sure we just set kernel > > * pgd. > > */ > > - xen_mc_batch(); > > - __xen_write_cr3(true, __pa(pgd)); > > - xen_mc_issue(PARAVIRT_LAZY_CPU); > > + if (xen_feature(XENFEAT_writable_page_tables)) { > > + native_write_cr3(__pa(pgd)); > > + } else { > > + xen_mc_batch(); > > + __xen_write_cr3(true, __pa(pgd)); > > + xen_mc_issue(PARAVIRT_LAZY_CPU); > > + } > > > > memblock_reserve(__pa(xen_start_info->pt_base), > > xen_start_info->nr_pt_frames * PAGE_SIZE); > > @@ -2067,9 +2108,21 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { > > > > void __init xen_init_mmu_ops(void) > > { > > + x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; > > + > > + if (xen_feature(XENFEAT_auto_translated_physmap)) { > > + pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others; > > + > > + /* set_pte* for PCI devices to map iomem. */ > > + if (xen_initial_domain()) { > > + pv_mmu_ops.set_pte = native_set_pte; > > + pv_mmu_ops.set_pte_at = xen_dom0pvh_set_pte_at; > > + } > > + return; > > + } > > + > > x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve; > > x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start; > > - x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; > > pv_mmu_ops = xen_mmu_ops; > > > > memset(dummy_mapping, 0xff, PAGE_SIZE); > > I wonder whether it would make sense to have a xen_pvh_init_mmu_ops, > given that they end up being so different... > > > > @@ -2305,6 +2358,92 @@ void __init xen_hvm_init_mmu_ops(void) > > } > > #endif > > > > +/* Map foreign gmfn, fgmfn, to local pfn, lpfn. This for the user space > > + * creating new guest on PVH dom0 and needs to map domU pages. 
> > + */ > > +static int pvh_add_to_xen_p2m(unsigned long lpfn, unsigned long fgmfn, > > + unsigned int domid) > > +{ > > + int rc; > > + struct xen_add_to_physmap xatp = { .u.foreign_domid = domid }; > > + > > + xatp.gpfn = lpfn; > > + xatp.idx = fgmfn; > > + xatp.domid = DOMID_SELF; > > + xatp.space = XENMAPSPACE_gmfn_foreign; > > + rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp); > > + if (rc) > > + pr_warn("d0: Failed to map pfn to mfn rc:%d pfn:%lx mfn:%lx\n", > > + rc, lpfn, fgmfn); > > + return rc; > > +} > > + > > +int pvh_rem_xen_p2m(unsigned long spfn, int count) > > +{ > > + struct xen_remove_from_physmap xrp; > > + int i, rc; > > + > > + for (i=0; i < count; i++) { > > + xrp.domid = DOMID_SELF; > > + xrp.gpfn = spfn+i; > > + rc = HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp); > > + if (rc) { > > + pr_warn("Failed to unmap pfn:%lx rc:%d done:%d\n", > > + spfn+i, rc, i); > > + return 1; > > + } > > + } > > + return 0; > > +} > > +EXPORT_SYMBOL_GPL(pvh_rem_xen_p2m); > > + > > +struct pvh_remap_data { > > + unsigned long fgmfn; /* foreign domain''s gmfn */ > > + pgprot_t prot; > > + domid_t domid; > > + struct xen_pvh_pfn_info *pvhinfop; > > +}; > > + > > +static int pvh_map_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr, > > + void *data) > > +{ > > + int rc; > > + struct pvh_remap_data *remapp = data; > > + struct xen_pvh_pfn_info *pvhp = remapp->pvhinfop; > > + unsigned long pfn = page_to_pfn(pvhp->pi_paga[pvhp->pi_next_todo++]); > > + pte_t pteval = pte_mkspecial(pfn_pte(pfn, remapp->prot)); > > + > > + if ((rc=pvh_add_to_xen_p2m(pfn, remapp->fgmfn, remapp->domid))) > > + return rc; > > + native_set_pte(ptep, pteval); > > Do we actually need the pte to be "special"? > I would think that being in the guest p2m, it is actually quite a normal > page. > > > > + return 0; > > +} > > + > > +/* The only caller at moment passes one gmfn at a time. > > + * PVH TBD/FIXME: expand this in future to honor batch requests. > > + */ > > +static int pvh_remap_gmfn_range(struct vm_area_struct *vma, > > + unsigned long addr, unsigned long mfn, int nr, > > + pgprot_t prot, unsigned domid, > > + struct xen_pvh_pfn_info *pvhp) > > +{ > > + int err; > > + struct pvh_remap_data pvhdata; > > + > > + if (nr > 1) > > + return -EINVAL; > > + > > + pvhdata.fgmfn = mfn; > > + pvhdata.prot = prot; > > + pvhdata.domid = domid; > > + pvhdata.pvhinfop = pvhp; > > + err = apply_to_page_range(vma->vm_mm, addr, nr << PAGE_SHIFT, > > + pvh_map_pte_fn, &pvhdata); > > + flush_tlb_all(); > > Maybe we can use one of the flush_tlb_range varieties instead of a > flush_tlb_all? 
> > > > + return err; > > +} > > + > > #define REMAP_BATCH_SIZE 16 > > > > struct remap_data { > > @@ -2329,7 +2468,9 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, > > int xen_remap_domain_mfn_range(struct vm_area_struct *vma, > > unsigned long addr, > > unsigned long mfn, int nr, > > - pgprot_t prot, unsigned domid) > > + pgprot_t prot, unsigned domid, > > + struct xen_pvh_pfn_info *pvhp) > > + > > { > > struct remap_data rmd; > > struct mmu_update mmu_update[REMAP_BATCH_SIZE]; > > @@ -2342,6 +2483,12 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma, > > BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_RESERVED | VM_IO)) => > (VM_PFNMAP | VM_RESERVED | VM_IO))); > > > > + if (xen_feature(XENFEAT_auto_translated_physmap)) { > > + /* We need to update the local page tables and the xen HAP */ > > + return pvh_remap_gmfn_range(vma, addr, mfn, nr, prot, domid, > > + pvhp); > > + } > > + > > rmd.mfn = mfn; > > rmd.prot = prot; > > > > @@ -2371,3 +2518,26 @@ out: > > return err; > > } > > EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); > > + > > +/* Returns: Number of pages unmapped */ > > +int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, > > + struct xen_pvh_pfn_info *pvhp) > > +{ > > + int count = 0; > > + > > + if (!pvhp || !xen_feature(XENFEAT_auto_translated_physmap)) > > + return 0; > > + > > + while (pvhp->pi_next_todo--) { > > + unsigned long pfn; > > + > > + /* the mmu has already cleaned up the process mmu resources at > > + * this point (lookup_address will return NULL). */ > > + pfn = page_to_pfn(pvhp->pi_paga[pvhp->pi_next_todo]); > > + pvh_rem_xen_p2m(pfn, 1); > > + count++; > > + } > > + flush_tlb_all(); > > + return count; > > Who is going to remove the corresponding mapping from the vma? > Also we might be able to replace the flush_tlb_all with a > flush_tlb_range. > > > > +EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range); > > diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h > > index 73809bb..6d0bb56 100644 > > --- a/arch/x86/xen/mmu.h > > +++ b/arch/x86/xen/mmu.h > > @@ -23,4 +23,6 @@ unsigned long xen_read_cr2_direct(void); > > > > extern void xen_init_mmu_ops(void); > > extern void xen_hvm_init_mmu_ops(void); > > +extern void xen_set_clr_mmio_pvh_pte(unsigned long pfn, unsigned long mfn, > > + int nr_mfns, int add_mapping); > > #endif /* _XEN_MMU_H */ > > diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h > > index 6a198e4..6c5ad83 100644 > > --- a/include/xen/xen-ops.h > > +++ b/include/xen/xen-ops.h > > @@ -24,9 +24,19 @@ int xen_create_contiguous_region(unsigned long vstart, unsigned int order, > > void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order); > > > > struct vm_area_struct; > > +struct xen_pvh_pfn_info; > > int xen_remap_domain_mfn_range(struct vm_area_struct *vma, > > unsigned long addr, > > unsigned long mfn, int nr, > > - pgprot_t prot, unsigned domid); > > + pgprot_t prot, unsigned domid, > > + struct xen_pvh_pfn_info *pvhp); > > +int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, > > + struct xen_pvh_pfn_info *pvhp); > > + > > +struct xen_pvh_pfn_info { > > + struct page **pi_paga; /* pfn info page array */ > > + int pi_num_pgs; > > + int pi_next_todo; > > +}; > > > > #endif /* INCLUDE_XEN_OPS_H */ > > -- > > 1.7.2.3 > >
On Fri, 21 Sep 2012, Mukesh Rathor wrote:> +struct pvh_remap_data { > + unsigned long fgmfn; /* foreign domain''s gmfn */ > + pgprot_t prot; > + domid_t domid; > + struct xen_pvh_pfn_info *pvhinfop; > +}; > + > +static int pvh_map_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr, > + void *data) > +{ > + int rc; > + struct pvh_remap_data *remapp = data; > + struct xen_pvh_pfn_info *pvhp = remapp->pvhinfop; > + unsigned long pfn = page_to_pfn(pvhp->pi_paga[pvhp->pi_next_todo++]); > + pte_t pteval = pte_mkspecial(pfn_pte(pfn, remapp->prot)); > + > + if ((rc=pvh_add_to_xen_p2m(pfn, remapp->fgmfn, remapp->domid))) > + return rc; > + native_set_pte(ptep, pteval); > + > + return 0; > +} > + > +/* The only caller at moment passes one gmfn at a time. > + * PVH TBD/FIXME: expand this in future to honor batch requests. > + */ > +static int pvh_remap_gmfn_range(struct vm_area_struct *vma, > + unsigned long addr, unsigned long mfn, int nr, > + pgprot_t prot, unsigned domid, > + struct xen_pvh_pfn_info *pvhp) > +{ > + int err; > + struct pvh_remap_data pvhdata; > + > + if (nr > 1) > + return -EINVAL; > + > + pvhdata.fgmfn = mfn; > + pvhdata.prot = prot; > + pvhdata.domid = domid; > + pvhdata.pvhinfop = pvhp; > + err = apply_to_page_range(vma->vm_mm, addr, nr << PAGE_SHIFT, > + pvh_map_pte_fn, &pvhdata); > + flush_tlb_all(); > + return err; > +} > + > #define REMAP_BATCH_SIZE 16 > > struct remap_data { > @@ -2329,7 +2468,9 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, > int xen_remap_domain_mfn_range(struct vm_area_struct *vma, > unsigned long addr, > unsigned long mfn, int nr, > - pgprot_t prot, unsigned domid) > + pgprot_t prot, unsigned domid, > + struct xen_pvh_pfn_info *pvhp) > +xen_remap_domain_mfn_range is a cross-architecture call (it is available on ARM as well). We cannot leak architecture specific informations like xen_pvh_pfn_info in the parameter list. It seems to be that xen_pvh_pfn_info contains arch-agnostic information: in that case you just need to rename the struct to something more generic. Otherwise if it really contains x86 specific info, you can change it into an opaque pointer.
On Mon, 2012-09-24 at 15:04 +0100, Stefano Stabellini wrote:> On Fri, 21 Sep 2012, Mukesh Rathor wrote: > > +struct pvh_remap_data { > > + unsigned long fgmfn; /* foreign domain''s gmfn */ > > + pgprot_t prot; > > + domid_t domid; > > + struct xen_pvh_pfn_info *pvhinfop; > > +}; > > + > > +static int pvh_map_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr, > > + void *data) > > +{ > > + int rc; > > + struct pvh_remap_data *remapp = data; > > + struct xen_pvh_pfn_info *pvhp = remapp->pvhinfop; > > + unsigned long pfn = page_to_pfn(pvhp->pi_paga[pvhp->pi_next_todo++]); > > + pte_t pteval = pte_mkspecial(pfn_pte(pfn, remapp->prot)); > > + > > + if ((rc=pvh_add_to_xen_p2m(pfn, remapp->fgmfn, remapp->domid))) > > + return rc; > > + native_set_pte(ptep, pteval); > > + > > + return 0; > > +} > > + > > +/* The only caller at moment passes one gmfn at a time. > > + * PVH TBD/FIXME: expand this in future to honor batch requests. > > + */ > > +static int pvh_remap_gmfn_range(struct vm_area_struct *vma, > > + unsigned long addr, unsigned long mfn, int nr, > > + pgprot_t prot, unsigned domid, > > + struct xen_pvh_pfn_info *pvhp) > > +{ > > + int err; > > + struct pvh_remap_data pvhdata; > > + > > + if (nr > 1) > > + return -EINVAL; > > + > > + pvhdata.fgmfn = mfn; > > + pvhdata.prot = prot; > > + pvhdata.domid = domid; > > + pvhdata.pvhinfop = pvhp; > > + err = apply_to_page_range(vma->vm_mm, addr, nr << PAGE_SHIFT, > > + pvh_map_pte_fn, &pvhdata); > > + flush_tlb_all(); > > + return err; > > +} > > + > > #define REMAP_BATCH_SIZE 16 > > > > struct remap_data { > > @@ -2329,7 +2468,9 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, > > int xen_remap_domain_mfn_range(struct vm_area_struct *vma, > > unsigned long addr, > > unsigned long mfn, int nr, > > - pgprot_t prot, unsigned domid) > > + pgprot_t prot, unsigned domid, > > + struct xen_pvh_pfn_info *pvhp) > > + > > xen_remap_domain_mfn_range is a cross-architecture call (it is available > on ARM as well). We cannot leak architecture specific informations like > xen_pvh_pfn_info in the parameter list. > It seems to be that xen_pvh_pfn_info contains arch-agnostic information: > in that case you just need to rename the struct to something more > generic. Otherwise if it really contains x86 specific info, you can > change it into an opaque pointer.... or explode it into the relevant constituents struct members which need to be passed over the interface when calling these functions. Ian.
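A rough sketch of that alternative (parameter names are illustrative, not an agreed interface): the fields of xen_pvh_pfn_info that the remap/unmap paths actually consume could be passed as plain arguments, with PV callers simply passing NULL/0:

int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
			       unsigned long addr,
			       unsigned long mfn, int nr,
			       pgprot_t prot, unsigned domid,
			       struct page **pages); /* local backing pages, auto-translated only */

int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
			       int nr, struct page **pages);

That keeps the cross-architecture prototype free of any PVH-specific book-keeping type.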
On Mon, 24 Sep 2012 12:55:31 +0100 Stefano Stabellini <stefano.stabellini@eu.citrix.com> wrote:> There are few code style issues on this patch, I suggest you run it > through scripts/checkpatch.pl, it should be able to catch all these > errors. > > On Fri, 21 Sep 2012, Mukesh Rathor wrote: > > --- > > arch/x86/xen/mmu.c | 180 > > +++++++++++++++++++++++++++++++++++++++++++++++-- > > arch/x86/xen/mmu.h | 2 + include/xen/xen-ops.h | 12 +++- > > 3 files changed, 188 insertions(+), 6 deletions(-) > > > > diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c > > long addr, > > + pte_t *ptep, pte_t pteval) > > +{ > > + native_set_pte(ptep, pteval); > > +} > > can''t you just set set_pte_at to native_set_pte?yup. oh, i had other code there, which is why it''s not already native_set_pte_at.> > > static void xen_set_pte_at(struct mm_struct *mm, unsigned long > > addr, pte_t *ptep, pte_t pteval) > > { > > @@ -1197,6 +1218,10 @@ static void xen_post_allocator_init(void); > > static void __init xen_pagetable_setup_done(pgd_t *base) > > { > > xen_setup_shared_info(); > > + > > + if (xen_feature(XENFEAT_auto_translated_physmap)) > > + return; > > + > > xen_post_allocator_init(); > > } > > Can we move the if..return at the beginning of > xen_post_allocator_init? It would make it easier to understand what > it is for. Or maybe we could have xen_setup_shared_info take a pgd_t > *base parameter so that you can set pagetable_setup_done to > xen_setup_shared_info directly on pvh.done.> > > @@ -1455,6 +1480,10 @@ static void __init xen_set_pte_init(pte_t > > *ptep, pte_t pte) static void pin_pagetable_pfn(unsigned cmd, > > unsigned long pfn) { > > struct mmuext_op op; > > + > > + if (xen_feature(XENFEAT_writable_page_tables)) > > + return; > > + > > op.cmd = cmd; > > op.arg1.mfn = pfn_to_mfn(pfn); > > if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) > > given the very limited set of mmu pvops that we initialize on pvh, I > cannot find who would actually call pin_pagetable_pfn on pvh.xen_setup_kernel_pagetable().> > > @@ -1652,6 +1681,10 @@ static void set_page_prot(void *addr, > > pgprot_t prot) unsigned long pfn = __pa(addr) >> PAGE_SHIFT; > > + /* set_pte* for PCI devices to map iomem. */ > > + if (xen_initial_domain()) { > > + pv_mmu_ops.set_pte = native_set_pte; > > + pv_mmu_ops.set_pte_at > > xen_dom0pvh_set_pte_at; > > + } > > + return; > > + } > > + > > x86_init.mapping.pagetable_reserve > > xen_mapping_pagetable_reserve; > > x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start; > > - x86_init.paging.pagetable_setup_done > > xen_pagetable_setup_done; pv_mmu_ops = xen_mmu_ops; > > > > memset(dummy_mapping, 0xff, PAGE_SIZE); > > I wonder whether it would make sense to have a xen_pvh_init_mmu_ops, > given that they end up being so different...It''s not a pv ops call. It''s called from xen_start_kernel in enlighten.c. So lets just leave it like that.> > + void *data) > > +{ > > + int rc; > > + struct pvh_remap_data *remapp = data; > > + struct xen_pvh_pfn_info *pvhp = remapp->pvhinfop; > > + unsigned long pfn > > page_to_pfn(pvhp->pi_paga[pvhp->pi_next_todo++]); > > + pte_t pteval = pte_mkspecial(pfn_pte(pfn, remapp->prot)); > > + > > + if ((rc=pvh_add_to_xen_p2m(pfn, remapp->fgmfn, > > remapp->domid))) > > + return rc; > > + native_set_pte(ptep, pteval); > > Do we actually need the pte to be "special"? > I would think that being in the guest p2m, it is actually quite a > normal page.Ah, I remember. 
The reason I had it special is because earlier I was allocating pfn''s from high up address space. These pfns'' were greater than highest_memmap_pfn. But, now I''m allocating them via ballooing. So I can change it to normal. If we go back to allocating pfn''s space from high up, then we''d need this back for this check: vm_normal_page(): if (unlikely(pfn > highest_memmap_pfn)) { print_bad_pte(vma, addr, pte, NULL); return NULL;> > + return 0; > > +} > > + > > +/* The only caller at moment passes one gmfn at a time. > > + * PVH TBD/FIXME: expand this in future to honor batch requests. > > + */ > > +static int pvh_remap_gmfn_range(struct vm_area_struct *vma, > > + unsigned long addr, unsigned long > > mfn, int nr, > > + pgprot_t prot, unsigned domid, > > + struct xen_pvh_pfn_info *pvhp) > > +{ > > + int err; > > + struct pvh_remap_data pvhdata; > > + > > + if (nr > 1) > > + return -EINVAL; > > + > > + pvhdata.fgmfn = mfn; > > + pvhdata.prot = prot; > > + pvhdata.domid = domid; > > + pvhdata.pvhinfop = pvhp; > > + err = apply_to_page_range(vma->vm_mm, addr, nr << > > PAGE_SHIFT, > > + pvh_map_pte_fn, &pvhdata); > > + flush_tlb_all(); > > Maybe we can use one of the flush_tlb_range varieties instead of a > flush_tlb_all?changed it to : flush_tlb_page(vma, addr);> > + > > + if (!pvhp || !xen_feature(XENFEAT_auto_translated_physmap)) > > + return 0; > > + > > + while (pvhp->pi_next_todo--) { > > + unsigned long pfn; > > + > > + /* the mmu has already cleaned up the process mmu > > resources at > > + * this point (lookup_address will return NULL). */ > > + pfn > > page_to_pfn(pvhp->pi_paga[pvhp->pi_next_todo]); > > + pvh_rem_xen_p2m(pfn, 1); > > + count++; > > + } > > + flush_tlb_all(); > > + return count; > > Who is going to remove the corresponding mapping from the vma? > Also we might be able to replace the flush_tlb_all with a > flush_tlb_range.the kernel already does that before privcmd_close is called. don''t have the addrs for flush_tlb_range in privcmd_close(). thanks, Mukesh
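The flush_tlb_page() change mentioned above would make the tail of pvh_remap_gmfn_range() look roughly like the following sketch; it is only correct while nr stays capped at 1, as it is in the posted code:

	err = apply_to_page_range(vma->vm_mm, addr, nr << PAGE_SHIFT,
				  pvh_map_pte_fn, &pvhdata);
	/* only a single pte was written, so flush just that address */
	flush_tlb_page(vma, addr);
	return err;

Once batching is implemented, this would need to become a flush over the whole mapped range (e.g. flush_tlb_range()).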
On Mon, 24 Sep 2012 13:24:12 +0100 Stefano Stabellini <stefano.stabellini@eu.citrix.com> wrote:> On Mon, 24 Sep 2012, Ian Campbell wrote: > > On Mon, 2012-09-24 at 12:55 +0100, Stefano Stabellini wrote: > > > There are few code style issues on this patch, I suggest you run > > > it through scripts/checkpatch.pl, it should be able to catch all > > > these errors. > > > > It would also be nice to starting having some changelogs entries for > > these patches for the next posting. There''s a lot of complex stuff > > > > + return count; > > > > > > Who is going to remove the corresponding mapping from the vma? > > > Also we might be able to replace the flush_tlb_all with a > > > flush_tlb_range. > > > > I''m not convinced that a guest level TLB flush is either necessary > > or sufficient here. What we are doing is removing entries from the > > P2M which means that we need to do the appropriate HAP flush in the > > hypervisor, which must necessarily invalidate any stage 1 mappings > > which this flush might also touch (i.e. the HAP flush must be a > > super set of this flush). > > > > Without the HAP flush in the hypervisor you risk guests being able > > to see old p2m mappings via the TLB entries which is a security > > issue AFAICT. > > Yes, you are right, we need a flush in the hypervisor to flush the > EPT. It could probably live in the implementation of > XENMEM_add_to_physmap. > > This one should be just for the vma mappings, so in the case of > xen_unmap_domain_mfn_range is unnecessary (given that it is > not removing the vma mappings).My head spins looking at INVEPT and INVVPID docs, but doesn''t it already happen in ept_set_entry(): if ( needs_sync ) ept_sync_domain(p2m->domain);
On Mon, 24 Sep 2012 15:04:22 +0100 Stefano Stabellini <stefano.stabellini@eu.citrix.com> wrote:> On Fri, 21 Sep 2012, Mukesh Rathor wrote: > > +struct pvh_remap_data { > > struct remap_data { > > @@ -2329,7 +2468,9 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, > > pgtable_t token, int xen_remap_domain_mfn_range(struct > > vm_area_struct *vma, unsigned long addr, > > unsigned long mfn, int nr, > > - pgprot_t prot, unsigned domid) > > + pgprot_t prot, unsigned domid, > > + struct xen_pvh_pfn_info *pvhp) > > + > > xen_remap_domain_mfn_range is a cross-architecture call (it is > available on ARM as well). We cannot leak architecture specific > informations like xen_pvh_pfn_info in the parameter list. > It seems to be that xen_pvh_pfn_info contains arch-agnostic > information: in that case you just need to rename the struct to > something more generic. Otherwise if it really contains x86 specific > info, you can change it into an opaque pointer.Ok, how about I just change it to: - struct xen_pvh_pfn_info *pvhp) + void *arch_spec_info) thanks, Mukesh
On Wed, Sep 26, 2012 at 1:27 AM, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
>> > I'm not convinced that a guest level TLB flush is either necessary or
>> > sufficient here. What we are doing is removing entries from the
>> > P2M which means that we need to do the appropriate HAP flush in the
>> > hypervisor, which must necessarily invalidate any stage 1 mappings
>> > which this flush might also touch (i.e. the HAP flush must be a
>> > super set of this flush).
>> >
>> > Without the HAP flush in the hypervisor you risk guests being able
>> > to see old p2m mappings via the TLB entries which is a security
>> > issue AFAICT.
>>
>> Yes, you are right, we need a flush in the hypervisor to flush the
>> EPT. It could probably live in the implementation of
>> XENMEM_add_to_physmap.
>>
>> This one should be just for the vma mappings, so in the case of
>> xen_unmap_domain_mfn_range is unnecessary (given that it is
>> not removing the vma mappings).
>
>
> My head spins looking at INVEPT and INVVPID docs, but doesn't it already
> happen in ept_set_entry():
>
>     if ( needs_sync )
>         ept_sync_domain(p2m->domain);

Yes, the point of having a clean p2m interface is that you shouldn't need
to figure out when to do hap flushes.

 -George
On Wed, 26 Sep 2012, Mukesh Rathor wrote:
> > > @@ -1652,6 +1681,10 @@ static void set_page_prot(void *addr, pgprot_t prot)
> > >  	unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
> > > +		/* set_pte* for PCI devices to map iomem. */
> > > +		if (xen_initial_domain()) {
> > > +			pv_mmu_ops.set_pte = native_set_pte;
> > > +			pv_mmu_ops.set_pte_at = xen_dom0pvh_set_pte_at;
> > > +		}
> > > +		return;
> > > +	}
> > > +
> > >  	x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve;
> > >  	x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
> > > -	x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
> > >  	pv_mmu_ops = xen_mmu_ops;
> > >
> > >  	memset(dummy_mapping, 0xff, PAGE_SIZE);
> >
> > I wonder whether it would make sense to have a xen_pvh_init_mmu_ops,
> > given that they end up being so different...
>
> It's not a pv ops call. It's called from xen_start_kernel in
> enlighten.c. So lets just leave it like that.

I meant having a completely different initialization function in mmu.c,
instead of trying to reuse xen_init_mmu_ops, because at the end of the day
the two cases are very different.
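A sketch of what that split might look like; the function name and call site are illustrative, following the existing xen_hvm_init_mmu_ops() pattern, and nothing below is part of the posted series:

/* arch/x86/xen/mmu.c */
void __init xen_pvh_init_mmu_ops(void)
{
	x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
	pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others;

	/* dom0 needs set_pte* so that PCI iomem can be mapped */
	if (xen_initial_domain()) {
		pv_mmu_ops.set_pte = native_set_pte;
		pv_mmu_ops.set_pte_at = native_set_pte_at;
	}
}

/* arch/x86/xen/enlighten.c, in xen_start_kernel() */
	if (xen_feature(XENFEAT_auto_translated_physmap))
		xen_pvh_init_mmu_ops();
	else
		xen_init_mmu_ops();

This keeps the PV-only xen_mmu_ops assignment and the pagetable_setup_start hook out of the PVH path entirely.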
On Wed, 26 Sep 2012, Mukesh Rathor wrote:> On Mon, 24 Sep 2012 13:24:12 +0100 > Stefano Stabellini <stefano.stabellini@eu.citrix.com> wrote: > > > On Mon, 24 Sep 2012, Ian Campbell wrote: > > > On Mon, 2012-09-24 at 12:55 +0100, Stefano Stabellini wrote: > > > > There are few code style issues on this patch, I suggest you run > > > > it through scripts/checkpatch.pl, it should be able to catch all > > > > these errors. > > > > > > It would also be nice to starting having some changelogs entries for > > > these patches for the next posting. There''s a lot of complex stuff > > > > > + return count; > > > > > > > > Who is going to remove the corresponding mapping from the vma? > > > > Also we might be able to replace the flush_tlb_all with a > > > > flush_tlb_range. > > > > > > I''m not convinced that a guest level TLB flush is either necessary > > > or sufficient here. What we are doing is removing entries from the > > > P2M which means that we need to do the appropriate HAP flush in the > > > hypervisor, which must necessarily invalidate any stage 1 mappings > > > which this flush might also touch (i.e. the HAP flush must be a > > > super set of this flush). > > > > > > Without the HAP flush in the hypervisor you risk guests being able > > > to see old p2m mappings via the TLB entries which is a security > > > issue AFAICT. > > > > Yes, you are right, we need a flush in the hypervisor to flush the > > EPT. It could probably live in the implementation of > > XENMEM_add_to_physmap. > > > > This one should be just for the vma mappings, so in the case of > > xen_unmap_domain_mfn_range is unnecessary (given that it is > > not removing the vma mappings). > > > My head spins looking at INVEPT and INVVPID docs, but doesn''t it already > happen in ept_set_entry(): > > if ( needs_sync ) > ept_sync_domain(p2m->domain); >Right. So we should be able to get away without any tlb flushes in xen_unmap_domain_mfn_range.
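With the EPT sync done by the hypervisor when the p2m entry is removed (as discussed above), the unmap helper could plausibly drop the guest-side flush altogether. A sketch keeping the names of the posted series:

int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
			       struct xen_pvh_pfn_info *pvhp)
{
	int count = 0;

	if (!pvhp || !xen_feature(XENFEAT_auto_translated_physmap))
		return 0;

	/* The process mappings are already gone at this point; only the
	 * p2m entries remain, and the hypervisor flushes the EPT when
	 * they are removed. */
	while (pvhp->pi_next_todo--) {
		unsigned long pfn = page_to_pfn(pvhp->pi_paga[pvhp->pi_next_todo]);

		pvh_rem_xen_p2m(pfn, 1);
		count++;
	}
	return count;
}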
On Wed, 26 Sep 2012, Mukesh Rathor wrote:> On Mon, 24 Sep 2012 15:04:22 +0100 > Stefano Stabellini <stefano.stabellini@eu.citrix.com> wrote: > > > On Fri, 21 Sep 2012, Mukesh Rathor wrote: > > > +struct pvh_remap_data { > > > struct remap_data { > > > @@ -2329,7 +2468,9 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, > > > pgtable_t token, int xen_remap_domain_mfn_range(struct > > > vm_area_struct *vma, unsigned long addr, > > > unsigned long mfn, int nr, > > > - pgprot_t prot, unsigned domid) > > > + pgprot_t prot, unsigned domid, > > > + struct xen_pvh_pfn_info *pvhp) > > > + > > > > xen_remap_domain_mfn_range is a cross-architecture call (it is > > available on ARM as well). We cannot leak architecture specific > > informations like xen_pvh_pfn_info in the parameter list. > > It seems to be that xen_pvh_pfn_info contains arch-agnostic > > information: in that case you just need to rename the struct to > > something more generic. Otherwise if it really contains x86 specific > > info, you can change it into an opaque pointer. > > Ok, how about I just change it to: > > - struct xen_pvh_pfn_info *pvhp) > + void *arch_spec_info)Sorry for misleading you, but on second thought, an opaque pointer is not a good idea: xen_remap_domain_mfn_range is called from privcmd.c, that is arch-agnostic code, so I think that all the parameters should be well specified and arch-agnostic. I think that you should: - rename xen_pvh_pfn_info to something non-x86 specific, for example xen_remap_pfn_info; - move the definition of struct xen_remap_pfn_info to include/xen/xen-ops.h; - add a good comment on top of the struct to explain what each field represents, because other people (me and Ian) might have to reimplement xen_remap_domain_mfn_range in the near future for another popular architecture ;-)
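Putting those three points together, the xen-ops.h change could look something like the following; xen_remap_pfn_info is the name suggested above, and the comments are only a best guess at each field's semantics from how the series uses them:

/*
 * Book-keeping for a foreign mapping on an auto-translated guest
 * (PVH x86, ARM): xen_remap_domain_mfn_range() records here which local
 * pages were handed to XENMEM_add_to_physmap, so that
 * xen_unmap_domain_mfn_range() can tear the mapping down again later.
 */
struct xen_remap_pfn_info {
	struct page **pi_paga;	/* local (ballooned) pages backing the mapping */
	int pi_num_pgs;		/* number of entries in pi_paga[] */
	int pi_next_todo;	/* next index to map; once mapped, the count done so far */
};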
> @@ -24,9 +24,19 @@ int xen_create_contiguous_region(unsigned long vstart, unsigned int order, > void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order); > > struct vm_area_struct; > +struct xen_pvh_pfn_info; > int xen_remap_domain_mfn_range(struct vm_area_struct *vma, > unsigned long addr, > unsigned long mfn, int nr, > - pgprot_t prot, unsigned domid); > + pgprot_t prot, unsigned domid, > + struct xen_pvh_pfn_info *pvhp);This change during a git bisection (where we only applied patch #1 and patch #2) will cause a build breakage like this: /home/konrad/ssd/linux/drivers/xen/privcmd.c: In function ‘mmap_mfn_range’: /home/konrad/ssd/linux/drivers/xen/privcmd.c:181: error: too few arguments to function ‘xen_remap_domain_mfn_range’ /home/konrad/ssd/linux/drivers/xen/privcmd.c: In function ‘mmap_batch_fn’: /home/konrad/ssd/linux/drivers/xen/privcmd.c:270: error: too few arguments to function ‘xen_remap_domain_mfn_range’ make[4]: *** [drivers/xen/privcmd.o] Error 1 make[3]: *** [drivers/xen] Error 2 Please squash this patch: diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index ef63895..802720c 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -178,7 +178,7 @@ static int mmap_mfn_range(void *data, void *state) msg->va & PAGE_MASK, msg->mfn, msg->npages, vma->vm_page_prot, - st->domain); + st->domain, NULL); if (rc < 0) return rc; @@ -267,7 +267,7 @@ static int mmap_batch_fn(void *data, void *state) int ret; ret = xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, - st->vma->vm_page_prot, st->domain); + st->vma->vm_page_prot, st->domain, NULL); /* Store error code for second pass. */ *(st->err++) = ret; into this patch. _______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
On Mon, 24 Sep 2012 12:55:31 +0100
Stefano Stabellini <stefano.stabellini@eu.citrix.com> wrote:

> There are few code style issues on this patch, I suggest you run it
> through scripts/checkpatch.pl, it should be able to catch all these
> errors.

All patches were run thru checkpatch.pl already. So not sure what you
are referring to.

> > +	struct pvh_remap_data *remapp = data;
> > +	struct xen_pvh_pfn_info *pvhp = remapp->pvhinfop;
> > +	unsigned long pfn = page_to_pfn(pvhp->pi_paga[pvhp->pi_next_todo++]);
> > +	pte_t pteval = pte_mkspecial(pfn_pte(pfn, remapp->prot));
> > +
> > +	if ((rc=pvh_add_to_xen_p2m(pfn, remapp->fgmfn, remapp->domid)))
> > +		return rc;
> > +	native_set_pte(ptep, pteval);
>
> Do we actually need the pte to be "special"?
> I would think that being in the guest p2m, it is actually quite a
> normal page.

Hmm... well, it doesn't like removing "special":

BUG: Bad page map in process xl pte:800000027b57b467 pmd:2b408d067
page:ffffea0008afb2e8 count:1 mapcount:-1 mapping: (null) index:0x0
page flags: 0x40000000000414(referenced|dirty|reserved)
addr:00007fb4345b0000 vm_flags:000a44fb anon_vma: (null)
mapping:ffff88003911a3d0 index:4
vma->vm_ops->fault: privcmd_fault+0x0/0x40
vma->vm_file->f_op->mmap: privcmd_mmap+0x0/0x30
Pid: 2737, comm: xl Tainted: G B 3.6.0-rc6-merge+ #17
Call Trace:
 [<ffffffff8113dabc>] print_bad_pte+0x1dc/0x250
 [<ffffffff8113e57e>] zap_pte_range+0x45e/0x4c0
 [<ffffffff8113f30e>] unmap_page_range+0x1ae/0x310
 [<ffffffff8113f4d1>] unmap_single_vma+0x61/0xe0
 [<ffffffff8113f5a4>] unmap_vmas+0x54/0xa0
 [<ffffffff8114514b>] unmap_region+0xab/0x120
 [<ffffffff81313263>] ? privcmd_ioctl+0x93/0x100
 [<ffffffff8114733d>] do_munmap+0x25d/0x380

This is from:

	if (unlikely(page_mapcount(page) < 0))
		print_bad_pte(vma, addr, ptent, page);

So the mapcount must not be getting set properly in the "normal" case, it
appears. Marking it special causes it to skip a few things. Debugging...

thanks,
Mukesh
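For reference on why the "special" bit hides this warning: zap_pte_range() only does rmap/mapcount accounting on pages returned by vm_normal_page(), and on x86 (which has pte_special()) that helper bails out early for special ptes. Roughly, paraphrasing mm/memory.c of this era rather than quoting it:

	/* vm_normal_page(), simplified */
	if (pte_special(pte))
		return NULL;		/* no struct page handed back */
	if (unlikely(pfn > highest_memmap_pfn)) {
		print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}
	return pfn_to_page(pfn);

Since nothing on the mapping side (native_set_pte() via apply_to_page_range()) ever did the corresponding rmap accounting on the ballooned pages, a non-special pte means page_remove_rmap() underflows the mapcount at munmap time and print_bad_pte() fires, which matches the mapcount:-1 in the trace above.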
On Mon, 1 Oct 2012 14:32:44 -0700 Mukesh Rathor <mukesh.rathor@oracle.com> wrote:> > > > + struct pvh_remap_data *remapp = data; > > > + struct xen_pvh_pfn_info *pvhp = remapp->pvhinfop; > > > + unsigned long pfn > > > page_to_pfn(pvhp->pi_paga[pvhp->pi_next_todo++]); > > > + pte_t pteval = pte_mkspecial(pfn_pte(pfn, remapp->prot)); > > > + > > > + if ((rc=pvh_add_to_xen_p2m(pfn, remapp->fgmfn, > > > remapp->domid))) > > > + return rc; > > > + native_set_pte(ptep, pteval); > > > > Do we actually need the pte to be "special"? > > I would think that being in the guest p2m, it is actually quite a > > normal page. > > Hmm... well, doesn''t like removing "special": > > > BUG: Bad page map in process xl pte:800000027b57b467 pmd:2b408d067 > page:ffffea0008afb2e8 count:1 mapcount:-1 mapping: (null) > index:0x0 page flags: 0x40000000000414(referenced|dirty|reserved) > addr:00007fb4345b0000 vm_flags:000a44fb anon_vma: (null) > mapping:ffff88003911a3d0 index:4 vma->vm_ops->fault: > privcmd_fault+0x0/0x40 vma->vm_file->f_op->mmap: privcmd_mmap+0x0/0x30 > Pid: 2737, comm: xl Tainted: G B 3.6.0-rc6-merge+ #17 > Call Trace: > [<ffffffff8113dabc>] print_bad_pte+0x1dc/0x250 > [<ffffffff8113e57e>] zap_pte_range+0x45e/0x4c0 > [<ffffffff8113f30e>] unmap_page_range+0x1ae/0x310 > [<ffffffff8113f4d1>] unmap_single_vma+0x61/0xe0 > [<ffffffff8113f5a4>] unmap_vmas+0x54/0xa0 > [<ffffffff8114514b>] unmap_region+0xab/0x120 > [<ffffffff81313263>] ? privcmd_ioctl+0x93/0x100 > [<ffffffff8114733d>] do_munmap+0x25d/0x380 > > This from: > if (unlikely(page_mapcount(page) < 0)) > print_bad_pte(vma, addr, ptent, page); > > > So, the mapcount must be not be getting set in "normal" case properly > it appears. Marking it special causes it so skip few things. > Debugging...Shall I just leave it special for now, and come back and revisit this later? thanks Mukesh
On Mon, 1 Oct 2012, Mukesh Rathor wrote:
> On Mon, 24 Sep 2012 12:55:31 +0100
> Stefano Stabellini <stefano.stabellini@eu.citrix.com> wrote:
> > There are few code style issues on this patch, I suggest you run it
> > through scripts/checkpatch.pl, it should be able to catch all these
> > errors.
>
> All patches were run thru checkpatch.pl already. So not sure what you
> are referring to.

I'll go back to your series and tell you whenever I find a code style issue.
On Fri, 21 Sep 2012, Mukesh Rathor wrote:> --- > arch/x86/xen/mmu.c | 180 +++++++++++++++++++++++++++++++++++++++++++++++-- > arch/x86/xen/mmu.h | 2 + > include/xen/xen-ops.h | 12 +++- > 3 files changed, 188 insertions(+), 6 deletions(-) > > diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c > index b65a761..9b5a5ef 100644 > --- a/arch/x86/xen/mmu.c > +++ b/arch/x86/xen/mmu.c > @@ -73,6 +73,7 @@ > #include <xen/interface/version.h> > #include <xen/interface/memory.h> > #include <xen/hvc-console.h> > +#include <xen/balloon.h> > > #include "multicalls.h" > #include "mmu.h" > @@ -330,6 +331,26 @@ static void xen_set_pte(pte_t *ptep, pte_t pteval) > __xen_set_pte(ptep, pteval); > } > > +void xen_set_clr_mmio_pvh_pte(unsigned long pfn, unsigned long mfn, > + int nr_mfns, int add_mapping) > +{ > + struct physdev_map_iomem iomem; > + > + iomem.first_gfn = pfn; > + iomem.first_mfn = mfn; > + iomem.nr_mfns = nr_mfns; > + iomem.add_mapping = add_mapping; > + > + if (HYPERVISOR_physdev_op(PHYSDEVOP_pvh_map_iomem, &iomem)) > + BUG(); > +} > + > +static void xen_dom0pvh_set_pte_at(struct mm_struct *mm, unsigned long addr, > + pte_t *ptep, pte_t pteval) > +{ > + native_set_pte(ptep, pteval); > +} > + > static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, > pte_t *ptep, pte_t pteval) > { > @@ -1197,6 +1218,10 @@ static void xen_post_allocator_init(void); > static void __init xen_pagetable_setup_done(pgd_t *base) > { > xen_setup_shared_info(); > + > + if (xen_feature(XENFEAT_auto_translated_physmap)) > + return; > + > xen_post_allocator_init(); > } > > @@ -1455,6 +1480,10 @@ static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) > static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) > { > struct mmuext_op op; > + > + if (xen_feature(XENFEAT_writable_page_tables)) > + return; > + > op.cmd = cmd; > op.arg1.mfn = pfn_to_mfn(pfn); > if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) > @@ -1652,6 +1681,10 @@ static void set_page_prot(void *addr, pgprot_t prot) > unsigned long pfn = __pa(addr) >> PAGE_SHIFT; > pte_t pte = pfn_pte(pfn, prot); > > + /* recall for PVH, page tables are native. */ > + if (xen_feature(XENFEAT_auto_translated_physmap)) > + return; > + > if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0)) > BUG(); > } > @@ -1729,6 +1762,9 @@ static void convert_pfn_mfn(void *v) > pte_t *pte = v; > int i; > > + if (xen_feature(XENFEAT_auto_translated_physmap)) > + return; > + > /* All levels are converted the same way, so just treat them > as ptes. */ > for (i = 0; i < PTRS_PER_PTE; i++) > @@ -1745,6 +1781,7 @@ static void convert_pfn_mfn(void *v) > * but that''s enough to get __va working. We need to fill in the rest > * of the physical mapping once some sort of allocator has been set > * up. > + * NOTE: for PVH, the page tables are native. > */ > pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, > unsigned long max_pfn) > @@ -1802,9 +1839,13 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, > * structure to attach it to, so make sure we just set kernel > * pgd. 
> */ > - xen_mc_batch(); > - __xen_write_cr3(true, __pa(pgd)); > - xen_mc_issue(PARAVIRT_LAZY_CPU); > + if (xen_feature(XENFEAT_writable_page_tables)) { > + native_write_cr3(__pa(pgd)); > + } else { > + xen_mc_batch(); > + __xen_write_cr3(true, __pa(pgd)); > + xen_mc_issue(PARAVIRT_LAZY_CPU); > + } > > memblock_reserve(__pa(xen_start_info->pt_base), > xen_start_info->nr_pt_frames * PAGE_SIZE); > @@ -2067,9 +2108,21 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { > > void __init xen_init_mmu_ops(void) > { > + x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; > + > + if (xen_feature(XENFEAT_auto_translated_physmap)) { > + pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others; > + > + /* set_pte* for PCI devices to map iomem. */ > + if (xen_initial_domain()) { > + pv_mmu_ops.set_pte = native_set_pte; > + pv_mmu_ops.set_pte_at = xen_dom0pvh_set_pte_at; > + } > + return; > + } > + > x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve; > x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start; > - x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; > pv_mmu_ops = xen_mmu_ops; > > memset(dummy_mapping, 0xff, PAGE_SIZE); > @@ -2305,6 +2358,92 @@ void __init xen_hvm_init_mmu_ops(void) > } > #endif > > +/* Map foreign gmfn, fgmfn, to local pfn, lpfn. This for the user space > + * creating new guest on PVH dom0 and needs to map domU pages. > + */ > +static int pvh_add_to_xen_p2m(unsigned long lpfn, unsigned long fgmfn, > + unsigned int domid) > +{ > + int rc; > + struct xen_add_to_physmap xatp = { .u.foreign_domid = domid }; > + > + xatp.gpfn = lpfn; > + xatp.idx = fgmfn; > + xatp.domid = DOMID_SELF; > + xatp.space = XENMAPSPACE_gmfn_foreign; > + rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp); > + if (rc) > + pr_warn("d0: Failed to map pfn to mfn rc:%d pfn:%lx mfn:%lx\n", > + rc, lpfn, fgmfn); > + return rc; > +} > + > +int pvh_rem_xen_p2m(unsigned long spfn, int count) > +{ > + struct xen_remove_from_physmap xrp; > + int i, rc; > + > + for (i=0; i < count; i++) {i = 0> + xrp.domid = DOMID_SELF; > + xrp.gpfn = spfn+i;spfn + i> + rc = HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp); > + if (rc) { > + pr_warn("Failed to unmap pfn:%lx rc:%d done:%d\n", > + spfn+i, rc, i);spfn + i> + return 1; > + } > + } > + return 0; > +} > +EXPORT_SYMBOL_GPL(pvh_rem_xen_p2m); > + > +struct pvh_remap_data { > + unsigned long fgmfn; /* foreign domain''s gmfn */ > + pgprot_t prot; > + domid_t domid; > + struct xen_pvh_pfn_info *pvhinfop; > +}; > + > +static int pvh_map_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr, > + void *data) > +{ > + int rc; > + struct pvh_remap_data *remapp = data; > + struct xen_pvh_pfn_info *pvhp = remapp->pvhinfop; > + unsigned long pfn = page_to_pfn(pvhp->pi_paga[pvhp->pi_next_todo++]); > + pte_t pteval = pte_mkspecial(pfn_pte(pfn, remapp->prot)); > + > + if ((rc=pvh_add_to_xen_p2m(pfn, remapp->fgmfn, remapp->domid))) > + return rc;rc = pvh_add_to_xen_p2m> + native_set_pte(ptep, pteval); > + > + return 0; > +} > + > +/* The only caller at moment passes one gmfn at a time. > + * PVH TBD/FIXME: expand this in future to honor batch requests. 
> + */ > +static int pvh_remap_gmfn_range(struct vm_area_struct *vma, > + unsigned long addr, unsigned long mfn, int nr, > + pgprot_t prot, unsigned domid, > + struct xen_pvh_pfn_info *pvhp) > +{ > + int err; > + struct pvh_remap_data pvhdata; > + > + if (nr > 1) > + return -EINVAL; > + > + pvhdata.fgmfn = mfn; > + pvhdata.prot = prot; > + pvhdata.domid = domid; > + pvhdata.pvhinfop = pvhp; > + err = apply_to_page_range(vma->vm_mm, addr, nr << PAGE_SHIFT, > + pvh_map_pte_fn, &pvhdata); > + flush_tlb_all(); > + return err; > +} > + > #define REMAP_BATCH_SIZE 16 > > struct remap_data { > @@ -2329,7 +2468,9 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, > int xen_remap_domain_mfn_range(struct vm_area_struct *vma, > unsigned long addr, > unsigned long mfn, int nr, > - pgprot_t prot, unsigned domid) > + pgprot_t prot, unsigned domid, > + struct xen_pvh_pfn_info *pvhp) > + > { > struct remap_data rmd; > struct mmu_update mmu_update[REMAP_BATCH_SIZE]; > @@ -2342,6 +2483,12 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma, > BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_RESERVED | VM_IO)) => (VM_PFNMAP | VM_RESERVED | VM_IO))); > > + if (xen_feature(XENFEAT_auto_translated_physmap)) { > + /* We need to update the local page tables and the xen HAP */ > + return pvh_remap_gmfn_range(vma, addr, mfn, nr, prot, domid, > + pvhp); > + } > + > rmd.mfn = mfn; > rmd.prot = prot; > > @@ -2371,3 +2518,26 @@ out: > return err; > } > EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); > + > +/* Returns: Number of pages unmapped */ > +int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, > + struct xen_pvh_pfn_info *pvhp) > +{ > + int count = 0; > + > + if (!pvhp || !xen_feature(XENFEAT_auto_translated_physmap)) > + return 0; > + > + while (pvhp->pi_next_todo--) { > + unsigned long pfn; > + > + /* the mmu has already cleaned up the process mmu resources at > + * this point (lookup_address will return NULL). */ > + pfn = page_to_pfn(pvhp->pi_paga[pvhp->pi_next_todo]); > + pvh_rem_xen_p2m(pfn, 1); > + count++; > + } > + flush_tlb_all(); > + return count; > +} > +EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range); > diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h > index 73809bb..6d0bb56 100644 > --- a/arch/x86/xen/mmu.h > +++ b/arch/x86/xen/mmu.h > @@ -23,4 +23,6 @@ unsigned long xen_read_cr2_direct(void); > > extern void xen_init_mmu_ops(void); > extern void xen_hvm_init_mmu_ops(void); > +extern void xen_set_clr_mmio_pvh_pte(unsigned long pfn, unsigned long mfn, > + int nr_mfns, int add_mapping); > #endif /* _XEN_MMU_H */ > diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h > index 6a198e4..6c5ad83 100644 > --- a/include/xen/xen-ops.h > +++ b/include/xen/xen-ops.h > @@ -24,9 +24,19 @@ int xen_create_contiguous_region(unsigned long vstart, unsigned int order, > void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order); > > struct vm_area_struct; > +struct xen_pvh_pfn_info; > int xen_remap_domain_mfn_range(struct vm_area_struct *vma, > unsigned long addr, > unsigned long mfn, int nr, > - pgprot_t prot, unsigned domid); > + pgprot_t prot, unsigned domid, > + struct xen_pvh_pfn_info *pvhp); > +int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, > + struct xen_pvh_pfn_info *pvhp); > + > +struct xen_pvh_pfn_info { > + struct page **pi_paga; /* pfn info page array */ > + int pi_num_pgs; > + int pi_next_todo; > +}; > > #endif /* INCLUDE_XEN_OPS_H */ > -- > 1.7.2.3 >
On Mon, 1 Oct 2012, Mukesh Rathor wrote:> On Mon, 1 Oct 2012 14:32:44 -0700 > Mukesh Rathor <mukesh.rathor@oracle.com> wrote: > > > > > > > + struct pvh_remap_data *remapp = data; > > > > + struct xen_pvh_pfn_info *pvhp = remapp->pvhinfop; > > > > + unsigned long pfn > > > > page_to_pfn(pvhp->pi_paga[pvhp->pi_next_todo++]); > > > > + pte_t pteval = pte_mkspecial(pfn_pte(pfn, remapp->prot)); > > > > + > > > > + if ((rc=pvh_add_to_xen_p2m(pfn, remapp->fgmfn, > > > > remapp->domid))) > > > > + return rc; > > > > + native_set_pte(ptep, pteval); > > > > > > Do we actually need the pte to be "special"? > > > I would think that being in the guest p2m, it is actually quite a > > > normal page. > > > > Hmm... well, doesn''t like removing "special": > > > > > > BUG: Bad page map in process xl pte:800000027b57b467 pmd:2b408d067 > > page:ffffea0008afb2e8 count:1 mapcount:-1 mapping: (null) > > index:0x0 page flags: 0x40000000000414(referenced|dirty|reserved) > > addr:00007fb4345b0000 vm_flags:000a44fb anon_vma: (null) > > mapping:ffff88003911a3d0 index:4 vma->vm_ops->fault: > > privcmd_fault+0x0/0x40 vma->vm_file->f_op->mmap: privcmd_mmap+0x0/0x30 > > Pid: 2737, comm: xl Tainted: G B 3.6.0-rc6-merge+ #17 > > Call Trace: > > [<ffffffff8113dabc>] print_bad_pte+0x1dc/0x250 > > [<ffffffff8113e57e>] zap_pte_range+0x45e/0x4c0 > > [<ffffffff8113f30e>] unmap_page_range+0x1ae/0x310 > > [<ffffffff8113f4d1>] unmap_single_vma+0x61/0xe0 > > [<ffffffff8113f5a4>] unmap_vmas+0x54/0xa0 > > [<ffffffff8114514b>] unmap_region+0xab/0x120 > > [<ffffffff81313263>] ? privcmd_ioctl+0x93/0x100 > > [<ffffffff8114733d>] do_munmap+0x25d/0x380 > > > > This from: > > if (unlikely(page_mapcount(page) < 0)) > > print_bad_pte(vma, addr, ptent, page); > > > > > > So, the mapcount must be not be getting set in "normal" case properly > > it appears. Marking it special causes it so skip few things. > > Debugging... > > Shall I just leave it special for now, and come back and revisit > this later?special is going to create troubles if somebody starts using these pages in unexpected ways (for example dma from hardware ot gupf). Also I fail to see how this case is any different from mapping pages using gntdev (see gntdev_mmap) that works fine without special. Maybe we are not setting some vm_flags that we are supposed to set (VM_RESERVED | VM_IO | VM_DONTCOPY)? I see that we are setting them in privcmd_mmap but not in privcmd_ioctl_mmap_batch...
On Tue, 2 Oct 2012 12:23:58 +0100 Stefano Stabellini <stefano.stabellini@eu.citrix.com> wrote:> On Mon, 1 Oct 2012, Mukesh Rathor wrote: > > On Mon, 1 Oct 2012 14:32:44 -0700 > > Mukesh Rathor <mukesh.rathor@oracle.com> wrote: > > > > > > > > So, the mapcount must be not be getting set in "normal" case > > > properly it appears. Marking it special causes it so skip few > > > things. Debugging... > > > > Shall I just leave it special for now, and come back and revisit > > this later? > > special is going to create troubles if somebody starts using these > pages in unexpected ways (for example dma from hardware ot gupf). > > Also I fail to see how this case is any different from mapping pages > using gntdev (see gntdev_mmap) that works fine without special. > > Maybe we are not setting some vm_flags that we are supposed to set > (VM_RESERVED | VM_IO | VM_DONTCOPY)? > I see that we are setting them in privcmd_mmap but not in > privcmd_ioctl_mmap_batch...No, that is getting set. privcmd_mmap is hook for mmap(), so it gets called for both. If its not marked special, vm_normal_page() will not check for the VM_PFNMAP flag, which is what we want. It works for PV dom0 because remap_area_mfn_pte_fn() has also marked it special: static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr, void *data) { struct remap_data *rmd = data; pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot)); thanks, Mukesh
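For context, the check Mukesh is describing lives in vm_normal_page() in mm/memory.c. The snippet below is a heavily simplified paraphrase of that behaviour for an arch with __HAVE_ARCH_PTE_SPECIAL such as x86 — it is not the real kernel code, and the helper name is invented purely for illustration:

/* Simplified paraphrase (not verbatim mm/memory.c) of why the special
 * bit matters for these privcmd foreign mappings. */
static struct page *vm_normal_page_sketch(struct vm_area_struct *vma,
					  unsigned long addr, pte_t pte)
{
	/* A special pte means "no struct page to manage here": the
	 * unmap path skips the rmap/mapcount bookkeeping entirely. */
	if (pte_special(pte))
		return NULL;

	/* Otherwise the pfn is assumed to be backed by an ordinary,
	 * refcounted struct page.  The privcmd mappings never raised
	 * the mapcount, so zap_pte_range() ends up in print_bad_pte(),
	 * which is the "Bad page map" splat quoted above. */
	return pfn_to_page(pte_pfn(pte));
}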
> diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
> index 6a198e4..6c5ad83 100644
> --- a/include/xen/xen-ops.h
> +++ b/include/xen/xen-ops.h
> @@ -24,9 +24,19 @@ int xen_create_contiguous_region(unsigned long vstart, unsigned int order,
>  void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order);
>
>  struct vm_area_struct;
> +struct xen_pvh_pfn_info;

If you move the struct def'n up you don't need this forward decl.

>  int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
>                                 unsigned long addr,
>                                 unsigned long mfn, int nr,
> -                               pgprot_t prot, unsigned domid);
> +                               pgprot_t prot, unsigned domid,
> +                               struct xen_pvh_pfn_info *pvhp);
> +int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
> +                               struct xen_pvh_pfn_info *pvhp);
> +
> +struct xen_pvh_pfn_info {

Can we call this xen_remap_mfn_info or something? PVH is x86 specific while this struct is also useful on ARM.

> +     struct page **pi_paga;      /* pfn info page array */

can we just call this "pages"? paga is pretty meaningless.

> +     int pi_num_pgs;
> +     int pi_next_todo;

I don't think we need the pi_ prefix for any of these.

Ian.
On Mon, 24 Sep 2012 12:55:31 +0100
Stefano Stabellini <stefano.stabellini@eu.citrix.com> wrote:
> There are few code style issues on this patch, I suggest you run it
> through scripts/checkpatch.pl, it should be able to catch all these
> errors.

Oh, my bad. I was running them thru cleanpatch and not checkpatch. It's my first linux submission, so please bear with me. I'll run them all thru checkpatch this time.

Thanks
Mukesh
On Wed, 3 Oct 2012 16:42:43 +0100 Ian Campbell <Ian.Campbell@citrix.com> wrote:> > > diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h > > index 6a198e4..6c5ad83 100644 > > --- a/include/xen/xen-ops.h > > +++ b/include/xen/xen-ops.h > > @@ -24,9 +24,19 @@ int xen_create_contiguous_region(unsigned long > > vstart, unsigned int order, void > > xen_destroy_contiguous_region(unsigned long vstart, unsigned int > > order); struct vm_area_struct; > > +struct xen_pvh_pfn_info; > > If you move the struct def''n up you don''t need this forward decl. > > > int xen_remap_domain_mfn_range(struct vm_area_struct *vma, > > unsigned long addr, > > unsigned long mfn, int nr, > > - pgprot_t prot, unsigned domid); > > + pgprot_t prot, unsigned domid, > > + struct xen_pvh_pfn_info *pvhp); > > +int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, > > + struct xen_pvh_pfn_info *pvhp); > > + > > +struct xen_pvh_pfn_info { > > Can we call this xen_remap_mfn_info or something? PVH is x86 specific > while this struct is also useful on ARM.I already renamed it to: xen_xlat_pfn_info.> > + struct page **pi_paga; /* pfn info page > > array */ > > can we just call this "pages"? paga is pretty meaningless.page array! i can rename page_array or page_a.> > + int pi_num_pgs; > > + int pi_next_todo; > > I don''t think we need the pi_ prefix for any of these.The prefix for fields in struct make it easy to find via cscope or grep, otherwise, it''s a nightmare to find common field names like pages when reading code. I really get frustrated. I prefer prefixing all field names. thanks Mukesh
On Wed, 2012-10-03 at 19:27 +0100, Mukesh Rathor wrote:
> On Mon, 24 Sep 2012 12:55:31 +0100
> Stefano Stabellini <stefano.stabellini@eu.citrix.com> wrote:
> > There are few code style issues on this patch, I suggest you run it
> > through scripts/checkpatch.pl, it should be able to catch all these
> > errors.
>
> Oh, my bad. I was running them thru cleanpatch and not checkpatch. It's
> my first linux submission, so please bear with me. I'll run them all
> thru checkpatch this time.

I'd never heard of cleanpatch before, looks like it is pretty obsolete (not updated since 2007 and still requires 79 char lines).

Ian.
On Wed, 2012-10-03 at 23:29 +0100, Mukesh Rathor wrote:> On Wed, 3 Oct 2012 16:42:43 +0100 > Ian Campbell <Ian.Campbell@citrix.com> wrote: > > > > > > diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h > > > index 6a198e4..6c5ad83 100644 > > > --- a/include/xen/xen-ops.h > > > +++ b/include/xen/xen-ops.h > > > @@ -24,9 +24,19 @@ int xen_create_contiguous_region(unsigned long > > > vstart, unsigned int order, void > > > xen_destroy_contiguous_region(unsigned long vstart, unsigned int > > > order); struct vm_area_struct; > > > +struct xen_pvh_pfn_info; > > > > If you move the struct def''n up you don''t need this forward decl. > > > > > int xen_remap_domain_mfn_range(struct vm_area_struct *vma, > > > unsigned long addr, > > > unsigned long mfn, int nr, > > > - pgprot_t prot, unsigned domid); > > > + pgprot_t prot, unsigned domid, > > > + struct xen_pvh_pfn_info *pvhp); > > > +int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, > > > + struct xen_pvh_pfn_info *pvhp); > > > + > > > +struct xen_pvh_pfn_info { > > > > Can we call this xen_remap_mfn_info or something? PVH is x86 specific > > while this struct is also useful on ARM. > > I already renamed it to: xen_xlat_pfn_info.What does xlat refer to here? I don''t think we translate anything with this struct.> > > + struct page **pi_paga; /* pfn info page > > > array */ > > > > can we just call this "pages"? paga is pretty meaningless. > > page array! i can rename page_array or page_a.What''s wrong with pages? It''s short (some of the lines using this stuff are necessarily pretty long) and obvious.> > > + int pi_num_pgs; > > > + int pi_next_todo; > > > > I don''t think we need the pi_ prefix for any of these. > > The prefix for fields in struct make it easy to find via cscope or > grep, otherwise, it''s a nightmare to find common field names like > pages when reading code. I really get frustrated. I prefer prefixing > all field names.It''s not common practice in Linux to do so but fair enough. While implementing the ARM version of this interface it occurred to me that the num_pgs and next_todo fields here are not really needed across the arch interface e.g. you already get pi_num_pgs from the nr argument to xen_remap_domain_mfn_range and pi_next_todo is really state which fits in struct pvh_remap_data. That would mean that remap_foo could take a struct page * directly and I think you would save an allocation. I may look into implementing that change as I code up the ARM side. Ian.
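A rough sketch of the slimmed-down interface Ian is suggesting here — essentially what Mukesh's later v3 diff in this thread converges on — would look like this in xen-ops.h (prototypes only, shown for illustration):

int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
			       unsigned long addr,
			       unsigned long mfn, int nr,
			       pgprot_t prot, unsigned domid,
			       struct page **pages);
int xen_unmap_domain_mfn_range(struct vm_area_struct *vma);

The caller hands in the ballooned page array directly, the "next page to fill in" cursor becomes private state of the remap implementation, and the separate xen_pvh_pfn_info structure (and its allocation) disappears.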
On Wed, 2012-10-03 at 23:29 +0100, Mukesh Rathor wrote:> On Wed, 3 Oct 2012 16:42:43 +0100 > Ian Campbell <Ian.Campbell@citrix.com> wrote: > > > > > > diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h > > > index 6a198e4..6c5ad83 100644 > > > --- a/include/xen/xen-ops.h > > > +++ b/include/xen/xen-ops.h > > > @@ -24,9 +24,19 @@ int xen_create_contiguous_region(unsigned long > > > vstart, unsigned int order, void > > > xen_destroy_contiguous_region(unsigned long vstart, unsigned int > > > order); struct vm_area_struct; > > > +struct xen_pvh_pfn_info; > > > > If you move the struct def''n up you don''t need this forward decl. > > > > > int xen_remap_domain_mfn_range(struct vm_area_struct *vma, > > > unsigned long addr, > > > unsigned long mfn, int nr, > > > - pgprot_t prot, unsigned domid); > > > + pgprot_t prot, unsigned domid, > > > + struct xen_pvh_pfn_info *pvhp); > > > +int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, > > > + struct xen_pvh_pfn_info *pvhp); > > > + > > > +struct xen_pvh_pfn_info { > > > > Can we call this xen_remap_mfn_info or something? PVH is x86 specific > > while this struct is also useful on ARM. > > I already renamed it to: xen_xlat_pfn_info.BTW, where can I find you latest patches? I''ve just noticed that you''ve not been CCing LKML with this series -- was that deliberate? Ian.
On Fri, 2012-09-21 at 20:15 +0100, Mukesh Rathor wrote:
>
> +int pvh_rem_xen_p2m(unsigned long spfn, int count)
> +{
> +     struct xen_remove_from_physmap xrp;
> +     int i, rc;
> +
> +     for (i=0; i < count; i++) {
> +             xrp.domid = DOMID_SELF;
> +             xrp.gpfn = spfn+i;
> +             rc = HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp);

It seems that the current implementation of this hypercall in the h/v doesn't handle unmapping of foreign pages, since it requires that the page be owned by the domain whose P2M it is manipulating.

How did you fix this on the hypervisor side? I'd like to make sure I do things consistently on the ARM side.

Talking with Tim it seems like having foreign mappings in a p2m is a bit tricky and needs a bit of careful thought WRT reference counting etc.

Ian.

> +             if (rc) {
> +                     pr_warn("Failed to unmap pfn:%lx rc:%d done:%d\n",
> +                             spfn+i, rc, i);
> +                     return 1;
> +             }
> +     }
> +     return 0;
> +}
> +EXPORT_SYMBOL_GPL(pvh_rem_xen_p2m);
On Thu, 4 Oct 2012 09:27:59 +0100 Ian Campbell <Ian.Campbell@citrix.com> wrote:> On Wed, 2012-10-03 at 23:29 +0100, Mukesh Rathor wrote: > > On Wed, 3 Oct 2012 16:42:43 +0100 > > Ian Campbell <Ian.Campbell@citrix.com> wrote: > > > > > > > > > int xen_remap_domain_mfn_range(struct vm_area_struct *vma, > > > > unsigned long addr, > > > > unsigned long mfn, int nr, > > > > - pgprot_t prot, unsigned domid); > > > > + pgprot_t prot, unsigned domid, > > > > + struct xen_pvh_pfn_info *pvhp); > > > > +int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, > > > > + struct xen_pvh_pfn_info *pvhp); > > > > + > > > > +struct xen_pvh_pfn_info { > > > > > > Can we call this xen_remap_mfn_info or something? PVH is x86 > > > specific while this struct is also useful on ARM. > > > > I already renamed it to: xen_xlat_pfn_info. > > What does xlat refer to here? I don''t think we translate anything with > this struct.paging mode xlate! See stefanno''s prev email where he suggested changing name to this.> > > > + struct page **pi_paga; /* pfn info page > > > > array */ > > > > > > can we just call this "pages"? paga is pretty meaningless. > > > > page array! i can rename page_array or page_a. > > What''s wrong with pages? It''s short (some of the lines using this > stuff are necessarily pretty long) and obvious.grep''ing pages would give thousands results. I can prefix with something and use pages.> > > > + int pi_num_pgs; > > > > + int pi_next_todo; > > > > > > I don''t think we need the pi_ prefix for any of these. > > > > The prefix for fields in struct make it easy to find via cscope or > > grep, otherwise, it''s a nightmare to find common field names like > > pages when reading code. I really get frustrated. I prefer prefixing > > all field names. > > It''s not common practice in Linux to do so but fair enough. > > While implementing the ARM version of this interface it occurred to me > that the num_pgs and next_todo fields here are not really needed > across the arch interface e.g. you already get pi_num_pgs from the nr > argument to xen_remap_domain_mfn_range and pi_next_todo is really > state which fits in struct pvh_remap_data. That would mean that > remap_foo could take a struct page * directly and I think you would > save an allocation.Ok, let me explore this. thanks Mukesh
> > > > > + struct page **pi_paga; /* pfn info page > > > > > array */ > > > > > > > > can we just call this "pages"? paga is pretty meaningless. > > > > > > page array! i can rename page_array or page_a. > > > > What''s wrong with pages? It''s short (some of the lines using this > > stuff are necessarily pretty long) and obvious. > > grep''ing pages would give thousands results. I can prefix with something > and use pages.Please don''t prefix it. ''pages'' is good.> > > > > > > + int pi_num_pgs; > > > > > + int pi_next_todo; > > > > > > > > I don''t think we need the pi_ prefix for any of these. > > > > > > The prefix for fields in struct make it easy to find via cscope or > > > grep, otherwise, it''s a nightmare to find common field names like > > > pages when reading code. I really get frustrated. I prefer prefixing > > > all field names. > > > > It''s not common practice in Linux to do so but fair enough.Please remove the ''pi_'' field. Everytime I see it I think of bathroom and then 3.1415... I''ve no idea what it actually stands for and I am not sure if there is a need to know what it stands for? If the ''pi_'' is very important - it should be part of the structure''s name. And then probably unrolled. If you are searching for a field in a structure - why? Why not search for the structure itself? Making the structure name unique should be enough right to find in cscope/ctags? Both ctags and cscope are good at helping you (once you have the structure or code name) at finding the definitions of the fields if you need to.
On Thu, 4 Oct 2012 09:31:36 +0100 Ian Campbell <Ian.Campbell@citrix.com> wrote:> On Wed, 2012-10-03 at 23:29 +0100, Mukesh Rathor wrote: > > On Wed, 3 Oct 2012 16:42:43 +0100 > > Ian Campbell <Ian.Campbell@citrix.com> wrote: > > > > > > > > > diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h > > > > index 6a198e4..6c5ad83 100644 > > > > --- a/include/xen/xen-ops.h > > > > +++ b/include/xen/xen-ops.h > > > > @@ -24,9 +24,19 @@ int xen_create_contiguous_region(unsigned > > > > long vstart, unsigned int order, void > > > > xen_destroy_contiguous_region(unsigned long vstart, unsigned int > > > > order); struct vm_area_struct; > > > > +struct xen_pvh_pfn_info; > > > > > > If you move the struct def''n up you don''t need this forward decl. > > > > > > > int xen_remap_domain_mfn_range(struct vm_area_struct *vma, > > > > unsigned long addr, > > > > unsigned long mfn, int nr, > > > > - pgprot_t prot, unsigned domid); > > > > + pgprot_t prot, unsigned domid, > > > > + struct xen_pvh_pfn_info *pvhp); > > > > +int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, > > > > + struct xen_pvh_pfn_info *pvhp); > > > > + > > > > +struct xen_pvh_pfn_info { > > > > > > Can we call this xen_remap_mfn_info or something? PVH is x86 > > > specific while this struct is also useful on ARM. > > > > I already renamed it to: xen_xlat_pfn_info. > > BTW, where can I find you latest patches?In my local tree here. Will be submitting v3 soon.> I''ve just noticed that you''ve not been CCing LKML with this series -- > was that deliberate?I was told I didn''t need to since there is no common code change. thanks Mukesh
On Thu, 4 Oct 2012 14:54:47 +0100 Ian Campbell <Ian.Campbell@citrix.com> wrote:> On Fri, 2012-09-21 at 20:15 +0100, Mukesh Rathor wrote: > > > > +int pvh_rem_xen_p2m(unsigned long spfn, int count) > > +{ > > + struct xen_remove_from_physmap xrp; > > + int i, rc; > > + > > + for (i=0; i < count; i++) { > > + xrp.domid = DOMID_SELF; > > + xrp.gpfn = spfn+i; > > + rc > > HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp); > > It seems that the current implementation of this hypercall in the h/v > doesn''t handle unmapping of foreign pages, since it requires that the > page be owned by the domain whose P2M it is manipulating. > > How did you fix this on the hypervisor side? I''d like to make sure I > do things consistently on the ARM side. > > Talking with Tim it seems like having foreign mappings in a p2m is a > bit tricky and needs a bit of careful thought WRT reference counting > etc.Hey Ian, This is what I''ve (don''t worry about "hybrid" word for now, i''ll fix it): casee XENMEM_remove_from_physmap: { struct xen_remove_from_physmap xrfp; struct page_info *page; struct domain *d; p2m_type_t p2mt; if ( copy_from_guest(&xrfp, arg, 1) ) return -EFAULT; rc = rcu_lock_target_domain_by_id(xrfp.domid, &d); if ( rc != 0 ) return rc; if ( xsm_remove_from_physmap(current->domain, d) ) { rcu_unlock_domain(d); return -EPERM; } domain_lock(d); page = get_page_from_gfn(d, xrfp.gpfn, &p2mt, P2M_ALLOC); if ( page || (is_hybrid_vcpu(current) && (p2m_is_mmio(p2mt) || p2m_is_foreign(p2mt))) ) { unsigned long argmfn = page ? page_to_mfn(page) : INVALID_MFN; guest_physmap_remove_page(d, xrfp.gpfn, argmfn, 0); if (page) put_page(page); } else { if ( is_hybrid_vcpu(current) ) gdprintk(XENLOG_WARNING, "%s: Domain:%u gmfn:%lx invalid\n", __FUNCTION__, current->domain->domain_id, xrfp.gpfn); rc = -ENOENT; } domain_unlock(d); rcu_unlock_domain(d); break; }
On Thu, 2012-10-04 at 19:27 +0100, Mukesh Rathor wrote:> On Thu, 4 Oct 2012 09:27:59 +0100 > Ian Campbell <Ian.Campbell@citrix.com> wrote: > > > On Wed, 2012-10-03 at 23:29 +0100, Mukesh Rathor wrote: > > > On Wed, 3 Oct 2012 16:42:43 +0100 > > > Ian Campbell <Ian.Campbell@citrix.com> wrote: > > > > > > > > > > > > int xen_remap_domain_mfn_range(struct vm_area_struct *vma, > > > > > unsigned long addr, > > > > > unsigned long mfn, int nr, > > > > > - pgprot_t prot, unsigned domid); > > > > > + pgprot_t prot, unsigned domid, > > > > > + struct xen_pvh_pfn_info *pvhp); > > > > > +int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, > > > > > + struct xen_pvh_pfn_info *pvhp); > > > > > + > > > > > +struct xen_pvh_pfn_info { > > > > > > > > Can we call this xen_remap_mfn_info or something? PVH is x86 > > > > specific while this struct is also useful on ARM. > > > > > > I already renamed it to: xen_xlat_pfn_info. > > > > What does xlat refer to here? I don''t think we translate anything with > > this struct. > > paging mode xlate! See stefanno''s prev email where he suggested changing > name to this.Oh, because the uses are under XENFEAT_auto_translated_physmap? I suppose that makes sense, kind of. I guess it doesn''t really matter since I reckon we can get rid of the struct entirely anyway ;-) Ian.
On Fri, 2012-10-05 at 02:17 +0100, Mukesh Rathor wrote:> On Thu, 4 Oct 2012 09:31:36 +0100 > Ian Campbell <Ian.Campbell@citrix.com> wrote: > > > On Wed, 2012-10-03 at 23:29 +0100, Mukesh Rathor wrote: > > > On Wed, 3 Oct 2012 16:42:43 +0100 > > > Ian Campbell <Ian.Campbell@citrix.com> wrote: > > > > > > > > > > > > diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h > > > > > index 6a198e4..6c5ad83 100644 > > > > > --- a/include/xen/xen-ops.h > > > > > +++ b/include/xen/xen-ops.h > > > > > @@ -24,9 +24,19 @@ int xen_create_contiguous_region(unsigned > > > > > long vstart, unsigned int order, void > > > > > xen_destroy_contiguous_region(unsigned long vstart, unsigned int > > > > > order); struct vm_area_struct; > > > > > +struct xen_pvh_pfn_info; > > > > > > > > If you move the struct def''n up you don''t need this forward decl. > > > > > > > > > int xen_remap_domain_mfn_range(struct vm_area_struct *vma, > > > > > unsigned long addr, > > > > > unsigned long mfn, int nr, > > > > > - pgprot_t prot, unsigned domid); > > > > > + pgprot_t prot, unsigned domid, > > > > > + struct xen_pvh_pfn_info *pvhp); > > > > > +int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, > > > > > + struct xen_pvh_pfn_info *pvhp); > > > > > + > > > > > +struct xen_pvh_pfn_info { > > > > > > > > Can we call this xen_remap_mfn_info or something? PVH is x86 > > > > specific while this struct is also useful on ARM. > > > > > > I already renamed it to: xen_xlat_pfn_info. > > > > BTW, where can I find you latest patches? > > In my local tree here. Will be submitting v3 soon.Great, thanks.> > I''ve just noticed that you''ve not been CCing LKML with this series -- > > was that deliberate? > > I was told I didn''t need to since there is no common code change.I always (try to) CC both regardless but I suppose that''s not really a rule. Ian.
On Thu, 4 Oct 2012 09:27:59 +0100 Ian Campbell <Ian.Campbell@citrix.com> wrote:> On Wed, 2012-10-03 at 23:29 +0100, Mukesh Rathor wrote: > > On Wed, 3 Oct 2012 16:42:43 +0100 > > Ian Campbell <Ian.Campbell@citrix.com> wrote: > > > While implementing the ARM version of this interface it occurred to me > that the num_pgs and next_todo fields here are not really needed > across the arch interface e.g. you already get pi_num_pgs from the nr > argument to xen_remap_domain_mfn_range and pi_next_todo is really > state which fits in struct pvh_remap_data. That would mean that > remap_foo could take a struct page * directly and I think you would > save an allocation.Ok, take a look at the attached and inlined diffs. I redid it, passing struct page to the api, and getting rid of the pi_* struct completely. thanks Mukesh diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index ef63895..6cbf59a 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -33,6 +33,7 @@ #include <xen/features.h> #include <xen/page.h> #include <xen/xen-ops.h> +#include <xen/balloon.h> #include "privcmd.h" @@ -178,7 +179,7 @@ static int mmap_mfn_range(void *data, void *state) msg->va & PAGE_MASK, msg->mfn, msg->npages, vma->vm_page_prot, - st->domain); + st->domain, NULL); if (rc < 0) return rc; @@ -199,6 +200,10 @@ static long privcmd_ioctl_mmap(void __user *udata) if (!xen_initial_domain()) return -EPERM; + /* We only support privcmd_ioctl_mmap_batch for auto translated. */ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return -ENOSYS; + if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd))) return -EFAULT; @@ -246,6 +251,7 @@ struct mmap_batch_state { domid_t domain; unsigned long va; struct vm_area_struct *vma; + int index; /* A tristate: * 0 for no errors * 1 if at least one error has happened (and no @@ -260,14 +266,20 @@ struct mmap_batch_state { xen_pfn_t __user *user_mfn; }; +/* PVH dom0 fyi: if domU being created is PV, then mfn is mfn(addr on bus). If + * it''s PVH then mfn is pfn (input to HAP). */ static int mmap_batch_fn(void *data, void *state) { xen_pfn_t *mfnp = data; struct mmap_batch_state *st = state; + struct vm_area_struct *vma = st->vma; + struct page **pages = vma ? vma->vm_private_data : NULL; + struct page *cur_page = pages[st->index++]; int ret; ret = xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, - st->vma->vm_page_prot, st->domain); + st->vma->vm_page_prot, st->domain, + &cur_page); /* Store error code for second pass. */ *(st->err++) = ret; @@ -303,6 +315,32 @@ static int mmap_return_errors_v1(void *data, void *state) return __put_user(*mfnp, st->user_mfn++); } +/* Allocate pfns that are then mapped with gmfns from foreign domid. Update + * the vma with the page info to use later. 
+ * Returns: 0 if success, otherwise -errno + */ +static int pvh_privcmd_resv_pfns(struct vm_area_struct *vma, int numpgs) +{ + int rc; + struct page **pages; + + pages = kcalloc(numpgs, sizeof(pages[0]), GFP_KERNEL); + if (pages == NULL) + return -ENOMEM; + + rc = alloc_xenballooned_pages(numpgs, pages, 0); + if (rc != 0) { + pr_warn("%s Could not alloc %d pfns rc:%d\n", __func__, + numpgs, rc); + kfree(pages); + return -ENOMEM; + } + BUG_ON(vma->vm_private_data); + vma->vm_private_data = pages; + + return 0; +} + static struct vm_operations_struct privcmd_vm_ops; static long privcmd_ioctl_mmap_batch(void __user *udata, int version) @@ -370,10 +408,17 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version) up_write(&mm->mmap_sem); goto out; } + if (xen_feature(XENFEAT_auto_translated_physmap)) { + if ((ret = pvh_privcmd_resv_pfns(vma, m.num)) < 0) { + up_write(&mm->mmap_sem); + goto out; + } + } state.domain = m.dom; state.vma = vma; state.va = m.addr; + state.index = 0; state.global_error = 0; state.err = err_array; @@ -438,6 +483,20 @@ static long privcmd_ioctl(struct file *file, return ret; } +static void privcmd_close(struct vm_area_struct *vma) +{ + struct page **pages = vma ? vma->vm_private_data : NULL; + int numpgs = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + + if (!pages || !numpgs || !xen_feature(XENFEAT_auto_translated_physmap)) + return; + + xen_unmap_domain_mfn_range(vma); + while (numpgs--) + free_xenballooned_pages(1, &pages[numpgs]); + kfree(pages); +} + static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n", @@ -448,6 +507,7 @@ static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } static struct vm_operations_struct privcmd_vm_ops = { + .close = privcmd_close, .fault = privcmd_fault }; @@ -464,6 +524,10 @@ static int privcmd_mmap(struct file *file, struct vm_area_struct *vma) static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma) { + if (xen_feature(XENFEAT_auto_translated_physmap)) { + int sz = sizeof(vma->vm_private_data); + return (!__cmpxchg(&vma->vm_private_data, NULL, NULL, sz)); + } return (xchg(&vma->vm_private_data, (void *)1) == NULL); } ============================================================================================== diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 5a16824..d5e53ad 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -73,6 +73,7 @@ #include <xen/interface/version.h> #include <xen/interface/memory.h> #include <xen/hvc-console.h> +#include <xen/balloon.h> #include "multicalls.h" #include "mmu.h" @@ -331,6 +332,20 @@ static void xen_set_pte(pte_t *ptep, pte_t pteval) __xen_set_pte(ptep, pteval); } +void xen_set_clr_mmio_pvh_pte(unsigned long pfn, unsigned long mfn, + int nr_mfns, int add_mapping) +{ + struct physdev_map_iomem iomem; + + iomem.first_gfn = pfn; + iomem.first_mfn = mfn; + iomem.nr_mfns = nr_mfns; + iomem.add_mapping = add_mapping; + + if (HYPERVISOR_physdev_op(PHYSDEVOP_pvh_map_iomem, &iomem)) + BUG(); +} + static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { @@ -1220,6 +1235,8 @@ static void __init xen_pagetable_init(void) #endif paging_init(); xen_setup_shared_info(); + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; #ifdef CONFIG_X86_64 if (!xen_feature(XENFEAT_auto_translated_physmap)) { unsigned long new_mfn_list; @@ -1527,6 +1544,10 @@ static void __init xen_set_pte_init(pte_t *ptep, pte_t 
pte) static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) { struct mmuext_op op; + + if (xen_feature(XENFEAT_writable_page_tables)) + return; + op.cmd = cmd; op.arg1.mfn = pfn_to_mfn(pfn); if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) @@ -1724,6 +1745,10 @@ static void set_page_prot(void *addr, pgprot_t prot) unsigned long pfn = __pa(addr) >> PAGE_SHIFT; pte_t pte = pfn_pte(pfn, prot); + /* recall for PVH, page tables are native. */ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0)) BUG(); } @@ -1801,6 +1826,9 @@ static void convert_pfn_mfn(void *v) pte_t *pte = v; int i; + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + /* All levels are converted the same way, so just treat them as ptes. */ for (i = 0; i < PTRS_PER_PTE; i++) @@ -1820,6 +1848,7 @@ static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end, (*pt_end)--; } } + /* * Set up the initial kernel pagetable. * @@ -1830,6 +1859,7 @@ static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end, * but that''s enough to get __va working. We need to fill in the rest * of the physical mapping once some sort of allocator has been set * up. + * NOTE: for PVH, the page tables are native. */ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { @@ -1907,10 +1937,13 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) * structure to attach it to, so make sure we just set kernel * pgd. */ - xen_mc_batch(); - __xen_write_cr3(true, __pa(init_level4_pgt)); - xen_mc_issue(PARAVIRT_LAZY_CPU); - + if (xen_feature(XENFEAT_writable_page_tables)) { + native_write_cr3(__pa(init_level4_pgt)); + } else { + xen_mc_batch(); + __xen_write_cr3(true, __pa(init_level4_pgt)); + xen_mc_issue(PARAVIRT_LAZY_CPU); + } /* We can''t that easily rip out L3 and L2, as the Xen pagetables are * set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for * the initial domain. For guests using the toolstack, they are in: @@ -2177,8 +2210,19 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { void __init xen_init_mmu_ops(void) { - x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve; x86_init.paging.pagetable_init = xen_pagetable_init; + + if (xen_feature(XENFEAT_auto_translated_physmap)) { + pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others; + + /* set_pte* for PCI devices to map iomem. */ + if (xen_initial_domain()) { + pv_mmu_ops.set_pte = native_set_pte; + pv_mmu_ops.set_pte_at = native_set_pte_at; + } + return; + } + x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve; pv_mmu_ops = xen_mmu_ops; memset(dummy_mapping, 0xff, PAGE_SIZE); @@ -2414,6 +2458,94 @@ void __init xen_hvm_init_mmu_ops(void) } #endif +/* Map foreign gmfn, fgmfn, to local pfn, lpfn. This for the user space + * creating new guest on PVH dom0 and needs to map domU pages. 
+ */ +static int pvh_add_to_xen_p2m(unsigned long lpfn, unsigned long fgmfn, + unsigned int domid) +{ + int rc; + struct xen_add_to_physmap xatp = { .u.foreign_domid = domid }; + + xatp.gpfn = lpfn; + xatp.idx = fgmfn; + xatp.domid = DOMID_SELF; + xatp.space = XENMAPSPACE_gmfn_foreign; + rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp); + if (rc) + pr_warn("d0: Failed to map pfn to mfn rc:%d pfn:%lx mfn:%lx\n", + rc, lpfn, fgmfn); + return rc; +} + +int pvh_rem_xen_p2m(unsigned long spfn, int count) +{ + struct xen_remove_from_physmap xrp; + int i, rc; + + for (i=0; i < count; i++) { + xrp.domid = DOMID_SELF; + xrp.gpfn = spfn+i; + rc = HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp); + if (rc) { + pr_warn("Failed to unmap pfn:%lx rc:%d done:%d\n", + spfn+i, rc, i); + return 1; + } + } + return 0; +} +EXPORT_SYMBOL_GPL(pvh_rem_xen_p2m); + +struct pvh_remap_data { + unsigned long fgmfn; /* foreign domain''s gmfn */ + pgprot_t prot; + domid_t domid; + int index; + struct page **pages; +}; + +static int pvh_map_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr, + void *data) +{ + int rc; + struct pvh_remap_data *remap = data; + unsigned long pfn = page_to_pfn(remap->pages[remap->index++]); + pte_t pteval = pte_mkspecial(pfn_pte(pfn, remap->prot)); + + if ((rc=pvh_add_to_xen_p2m(pfn, remap->fgmfn, remap->domid))) + return rc; + native_set_pte(ptep, pteval); + + return 0; +} + +/* The only caller at moment passes one gmfn at a time. + * PVH TBD/FIXME: expand this in future to honor batch requests. + */ +static int pvh_remap_gmfn_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long mfn, int nr, + pgprot_t prot, unsigned domid, + struct page **pages) +{ + int err; + struct pvh_remap_data pvhdata; + + if (nr > 1) + return -EINVAL; + + pvhdata.fgmfn = mfn; + pvhdata.prot = prot; + pvhdata.domid = domid; + pvhdata.index = 0; + pvhdata.pages = pages; + err = apply_to_page_range(vma->vm_mm, addr, nr << PAGE_SHIFT, + pvh_map_pte_fn, &pvhdata); + flush_tlb_all(); + /* flush_tlb_page(vma, addr); */ + return err; +} + #define REMAP_BATCH_SIZE 16 struct remap_data { @@ -2438,7 +2570,9 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, int xen_remap_domain_mfn_range(struct vm_area_struct *vma, unsigned long addr, unsigned long mfn, int nr, - pgprot_t prot, unsigned domid) + pgprot_t prot, unsigned domid, + struct page **pages) + { struct remap_data rmd; struct mmu_update mmu_update[REMAP_BATCH_SIZE]; @@ -2446,14 +2580,17 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma, unsigned long range; int err = 0; - if (xen_feature(XENFEAT_auto_translated_physmap)) - return -EINVAL; - prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP); BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_RESERVED | VM_IO)) = (VM_PFNMAP | VM_RESERVED | VM_IO))); + if (xen_feature(XENFEAT_auto_translated_physmap)) { + /* We need to update the local page tables and the xen HAP */ + return pvh_remap_gmfn_range(vma, addr, mfn, nr, prot, domid, + pages); + } + rmd.mfn = mfn; rmd.prot = prot; @@ -2483,3 +2620,25 @@ out: return err; } EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); + +/* Returns: 0 success */ +int xen_unmap_domain_mfn_range(struct vm_area_struct *vma) +{ + int numpgs = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + struct page **pages = vma ? 
vma->vm_private_data : NULL; + + if (!pages || !xen_feature(XENFEAT_auto_translated_physmap)) + return 0; + + while (numpgs--) { + + /* the mmu has already cleaned up the process mmu resources at + * this point (lookup_address will return NULL). */ + unsigned long pfn = page_to_pfn(pages[numpgs]); + + pvh_rem_xen_p2m(pfn, 1); + } + flush_tlb_all(); + return 0; +} +EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range);
On Sat, 2012-10-06 at 03:00 +0100, Mukesh Rathor wrote:> On Thu, 4 Oct 2012 09:27:59 +0100 > Ian Campbell <Ian.Campbell@citrix.com> wrote: > > > On Wed, 2012-10-03 at 23:29 +0100, Mukesh Rathor wrote: > > > On Wed, 3 Oct 2012 16:42:43 +0100 > > > Ian Campbell <Ian.Campbell@citrix.com> wrote: > > > > > While implementing the ARM version of this interface it occurred to me > > that the num_pgs and next_todo fields here are not really needed > > across the arch interface e.g. you already get pi_num_pgs from the nr > > argument to xen_remap_domain_mfn_range and pi_next_todo is really > > state which fits in struct pvh_remap_data. That would mean that > > remap_foo could take a struct page * directly and I think you would > > save an allocation. > > Ok, take a look at the attached and inlined diffs. I redid it, passing > struct page to the api, and getting rid of the pi_* struct completely.Excellent, thanks.> @@ -260,14 +266,20 @@ struct mmap_batch_state { > xen_pfn_t __user *user_mfn; > }; > > +/* PVH dom0 fyi: if domU being created is PV, then mfn is mfn(addr on bus). If > + * it''s PVH then mfn is pfn (input to HAP). */Can we say XENFEAT_auto_translated_physmap (or some abbrev.) instead of PVH here, since this is generic code.> static int mmap_batch_fn(void *data, void *state) > { > xen_pfn_t *mfnp = data; > struct mmap_batch_state *st = state; > + struct vm_area_struct *vma = st->vma; > + struct page **pages = vma ? vma->vm_private_data : NULL; > + struct page *cur_page = pages[st->index++]; > int ret; > > ret = xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, > - st->vma->vm_page_prot, st->domain); > + st->vma->vm_page_prot, st->domain, > + &cur_page); > > /* Store error code for second pass. */ > *(st->err++) = ret;[...]> @@ -370,10 +408,17 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version) > up_write(&mm->mmap_sem); > goto out; > } > + if (xen_feature(XENFEAT_auto_translated_physmap)) { > + if ((ret = pvh_privcmd_resv_pfns(vma, m.num)) < 0) {I renamed this alloc_empty_pages to avoid the PVH terminology in common code. There could be an argument for moving this to next to alloc_xenballooned_pages e.g. as alloc_xenballooned_pagearray. We used to have alloc_empty_pages_and_pagevec() In the classic kernels which was effectively that same function. Ian.
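For reference, a page-array wrapper of the kind Ian is alluding to could be a thin layer over alloc_xenballooned_pages(), much like the pvh_privcmd_resv_pfns()/alloc_empty_pages() code earlier in the thread. The name alloc_xenballooned_pagearray() below is hypothetical, not an existing API — this is only a sketch of the idea:

/* Hypothetical helper next to alloc_xenballooned_pages(); mirrors what
 * the privcmd code in this thread does, just factored out. */
static struct page **alloc_xenballooned_pagearray(int numpgs)
{
	struct page **pages;

	pages = kcalloc(numpgs, sizeof(pages[0]), GFP_KERNEL);
	if (pages == NULL)
		return NULL;

	/* lowmem pages, as in the privcmd code above */
	if (alloc_xenballooned_pages(numpgs, pages, false)) {
		kfree(pages);
		return NULL;
	}
	return pages;
}

privcmd would then keep only the matching teardown: free_xenballooned_pages() on each entry plus kfree() of the array, as privcmd_close() already does in the v3 diff.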