1: IOMMU: make page table population preemptible
2: IOMMU: make page table deallocation preemptible
3: x86/p2m: restrict auditing to debug builds
4: HVM: prevent leaking heap data from hvm_save_one()
5: x86/PV: don't commit debug register values early in arch_set_info_guest()

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Jan Beulich
2013-Dec-10 15:45 UTC
[PATCH 1/5] IOMMU: make page table population preemptible
Since this can take an arbitrary amount of time, the rooting domctl as well as all involved code must become aware of this requiring a continuation. The subject domain''s rel_mem_list is being (ab)used for this, in a way similar to and compatible with broken page offlining. Further, operations get slightly re-ordered in assign_device(): IOMMU page tables now get set up _before_ the first device gets assigned, at once closing a small timing window in which the guest may already see the device but wouldn''t be able to access it. Signed-off-by: Jan Beulich <jbeulich@suse.com> --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -1923,6 +1923,12 @@ int domain_relinquish_resources(struct d } d->arch.relmem = RELMEM_xen; + + spin_lock(&d->page_alloc_lock); + page_list_splice(&d->arch.relmem_list, &d->page_list); + INIT_PAGE_LIST_HEAD(&d->arch.relmem_list); + spin_unlock(&d->page_alloc_lock); + /* Fallthrough. Relinquish every page of memory. */ case RELMEM_xen: ret = relinquish_memory(d, &d->xenpage_list, ~0UL); --- a/xen/arch/x86/mm/p2m-pod.c +++ b/xen/arch/x86/mm/p2m-pod.c @@ -459,7 +459,8 @@ p2m_pod_offline_or_broken_hit(struct pag pod_hit: lock_page_alloc(p2m); - page_list_add_tail(p, &d->arch.relmem_list); + /* Insertion must be at list head (see iommu_populate_page_table()). */ + page_list_add(p, &d->arch.relmem_list); unlock_page_alloc(p2m); pod_unlock(p2m); return 1; --- a/xen/drivers/passthrough/iommu.c +++ b/xen/drivers/passthrough/iommu.c @@ -18,6 +18,7 @@ #include <asm/hvm/iommu.h> #include <xen/paging.h> #include <xen/guest_access.h> +#include <xen/event.h> #include <xen/softirq.h> #include <xen/keyhandler.h> #include <xsm/xsm.h> @@ -265,7 +266,23 @@ static int assign_device(struct domain * d->mem_event->paging.ring_page)) ) return -EXDEV; - spin_lock(&pcidevs_lock); + if ( !spin_trylock(&pcidevs_lock) ) + return -ERESTART; + + if ( need_iommu(d) <= 0 ) + { + if ( !iommu_use_hap_pt(d) ) + { + rc = iommu_populate_page_table(d); + if ( rc ) + { + spin_unlock(&pcidevs_lock); + return rc; + } + } + d->need_iommu = 1; + } + pdev = pci_get_pdev_by_domain(dom0, seg, bus, devfn); if ( !pdev ) { @@ -290,15 +307,14 @@ static int assign_device(struct domain * rc); } - if ( has_arch_pdevs(d) && !need_iommu(d) ) + done: + if ( !has_arch_pdevs(d) && need_iommu(d) ) { - d->need_iommu = 1; - if ( !iommu_use_hap_pt(d) ) - rc = iommu_populate_page_table(d); - goto done; + d->need_iommu = 0; + hd->platform_ops->teardown(d); } -done: spin_unlock(&pcidevs_lock); + return rc; } @@ -306,12 +322,17 @@ static int iommu_populate_page_table(str { struct hvm_iommu *hd = domain_hvm_iommu(d); struct page_info *page; - int rc = 0; + int rc = 0, n = 0; + + d->need_iommu = -1; this_cpu(iommu_dont_flush_iotlb) = 1; spin_lock(&d->page_alloc_lock); - page_list_for_each ( page, &d->page_list ) + if ( unlikely(d->is_dying) ) + rc = -ESRCH; + + while ( !rc && (page = page_list_remove_head(&d->page_list)) ) { if ( is_hvm_domain(d) || (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page ) @@ -321,7 +342,32 @@ static int iommu_populate_page_table(str d, mfn_to_gmfn(d, page_to_mfn(page)), page_to_mfn(page), IOMMUF_readable|IOMMUF_writable); if ( rc ) + { + page_list_add(page, &d->page_list); break; + } + } + page_list_add_tail(page, &d->arch.relmem_list); + if ( !(++n & 0xff) && !page_list_empty(&d->page_list) && + hypercall_preempt_check() ) + rc = -ERESTART; + } + + if ( !rc ) + { + /* + * The expectation here is that generally there are many normal pages + * on relmem_list (the ones we put there) and only 
few being in an + * offline/broken state. The latter ones are always at the head of the + * list. Hence we first move the whole list, and then move back the + * first few entries. + */ + page_list_move(&d->page_list, &d->arch.relmem_list); + while ( (page = page_list_first(&d->page_list)) != NULL && + (page->count_info & (PGC_state|PGC_broken)) ) + { + page_list_del(page, &d->page_list); + page_list_add_tail(page, &d->arch.relmem_list); } } @@ -330,8 +376,11 @@ static int iommu_populate_page_table(str if ( !rc ) iommu_iotlb_flush_all(d); - else + else if ( rc != -ERESTART ) + { hd->platform_ops->teardown(d); + d->need_iommu = 0; + } return rc; } @@ -688,7 +737,10 @@ int iommu_do_domctl( ret = device_assigned(seg, bus, devfn) ?: assign_device(d, seg, bus, devfn); - if ( ret ) + if ( ret == -ERESTART ) + ret = hypercall_create_continuation(__HYPERVISOR_domctl, + "h", u_domctl); + else if ( ret ) printk(XENLOG_G_ERR "XEN_DOMCTL_assign_device: " "assign %04x:%02x:%02x.%u to dom%d failed (%d)\n", seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), --- a/xen/include/xen/sched.h +++ b/xen/include/xen/sched.h @@ -323,7 +323,7 @@ struct domain #ifdef HAS_PASSTHROUGH /* Does this guest need iommu mappings? */ - bool_t need_iommu; + s8 need_iommu; #endif /* is node-affinity automatically computed? */ bool_t auto_node_affinity; _______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
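For readers less familiar with Xen's preemption machinery, the shape of what the patch adds can be summarised in a condensed sketch (all identifiers below appear in the patch; error handling, the PGT type check and the is_dying check are trimmed, so treat this as an illustration rather than the real code): the worker polls hypercall_preempt_check() only every 256 pages and reports -ERESTART when it wants to yield, and the domctl dispatcher converts that into a hypercall continuation so the toolstack's domctl gets transparently re-issued and the loop resumes with whatever is still on page_list.

/* Worker: pages already processed have been moved to relmem_list, so a
 * restarted invocation simply continues with what is left on page_list. */
static int populate_sketch(struct domain *d)        /* illustrative name */
{
    struct page_info *page;
    int rc = 0, n = 0;

    spin_lock(&d->page_alloc_lock);
    while ( !rc && (page = page_list_remove_head(&d->page_list)) )
    {
        /* ... establish the IOMMU mapping for this page ... */
        page_list_add_tail(page, &d->arch.relmem_list);
        if ( !(++n & 0xff) && hypercall_preempt_check() )
            rc = -ERESTART;
    }
    spin_unlock(&d->page_alloc_lock);

    return rc;
}

/* Dispatcher: turn -ERESTART into a continuation of the same domctl. */
    ret = populate_sketch(d);
    if ( ret == -ERESTART )
        ret = hypercall_create_continuation(__HYPERVISOR_domctl,
                                            "h", u_domctl);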
Jan Beulich
2013-Dec-10 15:46 UTC
[PATCH 2/5] IOMMU: make page table deallocation preemptible
This too can take an arbitrary amount of time. In fact, the bulk of the work is being moved to a tasklet, as handling the necessary preemption logic in line seems close to impossible given that the teardown may also be invoked on error paths. Signed-off-by: Jan Beulich <jbeulich@suse.com> --- a/xen/drivers/passthrough/amd/pci_amd_iommu.c +++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c @@ -195,6 +195,8 @@ static int __init amd_iommu_setup_dom0_d return 0; } +static void deallocate_page_tables(unsigned long unused); + int __init amd_iov_detect(void) { INIT_LIST_HEAD(&amd_iommu_head); @@ -215,6 +217,7 @@ int __init amd_iov_detect(void) return -ENODEV; } + tasklet_init(&iommu_pt_cleanup_tasklet, deallocate_page_tables, 0); init_done = 1; if ( !amd_iommu_perdev_intremap ) @@ -405,11 +408,21 @@ static int amd_iommu_assign_device(struc return reassign_device(dom0, d, devfn, pdev); } -static void deallocate_next_page_table(struct page_info* pg, int level) +static void deallocate_next_page_table(struct page_info *pg, int level) +{ + spin_lock(&iommu_pt_cleanup_lock); + PFN_ORDER(pg) = level; + page_list_add_tail(pg, &iommu_pt_cleanup_list); + spin_unlock(&iommu_pt_cleanup_lock); +} + +static void deallocate_page_table(struct page_info *pg) { void *table_vaddr, *pde; u64 next_table_maddr; - int index, next_level; + unsigned int index, level = PFN_ORDER(pg), next_level; + + PFN_ORDER(pg) = 0; if ( level <= 1 ) { @@ -439,6 +452,23 @@ static void deallocate_next_page_table(s free_amd_iommu_pgtable(pg); } +static void deallocate_page_tables(unsigned long unused) +{ + do { + struct page_info *pg; + + spin_lock(&iommu_pt_cleanup_lock); + pg = page_list_remove_head(&iommu_pt_cleanup_list); + spin_unlock(&iommu_pt_cleanup_lock); + if ( !pg ) + return; + deallocate_page_table(pg); + } while ( !softirq_pending(smp_processor_id()) ); + + tasklet_schedule_on_cpu(&iommu_pt_cleanup_tasklet, + cpumask_cycle(smp_processor_id(), &cpu_online_map)); +} + static void deallocate_iommu_page_tables(struct domain *d) { struct hvm_iommu *hd = domain_hvm_iommu(d); @@ -451,6 +481,7 @@ static void deallocate_iommu_page_tables { deallocate_next_page_table(hd->root_table, hd->paging_mode); hd->root_table = NULL; + tasklet_schedule(&iommu_pt_cleanup_tasklet); } spin_unlock(&hd->mapping_lock); } --- a/xen/drivers/passthrough/iommu.c +++ b/xen/drivers/passthrough/iommu.c @@ -58,6 +58,10 @@ bool_t __read_mostly amd_iommu_perdev_in DEFINE_PER_CPU(bool_t, iommu_dont_flush_iotlb); +DEFINE_SPINLOCK(iommu_pt_cleanup_lock); +PAGE_LIST_HEAD(iommu_pt_cleanup_list); +struct tasklet iommu_pt_cleanup_tasklet; + static struct keyhandler iommu_p2m_table = { .diagnostic = 0, .u.fn = iommu_dump_p2m_table, --- a/xen/drivers/passthrough/vtd/iommu.c +++ b/xen/drivers/passthrough/vtd/iommu.c @@ -668,13 +668,24 @@ static void dma_pte_clear_one(struct dom static void iommu_free_pagetable(u64 pt_maddr, int level) { - int i; - struct dma_pte *pt_vaddr, *pte; - int next_level = level - 1; + struct page_info *pg = maddr_to_page(pt_maddr); if ( pt_maddr == 0 ) return; + spin_lock(&iommu_pt_cleanup_lock); + PFN_ORDER(pg) = level; + page_list_add_tail(pg, &iommu_pt_cleanup_list); + spin_unlock(&iommu_pt_cleanup_lock); +} + +static void iommu_free_page_table(struct page_info *pg) +{ + unsigned int i, next_level = PFN_ORDER(pg) - 1; + u64 pt_maddr = page_to_maddr(pg); + struct dma_pte *pt_vaddr, *pte; + + PFN_ORDER(pg) = 0; pt_vaddr = (struct dma_pte *)map_vtd_domain_page(pt_maddr); for ( i = 0; i < PTE_NUM; i++ ) @@ -694,6 +705,23 @@ static void 
iommu_free_pagetable(u64 pt_ free_pgtable_maddr(pt_maddr); } +static void iommu_free_pagetables(unsigned long unused) +{ + do { + struct page_info *pg; + + spin_lock(&iommu_pt_cleanup_lock); + pg = page_list_remove_head(&iommu_pt_cleanup_list); + spin_unlock(&iommu_pt_cleanup_lock); + if ( !pg ) + return; + iommu_free_page_table(pg); + } while ( !softirq_pending(smp_processor_id()) ); + + tasklet_schedule_on_cpu(&iommu_pt_cleanup_tasklet, + cpumask_cycle(smp_processor_id(), &cpu_online_map)); +} + static int iommu_set_root_entry(struct iommu *iommu) { u32 sts; @@ -1704,6 +1732,8 @@ void iommu_domain_teardown(struct domain iommu_free_pagetable(hd->pgd_maddr, agaw_to_level(hd->agaw)); hd->pgd_maddr = 0; spin_unlock(&hd->mapping_lock); + + tasklet_schedule(&iommu_pt_cleanup_tasklet); } static int intel_iommu_map_page( @@ -2204,6 +2234,7 @@ int __init intel_vtd_setup(void) if ( ret ) goto error; + tasklet_init(&iommu_pt_cleanup_tasklet, iommu_free_pagetables, 0); register_keyhandler(''V'', &dump_iommu_info_keyhandler); return 0; --- a/xen/include/xen/iommu.h +++ b/xen/include/xen/iommu.h @@ -151,4 +151,8 @@ int adjust_vtd_irq_affinities(void); */ DECLARE_PER_CPU(bool_t, iommu_dont_flush_iotlb); +extern struct spinlock iommu_pt_cleanup_lock; +extern struct page_list_head iommu_pt_cleanup_list; +extern struct tasklet iommu_pt_cleanup_tasklet; + #endif /* _IOMMU_H_ */ _______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
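Stripped of the AMD and VT-d specifics, the deferral pattern both hunks implement is the same and can be sketched as follows (only primitives already visible in the patch are used; free_one() is a stand-in for the vendor-specific table walker):

/* Producer side: instead of freeing in place, queue the table root. */
    PFN_ORDER(pg) = level;                      /* stash the level in the page */
    spin_lock(&iommu_pt_cleanup_lock);
    page_list_add_tail(pg, &iommu_pt_cleanup_list);
    spin_unlock(&iommu_pt_cleanup_lock);
    tasklet_schedule(&iommu_pt_cleanup_tasklet);

/* Consumer side (tasklet): free tables until other softirq work shows up,
 * then re-schedule on the next online CPU so no single CPU is hogged. */
static void cleanup_sketch(unsigned long unused)    /* illustrative name */
{
    do {
        struct page_info *pg;

        spin_lock(&iommu_pt_cleanup_lock);
        pg = page_list_remove_head(&iommu_pt_cleanup_list);
        spin_unlock(&iommu_pt_cleanup_lock);
        if ( !pg )
            return;
        free_one(pg);                           /* vendor-specific stand-in */
    } while ( !softirq_pending(smp_processor_id()) );

    tasklet_schedule_on_cpu(&iommu_pt_cleanup_tasklet,
                            cpumask_cycle(smp_processor_id(), &cpu_online_map));
}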
Jan Beulich
2013-Dec-10 15:47 UTC
[PATCH 3/5] x86/p2m: restrict auditing to debug builds
... since iterating through all of a guest's pages may take unduly long. Signed-off-by: Jan Beulich <jbeulich@suse.com> Acked-by: Ian Campbell <ian.campbell@citrix.com> --- a/xen/include/asm-x86/p2m.h +++ b/xen/include/asm-x86/p2m.h @@ -600,7 +600,11 @@ int set_p2m_entry(struct p2m_domain *p2m extern void p2m_pt_init(struct p2m_domain *p2m); /* Debugging and auditing of the P2M code? */ +#ifndef NDEBUG #define P2M_AUDIT 1 +#else +#define P2M_AUDIT 0 +#endif #define P2M_DEBUGGING 0 #if P2M_AUDIT _______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
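The practical effect of the hunk is that release builds, which define NDEBUG, get P2M_AUDIT == 0, so audit invocations guarded along the following (schematic) lines vanish at compile time while debug builds keep them:

#if P2M_AUDIT
    audit_p2m(...);   /* expensive: iterates over every page of the guest;
                       * now compiled in for debug (!NDEBUG) builds only   */
#endif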
Jan Beulich
2013-Dec-10 15:48 UTC
[PATCH 4/5] HVM: prevent leaking heap data from hvm_save_one()
When one or more of the vCPU-s of a guest are offline, no data may be put into the allocated space for them and, due to another bug, such uninitialized data may be passed back to the caller. Signed-off-by: Don Slutz <dslutz@verizon.com> Acked-by: Ian Campbell <ian.campbell@citrix.com> Reviewed-by: Jan Beulich <jbeulich@suse.com> --- a/xen/common/hvm/save.c +++ b/xen/common/hvm/save.c @@ -102,7 +102,7 @@ int hvm_save_one(struct domain *d, uint1 return -EINVAL; ctxt.size = sz; - ctxt.data = xmalloc_bytes(sz); + ctxt.data = xzalloc_bytes(sz); if ( !ctxt.data ) return -ENOMEM;
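A minimal userspace analogue of the leak being closed, with calloc() playing the role of xzalloc_bytes() and with vcpu_online() plus the fill pattern made up purely for illustration: when only some per-vCPU slots of the buffer get written, the untouched slots reach the caller containing whatever the allocator left behind, unless the buffer was zeroed up front.

#include <stdlib.h>
#include <string.h>

static int vcpu_online(size_t i) { return !(i & 1); }  /* stand-in predicate */

char *save_all(size_t nr_vcpus, size_t per_vcpu)
{
    char *buf = calloc(nr_vcpus, per_vcpu);  /* was malloc(): stale heap data */

    if ( !buf )
        return NULL;
    for ( size_t i = 0; i < nr_vcpus; i++ )
        if ( vcpu_online(i) )
            memset(buf + i * per_vcpu, 0x5a, per_vcpu);  /* "save" this vCPU */
    return buf;  /* slots of offline vCPUs are all zeros, not heap leftovers */
}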
Jan Beulich
2013-Dec-10 15:48 UTC
[PATCH 5/5] x86/PV: don't commit debug register values early in arch_set_info_guest()
They're being taken care of later (via set_debugreg()), and temporarily copying them into struct vcpu means that bad values may end up getting loaded during context switch if the vCPU is already running and the function errors out between the premature and real commit step, leading to the same issue that XSA-12 dealt with. Signed-off-by: Jan Beulich <jbeulich@suse.com> Acked-by: Ian Campbell <ian.campbell@citrix.com> --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -740,11 +740,12 @@ int arch_set_info_guest( XLAT_trap_info(v->arch.pv_vcpu.trap_ctxt + i, c.cmp->trap_ctxt + i); } - for ( i = 0; i < ARRAY_SIZE(v->arch.debugreg); ++i ) - v->arch.debugreg[i] = c(debugreg[i]); if ( has_hvm_container_vcpu(v) ) { + for ( i = 0; i < ARRAY_SIZE(v->arch.debugreg); ++i ) + v->arch.debugreg[i] = c(debugreg[i]); + /* * NB: TF_kernel_mode is set unconditionally for HVM guests, * so we always use the gs_base_kernel here. If we change this _______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
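The hazard described in the commit message, in schematic form (the error-path condition is hypothetical; set_debugreg() is the checked path the message refers to):

/*
 * Old placement, schematically:
 *
 *     v->arch.debugreg[i] = c(debugreg[i]);   // committed immediately
 *     ...
 *     return -EINVAL;                         // later validation fails,
 *                                             // but the bad values stay live
 *
 * and an already-running vCPU would then load them on its next context
 * switch. For PV guests the values are instead committed only through the
 * checked path later in arch_set_info_guest(), roughly:
 *
 *     for ( i = 0; i < 8; i++ )
 *         (void)set_debugreg(v, i, c(debugreg[i]));
 */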
Andrew Cooper
2013-Dec-10 15:58 UTC
Re: [PATCH 3/5] x86/p2m: restrict auditing to debug builds
On 10/12/13 15:47, Jan Beulich wrote:> ... since iterating through all of a guest''s pages may take unduly > long. > > Signed-off-by: Jan Beulich <jbeulich@suse.com> > Acked-by: Ian Campbell <ian.campbell@citrix.com>Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>> > --- a/xen/include/asm-x86/p2m.h > +++ b/xen/include/asm-x86/p2m.h > @@ -600,7 +600,11 @@ int set_p2m_entry(struct p2m_domain *p2m > extern void p2m_pt_init(struct p2m_domain *p2m); > > /* Debugging and auditing of the P2M code? */ > +#ifndef NDEBUG > #define P2M_AUDIT 1 > +#else > +#define P2M_AUDIT 0 > +#endif > #define P2M_DEBUGGING 0 > > #if P2M_AUDIT > > > > > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xen.org > http://lists.xen.org/xen-devel_______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
Andrew Cooper
2013-Dec-10 16:01 UTC
Re: [PATCH 4/5] HVM: prevent leaking heap data from hvm_save_one()
On 10/12/13 15:48, Jan Beulich wrote:> When one or more of the vCPU-s of a guest are offline, no data may be > put into the allocated space for them and, due to another bug, such > uninitialized data may be passed back to the caller. > > Signed-off-by: Don Slutz <dslutz@verizon.com> > Acked-by: Ian Campbell <ian.campbell@citrix.com> > Reviewed-by: Jan Beulich <jbeulich@suse.com>Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>> > --- a/xen/common/hvm/save.c > +++ b/xen/common/hvm/save.c > @@ -102,7 +102,7 @@ int hvm_save_one(struct domain *d, uint1 > return -EINVAL; > > ctxt.size = sz; > - ctxt.data = xmalloc_bytes(sz); > + ctxt.data = xzalloc_bytes(sz); > if ( !ctxt.data ) > return -ENOMEM; > > > > > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xen.org > http://lists.xen.org/xen-devel
Andrew Cooper
2013-Dec-10 17:23 UTC
Re: [PATCH 5/5] x86/PV: don't commit debug register values early in arch_set_info_guest()
On 10/12/13 15:48, Jan Beulich wrote:> They''re being taken care of later (via set_debugreg()), and temporarily > copying them into struct vcpu means that bad values may end up getting > loaded during context switch if the vCPU is already running and the > function errors out between the premature and real commit step, leading > to the same issue that XSA-12 dealt with. > > Signed-off-by: Jan Beulich <jbeulich@suse.com> > Acked-by: Ian Campbell <ian.campbell@citrix.com>Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>> > --- a/xen/arch/x86/domain.c > +++ b/xen/arch/x86/domain.c > @@ -740,11 +740,12 @@ int arch_set_info_guest( > XLAT_trap_info(v->arch.pv_vcpu.trap_ctxt + i, > c.cmp->trap_ctxt + i); > } > - for ( i = 0; i < ARRAY_SIZE(v->arch.debugreg); ++i ) > - v->arch.debugreg[i] = c(debugreg[i]); > > if ( has_hvm_container_vcpu(v) ) > { > + for ( i = 0; i < ARRAY_SIZE(v->arch.debugreg); ++i ) > + v->arch.debugreg[i] = c(debugreg[i]); > + > /* > * NB: TF_kernel_mode is set unconditionally for HVM guests, > * so we always use the gs_base_kernel here. If we change this > > > > > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xen.org > http://lists.xen.org/xen-devel_______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
George Dunlap
2013-Dec-10 17:31 UTC
Re: [PATCH 3/5] x86/p2m: restrict auditing to debug builds
On 12/10/2013 03:47 PM, Jan Beulich wrote:> ... since iterating through all of a guest''s pages may take unduly > long. > > Signed-off-by: Jan Beulich <jbeulich@suse.com> > Acked-by: Ian Campbell <ian.campbell@citrix.com>Release-acked-by: George Dunlap <george.dunlap@eu.citrix.com>
George Dunlap
2013-Dec-10 17:32 UTC
Re: [PATCH 4/5] HVM: prevent leaking heap data from hvm_save_one()
On 12/10/2013 03:48 PM, Jan Beulich wrote:> When one or more of the vCPU-s of a guest are offline, no data may be > put into the allocated space for them and, due to another bug, such > uninitialized data may be passed back to the caller. > > Signed-off-by: Don Slutz <dslutz@verizon.com> > Acked-by: Ian Campbell <ian.campbell@citrix.com> > Reviewed-by: Jan Beulich <jbeulich@suse.com>Release-acked-by: George Dunlap <george.dunlap@eu.citrix.com>> > --- a/xen/common/hvm/save.c > +++ b/xen/common/hvm/save.c > @@ -102,7 +102,7 @@ int hvm_save_one(struct domain *d, uint1 > return -EINVAL; > > ctxt.size = sz; > - ctxt.data = xmalloc_bytes(sz); > + ctxt.data = xzalloc_bytes(sz); > if ( !ctxt.data ) > return -ENOMEM; > > > >
George Dunlap
2013-Dec-10 17:33 UTC
Re: [PATCH 5/5] x86/PV: don't commit debug register values early in arch_set_info_guest()
On 12/10/2013 03:48 PM, Jan Beulich wrote:> They''re being taken care of later (via set_debugreg()), and temporarily > copying them into struct vcpu means that bad values may end up getting > loaded during context switch if the vCPU is already running and the > function errors out between the premature and real commit step, leading > to the same issue that XSA-12 dealt with. > > Signed-off-by: Jan Beulich <jbeulich@suse.com> > Acked-by: Ian Campbell <ian.campbell@citrix.com>Release-acked-by: George Dunlap <george.dunlap@eu.citrix.com>> > --- a/xen/arch/x86/domain.c > +++ b/xen/arch/x86/domain.c > @@ -740,11 +740,12 @@ int arch_set_info_guest( > XLAT_trap_info(v->arch.pv_vcpu.trap_ctxt + i, > c.cmp->trap_ctxt + i); > } > - for ( i = 0; i < ARRAY_SIZE(v->arch.debugreg); ++i ) > - v->arch.debugreg[i] = c(debugreg[i]); > > if ( has_hvm_container_vcpu(v) ) > { > + for ( i = 0; i < ARRAY_SIZE(v->arch.debugreg); ++i ) > + v->arch.debugreg[i] = c(debugreg[i]); > + > /* > * NB: TF_kernel_mode is set unconditionally for HVM guests, > * so we always use the gs_base_kernel here. If we change this > > >
Keir Fraser
2013-Dec-10 18:17 UTC
Re: [PATCH 5/5] x86/PV: don't commit debug register values early in arch_set_info_guest()
On 10/12/2013 17:33, "George Dunlap" <george.dunlap@eu.citrix.com> wrote:> On 12/10/2013 03:48 PM, Jan Beulich wrote: >> They''re being taken care of later (via set_debugreg()), and temporarily >> copying them into struct vcpu means that bad values may end up getting >> loaded during context switch if the vCPU is already running and the >> function errors out between the premature and real commit step, leading >> to the same issue that XSA-12 dealt with. >> >> Signed-off-by: Jan Beulich <jbeulich@suse.com> >> Acked-by: Ian Campbell <ian.campbell@citrix.com> > > Release-acked-by: George Dunlap <george.dunlap@eu.citrix.com>Acked-by: Keir Fraser <keir@xen.org>>> >> --- a/xen/arch/x86/domain.c >> +++ b/xen/arch/x86/domain.c >> @@ -740,11 +740,12 @@ int arch_set_info_guest( >> XLAT_trap_info(v->arch.pv_vcpu.trap_ctxt + i, >> c.cmp->trap_ctxt + i); >> } >> - for ( i = 0; i < ARRAY_SIZE(v->arch.debugreg); ++i ) >> - v->arch.debugreg[i] = c(debugreg[i]); >> >> if ( has_hvm_container_vcpu(v) ) >> { >> + for ( i = 0; i < ARRAY_SIZE(v->arch.debugreg); ++i ) >> + v->arch.debugreg[i] = c(debugreg[i]); >> + >> /* >> * NB: TF_kernel_mode is set unconditionally for HVM guests, >> * so we always use the gs_base_kernel here. If we change this >> >> >> >
Andrew Cooper
2013-Dec-11 18:40 UTC
Re: [PATCH 1/5] IOMMU: make page table population preemptible
On 10/12/2013 15:45, Jan Beulich wrote:> Since this can take an arbitrary amount of time, the rooting domctl as > well as all involved code must become aware of this requiring a > continuation. > > The subject domain''s rel_mem_list is being (ab)used for this, in a way > similar to and compatible with broken page offlining. > > Further, operations get slightly re-ordered in assign_device(): IOMMU > page tables now get set up _before_ the first device gets assigned, at > once closing a small timing window in which the guest may already see > the device but wouldn''t be able to access it. > > Signed-off-by: Jan Beulich <jbeulich@suse.com> > > --- a/xen/arch/x86/domain.c > +++ b/xen/arch/x86/domain.c > @@ -1923,6 +1923,12 @@ int domain_relinquish_resources(struct d > } > > d->arch.relmem = RELMEM_xen; > + > + spin_lock(&d->page_alloc_lock); > + page_list_splice(&d->arch.relmem_list, &d->page_list); > + INIT_PAGE_LIST_HEAD(&d->arch.relmem_list); > + spin_unlock(&d->page_alloc_lock); > + > /* Fallthrough. Relinquish every page of memory. */ > case RELMEM_xen: > ret = relinquish_memory(d, &d->xenpage_list, ~0UL); > --- a/xen/arch/x86/mm/p2m-pod.c > +++ b/xen/arch/x86/mm/p2m-pod.c > @@ -459,7 +459,8 @@ p2m_pod_offline_or_broken_hit(struct pag > > pod_hit: > lock_page_alloc(p2m); > - page_list_add_tail(p, &d->arch.relmem_list); > + /* Insertion must be at list head (see iommu_populate_page_table()). */ > + page_list_add(p, &d->arch.relmem_list); > unlock_page_alloc(p2m); > pod_unlock(p2m); > return 1; > --- a/xen/drivers/passthrough/iommu.c > +++ b/xen/drivers/passthrough/iommu.c > @@ -18,6 +18,7 @@ > #include <asm/hvm/iommu.h> > #include <xen/paging.h> > #include <xen/guest_access.h> > +#include <xen/event.h> > #include <xen/softirq.h> > #include <xen/keyhandler.h> > #include <xsm/xsm.h> > @@ -265,7 +266,23 @@ static int assign_device(struct domain * > d->mem_event->paging.ring_page)) ) > return -EXDEV; > > - spin_lock(&pcidevs_lock); > + if ( !spin_trylock(&pcidevs_lock) ) > + return -ERESTART; > + > + if ( need_iommu(d) <= 0 ) > + { > + if ( !iommu_use_hap_pt(d) ) > + { > + rc = iommu_populate_page_table(d); > + if ( rc ) > + { > + spin_unlock(&pcidevs_lock); > + return rc; > + } > + } > + d->need_iommu = 1; > + } > + > pdev = pci_get_pdev_by_domain(dom0, seg, bus, devfn); > if ( !pdev ) > { > @@ -290,15 +307,14 @@ static int assign_device(struct domain * > rc); > } > > - if ( has_arch_pdevs(d) && !need_iommu(d) ) > + done: > + if ( !has_arch_pdevs(d) && need_iommu(d) ) > { > - d->need_iommu = 1; > - if ( !iommu_use_hap_pt(d) ) > - rc = iommu_populate_page_table(d); > - goto done; > + d->need_iommu = 0; > + hd->platform_ops->teardown(d); > } > -done: > spin_unlock(&pcidevs_lock); > + > return rc; > } > > @@ -306,12 +322,17 @@ static int iommu_populate_page_table(str > { > struct hvm_iommu *hd = domain_hvm_iommu(d); > struct page_info *page; > - int rc = 0; > + int rc = 0, n = 0; > + > + d->need_iommu = -1; > > this_cpu(iommu_dont_flush_iotlb) = 1; > spin_lock(&d->page_alloc_lock); > > - page_list_for_each ( page, &d->page_list ) > + if ( unlikely(d->is_dying) ) > + rc = -ESRCH; > + > + while ( !rc && (page = page_list_remove_head(&d->page_list)) ) > { > if ( is_hvm_domain(d) || > (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page ) > @@ -321,7 +342,32 @@ static int iommu_populate_page_table(str > d, mfn_to_gmfn(d, page_to_mfn(page)), page_to_mfn(page), > IOMMUF_readable|IOMMUF_writable); > if ( rc ) > + { > + page_list_add(page, &d->page_list); > break; > + } > + } > + 
page_list_add_tail(page, &d->arch.relmem_list); > + if ( !(++n & 0xff) && !page_list_empty(&d->page_list) &&Why the forced restart here? If nothing needs pre-empting, surely it is better to continue? Or is this about equality on the pcidevs_lock ?> + hypercall_preempt_check() ) > + rc = -ERESTART; > + } > + > + if ( !rc ) > + { > + /* > + * The expectation here is that generally there are many normal pages > + * on relmem_list (the ones we put there) and only few being in an > + * offline/broken state. The latter ones are always at the head of the > + * list. Hence we first move the whole list, and then move back the > + * first few entries. > + */ > + page_list_move(&d->page_list, &d->arch.relmem_list); > + while ( (page = page_list_first(&d->page_list)) != NULL && > + (page->count_info & (PGC_state|PGC_broken)) ) > + { > + page_list_del(page, &d->page_list); > + page_list_add_tail(page, &d->arch.relmem_list); > } > } > > @@ -330,8 +376,11 @@ static int iommu_populate_page_table(str > > if ( !rc ) > iommu_iotlb_flush_all(d); > - else > + else if ( rc != -ERESTART ) > + { > hd->platform_ops->teardown(d); > + d->need_iommu = 0; > + } > > return rc; > } > @@ -688,7 +737,10 @@ int iommu_do_domctl( > > ret = device_assigned(seg, bus, devfn) ?: > assign_device(d, seg, bus, devfn); > - if ( ret ) > + if ( ret == -ERESTART ) > + ret = hypercall_create_continuation(__HYPERVISOR_domctl, > + "h", u_domctl); > + else if ( ret ) > printk(XENLOG_G_ERR "XEN_DOMCTL_assign_device: " > "assign %04x:%02x:%02x.%u to dom%d failed (%d)\n", > seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), > --- a/xen/include/xen/sched.h > +++ b/xen/include/xen/sched.h > @@ -323,7 +323,7 @@ struct domain > > #ifdef HAS_PASSTHROUGH > /* Does this guest need iommu mappings? */ > - bool_t need_iommu; > + s8 need_iommu;I think this change from bool_t to s8 needs a comment explaining that -1 indicates "the iommu mappings are pending creation" Is there any particular reason that -ERESTART is used when -EAGAIN is the prevailing style for hypercall continuations? ~Andrew
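For context on the bool_t to s8 question: with the patch the field encodes three states rather than two (wording paraphrased from the patch logic and from the comment later added in v2):

/*
 *  d->need_iommu ==  1 : IOMMU mappings are set up and required
 *  d->need_iommu ==  0 : no IOMMU mappings needed
 *  d->need_iommu == -1 : mappings are being set up (population in progress,
 *                        possibly across a continuation), which is why
 *                        assign_device() tests need_iommu(d) <= 0 before
 *                        populating
 */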
Andrew Cooper
2013-Dec-11 19:01 UTC
Re: [PATCH 2/5] IOMMU: make page table deallocation preemptible
On 10/12/2013 15:46, Jan Beulich wrote:> This too can take an arbitrary amount of time. > > In fact, the bulk of the work is being moved to a tasklet, as handling > the necessary preemption logic in line seems close to impossible given > that the teardown may also be invoked on error paths. > > Signed-off-by: Jan Beulich <jbeulich@suse.com>There is a lot of duplicated code here. I would suggest making most of this infrastructure common, and having a new iommu_op for "void free_io_page_table(struct page_info*);" ~Andrew> > --- a/xen/drivers/passthrough/amd/pci_amd_iommu.c > +++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c > @@ -195,6 +195,8 @@ static int __init amd_iommu_setup_dom0_d > return 0; > } > > +static void deallocate_page_tables(unsigned long unused); > + > int __init amd_iov_detect(void) > { > INIT_LIST_HEAD(&amd_iommu_head); > @@ -215,6 +217,7 @@ int __init amd_iov_detect(void) > return -ENODEV; > } > > + tasklet_init(&iommu_pt_cleanup_tasklet, deallocate_page_tables, 0); > init_done = 1; > > if ( !amd_iommu_perdev_intremap ) > @@ -405,11 +408,21 @@ static int amd_iommu_assign_device(struc > return reassign_device(dom0, d, devfn, pdev); > } > > -static void deallocate_next_page_table(struct page_info* pg, int level) > +static void deallocate_next_page_table(struct page_info *pg, int level) > +{ > + spin_lock(&iommu_pt_cleanup_lock); > + PFN_ORDER(pg) = level; > + page_list_add_tail(pg, &iommu_pt_cleanup_list); > + spin_unlock(&iommu_pt_cleanup_lock); > +} > + > +static void deallocate_page_table(struct page_info *pg) > { > void *table_vaddr, *pde; > u64 next_table_maddr; > - int index, next_level; > + unsigned int index, level = PFN_ORDER(pg), next_level; > + > + PFN_ORDER(pg) = 0; > > if ( level <= 1 ) > { > @@ -439,6 +452,23 @@ static void deallocate_next_page_table(s > free_amd_iommu_pgtable(pg); > } > > +static void deallocate_page_tables(unsigned long unused) > +{ > + do { > + struct page_info *pg; > + > + spin_lock(&iommu_pt_cleanup_lock); > + pg = page_list_remove_head(&iommu_pt_cleanup_list); > + spin_unlock(&iommu_pt_cleanup_lock); > + if ( !pg ) > + return; > + deallocate_page_table(pg); > + } while ( !softirq_pending(smp_processor_id()) ); > + > + tasklet_schedule_on_cpu(&iommu_pt_cleanup_tasklet, > + cpumask_cycle(smp_processor_id(), &cpu_online_map)); > +} > + > static void deallocate_iommu_page_tables(struct domain *d) > { > struct hvm_iommu *hd = domain_hvm_iommu(d); > @@ -451,6 +481,7 @@ static void deallocate_iommu_page_tables > { > deallocate_next_page_table(hd->root_table, hd->paging_mode); > hd->root_table = NULL; > + tasklet_schedule(&iommu_pt_cleanup_tasklet); > } > spin_unlock(&hd->mapping_lock); > } > --- a/xen/drivers/passthrough/iommu.c > +++ b/xen/drivers/passthrough/iommu.c > @@ -58,6 +58,10 @@ bool_t __read_mostly amd_iommu_perdev_in > > DEFINE_PER_CPU(bool_t, iommu_dont_flush_iotlb); > > +DEFINE_SPINLOCK(iommu_pt_cleanup_lock); > +PAGE_LIST_HEAD(iommu_pt_cleanup_list); > +struct tasklet iommu_pt_cleanup_tasklet; > + > static struct keyhandler iommu_p2m_table = { > .diagnostic = 0, > .u.fn = iommu_dump_p2m_table, > --- a/xen/drivers/passthrough/vtd/iommu.c > +++ b/xen/drivers/passthrough/vtd/iommu.c > @@ -668,13 +668,24 @@ static void dma_pte_clear_one(struct dom > > static void iommu_free_pagetable(u64 pt_maddr, int level) > { > - int i; > - struct dma_pte *pt_vaddr, *pte; > - int next_level = level - 1; > + struct page_info *pg = maddr_to_page(pt_maddr); > > if ( pt_maddr == 0 ) > return; > > + spin_lock(&iommu_pt_cleanup_lock); > + PFN_ORDER(pg) 
= level; > + page_list_add_tail(pg, &iommu_pt_cleanup_list); > + spin_unlock(&iommu_pt_cleanup_lock); > +} > + > +static void iommu_free_page_table(struct page_info *pg) > +{ > + unsigned int i, next_level = PFN_ORDER(pg) - 1; > + u64 pt_maddr = page_to_maddr(pg); > + struct dma_pte *pt_vaddr, *pte; > + > + PFN_ORDER(pg) = 0; > pt_vaddr = (struct dma_pte *)map_vtd_domain_page(pt_maddr); > > for ( i = 0; i < PTE_NUM; i++ ) > @@ -694,6 +705,23 @@ static void iommu_free_pagetable(u64 pt_ > free_pgtable_maddr(pt_maddr); > } > > +static void iommu_free_pagetables(unsigned long unused) > +{ > + do { > + struct page_info *pg; > + > + spin_lock(&iommu_pt_cleanup_lock); > + pg = page_list_remove_head(&iommu_pt_cleanup_list); > + spin_unlock(&iommu_pt_cleanup_lock); > + if ( !pg ) > + return; > + iommu_free_page_table(pg); > + } while ( !softirq_pending(smp_processor_id()) ); > + > + tasklet_schedule_on_cpu(&iommu_pt_cleanup_tasklet, > + cpumask_cycle(smp_processor_id(), &cpu_online_map)); > +} > + > static int iommu_set_root_entry(struct iommu *iommu) > { > u32 sts; > @@ -1704,6 +1732,8 @@ void iommu_domain_teardown(struct domain > iommu_free_pagetable(hd->pgd_maddr, agaw_to_level(hd->agaw)); > hd->pgd_maddr = 0; > spin_unlock(&hd->mapping_lock); > + > + tasklet_schedule(&iommu_pt_cleanup_tasklet); > } > > static int intel_iommu_map_page( > @@ -2204,6 +2234,7 @@ int __init intel_vtd_setup(void) > if ( ret ) > goto error; > > + tasklet_init(&iommu_pt_cleanup_tasklet, iommu_free_pagetables, 0); > register_keyhandler(''V'', &dump_iommu_info_keyhandler); > > return 0; > --- a/xen/include/xen/iommu.h > +++ b/xen/include/xen/iommu.h > @@ -151,4 +151,8 @@ int adjust_vtd_irq_affinities(void); > */ > DECLARE_PER_CPU(bool_t, iommu_dont_flush_iotlb); > > +extern struct spinlock iommu_pt_cleanup_lock; > +extern struct page_list_head iommu_pt_cleanup_list; > +extern struct tasklet iommu_pt_cleanup_tasklet; > + > #endif /* _IOMMU_H_ */ > > > > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xen.org > http://lists.xen.org/xen-devel_______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
Jan Beulich
2013-Dec-13 09:51 UTC
Re: [PATCH 1/5] IOMMU: make page table population preemptible
>>> On 11.12.13 at 19:40, Andrew Cooper <andrew.cooper3@citrix.com> wrote: > On 10/12/2013 15:45, Jan Beulich wrote: >> @@ -321,7 +342,32 @@ static int iommu_populate_page_table(str >> d, mfn_to_gmfn(d, page_to_mfn(page)), page_to_mfn(page), >> IOMMUF_readable|IOMMUF_writable); >> if ( rc ) >> + { >> + page_list_add(page, &d->page_list); >> break; >> + } >> + } >> + page_list_add_tail(page, &d->arch.relmem_list); >> + if ( !(++n & 0xff) && !page_list_empty(&d->page_list) && > > Why the forced restart here? If nothing needs pre-empting, surely it is > better to continue? > > Or is this about equality on the pcidevs_lock ? > >> + hypercall_preempt_check() )Did you overlook this part of the condition?>> --- a/xen/include/xen/sched.h >> +++ b/xen/include/xen/sched.h >> @@ -323,7 +323,7 @@ struct domain >> >> #ifdef HAS_PASSTHROUGH >> /* Does this guest need iommu mappings? */ >> - bool_t need_iommu; >> + s8 need_iommu; > > I think this change from bool_t to s8 needs a comment explaining that -1 > indicates "the iommu mappings are pending creation"Will do.> Is there any particular reason that -ERESTART is used when -EAGAIN is > the prevailing style for hypercall continuations?I meanwhile realized that using -EAGAIN was a mistake (iirc taken from certain domctl-s having passed this back up to the caller to request re-invocation a long time ago) - -EAGAIN really has a different meaning, and hence we ought to switch all its current mis-uses to -ERESTART. Jan
Jan Beulich
2013-Dec-13 09:55 UTC
Re: [PATCH 2/5] IOMMU: make page table deallocation preemptible
>>> On 11.12.13 at 20:01, Andrew Cooper <andrew.cooper3@citrix.com> wrote: > On 10/12/2013 15:46, Jan Beulich wrote: >> This too can take an arbitrary amount of time. >> >> In fact, the bulk of the work is being moved to a tasklet, as handling >> the necessary preemption logic in line seems close to impossible given >> that the teardown may also be invoked on error paths. >> >> Signed-off-by: Jan Beulich <jbeulich@suse.com> > > There is a lot of duplicated code here. I would suggest making most of > this infrastructure common, and having a new iommu_op for "void > free_io_page_table(struct page_info*);"I guess that could be done - let me see how this works out. Jan
Andrew Cooper
2013-Dec-13 12:18 UTC
Re: [PATCH 1/5] IOMMU: make page table population preemptible
On 13/12/2013 09:51, Jan Beulich wrote:>>>> On 11.12.13 at 19:40, Andrew Cooper <andrew.cooper3@citrix.com> wrote: >> On 10/12/2013 15:45, Jan Beulich wrote: >>> @@ -321,7 +342,32 @@ static int iommu_populate_page_table(str >>> d, mfn_to_gmfn(d, page_to_mfn(page)), page_to_mfn(page), >>> IOMMUF_readable|IOMMUF_writable); >>> if ( rc ) >>> + { >>> + page_list_add(page, &d->page_list); >>> break; >>> + } >>> + } >>> + page_list_add_tail(page, &d->arch.relmem_list); >>> + if ( !(++n & 0xff) && !page_list_empty(&d->page_list) && >> Why the forced restart here? If nothing needs pre-empting, surely it is >> better to continue? >> >> Or is this about equality on the pcidevs_lock ? >> >>> + hypercall_preempt_check() ) > Did you overlook this part of the condition?No, but I did mentally get the logic inverted when trying to work out what was going on. How about (++n > 0xff) ? If we have already spent a while in this loop, and the hypercall_preempt_check() doesn''t flip to 1 until a few iterations after n is congruent with 0x100, waiting for another 0x100 iterations before checking again seems a little long.> >>> --- a/xen/include/xen/sched.h >>> +++ b/xen/include/xen/sched.h >>> @@ -323,7 +323,7 @@ struct domain >>> >>> #ifdef HAS_PASSTHROUGH >>> /* Does this guest need iommu mappings? */ >>> - bool_t need_iommu; >>> + s8 need_iommu; >> I think this change from bool_t to s8 needs a comment explaining that -1 >> indicates "the iommu mappings are pending creation" > Will do. > >> Is there any particular reason that -ERESTART is used when -EAGAIN is >> the prevailing style for hypercall continuations? > I meanwhile realized that using -EAGAIN was a mistake (iirc taken > from certain domctl-s having passed this back up to the caller to > request re-invocation a long time ago) - -EAGAIN really has a > different meaning, and hence we ought to switch all its current > mis-uses to -ERESTART. > > Jan >Ok. ~Andrew
Jan Beulich
2013-Dec-13 12:34 UTC
Re: [PATCH 1/5] IOMMU: make page table population preemptible
>>> On 13.12.13 at 13:18, Andrew Cooper <andrew.cooper3@citrix.com> wrote: > On 13/12/2013 09:51, Jan Beulich wrote: >>>>> On 11.12.13 at 19:40, Andrew Cooper <andrew.cooper3@citrix.com> wrote: >>> On 10/12/2013 15:45, Jan Beulich wrote: >>>> @@ -321,7 +342,32 @@ static int iommu_populate_page_table(str >>>> d, mfn_to_gmfn(d, page_to_mfn(page)), page_to_mfn(page), >>>> IOMMUF_readable|IOMMUF_writable); >>>> if ( rc ) >>>> + { >>>> + page_list_add(page, &d->page_list); >>>> break; >>>> + } >>>> + } >>>> + page_list_add_tail(page, &d->arch.relmem_list); >>>> + if ( !(++n & 0xff) && !page_list_empty(&d->page_list) && >>> Why the forced restart here? If nothing needs pre-empting, surely it is >>> better to continue? >>> >>> Or is this about equality on the pcidevs_lock ? >>> >>>> + hypercall_preempt_check() ) >> Did you overlook this part of the condition? > > No, but I did mentally get the logic inverted when trying to work out > what was going on. > > How about (++n > 0xff) ? > > If we have already spent a while in this loop, and the > hypercall_preempt_check() doesn''t flip to 1 until a few iterations after > n is congruent with 0x100, waiting for another 0x100 iterations before > checking again seems a little long.The thing I''m trying to avoid is calling hypercall_preempt_check() overly often - especially when the prior if()''s body doesn''t get entered we might otherwise do very little useful for per check. While 256 isn''t overly much (covering merely 1Mb of guest space), I could see two possibilities to get this a little more towards what you want: Either we right shift the mask each time we actually call hypercall_preempt_check(), or we bias the increment (adding e.g. 10 when going the expensive route, else 1). Jan
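As a concrete illustration of the second option mentioned above (hypothetical, not code that was posted), the loop could weight the counter by the cost of each iteration, so that the preemption check comes up sooner when real mapping work is being done; page_needs_mapping() stands in for the PGT_writable_page type check and the mapping itself is elided:

    while ( (page = page_list_remove_head(&d->page_list)) != NULL )
    {
        if ( page_needs_mapping(page) )  /* stand-in for the type check */
            n += 10;                     /* expensive: a ->map_page() call */
        else
            n += 1;                      /* cheap: page merely moved */

        /* ... map the page and move it to relmem_list as in the patch ... */

        if ( n >= 0x100 )
        {
            n = 0;
            if ( hypercall_preempt_check() )
            {
                rc = -ERESTART;
                break;
            }
        }
    }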
Tim Deegan
2013-Dec-13 13:46 UTC
Re: [PATCH 3/5] x86/p2m: restrict auditing to debug builds
At 15:47 +0000 on 10 Dec (1386686837), Jan Beulich wrote:> ... since iterating through all of a guest''s pages may take unduly > long. > > Signed-off-by: Jan Beulich <jbeulich@suse.com> > Acked-by: Ian Campbell <ian.campbell@citrix.com>Acked-by: Tim Deegan <tim@xen.org>
Andrew Cooper
2013-Dec-13 13:57 UTC
Re: [PATCH 1/5] IOMMU: make page table population preemptible
On 13/12/2013 12:34, Jan Beulich wrote:>>>> On 13.12.13 at 13:18, Andrew Cooper <andrew.cooper3@citrix.com> wrote: >> On 13/12/2013 09:51, Jan Beulich wrote: >>>>>> On 11.12.13 at 19:40, Andrew Cooper <andrew.cooper3@citrix.com> wrote: >>>> On 10/12/2013 15:45, Jan Beulich wrote: >>>>> @@ -321,7 +342,32 @@ static int iommu_populate_page_table(str >>>>> d, mfn_to_gmfn(d, page_to_mfn(page)), page_to_mfn(page), >>>>> IOMMUF_readable|IOMMUF_writable); >>>>> if ( rc ) >>>>> + { >>>>> + page_list_add(page, &d->page_list); >>>>> break; >>>>> + } >>>>> + } >>>>> + page_list_add_tail(page, &d->arch.relmem_list); >>>>> + if ( !(++n & 0xff) && !page_list_empty(&d->page_list) && >>>> Why the forced restart here? If nothing needs pre-empting, surely it is >>>> better to continue? >>>> >>>> Or is this about equality on the pcidevs_lock ? >>>> >>>>> + hypercall_preempt_check() ) >>> Did you overlook this part of the condition? >> No, but I did mentally get the logic inverted when trying to work out >> what was going on. >> >> How about (++n > 0xff) ? >> >> If we have already spent a while in this loop, and the >> hypercall_preempt_check() doesn''t flip to 1 until a few iterations after >> n is congruent with 0x100, waiting for another 0x100 iterations before >> checking again seems a little long. > The thing I''m trying to avoid is calling hypercall_preempt_check() > overly often - especially when the prior if()''s body doesn''t get > entered we might otherwise do very little useful for per check. > > While 256 isn''t overly much (covering merely 1Mb of guest space), > I could see two possibilities to get this a little more towards what > you want: Either we right shift the mask each time we actually > call hypercall_preempt_check(), or we bias the increment (adding > e.g. 10 when going the expensive route, else 1). > > JanI suppose it probably doesn''t matter too much. In the slim case where he loop does manage to get to 257 iterations before the preempt check returns true, doing another 254 iterations isn''t going to skew the timings too much. And the net time till completion will be fractionally shorter. ~Andrew
Jan Beulich
2013-Dec-13 13:59 UTC
[PATCH v2 1/5] IOMMU: make page table population preemptible
Since this can take an arbitrary amount of time, the rooting domctl as well as all involved code must become aware of this requiring a continuation. The subject domain''s rel_mem_list is being (ab)used for this, in a way similar to and compatible with broken page offlining. Further, operations get slightly re-ordered in assign_device(): IOMMU page tables now get set up _before_ the first device gets assigned, at once closing a small timing window in which the guest may already see the device but wouldn''t be able to access it. Signed-off-by: Jan Beulich <jbeulich@suse.com> --- v2: Extend comment on struct domain''s need_iommu field. --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -1924,6 +1924,12 @@ int domain_relinquish_resources(struct d } d->arch.relmem = RELMEM_xen; + + spin_lock(&d->page_alloc_lock); + page_list_splice(&d->arch.relmem_list, &d->page_list); + INIT_PAGE_LIST_HEAD(&d->arch.relmem_list); + spin_unlock(&d->page_alloc_lock); + /* Fallthrough. Relinquish every page of memory. */ case RELMEM_xen: ret = relinquish_memory(d, &d->xenpage_list, ~0UL); --- a/xen/arch/x86/mm/p2m-pod.c +++ b/xen/arch/x86/mm/p2m-pod.c @@ -459,7 +459,8 @@ p2m_pod_offline_or_broken_hit(struct pag pod_hit: lock_page_alloc(p2m); - page_list_add_tail(p, &d->arch.relmem_list); + /* Insertion must be at list head (see iommu_populate_page_table()). */ + page_list_add(p, &d->arch.relmem_list); unlock_page_alloc(p2m); pod_unlock(p2m); return 1; --- a/xen/drivers/passthrough/iommu.c +++ b/xen/drivers/passthrough/iommu.c @@ -18,6 +18,7 @@ #include <asm/hvm/iommu.h> #include <xen/paging.h> #include <xen/guest_access.h> +#include <xen/event.h> #include <xen/softirq.h> #include <xen/keyhandler.h> #include <xsm/xsm.h> @@ -265,7 +266,23 @@ static int assign_device(struct domain * d->mem_event->paging.ring_page)) ) return -EXDEV; - spin_lock(&pcidevs_lock); + if ( !spin_trylock(&pcidevs_lock) ) + return -ERESTART; + + if ( need_iommu(d) <= 0 ) + { + if ( !iommu_use_hap_pt(d) ) + { + rc = iommu_populate_page_table(d); + if ( rc ) + { + spin_unlock(&pcidevs_lock); + return rc; + } + } + d->need_iommu = 1; + } + pdev = pci_get_pdev_by_domain(dom0, seg, bus, devfn); if ( !pdev ) { @@ -290,15 +307,14 @@ static int assign_device(struct domain * rc); } - if ( has_arch_pdevs(d) && !need_iommu(d) ) + done: + if ( !has_arch_pdevs(d) && need_iommu(d) ) { - d->need_iommu = 1; - if ( !iommu_use_hap_pt(d) ) - rc = iommu_populate_page_table(d); - goto done; + d->need_iommu = 0; + hd->platform_ops->teardown(d); } -done: spin_unlock(&pcidevs_lock); + return rc; } @@ -306,12 +322,17 @@ static int iommu_populate_page_table(str { struct hvm_iommu *hd = domain_hvm_iommu(d); struct page_info *page; - int rc = 0; + int rc = 0, n = 0; + + d->need_iommu = -1; this_cpu(iommu_dont_flush_iotlb) = 1; spin_lock(&d->page_alloc_lock); - page_list_for_each ( page, &d->page_list ) + if ( unlikely(d->is_dying) ) + rc = -ESRCH; + + while ( !rc && (page = page_list_remove_head(&d->page_list)) ) { if ( is_hvm_domain(d) || (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page ) @@ -321,7 +342,32 @@ static int iommu_populate_page_table(str d, mfn_to_gmfn(d, page_to_mfn(page)), page_to_mfn(page), IOMMUF_readable|IOMMUF_writable); if ( rc ) + { + page_list_add(page, &d->page_list); break; + } + } + page_list_add_tail(page, &d->arch.relmem_list); + if ( !(++n & 0xff) && !page_list_empty(&d->page_list) && + hypercall_preempt_check() ) + rc = -ERESTART; + } + + if ( !rc ) + { + /* + * The expectation here is that generally there are many 
normal pages + * on relmem_list (the ones we put there) and only few being in an + * offline/broken state. The latter ones are always at the head of the + * list. Hence we first move the whole list, and then move back the + * first few entries. + */ + page_list_move(&d->page_list, &d->arch.relmem_list); + while ( (page = page_list_first(&d->page_list)) != NULL && + (page->count_info & (PGC_state|PGC_broken)) ) + { + page_list_del(page, &d->page_list); + page_list_add_tail(page, &d->arch.relmem_list); } } @@ -330,8 +376,11 @@ static int iommu_populate_page_table(str if ( !rc ) iommu_iotlb_flush_all(d); - else + else if ( rc != -ERESTART ) + { + d->need_iommu = 0; hd->platform_ops->teardown(d); + } return rc; } @@ -688,7 +737,10 @@ int iommu_do_domctl( ret = device_assigned(seg, bus, devfn) ?: assign_device(d, seg, bus, devfn); - if ( ret ) + if ( ret == -ERESTART ) + ret = hypercall_create_continuation(__HYPERVISOR_domctl, + "h", u_domctl); + else if ( ret ) printk(XENLOG_G_ERR "XEN_DOMCTL_assign_device: " "assign %04x:%02x:%02x.%u to dom%d failed (%d)\n", seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), --- a/xen/include/xen/sched.h +++ b/xen/include/xen/sched.h @@ -322,8 +322,8 @@ struct domain enum guest_type guest_type; #ifdef HAS_PASSTHROUGH - /* Does this guest need iommu mappings? */ - bool_t need_iommu; + /* Does this guest need iommu mappings (-1 meaning "being set up")? */ + s8 need_iommu; #endif /* is node-affinity automatically computed? */ bool_t auto_node_affinity; _______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
Jan Beulich
2013-Dec-13 14:00 UTC
[PATCH v2 2/5] IOMMU: make page table deallocation preemptible
This too can take an arbitrary amount of time. In fact, the bulk of the work is being moved to a tasklet, as handling the necessary preemption logic in line seems close to impossible given that the teardown may also be invoked on error paths. Signed-off-by: Jan Beulich <jbeulich@suse.com> --- v2: abstract out tasklet logic --- a/xen/drivers/passthrough/amd/pci_amd_iommu.c +++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c @@ -405,11 +405,21 @@ static int amd_iommu_assign_device(struc return reassign_device(dom0, d, devfn, pdev); } -static void deallocate_next_page_table(struct page_info* pg, int level) +static void deallocate_next_page_table(struct page_info *pg, int level) +{ + PFN_ORDER(pg) = level; + spin_lock(&iommu_pt_cleanup_lock); + page_list_add_tail(pg, &iommu_pt_cleanup_list); + spin_unlock(&iommu_pt_cleanup_lock); +} + +static void deallocate_page_table(struct page_info *pg) { void *table_vaddr, *pde; u64 next_table_maddr; - int index, next_level; + unsigned int index, level = PFN_ORDER(pg), next_level; + + PFN_ORDER(pg) = 0; if ( level <= 1 ) { @@ -599,6 +609,7 @@ const struct iommu_ops amd_iommu_ops = { .teardown = amd_iommu_domain_destroy, .map_page = amd_iommu_map_page, .unmap_page = amd_iommu_unmap_page, + .free_page_table = deallocate_page_table, .reassign_device = reassign_device, .get_device_group_id = amd_iommu_group_id, .update_ire_from_apic = amd_iommu_ioapic_update_ire, --- a/xen/drivers/passthrough/iommu.c +++ b/xen/drivers/passthrough/iommu.c @@ -58,6 +58,10 @@ bool_t __read_mostly amd_iommu_perdev_in DEFINE_PER_CPU(bool_t, iommu_dont_flush_iotlb); +DEFINE_SPINLOCK(iommu_pt_cleanup_lock); +PAGE_LIST_HEAD(iommu_pt_cleanup_list); +static struct tasklet iommu_pt_cleanup_tasklet; + static struct keyhandler iommu_p2m_table = { .diagnostic = 0, .u.fn = iommu_dump_p2m_table, @@ -235,6 +239,15 @@ int iommu_remove_device(struct pci_dev * return hd->platform_ops->remove_device(pdev->devfn, pdev); } +static void iommu_teardown(struct domain *d) +{ + const struct hvm_iommu *hd = domain_hvm_iommu(d); + + d->need_iommu = 0; + hd->platform_ops->teardown(d); + tasklet_schedule(&iommu_pt_cleanup_tasklet); +} + /* * If the device isn''t owned by dom0, it means it already * has been assigned to other domain, or it doesn''t exist. 
@@ -309,10 +322,7 @@ static int assign_device(struct domain * done: if ( !has_arch_pdevs(d) && need_iommu(d) ) - { - d->need_iommu = 0; - hd->platform_ops->teardown(d); - } + iommu_teardown(d); spin_unlock(&pcidevs_lock); return rc; @@ -377,10 +387,7 @@ static int iommu_populate_page_table(str if ( !rc ) iommu_iotlb_flush_all(d); else if ( rc != -ERESTART ) - { - d->need_iommu = 0; - hd->platform_ops->teardown(d); - } + iommu_teardown(d); return rc; } @@ -397,10 +404,7 @@ void iommu_domain_destroy(struct domain return; if ( need_iommu(d) ) - { - d->need_iommu = 0; - hd->platform_ops->teardown(d); - } + iommu_teardown(d); list_for_each_safe ( ioport_list, tmp, &hd->g2m_ioport_list ) { @@ -438,6 +442,23 @@ int iommu_unmap_page(struct domain *d, u return hd->platform_ops->unmap_page(d, gfn); } +static void iommu_free_pagetables(unsigned long unused) +{ + do { + struct page_info *pg; + + spin_lock(&iommu_pt_cleanup_lock); + pg = page_list_remove_head(&iommu_pt_cleanup_list); + spin_unlock(&iommu_pt_cleanup_lock); + if ( !pg ) + return; + iommu_get_ops()->free_page_table(pg); + } while ( !softirq_pending(smp_processor_id()) ); + + tasklet_schedule_on_cpu(&iommu_pt_cleanup_tasklet, + cpumask_cycle(smp_processor_id(), &cpu_online_map)); +} + void iommu_iotlb_flush(struct domain *d, unsigned long gfn, unsigned int page_count) { struct hvm_iommu *hd = domain_hvm_iommu(d); @@ -500,10 +521,7 @@ int deassign_device(struct domain *d, u1 pdev->fault.count = 0; if ( !has_arch_pdevs(d) && need_iommu(d) ) - { - d->need_iommu = 0; - hd->platform_ops->teardown(d); - } + iommu_teardown(d); return ret; } @@ -542,6 +560,7 @@ int __init iommu_setup(void) iommu_passthrough ? "Passthrough" : iommu_dom0_strict ? "Strict" : "Relaxed"); printk("Interrupt remapping %sabled\n", iommu_intremap ? 
"en" : "dis"); + tasklet_init(&iommu_pt_cleanup_tasklet, iommu_free_pagetables, 0); } return rc; --- a/xen/drivers/passthrough/vtd/iommu.c +++ b/xen/drivers/passthrough/vtd/iommu.c @@ -668,13 +668,24 @@ static void dma_pte_clear_one(struct dom static void iommu_free_pagetable(u64 pt_maddr, int level) { - int i; - struct dma_pte *pt_vaddr, *pte; - int next_level = level - 1; + struct page_info *pg = maddr_to_page(pt_maddr); if ( pt_maddr == 0 ) return; + PFN_ORDER(pg) = level; + spin_lock(&iommu_pt_cleanup_lock); + page_list_add_tail(pg, &iommu_pt_cleanup_list); + spin_unlock(&iommu_pt_cleanup_lock); +} + +static void iommu_free_page_table(struct page_info *pg) +{ + unsigned int i, next_level = PFN_ORDER(pg) - 1; + u64 pt_maddr = page_to_maddr(pg); + struct dma_pte *pt_vaddr, *pte; + + PFN_ORDER(pg) = 0; pt_vaddr = (struct dma_pte *)map_vtd_domain_page(pt_maddr); for ( i = 0; i < PTE_NUM; i++ ) @@ -2430,6 +2441,7 @@ const struct iommu_ops intel_iommu_ops .teardown = iommu_domain_teardown, .map_page = intel_iommu_map_page, .unmap_page = intel_iommu_unmap_page, + .free_page_table = iommu_free_page_table, .reassign_device = reassign_device_ownership, .get_device_group_id = intel_iommu_group_id, .update_ire_from_apic = io_apic_write_remap_rte, --- a/xen/include/xen/iommu.h +++ b/xen/include/xen/iommu.h @@ -88,6 +88,7 @@ bool_t pt_irq_need_timer(uint32_t flags) struct msi_desc; struct msi_msg; +struct page_info; struct iommu_ops { int (*init)(struct domain *d); @@ -100,6 +101,7 @@ struct iommu_ops { int (*map_page)(struct domain *d, unsigned long gfn, unsigned long mfn, unsigned int flags); int (*unmap_page)(struct domain *d, unsigned long gfn); + void (*free_page_table)(struct page_info *); int (*reassign_device)(struct domain *s, struct domain *t, u8 devfn, struct pci_dev *); int (*get_device_group_id)(u16 seg, u8 bus, u8 devfn); @@ -151,4 +153,7 @@ int adjust_vtd_irq_affinities(void); */ DECLARE_PER_CPU(bool_t, iommu_dont_flush_iotlb); +extern struct spinlock iommu_pt_cleanup_lock; +extern struct page_list_head iommu_pt_cleanup_list; + #endif /* _IOMMU_H_ */ _______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
Tim Deegan
2013-Dec-13 14:16 UTC
Re: [PATCH v2 1/5] IOMMU: make page table population preemptible
At 13:59 +0000 on 13 Dec (1386939579), Jan Beulich wrote:> Since this can take an arbitrary amount of time, the rooting domctl as > well as all involved code must become aware of this requiring a > continuation. > > The subject domain''s rel_mem_list is being (ab)used for this, in a way > similar to and compatible with broken page offlining. > > Further, operations get slightly re-ordered in assign_device(): IOMMU > page tables now get set up _before_ the first device gets assigned, at > once closing a small timing window in which the guest may already see > the device but wouldn''t be able to access it. > > Signed-off-by: Jan Beulich <jbeulich@suse.com>Acked-by: Tim Deegan <tim@xen.org>
Andrew Cooper
2013-Dec-13 15:09 UTC
Re: [PATCH v2 1/5] IOMMU: make page table population preemptible
On 13/12/2013 13:59, Jan Beulich wrote:> Since this can take an arbitrary amount of time, the rooting domctl as > well as all involved code must become aware of this requiring a > continuation. > > The subject domain''s rel_mem_list is being (ab)used for this, in a way > similar to and compatible with broken page offlining. > > Further, operations get slightly re-ordered in assign_device(): IOMMU > page tables now get set up _before_ the first device gets assigned, at > once closing a small timing window in which the guest may already see > the device but wouldn''t be able to access it. > > Signed-off-by: Jan Beulich <jbeulich@suse.com> > --- > v2: Extend comment on struct domain''s need_iommu field. > > --- a/xen/arch/x86/domain.c > +++ b/xen/arch/x86/domain.c > @@ -1924,6 +1924,12 @@ int domain_relinquish_resources(struct d > } > > d->arch.relmem = RELMEM_xen; > + > + spin_lock(&d->page_alloc_lock); > + page_list_splice(&d->arch.relmem_list, &d->page_list); > + INIT_PAGE_LIST_HEAD(&d->arch.relmem_list); > + spin_unlock(&d->page_alloc_lock); > + > /* Fallthrough. Relinquish every page of memory. */ > case RELMEM_xen: > ret = relinquish_memory(d, &d->xenpage_list, ~0UL); > --- a/xen/arch/x86/mm/p2m-pod.c > +++ b/xen/arch/x86/mm/p2m-pod.c > @@ -459,7 +459,8 @@ p2m_pod_offline_or_broken_hit(struct pag > > pod_hit: > lock_page_alloc(p2m); > - page_list_add_tail(p, &d->arch.relmem_list); > + /* Insertion must be at list head (see iommu_populate_page_table()). */ > + page_list_add(p, &d->arch.relmem_list); > unlock_page_alloc(p2m); > pod_unlock(p2m); > return 1; > --- a/xen/drivers/passthrough/iommu.c > +++ b/xen/drivers/passthrough/iommu.c > @@ -18,6 +18,7 @@ > #include <asm/hvm/iommu.h> > #include <xen/paging.h> > #include <xen/guest_access.h> > +#include <xen/event.h> > #include <xen/softirq.h> > #include <xen/keyhandler.h> > #include <xsm/xsm.h> > @@ -265,7 +266,23 @@ static int assign_device(struct domain * > d->mem_event->paging.ring_page)) ) > return -EXDEV; > > - spin_lock(&pcidevs_lock); > + if ( !spin_trylock(&pcidevs_lock) ) > + return -ERESTART; > + > + if ( need_iommu(d) <= 0 ) > + { > + if ( !iommu_use_hap_pt(d) ) > + { > + rc = iommu_populate_page_table(d); > + if ( rc ) > + { > + spin_unlock(&pcidevs_lock); > + return rc; > + } > + } > + d->need_iommu = 1; > + } > + > pdev = pci_get_pdev_by_domain(dom0, seg, bus, devfn); > if ( !pdev ) > { > @@ -290,15 +307,14 @@ static int assign_device(struct domain * > rc); > } > > - if ( has_arch_pdevs(d) && !need_iommu(d) ) > + done: > + if ( !has_arch_pdevs(d) && need_iommu(d) )We now have a case where, for the first device, we could set up pagetables for a large domain, get an error with assignment, then tear them all back down. (-EBUSY from pci_get_pdev() looks like a good non-fatal candidate for causing this behaviour) I am wondering whether this is better for worse than the race condition where a guest couldn''t use the device. A guest could not reasonably expect to use a device before the toolstack is done setting it up. A buggy toolstack could quite easily tie up a lot of Xen time creating and destroying complete iommu pagetable sets. 
~Andrew> { > - d->need_iommu = 1; > - if ( !iommu_use_hap_pt(d) ) > - rc = iommu_populate_page_table(d); > - goto done; > + d->need_iommu = 0; > + hd->platform_ops->teardown(d); > } > -done: > spin_unlock(&pcidevs_lock); > + > return rc; > } > > @@ -306,12 +322,17 @@ static int iommu_populate_page_table(str > { > struct hvm_iommu *hd = domain_hvm_iommu(d); > struct page_info *page; > - int rc = 0; > + int rc = 0, n = 0; > + > + d->need_iommu = -1; > > this_cpu(iommu_dont_flush_iotlb) = 1; > spin_lock(&d->page_alloc_lock); > > - page_list_for_each ( page, &d->page_list ) > + if ( unlikely(d->is_dying) ) > + rc = -ESRCH; > + > + while ( !rc && (page = page_list_remove_head(&d->page_list)) ) > { > if ( is_hvm_domain(d) || > (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page ) > @@ -321,7 +342,32 @@ static int iommu_populate_page_table(str > d, mfn_to_gmfn(d, page_to_mfn(page)), page_to_mfn(page), > IOMMUF_readable|IOMMUF_writable); > if ( rc ) > + { > + page_list_add(page, &d->page_list); > break; > + } > + } > + page_list_add_tail(page, &d->arch.relmem_list); > + if ( !(++n & 0xff) && !page_list_empty(&d->page_list) && > + hypercall_preempt_check() ) > + rc = -ERESTART; > + } > + > + if ( !rc ) > + { > + /* > + * The expectation here is that generally there are many normal pages > + * on relmem_list (the ones we put there) and only few being in an > + * offline/broken state. The latter ones are always at the head of the > + * list. Hence we first move the whole list, and then move back the > + * first few entries. > + */ > + page_list_move(&d->page_list, &d->arch.relmem_list); > + while ( (page = page_list_first(&d->page_list)) != NULL && > + (page->count_info & (PGC_state|PGC_broken)) ) > + { > + page_list_del(page, &d->page_list); > + page_list_add_tail(page, &d->arch.relmem_list); > } > } > > @@ -330,8 +376,11 @@ static int iommu_populate_page_table(str > > if ( !rc ) > iommu_iotlb_flush_all(d); > - else > + else if ( rc != -ERESTART ) > + { > + d->need_iommu = 0; > hd->platform_ops->teardown(d); > + } > > return rc; > } > @@ -688,7 +737,10 @@ int iommu_do_domctl( > > ret = device_assigned(seg, bus, devfn) ?: > assign_device(d, seg, bus, devfn); > - if ( ret ) > + if ( ret == -ERESTART ) > + ret = hypercall_create_continuation(__HYPERVISOR_domctl, > + "h", u_domctl); > + else if ( ret ) > printk(XENLOG_G_ERR "XEN_DOMCTL_assign_device: " > "assign %04x:%02x:%02x.%u to dom%d failed (%d)\n", > seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), > --- a/xen/include/xen/sched.h > +++ b/xen/include/xen/sched.h > @@ -322,8 +322,8 @@ struct domain > enum guest_type guest_type; > > #ifdef HAS_PASSTHROUGH > - /* Does this guest need iommu mappings? */ > - bool_t need_iommu; > + /* Does this guest need iommu mappings (-1 meaning "being set up")? */ > + s8 need_iommu; > #endif > /* is node-affinity automatically computed? */ > bool_t auto_node_affinity; > > > > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xen.org > http://lists.xen.org/xen-devel_______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
Andrew Cooper
2013-Dec-13 15:26 UTC
Re: [PATCH v2 2/5] IOMMU: make page table deallocation preemptible
On 13/12/2013 14:00, Jan Beulich wrote:
> This too can take an arbitrary amount of time.
>
> In fact, the bulk of the work is being moved to a tasklet, as handling
> the necessary preemption logic in line seems close to impossible given
> that the teardown may also be invoked on error paths.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

> ---
> v2: abstract out tasklet logic
>
> --- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
> +++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
> @@ -405,11 +405,21 @@ static int amd_iommu_assign_device(struc
> return reassign_device(dom0, d, devfn, pdev);
> }
>
> -static void deallocate_next_page_table(struct page_info* pg, int level)
> +static void deallocate_next_page_table(struct page_info *pg, int level)
> +{
> + PFN_ORDER(pg) = level;
> + spin_lock(&iommu_pt_cleanup_lock);
> + page_list_add_tail(pg, &iommu_pt_cleanup_list);
> + spin_unlock(&iommu_pt_cleanup_lock);
> +}
> +
> +static void deallocate_page_table(struct page_info *pg)
> {
> void *table_vaddr, *pde;
> u64 next_table_maddr;
> - int index, next_level;
> + unsigned int index, level = PFN_ORDER(pg), next_level;
> +
> + PFN_ORDER(pg) = 0;
>
> if ( level <= 1 )
> {
> @@ -599,6 +609,7 @@ const struct iommu_ops amd_iommu_ops = {
> .teardown = amd_iommu_domain_destroy,
> .map_page = amd_iommu_map_page,
> .unmap_page = amd_iommu_unmap_page,
> + .free_page_table = deallocate_page_table,
> .reassign_device = reassign_device,
> .get_device_group_id = amd_iommu_group_id,
> .update_ire_from_apic = amd_iommu_ioapic_update_ire,
> --- a/xen/drivers/passthrough/iommu.c
> +++ b/xen/drivers/passthrough/iommu.c
> @@ -58,6 +58,10 @@ bool_t __read_mostly amd_iommu_perdev_in
>
> DEFINE_PER_CPU(bool_t, iommu_dont_flush_iotlb);
>
> +DEFINE_SPINLOCK(iommu_pt_cleanup_lock);
> +PAGE_LIST_HEAD(iommu_pt_cleanup_list);
> +static struct tasklet iommu_pt_cleanup_tasklet;
> +
> static struct keyhandler iommu_p2m_table = {
> .diagnostic = 0,
> .u.fn = iommu_dump_p2m_table,
> @@ -235,6 +239,15 @@ int iommu_remove_device(struct pci_dev *
> return hd->platform_ops->remove_device(pdev->devfn, pdev);
> }
>
> +static void iommu_teardown(struct domain *d)
> +{
> + const struct hvm_iommu *hd = domain_hvm_iommu(d);
> +
> + d->need_iommu = 0;
> + hd->platform_ops->teardown(d);
> + tasklet_schedule(&iommu_pt_cleanup_tasklet);
> +}
> +
> /*
> * If the device isn't owned by dom0, it means it already
> * has been assigned to other domain, or it doesn't exist.
> @@ -309,10 +322,7 @@ static int assign_device(struct domain * > > done: > if ( !has_arch_pdevs(d) && need_iommu(d) ) > - { > - d->need_iommu = 0; > - hd->platform_ops->teardown(d); > - } > + iommu_teardown(d); > spin_unlock(&pcidevs_lock); > > return rc; > @@ -377,10 +387,7 @@ static int iommu_populate_page_table(str > if ( !rc ) > iommu_iotlb_flush_all(d); > else if ( rc != -ERESTART ) > - { > - d->need_iommu = 0; > - hd->platform_ops->teardown(d); > - } > + iommu_teardown(d); > > return rc; > } > @@ -397,10 +404,7 @@ void iommu_domain_destroy(struct domain > return; > > if ( need_iommu(d) ) > - { > - d->need_iommu = 0; > - hd->platform_ops->teardown(d); > - } > + iommu_teardown(d); > > list_for_each_safe ( ioport_list, tmp, &hd->g2m_ioport_list ) > { > @@ -438,6 +442,23 @@ int iommu_unmap_page(struct domain *d, u > return hd->platform_ops->unmap_page(d, gfn); > } > > +static void iommu_free_pagetables(unsigned long unused) > +{ > + do { > + struct page_info *pg; > + > + spin_lock(&iommu_pt_cleanup_lock); > + pg = page_list_remove_head(&iommu_pt_cleanup_list); > + spin_unlock(&iommu_pt_cleanup_lock); > + if ( !pg ) > + return; > + iommu_get_ops()->free_page_table(pg); > + } while ( !softirq_pending(smp_processor_id()) ); > + > + tasklet_schedule_on_cpu(&iommu_pt_cleanup_tasklet, > + cpumask_cycle(smp_processor_id(), &cpu_online_map)); > +} > + > void iommu_iotlb_flush(struct domain *d, unsigned long gfn, unsigned int page_count) > { > struct hvm_iommu *hd = domain_hvm_iommu(d); > @@ -500,10 +521,7 @@ int deassign_device(struct domain *d, u1 > pdev->fault.count = 0; > > if ( !has_arch_pdevs(d) && need_iommu(d) ) > - { > - d->need_iommu = 0; > - hd->platform_ops->teardown(d); > - } > + iommu_teardown(d); > > return ret; > } > @@ -542,6 +560,7 @@ int __init iommu_setup(void) > iommu_passthrough ? "Passthrough" : > iommu_dom0_strict ? "Strict" : "Relaxed"); > printk("Interrupt remapping %sabled\n", iommu_intremap ? 
"en" : "dis"); > + tasklet_init(&iommu_pt_cleanup_tasklet, iommu_free_pagetables, 0); > } > > return rc; > --- a/xen/drivers/passthrough/vtd/iommu.c > +++ b/xen/drivers/passthrough/vtd/iommu.c > @@ -668,13 +668,24 @@ static void dma_pte_clear_one(struct dom > > static void iommu_free_pagetable(u64 pt_maddr, int level) > { > - int i; > - struct dma_pte *pt_vaddr, *pte; > - int next_level = level - 1; > + struct page_info *pg = maddr_to_page(pt_maddr); > > if ( pt_maddr == 0 ) > return; > > + PFN_ORDER(pg) = level; > + spin_lock(&iommu_pt_cleanup_lock); > + page_list_add_tail(pg, &iommu_pt_cleanup_list); > + spin_unlock(&iommu_pt_cleanup_lock); > +} > + > +static void iommu_free_page_table(struct page_info *pg) > +{ > + unsigned int i, next_level = PFN_ORDER(pg) - 1; > + u64 pt_maddr = page_to_maddr(pg); > + struct dma_pte *pt_vaddr, *pte; > + > + PFN_ORDER(pg) = 0; > pt_vaddr = (struct dma_pte *)map_vtd_domain_page(pt_maddr); > > for ( i = 0; i < PTE_NUM; i++ ) > @@ -2430,6 +2441,7 @@ const struct iommu_ops intel_iommu_ops > .teardown = iommu_domain_teardown, > .map_page = intel_iommu_map_page, > .unmap_page = intel_iommu_unmap_page, > + .free_page_table = iommu_free_page_table, > .reassign_device = reassign_device_ownership, > .get_device_group_id = intel_iommu_group_id, > .update_ire_from_apic = io_apic_write_remap_rte, > --- a/xen/include/xen/iommu.h > +++ b/xen/include/xen/iommu.h > @@ -88,6 +88,7 @@ bool_t pt_irq_need_timer(uint32_t flags) > > struct msi_desc; > struct msi_msg; > +struct page_info; > > struct iommu_ops { > int (*init)(struct domain *d); > @@ -100,6 +101,7 @@ struct iommu_ops { > int (*map_page)(struct domain *d, unsigned long gfn, unsigned long mfn, > unsigned int flags); > int (*unmap_page)(struct domain *d, unsigned long gfn); > + void (*free_page_table)(struct page_info *); > int (*reassign_device)(struct domain *s, struct domain *t, > u8 devfn, struct pci_dev *); > int (*get_device_group_id)(u16 seg, u8 bus, u8 devfn); > @@ -151,4 +153,7 @@ int adjust_vtd_irq_affinities(void); > */ > DECLARE_PER_CPU(bool_t, iommu_dont_flush_iotlb); > > +extern struct spinlock iommu_pt_cleanup_lock; > +extern struct page_list_head iommu_pt_cleanup_list; > + > #endif /* _IOMMU_H_ */ > >
Jan Beulich
2013-Dec-13 15:41 UTC
Re: [PATCH v2 1/5] IOMMU: make page table population preemptible
>>> On 13.12.13 at 16:09, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
> On 13/12/2013 13:59, Jan Beulich wrote:
>> @@ -290,15 +307,14 @@ static int assign_device(struct domain *
>> rc);
>> }
>>
>> - if ( has_arch_pdevs(d) && !need_iommu(d) )
>> + done:
>> + if ( !has_arch_pdevs(d) && need_iommu(d) )
>
> We now have a case where, for the first device, we could set up
> pagetables for a large domain, get an error with assignment, then tear
> them all back down.  (-EBUSY from pci_get_pdev() looks like a good
> non-fatal candidate for causing this behaviour)
>
> I am wondering whether this is better or worse than the race condition
> where a guest couldn't use the device.  A guest could not reasonably
> expect to use a device before the toolstack is done setting it up.  A
> buggy toolstack could quite easily tie up a lot of Xen time creating and
> destroying complete iommu pagetable sets.

I don't think it's worth worrying about buggy tool stacks here -
they should simply get fixed.

Furthermore this change in operation ordering is only a (nice)
side effect, the necessary cleanup seemed easier with the
order changed. And said time window would have grown with
the added preemption handling.

Jan
Andrew Cooper
2013-Dec-13 15:44 UTC
Re: [PATCH v2 1/5] IOMMU: make page table population preemptible
On 13/12/2013 15:41, Jan Beulich wrote:
>>>> On 13.12.13 at 16:09, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
>> On 13/12/2013 13:59, Jan Beulich wrote:
>>> @@ -290,15 +307,14 @@ static int assign_device(struct domain *
>>> rc);
>>> }
>>>
>>> - if ( has_arch_pdevs(d) && !need_iommu(d) )
>>> + done:
>>> + if ( !has_arch_pdevs(d) && need_iommu(d) )
>> We now have a case where, for the first device, we could set up
>> pagetables for a large domain, get an error with assignment, then tear
>> them all back down.  (-EBUSY from pci_get_pdev() looks like a good
>> non-fatal candidate for causing this behaviour)
>>
>> I am wondering whether this is better or worse than the race condition
>> where a guest couldn't use the device.  A guest could not reasonably
>> expect to use a device before the toolstack is done setting it up.  A
>> buggy toolstack could quite easily tie up a lot of Xen time creating and
>> destroying complete iommu pagetable sets.
> I don't think it's worth worrying about buggy tool stacks here -
> they should simply get fixed.
>
> Furthermore this change in operation ordering is only a (nice)
> side effect, the necessary cleanup seemed easier with the
> order changed. And said time window would have grown with
> the added preemption handling.
>
> Jan
>

All true.

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>