1: IOMMU: make page table population preemptible
2: IOMMU: make page table deallocation preemptible
3: x86/p2m: restrict auditing to debug builds
4: HVM: prevent leaking heap data from hvm_save_one()
5: x86/PV: don't commit debug register values early in arch_set_info_guest()

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Jan Beulich
2013-Dec-10 15:45 UTC
[PATCH 1/5] IOMMU: make page table population preemptible
Since this can take an arbitrary amount of time, the rooting domctl as well as all involved code must become aware of this requiring a continuation. The subject domain''s rel_mem_list is being (ab)used for this, in a way similar to and compatible with broken page offlining. Further, operations get slightly re-ordered in assign_device(): IOMMU page tables now get set up _before_ the first device gets assigned, at once closing a small timing window in which the guest may already see the device but wouldn''t be able to access it. Signed-off-by: Jan Beulich <jbeulich@suse.com> --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -1923,6 +1923,12 @@ int domain_relinquish_resources(struct d } d->arch.relmem = RELMEM_xen; + + spin_lock(&d->page_alloc_lock); + page_list_splice(&d->arch.relmem_list, &d->page_list); + INIT_PAGE_LIST_HEAD(&d->arch.relmem_list); + spin_unlock(&d->page_alloc_lock); + /* Fallthrough. Relinquish every page of memory. */ case RELMEM_xen: ret = relinquish_memory(d, &d->xenpage_list, ~0UL); --- a/xen/arch/x86/mm/p2m-pod.c +++ b/xen/arch/x86/mm/p2m-pod.c @@ -459,7 +459,8 @@ p2m_pod_offline_or_broken_hit(struct pag pod_hit: lock_page_alloc(p2m); - page_list_add_tail(p, &d->arch.relmem_list); + /* Insertion must be at list head (see iommu_populate_page_table()). */ + page_list_add(p, &d->arch.relmem_list); unlock_page_alloc(p2m); pod_unlock(p2m); return 1; --- a/xen/drivers/passthrough/iommu.c +++ b/xen/drivers/passthrough/iommu.c @@ -18,6 +18,7 @@ #include <asm/hvm/iommu.h> #include <xen/paging.h> #include <xen/guest_access.h> +#include <xen/event.h> #include <xen/softirq.h> #include <xen/keyhandler.h> #include <xsm/xsm.h> @@ -265,7 +266,23 @@ static int assign_device(struct domain * d->mem_event->paging.ring_page)) ) return -EXDEV; - spin_lock(&pcidevs_lock); + if ( !spin_trylock(&pcidevs_lock) ) + return -ERESTART; + + if ( need_iommu(d) <= 0 ) + { + if ( !iommu_use_hap_pt(d) ) + { + rc = iommu_populate_page_table(d); + if ( rc ) + { + spin_unlock(&pcidevs_lock); + return rc; + } + } + d->need_iommu = 1; + } + pdev = pci_get_pdev_by_domain(dom0, seg, bus, devfn); if ( !pdev ) { @@ -290,15 +307,14 @@ static int assign_device(struct domain * rc); } - if ( has_arch_pdevs(d) && !need_iommu(d) ) + done: + if ( !has_arch_pdevs(d) && need_iommu(d) ) { - d->need_iommu = 1; - if ( !iommu_use_hap_pt(d) ) - rc = iommu_populate_page_table(d); - goto done; + d->need_iommu = 0; + hd->platform_ops->teardown(d); } -done: spin_unlock(&pcidevs_lock); + return rc; } @@ -306,12 +322,17 @@ static int iommu_populate_page_table(str { struct hvm_iommu *hd = domain_hvm_iommu(d); struct page_info *page; - int rc = 0; + int rc = 0, n = 0; + + d->need_iommu = -1; this_cpu(iommu_dont_flush_iotlb) = 1; spin_lock(&d->page_alloc_lock); - page_list_for_each ( page, &d->page_list ) + if ( unlikely(d->is_dying) ) + rc = -ESRCH; + + while ( !rc && (page = page_list_remove_head(&d->page_list)) ) { if ( is_hvm_domain(d) || (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page ) @@ -321,7 +342,32 @@ static int iommu_populate_page_table(str d, mfn_to_gmfn(d, page_to_mfn(page)), page_to_mfn(page), IOMMUF_readable|IOMMUF_writable); if ( rc ) + { + page_list_add(page, &d->page_list); break; + } + } + page_list_add_tail(page, &d->arch.relmem_list); + if ( !(++n & 0xff) && !page_list_empty(&d->page_list) && + hypercall_preempt_check() ) + rc = -ERESTART; + } + + if ( !rc ) + { + /* + * The expectation here is that generally there are many normal pages + * on relmem_list (the ones we put there) and only 
few being in an + * offline/broken state. The latter ones are always at the head of the + * list. Hence we first move the whole list, and then move back the + * first few entries. + */ + page_list_move(&d->page_list, &d->arch.relmem_list); + while ( (page = page_list_first(&d->page_list)) != NULL && + (page->count_info & (PGC_state|PGC_broken)) ) + { + page_list_del(page, &d->page_list); + page_list_add_tail(page, &d->arch.relmem_list); } } @@ -330,8 +376,11 @@ static int iommu_populate_page_table(str if ( !rc ) iommu_iotlb_flush_all(d); - else + else if ( rc != -ERESTART ) + { hd->platform_ops->teardown(d); + d->need_iommu = 0; + } return rc; } @@ -688,7 +737,10 @@ int iommu_do_domctl( ret = device_assigned(seg, bus, devfn) ?: assign_device(d, seg, bus, devfn); - if ( ret ) + if ( ret == -ERESTART ) + ret = hypercall_create_continuation(__HYPERVISOR_domctl, + "h", u_domctl); + else if ( ret ) printk(XENLOG_G_ERR "XEN_DOMCTL_assign_device: " "assign %04x:%02x:%02x.%u to dom%d failed (%d)\n", seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), --- a/xen/include/xen/sched.h +++ b/xen/include/xen/sched.h @@ -323,7 +323,7 @@ struct domain #ifdef HAS_PASSTHROUGH /* Does this guest need iommu mappings? */ - bool_t need_iommu; + s8 need_iommu; #endif /* is node-affinity automatically computed? */ bool_t auto_node_affinity; _______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
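For readers less familiar with Xen's preemption machinery, the shape of what the patch adds can be summarised in a condensed sketch (all identifiers below appear in the patch; error handling, the PGT type check and the is_dying check are trimmed, so treat this as an illustration rather than the real code): the worker polls hypercall_preempt_check() only every 256 pages and reports -ERESTART when it wants to yield, and the domctl dispatcher converts that into a hypercall continuation so the toolstack's domctl gets transparently re-issued and the loop resumes with whatever is still on page_list.

/* Worker: pages already processed have been moved to relmem_list, so a
 * restarted invocation simply continues with what is left on page_list. */
static int populate_sketch(struct domain *d)        /* illustrative name */
{
    struct page_info *page;
    int rc = 0, n = 0;

    spin_lock(&d->page_alloc_lock);
    while ( !rc && (page = page_list_remove_head(&d->page_list)) )
    {
        /* ... establish the IOMMU mapping for this page ... */
        page_list_add_tail(page, &d->arch.relmem_list);
        if ( !(++n & 0xff) && hypercall_preempt_check() )
            rc = -ERESTART;
    }
    spin_unlock(&d->page_alloc_lock);

    return rc;
}

/* Dispatcher: turn -ERESTART into a continuation of the same domctl. */
    ret = populate_sketch(d);
    if ( ret == -ERESTART )
        ret = hypercall_create_continuation(__HYPERVISOR_domctl,
                                            "h", u_domctl);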
Jan Beulich
2013-Dec-10 15:46 UTC
[PATCH 2/5] IOMMU: make page table deallocation preemptible
This too can take an arbitrary amount of time. In fact, the bulk of the work is being moved to a tasklet, as handling the necessary preemption logic in line seems close to impossible given that the teardown may also be invoked on error paths. Signed-off-by: Jan Beulich <jbeulich@suse.com> --- a/xen/drivers/passthrough/amd/pci_amd_iommu.c +++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c @@ -195,6 +195,8 @@ static int __init amd_iommu_setup_dom0_d return 0; } +static void deallocate_page_tables(unsigned long unused); + int __init amd_iov_detect(void) { INIT_LIST_HEAD(&amd_iommu_head); @@ -215,6 +217,7 @@ int __init amd_iov_detect(void) return -ENODEV; } + tasklet_init(&iommu_pt_cleanup_tasklet, deallocate_page_tables, 0); init_done = 1; if ( !amd_iommu_perdev_intremap ) @@ -405,11 +408,21 @@ static int amd_iommu_assign_device(struc return reassign_device(dom0, d, devfn, pdev); } -static void deallocate_next_page_table(struct page_info* pg, int level) +static void deallocate_next_page_table(struct page_info *pg, int level) +{ + spin_lock(&iommu_pt_cleanup_lock); + PFN_ORDER(pg) = level; + page_list_add_tail(pg, &iommu_pt_cleanup_list); + spin_unlock(&iommu_pt_cleanup_lock); +} + +static void deallocate_page_table(struct page_info *pg) { void *table_vaddr, *pde; u64 next_table_maddr; - int index, next_level; + unsigned int index, level = PFN_ORDER(pg), next_level; + + PFN_ORDER(pg) = 0; if ( level <= 1 ) { @@ -439,6 +452,23 @@ static void deallocate_next_page_table(s free_amd_iommu_pgtable(pg); } +static void deallocate_page_tables(unsigned long unused) +{ + do { + struct page_info *pg; + + spin_lock(&iommu_pt_cleanup_lock); + pg = page_list_remove_head(&iommu_pt_cleanup_list); + spin_unlock(&iommu_pt_cleanup_lock); + if ( !pg ) + return; + deallocate_page_table(pg); + } while ( !softirq_pending(smp_processor_id()) ); + + tasklet_schedule_on_cpu(&iommu_pt_cleanup_tasklet, + cpumask_cycle(smp_processor_id(), &cpu_online_map)); +} + static void deallocate_iommu_page_tables(struct domain *d) { struct hvm_iommu *hd = domain_hvm_iommu(d); @@ -451,6 +481,7 @@ static void deallocate_iommu_page_tables { deallocate_next_page_table(hd->root_table, hd->paging_mode); hd->root_table = NULL; + tasklet_schedule(&iommu_pt_cleanup_tasklet); } spin_unlock(&hd->mapping_lock); } --- a/xen/drivers/passthrough/iommu.c +++ b/xen/drivers/passthrough/iommu.c @@ -58,6 +58,10 @@ bool_t __read_mostly amd_iommu_perdev_in DEFINE_PER_CPU(bool_t, iommu_dont_flush_iotlb); +DEFINE_SPINLOCK(iommu_pt_cleanup_lock); +PAGE_LIST_HEAD(iommu_pt_cleanup_list); +struct tasklet iommu_pt_cleanup_tasklet; + static struct keyhandler iommu_p2m_table = { .diagnostic = 0, .u.fn = iommu_dump_p2m_table, --- a/xen/drivers/passthrough/vtd/iommu.c +++ b/xen/drivers/passthrough/vtd/iommu.c @@ -668,13 +668,24 @@ static void dma_pte_clear_one(struct dom static void iommu_free_pagetable(u64 pt_maddr, int level) { - int i; - struct dma_pte *pt_vaddr, *pte; - int next_level = level - 1; + struct page_info *pg = maddr_to_page(pt_maddr); if ( pt_maddr == 0 ) return; + spin_lock(&iommu_pt_cleanup_lock); + PFN_ORDER(pg) = level; + page_list_add_tail(pg, &iommu_pt_cleanup_list); + spin_unlock(&iommu_pt_cleanup_lock); +} + +static void iommu_free_page_table(struct page_info *pg) +{ + unsigned int i, next_level = PFN_ORDER(pg) - 1; + u64 pt_maddr = page_to_maddr(pg); + struct dma_pte *pt_vaddr, *pte; + + PFN_ORDER(pg) = 0; pt_vaddr = (struct dma_pte *)map_vtd_domain_page(pt_maddr); for ( i = 0; i < PTE_NUM; i++ ) @@ -694,6 +705,23 @@ static void 
iommu_free_pagetable(u64 pt_ free_pgtable_maddr(pt_maddr); } +static void iommu_free_pagetables(unsigned long unused) +{ + do { + struct page_info *pg; + + spin_lock(&iommu_pt_cleanup_lock); + pg = page_list_remove_head(&iommu_pt_cleanup_list); + spin_unlock(&iommu_pt_cleanup_lock); + if ( !pg ) + return; + iommu_free_page_table(pg); + } while ( !softirq_pending(smp_processor_id()) ); + + tasklet_schedule_on_cpu(&iommu_pt_cleanup_tasklet, + cpumask_cycle(smp_processor_id(), &cpu_online_map)); +} + static int iommu_set_root_entry(struct iommu *iommu) { u32 sts; @@ -1704,6 +1732,8 @@ void iommu_domain_teardown(struct domain iommu_free_pagetable(hd->pgd_maddr, agaw_to_level(hd->agaw)); hd->pgd_maddr = 0; spin_unlock(&hd->mapping_lock); + + tasklet_schedule(&iommu_pt_cleanup_tasklet); } static int intel_iommu_map_page( @@ -2204,6 +2234,7 @@ int __init intel_vtd_setup(void) if ( ret ) goto error; + tasklet_init(&iommu_pt_cleanup_tasklet, iommu_free_pagetables, 0); register_keyhandler(''V'', &dump_iommu_info_keyhandler); return 0; --- a/xen/include/xen/iommu.h +++ b/xen/include/xen/iommu.h @@ -151,4 +151,8 @@ int adjust_vtd_irq_affinities(void); */ DECLARE_PER_CPU(bool_t, iommu_dont_flush_iotlb); +extern struct spinlock iommu_pt_cleanup_lock; +extern struct page_list_head iommu_pt_cleanup_list; +extern struct tasklet iommu_pt_cleanup_tasklet; + #endif /* _IOMMU_H_ */ _______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
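Stripped of the AMD and VT-d specifics, the deferral pattern both hunks implement is the same and can be sketched as follows (only primitives already visible in the patch are used; free_one() is a stand-in for the vendor-specific table walker):

/* Producer side: instead of freeing in place, queue the table root. */
    PFN_ORDER(pg) = level;                      /* stash the level in the page */
    spin_lock(&iommu_pt_cleanup_lock);
    page_list_add_tail(pg, &iommu_pt_cleanup_list);
    spin_unlock(&iommu_pt_cleanup_lock);
    tasklet_schedule(&iommu_pt_cleanup_tasklet);

/* Consumer side (tasklet): free tables until other softirq work shows up,
 * then re-schedule on the next online CPU so no single CPU is hogged. */
static void cleanup_sketch(unsigned long unused)    /* illustrative name */
{
    do {
        struct page_info *pg;

        spin_lock(&iommu_pt_cleanup_lock);
        pg = page_list_remove_head(&iommu_pt_cleanup_list);
        spin_unlock(&iommu_pt_cleanup_lock);
        if ( !pg )
            return;
        free_one(pg);                           /* vendor-specific stand-in */
    } while ( !softirq_pending(smp_processor_id()) );

    tasklet_schedule_on_cpu(&iommu_pt_cleanup_tasklet,
                            cpumask_cycle(smp_processor_id(), &cpu_online_map));
}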
Jan Beulich
2013-Dec-10 15:47 UTC
[PATCH 3/5] x86/p2m: restrict auditing to debug builds
... since iterating through all of a guest's pages may take unduly long. Signed-off-by: Jan Beulich <jbeulich@suse.com> Acked-by: Ian Campbell <ian.campbell@citrix.com> --- a/xen/include/asm-x86/p2m.h +++ b/xen/include/asm-x86/p2m.h @@ -600,7 +600,11 @@ int set_p2m_entry(struct p2m_domain *p2m extern void p2m_pt_init(struct p2m_domain *p2m); /* Debugging and auditing of the P2M code? */ +#ifndef NDEBUG #define P2M_AUDIT 1 +#else +#define P2M_AUDIT 0 +#endif #define P2M_DEBUGGING 0 #if P2M_AUDIT _______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
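The practical effect of the hunk is that release builds, which define NDEBUG, get P2M_AUDIT == 0, so audit invocations guarded along the following (schematic) lines vanish at compile time while debug builds keep them:

#if P2M_AUDIT
    audit_p2m(...);   /* expensive: iterates over every page of the guest;
                       * now compiled in for debug (!NDEBUG) builds only   */
#endif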
Jan Beulich
2013-Dec-10 15:48 UTC
[PATCH 4/5] HVM: prevent leaking heap data from hvm_save_one()
When one or more of the vCPU-s of a guest are offline, no data may be put into the allocated space for them and, due to another bug, such uninitialized data may be passed back to the caller. Signed-off-by: Don Slutz <dslutz@verizon.com> Acked-by: Ian Campbell <ian.campbell@citrix.com> Reviewed-by: Jan Beulich <jbeulich@suse.com> --- a/xen/common/hvm/save.c +++ b/xen/common/hvm/save.c @@ -102,7 +102,7 @@ int hvm_save_one(struct domain *d, uint1 return -EINVAL; ctxt.size = sz; - ctxt.data = xmalloc_bytes(sz); + ctxt.data = xzalloc_bytes(sz); if ( !ctxt.data ) return -ENOMEM;
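A minimal userspace analogue of the leak being closed, with calloc() playing the role of xzalloc_bytes() and with vcpu_online() plus the fill pattern made up purely for illustration: when only some per-vCPU slots of the buffer get written, the untouched slots reach the caller containing whatever the allocator left behind, unless the buffer was zeroed up front.

#include <stdlib.h>
#include <string.h>

static int vcpu_online(size_t i) { return !(i & 1); }  /* stand-in predicate */

char *save_all(size_t nr_vcpus, size_t per_vcpu)
{
    char *buf = calloc(nr_vcpus, per_vcpu);  /* was malloc(): stale heap data */

    if ( !buf )
        return NULL;
    for ( size_t i = 0; i < nr_vcpus; i++ )
        if ( vcpu_online(i) )
            memset(buf + i * per_vcpu, 0x5a, per_vcpu);  /* "save" this vCPU */
    return buf;  /* slots of offline vCPUs are all zeros, not heap leftovers */
}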
Jan Beulich
2013-Dec-10 15:48 UTC
[PATCH 5/5] x86/PV: don't commit debug register values early in arch_set_info_guest()
They're being taken care of later (via set_debugreg()), and temporarily copying them into struct vcpu means that bad values may end up getting loaded during context switch if the vCPU is already running and the function errors out between the premature and real commit step, leading to the same issue that XSA-12 dealt with. Signed-off-by: Jan Beulich <jbeulich@suse.com> Acked-by: Ian Campbell <ian.campbell@citrix.com> --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -740,11 +740,12 @@ int arch_set_info_guest( XLAT_trap_info(v->arch.pv_vcpu.trap_ctxt + i, c.cmp->trap_ctxt + i); } - for ( i = 0; i < ARRAY_SIZE(v->arch.debugreg); ++i ) - v->arch.debugreg[i] = c(debugreg[i]); if ( has_hvm_container_vcpu(v) ) { + for ( i = 0; i < ARRAY_SIZE(v->arch.debugreg); ++i ) + v->arch.debugreg[i] = c(debugreg[i]); + /* * NB: TF_kernel_mode is set unconditionally for HVM guests, * so we always use the gs_base_kernel here. If we change this _______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
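The hazard described in the commit message, in schematic form (the error-path condition is hypothetical; set_debugreg() is the checked path the message refers to):

/*
 * Old placement, schematically:
 *
 *     v->arch.debugreg[i] = c(debugreg[i]);   // committed immediately
 *     ...
 *     return -EINVAL;                         // later validation fails,
 *                                             // but the bad values stay live
 *
 * and an already-running vCPU would then load them on its next context
 * switch. For PV guests the values are instead committed only through the
 * checked path later in arch_set_info_guest(), roughly:
 *
 *     for ( i = 0; i < 8; i++ )
 *         (void)set_debugreg(v, i, c(debugreg[i]));
 */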
Andrew Cooper
2013-Dec-10 15:58 UTC
Re: [PATCH 3/5] x86/p2m: restrict auditing to debug builds
On 10/12/13 15:47, Jan Beulich wrote:> ... since iterating through all of a guest''s pages may take unduly > long. > > Signed-off-by: Jan Beulich <jbeulich@suse.com> > Acked-by: Ian Campbell <ian.campbell@citrix.com>Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>> > --- a/xen/include/asm-x86/p2m.h > +++ b/xen/include/asm-x86/p2m.h > @@ -600,7 +600,11 @@ int set_p2m_entry(struct p2m_domain *p2m > extern void p2m_pt_init(struct p2m_domain *p2m); > > /* Debugging and auditing of the P2M code? */ > +#ifndef NDEBUG > #define P2M_AUDIT 1 > +#else > +#define P2M_AUDIT 0 > +#endif > #define P2M_DEBUGGING 0 > > #if P2M_AUDIT > > > > > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xen.org > http://lists.xen.org/xen-devel_______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
Andrew Cooper
2013-Dec-10 16:01 UTC
Re: [PATCH 4/5] HVM: prevent leaking heap data from hvm_save_one()
On 10/12/13 15:48, Jan Beulich wrote:> When one or more of the vCPU-s of a guest are offline, no data may be > put into the allocated space for them and, due to another bug, such > uninitialized data may be passed back to the caller. > > Signed-off-by: Don Slutz <dslutz@verizon.com> > Acked-by: Ian Campbell <ian.campbell@citrix.com> > Reviewed-by: Jan Beulich <jbeulich@suse.com>Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>> > --- a/xen/common/hvm/save.c > +++ b/xen/common/hvm/save.c > @@ -102,7 +102,7 @@ int hvm_save_one(struct domain *d, uint1 > return -EINVAL; > > ctxt.size = sz; > - ctxt.data = xmalloc_bytes(sz); > + ctxt.data = xzalloc_bytes(sz); > if ( !ctxt.data ) > return -ENOMEM; > > > > > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xen.org > http://lists.xen.org/xen-devel
Andrew Cooper
2013-Dec-10 17:23 UTC
Re: [PATCH 5/5] x86/PV: don't commit debug register values early in arch_set_info_guest()
On 10/12/13 15:48, Jan Beulich wrote:> They''re being taken care of later (via set_debugreg()), and temporarily > copying them into struct vcpu means that bad values may end up getting > loaded during context switch if the vCPU is already running and the > function errors out between the premature and real commit step, leading > to the same issue that XSA-12 dealt with. > > Signed-off-by: Jan Beulich <jbeulich@suse.com> > Acked-by: Ian Campbell <ian.campbell@citrix.com>Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>> > --- a/xen/arch/x86/domain.c > +++ b/xen/arch/x86/domain.c > @@ -740,11 +740,12 @@ int arch_set_info_guest( > XLAT_trap_info(v->arch.pv_vcpu.trap_ctxt + i, > c.cmp->trap_ctxt + i); > } > - for ( i = 0; i < ARRAY_SIZE(v->arch.debugreg); ++i ) > - v->arch.debugreg[i] = c(debugreg[i]); > > if ( has_hvm_container_vcpu(v) ) > { > + for ( i = 0; i < ARRAY_SIZE(v->arch.debugreg); ++i ) > + v->arch.debugreg[i] = c(debugreg[i]); > + > /* > * NB: TF_kernel_mode is set unconditionally for HVM guests, > * so we always use the gs_base_kernel here. If we change this > > > > > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xen.org > http://lists.xen.org/xen-devel_______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
George Dunlap
2013-Dec-10 17:31 UTC
Re: [PATCH 3/5] x86/p2m: restrict auditing to debug builds
On 12/10/2013 03:47 PM, Jan Beulich wrote:> ... since iterating through all of a guest''s pages may take unduly > long. > > Signed-off-by: Jan Beulich <jbeulich@suse.com> > Acked-by: Ian Campbell <ian.campbell@citrix.com>Release-acked-by: George Dunlap <george.dunlap@eu.citrix.com>
George Dunlap
2013-Dec-10 17:32 UTC
Re: [PATCH 4/5] HVM: prevent leaking heap data from hvm_save_one()
On 12/10/2013 03:48 PM, Jan Beulich wrote:> When one or more of the vCPU-s of a guest are offline, no data may be > put into the allocated space for them and, due to another bug, such > uninitialized data may be passed back to the caller. > > Signed-off-by: Don Slutz <dslutz@verizon.com> > Acked-by: Ian Campbell <ian.campbell@citrix.com> > Reviewed-by: Jan Beulich <jbeulich@suse.com>Release-acked-by: George Dunlap <george.dunlap@eu.citrix.com>> > --- a/xen/common/hvm/save.c > +++ b/xen/common/hvm/save.c > @@ -102,7 +102,7 @@ int hvm_save_one(struct domain *d, uint1 > return -EINVAL; > > ctxt.size = sz; > - ctxt.data = xmalloc_bytes(sz); > + ctxt.data = xzalloc_bytes(sz); > if ( !ctxt.data ) > return -ENOMEM; > > > >
George Dunlap
2013-Dec-10 17:33 UTC
Re: [PATCH 5/5] x86/PV: don't commit debug register values early in arch_set_info_guest()
On 12/10/2013 03:48 PM, Jan Beulich wrote:> They''re being taken care of later (via set_debugreg()), and temporarily > copying them into struct vcpu means that bad values may end up getting > loaded during context switch if the vCPU is already running and the > function errors out between the premature and real commit step, leading > to the same issue that XSA-12 dealt with. > > Signed-off-by: Jan Beulich <jbeulich@suse.com> > Acked-by: Ian Campbell <ian.campbell@citrix.com>Release-acked-by: George Dunlap <george.dunlap@eu.citrix.com>> > --- a/xen/arch/x86/domain.c > +++ b/xen/arch/x86/domain.c > @@ -740,11 +740,12 @@ int arch_set_info_guest( > XLAT_trap_info(v->arch.pv_vcpu.trap_ctxt + i, > c.cmp->trap_ctxt + i); > } > - for ( i = 0; i < ARRAY_SIZE(v->arch.debugreg); ++i ) > - v->arch.debugreg[i] = c(debugreg[i]); > > if ( has_hvm_container_vcpu(v) ) > { > + for ( i = 0; i < ARRAY_SIZE(v->arch.debugreg); ++i ) > + v->arch.debugreg[i] = c(debugreg[i]); > + > /* > * NB: TF_kernel_mode is set unconditionally for HVM guests, > * so we always use the gs_base_kernel here. If we change this > > >
Keir Fraser
2013-Dec-10 18:17 UTC
Re: [PATCH 5/5] x86/PV: don't commit debug register values early in arch_set_info_guest()
On 10/12/2013 17:33, "George Dunlap" <george.dunlap@eu.citrix.com> wrote:> On 12/10/2013 03:48 PM, Jan Beulich wrote: >> They''re being taken care of later (via set_debugreg()), and temporarily >> copying them into struct vcpu means that bad values may end up getting >> loaded during context switch if the vCPU is already running and the >> function errors out between the premature and real commit step, leading >> to the same issue that XSA-12 dealt with. >> >> Signed-off-by: Jan Beulich <jbeulich@suse.com> >> Acked-by: Ian Campbell <ian.campbell@citrix.com> > > Release-acked-by: George Dunlap <george.dunlap@eu.citrix.com>Acked-by: Keir Fraser <keir@xen.org>>> >> --- a/xen/arch/x86/domain.c >> +++ b/xen/arch/x86/domain.c >> @@ -740,11 +740,12 @@ int arch_set_info_guest( >> XLAT_trap_info(v->arch.pv_vcpu.trap_ctxt + i, >> c.cmp->trap_ctxt + i); >> } >> - for ( i = 0; i < ARRAY_SIZE(v->arch.debugreg); ++i ) >> - v->arch.debugreg[i] = c(debugreg[i]); >> >> if ( has_hvm_container_vcpu(v) ) >> { >> + for ( i = 0; i < ARRAY_SIZE(v->arch.debugreg); ++i ) >> + v->arch.debugreg[i] = c(debugreg[i]); >> + >> /* >> * NB: TF_kernel_mode is set unconditionally for HVM guests, >> * so we always use the gs_base_kernel here. If we change this >> >> >> >
Andrew Cooper
2013-Dec-11 18:40 UTC
Re: [PATCH 1/5] IOMMU: make page table population preemptible
On 10/12/2013 15:45, Jan Beulich wrote:> Since this can take an arbitrary amount of time, the rooting domctl as > well as all involved code must become aware of this requiring a > continuation. > > The subject domain''s rel_mem_list is being (ab)used for this, in a way > similar to and compatible with broken page offlining. > > Further, operations get slightly re-ordered in assign_device(): IOMMU > page tables now get set up _before_ the first device gets assigned, at > once closing a small timing window in which the guest may already see > the device but wouldn''t be able to access it. > > Signed-off-by: Jan Beulich <jbeulich@suse.com> > > --- a/xen/arch/x86/domain.c > +++ b/xen/arch/x86/domain.c > @@ -1923,6 +1923,12 @@ int domain_relinquish_resources(struct d > } > > d->arch.relmem = RELMEM_xen; > + > + spin_lock(&d->page_alloc_lock); > + page_list_splice(&d->arch.relmem_list, &d->page_list); > + INIT_PAGE_LIST_HEAD(&d->arch.relmem_list); > + spin_unlock(&d->page_alloc_lock); > + > /* Fallthrough. Relinquish every page of memory. */ > case RELMEM_xen: > ret = relinquish_memory(d, &d->xenpage_list, ~0UL); > --- a/xen/arch/x86/mm/p2m-pod.c > +++ b/xen/arch/x86/mm/p2m-pod.c > @@ -459,7 +459,8 @@ p2m_pod_offline_or_broken_hit(struct pag > > pod_hit: > lock_page_alloc(p2m); > - page_list_add_tail(p, &d->arch.relmem_list); > + /* Insertion must be at list head (see iommu_populate_page_table()). */ > + page_list_add(p, &d->arch.relmem_list); > unlock_page_alloc(p2m); > pod_unlock(p2m); > return 1; > --- a/xen/drivers/passthrough/iommu.c > +++ b/xen/drivers/passthrough/iommu.c > @@ -18,6 +18,7 @@ > #include <asm/hvm/iommu.h> > #include <xen/paging.h> > #include <xen/guest_access.h> > +#include <xen/event.h> > #include <xen/softirq.h> > #include <xen/keyhandler.h> > #include <xsm/xsm.h> > @@ -265,7 +266,23 @@ static int assign_device(struct domain * > d->mem_event->paging.ring_page)) ) > return -EXDEV; > > - spin_lock(&pcidevs_lock); > + if ( !spin_trylock(&pcidevs_lock) ) > + return -ERESTART; > + > + if ( need_iommu(d) <= 0 ) > + { > + if ( !iommu_use_hap_pt(d) ) > + { > + rc = iommu_populate_page_table(d); > + if ( rc ) > + { > + spin_unlock(&pcidevs_lock); > + return rc; > + } > + } > + d->need_iommu = 1; > + } > + > pdev = pci_get_pdev_by_domain(dom0, seg, bus, devfn); > if ( !pdev ) > { > @@ -290,15 +307,14 @@ static int assign_device(struct domain * > rc); > } > > - if ( has_arch_pdevs(d) && !need_iommu(d) ) > + done: > + if ( !has_arch_pdevs(d) && need_iommu(d) ) > { > - d->need_iommu = 1; > - if ( !iommu_use_hap_pt(d) ) > - rc = iommu_populate_page_table(d); > - goto done; > + d->need_iommu = 0; > + hd->platform_ops->teardown(d); > } > -done: > spin_unlock(&pcidevs_lock); > + > return rc; > } > > @@ -306,12 +322,17 @@ static int iommu_populate_page_table(str > { > struct hvm_iommu *hd = domain_hvm_iommu(d); > struct page_info *page; > - int rc = 0; > + int rc = 0, n = 0; > + > + d->need_iommu = -1; > > this_cpu(iommu_dont_flush_iotlb) = 1; > spin_lock(&d->page_alloc_lock); > > - page_list_for_each ( page, &d->page_list ) > + if ( unlikely(d->is_dying) ) > + rc = -ESRCH; > + > + while ( !rc && (page = page_list_remove_head(&d->page_list)) ) > { > if ( is_hvm_domain(d) || > (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page ) > @@ -321,7 +342,32 @@ static int iommu_populate_page_table(str > d, mfn_to_gmfn(d, page_to_mfn(page)), page_to_mfn(page), > IOMMUF_readable|IOMMUF_writable); > if ( rc ) > + { > + page_list_add(page, &d->page_list); > break; > + } > + } > + 
page_list_add_tail(page, &d->arch.relmem_list); > + if ( !(++n & 0xff) && !page_list_empty(&d->page_list) &&Why the forced restart here? If nothing needs pre-empting, surely it is better to continue? Or is this about equality on the pcidevs_lock ?> + hypercall_preempt_check() ) > + rc = -ERESTART; > + } > + > + if ( !rc ) > + { > + /* > + * The expectation here is that generally there are many normal pages > + * on relmem_list (the ones we put there) and only few being in an > + * offline/broken state. The latter ones are always at the head of the > + * list. Hence we first move the whole list, and then move back the > + * first few entries. > + */ > + page_list_move(&d->page_list, &d->arch.relmem_list); > + while ( (page = page_list_first(&d->page_list)) != NULL && > + (page->count_info & (PGC_state|PGC_broken)) ) > + { > + page_list_del(page, &d->page_list); > + page_list_add_tail(page, &d->arch.relmem_list); > } > } > > @@ -330,8 +376,11 @@ static int iommu_populate_page_table(str > > if ( !rc ) > iommu_iotlb_flush_all(d); > - else > + else if ( rc != -ERESTART ) > + { > hd->platform_ops->teardown(d); > + d->need_iommu = 0; > + } > > return rc; > } > @@ -688,7 +737,10 @@ int iommu_do_domctl( > > ret = device_assigned(seg, bus, devfn) ?: > assign_device(d, seg, bus, devfn); > - if ( ret ) > + if ( ret == -ERESTART ) > + ret = hypercall_create_continuation(__HYPERVISOR_domctl, > + "h", u_domctl); > + else if ( ret ) > printk(XENLOG_G_ERR "XEN_DOMCTL_assign_device: " > "assign %04x:%02x:%02x.%u to dom%d failed (%d)\n", > seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), > --- a/xen/include/xen/sched.h > +++ b/xen/include/xen/sched.h > @@ -323,7 +323,7 @@ struct domain > > #ifdef HAS_PASSTHROUGH > /* Does this guest need iommu mappings? */ > - bool_t need_iommu; > + s8 need_iommu;I think this change from bool_t to s8 needs a comment explaining that -1 indicates "the iommu mappings are pending creation" Is there any particular reason that -ERESTART is used when -EAGAIN is the prevailing style for hypercall continuations? ~Andrew
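For context on the bool_t to s8 question: with the patch the field encodes three states rather than two (wording paraphrased from the patch logic and from the comment later added in v2):

/*
 *  d->need_iommu ==  1 : IOMMU mappings are set up and required
 *  d->need_iommu ==  0 : no IOMMU mappings needed
 *  d->need_iommu == -1 : mappings are being set up (population in progress,
 *                        possibly across a continuation), which is why
 *                        assign_device() tests need_iommu(d) <= 0 before
 *                        populating
 */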
Andrew Cooper
2013-Dec-11 19:01 UTC
Re: [PATCH 2/5] IOMMU: make page table deallocation preemptible
On 10/12/2013 15:46, Jan Beulich wrote:> This too can take an arbitrary amount of time. > > In fact, the bulk of the work is being moved to a tasklet, as handling > the necessary preemption logic in line seems close to impossible given > that the teardown may also be invoked on error paths. > > Signed-off-by: Jan Beulich <jbeulich@suse.com>There is a lot of duplicated code here. I would suggest making most of this infrastructure common, and having a new iommu_op for "void free_io_page_table(struct page_info*);" ~Andrew> > --- a/xen/drivers/passthrough/amd/pci_amd_iommu.c > +++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c > @@ -195,6 +195,8 @@ static int __init amd_iommu_setup_dom0_d > return 0; > } > > +static void deallocate_page_tables(unsigned long unused); > + > int __init amd_iov_detect(void) > { > INIT_LIST_HEAD(&amd_iommu_head); > @@ -215,6 +217,7 @@ int __init amd_iov_detect(void) > return -ENODEV; > } > > + tasklet_init(&iommu_pt_cleanup_tasklet, deallocate_page_tables, 0); > init_done = 1; > > if ( !amd_iommu_perdev_intremap ) > @@ -405,11 +408,21 @@ static int amd_iommu_assign_device(struc > return reassign_device(dom0, d, devfn, pdev); > } > > -static void deallocate_next_page_table(struct page_info* pg, int level) > +static void deallocate_next_page_table(struct page_info *pg, int level) > +{ > + spin_lock(&iommu_pt_cleanup_lock); > + PFN_ORDER(pg) = level; > + page_list_add_tail(pg, &iommu_pt_cleanup_list); > + spin_unlock(&iommu_pt_cleanup_lock); > +} > + > +static void deallocate_page_table(struct page_info *pg) > { > void *table_vaddr, *pde; > u64 next_table_maddr; > - int index, next_level; > + unsigned int index, level = PFN_ORDER(pg), next_level; > + > + PFN_ORDER(pg) = 0; > > if ( level <= 1 ) > { > @@ -439,6 +452,23 @@ static void deallocate_next_page_table(s > free_amd_iommu_pgtable(pg); > } > > +static void deallocate_page_tables(unsigned long unused) > +{ > + do { > + struct page_info *pg; > + > + spin_lock(&iommu_pt_cleanup_lock); > + pg = page_list_remove_head(&iommu_pt_cleanup_list); > + spin_unlock(&iommu_pt_cleanup_lock); > + if ( !pg ) > + return; > + deallocate_page_table(pg); > + } while ( !softirq_pending(smp_processor_id()) ); > + > + tasklet_schedule_on_cpu(&iommu_pt_cleanup_tasklet, > + cpumask_cycle(smp_processor_id(), &cpu_online_map)); > +} > + > static void deallocate_iommu_page_tables(struct domain *d) > { > struct hvm_iommu *hd = domain_hvm_iommu(d); > @@ -451,6 +481,7 @@ static void deallocate_iommu_page_tables > { > deallocate_next_page_table(hd->root_table, hd->paging_mode); > hd->root_table = NULL; > + tasklet_schedule(&iommu_pt_cleanup_tasklet); > } > spin_unlock(&hd->mapping_lock); > } > --- a/xen/drivers/passthrough/iommu.c > +++ b/xen/drivers/passthrough/iommu.c > @@ -58,6 +58,10 @@ bool_t __read_mostly amd_iommu_perdev_in > > DEFINE_PER_CPU(bool_t, iommu_dont_flush_iotlb); > > +DEFINE_SPINLOCK(iommu_pt_cleanup_lock); > +PAGE_LIST_HEAD(iommu_pt_cleanup_list); > +struct tasklet iommu_pt_cleanup_tasklet; > + > static struct keyhandler iommu_p2m_table = { > .diagnostic = 0, > .u.fn = iommu_dump_p2m_table, > --- a/xen/drivers/passthrough/vtd/iommu.c > +++ b/xen/drivers/passthrough/vtd/iommu.c > @@ -668,13 +668,24 @@ static void dma_pte_clear_one(struct dom > > static void iommu_free_pagetable(u64 pt_maddr, int level) > { > - int i; > - struct dma_pte *pt_vaddr, *pte; > - int next_level = level - 1; > + struct page_info *pg = maddr_to_page(pt_maddr); > > if ( pt_maddr == 0 ) > return; > > + spin_lock(&iommu_pt_cleanup_lock); > + PFN_ORDER(pg) 
= level; > + page_list_add_tail(pg, &iommu_pt_cleanup_list); > + spin_unlock(&iommu_pt_cleanup_lock); > +} > + > +static void iommu_free_page_table(struct page_info *pg) > +{ > + unsigned int i, next_level = PFN_ORDER(pg) - 1; > + u64 pt_maddr = page_to_maddr(pg); > + struct dma_pte *pt_vaddr, *pte; > + > + PFN_ORDER(pg) = 0; > pt_vaddr = (struct dma_pte *)map_vtd_domain_page(pt_maddr); > > for ( i = 0; i < PTE_NUM; i++ ) > @@ -694,6 +705,23 @@ static void iommu_free_pagetable(u64 pt_ > free_pgtable_maddr(pt_maddr); > } > > +static void iommu_free_pagetables(unsigned long unused) > +{ > + do { > + struct page_info *pg; > + > + spin_lock(&iommu_pt_cleanup_lock); > + pg = page_list_remove_head(&iommu_pt_cleanup_list); > + spin_unlock(&iommu_pt_cleanup_lock); > + if ( !pg ) > + return; > + iommu_free_page_table(pg); > + } while ( !softirq_pending(smp_processor_id()) ); > + > + tasklet_schedule_on_cpu(&iommu_pt_cleanup_tasklet, > + cpumask_cycle(smp_processor_id(), &cpu_online_map)); > +} > + > static int iommu_set_root_entry(struct iommu *iommu) > { > u32 sts; > @@ -1704,6 +1732,8 @@ void iommu_domain_teardown(struct domain > iommu_free_pagetable(hd->pgd_maddr, agaw_to_level(hd->agaw)); > hd->pgd_maddr = 0; > spin_unlock(&hd->mapping_lock); > + > + tasklet_schedule(&iommu_pt_cleanup_tasklet); > } > > static int intel_iommu_map_page( > @@ -2204,6 +2234,7 @@ int __init intel_vtd_setup(void) > if ( ret ) > goto error; > > + tasklet_init(&iommu_pt_cleanup_tasklet, iommu_free_pagetables, 0); > register_keyhandler(''V'', &dump_iommu_info_keyhandler); > > return 0; > --- a/xen/include/xen/iommu.h > +++ b/xen/include/xen/iommu.h > @@ -151,4 +151,8 @@ int adjust_vtd_irq_affinities(void); > */ > DECLARE_PER_CPU(bool_t, iommu_dont_flush_iotlb); > > +extern struct spinlock iommu_pt_cleanup_lock; > +extern struct page_list_head iommu_pt_cleanup_list; > +extern struct tasklet iommu_pt_cleanup_tasklet; > + > #endif /* _IOMMU_H_ */ > > > > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xen.org > http://lists.xen.org/xen-devel_______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
Jan Beulich
2013-Dec-13 09:51 UTC
Re: [PATCH 1/5] IOMMU: make page table population preemptible
>>> On 11.12.13 at 19:40, Andrew Cooper <andrew.cooper3@citrix.com> wrote: > On 10/12/2013 15:45, Jan Beulich wrote: >> @@ -321,7 +342,32 @@ static int iommu_populate_page_table(str >> d, mfn_to_gmfn(d, page_to_mfn(page)), page_to_mfn(page), >> IOMMUF_readable|IOMMUF_writable); >> if ( rc ) >> + { >> + page_list_add(page, &d->page_list); >> break; >> + } >> + } >> + page_list_add_tail(page, &d->arch.relmem_list); >> + if ( !(++n & 0xff) && !page_list_empty(&d->page_list) && > > Why the forced restart here? If nothing needs pre-empting, surely it is > better to continue? > > Or is this about equality on the pcidevs_lock ? > >> + hypercall_preempt_check() )Did you overlook this part of the condition?>> --- a/xen/include/xen/sched.h >> +++ b/xen/include/xen/sched.h >> @@ -323,7 +323,7 @@ struct domain >> >> #ifdef HAS_PASSTHROUGH >> /* Does this guest need iommu mappings? */ >> - bool_t need_iommu; >> + s8 need_iommu; > > I think this change from bool_t to s8 needs a comment explaining that -1 > indicates "the iommu mappings are pending creation"Will do.> Is there any particular reason that -ERESTART is used when -EAGAIN is > the prevailing style for hypercall continuations?I meanwhile realized that using -EAGAIN was a mistake (iirc taken from certain domctl-s having passed this back up to the caller to request re-invocation a long time ago) - -EAGAIN really has a different meaning, and hence we ought to switch all its current mis-uses to -ERESTART. Jan
Jan Beulich
2013-Dec-13 09:55 UTC
Re: [PATCH 2/5] IOMMU: make page table deallocation preemptible
>>> On 11.12.13 at 20:01, Andrew Cooper <andrew.cooper3@citrix.com> wrote: > On 10/12/2013 15:46, Jan Beulich wrote: >> This too can take an arbitrary amount of time. >> >> In fact, the bulk of the work is being moved to a tasklet, as handling >> the necessary preemption logic in line seems close to impossible given >> that the teardown may also be invoked on error paths. >> >> Signed-off-by: Jan Beulich <jbeulich@suse.com> > > There is a lot of duplicated code here. I would suggest making most of > this infrastructure common, and having a new iommu_op for "void > free_io_page_table(struct page_info*);"I guess that could be done - let me see how this works out. Jan
Andrew Cooper
2013-Dec-13 12:18 UTC
Re: [PATCH 1/5] IOMMU: make page table population preemptible
On 13/12/2013 09:51, Jan Beulich wrote:>>>> On 11.12.13 at 19:40, Andrew Cooper <andrew.cooper3@citrix.com> wrote: >> On 10/12/2013 15:45, Jan Beulich wrote: >>> @@ -321,7 +342,32 @@ static int iommu_populate_page_table(str >>> d, mfn_to_gmfn(d, page_to_mfn(page)), page_to_mfn(page), >>> IOMMUF_readable|IOMMUF_writable); >>> if ( rc ) >>> + { >>> + page_list_add(page, &d->page_list); >>> break; >>> + } >>> + } >>> + page_list_add_tail(page, &d->arch.relmem_list); >>> + if ( !(++n & 0xff) && !page_list_empty(&d->page_list) && >> Why the forced restart here? If nothing needs pre-empting, surely it is >> better to continue? >> >> Or is this about equality on the pcidevs_lock ? >> >>> + hypercall_preempt_check() ) > Did you overlook this part of the condition?No, but I did mentally get the logic inverted when trying to work out what was going on. How about (++n > 0xff) ? If we have already spent a while in this loop, and the hypercall_preempt_check() doesn''t flip to 1 until a few iterations after n is congruent with 0x100, waiting for another 0x100 iterations before checking again seems a little long.> >>> --- a/xen/include/xen/sched.h >>> +++ b/xen/include/xen/sched.h >>> @@ -323,7 +323,7 @@ struct domain >>> >>> #ifdef HAS_PASSTHROUGH >>> /* Does this guest need iommu mappings? */ >>> - bool_t need_iommu; >>> + s8 need_iommu; >> I think this change from bool_t to s8 needs a comment explaining that -1 >> indicates "the iommu mappings are pending creation" > Will do. > >> Is there any particular reason that -ERESTART is used when -EAGAIN is >> the prevailing style for hypercall continuations? > I meanwhile realized that using -EAGAIN was a mistake (iirc taken > from certain domctl-s having passed this back up to the caller to > request re-invocation a long time ago) - -EAGAIN really has a > different meaning, and hence we ought to switch all its current > mis-uses to -ERESTART. > > Jan >Ok. ~Andrew
Jan Beulich
2013-Dec-13 12:34 UTC
Re: [PATCH 1/5] IOMMU: make page table population preemptible
>>> On 13.12.13 at 13:18, Andrew Cooper <andrew.cooper3@citrix.com> wrote: > On 13/12/2013 09:51, Jan Beulich wrote: >>>>> On 11.12.13 at 19:40, Andrew Cooper <andrew.cooper3@citrix.com> wrote: >>> On 10/12/2013 15:45, Jan Beulich wrote: >>>> @@ -321,7 +342,32 @@ static int iommu_populate_page_table(str >>>> d, mfn_to_gmfn(d, page_to_mfn(page)), page_to_mfn(page), >>>> IOMMUF_readable|IOMMUF_writable); >>>> if ( rc ) >>>> + { >>>> + page_list_add(page, &d->page_list); >>>> break; >>>> + } >>>> + } >>>> + page_list_add_tail(page, &d->arch.relmem_list); >>>> + if ( !(++n & 0xff) && !page_list_empty(&d->page_list) && >>> Why the forced restart here? If nothing needs pre-empting, surely it is >>> better to continue? >>> >>> Or is this about equality on the pcidevs_lock ? >>> >>>> + hypercall_preempt_check() ) >> Did you overlook this part of the condition? > > No, but I did mentally get the logic inverted when trying to work out > what was going on. > > How about (++n > 0xff) ? > > If we have already spent a while in this loop, and the > hypercall_preempt_check() doesn''t flip to 1 until a few iterations after > n is congruent with 0x100, waiting for another 0x100 iterations before > checking again seems a little long.The thing I''m trying to avoid is calling hypercall_preempt_check() overly often - especially when the prior if()''s body doesn''t get entered we might otherwise do very little useful for per check. While 256 isn''t overly much (covering merely 1Mb of guest space), I could see two possibilities to get this a little more towards what you want: Either we right shift the mask each time we actually call hypercall_preempt_check(), or we bias the increment (adding e.g. 10 when going the expensive route, else 1). Jan
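As a concrete illustration of the second option mentioned above (hypothetical, not code that was posted), the loop could weight the counter by the cost of each iteration, so that the preemption check comes up sooner when real mapping work is being done; page_needs_mapping() stands in for the PGT_writable_page type check and the mapping itself is elided:

    while ( (page = page_list_remove_head(&d->page_list)) != NULL )
    {
        if ( page_needs_mapping(page) )  /* stand-in for the type check */
            n += 10;                     /* expensive: a ->map_page() call */
        else
            n += 1;                      /* cheap: page merely moved */

        /* ... map the page and move it to relmem_list as in the patch ... */

        if ( n >= 0x100 )
        {
            n = 0;
            if ( hypercall_preempt_check() )
            {
                rc = -ERESTART;
                break;
            }
        }
    }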
Tim Deegan
2013-Dec-13 13:46 UTC
Re: [PATCH 3/5] x86/p2m: restrict auditing to debug builds
At 15:47 +0000 on 10 Dec (1386686837), Jan Beulich wrote:> ... since iterating through all of a guest''s pages may take unduly > long. > > Signed-off-by: Jan Beulich <jbeulich@suse.com> > Acked-by: Ian Campbell <ian.campbell@citrix.com>Acked-by: Tim Deegan <tim@xen.org>
Andrew Cooper
2013-Dec-13 13:57 UTC
Re: [PATCH 1/5] IOMMU: make page table population preemptible
On 13/12/2013 12:34, Jan Beulich wrote:>>>> On 13.12.13 at 13:18, Andrew Cooper <andrew.cooper3@citrix.com> wrote: >> On 13/12/2013 09:51, Jan Beulich wrote: >>>>>> On 11.12.13 at 19:40, Andrew Cooper <andrew.cooper3@citrix.com> wrote: >>>> On 10/12/2013 15:45, Jan Beulich wrote: >>>>> @@ -321,7 +342,32 @@ static int iommu_populate_page_table(str >>>>> d, mfn_to_gmfn(d, page_to_mfn(page)), page_to_mfn(page), >>>>> IOMMUF_readable|IOMMUF_writable); >>>>> if ( rc ) >>>>> + { >>>>> + page_list_add(page, &d->page_list); >>>>> break; >>>>> + } >>>>> + } >>>>> + page_list_add_tail(page, &d->arch.relmem_list); >>>>> + if ( !(++n & 0xff) && !page_list_empty(&d->page_list) && >>>> Why the forced restart here? If nothing needs pre-empting, surely it is >>>> better to continue? >>>> >>>> Or is this about equality on the pcidevs_lock ? >>>> >>>>> + hypercall_preempt_check() ) >>> Did you overlook this part of the condition? >> No, but I did mentally get the logic inverted when trying to work out >> what was going on. >> >> How about (++n > 0xff) ? >> >> If we have already spent a while in this loop, and the >> hypercall_preempt_check() doesn''t flip to 1 until a few iterations after >> n is congruent with 0x100, waiting for another 0x100 iterations before >> checking again seems a little long. > The thing I''m trying to avoid is calling hypercall_preempt_check() > overly often - especially when the prior if()''s body doesn''t get > entered we might otherwise do very little useful for per check. > > While 256 isn''t overly much (covering merely 1Mb of guest space), > I could see two possibilities to get this a little more towards what > you want: Either we right shift the mask each time we actually > call hypercall_preempt_check(), or we bias the increment (adding > e.g. 10 when going the expensive route, else 1). > > JanI suppose it probably doesn''t matter too much. In the slim case where he loop does manage to get to 257 iterations before the preempt check returns true, doing another 254 iterations isn''t going to skew the timings too much. And the net time till completion will be fractionally shorter. ~Andrew
Jan Beulich
2013-Dec-13 13:59 UTC
[PATCH v2 1/5] IOMMU: make page table population preemptible
Since this can take an arbitrary amount of time, the rooting domctl as well as all involved code must become aware of this requiring a continuation. The subject domain''s rel_mem_list is being (ab)used for this, in a way similar to and compatible with broken page offlining. Further, operations get slightly re-ordered in assign_device(): IOMMU page tables now get set up _before_ the first device gets assigned, at once closing a small timing window in which the guest may already see the device but wouldn''t be able to access it. Signed-off-by: Jan Beulich <jbeulich@suse.com> --- v2: Extend comment on struct domain''s need_iommu field. --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -1924,6 +1924,12 @@ int domain_relinquish_resources(struct d } d->arch.relmem = RELMEM_xen; + + spin_lock(&d->page_alloc_lock); + page_list_splice(&d->arch.relmem_list, &d->page_list); + INIT_PAGE_LIST_HEAD(&d->arch.relmem_list); + spin_unlock(&d->page_alloc_lock); + /* Fallthrough. Relinquish every page of memory. */ case RELMEM_xen: ret = relinquish_memory(d, &d->xenpage_list, ~0UL); --- a/xen/arch/x86/mm/p2m-pod.c +++ b/xen/arch/x86/mm/p2m-pod.c @@ -459,7 +459,8 @@ p2m_pod_offline_or_broken_hit(struct pag pod_hit: lock_page_alloc(p2m); - page_list_add_tail(p, &d->arch.relmem_list); + /* Insertion must be at list head (see iommu_populate_page_table()). */ + page_list_add(p, &d->arch.relmem_list); unlock_page_alloc(p2m); pod_unlock(p2m); return 1; --- a/xen/drivers/passthrough/iommu.c +++ b/xen/drivers/passthrough/iommu.c @@ -18,6 +18,7 @@ #include <asm/hvm/iommu.h> #include <xen/paging.h> #include <xen/guest_access.h> +#include <xen/event.h> #include <xen/softirq.h> #include <xen/keyhandler.h> #include <xsm/xsm.h> @@ -265,7 +266,23 @@ static int assign_device(struct domain * d->mem_event->paging.ring_page)) ) return -EXDEV; - spin_lock(&pcidevs_lock); + if ( !spin_trylock(&pcidevs_lock) ) + return -ERESTART; + + if ( need_iommu(d) <= 0 ) + { + if ( !iommu_use_hap_pt(d) ) + { + rc = iommu_populate_page_table(d); + if ( rc ) + { + spin_unlock(&pcidevs_lock); + return rc; + } + } + d->need_iommu = 1; + } + pdev = pci_get_pdev_by_domain(dom0, seg, bus, devfn); if ( !pdev ) { @@ -290,15 +307,14 @@ static int assign_device(struct domain * rc); } - if ( has_arch_pdevs(d) && !need_iommu(d) ) + done: + if ( !has_arch_pdevs(d) && need_iommu(d) ) { - d->need_iommu = 1; - if ( !iommu_use_hap_pt(d) ) - rc = iommu_populate_page_table(d); - goto done; + d->need_iommu = 0; + hd->platform_ops->teardown(d); } -done: spin_unlock(&pcidevs_lock); + return rc; } @@ -306,12 +322,17 @@ static int iommu_populate_page_table(str { struct hvm_iommu *hd = domain_hvm_iommu(d); struct page_info *page; - int rc = 0; + int rc = 0, n = 0; + + d->need_iommu = -1; this_cpu(iommu_dont_flush_iotlb) = 1; spin_lock(&d->page_alloc_lock); - page_list_for_each ( page, &d->page_list ) + if ( unlikely(d->is_dying) ) + rc = -ESRCH; + + while ( !rc && (page = page_list_remove_head(&d->page_list)) ) { if ( is_hvm_domain(d) || (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page ) @@ -321,7 +342,32 @@ static int iommu_populate_page_table(str d, mfn_to_gmfn(d, page_to_mfn(page)), page_to_mfn(page), IOMMUF_readable|IOMMUF_writable); if ( rc ) + { + page_list_add(page, &d->page_list); break; + } + } + page_list_add_tail(page, &d->arch.relmem_list); + if ( !(++n & 0xff) && !page_list_empty(&d->page_list) && + hypercall_preempt_check() ) + rc = -ERESTART; + } + + if ( !rc ) + { + /* + * The expectation here is that generally there are many 
normal pages + * on relmem_list (the ones we put there) and only few being in an + * offline/broken state. The latter ones are always at the head of the + * list. Hence we first move the whole list, and then move back the + * first few entries. + */ + page_list_move(&d->page_list, &d->arch.relmem_list); + while ( (page = page_list_first(&d->page_list)) != NULL && + (page->count_info & (PGC_state|PGC_broken)) ) + { + page_list_del(page, &d->page_list); + page_list_add_tail(page, &d->arch.relmem_list); } } @@ -330,8 +376,11 @@ static int iommu_populate_page_table(str if ( !rc ) iommu_iotlb_flush_all(d); - else + else if ( rc != -ERESTART ) + { + d->need_iommu = 0; hd->platform_ops->teardown(d); + } return rc; } @@ -688,7 +737,10 @@ int iommu_do_domctl( ret = device_assigned(seg, bus, devfn) ?: assign_device(d, seg, bus, devfn); - if ( ret ) + if ( ret == -ERESTART ) + ret = hypercall_create_continuation(__HYPERVISOR_domctl, + "h", u_domctl); + else if ( ret ) printk(XENLOG_G_ERR "XEN_DOMCTL_assign_device: " "assign %04x:%02x:%02x.%u to dom%d failed (%d)\n", seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), --- a/xen/include/xen/sched.h +++ b/xen/include/xen/sched.h @@ -322,8 +322,8 @@ struct domain enum guest_type guest_type; #ifdef HAS_PASSTHROUGH - /* Does this guest need iommu mappings? */ - bool_t need_iommu; + /* Does this guest need iommu mappings (-1 meaning "being set up")? */ + s8 need_iommu; #endif /* is node-affinity automatically computed? */ bool_t auto_node_affinity; _______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
Jan Beulich
2013-Dec-13 14:00 UTC
[PATCH v2 2/5] IOMMU: make page table deallocation preemptible
This too can take an arbitrary amount of time. In fact, the bulk of the work is being moved to a tasklet, as handling the necessary preemption logic in line seems close to impossible given that the teardown may also be invoked on error paths. Signed-off-by: Jan Beulich <jbeulich@suse.com> --- v2: abstract out tasklet logic --- a/xen/drivers/passthrough/amd/pci_amd_iommu.c +++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c @@ -405,11 +405,21 @@ static int amd_iommu_assign_device(struc return reassign_device(dom0, d, devfn, pdev); } -static void deallocate_next_page_table(struct page_info* pg, int level) +static void deallocate_next_page_table(struct page_info *pg, int level) +{ + PFN_ORDER(pg) = level; + spin_lock(&iommu_pt_cleanup_lock); + page_list_add_tail(pg, &iommu_pt_cleanup_list); + spin_unlock(&iommu_pt_cleanup_lock); +} + +static void deallocate_page_table(struct page_info *pg) { void *table_vaddr, *pde; u64 next_table_maddr; - int index, next_level; + unsigned int index, level = PFN_ORDER(pg), next_level; + + PFN_ORDER(pg) = 0; if ( level <= 1 ) { @@ -599,6 +609,7 @@ const struct iommu_ops amd_iommu_ops = { .teardown = amd_iommu_domain_destroy, .map_page = amd_iommu_map_page, .unmap_page = amd_iommu_unmap_page, + .free_page_table = deallocate_page_table, .reassign_device = reassign_device, .get_device_group_id = amd_iommu_group_id, .update_ire_from_apic = amd_iommu_ioapic_update_ire, --- a/xen/drivers/passthrough/iommu.c +++ b/xen/drivers/passthrough/iommu.c @@ -58,6 +58,10 @@ bool_t __read_mostly amd_iommu_perdev_in DEFINE_PER_CPU(bool_t, iommu_dont_flush_iotlb); +DEFINE_SPINLOCK(iommu_pt_cleanup_lock); +PAGE_LIST_HEAD(iommu_pt_cleanup_list); +static struct tasklet iommu_pt_cleanup_tasklet; + static struct keyhandler iommu_p2m_table = { .diagnostic = 0, .u.fn = iommu_dump_p2m_table, @@ -235,6 +239,15 @@ int iommu_remove_device(struct pci_dev * return hd->platform_ops->remove_device(pdev->devfn, pdev); } +static void iommu_teardown(struct domain *d) +{ + const struct hvm_iommu *hd = domain_hvm_iommu(d); + + d->need_iommu = 0; + hd->platform_ops->teardown(d); + tasklet_schedule(&iommu_pt_cleanup_tasklet); +} + /* * If the device isn''t owned by dom0, it means it already * has been assigned to other domain, or it doesn''t exist. 
@@ -309,10 +322,7 @@ static int assign_device(struct domain * done: if ( !has_arch_pdevs(d) && need_iommu(d) ) - { - d->need_iommu = 0; - hd->platform_ops->teardown(d); - } + iommu_teardown(d); spin_unlock(&pcidevs_lock); return rc; @@ -377,10 +387,7 @@ static int iommu_populate_page_table(str if ( !rc ) iommu_iotlb_flush_all(d); else if ( rc != -ERESTART ) - { - d->need_iommu = 0; - hd->platform_ops->teardown(d); - } + iommu_teardown(d); return rc; } @@ -397,10 +404,7 @@ void iommu_domain_destroy(struct domain return; if ( need_iommu(d) ) - { - d->need_iommu = 0; - hd->platform_ops->teardown(d); - } + iommu_teardown(d); list_for_each_safe ( ioport_list, tmp, &hd->g2m_ioport_list ) { @@ -438,6 +442,23 @@ int iommu_unmap_page(struct domain *d, u return hd->platform_ops->unmap_page(d, gfn); } +static void iommu_free_pagetables(unsigned long unused) +{ + do { + struct page_info *pg; + + spin_lock(&iommu_pt_cleanup_lock); + pg = page_list_remove_head(&iommu_pt_cleanup_list); + spin_unlock(&iommu_pt_cleanup_lock); + if ( !pg ) + return; + iommu_get_ops()->free_page_table(pg); + } while ( !softirq_pending(smp_processor_id()) ); + + tasklet_schedule_on_cpu(&iommu_pt_cleanup_tasklet, + cpumask_cycle(smp_processor_id(), &cpu_online_map)); +} + void iommu_iotlb_flush(struct domain *d, unsigned long gfn, unsigned int page_count) { struct hvm_iommu *hd = domain_hvm_iommu(d); @@ -500,10 +521,7 @@ int deassign_device(struct domain *d, u1 pdev->fault.count = 0; if ( !has_arch_pdevs(d) && need_iommu(d) ) - { - d->need_iommu = 0; - hd->platform_ops->teardown(d); - } + iommu_teardown(d); return ret; } @@ -542,6 +560,7 @@ int __init iommu_setup(void) iommu_passthrough ? "Passthrough" : iommu_dom0_strict ? "Strict" : "Relaxed"); printk("Interrupt remapping %sabled\n", iommu_intremap ? 
"en" : "dis"); + tasklet_init(&iommu_pt_cleanup_tasklet, iommu_free_pagetables, 0); } return rc; --- a/xen/drivers/passthrough/vtd/iommu.c +++ b/xen/drivers/passthrough/vtd/iommu.c @@ -668,13 +668,24 @@ static void dma_pte_clear_one(struct dom static void iommu_free_pagetable(u64 pt_maddr, int level) { - int i; - struct dma_pte *pt_vaddr, *pte; - int next_level = level - 1; + struct page_info *pg = maddr_to_page(pt_maddr); if ( pt_maddr == 0 ) return; + PFN_ORDER(pg) = level; + spin_lock(&iommu_pt_cleanup_lock); + page_list_add_tail(pg, &iommu_pt_cleanup_list); + spin_unlock(&iommu_pt_cleanup_lock); +} + +static void iommu_free_page_table(struct page_info *pg) +{ + unsigned int i, next_level = PFN_ORDER(pg) - 1; + u64 pt_maddr = page_to_maddr(pg); + struct dma_pte *pt_vaddr, *pte; + + PFN_ORDER(pg) = 0; pt_vaddr = (struct dma_pte *)map_vtd_domain_page(pt_maddr); for ( i = 0; i < PTE_NUM; i++ ) @@ -2430,6 +2441,7 @@ const struct iommu_ops intel_iommu_ops .teardown = iommu_domain_teardown, .map_page = intel_iommu_map_page, .unmap_page = intel_iommu_unmap_page, + .free_page_table = iommu_free_page_table, .reassign_device = reassign_device_ownership, .get_device_group_id = intel_iommu_group_id, .update_ire_from_apic = io_apic_write_remap_rte, --- a/xen/include/xen/iommu.h +++ b/xen/include/xen/iommu.h @@ -88,6 +88,7 @@ bool_t pt_irq_need_timer(uint32_t flags) struct msi_desc; struct msi_msg; +struct page_info; struct iommu_ops { int (*init)(struct domain *d); @@ -100,6 +101,7 @@ struct iommu_ops { int (*map_page)(struct domain *d, unsigned long gfn, unsigned long mfn, unsigned int flags); int (*unmap_page)(struct domain *d, unsigned long gfn); + void (*free_page_table)(struct page_info *); int (*reassign_device)(struct domain *s, struct domain *t, u8 devfn, struct pci_dev *); int (*get_device_group_id)(u16 seg, u8 bus, u8 devfn); @@ -151,4 +153,7 @@ int adjust_vtd_irq_affinities(void); */ DECLARE_PER_CPU(bool_t, iommu_dont_flush_iotlb); +extern struct spinlock iommu_pt_cleanup_lock; +extern struct page_list_head iommu_pt_cleanup_list; + #endif /* _IOMMU_H_ */ _______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
Tim Deegan
2013-Dec-13 14:16 UTC
Re: [PATCH v2 1/5] IOMMU: make page table population preemptible
At 13:59 +0000 on 13 Dec (1386939579), Jan Beulich wrote:> Since this can take an arbitrary amount of time, the rooting domctl as > well as all involved code must become aware of this requiring a > continuation. > > The subject domain''s rel_mem_list is being (ab)used for this, in a way > similar to and compatible with broken page offlining. > > Further, operations get slightly re-ordered in assign_device(): IOMMU > page tables now get set up _before_ the first device gets assigned, at > once closing a small timing window in which the guest may already see > the device but wouldn''t be able to access it. > > Signed-off-by: Jan Beulich <jbeulich@suse.com>Acked-by: Tim Deegan <tim@xen.org>
Andrew Cooper
2013-Dec-13 15:09 UTC
Re: [PATCH v2 1/5] IOMMU: make page table population preemptible
On 13/12/2013 13:59, Jan Beulich wrote:> Since this can take an arbitrary amount of time, the rooting domctl as > well as all involved code must become aware of this requiring a > continuation. > > The subject domain''s rel_mem_list is being (ab)used for this, in a way > similar to and compatible with broken page offlining. > > Further, operations get slightly re-ordered in assign_device(): IOMMU > page tables now get set up _before_ the first device gets assigned, at > once closing a small timing window in which the guest may already see > the device but wouldn''t be able to access it. > > Signed-off-by: Jan Beulich <jbeulich@suse.com> > --- > v2: Extend comment on struct domain''s need_iommu field. > > --- a/xen/arch/x86/domain.c > +++ b/xen/arch/x86/domain.c > @@ -1924,6 +1924,12 @@ int domain_relinquish_resources(struct d > } > > d->arch.relmem = RELMEM_xen; > + > + spin_lock(&d->page_alloc_lock); > + page_list_splice(&d->arch.relmem_list, &d->page_list); > + INIT_PAGE_LIST_HEAD(&d->arch.relmem_list); > + spin_unlock(&d->page_alloc_lock); > + > /* Fallthrough. Relinquish every page of memory. */ > case RELMEM_xen: > ret = relinquish_memory(d, &d->xenpage_list, ~0UL); > --- a/xen/arch/x86/mm/p2m-pod.c > +++ b/xen/arch/x86/mm/p2m-pod.c > @@ -459,7 +459,8 @@ p2m_pod_offline_or_broken_hit(struct pag > > pod_hit: > lock_page_alloc(p2m); > - page_list_add_tail(p, &d->arch.relmem_list); > + /* Insertion must be at list head (see iommu_populate_page_table()). */ > + page_list_add(p, &d->arch.relmem_list); > unlock_page_alloc(p2m); > pod_unlock(p2m); > return 1; > --- a/xen/drivers/passthrough/iommu.c > +++ b/xen/drivers/passthrough/iommu.c > @@ -18,6 +18,7 @@ > #include <asm/hvm/iommu.h> > #include <xen/paging.h> > #include <xen/guest_access.h> > +#include <xen/event.h> > #include <xen/softirq.h> > #include <xen/keyhandler.h> > #include <xsm/xsm.h> > @@ -265,7 +266,23 @@ static int assign_device(struct domain * > d->mem_event->paging.ring_page)) ) > return -EXDEV; > > - spin_lock(&pcidevs_lock); > + if ( !spin_trylock(&pcidevs_lock) ) > + return -ERESTART; > + > + if ( need_iommu(d) <= 0 ) > + { > + if ( !iommu_use_hap_pt(d) ) > + { > + rc = iommu_populate_page_table(d); > + if ( rc ) > + { > + spin_unlock(&pcidevs_lock); > + return rc; > + } > + } > + d->need_iommu = 1; > + } > + > pdev = pci_get_pdev_by_domain(dom0, seg, bus, devfn); > if ( !pdev ) > { > @@ -290,15 +307,14 @@ static int assign_device(struct domain * > rc); > } > > - if ( has_arch_pdevs(d) && !need_iommu(d) ) > + done: > + if ( !has_arch_pdevs(d) && need_iommu(d) )We now have a case where, for the first device, we could set up pagetables for a large domain, get an error with assignment, then tear them all back down. (-EBUSY from pci_get_pdev() looks like a good non-fatal candidate for causing this behaviour) I am wondering whether this is better for worse than the race condition where a guest couldn''t use the device. A guest could not reasonably expect to use a device before the toolstack is done setting it up. A buggy toolstack could quite easily tie up a lot of Xen time creating and destroying complete iommu pagetable sets. 
~Andrew> { > - d->need_iommu = 1; > - if ( !iommu_use_hap_pt(d) ) > - rc = iommu_populate_page_table(d); > - goto done; > + d->need_iommu = 0; > + hd->platform_ops->teardown(d); > } > -done: > spin_unlock(&pcidevs_lock); > + > return rc; > } > > @@ -306,12 +322,17 @@ static int iommu_populate_page_table(str > { > struct hvm_iommu *hd = domain_hvm_iommu(d); > struct page_info *page; > - int rc = 0; > + int rc = 0, n = 0; > + > + d->need_iommu = -1; > > this_cpu(iommu_dont_flush_iotlb) = 1; > spin_lock(&d->page_alloc_lock); > > - page_list_for_each ( page, &d->page_list ) > + if ( unlikely(d->is_dying) ) > + rc = -ESRCH; > + > + while ( !rc && (page = page_list_remove_head(&d->page_list)) ) > { > if ( is_hvm_domain(d) || > (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page ) > @@ -321,7 +342,32 @@ static int iommu_populate_page_table(str > d, mfn_to_gmfn(d, page_to_mfn(page)), page_to_mfn(page), > IOMMUF_readable|IOMMUF_writable); > if ( rc ) > + { > + page_list_add(page, &d->page_list); > break; > + } > + } > + page_list_add_tail(page, &d->arch.relmem_list); > + if ( !(++n & 0xff) && !page_list_empty(&d->page_list) && > + hypercall_preempt_check() ) > + rc = -ERESTART; > + } > + > + if ( !rc ) > + { > + /* > + * The expectation here is that generally there are many normal pages > + * on relmem_list (the ones we put there) and only few being in an > + * offline/broken state. The latter ones are always at the head of the > + * list. Hence we first move the whole list, and then move back the > + * first few entries. > + */ > + page_list_move(&d->page_list, &d->arch.relmem_list); > + while ( (page = page_list_first(&d->page_list)) != NULL && > + (page->count_info & (PGC_state|PGC_broken)) ) > + { > + page_list_del(page, &d->page_list); > + page_list_add_tail(page, &d->arch.relmem_list); > } > } > > @@ -330,8 +376,11 @@ static int iommu_populate_page_table(str > > if ( !rc ) > iommu_iotlb_flush_all(d); > - else > + else if ( rc != -ERESTART ) > + { > + d->need_iommu = 0; > hd->platform_ops->teardown(d); > + } > > return rc; > } > @@ -688,7 +737,10 @@ int iommu_do_domctl( > > ret = device_assigned(seg, bus, devfn) ?: > assign_device(d, seg, bus, devfn); > - if ( ret ) > + if ( ret == -ERESTART ) > + ret = hypercall_create_continuation(__HYPERVISOR_domctl, > + "h", u_domctl); > + else if ( ret ) > printk(XENLOG_G_ERR "XEN_DOMCTL_assign_device: " > "assign %04x:%02x:%02x.%u to dom%d failed (%d)\n", > seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), > --- a/xen/include/xen/sched.h > +++ b/xen/include/xen/sched.h > @@ -322,8 +322,8 @@ struct domain > enum guest_type guest_type; > > #ifdef HAS_PASSTHROUGH > - /* Does this guest need iommu mappings? */ > - bool_t need_iommu; > + /* Does this guest need iommu mappings (-1 meaning "being set up")? */ > + s8 need_iommu; > #endif > /* is node-affinity automatically computed? */ > bool_t auto_node_affinity; > > > > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xen.org > http://lists.xen.org/xen-devel_______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
Andrew Cooper
2013-Dec-13 15:26 UTC
Re: [PATCH v2 2/5] IOMMU: make page table deallocation preemptible
On 13/12/2013 14:00, Jan Beulich wrote:
> This too can take an arbitrary amount of time.
>
> In fact, the bulk of the work is being moved to a tasklet, as handling
> the necessary preemption logic in line seems close to impossible given
> that the teardown may also be invoked on error paths.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

> ---
> v2: abstract out tasklet logic
>
> --- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
> +++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
> @@ -405,11 +405,21 @@ static int amd_iommu_assign_device(struc
> return reassign_device(dom0, d, devfn, pdev);
> }
>
> -static void deallocate_next_page_table(struct page_info* pg, int level)
> +static void deallocate_next_page_table(struct page_info *pg, int level)
> +{
> + PFN_ORDER(pg) = level;
> + spin_lock(&iommu_pt_cleanup_lock);
> + page_list_add_tail(pg, &iommu_pt_cleanup_list);
> + spin_unlock(&iommu_pt_cleanup_lock);
> +}
> +
> +static void deallocate_page_table(struct page_info *pg)
> {
> void *table_vaddr, *pde;
> u64 next_table_maddr;
> - int index, next_level;
> + unsigned int index, level = PFN_ORDER(pg), next_level;
> +
> + PFN_ORDER(pg) = 0;
>
> if ( level <= 1 )
> {
> @@ -599,6 +609,7 @@ const struct iommu_ops amd_iommu_ops = {
> .teardown = amd_iommu_domain_destroy,
> .map_page = amd_iommu_map_page,
> .unmap_page = amd_iommu_unmap_page,
> + .free_page_table = deallocate_page_table,
> .reassign_device = reassign_device,
> .get_device_group_id = amd_iommu_group_id,
> .update_ire_from_apic = amd_iommu_ioapic_update_ire,
> --- a/xen/drivers/passthrough/iommu.c
> +++ b/xen/drivers/passthrough/iommu.c
> @@ -58,6 +58,10 @@ bool_t __read_mostly amd_iommu_perdev_in
>
> DEFINE_PER_CPU(bool_t, iommu_dont_flush_iotlb);
>
> +DEFINE_SPINLOCK(iommu_pt_cleanup_lock);
> +PAGE_LIST_HEAD(iommu_pt_cleanup_list);
> +static struct tasklet iommu_pt_cleanup_tasklet;
> +
> static struct keyhandler iommu_p2m_table = {
> .diagnostic = 0,
> .u.fn = iommu_dump_p2m_table,
> @@ -235,6 +239,15 @@ int iommu_remove_device(struct pci_dev *
> return hd->platform_ops->remove_device(pdev->devfn, pdev);
> }
>
> +static void iommu_teardown(struct domain *d)
> +{
> + const struct hvm_iommu *hd = domain_hvm_iommu(d);
> +
> + d->need_iommu = 0;
> + hd->platform_ops->teardown(d);
> + tasklet_schedule(&iommu_pt_cleanup_tasklet);
> +}
> +
> /*
> * If the device isn't owned by dom0, it means it already
> * has been assigned to other domain, or it doesn't exist.
> @@ -309,10 +322,7 @@ static int assign_device(struct domain * > > done: > if ( !has_arch_pdevs(d) && need_iommu(d) ) > - { > - d->need_iommu = 0; > - hd->platform_ops->teardown(d); > - } > + iommu_teardown(d); > spin_unlock(&pcidevs_lock); > > return rc; > @@ -377,10 +387,7 @@ static int iommu_populate_page_table(str > if ( !rc ) > iommu_iotlb_flush_all(d); > else if ( rc != -ERESTART ) > - { > - d->need_iommu = 0; > - hd->platform_ops->teardown(d); > - } > + iommu_teardown(d); > > return rc; > } > @@ -397,10 +404,7 @@ void iommu_domain_destroy(struct domain > return; > > if ( need_iommu(d) ) > - { > - d->need_iommu = 0; > - hd->platform_ops->teardown(d); > - } > + iommu_teardown(d); > > list_for_each_safe ( ioport_list, tmp, &hd->g2m_ioport_list ) > { > @@ -438,6 +442,23 @@ int iommu_unmap_page(struct domain *d, u > return hd->platform_ops->unmap_page(d, gfn); > } > > +static void iommu_free_pagetables(unsigned long unused) > +{ > + do { > + struct page_info *pg; > + > + spin_lock(&iommu_pt_cleanup_lock); > + pg = page_list_remove_head(&iommu_pt_cleanup_list); > + spin_unlock(&iommu_pt_cleanup_lock); > + if ( !pg ) > + return; > + iommu_get_ops()->free_page_table(pg); > + } while ( !softirq_pending(smp_processor_id()) ); > + > + tasklet_schedule_on_cpu(&iommu_pt_cleanup_tasklet, > + cpumask_cycle(smp_processor_id(), &cpu_online_map)); > +} > + > void iommu_iotlb_flush(struct domain *d, unsigned long gfn, unsigned int page_count) > { > struct hvm_iommu *hd = domain_hvm_iommu(d); > @@ -500,10 +521,7 @@ int deassign_device(struct domain *d, u1 > pdev->fault.count = 0; > > if ( !has_arch_pdevs(d) && need_iommu(d) ) > - { > - d->need_iommu = 0; > - hd->platform_ops->teardown(d); > - } > + iommu_teardown(d); > > return ret; > } > @@ -542,6 +560,7 @@ int __init iommu_setup(void) > iommu_passthrough ? "Passthrough" : > iommu_dom0_strict ? "Strict" : "Relaxed"); > printk("Interrupt remapping %sabled\n", iommu_intremap ? 
"en" : "dis"); > + tasklet_init(&iommu_pt_cleanup_tasklet, iommu_free_pagetables, 0); > } > > return rc; > --- a/xen/drivers/passthrough/vtd/iommu.c > +++ b/xen/drivers/passthrough/vtd/iommu.c > @@ -668,13 +668,24 @@ static void dma_pte_clear_one(struct dom > > static void iommu_free_pagetable(u64 pt_maddr, int level) > { > - int i; > - struct dma_pte *pt_vaddr, *pte; > - int next_level = level - 1; > + struct page_info *pg = maddr_to_page(pt_maddr); > > if ( pt_maddr == 0 ) > return; > > + PFN_ORDER(pg) = level; > + spin_lock(&iommu_pt_cleanup_lock); > + page_list_add_tail(pg, &iommu_pt_cleanup_list); > + spin_unlock(&iommu_pt_cleanup_lock); > +} > + > +static void iommu_free_page_table(struct page_info *pg) > +{ > + unsigned int i, next_level = PFN_ORDER(pg) - 1; > + u64 pt_maddr = page_to_maddr(pg); > + struct dma_pte *pt_vaddr, *pte; > + > + PFN_ORDER(pg) = 0; > pt_vaddr = (struct dma_pte *)map_vtd_domain_page(pt_maddr); > > for ( i = 0; i < PTE_NUM; i++ ) > @@ -2430,6 +2441,7 @@ const struct iommu_ops intel_iommu_ops > .teardown = iommu_domain_teardown, > .map_page = intel_iommu_map_page, > .unmap_page = intel_iommu_unmap_page, > + .free_page_table = iommu_free_page_table, > .reassign_device = reassign_device_ownership, > .get_device_group_id = intel_iommu_group_id, > .update_ire_from_apic = io_apic_write_remap_rte, > --- a/xen/include/xen/iommu.h > +++ b/xen/include/xen/iommu.h > @@ -88,6 +88,7 @@ bool_t pt_irq_need_timer(uint32_t flags) > > struct msi_desc; > struct msi_msg; > +struct page_info; > > struct iommu_ops { > int (*init)(struct domain *d); > @@ -100,6 +101,7 @@ struct iommu_ops { > int (*map_page)(struct domain *d, unsigned long gfn, unsigned long mfn, > unsigned int flags); > int (*unmap_page)(struct domain *d, unsigned long gfn); > + void (*free_page_table)(struct page_info *); > int (*reassign_device)(struct domain *s, struct domain *t, > u8 devfn, struct pci_dev *); > int (*get_device_group_id)(u16 seg, u8 bus, u8 devfn); > @@ -151,4 +153,7 @@ int adjust_vtd_irq_affinities(void); > */ > DECLARE_PER_CPU(bool_t, iommu_dont_flush_iotlb); > > +extern struct spinlock iommu_pt_cleanup_lock; > +extern struct page_list_head iommu_pt_cleanup_list; > + > #endif /* _IOMMU_H_ */ > >
Jan Beulich
2013-Dec-13 15:41 UTC
Re: [PATCH v2 1/5] IOMMU: make page table population preemptible
>>> On 13.12.13 at 16:09, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
> On 13/12/2013 13:59, Jan Beulich wrote:
>> @@ -290,15 +307,14 @@ static int assign_device(struct domain *
>> rc);
>> }
>>
>> - if ( has_arch_pdevs(d) && !need_iommu(d) )
>> + done:
>> + if ( !has_arch_pdevs(d) && need_iommu(d) )
>
> We now have a case where, for the first device, we could set up
> pagetables for a large domain, get an error with assignment, then tear
> them all back down.  (-EBUSY from pci_get_pdev() looks like a good
> non-fatal candidate for causing this behaviour)
>
> I am wondering whether this is better or worse than the race condition
> where a guest couldn't use the device.  A guest could not reasonably
> expect to use a device before the toolstack is done setting it up.  A
> buggy toolstack could quite easily tie up a lot of Xen time creating and
> destroying complete iommu pagetable sets.

I don't think it's worth worrying about buggy tool stacks here -
they should simply get fixed.

Furthermore this change in operation ordering is only a (nice)
side effect, the necessary cleanup seemed easier with the
order changed. And said time window would have grown with
the added preemption handling.

Jan
Andrew Cooper
2013-Dec-13 15:44 UTC
Re: [PATCH v2 1/5] IOMMU: make page table population preemptible
On 13/12/2013 15:41, Jan Beulich wrote:
>>>> On 13.12.13 at 16:09, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
>> On 13/12/2013 13:59, Jan Beulich wrote:
>>> @@ -290,15 +307,14 @@ static int assign_device(struct domain *
>>> rc);
>>> }
>>>
>>> - if ( has_arch_pdevs(d) && !need_iommu(d) )
>>> + done:
>>> + if ( !has_arch_pdevs(d) && need_iommu(d) )
>> We now have a case where, for the first device, we could set up
>> pagetables for a large domain, get an error with assignment, then tear
>> them all back down.  (-EBUSY from pci_get_pdev() looks like a good
>> non-fatal candidate for causing this behaviour)
>>
>> I am wondering whether this is better or worse than the race condition
>> where a guest couldn't use the device.  A guest could not reasonably
>> expect to use a device before the toolstack is done setting it up.  A
>> buggy toolstack could quite easily tie up a lot of Xen time creating and
>> destroying complete iommu pagetable sets.
> I don't think it's worth worrying about buggy tool stacks here -
> they should simply get fixed.
>
> Furthermore this change in operation ordering is only a (nice)
> side effect, the necessary cleanup seemed easier with the
> order changed. And said time window would have grown with
> the added preemption handling.
>
> Jan
>

All true.

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>