Jan Beulich
2011-Dec-12 15:53 UTC
[PATCH] VT-d: bind IRQs to CPUs local to the node the IOMMU is on
This extends create_irq() to take a node parameter, allowing the resulting IRQ to have its destination set to a CPU on that node right away, which is more natural than having to post-adjust this (and get e.g. a new IRQ vector assigned despite a fresh one was just obtained). All other callers of create_irq() pass NUMA_NO_NODE for the time being. Signed-off-by: Jan Beulich <jbeulich@suse.com> --- 2011-12-12.orig/xen/arch/x86/hpet.c 2011-11-11 09:40:55.000000000 +0100 +++ 2011-12-12/xen/arch/x86/hpet.c 2011-12-06 16:36:24.000000000 +0100 @@ -11,6 +11,7 @@ #include <xen/smp.h> #include <xen/softirq.h> #include <xen/irq.h> +#include <xen/numa.h> #include <asm/fixmap.h> #include <asm/div64.h> #include <asm/hpet.h> @@ -334,7 +335,7 @@ static int __init hpet_assign_irq(unsign { int irq; - if ( (irq = create_irq()) < 0 ) + if ( (irq = create_irq(NUMA_NO_NODE)) < 0 ) return irq; if ( hpet_setup_msi_irq(irq, hpet_events + idx) ) --- 2011-12-12.orig/xen/arch/x86/io_apic.c 2011-11-22 10:57:09.000000000 +0100 +++ 2011-12-12/xen/arch/x86/io_apic.c 2011-12-12 15:24:10.000000000 +0100 @@ -995,7 +995,7 @@ static void __init setup_IO_APIC_irqs(vo continue; if (IO_APIC_IRQ(irq)) { - vector = assign_irq_vector(irq); + vector = assign_irq_vector(irq, NULL); BUG_ON(vector < 0); entry.vector = vector; ioapic_register_intr(irq, IOAPIC_AUTO); @@ -2188,7 +2188,7 @@ int io_apic_set_pci_routing (int ioapic, if (!platform_legacy_irq(irq)) add_pin_to_irq(irq, ioapic, pin); - vector = assign_irq_vector(irq); + vector = assign_irq_vector(irq, NULL); if (vector < 0) return vector; entry.vector = vector; @@ -2340,7 +2340,7 @@ int ioapic_guest_write(unsigned long phy if ( desc->arch.vector <= 0 || desc->arch.vector > LAST_DYNAMIC_VECTOR ) { add_pin_to_irq(irq, apic, pin); - vector = assign_irq_vector(irq); + vector = assign_irq_vector(irq, NULL); if ( vector < 0 ) return vector; --- 2011-12-12.orig/xen/arch/x86/irq.c 2011-12-12 09:01:34.000000000 +0100 +++ 2011-12-12/xen/arch/x86/irq.c 2011-12-12 
15:24:15.000000000 +0100 @@ -151,7 +151,7 @@ int __init bind_irq_vector(int irq, int /* * Dynamic irq allocate and deallocation for MSI */ -int create_irq(void) +int create_irq(int node) { int irq, ret; struct irq_desc *desc; @@ -168,7 +168,17 @@ int create_irq(void) ret = init_one_irq_desc(desc); if (!ret) - ret = assign_irq_vector(irq); + { + cpumask_t *mask = NULL; + + if (node != NUMA_NO_NODE && node >= 0) + { + mask = &node_to_cpumask(node); + if (cpumask_empty(mask)) + mask = NULL; + } + ret = assign_irq_vector(irq, mask); + } if (ret < 0) { desc->arch.used = IRQ_UNUSED; @@ -514,7 +524,7 @@ next: return err; } -int assign_irq_vector(int irq) +int assign_irq_vector(int irq, const cpumask_t *mask) { int ret; unsigned long flags; @@ -523,7 +533,7 @@ int assign_irq_vector(int irq) BUG_ON(irq >= nr_irqs || irq <0); spin_lock_irqsave(&vector_lock, flags); - ret = __assign_irq_vector(irq, desc, TARGET_CPUS); + ret = __assign_irq_vector(irq, desc, mask ?: TARGET_CPUS); if (!ret) { ret = desc->arch.vector; cpumask_copy(desc->affinity, desc->arch.cpu_mask); --- 2011-12-12.orig/xen/arch/x86/physdev.c 2011-12-12 09:01:34.000000000 +0100 +++ 2011-12-12/xen/arch/x86/physdev.c 2011-12-06 16:28:10.000000000 +0100 @@ -132,7 +132,7 @@ int physdev_map_pirq(domid_t domid, int case MAP_PIRQ_TYPE_MSI: irq = *index; if ( irq == -1 ) - irq = create_irq(); + irq = create_irq(NUMA_NO_NODE); if ( irq < 0 || irq >= nr_irqs ) { --- 2011-12-12.orig/xen/drivers/passthrough/amd/iommu_init.c 2011-11-29 09:19:05.000000000 +0100 +++ 2011-12-12/xen/drivers/passthrough/amd/iommu_init.c 2011-12-06 16:27:49.000000000 +0100 @@ -551,7 +551,7 @@ static int __init set_iommu_interrupt_ha { int irq, ret; - irq = create_irq(); + irq = create_irq(NUMA_NO_NODE); if ( irq <= 0 ) { dprintk(XENLOG_ERR, "IOMMU: no irqs\n"); --- 2011-12-12.orig/xen/drivers/passthrough/vtd/iommu.c 2011-11-22 10:57:10.000000000 +0100 +++ 2011-12-12/xen/drivers/passthrough/vtd/iommu.c 2011-12-06 16:35:42.000000000 +0100 @@ 
-1096,11 +1096,13 @@ static hw_irq_controller dma_msi_type = .set_affinity = dma_msi_set_affinity, }; -static int __init iommu_set_interrupt(struct iommu *iommu) +static int __init iommu_set_interrupt(struct acpi_drhd_unit *drhd) { int irq, ret; + struct acpi_rhsa_unit *rhsa = drhd_to_rhsa(drhd); - irq = create_irq(); + irq = create_irq(rhsa ? pxm_to_node(rhsa->proximity_domain) + : NUMA_NO_NODE); if ( irq <= 0 ) { dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: no irq available!\n"); @@ -1109,9 +1111,9 @@ static int __init iommu_set_interrupt(st irq_desc[irq].handler = &dma_msi_type; #ifdef CONFIG_X86 - ret = request_irq(irq, iommu_page_fault, 0, "dmar", iommu); + ret = request_irq(irq, iommu_page_fault, 0, "dmar", drhd->iommu); #else - ret = request_irq_vector(irq, iommu_page_fault, 0, "dmar", iommu); + ret = request_irq_vector(irq, iommu_page_fault, 0, "dmar", drhd->iommu); #endif if ( ret ) { @@ -2133,7 +2135,7 @@ int __init intel_vtd_setup(void) if ( !vtd_ept_page_compatible(iommu) ) iommu_hap_pt_share = 0; - ret = iommu_set_interrupt(iommu); + ret = iommu_set_interrupt(drhd); if ( ret < 0 ) { dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: interrupt setup failed\n"); --- 2011-12-12.orig/xen/include/asm-x86/io_apic.h 2011-10-04 08:22:17.000000000 +0200 +++ 2011-12-12/xen/include/asm-x86/io_apic.h 2011-12-06 16:24:22.000000000 +0100 @@ -213,9 +213,6 @@ static inline void ioapic_suspend(void) static inline void ioapic_resume(void) {} #endif -extern int assign_irq_vector(int irq); -extern int free_irq_vector(int vector); - unsigned highest_gsi(void); int ioapic_guest_read( unsigned long physbase, unsigned int reg, u32 *pval); --- 2011-12-12.orig/xen/include/asm-x86/irq.h 2011-11-22 10:57:10.000000000 +0100 +++ 2011-12-12/xen/include/asm-x86/irq.h 2011-12-06 16:25:22.000000000 +0100 @@ -155,8 +155,9 @@ int init_irq_data(void); void clear_irq_vector(int irq); int irq_to_vector(int irq); -int create_irq(void); +int create_irq(int node); void destroy_irq(unsigned int irq); +int 
assign_irq_vector(int irq, const cpumask_t *); extern void irq_complete_move(struct irq_desc *); _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Keir Fraser
2011-Dec-12 17:50 UTC
Re: [PATCH] VT-d: bind IRQs to CPUs local to the node the IOMMU is on
On 12/12/2011 15:53, "Jan Beulich" <JBeulich@suse.com> wrote:> This extends create_irq() to take a node parameter, allowing the > resulting IRQ to have its destination set to a CPU on that node right > away, which is more natural than having to post-adjust this (and > get e.g. a new IRQ vector assigned despite a fresh one was just > obtained). > > All other callers of create_irq() pass NUMA_NO_NODE for the time being.I don''t know about this one. Does the current ''inefficient'' way things work really matter? -- Keir> Signed-off-by: Jan Beulich <jbeulich@suse.com> > > --- 2011-12-12.orig/xen/arch/x86/hpet.c 2011-11-11 09:40:55.000000000 +0100 > +++ 2011-12-12/xen/arch/x86/hpet.c 2011-12-06 16:36:24.000000000 +0100 > @@ -11,6 +11,7 @@ > #include <xen/smp.h> > #include <xen/softirq.h> > #include <xen/irq.h> > +#include <xen/numa.h> > #include <asm/fixmap.h> > #include <asm/div64.h> > #include <asm/hpet.h> > @@ -334,7 +335,7 @@ static int __init hpet_assign_irq(unsign > { > int irq; > > - if ( (irq = create_irq()) < 0 ) > + if ( (irq = create_irq(NUMA_NO_NODE)) < 0 ) > return irq; > > if ( hpet_setup_msi_irq(irq, hpet_events + idx) ) > --- 2011-12-12.orig/xen/arch/x86/io_apic.c 2011-11-22 10:57:09.000000000 +0100 > +++ 2011-12-12/xen/arch/x86/io_apic.c 2011-12-12 15:24:10.000000000 +0100 > @@ -995,7 +995,7 @@ static void __init setup_IO_APIC_irqs(vo > continue; > > if (IO_APIC_IRQ(irq)) { > - vector = assign_irq_vector(irq); > + vector = assign_irq_vector(irq, NULL); > BUG_ON(vector < 0); > entry.vector = vector; > ioapic_register_intr(irq, IOAPIC_AUTO); > @@ -2188,7 +2188,7 @@ int io_apic_set_pci_routing (int ioapic, > if (!platform_legacy_irq(irq)) > add_pin_to_irq(irq, ioapic, pin); > > - vector = assign_irq_vector(irq); > + vector = assign_irq_vector(irq, NULL); > if (vector < 0) > return vector; > entry.vector = vector; > @@ -2340,7 +2340,7 @@ int ioapic_guest_write(unsigned long phy > > if ( desc->arch.vector <= 0 || desc->arch.vector > LAST_DYNAMIC_VECTOR ) > { 
> add_pin_to_irq(irq, apic, pin); > - vector = assign_irq_vector(irq); > + vector = assign_irq_vector(irq, NULL); > if ( vector < 0 ) > return vector; > > --- 2011-12-12.orig/xen/arch/x86/irq.c 2011-12-12 09:01:34.000000000 +0100 > +++ 2011-12-12/xen/arch/x86/irq.c 2011-12-12 15:24:15.000000000 +0100 > @@ -151,7 +151,7 @@ int __init bind_irq_vector(int irq, int > /* > * Dynamic irq allocate and deallocation for MSI > */ > -int create_irq(void) > +int create_irq(int node) > { > int irq, ret; > struct irq_desc *desc; > @@ -168,7 +168,17 @@ int create_irq(void) > > ret = init_one_irq_desc(desc); > if (!ret) > - ret = assign_irq_vector(irq); > + { > + cpumask_t *mask = NULL; > + > + if (node != NUMA_NO_NODE && node >= 0) > + { > + mask = &node_to_cpumask(node); > + if (cpumask_empty(mask)) > + mask = NULL; > + } > + ret = assign_irq_vector(irq, mask); > + } > if (ret < 0) > { > desc->arch.used = IRQ_UNUSED; > @@ -514,7 +524,7 @@ next: > return err; > } > > -int assign_irq_vector(int irq) > +int assign_irq_vector(int irq, const cpumask_t *mask) > { > int ret; > unsigned long flags; > @@ -523,7 +533,7 @@ int assign_irq_vector(int irq) > BUG_ON(irq >= nr_irqs || irq <0); > > spin_lock_irqsave(&vector_lock, flags); > - ret = __assign_irq_vector(irq, desc, TARGET_CPUS); > + ret = __assign_irq_vector(irq, desc, mask ?: TARGET_CPUS); > if (!ret) { > ret = desc->arch.vector; > cpumask_copy(desc->affinity, desc->arch.cpu_mask); > --- 2011-12-12.orig/xen/arch/x86/physdev.c 2011-12-12 09:01:34.000000000 +0100 > +++ 2011-12-12/xen/arch/x86/physdev.c 2011-12-06 16:28:10.000000000 +0100 > @@ -132,7 +132,7 @@ int physdev_map_pirq(domid_t domid, int > case MAP_PIRQ_TYPE_MSI: > irq = *index; > if ( irq == -1 ) > - irq = create_irq(); > + irq = create_irq(NUMA_NO_NODE); > > if ( irq < 0 || irq >= nr_irqs ) > { > --- 2011-12-12.orig/xen/drivers/passthrough/amd/iommu_init.c 2011-11-29 > 09:19:05.000000000 +0100 > +++ 2011-12-12/xen/drivers/passthrough/amd/iommu_init.c 2011-12-06 > 
16:27:49.000000000 +0100 > @@ -551,7 +551,7 @@ static int __init set_iommu_interrupt_ha > { > int irq, ret; > > - irq = create_irq(); > + irq = create_irq(NUMA_NO_NODE); > if ( irq <= 0 ) > { > dprintk(XENLOG_ERR, "IOMMU: no irqs\n"); > --- 2011-12-12.orig/xen/drivers/passthrough/vtd/iommu.c 2011-11-22 > 10:57:10.000000000 +0100 > +++ 2011-12-12/xen/drivers/passthrough/vtd/iommu.c 2011-12-06 > 16:35:42.000000000 +0100 > @@ -1096,11 +1096,13 @@ static hw_irq_controller dma_msi_type > .set_affinity = dma_msi_set_affinity, > }; > > -static int __init iommu_set_interrupt(struct iommu *iommu) > +static int __init iommu_set_interrupt(struct acpi_drhd_unit *drhd) > { > int irq, ret; > + struct acpi_rhsa_unit *rhsa = drhd_to_rhsa(drhd); > > - irq = create_irq(); > + irq = create_irq(rhsa ? pxm_to_node(rhsa->proximity_domain) > + : NUMA_NO_NODE); > if ( irq <= 0 ) > { > dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: no irq available!\n"); > @@ -1109,9 +1111,9 @@ static int __init iommu_set_interrupt(st > > irq_desc[irq].handler = &dma_msi_type; > #ifdef CONFIG_X86 > - ret = request_irq(irq, iommu_page_fault, 0, "dmar", iommu); > + ret = request_irq(irq, iommu_page_fault, 0, "dmar", drhd->iommu); > #else > - ret = request_irq_vector(irq, iommu_page_fault, 0, "dmar", iommu); > + ret = request_irq_vector(irq, iommu_page_fault, 0, "dmar", drhd->iommu); > #endif > if ( ret ) > { > @@ -2133,7 +2135,7 @@ int __init intel_vtd_setup(void) > if ( !vtd_ept_page_compatible(iommu) ) > iommu_hap_pt_share = 0; > > - ret = iommu_set_interrupt(iommu); > + ret = iommu_set_interrupt(drhd); > if ( ret < 0 ) > { > dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: interrupt setup failed\n"); > --- 2011-12-12.orig/xen/include/asm-x86/io_apic.h 2011-10-04 > 08:22:17.000000000 +0200 > +++ 2011-12-12/xen/include/asm-x86/io_apic.h 2011-12-06 16:24:22.000000000 > +0100 > @@ -213,9 +213,6 @@ static inline void ioapic_suspend(void) > static inline void ioapic_resume(void) {} > #endif > > -extern int assign_irq_vector(int 
irq); > -extern int free_irq_vector(int vector); > - > unsigned highest_gsi(void); > > int ioapic_guest_read( unsigned long physbase, unsigned int reg, u32 *pval); > --- 2011-12-12.orig/xen/include/asm-x86/irq.h 2011-11-22 10:57:10.000000000 > +0100 > +++ 2011-12-12/xen/include/asm-x86/irq.h 2011-12-06 16:25:22.000000000 +0100 > @@ -155,8 +155,9 @@ int init_irq_data(void); > void clear_irq_vector(int irq); > > int irq_to_vector(int irq); > -int create_irq(void); > +int create_irq(int node); > void destroy_irq(unsigned int irq); > +int assign_irq_vector(int irq, const cpumask_t *); > > extern void irq_complete_move(struct irq_desc *); > > > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xensource.com > http://lists.xensource.com/xen-devel
Jan Beulich
2011-Dec-13 08:23 UTC
Re: [PATCH] VT-d: bind IRQs to CPUs local to the node the IOMMU is on
>>> On 12.12.11 at 18:50, Keir Fraser <keir@xen.org> wrote: > On 12/12/2011 15:53, "Jan Beulich" <JBeulich@suse.com> wrote: > >> This extends create_irq() to take a node parameter, allowing the >> resulting IRQ to have its destination set to a CPU on that node right >> away, which is more natural than having to post-adjust this (and >> get e.g. a new IRQ vector assigned despite a fresh one was just >> obtained). >> >> All other callers of create_irq() pass NUMA_NO_NODE for the time being. > > I don't know about this one. Does the current 'inefficient' way things work > really matter? That depends on the NUMA interconnect. My general perspective on this is that whenever NUMA locality information is available, we should aim at making use of it (unless it conflicts with something else*). And there's certainly ways to go in this respect. * When coming up with this, I actually looked at whether using the proximity information now passed down by Dom0 for PCI devices could be used for properly binding at least MSI interrupts. That didn't turn out reasonable, since we're already setting the IRQ affinity to match the pCPU the target vCPU is running on (which is likely providing greater benefit, as this allows avoiding IPIs; the efficiency of this can certainly be tweaked - I'm meanwhile thinking that we might be overly aggressive here, but that's to some part related to the scheduler apparently migrating vCPU-s more often than really desirable). But for Xen internal interrupts (like in the case of the IOMMU ones) this clearly ought to be done when possible. For the AMD case I just wasn't able to spot whether locality information is available. Jan
Keir Fraser
2011-Dec-13 08:31 UTC
Re: [PATCH] VT-d: bind IRQs to CPUs local to the node the IOMMU is on
On 13/12/2011 08:23, "Jan Beulich" <JBeulich@suse.com> wrote: >>>> On 12.12.11 at 18:50, Keir Fraser <keir@xen.org> wrote: >> On 12/12/2011 15:53, "Jan Beulich" <JBeulich@suse.com> wrote: >> >>> This extends create_irq() to take a node parameter, allowing the >>> resulting IRQ to have its destination set to a CPU on that node right >>> away, which is more natural than having to post-adjust this (and >>> get e.g. a new IRQ vector assigned despite a fresh one was just >>> obtained). >>> >>> All other callers of create_irq() pass NUMA_NO_NODE for the time being. >> >> I don't know about this one. Does the current 'inefficient' way things work >> really matter? > > The depends on the NUMA interconnect. My general perspective on > this is that whenever NUMA locality information is available, we should > aim at making use of it (unless it conflicts with something else*). And > there's certainly ways to go in this respect. Oh, I misunderstood the patch comment, and thought we *were* already post-adjusting the affinity. But we're not, so this patch does make sense. Acked-by: Keir Fraser <keir@xen.org>