Jan Beulich
2011-Dec-12 15:53 UTC
[PATCH] VT-d: bind IRQs to CPUs local to the node the IOMMU is on
This extends create_irq() to take a node parameter, allowing the resulting IRQ to have its destination set to a CPU on that node right away, which is more natural than having to post-adjust this (and get e.g. a new IRQ vector assigned despite a fresh one was just obtained). All other callers of create_irq() pass NUMA_NO_NODE for the time being. Signed-off-by: Jan Beulich <jbeulich@suse.com> --- 2011-12-12.orig/xen/arch/x86/hpet.c 2011-11-11 09:40:55.000000000 +0100 +++ 2011-12-12/xen/arch/x86/hpet.c 2011-12-06 16:36:24.000000000 +0100 @@ -11,6 +11,7 @@ #include <xen/smp.h> #include <xen/softirq.h> #include <xen/irq.h> +#include <xen/numa.h> #include <asm/fixmap.h> #include <asm/div64.h> #include <asm/hpet.h> @@ -334,7 +335,7 @@ static int __init hpet_assign_irq(unsign { int irq; - if ( (irq = create_irq()) < 0 ) + if ( (irq = create_irq(NUMA_NO_NODE)) < 0 ) return irq; if ( hpet_setup_msi_irq(irq, hpet_events + idx) ) --- 2011-12-12.orig/xen/arch/x86/io_apic.c 2011-11-22 10:57:09.000000000 +0100 +++ 2011-12-12/xen/arch/x86/io_apic.c 2011-12-12 15:24:10.000000000 +0100 @@ -995,7 +995,7 @@ static void __init setup_IO_APIC_irqs(vo continue; if (IO_APIC_IRQ(irq)) { - vector = assign_irq_vector(irq); + vector = assign_irq_vector(irq, NULL); BUG_ON(vector < 0); entry.vector = vector; ioapic_register_intr(irq, IOAPIC_AUTO); @@ -2188,7 +2188,7 @@ int io_apic_set_pci_routing (int ioapic, if (!platform_legacy_irq(irq)) add_pin_to_irq(irq, ioapic, pin); - vector = assign_irq_vector(irq); + vector = assign_irq_vector(irq, NULL); if (vector < 0) return vector; entry.vector = vector; @@ -2340,7 +2340,7 @@ int ioapic_guest_write(unsigned long phy if ( desc->arch.vector <= 0 || desc->arch.vector > LAST_DYNAMIC_VECTOR ) { add_pin_to_irq(irq, apic, pin); - vector = assign_irq_vector(irq); + vector = assign_irq_vector(irq, NULL); if ( vector < 0 ) return vector; --- 2011-12-12.orig/xen/arch/x86/irq.c 2011-12-12 09:01:34.000000000 +0100 +++ 2011-12-12/xen/arch/x86/irq.c 2011-12-12 
15:24:15.000000000 +0100 @@ -151,7 +151,7 @@ int __init bind_irq_vector(int irq, int /* * Dynamic irq allocate and deallocation for MSI */ -int create_irq(void) +int create_irq(int node) { int irq, ret; struct irq_desc *desc; @@ -168,7 +168,17 @@ int create_irq(void) ret = init_one_irq_desc(desc); if (!ret) - ret = assign_irq_vector(irq); + { + cpumask_t *mask = NULL; + + if (node != NUMA_NO_NODE && node >= 0) + { + mask = &node_to_cpumask(node); + if (cpumask_empty(mask)) + mask = NULL; + } + ret = assign_irq_vector(irq, mask); + } if (ret < 0) { desc->arch.used = IRQ_UNUSED; @@ -514,7 +524,7 @@ next: return err; } -int assign_irq_vector(int irq) +int assign_irq_vector(int irq, const cpumask_t *mask) { int ret; unsigned long flags; @@ -523,7 +533,7 @@ int assign_irq_vector(int irq) BUG_ON(irq >= nr_irqs || irq <0); spin_lock_irqsave(&vector_lock, flags); - ret = __assign_irq_vector(irq, desc, TARGET_CPUS); + ret = __assign_irq_vector(irq, desc, mask ?: TARGET_CPUS); if (!ret) { ret = desc->arch.vector; cpumask_copy(desc->affinity, desc->arch.cpu_mask); --- 2011-12-12.orig/xen/arch/x86/physdev.c 2011-12-12 09:01:34.000000000 +0100 +++ 2011-12-12/xen/arch/x86/physdev.c 2011-12-06 16:28:10.000000000 +0100 @@ -132,7 +132,7 @@ int physdev_map_pirq(domid_t domid, int case MAP_PIRQ_TYPE_MSI: irq = *index; if ( irq == -1 ) - irq = create_irq(); + irq = create_irq(NUMA_NO_NODE); if ( irq < 0 || irq >= nr_irqs ) { --- 2011-12-12.orig/xen/drivers/passthrough/amd/iommu_init.c 2011-11-29 09:19:05.000000000 +0100 +++ 2011-12-12/xen/drivers/passthrough/amd/iommu_init.c 2011-12-06 16:27:49.000000000 +0100 @@ -551,7 +551,7 @@ static int __init set_iommu_interrupt_ha { int irq, ret; - irq = create_irq(); + irq = create_irq(NUMA_NO_NODE); if ( irq <= 0 ) { dprintk(XENLOG_ERR, "IOMMU: no irqs\n"); --- 2011-12-12.orig/xen/drivers/passthrough/vtd/iommu.c 2011-11-22 10:57:10.000000000 +0100 +++ 2011-12-12/xen/drivers/passthrough/vtd/iommu.c 2011-12-06 16:35:42.000000000 +0100 @@ 
-1096,11 +1096,13 @@ static hw_irq_controller dma_msi_type = .set_affinity = dma_msi_set_affinity, }; -static int __init iommu_set_interrupt(struct iommu *iommu) +static int __init iommu_set_interrupt(struct acpi_drhd_unit *drhd) { int irq, ret; + struct acpi_rhsa_unit *rhsa = drhd_to_rhsa(drhd); - irq = create_irq(); + irq = create_irq(rhsa ? pxm_to_node(rhsa->proximity_domain) + : NUMA_NO_NODE); if ( irq <= 0 ) { dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: no irq available!\n"); @@ -1109,9 +1111,9 @@ static int __init iommu_set_interrupt(st irq_desc[irq].handler = &dma_msi_type; #ifdef CONFIG_X86 - ret = request_irq(irq, iommu_page_fault, 0, "dmar", iommu); + ret = request_irq(irq, iommu_page_fault, 0, "dmar", drhd->iommu); #else - ret = request_irq_vector(irq, iommu_page_fault, 0, "dmar", iommu); + ret = request_irq_vector(irq, iommu_page_fault, 0, "dmar", drhd->iommu); #endif if ( ret ) { @@ -2133,7 +2135,7 @@ int __init intel_vtd_setup(void) if ( !vtd_ept_page_compatible(iommu) ) iommu_hap_pt_share = 0; - ret = iommu_set_interrupt(iommu); + ret = iommu_set_interrupt(drhd); if ( ret < 0 ) { dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: interrupt setup failed\n"); --- 2011-12-12.orig/xen/include/asm-x86/io_apic.h 2011-10-04 08:22:17.000000000 +0200 +++ 2011-12-12/xen/include/asm-x86/io_apic.h 2011-12-06 16:24:22.000000000 +0100 @@ -213,9 +213,6 @@ static inline void ioapic_suspend(void) static inline void ioapic_resume(void) {} #endif -extern int assign_irq_vector(int irq); -extern int free_irq_vector(int vector); - unsigned highest_gsi(void); int ioapic_guest_read( unsigned long physbase, unsigned int reg, u32 *pval); --- 2011-12-12.orig/xen/include/asm-x86/irq.h 2011-11-22 10:57:10.000000000 +0100 +++ 2011-12-12/xen/include/asm-x86/irq.h 2011-12-06 16:25:22.000000000 +0100 @@ -155,8 +155,9 @@ int init_irq_data(void); void clear_irq_vector(int irq); int irq_to_vector(int irq); -int create_irq(void); +int create_irq(int node); void destroy_irq(unsigned int irq); +int 
assign_irq_vector(int irq, const cpumask_t *); extern void irq_complete_move(struct irq_desc *); _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Keir Fraser
2011-Dec-12 17:50 UTC
Re: [PATCH] VT-d: bind IRQs to CPUs local to the node the IOMMU is on
On 12/12/2011 15:53, "Jan Beulich" <JBeulich@suse.com> wrote:> This extends create_irq() to take a node parameter, allowing the > resulting IRQ to have its destination set to a CPU on that node right > away, which is more natural than having to post-adjust this (and > get e.g. a new IRQ vector assigned despite a fresh one was just > obtained). > > All other callers of create_irq() pass NUMA_NO_NODE for the time being.I don''t know about this one. Does the current ''inefficient'' way things work really matter? -- Keir> Signed-off-by: Jan Beulich <jbeulich@suse.com> > > --- 2011-12-12.orig/xen/arch/x86/hpet.c 2011-11-11 09:40:55.000000000 +0100 > +++ 2011-12-12/xen/arch/x86/hpet.c 2011-12-06 16:36:24.000000000 +0100 > @@ -11,6 +11,7 @@ > #include <xen/smp.h> > #include <xen/softirq.h> > #include <xen/irq.h> > +#include <xen/numa.h> > #include <asm/fixmap.h> > #include <asm/div64.h> > #include <asm/hpet.h> > @@ -334,7 +335,7 @@ static int __init hpet_assign_irq(unsign > { > int irq; > > - if ( (irq = create_irq()) < 0 ) > + if ( (irq = create_irq(NUMA_NO_NODE)) < 0 ) > return irq; > > if ( hpet_setup_msi_irq(irq, hpet_events + idx) ) > --- 2011-12-12.orig/xen/arch/x86/io_apic.c 2011-11-22 10:57:09.000000000 +0100 > +++ 2011-12-12/xen/arch/x86/io_apic.c 2011-12-12 15:24:10.000000000 +0100 > @@ -995,7 +995,7 @@ static void __init setup_IO_APIC_irqs(vo > continue; > > if (IO_APIC_IRQ(irq)) { > - vector = assign_irq_vector(irq); > + vector = assign_irq_vector(irq, NULL); > BUG_ON(vector < 0); > entry.vector = vector; > ioapic_register_intr(irq, IOAPIC_AUTO); > @@ -2188,7 +2188,7 @@ int io_apic_set_pci_routing (int ioapic, > if (!platform_legacy_irq(irq)) > add_pin_to_irq(irq, ioapic, pin); > > - vector = assign_irq_vector(irq); > + vector = assign_irq_vector(irq, NULL); > if (vector < 0) > return vector; > entry.vector = vector; > @@ -2340,7 +2340,7 @@ int ioapic_guest_write(unsigned long phy > > if ( desc->arch.vector <= 0 || desc->arch.vector > LAST_DYNAMIC_VECTOR ) > { 
> add_pin_to_irq(irq, apic, pin); > - vector = assign_irq_vector(irq); > + vector = assign_irq_vector(irq, NULL); > if ( vector < 0 ) > return vector; > > --- 2011-12-12.orig/xen/arch/x86/irq.c 2011-12-12 09:01:34.000000000 +0100 > +++ 2011-12-12/xen/arch/x86/irq.c 2011-12-12 15:24:15.000000000 +0100 > @@ -151,7 +151,7 @@ int __init bind_irq_vector(int irq, int > /* > * Dynamic irq allocate and deallocation for MSI > */ > -int create_irq(void) > +int create_irq(int node) > { > int irq, ret; > struct irq_desc *desc; > @@ -168,7 +168,17 @@ int create_irq(void) > > ret = init_one_irq_desc(desc); > if (!ret) > - ret = assign_irq_vector(irq); > + { > + cpumask_t *mask = NULL; > + > + if (node != NUMA_NO_NODE && node >= 0) > + { > + mask = &node_to_cpumask(node); > + if (cpumask_empty(mask)) > + mask = NULL; > + } > + ret = assign_irq_vector(irq, mask); > + } > if (ret < 0) > { > desc->arch.used = IRQ_UNUSED; > @@ -514,7 +524,7 @@ next: > return err; > } > > -int assign_irq_vector(int irq) > +int assign_irq_vector(int irq, const cpumask_t *mask) > { > int ret; > unsigned long flags; > @@ -523,7 +533,7 @@ int assign_irq_vector(int irq) > BUG_ON(irq >= nr_irqs || irq <0); > > spin_lock_irqsave(&vector_lock, flags); > - ret = __assign_irq_vector(irq, desc, TARGET_CPUS); > + ret = __assign_irq_vector(irq, desc, mask ?: TARGET_CPUS); > if (!ret) { > ret = desc->arch.vector; > cpumask_copy(desc->affinity, desc->arch.cpu_mask); > --- 2011-12-12.orig/xen/arch/x86/physdev.c 2011-12-12 09:01:34.000000000 +0100 > +++ 2011-12-12/xen/arch/x86/physdev.c 2011-12-06 16:28:10.000000000 +0100 > @@ -132,7 +132,7 @@ int physdev_map_pirq(domid_t domid, int > case MAP_PIRQ_TYPE_MSI: > irq = *index; > if ( irq == -1 ) > - irq = create_irq(); > + irq = create_irq(NUMA_NO_NODE); > > if ( irq < 0 || irq >= nr_irqs ) > { > --- 2011-12-12.orig/xen/drivers/passthrough/amd/iommu_init.c 2011-11-29 > 09:19:05.000000000 +0100 > +++ 2011-12-12/xen/drivers/passthrough/amd/iommu_init.c 2011-12-06 > 
16:27:49.000000000 +0100 > @@ -551,7 +551,7 @@ static int __init set_iommu_interrupt_ha > { > int irq, ret; > > - irq = create_irq(); > + irq = create_irq(NUMA_NO_NODE); > if ( irq <= 0 ) > { > dprintk(XENLOG_ERR, "IOMMU: no irqs\n"); > --- 2011-12-12.orig/xen/drivers/passthrough/vtd/iommu.c 2011-11-22 > 10:57:10.000000000 +0100 > +++ 2011-12-12/xen/drivers/passthrough/vtd/iommu.c 2011-12-06 > 16:35:42.000000000 +0100 > @@ -1096,11 +1096,13 @@ static hw_irq_controller dma_msi_type > .set_affinity = dma_msi_set_affinity, > }; > > -static int __init iommu_set_interrupt(struct iommu *iommu) > +static int __init iommu_set_interrupt(struct acpi_drhd_unit *drhd) > { > int irq, ret; > + struct acpi_rhsa_unit *rhsa = drhd_to_rhsa(drhd); > > - irq = create_irq(); > + irq = create_irq(rhsa ? pxm_to_node(rhsa->proximity_domain) > + : NUMA_NO_NODE); > if ( irq <= 0 ) > { > dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: no irq available!\n"); > @@ -1109,9 +1111,9 @@ static int __init iommu_set_interrupt(st > > irq_desc[irq].handler = &dma_msi_type; > #ifdef CONFIG_X86 > - ret = request_irq(irq, iommu_page_fault, 0, "dmar", iommu); > + ret = request_irq(irq, iommu_page_fault, 0, "dmar", drhd->iommu); > #else > - ret = request_irq_vector(irq, iommu_page_fault, 0, "dmar", iommu); > + ret = request_irq_vector(irq, iommu_page_fault, 0, "dmar", drhd->iommu); > #endif > if ( ret ) > { > @@ -2133,7 +2135,7 @@ int __init intel_vtd_setup(void) > if ( !vtd_ept_page_compatible(iommu) ) > iommu_hap_pt_share = 0; > > - ret = iommu_set_interrupt(iommu); > + ret = iommu_set_interrupt(drhd); > if ( ret < 0 ) > { > dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: interrupt setup failed\n"); > --- 2011-12-12.orig/xen/include/asm-x86/io_apic.h 2011-10-04 > 08:22:17.000000000 +0200 > +++ 2011-12-12/xen/include/asm-x86/io_apic.h 2011-12-06 16:24:22.000000000 > +0100 > @@ -213,9 +213,6 @@ static inline void ioapic_suspend(void) > static inline void ioapic_resume(void) {} > #endif > > -extern int assign_irq_vector(int 
irq); > -extern int free_irq_vector(int vector); > - > unsigned highest_gsi(void); > > int ioapic_guest_read( unsigned long physbase, unsigned int reg, u32 *pval); > --- 2011-12-12.orig/xen/include/asm-x86/irq.h 2011-11-22 10:57:10.000000000 > +0100 > +++ 2011-12-12/xen/include/asm-x86/irq.h 2011-12-06 16:25:22.000000000 +0100 > @@ -155,8 +155,9 @@ int init_irq_data(void); > void clear_irq_vector(int irq); > > int irq_to_vector(int irq); > -int create_irq(void); > +int create_irq(int node); > void destroy_irq(unsigned int irq); > +int assign_irq_vector(int irq, const cpumask_t *); > > extern void irq_complete_move(struct irq_desc *); > > > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xensource.com > http://lists.xensource.com/xen-devel
Jan Beulich
2011-Dec-13 08:23 UTC
Re: [PATCH] VT-d: bind IRQs to CPUs local to the node the IOMMU is on
>>> On 12.12.11 at 18:50, Keir Fraser <keir@xen.org> wrote: > On 12/12/2011 15:53, "Jan Beulich" <JBeulich@suse.com> wrote: > >> This extends create_irq() to take a node parameter, allowing the >> resulting IRQ to have its destination set to a CPU on that node right >> away, which is more natural than having to post-adjust this (and >> get e.g. a new IRQ vector assigned despite a fresh one was just >> obtained). >> >> All other callers of create_irq() pass NUMA_NO_NODE for the time being. > > I don't know about this one. Does the current 'inefficient' way things work > really matter? That depends on the NUMA interconnect. My general perspective on this is that whenever NUMA locality information is available, we should aim at making use of it (unless it conflicts with something else*). And there's certainly ways to go in this respect. * When coming up with this, I actually looked at whether using the proximity information now passed down by Dom0 for PCI devices could be used for properly binding at least MSI interrupts. That didn't turn out reasonable, since we're already setting the IRQ affinity to match the pCPU the target vCPU is running on (which is likely providing greater benefit, as this allows avoiding IPIs; the efficiency of this can certainly be tweaked - I'm meanwhile thinking that we might be overly aggressive here, but that's to some part related to the scheduler apparently migrating vCPU-s more often than really desirable). But for Xen internal interrupts (like in the case of the IOMMU ones) this clearly ought to be done when possible. For the AMD case I just wasn't able to spot whether locality information is available. Jan
Keir Fraser
2011-Dec-13 08:31 UTC
Re: [PATCH] VT-d: bind IRQs to CPUs local to the node the IOMMU is on
On 13/12/2011 08:23, "Jan Beulich" <JBeulich@suse.com> wrote: >>>> On 12.12.11 at 18:50, Keir Fraser <keir@xen.org> wrote: >> On 12/12/2011 15:53, "Jan Beulich" <JBeulich@suse.com> wrote: >> >>> This extends create_irq() to take a node parameter, allowing the >>> resulting IRQ to have its destination set to a CPU on that node right >>> away, which is more natural than having to post-adjust this (and >>> get e.g. a new IRQ vector assigned despite a fresh one was just >>> obtained). >>> >>> All other callers of create_irq() pass NUMA_NO_NODE for the time being. >> >> I don't know about this one. Does the current 'inefficient' way things work >> really matter? > > The depends on the NUMA interconnect. My general perspective on > this is that whenever NUMA locality information is available, we should > aim at making use of it (unless it conflicts with something else*). And > there's certainly ways to go in this respect. Oh, I misunderstood the patch comment, and thought we *were* already post-adjusting the affinity. But we're not, so this patch does make sense. Acked-by: Keir Fraser <keir@xen.org>