Jan Beulich
2012-May-14 12:39 UTC
x86: adjust handling of interrupts coming in via legacy vectors
The debugging code added in c/s 24707:96987c324a4f was hit a (small) number of times (one report being http://lists.xen.org/archives/html/xen-devel/2012-05/msg00332.html), apparently always with a vector within the legacy range. Obviously, besides legacy vectors not normally expected to be in use on systems with IO-APIC(s), they should never make it to the IRQ migration logic. This wasn''t being prevented so far: Since we don''t have a one-to-one mapping between vectors and IRQs - legacy IRQs may have two vectors associated with them (one used in either 8259A, the other used in one of the IO-APICs) -, vector-to-IRQ translations for legacy vectors (as used in do_IRQ()) would yield a valid IRQ number despite the IRQ really being handled via an IO-APIC. This gets changed here - disable_8259A_irq() zaps the legacy vector-to- IRQ mapping, and enable_8259A_irq(), should it ever be called for a particular interrupts, restores it. Additionally, the spurious interrupt logic in do_IRQ() gets adjusted too: Interrupts coming in via legacy vectors obviously didn''t get reported through the IO-APIC/LAPIC pair (as we never program these vectors into any RTE), and hence shouldn''t get ack_APIC_irq() called on them. Instead, a new function (pointer) bogus_8259A_irq() gets used to have the 8259A driver take care of the bogus interrupt (as outside of automatice EOI mode it may need an EOI to be issued for it to prevent other interrupts that may legitimately go through the 8259As from getting masked out). Signed-off-by: Jan Beulich <jbeulich@suse.com> --- a/xen/arch/x86/i8259.c +++ b/xen/arch/x86/i8259.c @@ -85,7 +85,15 @@ BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BU static DEFINE_SPINLOCK(i8259A_lock); -static void mask_and_ack_8259A_irq(struct irq_desc *); +static void _mask_and_ack_8259A_irq(unsigned int irq); + +void (*__read_mostly bogus_8259A_irq)(unsigned int irq) + _mask_and_ack_8259A_irq; + +static void mask_and_ack_8259A_irq(struct irq_desc *desc) +{ + _mask_and_ack_8259A_irq(desc->irq); +} static unsigned int startup_8259A_irq(struct irq_desc *desc) { @@ -133,20 +141,26 @@ static unsigned int cached_irq_mask = 0x */ unsigned int __read_mostly io_apic_irqs; -void disable_8259A_irq(struct irq_desc *desc) +static void _disable_8259A_irq(unsigned int irq) { - unsigned int mask = 1 << desc->irq; + unsigned int mask = 1 << irq; unsigned long flags; spin_lock_irqsave(&i8259A_lock, flags); cached_irq_mask |= mask; - if (desc->irq & 8) + if (irq & 8) outb(cached_A1,0xA1); else outb(cached_21,0x21); + per_cpu(vector_irq, 0)[LEGACY_VECTOR(irq)] = -1; spin_unlock_irqrestore(&i8259A_lock, flags); } +void disable_8259A_irq(struct irq_desc *desc) +{ + _disable_8259A_irq(desc->irq); +} + void enable_8259A_irq(struct irq_desc *desc) { unsigned int mask = ~(1 << desc->irq); @@ -154,6 +168,7 @@ void enable_8259A_irq(struct irq_desc *d spin_lock_irqsave(&i8259A_lock, flags); cached_irq_mask &= mask; + per_cpu(vector_irq, 0)[LEGACY_VECTOR(desc->irq)] = desc->irq; if (desc->irq & 8) outb(cached_A1,0xA1); else @@ -226,9 +241,9 @@ static inline int i8259A_irq_real(unsign * first, _then_ send the EOI, and the order of EOI * to the two 8259s is important! */ -static void mask_and_ack_8259A_irq(struct irq_desc *desc) +static void _mask_and_ack_8259A_irq(unsigned int irq) { - unsigned int irqmask = 1 << desc->irq; + unsigned int irqmask = 1 << irq; unsigned long flags; spin_lock_irqsave(&i8259A_lock, flags); @@ -252,15 +267,15 @@ static void mask_and_ack_8259A_irq(struc cached_irq_mask |= irqmask; handle_real_irq: - if (desc->irq & 8) { + if (irq & 8) { inb(0xA1); /* DUMMY - (do we need this?) */ outb(cached_A1,0xA1); - outb(0x60 + (desc->irq & 7), 0xA0);/* ''Specific EOI'' to slave */ + outb(0x60 + (irq & 7), 0xA0);/* ''Specific EOI'' to slave */ outb(0x62,0x20); /* ''Specific EOI'' to master-IRQ2 */ } else { inb(0x21); /* DUMMY - (do we need this?) */ outb(cached_21,0x21); - outb(0x60 + desc->irq, 0x20);/* ''Specific EOI'' to master */ + outb(0x60 + irq, 0x20);/* ''Specific EOI'' to master */ } spin_unlock_irqrestore(&i8259A_lock, flags); return; @@ -269,7 +284,7 @@ static void mask_and_ack_8259A_irq(struc /* * this is the slow path - should happen rarely. */ - if (i8259A_irq_real(desc->irq)) + if (i8259A_irq_real(irq)) /* * oops, the IRQ _is_ in service according to the * 8259A - not spurious, go handle it. @@ -283,7 +298,7 @@ static void mask_and_ack_8259A_irq(struc * lets ACK and report it. [once per IRQ] */ if (!(spurious_irq_mask & irqmask)) { - printk("spurious 8259A interrupt: IRQ%d.\n", desc->irq); + printk("spurious 8259A interrupt: IRQ%d.\n", irq); spurious_irq_mask |= irqmask; } /* @@ -352,13 +367,19 @@ void __devinit init_8259A(int auto_eoi) is to be investigated) */ if (auto_eoi) + { /* * in AEOI mode we just have to mask the interrupt * when acking. */ i8259A_irq_type.ack = disable_8259A_irq; + bogus_8259A_irq = _disable_8259A_irq; + } else + { i8259A_irq_type.ack = mask_and_ack_8259A_irq; + bogus_8259A_irq = _mask_and_ack_8259A_irq; + } udelay(100); /* wait for 8259A to initialize */ --- a/xen/arch/x86/irq.c +++ b/xen/arch/x86/irq.c @@ -811,9 +811,12 @@ void do_IRQ(struct cpu_user_regs *regs) if (direct_apic_vector[vector] != NULL) { (*direct_apic_vector[vector])(regs); } else { - ack_APIC_irq(); - printk("%s: %d.%d No irq handler for vector (irq %d)\n", - __func__, smp_processor_id(), vector, irq); + if ( vector < FIRST_LEGACY_VECTOR || vector > LAST_LEGACY_VECTOR ) + ack_APIC_irq(); + else + bogus_8259A_irq(vector - FIRST_LEGACY_VECTOR); + printk("CPU%u: No handler for vector %02x (IRQ %d)\n", + smp_processor_id(), vector, irq); TRACE_1D(TRC_HW_IRQ_UNMAPPED_VECTOR, vector); } goto out_no_unlock; --- a/xen/include/asm-x86/irq.h +++ b/xen/include/asm-x86/irq.h @@ -104,6 +104,7 @@ void mask_8259A(void); void unmask_8259A(void); void init_8259A(int aeoi); void make_8259A_irq(unsigned int irq); +extern void (*bogus_8259A_irq)(unsigned int irq); int i8259A_suspend(void); int i8259A_resume(void); _______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
Jan Beulich
2012-May-14 12:55 UTC
Re: [PATCH] x86: adjust handling of interrupts coming in via legacy vectors
>>> On 14.05.12 at 14:39, "Jan Beulich" <JBeulich@suse.com> wrote: > The debugging code added in c/s 24707:96987c324a4f was hit a (small) > number of times (one report being > http://lists.xen.org/archives/html/xen-devel/2012-05/msg00332.html), > apparently always with a vector within the legacy range. Obviously, > besides legacy vectors not normally expected to be in use on systems > with IO-APIC(s), they should never make it to the IRQ migration logic. > > This wasn''t being prevented so far: Since we don''t have a one-to-one > mapping between vectors and IRQs - legacy IRQs may have two vectors > associated with them (one used in either 8259A, the other used in one > of the IO-APICs) -, vector-to-IRQ translations for legacy vectors (as > used in do_IRQ()) would yield a valid IRQ number despite the IRQ > really being handled via an IO-APIC. > > This gets changed here - disable_8259A_irq() zaps the legacy vector-to- > IRQ mapping, and enable_8259A_irq(), should it ever be called for a > particular interrupts, restores it. > > Additionally, the spurious interrupt logic in do_IRQ() gets adjusted > too: Interrupts coming in via legacy vectors obviously didn''t get > reported through the IO-APIC/LAPIC pair (as we never program these > vectors into any RTE), and hence shouldn''t get ack_APIC_irq() called on > them. Instead, a new function (pointer) bogus_8259A_irq() gets used to > have the 8259A driver take care of the bogus interrupt (as outside of > automatice EOI mode it may need an EOI to be issued for it to prevent > other interrupts that may legitimately go through the 8259As from > getting masked out).Note that this patch does not make any attempt at dealing with the underlying issue that causes the bogus interrupt(s) to show up. If my analysis is right, we shouldn''t see crashes anymore, but instead observe instances of spurious interrupts on legacy vectors. It would certainly be nice to have an actual proof of this (albeit I realize that this isn''t readily reproducible), in order to then - if indeed behaving as expected - add debugging code to identify whether such interrupts in fact get raised by one of the 8259A-s (particularly printing the cached and physical mask register values), or whether they get introduced into the system by yet another obscure mechanism. One particular thing I''m suspicious about are the numerous aliases to the two (each) 8259A I/O ports that various chipsets have: What if some component in Dom0 accesses one of the alias ports in order to do something specific to a non-standard platform (say, probe for some special hardware interface), not realizing that it actually plays with PIC state? Linux under the same conditions wouldn''t severely suffer - as it has a 1:1 vector <-> IRQ translation, it likely would merely observe an extra interrupt. Jan> Signed-off-by: Jan Beulich <jbeulich@suse.com> > > --- a/xen/arch/x86/i8259.c > +++ b/xen/arch/x86/i8259.c > @@ -85,7 +85,15 @@ BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BU > > static DEFINE_SPINLOCK(i8259A_lock); > > -static void mask_and_ack_8259A_irq(struct irq_desc *); > +static void _mask_and_ack_8259A_irq(unsigned int irq); > + > +void (*__read_mostly bogus_8259A_irq)(unsigned int irq) > + _mask_and_ack_8259A_irq; > + > +static void mask_and_ack_8259A_irq(struct irq_desc *desc) > +{ > + _mask_and_ack_8259A_irq(desc->irq); > +} > > static unsigned int startup_8259A_irq(struct irq_desc *desc) > { > @@ -133,20 +141,26 @@ static unsigned int cached_irq_mask = 0x > */ > unsigned int __read_mostly io_apic_irqs; > > -void disable_8259A_irq(struct irq_desc *desc) > +static void _disable_8259A_irq(unsigned int irq) > { > - unsigned int mask = 1 << desc->irq; > + unsigned int mask = 1 << irq; > unsigned long flags; > > spin_lock_irqsave(&i8259A_lock, flags); > cached_irq_mask |= mask; > - if (desc->irq & 8) > + if (irq & 8) > outb(cached_A1,0xA1); > else > outb(cached_21,0x21); > + per_cpu(vector_irq, 0)[LEGACY_VECTOR(irq)] = -1; > spin_unlock_irqrestore(&i8259A_lock, flags); > } > > +void disable_8259A_irq(struct irq_desc *desc) > +{ > + _disable_8259A_irq(desc->irq); > +} > + > void enable_8259A_irq(struct irq_desc *desc) > { > unsigned int mask = ~(1 << desc->irq); > @@ -154,6 +168,7 @@ void enable_8259A_irq(struct irq_desc *d > > spin_lock_irqsave(&i8259A_lock, flags); > cached_irq_mask &= mask; > + per_cpu(vector_irq, 0)[LEGACY_VECTOR(desc->irq)] = desc->irq; > if (desc->irq & 8) > outb(cached_A1,0xA1); > else > @@ -226,9 +241,9 @@ static inline int i8259A_irq_real(unsign > * first, _then_ send the EOI, and the order of EOI > * to the two 8259s is important! > */ > -static void mask_and_ack_8259A_irq(struct irq_desc *desc) > +static void _mask_and_ack_8259A_irq(unsigned int irq) > { > - unsigned int irqmask = 1 << desc->irq; > + unsigned int irqmask = 1 << irq; > unsigned long flags; > > spin_lock_irqsave(&i8259A_lock, flags); > @@ -252,15 +267,15 @@ static void mask_and_ack_8259A_irq(struc > cached_irq_mask |= irqmask; > > handle_real_irq: > - if (desc->irq & 8) { > + if (irq & 8) { > inb(0xA1); /* DUMMY - (do we need this?) */ > outb(cached_A1,0xA1); > - outb(0x60 + (desc->irq & 7), 0xA0);/* ''Specific EOI'' to slave */ > + outb(0x60 + (irq & 7), 0xA0);/* ''Specific EOI'' to slave */ > outb(0x62,0x20); /* ''Specific EOI'' to master-IRQ2 */ > } else { > inb(0x21); /* DUMMY - (do we need this?) */ > outb(cached_21,0x21); > - outb(0x60 + desc->irq, 0x20);/* ''Specific EOI'' to master */ > + outb(0x60 + irq, 0x20);/* ''Specific EOI'' to master */ > } > spin_unlock_irqrestore(&i8259A_lock, flags); > return; > @@ -269,7 +284,7 @@ static void mask_and_ack_8259A_irq(struc > /* > * this is the slow path - should happen rarely. > */ > - if (i8259A_irq_real(desc->irq)) > + if (i8259A_irq_real(irq)) > /* > * oops, the IRQ _is_ in service according to the > * 8259A - not spurious, go handle it. > @@ -283,7 +298,7 @@ static void mask_and_ack_8259A_irq(struc > * lets ACK and report it. [once per IRQ] > */ > if (!(spurious_irq_mask & irqmask)) { > - printk("spurious 8259A interrupt: IRQ%d.\n", desc->irq); > + printk("spurious 8259A interrupt: IRQ%d.\n", irq); > spurious_irq_mask |= irqmask; > } > /* > @@ -352,13 +367,19 @@ void __devinit init_8259A(int auto_eoi) > is to be investigated) */ > > if (auto_eoi) > + { > /* > * in AEOI mode we just have to mask the interrupt > * when acking. > */ > i8259A_irq_type.ack = disable_8259A_irq; > + bogus_8259A_irq = _disable_8259A_irq; > + } > else > + { > i8259A_irq_type.ack = mask_and_ack_8259A_irq; > + bogus_8259A_irq = _mask_and_ack_8259A_irq; > + } > > udelay(100); /* wait for 8259A to initialize */ > > --- a/xen/arch/x86/irq.c > +++ b/xen/arch/x86/irq.c > @@ -811,9 +811,12 @@ void do_IRQ(struct cpu_user_regs *regs) > if (direct_apic_vector[vector] != NULL) { > (*direct_apic_vector[vector])(regs); > } else { > - ack_APIC_irq(); > - printk("%s: %d.%d No irq handler for vector (irq %d)\n", > - __func__, smp_processor_id(), vector, irq); > + if ( vector < FIRST_LEGACY_VECTOR || vector > LAST_LEGACY_VECTOR > ) > + ack_APIC_irq(); > + else > + bogus_8259A_irq(vector - FIRST_LEGACY_VECTOR); > + printk("CPU%u: No handler for vector %02x (IRQ %d)\n", > + smp_processor_id(), vector, irq); > TRACE_1D(TRC_HW_IRQ_UNMAPPED_VECTOR, vector); > } > goto out_no_unlock; > --- a/xen/include/asm-x86/irq.h > +++ b/xen/include/asm-x86/irq.h > @@ -104,6 +104,7 @@ void mask_8259A(void); > void unmask_8259A(void); > void init_8259A(int aeoi); > void make_8259A_irq(unsigned int irq); > +extern void (*bogus_8259A_irq)(unsigned int irq); > int i8259A_suspend(void); > int i8259A_resume(void); >
Andrew Cooper
2012-May-14 13:33 UTC
Re: [PATCH] x86: adjust handling of interrupts coming in via legacy vectors
On 14/05/12 13:55, Jan Beulich wrote:>>>> On 14.05.12 at 14:39, "Jan Beulich" <JBeulich@suse.com> wrote: >> The debugging code added in c/s 24707:96987c324a4f was hit a (small) >> number of times (one report being >> http://lists.xen.org/archives/html/xen-devel/2012-05/msg00332.html), >> apparently always with a vector within the legacy range. Obviously, >> besides legacy vectors not normally expected to be in use on systems >> with IO-APIC(s), they should never make it to the IRQ migration logic. >> >> This wasn''t being prevented so far: Since we don''t have a one-to-one >> mapping between vectors and IRQs - legacy IRQs may have two vectors >> associated with them (one used in either 8259A, the other used in one >> of the IO-APICs) -, vector-to-IRQ translations for legacy vectors (as >> used in do_IRQ()) would yield a valid IRQ number despite the IRQ >> really being handled via an IO-APIC. >> >> This gets changed here - disable_8259A_irq() zaps the legacy vector-to- >> IRQ mapping, and enable_8259A_irq(), should it ever be called for a >> particular interrupts, restores it. >> >> Additionally, the spurious interrupt logic in do_IRQ() gets adjusted >> too: Interrupts coming in via legacy vectors obviously didn''t get >> reported through the IO-APIC/LAPIC pair (as we never program these >> vectors into any RTE), and hence shouldn''t get ack_APIC_irq() called on >> them. Instead, a new function (pointer) bogus_8259A_irq() gets used to >> have the 8259A driver take care of the bogus interrupt (as outside of >> automatice EOI mode it may need an EOI to be issued for it to prevent >> other interrupts that may legitimately go through the 8259As from >> getting masked out). > Note that this patch does not make any attempt at dealing with the > underlying issue that causes the bogus interrupt(s) to show up. If > my analysis is right, we shouldn''t see crashes anymore, but instead > observe instances of spurious interrupts on legacy vectors. It would > certainly be nice to have an actual proof of this (albeit I realize that > this isn''t readily reproducible), in order to then - if indeed behaving > as expected - add debugging code to identify whether such interrupts > in fact get raised by one of the 8259A-s (particularly printing the > cached and physical mask register values), or whether they get > introduced into the system by yet another obscure mechanism. > > One particular thing I''m suspicious about are the numerous aliases > to the two (each) 8259A I/O ports that various chipsets have: What > if some component in Dom0 accesses one of the alias ports in order > to do something specific to a non-standard platform (say, probe for > some special hardware interface), not realizing that it actually plays > with PIC state? Linux under the same conditions wouldn''t severely > suffer - as it has a 1:1 vector <-> IRQ translation, it likely would > merely observe an extra interrupt. > > JanOn the whole, the patch looks sensible, but what happens if the spurious interrupt is coming in through the Local APIC ? If this is the case, then we still need to ACK it, even if it is a bogus PIC interrupt. Perhaps in irq.c, the changes should check whether the observed vector has been raised in the LAPIC and ack it, and then decide whether it is bogus or not. Might it also be sensible to remove dom0''s permissions to use the PIC ports, in case it is some weird issue like that? ~Andrew> >> Signed-off-by: Jan Beulich <jbeulich@suse.com> >> >> --- a/xen/arch/x86/i8259.c >> +++ b/xen/arch/x86/i8259.c >> @@ -85,7 +85,15 @@ BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BU >> >> static DEFINE_SPINLOCK(i8259A_lock); >> >> -static void mask_and_ack_8259A_irq(struct irq_desc *); >> +static void _mask_and_ack_8259A_irq(unsigned int irq); >> + >> +void (*__read_mostly bogus_8259A_irq)(unsigned int irq) >> + _mask_and_ack_8259A_irq; >> + >> +static void mask_and_ack_8259A_irq(struct irq_desc *desc) >> +{ >> + _mask_and_ack_8259A_irq(desc->irq); >> +} >> >> static unsigned int startup_8259A_irq(struct irq_desc *desc) >> { >> @@ -133,20 +141,26 @@ static unsigned int cached_irq_mask = 0x >> */ >> unsigned int __read_mostly io_apic_irqs; >> >> -void disable_8259A_irq(struct irq_desc *desc) >> +static void _disable_8259A_irq(unsigned int irq) >> { >> - unsigned int mask = 1 << desc->irq; >> + unsigned int mask = 1 << irq; >> unsigned long flags; >> >> spin_lock_irqsave(&i8259A_lock, flags); >> cached_irq_mask |= mask; >> - if (desc->irq & 8) >> + if (irq & 8) >> outb(cached_A1,0xA1); >> else >> outb(cached_21,0x21); >> + per_cpu(vector_irq, 0)[LEGACY_VECTOR(irq)] = -1; >> spin_unlock_irqrestore(&i8259A_lock, flags); >> } >> >> +void disable_8259A_irq(struct irq_desc *desc) >> +{ >> + _disable_8259A_irq(desc->irq); >> +} >> + >> void enable_8259A_irq(struct irq_desc *desc) >> { >> unsigned int mask = ~(1 << desc->irq); >> @@ -154,6 +168,7 @@ void enable_8259A_irq(struct irq_desc *d >> >> spin_lock_irqsave(&i8259A_lock, flags); >> cached_irq_mask &= mask; >> + per_cpu(vector_irq, 0)[LEGACY_VECTOR(desc->irq)] = desc->irq; >> if (desc->irq & 8) >> outb(cached_A1,0xA1); >> else >> @@ -226,9 +241,9 @@ static inline int i8259A_irq_real(unsign >> * first, _then_ send the EOI, and the order of EOI >> * to the two 8259s is important! >> */ >> -static void mask_and_ack_8259A_irq(struct irq_desc *desc) >> +static void _mask_and_ack_8259A_irq(unsigned int irq) >> { >> - unsigned int irqmask = 1 << desc->irq; >> + unsigned int irqmask = 1 << irq; >> unsigned long flags; >> >> spin_lock_irqsave(&i8259A_lock, flags); >> @@ -252,15 +267,15 @@ static void mask_and_ack_8259A_irq(struc >> cached_irq_mask |= irqmask; >> >> handle_real_irq: >> - if (desc->irq & 8) { >> + if (irq & 8) { >> inb(0xA1); /* DUMMY - (do we need this?) */ >> outb(cached_A1,0xA1); >> - outb(0x60 + (desc->irq & 7), 0xA0);/* ''Specific EOI'' to slave */ >> + outb(0x60 + (irq & 7), 0xA0);/* ''Specific EOI'' to slave */ >> outb(0x62,0x20); /* ''Specific EOI'' to master-IRQ2 */ >> } else { >> inb(0x21); /* DUMMY - (do we need this?) */ >> outb(cached_21,0x21); >> - outb(0x60 + desc->irq, 0x20);/* ''Specific EOI'' to master */ >> + outb(0x60 + irq, 0x20);/* ''Specific EOI'' to master */ >> } >> spin_unlock_irqrestore(&i8259A_lock, flags); >> return; >> @@ -269,7 +284,7 @@ static void mask_and_ack_8259A_irq(struc >> /* >> * this is the slow path - should happen rarely. >> */ >> - if (i8259A_irq_real(desc->irq)) >> + if (i8259A_irq_real(irq)) >> /* >> * oops, the IRQ _is_ in service according to the >> * 8259A - not spurious, go handle it. >> @@ -283,7 +298,7 @@ static void mask_and_ack_8259A_irq(struc >> * lets ACK and report it. [once per IRQ] >> */ >> if (!(spurious_irq_mask & irqmask)) { >> - printk("spurious 8259A interrupt: IRQ%d.\n", desc->irq); >> + printk("spurious 8259A interrupt: IRQ%d.\n", irq); >> spurious_irq_mask |= irqmask; >> } >> /* >> @@ -352,13 +367,19 @@ void __devinit init_8259A(int auto_eoi) >> is to be investigated) */ >> >> if (auto_eoi) >> + { >> /* >> * in AEOI mode we just have to mask the interrupt >> * when acking. >> */ >> i8259A_irq_type.ack = disable_8259A_irq; >> + bogus_8259A_irq = _disable_8259A_irq; >> + } >> else >> + { >> i8259A_irq_type.ack = mask_and_ack_8259A_irq; >> + bogus_8259A_irq = _mask_and_ack_8259A_irq; >> + } >> >> udelay(100); /* wait for 8259A to initialize */ >> >> --- a/xen/arch/x86/irq.c >> +++ b/xen/arch/x86/irq.c >> @@ -811,9 +811,12 @@ void do_IRQ(struct cpu_user_regs *regs) >> if (direct_apic_vector[vector] != NULL) { >> (*direct_apic_vector[vector])(regs); >> } else { >> - ack_APIC_irq(); >> - printk("%s: %d.%d No irq handler for vector (irq %d)\n", >> - __func__, smp_processor_id(), vector, irq); >> + if ( vector < FIRST_LEGACY_VECTOR || vector > LAST_LEGACY_VECTOR >> ) >> + ack_APIC_irq(); >> + else >> + bogus_8259A_irq(vector - FIRST_LEGACY_VECTOR); >> + printk("CPU%u: No handler for vector %02x (IRQ %d)\n", >> + smp_processor_id(), vector, irq); >> TRACE_1D(TRC_HW_IRQ_UNMAPPED_VECTOR, vector); >> } >> goto out_no_unlock; >> --- a/xen/include/asm-x86/irq.h >> +++ b/xen/include/asm-x86/irq.h >> @@ -104,6 +104,7 @@ void mask_8259A(void); >> void unmask_8259A(void); >> void init_8259A(int aeoi); >> void make_8259A_irq(unsigned int irq); >> +extern void (*bogus_8259A_irq)(unsigned int irq); >> int i8259A_suspend(void); >> int i8259A_resume(void); >> > > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xen.org > http://lists.xen.org/xen-devel-- Andrew Cooper - Dom0 Kernel Engineer, Citrix XenServer T: +44 (0)1223 225 900, http://www.citrix.com
Jan Beulich
2012-May-14 14:28 UTC
Re: [PATCH] x86: adjust handling of interrupts coming in via legacy vectors
>>> On 14.05.12 at 15:33, Andrew Cooper <andrew.cooper3@citrix.com> wrote: > On 14/05/12 13:55, Jan Beulich wrote: >>>>> On 14.05.12 at 14:39, "Jan Beulich" <JBeulich@suse.com> wrote: >>> The debugging code added in c/s 24707:96987c324a4f was hit a (small) >>> number of times (one report being >>> http://lists.xen.org/archives/html/xen-devel/2012-05/msg00332.html), >>> apparently always with a vector within the legacy range. Obviously, >>> besides legacy vectors not normally expected to be in use on systems >>> with IO-APIC(s), they should never make it to the IRQ migration logic. >>> >>> This wasn''t being prevented so far: Since we don''t have a one-to-one >>> mapping between vectors and IRQs - legacy IRQs may have two vectors >>> associated with them (one used in either 8259A, the other used in one >>> of the IO-APICs) -, vector-to-IRQ translations for legacy vectors (as >>> used in do_IRQ()) would yield a valid IRQ number despite the IRQ >>> really being handled via an IO-APIC. >>> >>> This gets changed here - disable_8259A_irq() zaps the legacy vector-to- >>> IRQ mapping, and enable_8259A_irq(), should it ever be called for a >>> particular interrupts, restores it. >>> >>> Additionally, the spurious interrupt logic in do_IRQ() gets adjusted >>> too: Interrupts coming in via legacy vectors obviously didn''t get >>> reported through the IO-APIC/LAPIC pair (as we never program these >>> vectors into any RTE), and hence shouldn''t get ack_APIC_irq() called on >>> them. Instead, a new function (pointer) bogus_8259A_irq() gets used to >>> have the 8259A driver take care of the bogus interrupt (as outside of >>> automatice EOI mode it may need an EOI to be issued for it to prevent >>> other interrupts that may legitimately go through the 8259As from >>> getting masked out). >> Note that this patch does not make any attempt at dealing with the >> underlying issue that causes the bogus interrupt(s) to show up. If >> my analysis is right, we shouldn''t see crashes anymore, but instead >> observe instances of spurious interrupts on legacy vectors. It would >> certainly be nice to have an actual proof of this (albeit I realize that >> this isn''t readily reproducible), in order to then - if indeed behaving >> as expected - add debugging code to identify whether such interrupts >> in fact get raised by one of the 8259A-s (particularly printing the >> cached and physical mask register values), or whether they get >> introduced into the system by yet another obscure mechanism. >> >> One particular thing I''m suspicious about are the numerous aliases >> to the two (each) 8259A I/O ports that various chipsets have: What >> if some component in Dom0 accesses one of the alias ports in order >> to do something specific to a non-standard platform (say, probe for >> some special hardware interface), not realizing that it actually plays >> with PIC state? Linux under the same conditions wouldn''t severely >> suffer - as it has a 1:1 vector <-> IRQ translation, it likely would >> merely observe an extra interrupt. > > On the whole, the patch looks sensible, but what happens if the spurious > interrupt is coming in through the Local APIC ? If this is the case, > then we still need to ACK it, even if it is a bogus PIC interrupt. > > Perhaps in irq.c, the changes should check whether the observed vector > has been raised in the LAPIC and ack it, and then decide whether it is > bogus or not.Should that really turn out to be the case, we''re in much bigger trouble, as then we need an explanation how an interrupt at that vector could have got raised in the first place. I''d therefore like to keep the current change deal only with things that we know can happen.> Might it also be sensible to remove dom0''s permissions to use the PIC > ports, in case it is some weird issue like that?That''s already being done iirc. The problem is that it''s non-trivial (and perhaps non-reliable) to determine the aliases, and hence we can''t blindly remove more than the two real ports from Dom0''s permitted set. Jan
Andrew Cooper
2012-May-14 14:38 UTC
Re: [PATCH] x86: adjust handling of interrupts coming in via legacy vectors
On 14/05/12 15:28, Jan Beulich wrote:>>>> On 14.05.12 at 15:33, Andrew Cooper <andrew.cooper3@citrix.com> wrote: >> On 14/05/12 13:55, Jan Beulich wrote: >>>>>> On 14.05.12 at 14:39, "Jan Beulich" <JBeulich@suse.com> wrote: >>>> The debugging code added in c/s 24707:96987c324a4f was hit a (small) >>>> number of times (one report being >>>> http://lists.xen.org/archives/html/xen-devel/2012-05/msg00332.html), >>>> apparently always with a vector within the legacy range. Obviously, >>>> besides legacy vectors not normally expected to be in use on systems >>>> with IO-APIC(s), they should never make it to the IRQ migration logic. >>>> >>>> This wasn''t being prevented so far: Since we don''t have a one-to-one >>>> mapping between vectors and IRQs - legacy IRQs may have two vectors >>>> associated with them (one used in either 8259A, the other used in one >>>> of the IO-APICs) -, vector-to-IRQ translations for legacy vectors (as >>>> used in do_IRQ()) would yield a valid IRQ number despite the IRQ >>>> really being handled via an IO-APIC. >>>> >>>> This gets changed here - disable_8259A_irq() zaps the legacy vector-to- >>>> IRQ mapping, and enable_8259A_irq(), should it ever be called for a >>>> particular interrupts, restores it. >>>> >>>> Additionally, the spurious interrupt logic in do_IRQ() gets adjusted >>>> too: Interrupts coming in via legacy vectors obviously didn''t get >>>> reported through the IO-APIC/LAPIC pair (as we never program these >>>> vectors into any RTE), and hence shouldn''t get ack_APIC_irq() called on >>>> them. Instead, a new function (pointer) bogus_8259A_irq() gets used to >>>> have the 8259A driver take care of the bogus interrupt (as outside of >>>> automatice EOI mode it may need an EOI to be issued for it to prevent >>>> other interrupts that may legitimately go through the 8259As from >>>> getting masked out). >>> Note that this patch does not make any attempt at dealing with the >>> underlying issue that causes the bogus interrupt(s) to show up. If >>> my analysis is right, we shouldn''t see crashes anymore, but instead >>> observe instances of spurious interrupts on legacy vectors. It would >>> certainly be nice to have an actual proof of this (albeit I realize that >>> this isn''t readily reproducible), in order to then - if indeed behaving >>> as expected - add debugging code to identify whether such interrupts >>> in fact get raised by one of the 8259A-s (particularly printing the >>> cached and physical mask register values), or whether they get >>> introduced into the system by yet another obscure mechanism. >>> >>> One particular thing I''m suspicious about are the numerous aliases >>> to the two (each) 8259A I/O ports that various chipsets have: What >>> if some component in Dom0 accesses one of the alias ports in order >>> to do something specific to a non-standard platform (say, probe for >>> some special hardware interface), not realizing that it actually plays >>> with PIC state? Linux under the same conditions wouldn''t severely >>> suffer - as it has a 1:1 vector <-> IRQ translation, it likely would >>> merely observe an extra interrupt. >> On the whole, the patch looks sensible, but what happens if the spurious >> interrupt is coming in through the Local APIC ? If this is the case, >> then we still need to ACK it, even if it is a bogus PIC interrupt. >> >> Perhaps in irq.c, the changes should check whether the observed vector >> has been raised in the LAPIC and ack it, and then decide whether it is >> bogus or not. > Should that really turn out to be the case, we''re in much bigger trouble, > as then we need an explanation how an interrupt at that vector could > have got raised in the first place. I''d therefore like to keep the current > change deal only with things that we know can happen.We would be in huge trouble. As it currently stands, I am not certain that we can be sure that this is not happening. As a concession, perhaps a test of the LAPIC IIR, and an obvious error to the console? It would be be more useful than having Xen crash/hang due to no longer always ack''ing the LAPIC.> >> Might it also be sensible to remove dom0''s permissions to use the PIC >> ports, in case it is some weird issue like that? > That''s already being done iirc. The problem is that it''s non-trivial (and > perhaps non-reliable) to determine the aliases, and hence we can''t > blindly remove more than the two real ports from Dom0''s permitted > set. > > JanAh yes - in which case its not feasible. -- Andrew Cooper - Dom0 Kernel Engineer, Citrix XenServer T: +44 (0)1223 225 900, http://www.citrix.com
Keir Fraser
2012-May-14 15:35 UTC
Re: x86: adjust handling of interrupts coming in via legacy vectors
On 14/05/2012 13:39, "Jan Beulich" <JBeulich@suse.com> wrote:> The debugging code added in c/s 24707:96987c324a4f was hit a (small) > number of times (one report being > http://lists.xen.org/archives/html/xen-devel/2012-05/msg00332.html), > apparently always with a vector within the legacy range. Obviously, > besides legacy vectors not normally expected to be in use on systems > with IO-APIC(s), they should never make it to the IRQ migration logic. > > This wasn''t being prevented so far: Since we don''t have a one-to-one > mapping between vectors and IRQs - legacy IRQs may have two vectors > associated with them (one used in either 8259A, the other used in one > of the IO-APICs) -, vector-to-IRQ translations for legacy vectors (as > used in do_IRQ()) would yield a valid IRQ number despite the IRQ > really being handled via an IO-APIC. > > This gets changed here - disable_8259A_irq() zaps the legacy vector-to- > IRQ mapping, and enable_8259A_irq(), should it ever be called for a > particular interrupts, restores it. > > Additionally, the spurious interrupt logic in do_IRQ() gets adjusted > too: Interrupts coming in via legacy vectors obviously didn''t get > reported through the IO-APIC/LAPIC pair (as we never program these > vectors into any RTE), and hence shouldn''t get ack_APIC_irq() called on > them. Instead, a new function (pointer) bogus_8259A_irq() gets used to > have the 8259A driver take care of the bogus interrupt (as outside of > automatice EOI mode it may need an EOI to be issued for it to prevent > other interrupts that may legitimately go through the 8259As from > getting masked out). > > Signed-off-by: Jan Beulich <jbeulich@suse.com>Looks sensible, and I suppose good to have for 4.2. Acked-by: Keir Fraser <keir@xen.org>> --- a/xen/arch/x86/i8259.c > +++ b/xen/arch/x86/i8259.c > @@ -85,7 +85,15 @@ BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BU > > static DEFINE_SPINLOCK(i8259A_lock); > > -static void mask_and_ack_8259A_irq(struct irq_desc *); > +static void _mask_and_ack_8259A_irq(unsigned int irq); > + > +void (*__read_mostly bogus_8259A_irq)(unsigned int irq) > + _mask_and_ack_8259A_irq; > + > +static void mask_and_ack_8259A_irq(struct irq_desc *desc) > +{ > + _mask_and_ack_8259A_irq(desc->irq); > +} > > static unsigned int startup_8259A_irq(struct irq_desc *desc) > { > @@ -133,20 +141,26 @@ static unsigned int cached_irq_mask = 0x > */ > unsigned int __read_mostly io_apic_irqs; > > -void disable_8259A_irq(struct irq_desc *desc) > +static void _disable_8259A_irq(unsigned int irq) > { > - unsigned int mask = 1 << desc->irq; > + unsigned int mask = 1 << irq; > unsigned long flags; > > spin_lock_irqsave(&i8259A_lock, flags); > cached_irq_mask |= mask; > - if (desc->irq & 8) > + if (irq & 8) > outb(cached_A1,0xA1); > else > outb(cached_21,0x21); > + per_cpu(vector_irq, 0)[LEGACY_VECTOR(irq)] = -1; > spin_unlock_irqrestore(&i8259A_lock, flags); > } > > +void disable_8259A_irq(struct irq_desc *desc) > +{ > + _disable_8259A_irq(desc->irq); > +} > + > void enable_8259A_irq(struct irq_desc *desc) > { > unsigned int mask = ~(1 << desc->irq); > @@ -154,6 +168,7 @@ void enable_8259A_irq(struct irq_desc *d > > spin_lock_irqsave(&i8259A_lock, flags); > cached_irq_mask &= mask; > + per_cpu(vector_irq, 0)[LEGACY_VECTOR(desc->irq)] = desc->irq; > if (desc->irq & 8) > outb(cached_A1,0xA1); > else > @@ -226,9 +241,9 @@ static inline int i8259A_irq_real(unsign > * first, _then_ send the EOI, and the order of EOI > * to the two 8259s is important! > */ > -static void mask_and_ack_8259A_irq(struct irq_desc *desc) > +static void _mask_and_ack_8259A_irq(unsigned int irq) > { > - unsigned int irqmask = 1 << desc->irq; > + unsigned int irqmask = 1 << irq; > unsigned long flags; > > spin_lock_irqsave(&i8259A_lock, flags); > @@ -252,15 +267,15 @@ static void mask_and_ack_8259A_irq(struc > cached_irq_mask |= irqmask; > > handle_real_irq: > - if (desc->irq & 8) { > + if (irq & 8) { > inb(0xA1); /* DUMMY - (do we need this?) */ > outb(cached_A1,0xA1); > - outb(0x60 + (desc->irq & 7), 0xA0);/* ''Specific EOI'' to slave */ > + outb(0x60 + (irq & 7), 0xA0);/* ''Specific EOI'' to slave */ > outb(0x62,0x20); /* ''Specific EOI'' to master-IRQ2 */ > } else { > inb(0x21); /* DUMMY - (do we need this?) */ > outb(cached_21,0x21); > - outb(0x60 + desc->irq, 0x20);/* ''Specific EOI'' to master */ > + outb(0x60 + irq, 0x20);/* ''Specific EOI'' to master */ > } > spin_unlock_irqrestore(&i8259A_lock, flags); > return; > @@ -269,7 +284,7 @@ static void mask_and_ack_8259A_irq(struc > /* > * this is the slow path - should happen rarely. > */ > - if (i8259A_irq_real(desc->irq)) > + if (i8259A_irq_real(irq)) > /* > * oops, the IRQ _is_ in service according to the > * 8259A - not spurious, go handle it. > @@ -283,7 +298,7 @@ static void mask_and_ack_8259A_irq(struc > * lets ACK and report it. [once per IRQ] > */ > if (!(spurious_irq_mask & irqmask)) { > - printk("spurious 8259A interrupt: IRQ%d.\n", desc->irq); > + printk("spurious 8259A interrupt: IRQ%d.\n", irq); > spurious_irq_mask |= irqmask; > } > /* > @@ -352,13 +367,19 @@ void __devinit init_8259A(int auto_eoi) > is to be investigated) */ > > if (auto_eoi) > + { > /* > * in AEOI mode we just have to mask the interrupt > * when acking. > */ > i8259A_irq_type.ack = disable_8259A_irq; > + bogus_8259A_irq = _disable_8259A_irq; > + } > else > + { > i8259A_irq_type.ack = mask_and_ack_8259A_irq; > + bogus_8259A_irq = _mask_and_ack_8259A_irq; > + } > > udelay(100); /* wait for 8259A to initialize */ > > --- a/xen/arch/x86/irq.c > +++ b/xen/arch/x86/irq.c > @@ -811,9 +811,12 @@ void do_IRQ(struct cpu_user_regs *regs) > if (direct_apic_vector[vector] != NULL) { > (*direct_apic_vector[vector])(regs); > } else { > - ack_APIC_irq(); > - printk("%s: %d.%d No irq handler for vector (irq %d)\n", > - __func__, smp_processor_id(), vector, irq); > + if ( vector < FIRST_LEGACY_VECTOR || vector > LAST_LEGACY_VECTOR > ) > + ack_APIC_irq(); > + else > + bogus_8259A_irq(vector - FIRST_LEGACY_VECTOR); > + printk("CPU%u: No handler for vector %02x (IRQ %d)\n", > + smp_processor_id(), vector, irq); > TRACE_1D(TRC_HW_IRQ_UNMAPPED_VECTOR, vector); > } > goto out_no_unlock; > --- a/xen/include/asm-x86/irq.h > +++ b/xen/include/asm-x86/irq.h > @@ -104,6 +104,7 @@ void mask_8259A(void); > void unmask_8259A(void); > void init_8259A(int aeoi); > void make_8259A_irq(unsigned int irq); > +extern void (*bogus_8259A_irq)(unsigned int irq); > int i8259A_suspend(void); > int i8259A_resume(void); > > > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xen.org > http://lists.xen.org/xen-devel
Jan Beulich
2012-May-14 15:39 UTC
Re: [PATCH] x86: adjust handling of interrupts coming in via legacy vectors
>>> On 14.05.12 at 16:38, Andrew Cooper <andrew.cooper3@citrix.com> wrote: > On 14/05/12 15:28, Jan Beulich wrote: >>>>> On 14.05.12 at 15:33, Andrew Cooper <andrew.cooper3@citrix.com> wrote: >>> On 14/05/12 13:55, Jan Beulich wrote: >>>>>>> On 14.05.12 at 14:39, "Jan Beulich" <JBeulich@suse.com> wrote: >>>>> The debugging code added in c/s 24707:96987c324a4f was hit a (small) >>>>> number of times (one report being >>>>> http://lists.xen.org/archives/html/xen-devel/2012-05/msg00332.html), >>>>> apparently always with a vector within the legacy range. Obviously, >>>>> besides legacy vectors not normally expected to be in use on systems >>>>> with IO-APIC(s), they should never make it to the IRQ migration logic. >>>>> >>>>> This wasn''t being prevented so far: Since we don''t have a one-to-one >>>>> mapping between vectors and IRQs - legacy IRQs may have two vectors >>>>> associated with them (one used in either 8259A, the other used in one >>>>> of the IO-APICs) -, vector-to-IRQ translations for legacy vectors (as >>>>> used in do_IRQ()) would yield a valid IRQ number despite the IRQ >>>>> really being handled via an IO-APIC. >>>>> >>>>> This gets changed here - disable_8259A_irq() zaps the legacy vector-to- >>>>> IRQ mapping, and enable_8259A_irq(), should it ever be called for a >>>>> particular interrupts, restores it. >>>>> >>>>> Additionally, the spurious interrupt logic in do_IRQ() gets adjusted >>>>> too: Interrupts coming in via legacy vectors obviously didn''t get >>>>> reported through the IO-APIC/LAPIC pair (as we never program these >>>>> vectors into any RTE), and hence shouldn''t get ack_APIC_irq() called on >>>>> them. Instead, a new function (pointer) bogus_8259A_irq() gets used to >>>>> have the 8259A driver take care of the bogus interrupt (as outside of >>>>> automatice EOI mode it may need an EOI to be issued for it to prevent >>>>> other interrupts that may legitimately go through the 8259As from >>>>> getting masked out). >>>> Note that this patch does not make any attempt at dealing with the >>>> underlying issue that causes the bogus interrupt(s) to show up. If >>>> my analysis is right, we shouldn''t see crashes anymore, but instead >>>> observe instances of spurious interrupts on legacy vectors. It would >>>> certainly be nice to have an actual proof of this (albeit I realize that >>>> this isn''t readily reproducible), in order to then - if indeed behaving >>>> as expected - add debugging code to identify whether such interrupts >>>> in fact get raised by one of the 8259A-s (particularly printing the >>>> cached and physical mask register values), or whether they get >>>> introduced into the system by yet another obscure mechanism. >>>> >>>> One particular thing I''m suspicious about are the numerous aliases >>>> to the two (each) 8259A I/O ports that various chipsets have: What >>>> if some component in Dom0 accesses one of the alias ports in order >>>> to do something specific to a non-standard platform (say, probe for >>>> some special hardware interface), not realizing that it actually plays >>>> with PIC state? Linux under the same conditions wouldn''t severely >>>> suffer - as it has a 1:1 vector <-> IRQ translation, it likely would >>>> merely observe an extra interrupt. >>> On the whole, the patch looks sensible, but what happens if the spurious >>> interrupt is coming in through the Local APIC ? If this is the case, >>> then we still need to ACK it, even if it is a bogus PIC interrupt. >>> >>> Perhaps in irq.c, the changes should check whether the observed vector >>> has been raised in the LAPIC and ack it, and then decide whether it is >>> bogus or not. >> Should that really turn out to be the case, we''re in much bigger trouble, >> as then we need an explanation how an interrupt at that vector could >> have got raised in the first place. I''d therefore like to keep the current >> change deal only with things that we know can happen. > > We would be in huge trouble. As it currently stands, I am not certain > that we can be sure that this is not happening. > > As a concession, perhaps a test of the LAPIC IIR, and an obvious error > to the console? It would be be more useful than having Xen crash/hang > due to no longer always ack''ing the LAPIC.Okay, let''s do both then (check LAPIC and 8259A). I''ll send an updated patch soon. Jan
Jan Beulich
2012-May-14 15:56 UTC
Re: x86: adjust handling of interrupts coming in via legacy vectors
>>> On 14.05.12 at 17:35, Keir Fraser <keir.xen@gmail.com> wrote: > On 14/05/2012 13:39, "Jan Beulich" <JBeulich@suse.com> wrote: > >> The debugging code added in c/s 24707:96987c324a4f was hit a (small) >> number of times (one report being >> http://lists.xen.org/archives/html/xen-devel/2012-05/msg00332.html), >> apparently always with a vector within the legacy range. Obviously, >> besides legacy vectors not normally expected to be in use on systems >> with IO-APIC(s), they should never make it to the IRQ migration logic. >> >> This wasn''t being prevented so far: Since we don''t have a one-to-one >> mapping between vectors and IRQs - legacy IRQs may have two vectors >> associated with them (one used in either 8259A, the other used in one >> of the IO-APICs) -, vector-to-IRQ translations for legacy vectors (as >> used in do_IRQ()) would yield a valid IRQ number despite the IRQ >> really being handled via an IO-APIC. >> >> This gets changed here - disable_8259A_irq() zaps the legacy vector-to- >> IRQ mapping, and enable_8259A_irq(), should it ever be called for a >> particular interrupts, restores it. >> >> Additionally, the spurious interrupt logic in do_IRQ() gets adjusted >> too: Interrupts coming in via legacy vectors obviously didn''t get >> reported through the IO-APIC/LAPIC pair (as we never program these >> vectors into any RTE), and hence shouldn''t get ack_APIC_irq() called on >> them. Instead, a new function (pointer) bogus_8259A_irq() gets used to >> have the 8259A driver take care of the bogus interrupt (as outside of >> automatice EOI mode it may need an EOI to be issued for it to prevent >> other interrupts that may legitimately go through the 8259As from >> getting masked out). >> >> Signed-off-by: Jan Beulich <jbeulich@suse.com> > > Looks sensible, and I suppose good to have for 4.2. > > Acked-by: Keir Fraser <keir@xen.org>Please take a look at the v2 I just sent, to accommodate a suggestion from Andrew Cooper. Jan
Keir Fraser
2012-May-14 16:24 UTC
Re: x86: adjust handling of interrupts coming in via legacy vectors
On 14/05/2012 16:56, "Jan Beulich" <JBeulich@suse.com> wrote:>> Looks sensible, and I suppose good to have for 4.2. >> >> Acked-by: Keir Fraser <keir@xen.org> > > Please take a look at the v2 I just sent, to accommodate a suggestion > from Andrew Cooper.I think it''s very paranoid, since legacy vectors never get programmed into an IOAPIC RTE and should never need EOIing at the local APIC. But you do at least printk the case that we see the ISR bit set, and you printk the vector number, so really this v2 patch gives us more information about this bogus situation than v1 did, so it''s a slight improvement overall. So you still have my Ack. -- Keir
Jan Beulich
2012-May-15 06:43 UTC
Re: x86: adjust handling of interrupts coming in via legacy vectors
>>> On 14.05.12 at 18:24, Keir Fraser <keir@xen.org> wrote: > On 14/05/2012 16:56, "Jan Beulich" <JBeulich@suse.com> wrote: > >>> Looks sensible, and I suppose good to have for 4.2. >>> >>> Acked-by: Keir Fraser <keir@xen.org> >> >> Please take a look at the v2 I just sent, to accommodate a suggestion >> from Andrew Cooper. > > I think it''s very paranoid, since legacy vectors never get programmed into > an IOAPIC RTE and should never need EOIing at the local APIC. But you do at > least printk the case that we see the ISR bit set, and you printk the vector > number, so really this v2 patch gives us more information about this bogus > situation than v1 did, so it''s a slight improvement overall. So you still > have my Ack.It indeed is paranoid (which is why I didn''t do so in v1), but Andrew certainly has a point in saying that something so far unexplainable going on makes it desirable to cover as many (however remotely) potential causes as possible. (I still consider double delivery through IO-APIC and PIC the most likely scenario, despite not having a reasonably explanation on how the PIC mask bit could get cleared.) Once we hopefully understand the hole situation, the code here should likely be reverted to the v1 version (along with removing the other debugging code). Jan
AP
2012-May-15 08:03 UTC
Re: x86: adjust handling of interrupts coming in via legacy vectors
On Mon, May 14, 2012 at 11:43 PM, Jan Beulich <JBeulich@suse.com> wrote:> > >>> On 14.05.12 at 18:24, Keir Fraser <keir@xen.org> wrote: > > On 14/05/2012 16:56, "Jan Beulich" <JBeulich@suse.com> wrote: > > > >>> Looks sensible, and I suppose good to have for 4.2. > >>> > >>> Acked-by: Keir Fraser <keir@xen.org> > >> > >> Please take a look at the v2 I just sent, to accommodate a suggestion > >> from Andrew Cooper. > > > > I think it''s very paranoid, since legacy vectors never get programmed > > into > > an IOAPIC RTE and should never need EOIing at the local APIC. But you do > > at > > least printk the case that we see the ISR bit set, and you printk the > > vector > > number, so really this v2 patch gives us more information about this > > bogus > > situation than v1 did, so it''s a slight improvement overall. So you > > still > > have my Ack. > > It indeed is paranoid (which is why I didn''t do so in v1), but Andrew > certainly has a point in saying that something so far unexplainable > going on makes it desirable to cover as many (however remotely) > potential causes as possible. (I still consider double delivery through > IO-APIC and PIC the most likely scenario, despite not having a > reasonably explanation on how the PIC mask bit could get cleared.) > > Once we hopefully understand the hole situation, the code here > should likely be reverted to the v1 version (along with removing the > other debugging code).Once this patch goes in, do I need to still run with the patch Andrew provided in http://lists.xen.org/archives/html/xen-devel/2012-05/msg00332.html for the debugging code? Thanks, AP
Jan Beulich
2012-May-15 08:22 UTC
Re: x86: adjust handling of interrupts coming in via legacy vectors
>>> On 15.05.12 at 10:03, AP <apxeng@gmail.com> wrote: > On Mon, May 14, 2012 at 11:43 PM, Jan Beulich <JBeulich@suse.com> wrote: >> >> >>> On 14.05.12 at 18:24, Keir Fraser <keir@xen.org> wrote: >> > On 14/05/2012 16:56, "Jan Beulich" <JBeulich@suse.com> wrote: >> > >> >>> Looks sensible, and I suppose good to have for 4.2. >> >>> >> >>> Acked-by: Keir Fraser <keir@xen.org> >> >> >> >> Please take a look at the v2 I just sent, to accommodate a suggestion >> >> from Andrew Cooper. >> > >> > I think it''s very paranoid, since legacy vectors never get programmed >> > into >> > an IOAPIC RTE and should never need EOIing at the local APIC. But you do >> > at >> > least printk the case that we see the ISR bit set, and you printk the >> > vector >> > number, so really this v2 patch gives us more information about this >> > bogus >> > situation than v1 did, so it''s a slight improvement overall. So you >> > still >> > have my Ack. >> >> It indeed is paranoid (which is why I didn''t do so in v1), but Andrew >> certainly has a point in saying that something so far unexplainable >> going on makes it desirable to cover as many (however remotely) >> potential causes as possible. (I still consider double delivery through >> IO-APIC and PIC the most likely scenario, despite not having a >> reasonably explanation on how the PIC mask bit could get cleared.) >> >> Once we hopefully understand the hole situation, the code here >> should likely be reverted to the v1 version (along with removing the >> other debugging code). > > Once this patch goes in, do I need to still run with the patch Andrew > provided in http://lists.xen.org/archives/html/xen-devel/2012-05/msg00332.html > for the debugging code?Yes, that change is still going to be necessary. Probably worth committing too (perhaps with its second hunk annotated with a comment), which I believe didn''t happen because it wasn''t really submitted for that purpose. Andrew, Keir? Or would we be better off simply allowing xfree(NULL) in IRQ context, by swapping the in_irq() and NULL checks in the function)? I''d favor this, despite the small risk of it hiding latent bugs. Jan
Andrew Cooper
2012-May-15 08:52 UTC
Re: x86: adjust handling of interrupts coming in via legacy vectors
On 15/05/12 09:22, Jan Beulich wrote:>>>> On 15.05.12 at 10:03, AP <apxeng@gmail.com> wrote: >> On Mon, May 14, 2012 at 11:43 PM, Jan Beulich <JBeulich@suse.com> wrote: >>>>>> On 14.05.12 at 18:24, Keir Fraser <keir@xen.org> wrote: >>>> On 14/05/2012 16:56, "Jan Beulich" <JBeulich@suse.com> wrote: >>>> >>>>>> Looks sensible, and I suppose good to have for 4.2. >>>>>> >>>>>> Acked-by: Keir Fraser <keir@xen.org> >>>>> Please take a look at the v2 I just sent, to accommodate a suggestion >>>>> from Andrew Cooper. >>>> I think it''s very paranoid, since legacy vectors never get programmed >>>> into >>>> an IOAPIC RTE and should never need EOIing at the local APIC. But you do >>>> at >>>> least printk the case that we see the ISR bit set, and you printk the >>>> vector >>>> number, so really this v2 patch gives us more information about this >>>> bogus >>>> situation than v1 did, so it''s a slight improvement overall. So you >>>> still >>>> have my Ack. >>> It indeed is paranoid (which is why I didn''t do so in v1), but Andrew >>> certainly has a point in saying that something so far unexplainable >>> going on makes it desirable to cover as many (however remotely) >>> potential causes as possible. (I still consider double delivery through >>> IO-APIC and PIC the most likely scenario, despite not having a >>> reasonably explanation on how the PIC mask bit could get cleared.) >>> >>> Once we hopefully understand the hole situation, the code here >>> should likely be reverted to the v1 version (along with removing the >>> other debugging code). >> Once this patch goes in, do I need to still run with the patch Andrew >> provided in http://lists.xen.org/archives/html/xen-devel/2012-05/msg00332.html >> for the debugging code? > Yes, that change is still going to be necessary. Probably worth > committing too (perhaps with its second hunk annotated with a > comment), which I believe didn''t happen because it wasn''t really > submitted for that purpose. Andrew, Keir? > > Or would we be better off simply allowing xfree(NULL) in IRQ > context, by swapping the in_irq() and NULL checks in the > function)? I''d favor this, despite the small risk of it hiding > latent bugs. > > JanThe patch should probably be committed in the same vein as the other debugging patches -- Andrew Cooper - Dom0 Kernel Engineer, Citrix XenServer T: +44 (0)1223 225 900, http://www.citrix.com