c/s 22949:54fe1011f86b removed the forwarding of NMIs to Dom0 when they were caused by PCI SERR. NMI buttons as well as BMCs (like HP''s iLO) may however want such events to be seen in Dom0 (e.g. to trigger a dump). I''m therefore suggesting to restore most of the functionality which named c/s removed (adjusted for subsequent changes, and adjusting the public interface to use the modern term, retaining the old one for backwards compatibility). Jan --- a/xen/arch/x86/traps.c +++ b/xen/arch/x86/traps.c @@ -3089,6 +3089,7 @@ static void nmi_mce_softirq(void) static void pci_serr_softirq(void) { printk("\n\nNMI - PCI system error (SERR)\n"); + outb(inb(0x61) & 0x0b, 0x61); /* re-enable the PCI SERR error line. */ } void async_exception_cleanup(struct vcpu *curr) @@ -3135,9 +3136,20 @@ static void pci_serr_error(struct cpu_us { outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable the PCI SERR error line. */ - /* Would like to print a diagnostic here but can''t call printk() - from NMI context -- raise a softirq instead. */ - raise_softirq(PCI_SERR_SOFTIRQ); + switch ( opt_nmi[0] ) + { + case ''d'': /* ''dom0'' */ + nmi_dom0_report(_XEN_NMIREASON_pci_serr); + case ''i'': /* ''ignore'' */ + /* Would like to print a diagnostic here but can''t call printk() + from NMI context -- raise a softirq instead. */ + raise_softirq(PCI_SERR_SOFTIRQ); + break; + default: /* ''fatal'' */ + console_force_unlock(); + printk("\n\nNMI - MEMORY ERROR\n"); + fatal_trap(TRAP_nmi, regs); + } } static void io_check_error(struct cpu_user_regs *regs) --- a/xen/include/public/nmi.h +++ b/xen/include/public/nmi.h @@ -36,9 +36,14 @@ /* I/O-check error reported via ISA port 0x61, bit 6. */ #define _XEN_NMIREASON_io_error 0 #define XEN_NMIREASON_io_error (1UL << _XEN_NMIREASON_io_error) + /* PCI SERR reported via ISA port 0x61, bit 7. */ +#define _XEN_NMIREASON_pci_serr 1 +#define XEN_NMIREASON_pci_serr (1UL << _XEN_NMIREASON_pci_serr) +#if __XEN_INTERFACE_VERSION__ < 0x00040300 /* legacy alias of the above */ /* Parity error reported via ISA port 0x61, bit 7. */ #define _XEN_NMIREASON_parity_error 1 #define XEN_NMIREASON_parity_error (1UL << _XEN_NMIREASON_parity_error) +#endif /* Unknown hardware-generated NMI. */ #define _XEN_NMIREASON_unknown 2 #define XEN_NMIREASON_unknown (1UL << _XEN_NMIREASON_unknown)
Stefano Stabellini
2013-Jan-17 11:26 UTC
Re: [RFC] x86: restore PCI SERR NMI forwarding to Dom0
On Thu, 17 Jan 2013, Jan Beulich wrote:> c/s 22949:54fe1011f86b removed the forwarding of NMIs to Dom0 when they > were caused by PCI SERR. NMI buttons as well as BMCs (like HP''s iLO) > may however want such events to be seen in Dom0 (e.g. to trigger a > dump). > > I''m therefore suggesting to restore most of the functionality which > named c/s removed (adjusted for subsequent changes, and adjusting the > public interface to use the modern term, retaining the old one for > backwards compatibility).Allowing the possibility of injecting these events in Dom0 might be a good idea (dom0 just ignores PCI SERR anyway if I recall correctly). However the default for opt_nmi in Xen is "fatal", so if I am not mistaken you are changing the default Xen behaviour from ignore PCI SERR to break down on PCI SERR. I don''t think that''s a good idea.> --- a/xen/arch/x86/traps.c > +++ b/xen/arch/x86/traps.c > @@ -3089,6 +3089,7 @@ static void nmi_mce_softirq(void) > static void pci_serr_softirq(void) > { > printk("\n\nNMI - PCI system error (SERR)\n"); > + outb(inb(0x61) & 0x0b, 0x61); /* re-enable the PCI SERR error line. */ > } > > void async_exception_cleanup(struct vcpu *curr) > @@ -3135,9 +3136,20 @@ static void pci_serr_error(struct cpu_us > { > outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable the PCI SERR error line. */ > > - /* Would like to print a diagnostic here but can''t call printk() > - from NMI context -- raise a softirq instead. */ > - raise_softirq(PCI_SERR_SOFTIRQ); > + switch ( opt_nmi[0] ) > + { > + case ''d'': /* ''dom0'' */ > + nmi_dom0_report(_XEN_NMIREASON_pci_serr); > + case ''i'': /* ''ignore'' */ > + /* Would like to print a diagnostic here but can''t call printk() > + from NMI context -- raise a softirq instead. */ > + raise_softirq(PCI_SERR_SOFTIRQ); > + break; > + default: /* ''fatal'' */ > + console_force_unlock(); > + printk("\n\nNMI - MEMORY ERROR\n"); > + fatal_trap(TRAP_nmi, regs); > + } > } > > static void io_check_error(struct cpu_user_regs *regs) > --- a/xen/include/public/nmi.h > +++ b/xen/include/public/nmi.h > @@ -36,9 +36,14 @@ > /* I/O-check error reported via ISA port 0x61, bit 6. */ > #define _XEN_NMIREASON_io_error 0 > #define XEN_NMIREASON_io_error (1UL << _XEN_NMIREASON_io_error) > + /* PCI SERR reported via ISA port 0x61, bit 7. */ > +#define _XEN_NMIREASON_pci_serr 1 > +#define XEN_NMIREASON_pci_serr (1UL << _XEN_NMIREASON_pci_serr) > +#if __XEN_INTERFACE_VERSION__ < 0x00040300 /* legacy alias of the above */ > /* Parity error reported via ISA port 0x61, bit 7. */ > #define _XEN_NMIREASON_parity_error 1 > #define XEN_NMIREASON_parity_error (1UL << _XEN_NMIREASON_parity_error) > +#endif > /* Unknown hardware-generated NMI. */ > #define _XEN_NMIREASON_unknown 2 > #define XEN_NMIREASON_unknown (1UL << _XEN_NMIREASON_unknown) > >
>>> On 17.01.13 at 12:26, Stefano Stabellini <stefano.stabellini@eu.citrix.com>wrote:> On Thu, 17 Jan 2013, Jan Beulich wrote: >> c/s 22949:54fe1011f86b removed the forwarding of NMIs to Dom0 when they >> were caused by PCI SERR. NMI buttons as well as BMCs (like HP''s iLO) >> may however want such events to be seen in Dom0 (e.g. to trigger a >> dump). >> >> I''m therefore suggesting to restore most of the functionality which >> named c/s removed (adjusted for subsequent changes, and adjusting the >> public interface to use the modern term, retaining the old one for >> backwards compatibility). > > Allowing the possibility of injecting these events in Dom0 might be a > good idea (dom0 just ignores PCI SERR anyway if I recall correctly). > However the default for opt_nmi in Xen is "fatal", so if I am not > mistaken you are changing the default Xen behaviour from ignore PCI > SERR to break down on PCI SERR. I don''t think that''s a good idea.#ifdef NDEBUG static char __read_mostly opt_nmi[10] = "dom0"; #else static char __read_mostly opt_nmi[10] = "fatal"; #endif I.e. debug builds would die, but "normal" builds wouldn''t. If the former is a problem, it''s so not only for PCI SERR imo (and would need fixing here rather than in the patch I sent). Jan
Stefano Stabellini
2013-Jan-21 15:51 UTC
Re: [RFC] x86: restore PCI SERR NMI forwarding to Dom0
On Thu, 17 Jan 2013, Jan Beulich wrote:> >>> On 17.01.13 at 12:26, Stefano Stabellini <stefano.stabellini@eu.citrix.com> > wrote: > > On Thu, 17 Jan 2013, Jan Beulich wrote: > >> c/s 22949:54fe1011f86b removed the forwarding of NMIs to Dom0 when they > >> were caused by PCI SERR. NMI buttons as well as BMCs (like HP''s iLO) > >> may however want such events to be seen in Dom0 (e.g. to trigger a > >> dump). > >> > >> I''m therefore suggesting to restore most of the functionality which > >> named c/s removed (adjusted for subsequent changes, and adjusting the > >> public interface to use the modern term, retaining the old one for > >> backwards compatibility). > > > > Allowing the possibility of injecting these events in Dom0 might be a > > good idea (dom0 just ignores PCI SERR anyway if I recall correctly). > > However the default for opt_nmi in Xen is "fatal", so if I am not > > mistaken you are changing the default Xen behaviour from ignore PCI > > SERR to break down on PCI SERR. I don''t think that''s a good idea. > > #ifdef NDEBUG > static char __read_mostly opt_nmi[10] = "dom0"; > #else > static char __read_mostly opt_nmi[10] = "fatal"; > #endif > > I.e. debug builds would die, but "normal" builds wouldn''t. If the > former is a problem, it''s so not only for PCI SERR imo (and would > need fixing here rather than in the patch I sent).I didn''t realize we inverted the DEBUG (NDEBUG now), so I misread the condition. Could we print a more relevant error message then? Not just "MEMORY ERROR", but something about a PCI SERR.
>>> On 21.01.13 at 16:51, Stefano Stabellini <stefano.stabellini@eu.citrix.com>wrote:> On Thu, 17 Jan 2013, Jan Beulich wrote: >> >>> On 17.01.13 at 12:26, Stefano Stabellini <stefano.stabellini@eu.citrix.com> >> wrote: >> > On Thu, 17 Jan 2013, Jan Beulich wrote: >> >> c/s 22949:54fe1011f86b removed the forwarding of NMIs to Dom0 when they >> >> were caused by PCI SERR. NMI buttons as well as BMCs (like HP''s iLO) >> >> may however want such events to be seen in Dom0 (e.g. to trigger a >> >> dump). >> >> >> >> I''m therefore suggesting to restore most of the functionality which >> >> named c/s removed (adjusted for subsequent changes, and adjusting the >> >> public interface to use the modern term, retaining the old one for >> >> backwards compatibility). >> > >> > Allowing the possibility of injecting these events in Dom0 might be a >> > good idea (dom0 just ignores PCI SERR anyway if I recall correctly). >> > However the default for opt_nmi in Xen is "fatal", so if I am not >> > mistaken you are changing the default Xen behaviour from ignore PCI >> > SERR to break down on PCI SERR. I don''t think that''s a good idea. >> >> #ifdef NDEBUG >> static char __read_mostly opt_nmi[10] = "dom0"; >> #else >> static char __read_mostly opt_nmi[10] = "fatal"; >> #endif >> >> I.e. debug builds would die, but "normal" builds wouldn''t. If the >> former is a problem, it''s so not only for PCI SERR imo (and would >> need fixing here rather than in the patch I sent). > > I didn''t realize we inverted the DEBUG (NDEBUG now), so I misread the > condition. > Could we print a more relevant error message then? Not just "MEMORY ERROR", > but something about a PCI SERR.That''s already the case: static void pci_serr_softirq(void) { printk("\n\nNMI - PCI system error (SERR)\n"); outb(inb(0x61) & 0x0b, 0x61); /* re-enable the PCI SERR error line. */ } Oh, you mean in the message before calling fatal_trap() - it''s an oversight of mine that this still prints MEMORY ERROR. I''ll get that fix and then submit as non-RFC if that''s your only concern. Jan
Stefano Stabellini
2013-Jan-21 16:01 UTC
Re: [RFC] x86: restore PCI SERR NMI forwarding to Dom0
On Mon, 21 Jan 2013, Jan Beulich wrote:> >>> On 21.01.13 at 16:51, Stefano Stabellini <stefano.stabellini@eu.citrix.com> > wrote: > > On Thu, 17 Jan 2013, Jan Beulich wrote: > >> >>> On 17.01.13 at 12:26, Stefano Stabellini <stefano.stabellini@eu.citrix.com> > >> wrote: > >> > On Thu, 17 Jan 2013, Jan Beulich wrote: > >> >> c/s 22949:54fe1011f86b removed the forwarding of NMIs to Dom0 when they > >> >> were caused by PCI SERR. NMI buttons as well as BMCs (like HP''s iLO) > >> >> may however want such events to be seen in Dom0 (e.g. to trigger a > >> >> dump). > >> >> > >> >> I''m therefore suggesting to restore most of the functionality which > >> >> named c/s removed (adjusted for subsequent changes, and adjusting the > >> >> public interface to use the modern term, retaining the old one for > >> >> backwards compatibility). > >> > > >> > Allowing the possibility of injecting these events in Dom0 might be a > >> > good idea (dom0 just ignores PCI SERR anyway if I recall correctly). > >> > However the default for opt_nmi in Xen is "fatal", so if I am not > >> > mistaken you are changing the default Xen behaviour from ignore PCI > >> > SERR to break down on PCI SERR. I don''t think that''s a good idea. > >> > >> #ifdef NDEBUG > >> static char __read_mostly opt_nmi[10] = "dom0"; > >> #else > >> static char __read_mostly opt_nmi[10] = "fatal"; > >> #endif > >> > >> I.e. debug builds would die, but "normal" builds wouldn''t. If the > >> former is a problem, it''s so not only for PCI SERR imo (and would > >> need fixing here rather than in the patch I sent). > > > > I didn''t realize we inverted the DEBUG (NDEBUG now), so I misread the > > condition. > > Could we print a more relevant error message then? Not just "MEMORY ERROR", > > but something about a PCI SERR. > > That''s already the case: > > static void pci_serr_softirq(void) > { > printk("\n\nNMI - PCI system error (SERR)\n"); > outb(inb(0x61) & 0x0b, 0x61); /* re-enable the PCI SERR error line. */ > } > > Oh, you mean in the message before calling fatal_trap() - it''s > an oversight of mine that this still prints MEMORY ERROR. I''ll > get that fix and then submit as non-RFC if that''s your only > concern.Yes, that''s what I meant.