This series offers some improvements to Xen watchdogs. Patch 1 moves the concept of a watchdog from x86 specific code to common. Patch 2 tweaks the x86 implementation, given the common implementation. Patch 3 is a bugfix for certain cases where the watchdog code interacts badly with console_force_unlock(). Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> CC: Keir Fraser <keir@xen.org> CC: Jan Beulich <JBeulich@suse.com> CC: Tim Deegan <tim@xen.org>
Andrew Cooper
2013-Aug-12 13:37 UTC
[PATCH 1/3] xen/watchdog: Move watchdog from being x86 specific to common code.
Augment watchdog_setup() to be able to possibly return an error, and introduce watchdog_enabled() as a better alternative to knowing the architectures internal details. This patch does not change the x86 implementaion, beyond making it compile. For header files, some includes of xen/nmi.h were only for the watchdog functions, so are replaced rather than adding an extra include of xen/watchdog.h Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> CC: Keir Fraser <keir@xen.org> CC: Jan Beulich <JBeulich@suse.com> CC: Tim Deegan <tim@xen.org> --- xen/arch/x86/nmi.c | 8 +++++++- xen/arch/x86/setup.c | 1 + xen/arch/x86/shutdown.c | 2 +- xen/arch/x86/traps.c | 1 + xen/arch/x86/x86_64/traps.c | 1 + xen/common/kexec.c | 2 +- xen/common/keyhandler.c | 2 +- xen/common/shutdown.c | 2 +- xen/drivers/char/console.c | 2 +- xen/include/asm-x86/config.h | 1 + xen/include/asm-x86/nmi.h | 4 ---- xen/include/xen/watchdog.h | 35 +++++++++++++++++++++++++++++++++++ 12 files changed, 51 insertions(+), 10 deletions(-) create mode 100644 xen/include/xen/watchdog.h diff --git a/xen/arch/x86/nmi.c b/xen/arch/x86/nmi.c index c93812f..3eb2456 100644 --- a/xen/arch/x86/nmi.c +++ b/xen/arch/x86/nmi.c @@ -410,7 +410,12 @@ void watchdog_enable(void) atomic_dec(&watchdog_disable_count); } -void __init watchdog_setup(void) +bool_t watchdog_enabled(void) +{ + return !atomic_read(&watchdog_disable_count); +} + +int __init watchdog_setup(void) { unsigned int cpu; @@ -423,6 +428,7 @@ void __init watchdog_setup(void) register_cpu_notifier(&cpu_nmi_nfb); watchdog_enable(); + return 0; } void nmi_watchdog_tick(struct cpu_user_regs * regs) diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c index a70d31b..c550e8e 100644 --- a/xen/arch/x86/setup.c +++ b/xen/arch/x86/setup.c @@ -26,6 +26,7 @@ #include <xen/pfn.h> #include <xen/nodemask.h> #include <xen/tmem_xen.h> /* for opt_tmem only */ +#include <xen/watchdog.h> #include <public/version.h> #include <compat/platform.h> #include <compat/xen.h> diff --git a/xen/arch/x86/shutdown.c b/xen/arch/x86/shutdown.c index c637883..4715681 100644 --- a/xen/arch/x86/shutdown.c +++ b/xen/arch/x86/shutdown.c @@ -12,7 +12,7 @@ #include <xen/delay.h> #include <xen/dmi.h> #include <xen/irq.h> -#include <xen/nmi.h> +#include <xen/watchdog.h> #include <xen/console.h> #include <xen/shutdown.h> #include <xen/acpi.h> diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c index 57dbd0c..b445b2f 100644 --- a/xen/arch/x86/traps.c +++ b/xen/arch/x86/traps.c @@ -49,6 +49,7 @@ #include <xen/kexec.h> #include <xen/trace.h> #include <xen/paging.h> +#include <xen/watchdog.h> #include <asm/system.h> #include <asm/io.h> #include <asm/atomic.h> diff --git a/xen/arch/x86/x86_64/traps.c b/xen/arch/x86/x86_64/traps.c index bcd7609..ea2ffb6 100644 --- a/xen/arch/x86/x86_64/traps.c +++ b/xen/arch/x86/x86_64/traps.c @@ -13,6 +13,7 @@ #include <xen/shutdown.h> #include <xen/nmi.h> #include <xen/guest_access.h> +#include <xen/watchdog.h> #include <asm/current.h> #include <asm/flushtlb.h> #include <asm/traps.h> diff --git a/xen/common/kexec.c b/xen/common/kexec.c index 1ba8556..7cd151f 100644 --- a/xen/common/kexec.c +++ b/xen/common/kexec.c @@ -12,7 +12,7 @@ #include <xen/ctype.h> #include <xen/errno.h> #include <xen/guest_access.h> -#include <xen/nmi.h> +#include <xen/watchdog.h> #include <xen/sched.h> #include <xen/types.h> #include <xen/hypercall.h> diff --git a/xen/common/keyhandler.c b/xen/common/keyhandler.c index 5072133..e5c15d6 100644 --- a/xen/common/keyhandler.c +++ b/xen/common/keyhandler.c @@ -16,7 +16,7 @@ #include <xen/ctype.h> #include <xen/perfc.h> #include <xen/mm.h> -#include <xen/nmi.h> +#include <xen/watchdog.h> #include <xen/init.h> #include <asm/debugger.h> #include <asm/div64.h> diff --git a/xen/common/shutdown.c b/xen/common/shutdown.c index 73a7d7b..20f04b0 100644 --- a/xen/common/shutdown.c +++ b/xen/common/shutdown.c @@ -4,7 +4,7 @@ #include <xen/sched.h> #include <xen/domain.h> #include <xen/delay.h> -#include <xen/nmi.h> +#include <xen/watchdog.h> #include <xen/shutdown.h> #include <xen/console.h> #ifdef CONFIG_KEXEC diff --git a/xen/drivers/char/console.c b/xen/drivers/char/console.c index 8ac32e4..52ffa70 100644 --- a/xen/drivers/char/console.c +++ b/xen/drivers/char/console.c @@ -20,7 +20,7 @@ #include <xen/keyhandler.h> #include <xen/delay.h> #include <xen/guest_access.h> -#include <xen/nmi.h> +#include <xen/watchdog.h> #include <xen/shutdown.h> #include <xen/video.h> #include <xen/kexec.h> diff --git a/xen/include/asm-x86/config.h b/xen/include/asm-x86/config.h index 0044edb..cb7b2e4 100644 --- a/xen/include/asm-x86/config.h +++ b/xen/include/asm-x86/config.h @@ -49,6 +49,7 @@ #define CONFIG_XENOPROF 1 #define CONFIG_KEXEC 1 +#define CONFIG_WATCHDOG 1 #define HZ 100 diff --git a/xen/include/asm-x86/nmi.h b/xen/include/asm-x86/nmi.h index e9faa72..98b5e04 100644 --- a/xen/include/asm-x86/nmi.h +++ b/xen/include/asm-x86/nmi.h @@ -41,8 +41,4 @@ long register_guest_nmi_callback(unsigned long address); */ long unregister_guest_nmi_callback(void); -void watchdog_disable(void); -void watchdog_enable(void); -void watchdog_setup(void); - #endif /* ASM_NMI_H */ diff --git a/xen/include/xen/watchdog.h b/xen/include/xen/watchdog.h new file mode 100644 index 0000000..e786b9b --- /dev/null +++ b/xen/include/xen/watchdog.h @@ -0,0 +1,35 @@ +/****************************************************************************** + * watchdog.h + * + * Common watchdog code + */ + +#ifndef __XEN_WATCHDOG_H__ +#define __XEN_WATCHDOG_H__ + +#include <xen/types.h> + +#ifdef CONFIG_WATCHDOG + +/* Try to set up a watchdog. */ +int watchdog_setup(void); + +/* Enable the watchdog. */ +void watchdog_enable(void); + +/* Disable the watchdog. */ +void watchdog_disable(void); + +/* Is the watchdog currently enabled. */ +bool_t watchdog_enabled(void); + +#else + +#define watchdog_setup() ((void)0) +#define watchdog_enable() ((void)0) +#define watchdog_disable() ((void)0) +#define watchdog_enabled() ((void)0) + +#endif + +#endif /* __XEN_WATCHDOG_H__ */ -- 1.7.10.4
Andrew Cooper
2013-Aug-12 13:37 UTC
[PATCH 2/3] x86/watchdog: Tweak implementation given new common code.
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> CC: Keir Fraser <keir@xen.org> CC: Jan Beulich <JBeulich@suse.com> CC: Tim Deegan <tim@xen.org> --- xen/arch/x86/nmi.c | 6 ++++-- xen/arch/x86/setup.c | 3 +-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/xen/arch/x86/nmi.c b/xen/arch/x86/nmi.c index 3eb2456..5827bd4 100644 --- a/xen/arch/x86/nmi.c +++ b/xen/arch/x86/nmi.c @@ -419,6 +419,9 @@ int __init watchdog_setup(void) { unsigned int cpu; + if ( !opt_watchdog ) + return 0; + /* * Activate periodic heartbeats. We cannot do this earlier during * setup because the timer infrastructure is not available. @@ -435,8 +438,7 @@ void nmi_watchdog_tick(struct cpu_user_regs * regs) { unsigned int sum = this_cpu(nmi_timer_ticks); - if ( (this_cpu(last_irq_sums) == sum) && - !atomic_read(&watchdog_disable_count) ) + if ( (this_cpu(last_irq_sums) == sum) && watchdog_enabled() ) { /* * Ayiee, looks like this CPU is stuck ... wait for the timeout diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c index c550e8e..2d1edb5 100644 --- a/xen/arch/x86/setup.c +++ b/xen/arch/x86/setup.c @@ -1310,8 +1310,7 @@ void __init __start_xen(unsigned long mbi_p) do_initcalls(); - if ( opt_watchdog ) - watchdog_setup(); + watchdog_setup(); if ( !tboot_protect_mem_regions() ) panic("Could not protect TXT memory regions\n"); -- 1.7.10.4
Andrew Cooper
2013-Aug-12 13:38 UTC
[PATCH 3/3] watchdog/crash: Always disable watchdog in console_force_unlock()
Depending on the state of the conring and serial_tx_buffer, console_force_unlock() can be a long running operation, usually because of serial_start_sync() XenServer testing has found a reliable case where console_force_unlock() on one PCPU takes long enough for another PCPU to timeout due to the watchdog (such as waiting for a tlb flush callin). The watchdog timeout causes the second PCPU to repeat the console_force_unlock(), at which point the first PCPU typically fails an assertion in spin_unlock_irqrestore(&port->tx_lock) (because the tx_lock has been unlocked behind itself). console_force_unlock() is only on emergency paths, so one way or another the host is going down. Disable the watchdog before forcing the console lock to help prevent having pcpus completing with each other to bring the host down. Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> CC: Keir Fraser <keir@xen.org> CC: Jan Beulich <JBeulich@suse.com> CC: Tim Deegan <tim@xen.org> --- xen/arch/x86/x86_64/traps.c | 2 -- xen/drivers/char/console.c | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/xen/arch/x86/x86_64/traps.c b/xen/arch/x86/x86_64/traps.c index ea2ffb6..1cc977c 100644 --- a/xen/arch/x86/x86_64/traps.c +++ b/xen/arch/x86/x86_64/traps.c @@ -227,8 +227,6 @@ void do_double_fault(struct cpu_user_regs *regs) unsigned int cpu; unsigned long crs[8]; - watchdog_disable(); - console_force_unlock(); asm ( "lsll %1, %0" : "=r" (cpu) : "rm" (PER_CPU_GDT_ENTRY << 3) ); diff --git a/xen/drivers/char/console.c b/xen/drivers/char/console.c index 52ffa70..e47ddf2 100644 --- a/xen/drivers/char/console.c +++ b/xen/drivers/char/console.c @@ -737,6 +737,7 @@ void console_end_log_everything(void) void console_force_unlock(void) { + watchdog_disable(); spin_lock_init(&console_lock); serial_force_unlock(sercon_handle); console_locks_busted = 1; -- 1.7.10.4
Hi Andrew, What''s the Watchdog used for in Xen? Josh 2013/8/12 Andrew Cooper <andrew.cooper3@citrix.com>> This series offers some improvements to Xen watchdogs. > > Patch 1 moves the concept of a watchdog from x86 specific code to common. > Patch 2 tweaks the x86 implementation, given the common implementation. > Patch 3 is a bugfix for certain cases where the watchdog code interacts > badly with > console_force_unlock(). > > Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> > CC: Keir Fraser <keir@xen.org> > CC: Jan Beulich <JBeulich@suse.com> > CC: Tim Deegan <tim@xen.org> > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xen.org > http://lists.xen.org/xen-devel >_______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
Jan Beulich
2013-Aug-12 13:49 UTC
Re: [PATCH 2/3] x86/watchdog: Tweak implementation given new common code.
>>> On 12.08.13 at 15:37, Andrew Cooper <andrew.cooper3@citrix.com> wrote: > --- a/xen/arch/x86/nmi.c > +++ b/xen/arch/x86/nmi.c > @@ -419,6 +419,9 @@ int __init watchdog_setup(void) > { > unsigned int cpu; > > + if ( !opt_watchdog ) > + return 0; > + > /* > * Activate periodic heartbeats. We cannot do this earlier during > * setup because the timer infrastructure is not available.This and the last hunk below look pretty pointless, i.e. in fact the patch could consist of just the middle one. Jan> @@ -435,8 +438,7 @@ void nmi_watchdog_tick(struct cpu_user_regs * regs) > { > unsigned int sum = this_cpu(nmi_timer_ticks); > > - if ( (this_cpu(last_irq_sums) == sum) && > - !atomic_read(&watchdog_disable_count) ) > + if ( (this_cpu(last_irq_sums) == sum) && watchdog_enabled() ) > { > /* > * Ayiee, looks like this CPU is stuck ... wait for the timeout > diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c > index c550e8e..2d1edb5 100644 > --- a/xen/arch/x86/setup.c > +++ b/xen/arch/x86/setup.c > @@ -1310,8 +1310,7 @@ void __init __start_xen(unsigned long mbi_p) > > do_initcalls(); > > - if ( opt_watchdog ) > - watchdog_setup(); > + watchdog_setup(); > > if ( !tboot_protect_mem_regions() ) > panic("Could not protect TXT memory regions\n"); > -- > 1.7.10.4
>>> On 12.08.13 at 15:37, Andrew Cooper <andrew.cooper3@citrix.com> wrote: > This series offers some improvements to Xen watchdogs. > > Patch 1 moves the concept of a watchdog from x86 specific code to common. > Patch 2 tweaks the x86 implementation, given the common implementation. > Patch 3 is a bugfix for certain cases where the watchdog code interacts > badly with > console_force_unlock(). > > Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>Pending clarification on the need of the two apparently pointless hunks in patch 2: Reviewed-by: Jan Beulich <jbeulich@suse.com>
Andrew Cooper
2013-Aug-12 13:54 UTC
Re: [PATCH 2/3] x86/watchdog: Tweak implementation given new common code.
On 12/08/13 14:49, Jan Beulich wrote:>>>> On 12.08.13 at 15:37, Andrew Cooper <andrew.cooper3@citrix.com> wrote: >> --- a/xen/arch/x86/nmi.c >> +++ b/xen/arch/x86/nmi.c >> @@ -419,6 +419,9 @@ int __init watchdog_setup(void) >> { >> unsigned int cpu; >> >> + if ( !opt_watchdog ) >> + return 0; >> + >> /* >> * Activate periodic heartbeats. We cannot do this earlier during >> * setup because the timer infrastructure is not available. > This and the last hunk below look pretty pointless, i.e. in fact the > patch could consist of just the middle one. > > JanI had intended to do more with it, but it turned out I couldn''t I shall drop it down to just the middle hunk. ~Andrew> >> @@ -435,8 +438,7 @@ void nmi_watchdog_tick(struct cpu_user_regs * regs) >> { >> unsigned int sum = this_cpu(nmi_timer_ticks); >> >> - if ( (this_cpu(last_irq_sums) == sum) && >> - !atomic_read(&watchdog_disable_count) ) >> + if ( (this_cpu(last_irq_sums) == sum) && watchdog_enabled() ) >> { >> /* >> * Ayiee, looks like this CPU is stuck ... wait for the timeout >> diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c >> index c550e8e..2d1edb5 100644 >> --- a/xen/arch/x86/setup.c >> +++ b/xen/arch/x86/setup.c >> @@ -1310,8 +1310,7 @@ void __init __start_xen(unsigned long mbi_p) >> >> do_initcalls(); >> >> - if ( opt_watchdog ) >> - watchdog_setup(); >> + watchdog_setup(); >> >> if ( !tboot_protect_mem_regions() ) >> panic("Could not protect TXT memory regions\n"); >> -- >> 1.7.10.4 > >
Andrew Cooper
2013-Aug-12 14:03 UTC
[Patch v2 2/3] x86/watchdog: Tweak implementation given new common code.
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> CC: Keir Fraser <keir@xen.org> CC: Jan Beulich <JBeulich@suse.com> CC: Tim Deegan <tim@xen.org> --- Changes since v1: * Drop functionally useless changes to watchdog_setup() --- xen/arch/x86/nmi.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/xen/arch/x86/nmi.c b/xen/arch/x86/nmi.c index 3eb2456..2c16d93 100644 --- a/xen/arch/x86/nmi.c +++ b/xen/arch/x86/nmi.c @@ -435,8 +435,7 @@ void nmi_watchdog_tick(struct cpu_user_regs * regs) { unsigned int sum = this_cpu(nmi_timer_ticks); - if ( (this_cpu(last_irq_sums) == sum) && - !atomic_read(&watchdog_disable_count) ) + if ( (this_cpu(last_irq_sums) == sum) && watchdog_enabled() ) { /* * Ayiee, looks like this CPU is stuck ... wait for the timeout -- 1.7.10.4
On 12/08/13 14:47, Josh Zhao wrote:> Hi Andrew, > What''s the Watchdog used for in Xen? > > JoshXen can set up a hardware watchdog on each pcpu, to help ensure that it doesn''t get stuck in loops. It is extra defence against bugs in Xen. ~Andrew> > > 2013/8/12 Andrew Cooper <andrew.cooper3@citrix.com > <mailto:andrew.cooper3@citrix.com>> > > This series offers some improvements to Xen watchdogs. > > Patch 1 moves the concept of a watchdog from x86 specific code to > common. > Patch 2 tweaks the x86 implementation, given the common > implementation. > Patch 3 is a bugfix for certain cases where the watchdog code > interacts badly with > console_force_unlock(). > > Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com > <mailto:andrew.cooper3@citrix.com>> > CC: Keir Fraser <keir@xen.org <mailto:keir@xen.org>> > CC: Jan Beulich <JBeulich@suse.com <mailto:JBeulich@suse.com>> > CC: Tim Deegan <tim@xen.org <mailto:tim@xen.org>> > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xen.org <mailto:Xen-devel@lists.xen.org> > http://lists.xen.org/xen-devel > >_______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
On 12/08/2013 14:37, "Andrew Cooper" <andrew.cooper3@citrix.com> wrote:> This series offers some improvements to Xen watchdogs. > > Patch 1 moves the concept of a watchdog from x86 specific code to common. > Patch 2 tweaks the x86 implementation, given the common implementation. > Patch 3 is a bugfix for certain cases where the watchdog code interacts badly > with > console_force_unlock(). > > Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> > CC: Keir Fraser <keir@xen.org> > CC: Jan Beulich <JBeulich@suse.com> > CC: Tim Deegan <tim@xen.org>With v2 of patch 2: Acked-by: Keir Fraser <keir@xen.org>