On 09/09/2009 09:48, "Qing He" <qing.he@intel.com> wrote:> This is a poc patch of guest irq ratelimit. The main motivation is > to ensure Xen's responsiveness during an irq storm, that caused by > improper hardware or drivers. If one interrupt is fired at a high > rate (>10k/10ms in the poc), it is disabled until the next periodic > timer_fn. A global generation counter is used to minimize overhead. What's the generation counter for? Why not just zero desc->rl_cnt in the timer handler? -- Keir _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
This is a poc patch of guest irq ratelimit. The main motivation is to ensure Xen''s responsiveness during an irq storm, that caused by improper hardware or drivers. If one interrupt is fired at a high rate (>10k/10ms in the poc), it is disabled until the next periodic timer_fn. A global generation counter is used to minimize overhead. Printks and configurability are not included in this patch. The configurability may be: 1. command-line param to set ratelimit_threshold 2. per irq threshold 3. dynamic reconfig, maybe a hypercall or something For preventing irq storms, #1 is likely enough, but not sure if this has other usages. Any comments? Thanks, Qing --- diff -r 8f81bdd57afe xen/arch/x86/irq.c --- a/xen/arch/x86/irq.c Thu Sep 03 09:51:37 2009 +0100 +++ b/xen/arch/x86/irq.c Wed Sep 09 16:23:19 2009 +0800 @@ -54,6 +54,12 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) }; DEFINE_PER_CPU(struct cpu_user_regs *, __irq_regs); + +static LIST_HEAD(irq_ratelimit_list); +static DEFINE_SPINLOCK(irq_ratelimit_lock); +static struct timer irq_ratelimit_timer; +static unsigned int irq_ratelimit_gen; +static unsigned int irq_ratelimit_threshold = 10000; /* Must be called when irq disabled */ void lock_vector_lock(void) @@ -241,6 +247,10 @@ static void init_one_irq_desc(struct irq desc->msi_desc = NULL; spin_lock_init(&desc->lock); cpus_setall(desc->affinity); + + desc->rl_gen = 0; + desc->rl_cnt = 0; + INIT_LIST_HEAD(&desc->rl_link); } static void init_one_irq_status(int irq) @@ -469,6 +479,29 @@ asmlinkage void do_IRQ(struct cpu_user_r if ( likely(desc->status & IRQ_GUEST) ) { + static unsigned int last_gen; + last_gen = desc->rl_gen; + desc->rl_gen = irq_ratelimit_gen; + if ( last_gen != desc->rl_gen ) { + desc->rl_cnt = 0; + } + if ( unlikely(desc->rl_cnt++ >= irq_ratelimit_threshold) ) { + desc->handler->disable(irq); + /* + * If handler->disable doesn''t actually mask the interrupt, + * a disabled irq still can fire. 
So if the irq is already + * in the ratelimit list, don''t add it again. This also + * avoids deadlocks of two spinlocks if ratelimit_timer_fn + * runs at the same time. + */ + if ( likely(list_empty(&desc->rl_link)) ) { + spin_lock(&irq_ratelimit_lock); + list_add(&desc->rl_link, &irq_ratelimit_list); + spin_unlock(&irq_ratelimit_lock); + } + goto out; + } + irq_enter(); tsc_in = tb_init_done ? get_cycles() : 0; __do_IRQ_guest(irq); @@ -511,6 +544,35 @@ asmlinkage void do_IRQ(struct cpu_user_r spin_unlock(&desc->lock); set_irq_regs(old_regs); } + +static void irq_ratelimit_timer_fn(void *data) +{ + struct irq_desc *desc, *tmp; + + irq_ratelimit_gen++; + + spin_lock(&irq_ratelimit_lock); + + list_for_each_entry_safe(desc, tmp, &irq_ratelimit_list, rl_link) { + spin_lock(&desc->lock); + desc->handler->enable(desc->irq); + list_del(&desc->rl_link); + spin_unlock(&desc->lock); + } + + spin_unlock(&irq_ratelimit_lock); + + set_timer(&irq_ratelimit_timer, NOW() + MILLISECS(10)); +} + +static int __init irq_ratelimit_init(void) +{ + init_timer(&irq_ratelimit_timer, irq_ratelimit_timer_fn, NULL, 0); + set_timer(&irq_ratelimit_timer, NOW() + MILLISECS(10)); + + return 0; +} +__initcall(irq_ratelimit_init); int request_irq(unsigned int irq, void (*handler)(int, void *, struct cpu_user_regs *), diff -r 8f81bdd57afe xen/include/xen/irq.h --- a/xen/include/xen/irq.h Thu Sep 03 09:51:37 2009 +0100 +++ b/xen/include/xen/irq.h Wed Sep 09 16:23:19 2009 +0800 @@ -74,6 +74,10 @@ typedef struct irq_desc { int irq; spinlock_t lock; cpumask_t affinity; + + unsigned int rl_gen; /* last update */ + unsigned int rl_cnt; + struct list_head rl_link; } __cacheline_aligned irq_desc_t; #if defined(__ia64__) _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
On Wed, 2009-09-09 at 16:48 +0800, Keir Fraser wrote:> On 09/09/2009 09:48, "Qing He" <qing.he@intel.com> wrote: > > > This is a poc patch of guest irq ratelimit. The main motivation is > > to ensure Xen's responsiveness during an irq storm, that caused by > > improper hardware or drivers. If one interrupt is fired at a high > > rate (>10k/10ms in the poc), it is disabled until the next periodic > > timer_fn. A global generation counter is used to minimize overhead. > > What's the generation counter for? Why not just zero desc->rl_cnt in the > timer handler? If zeroing desc->rl_cnt in the timer handler, we have to zero that of all irqs. If the number of possible irqs is big or sparse, it's a bit painful. Also, possibly it needs to be moved out of irq_desc and use atomic_t to avoid spinlocking every irq_desc in the timer handler. Thanks, Qing> > -- Keir > >_______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
On 09/09/2009 10:07, "Qing He" <qing.he@intel.com> wrote:>> What's the generation counter for? Why not just zero desc->rl_cnt in the >> timer handler? > > If zeroing desc->rl_cnt in the timer handler, we have to zero that of > all irqs. If the number of possible irqs is big or sparse, it's a bit > painful. Ah yes, I see. An always-on 10ms timer is ugly however. The Intel guys working on power management won't like it as it will reduce deep-sleep residency. I would suggest getting rid of the generation counter and enable the timer only when the irq_ratelimit_list is non-empty. In do_IRQ(), when rl_cnt exceeds the threshold: now = NOW(); if ( now < (desc->rl_quantum_start + MILLISECS(10)) ) Add to irq_ratelimit_list; Kick timer if list was previously empty; else desc->rl_cnt = 0; desc->rl_quantum_start = now; And in the timer handler, for each desc on the list: desc->rl_cnt = 0; desc->rl_quantum_start = now; And at the end of the timer handler, do not set_timer(). This approach adds the overhead of get_s_time() every irq_ratelimit_threshold interrupts, which should be unmeasurably tiny. -- Keir _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
On Wed, Sep 09, 2009 at 04:48:25PM +0800, Qing He wrote:> This is a poc patch of guest irq ratelimit. The main motivation is > to ensure Xen's responsiveness during an irq storm, that caused by > improper hardware or drivers. If one interrupt is fired at a high What kind of hardware exhibits this? _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
On Wed, 2009-09-09 at 17:29 +0800, Keir Fraser wrote:> On 09/09/2009 10:07, "Qing He" <qing.he@intel.com> wrote: > > >> What's the generation counter for? Why not just zero desc->rl_cnt in the > >> timer handler? > > > > If zeroing desc->rl_cnt in the timer handler, we have to zero that of > > all irqs. If the number of possible irqs is big or sparse, it's a bit > > painful. > > Ah yes, I see. An always-on 10ms timer is ugly however. The Intel guys > working on power management won't like it as it will reduce deep-sleep > residency. Yes, I also knew it's not PM friendly, but didn't come up with a good solution.> > I would suggest getting rid of the generation counter and enable the timer > only when the irq_ratelimit_list is non-empty. In do_IRQ(), when rl_cnt > exceeds the threshold: > now = NOW(); > if ( now < (desc->rl_quantum_start + MILLISECS(10)) ) > Add to irq_ratelimit_list; Kick timer if list was previously empty; > else > desc->rl_cnt = 0; desc->rl_quantum_start = now; > > And in the timer handler, for each desc on the list: > desc->rl_cnt = 0; desc->rl_quantum_start = now; > > And at the end of the timer handler, do not set_timer(). > > This approach adds the overhead of get_s_time() every > irq_ratelimit_threshold interrupts, which should be unmeasurably tiny. This is OK to me, didn't think of this method. I'll rework the patch like this. Thanks, Qing> > -- Keir > >_______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
On Wed, 2009-09-09 at 22:12 +0800, Konrad Rzeszutek Wilk wrote:> On Wed, Sep 09, 2009 at 04:48:25PM +0800, Qing He wrote: > > This is a poc patch of guest irq ratelimit. The main motivation is > > to ensure Xen's responsiveness during an irq storm, that caused by > > improper hardware or drivers. If one interrupt is fired at a high > > What kind of hardware exhibits this? I happened to see this on some network cards using MSI, everything just hung up. And to some extent, the driver of the device can control this. If assigned devices of one of the guests behave this way, at least other guests should still be operable. Thanks, Qing _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
This patch adds the feature of irq ratelimit. It temporarily masks the interrupt (guest) if too many irqs are observed in a short period (irq storm), to ensure responsiveness of Xen and other guests. As for now, the threshold can be adjusted at boot time using command- line option irq_ratelimit=. Signed-off-by: Qing He <qing.he@intel.com> --- arch/x86/irq.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ include/xen/irq.h | 6 +++++ 2 files changed, 70 insertions(+) diff -r 8f81bdd57afe xen/arch/x86/irq.c --- a/xen/arch/x86/irq.c Thu Sep 03 09:51:37 2009 +0100 +++ b/xen/arch/x86/irq.c Tue Sep 15 18:16:38 2009 +0800 @@ -54,6 +54,14 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) }; DEFINE_PER_CPU(struct cpu_user_regs *, __irq_regs); + +static LIST_HEAD(irq_ratelimit_list); +static DEFINE_SPINLOCK(irq_ratelimit_lock); +static struct timer irq_ratelimit_timer; + +/* irq_ratelimit: the max irq rate allowed in every 10ms, set 0 to disable */ +unsigned int __read_mostly irq_ratelimit_threshold = 10000; +integer_param("irq_ratelimit", irq_ratelimit_threshold); /* Must be called when irq disabled */ void lock_vector_lock(void) @@ -241,6 +249,10 @@ static void init_one_irq_desc(struct irq desc->msi_desc = NULL; spin_lock_init(&desc->lock); cpus_setall(desc->affinity); + + desc->rl_quantum_start = NOW(); + desc->rl_cnt = 0; + INIT_LIST_HEAD(&desc->rl_link); } static void init_one_irq_status(int irq) @@ -469,6 +481,29 @@ asmlinkage void do_IRQ(struct cpu_user_r if ( likely(desc->status & IRQ_GUEST) ) { + s_time_t now = NOW(); + if ( now > (desc->rl_quantum_start + MILLISECS(10)) ) { + desc->rl_cnt = 0; + desc->rl_quantum_start = now; + } + if ( unlikely(desc->rl_cnt++ >= irq_ratelimit_threshold) ) { + desc->handler->disable(irq); + /* + * If handler->disable doesn''t actually mask the interrupt, + * a disabled irq still can fire. This check also avoids + * possible deadlocks if ratelimit_timer_fn runs at the + * same time. 
+ */ + if ( likely(list_empty(&desc->rl_link)) ) { + spin_lock(&irq_ratelimit_lock); + if ( list_empty(&irq_ratelimit_list) ) + set_timer(&irq_ratelimit_timer, now + MILLISECS(10)); + list_add(&desc->rl_link, &irq_ratelimit_list); + spin_unlock(&irq_ratelimit_lock); + } + goto out; + } + irq_enter(); tsc_in = tb_init_done ? get_cycles() : 0; __do_IRQ_guest(irq); @@ -511,6 +546,35 @@ asmlinkage void do_IRQ(struct cpu_user_r spin_unlock(&desc->lock); set_irq_regs(old_regs); } + +static void irq_ratelimit_timer_fn(void *data) +{ + struct irq_desc *desc, *tmp; + unsigned long flags; + + spin_lock_irqsave(&irq_ratelimit_lock, flags); + + list_for_each_entry_safe(desc, tmp, &irq_ratelimit_list, rl_link) { + spin_lock(&desc->lock); + desc->handler->enable(desc->irq); + list_del(&desc->rl_link); + INIT_LIST_HEAD(&desc->rl_link); + spin_unlock(&desc->lock); + } + + spin_unlock_irqrestore(&irq_ratelimit_lock, flags); +} + +static int __init irq_ratelimit_init(void) +{ + init_timer(&irq_ratelimit_timer, irq_ratelimit_timer_fn, NULL, 0); + + if (irq_ratelimit_threshold == 0) + irq_ratelimit_threshold = ~0U; + + return 0; +} +__initcall(irq_ratelimit_init); int request_irq(unsigned int irq, void (*handler)(int, void *, struct cpu_user_regs *), diff -r 8f81bdd57afe xen/include/xen/irq.h --- a/xen/include/xen/irq.h Thu Sep 03 09:51:37 2009 +0100 +++ b/xen/include/xen/irq.h Tue Sep 15 18:16:38 2009 +0800 @@ -4,6 +4,7 @@ #include <xen/config.h> #include <xen/cpumask.h> #include <xen/spinlock.h> +#include <xen/time.h> #include <asm/regs.h> #include <asm/hardirq.h> @@ -74,6 +75,11 @@ typedef struct irq_desc { int irq; spinlock_t lock; cpumask_t affinity; + + /* irq ratelimit */ + s_time_t rl_quantum_start; + unsigned int rl_cnt; + struct list_head rl_link; } __cacheline_aligned irq_desc_t; #if defined(__ia64__) _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Applied as c/s 20214, with some changes. Please take a look. -- Keir On 15/09/2009 14:10, "Qing He" <qing.he@intel.com> wrote:> This patch adds the feature of irq ratelimit. It temporarily masks > the interrupt (guest) if too many irqs are observed in a short > period (irq storm), to ensure responsiveness of Xen and other guests. > > As for now, the threshold can be adjusted at boot time using command- > line option irq_ratelimit=. > > Signed-off-by: Qing He <qing.he@intel.com> > > --- > arch/x86/irq.c | 64 > ++++++++++++++++++++++++++++++++++++++++++++++++++++++ > include/xen/irq.h | 6 +++++ > 2 files changed, 70 insertions(+) > > diff -r 8f81bdd57afe xen/arch/x86/irq.c > --- a/xen/arch/x86/irq.c Thu Sep 03 09:51:37 2009 +0100 > +++ b/xen/arch/x86/irq.c Tue Sep 15 18:16:38 2009 +0800 > @@ -54,6 +54,14 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) > }; > > DEFINE_PER_CPU(struct cpu_user_regs *, __irq_regs); > + > +static LIST_HEAD(irq_ratelimit_list); > +static DEFINE_SPINLOCK(irq_ratelimit_lock); > +static struct timer irq_ratelimit_timer; > + > +/* irq_ratelimit: the max irq rate allowed in every 10ms, set 0 to disable */ > +unsigned int __read_mostly irq_ratelimit_threshold = 10000; > +integer_param("irq_ratelimit", irq_ratelimit_threshold); > > /* Must be called when irq disabled */ > void lock_vector_lock(void) > @@ -241,6 +249,10 @@ static void init_one_irq_desc(struct irq > desc->msi_desc = NULL; > spin_lock_init(&desc->lock); > cpus_setall(desc->affinity); > + > + desc->rl_quantum_start = NOW(); > + desc->rl_cnt = 0; > + INIT_LIST_HEAD(&desc->rl_link); > } > > static void init_one_irq_status(int irq) > @@ -469,6 +481,29 @@ asmlinkage void do_IRQ(struct cpu_user_r > > if ( likely(desc->status & IRQ_GUEST) ) > { > + s_time_t now = NOW(); > + if ( now > (desc->rl_quantum_start + MILLISECS(10)) ) { > + desc->rl_cnt = 0; > + desc->rl_quantum_start = now; > + } > + if ( unlikely(desc->rl_cnt++ >= irq_ratelimit_threshold) ) { > + desc->handler->disable(irq); > + /* 
> + * If handler->disable doesn''t actually mask the interrupt, > + * a disabled irq still can fire. This check also avoids > + * possible deadlocks if ratelimit_timer_fn runs at the > + * same time. > + */ > + if ( likely(list_empty(&desc->rl_link)) ) { > + spin_lock(&irq_ratelimit_lock); > + if ( list_empty(&irq_ratelimit_list) ) > + set_timer(&irq_ratelimit_timer, now + MILLISECS(10)); > + list_add(&desc->rl_link, &irq_ratelimit_list); > + spin_unlock(&irq_ratelimit_lock); > + } > + goto out; > + } > + > irq_enter(); > tsc_in = tb_init_done ? get_cycles() : 0; > __do_IRQ_guest(irq); > @@ -511,6 +546,35 @@ asmlinkage void do_IRQ(struct cpu_user_r > spin_unlock(&desc->lock); > set_irq_regs(old_regs); > } > + > +static void irq_ratelimit_timer_fn(void *data) > +{ > + struct irq_desc *desc, *tmp; > + unsigned long flags; > + > + spin_lock_irqsave(&irq_ratelimit_lock, flags); > + > + list_for_each_entry_safe(desc, tmp, &irq_ratelimit_list, rl_link) { > + spin_lock(&desc->lock); > + desc->handler->enable(desc->irq); > + list_del(&desc->rl_link); > + INIT_LIST_HEAD(&desc->rl_link); > + spin_unlock(&desc->lock); > + } > + > + spin_unlock_irqrestore(&irq_ratelimit_lock, flags); > +} > + > +static int __init irq_ratelimit_init(void) > +{ > + init_timer(&irq_ratelimit_timer, irq_ratelimit_timer_fn, NULL, 0); > + > + if (irq_ratelimit_threshold == 0) > + irq_ratelimit_threshold = ~0U; > + > + return 0; > +} > +__initcall(irq_ratelimit_init); > > int request_irq(unsigned int irq, > void (*handler)(int, void *, struct cpu_user_regs *), > diff -r 8f81bdd57afe xen/include/xen/irq.h > --- a/xen/include/xen/irq.h Thu Sep 03 09:51:37 2009 +0100 > +++ b/xen/include/xen/irq.h Tue Sep 15 18:16:38 2009 +0800 > @@ -4,6 +4,7 @@ > #include <xen/config.h> > #include <xen/cpumask.h> > #include <xen/spinlock.h> > +#include <xen/time.h> > #include <asm/regs.h> > #include <asm/hardirq.h> > > @@ -74,6 +75,11 @@ typedef struct irq_desc { > int irq; > spinlock_t lock; > cpumask_t affinity; > + > 
+ /* irq ratelimit */ > + s_time_t rl_quantum_start; > + unsigned int rl_cnt; > + struct list_head rl_link; > } __cacheline_aligned irq_desc_t; > > #if defined(__ia64__)_______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel