thr3ads.net - Xen devel - [Xen-devel] [PATCH] x86/IRQ: prevent vector sharing within IO-APICs [Nov 2011]

If this information is useful, please help other people find it:
Share via:

Jan Beulich

2011-Nov-11 16:00 UTC

[Xen-devel] [PATCH] x86/IRQ: prevent vector sharing within IO-APICs

Following the prevention of vector sharing for MSIs, this change
enforces the same within IO-APICs: Pin based interrupts use the IO-APIC
as their identifying device under the AMD IOMMU (and just like for
MSIs, only the identifying device is used to remap interrupts here,
with no regard to an interrupt''s destination).

Additionally, LAPIC initiated EOIs (for level triggered interrupts) too
use only the vector for identifying which interrupts to end. While this
generally causes no significant problem (at worst an interrupt would be
re-raised without a new interrupt event actually having occurred), it
still seems better to avoid the situation.

For this second aspect, a distinction is being made between the
traditional and the directed-EOI cases: In the former, vectors should
not be shared throughout all IO-APICs in the system, while in the
latter case only individual IO-APICs need to be contrained (or, if the
firmware indicates so, sub- groups of them having the same GSI appear
at multiple pins).

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -395,6 +395,11 @@ static vmask_t *irq_get_used_vector_mask
             }
         }
     }
+    else if ( IO_APIC_IRQ(irq) &&
+              opt_irq_vector_map != OPT_IRQ_VECTOR_MAP_NONE )
+    {
+        ret = io_apic_get_used_vector_map(irq);
+    }
 
     return ret;
 }
--- a/xen/arch/x86/io_apic.c
+++ b/xen/arch/x86/io_apic.c
@@ -72,6 +72,33 @@ int __read_mostly nr_ioapics;
 #define io_apic_eoi_vector(apic, vector) io_apic_eoi((apic), (vector), -1)
 #define io_apic_eoi_pin(apic, pin) io_apic_eoi((apic), -1, (pin))
 
+static int apic_pin_2_gsi_irq(int apic, int pin);
+
+static vmask_t *__read_mostly vector_map[MAX_IO_APICS];
+
+static void share_vector_maps(unsigned int src, unsigned int dst)
+{
+    unsigned int pin;
+
+    if (vector_map[src] == vector_map[dst])
+        return;
+
+    bitmap_or(vector_map[src]->_bits, vector_map[src]->_bits,
+              vector_map[dst]->_bits, NR_VECTORS);
+
+    for (pin = 0; pin < nr_ioapic_entries[dst]; ++pin) {
+        int irq = apic_pin_2_gsi_irq(dst, pin);
+        struct irq_desc *desc;
+
+        if (irq < 0)
+            continue;
+        desc = irq_to_desc(irq);
+        if (desc->arch.used_vectors == vector_map[dst])
+            desc->arch.used_vectors = vector_map[src];
+    }
+
+    vector_map[dst] = vector_map[src];
+}
 
 /*
  * This is performance-critical, we want to do it O(1)
@@ -113,6 +140,7 @@ static void add_pin_to_irq(unsigned int 
     }
     entry->apic = apic;
     entry->pin = pin;
+    share_vector_maps(irq_2_pin[irq].apic, apic);
 }
 
 /*
@@ -128,6 +156,7 @@ static void __init replace_pin_at_irq(un
         if (entry->apic == oldapic && entry->pin == oldpin) {
             entry->apic = newapic;
             entry->pin = newpin;
+            share_vector_maps(oldapic, newapic);
         }
         if (!entry->next)
             break;
@@ -135,6 +164,16 @@ static void __init replace_pin_at_irq(un
     }
 }
 
+vmask_t *io_apic_get_used_vector_map(unsigned int irq)
+{
+    struct irq_pin_list *entry = irq_2_pin + irq;
+
+    if (entry->pin == -1)
+        return NULL;
+
+    return vector_map[entry->apic];
+}
+
 struct IO_APIC_route_entry **alloc_ioapic_entries(void)
 {
     int apic;
@@ -1244,6 +1283,18 @@ static void __init enable_IO_APIC(void)
     for (i = irq_2_pin_free_entry = nr_irqs_gsi; i < PIN_MAP_SIZE; i++)
         irq_2_pin[i].next = i + 1;
 
+    if (directed_eoi_enabled) {
+        for (apic = 0; apic < nr_ioapics; apic++) {
+            vector_map[apic] = xzalloc(vmask_t);
+            BUG_ON(!vector_map[apic]);
+        }
+    } else {
+        vector_map[0] = xzalloc(vmask_t);
+        BUG_ON(!vector_map[0]);
+        for (apic = 1; apic < nr_ioapics; apic++)
+            vector_map[apic] = vector_map[0];
+    }
+
     for(apic = 0; apic < nr_ioapics; apic++) {
         int pin;
         /* See if any of the pins is in ExtINT mode */
@@ -2345,13 +2396,12 @@ int ioapic_guest_write(unsigned long phy
     }
 
     if ( desc->arch.vector <= 0 || desc->arch.vector >
LAST_DYNAMIC_VECTOR ) {
+        add_pin_to_irq(irq, apic, pin);
         vector = assign_irq_vector(irq);
         if ( vector < 0 )
             return vector;
 
         printk(XENLOG_INFO "allocated vector %02x for irq %d\n",
vector, irq);
-
-        add_pin_to_irq(irq, apic, pin);
     }
     spin_lock(&dom0->event_lock);
     ret = map_domain_pirq(dom0, pirq, irq,
--- a/xen/include/asm-x86/irq.h
+++ b/xen/include/asm-x86/irq.h
@@ -116,6 +116,7 @@ int i8259A_resume(void);
 void setup_IO_APIC(void);
 void disable_IO_APIC(void);
 void setup_ioapic_dest(void);
+vmask_t *io_apic_get_used_vector_map(unsigned int irq);
 
 extern unsigned int io_apic_irqs;
 




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Andrew Cooper

2011-Nov-11 17:13 UTC

head link

Re: [Xen-devel] [PATCH] x86/IRQ: prevent vector sharing within IO-APICs

On 11/11/11 16:00, Jan Beulich wrote:> Following the prevention of vector sharing for MSIs, this change
> enforces the same within IO-APICs: Pin based interrupts use the IO-APIC
> as their identifying device under the AMD IOMMU (and just like for
> MSIs, only the identifying device is used to remap interrupts here,
> with no regard to an interrupt''s destination).
>
> Additionally, LAPIC initiated EOIs (for level triggered interrupts) too
> use only the vector for identifying which interrupts to end. While this
> generally causes no significant problem (at worst an interrupt would be
> re-raised without a new interrupt event actually having occurred)
At worst, hardware asserts a line interrupt, deasserts it later, and an
EOI broadcast gets rid of any record that the IRQ was ever raised. 
While I would classify this as buggy behavior, I believe I have seen
some hardware doing this when investigating the line level IRQ migration
bug, as clearing the IRR did not immediately cause another interrupt to
be generated.
> , it
> still seems better to avoid the situation.
>
> For this second aspect, a distinction is being made between the
> traditional and the directed-EOI cases: In the former, vectors should
> not be shared throughout all IO-APICs in the system, while in the
> latter case only individual IO-APICs need to be contrained (or, if the
> firmware indicates so, sub- groups of them having the same GSI appear
> at multiple pins).
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
Provisional nack because it is my understanding that under all
circumstances, you must maintain a vector exclusivity map across all
IO-APICs because of the broadcast problem.  Or have I made a mistake in
my reasoning?
> --- a/xen/arch/x86/irq.c
> +++ b/xen/arch/x86/irq.c
> @@ -395,6 +395,11 @@ static vmask_t *irq_get_used_vector_mask
>              }
>          }
>      }
> +    else if ( IO_APIC_IRQ(irq) &&
> +              opt_irq_vector_map != OPT_IRQ_VECTOR_MAP_NONE )
> +    {
> +        ret = io_apic_get_used_vector_map(irq);
> +    }
>  
>      return ret;
>  }
> --- a/xen/arch/x86/io_apic.c
> +++ b/xen/arch/x86/io_apic.c
> @@ -72,6 +72,33 @@ int __read_mostly nr_ioapics;
>  #define io_apic_eoi_vector(apic, vector) io_apic_eoi((apic), (vector), -1)
>  #define io_apic_eoi_pin(apic, pin) io_apic_eoi((apic), -1, (pin))
>  
> +static int apic_pin_2_gsi_irq(int apic, int pin);
> +
> +static vmask_t *__read_mostly vector_map[MAX_IO_APICS];
> +
> +static void share_vector_maps(unsigned int src, unsigned int dst)
> +{
> +    unsigned int pin;
> +
> +    if (vector_map[src] == vector_map[dst])
> +        return;
> +
> +    bitmap_or(vector_map[src]->_bits, vector_map[src]->_bits,
> +              vector_map[dst]->_bits, NR_VECTORS);
> +
> +    for (pin = 0; pin < nr_ioapic_entries[dst]; ++pin) {
> +        int irq = apic_pin_2_gsi_irq(dst, pin);
> +        struct irq_desc *desc;
> +
> +        if (irq < 0)
> +            continue;
> +        desc = irq_to_desc(irq);
> +        if (desc->arch.used_vectors == vector_map[dst])
> +            desc->arch.used_vectors = vector_map[src];
> +    }
> +
> +    vector_map[dst] = vector_map[src];
> +}
>  
>  /*
>   * This is performance-critical, we want to do it O(1)
> @@ -113,6 +140,7 @@ static void add_pin_to_irq(unsigned int 
>      }
>      entry->apic = apic;
>      entry->pin = pin;
> +    share_vector_maps(irq_2_pin[irq].apic, apic);
>  }
>  
>  /*
> @@ -128,6 +156,7 @@ static void __init replace_pin_at_irq(un
>          if (entry->apic == oldapic && entry->pin == oldpin)
{
>              entry->apic = newapic;
>              entry->pin = newpin;
> +            share_vector_maps(oldapic, newapic);
>          }
>          if (!entry->next)
>              break;
> @@ -135,6 +164,16 @@ static void __init replace_pin_at_irq(un
>      }
>  }
>  
> +vmask_t *io_apic_get_used_vector_map(unsigned int irq)
> +{
> +    struct irq_pin_list *entry = irq_2_pin + irq;
> +
> +    if (entry->pin == -1)
> +        return NULL;
> +
> +    return vector_map[entry->apic];
> +}
> +
>  struct IO_APIC_route_entry **alloc_ioapic_entries(void)
>  {
>      int apic;
> @@ -1244,6 +1283,18 @@ static void __init enable_IO_APIC(void)
>      for (i = irq_2_pin_free_entry = nr_irqs_gsi; i < PIN_MAP_SIZE; i++)
>          irq_2_pin[i].next = i + 1;
>  
> +    if (directed_eoi_enabled) {
> +        for (apic = 0; apic < nr_ioapics; apic++) {
> +            vector_map[apic] = xzalloc(vmask_t);
> +            BUG_ON(!vector_map[apic]);
> +        }
> +    } else {
> +        vector_map[0] = xzalloc(vmask_t);
> +        BUG_ON(!vector_map[0]);
> +        for (apic = 1; apic < nr_ioapics; apic++)
> +            vector_map[apic] = vector_map[0];
> +    }
> +
>      for(apic = 0; apic < nr_ioapics; apic++) {
>          int pin;
>          /* See if any of the pins is in ExtINT mode */
> @@ -2345,13 +2396,12 @@ int ioapic_guest_write(unsigned long phy
>      }
>  
>      if ( desc->arch.vector <= 0 || desc->arch.vector >
LAST_DYNAMIC_VECTOR ) {
> +        add_pin_to_irq(irq, apic, pin);
>          vector = assign_irq_vector(irq);
>          if ( vector < 0 )
>              return vector;
>  
>          printk(XENLOG_INFO "allocated vector %02x for irq %d\n",
vector, irq);
> -
> -        add_pin_to_irq(irq, apic, pin);
>      }
>      spin_lock(&dom0->event_lock);
>      ret = map_domain_pirq(dom0, pirq, irq,
> --- a/xen/include/asm-x86/irq.h
> +++ b/xen/include/asm-x86/irq.h
> @@ -116,6 +116,7 @@ int i8259A_resume(void);
>  void setup_IO_APIC(void);
>  void disable_IO_APIC(void);
>  void setup_ioapic_dest(void);
> +vmask_t *io_apic_get_used_vector_map(unsigned int irq);
>  
>  extern unsigned int io_apic_irqs;
>  
>
>
-- 
Andrew Cooper - Dom0 Kernel Engineer, Citrix XenServer
T: +44 (0)1223 225 900, http://www.citrix.com



_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Jan Beulich

2011-Nov-14 08:57 UTC

head link

Re: [Xen-devel] [PATCH] x86/IRQ: prevent vector sharing within IO-APICs

>>> On 11.11.11 at 18:13, Andrew Cooper
<andrew.cooper3@citrix.com> wrote:
> On 11/11/11 16:00, Jan Beulich wrote:
>> Following the prevention of vector sharing for MSIs, this change
>> enforces the same within IO-APICs: Pin based interrupts use the IO-APIC
>> as their identifying device under the AMD IOMMU (and just like for
>> MSIs, only the identifying device is used to remap interrupts here,
>> with no regard to an interrupt''s destination).
>>
>> Additionally, LAPIC initiated EOIs (for level triggered interrupts) too
>> use only the vector for identifying which interrupts to end. While this
>> generally causes no significant problem (at worst an interrupt would be
>> re-raised without a new interrupt event actually having occurred)
> 
> At worst, hardware asserts a line interrupt, deasserts it later, and an
> EOI broadcast gets rid of any record that the IRQ was ever raised. 
> While I would classify this as buggy behavior, I believe I have seen
> some hardware doing this when investigating the line level IRQ migration
> bug, as clearing the IRR did not immediately cause another interrupt to
> be generated.
> 
>> , it
>> still seems better to avoid the situation.
>>
>> For this second aspect, a distinction is being made between the
>> traditional and the directed-EOI cases: In the former, vectors should
>> not be shared throughout all IO-APICs in the system, while in the
>> latter case only individual IO-APICs need to be contrained (or, if the
>> firmware indicates so, sub- groups of them having the same GSI appear
>> at multiple pins).
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> Provisional nack because it is my understanding that under all
> circumstances, you must maintain a vector exclusivity map across all
> IO-APICs because of the broadcast problem.  Or have I made a mistake in
> my reasoning?
With directed EOI there''s no broadcasting involved, which is why
global sharing prevention is not necessary.

However, after some more thinking over the weekend I think we need
to also/first adjust end_level_ioapic_irq()''s call to
io_apic_eoi_vector():
It shouldn''t really iterate over all IO-APICs, but instead call
eoi_IO_APIC_irq(). Thoughts? (That would also eliminate the need to
look up pin or vector in __io_apic_eoi(), as all remaining call sites pass
both.)

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Andrew Cooper

2011-Nov-14 13:59 UTC

head link

Re: [Xen-devel] [PATCH] x86/IRQ: prevent vector sharing within IO-APICs

On 14/11/11 08:57, Jan Beulich wrote:>>>> On 11.11.11 at 18:13, Andrew Cooper
<andrew.cooper3@citrix.com> wrote:
>> On 11/11/11 16:00, Jan Beulich wrote:
>>> Following the prevention of vector sharing for MSIs, this change
>>> enforces the same within IO-APICs: Pin based interrupts use the
IO-APIC
>>> as their identifying device under the AMD IOMMU (and just like for
>>> MSIs, only the identifying device is used to remap interrupts here,
>>> with no regard to an interrupt''s destination).
>>>
>>> Additionally, LAPIC initiated EOIs (for level triggered interrupts)
too
>>> use only the vector for identifying which interrupts to end. While
this
>>> generally causes no significant problem (at worst an interrupt
would be
>>> re-raised without a new interrupt event actually having occurred)
>> At worst, hardware asserts a line interrupt, deasserts it later, and an
>> EOI broadcast gets rid of any record that the IRQ was ever raised. 
>> While I would classify this as buggy behavior, I believe I have seen
>> some hardware doing this when investigating the line level IRQ
migration
>> bug, as clearing the IRR did not immediately cause another interrupt to
>> be generated.
>>
>>> , it
>>> still seems better to avoid the situation.
>>>
>>> For this second aspect, a distinction is being made between the
>>> traditional and the directed-EOI cases: In the former, vectors
should
>>> not be shared throughout all IO-APICs in the system, while in the
>>> latter case only individual IO-APICs need to be contrained (or, if
the
>>> firmware indicates so, sub- groups of them having the same GSI
appear
>>> at multiple pins).
>>>
>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> Provisional nack because it is my understanding that under all
>> circumstances, you must maintain a vector exclusivity map across all
>> IO-APICs because of the broadcast problem.  Or have I made a mistake in
>> my reasoning?
> With directed EOI there''s no broadcasting involved, which is why
> global sharing prevention is not necessary.
>
> However, after some more thinking over the weekend I think we need
> to also/first adjust end_level_ioapic_irq()''s call to
io_apic_eoi_vector():
> It shouldn''t really iterate over all IO-APICs, but instead call
> eoi_IO_APIC_irq(). Thoughts? (That would also eliminate the need to
> look up pin or vector in __io_apic_eoi(), as all remaining call sites pass
> both.)
>
> Jan
>
I believe that should work.  By the point end_level_ioapic_irq() is
called, all the irq_desc information should point to the new vector, so
eoi_IO_APIC_irq() should get it correct.  At the time I made that patch,
I was not so familiar with the IO-APIC code so decided that calling
io_apic_eoi was the safer bet.

Having had the weekend to consider the logic in your patch, I now think
I was wrong, and your reasoning is correct.  Therefore, Ack.

-- 
Andrew Cooper - Dom0 Kernel Engineer, Citrix XenServer
T: +44 (0)1223 225 900, http://www.citrix.com


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Jan Beulich

2011-Nov-14 14:20 UTC

head link

Re: [Xen-devel] [PATCH] x86/IRQ: prevent vector sharing within IO-APICs

>>> On 14.11.11 at 14:59, Andrew Cooper
<andrew.cooper3@citrix.com> wrote:
> On 14/11/11 08:57, Jan Beulich wrote:
>>>>> On 11.11.11 at 18:13, Andrew Cooper
<andrew.cooper3@citrix.com> wrote:
>>> On 11/11/11 16:00, Jan Beulich wrote:
>>>> Following the prevention of vector sharing for MSIs, this
change
>>>> enforces the same within IO-APICs: Pin based interrupts use the
IO-APIC
>>>> as their identifying device under the AMD IOMMU (and just like
for
>>>> MSIs, only the identifying device is used to remap interrupts
here,
>>>> with no regard to an interrupt''s destination).
>>>>
>>>> Additionally, LAPIC initiated EOIs (for level triggered
interrupts) too
>>>> use only the vector for identifying which interrupts to end.
While this
>>>> generally causes no significant problem (at worst an interrupt
would be
>>>> re-raised without a new interrupt event actually having
occurred)
>>> At worst, hardware asserts a line interrupt, deasserts it later,
and an
>>> EOI broadcast gets rid of any record that the IRQ was ever raised. 
>>> While I would classify this as buggy behavior, I believe I have
seen
>>> some hardware doing this when investigating the line level IRQ
migration
>>> bug, as clearing the IRR did not immediately cause another
interrupt to
>>> be generated.
>>>
>>>> , it
>>>> still seems better to avoid the situation.
>>>>
>>>> For this second aspect, a distinction is being made between the
>>>> traditional and the directed-EOI cases: In the former, vectors
should
>>>> not be shared throughout all IO-APICs in the system, while in
the
>>>> latter case only individual IO-APICs need to be contrained (or,
if the
>>>> firmware indicates so, sub- groups of them having the same GSI
appear
>>>> at multiple pins).
>>>>
>>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>> Provisional nack because it is my understanding that under all
>>> circumstances, you must maintain a vector exclusivity map across
all
>>> IO-APICs because of the broadcast problem.  Or have I made a
mistake in
>>> my reasoning?
>> With directed EOI there''s no broadcasting involved, which is
why
>> global sharing prevention is not necessary.
>>
>> However, after some more thinking over the weekend I think we need
>> to also/first adjust end_level_ioapic_irq()''s call to
io_apic_eoi_vector():
>> It shouldn''t really iterate over all IO-APICs, but instead
call
>> eoi_IO_APIC_irq(). Thoughts? (That would also eliminate the need to
>> look up pin or vector in __io_apic_eoi(), as all remaining call sites
pass
>> both.)
>>
>> Jan
>>
> 
> I believe that should work.  By the point end_level_ioapic_irq() is
> called, all the irq_desc information should point to the new vector, so
> eoi_IO_APIC_irq() should get it correct.  At the time I made that patch,
> I was not so familiar with the IO-APIC code so decided that calling
> io_apic_eoi was the safer bet.
I''m having some more fundamental problem with this original change
of yours: The assignment of the new vector happens in the context
of move_native_irq(), which gets called *after* doing the manual
EOI (and hence also *after* the vector != desc->arch.vector check).
How does that do what it is supposed to?

(Below the [untested] patch that I would propose to do what I described
above, pending your clarification regarding the original change.)

Jan

--- a/xen/arch/x86/io_apic.c
+++ b/xen/arch/x86/io_apic.c
@@ -69,10 +69,6 @@ int __read_mostly nr_ioapics;
 
 #define ioapic_has_eoi_reg(apic) (mp_ioapics[(apic)].mpc_apicver >= 0x20)
 
-#define io_apic_eoi_vector(apic, vector) io_apic_eoi((apic), (vector), -1)
-#define io_apic_eoi_pin(apic, pin) io_apic_eoi((apic), -1, (pin))
-
-
 /*
  * This is performance-critical, we want to do it O(1)
  *
@@ -220,14 +216,11 @@ static void ioapic_write_entry(
  * same redirection entry in the IO-APIC. */
 static void __io_apic_eoi(unsigned int apic, unsigned int vector, unsigned int
pin)
 {
-    /* Ensure some useful information is passed in */
-    BUG_ON( (vector == -1 && pin == -1) );
-    
     /* Prefer the use of the EOI register if available */
     if ( ioapic_has_eoi_reg(apic) )
     {
         /* If vector is unknown, read it from the IO-APIC */
-        if ( vector == -1 )
+        if ( vector == IRQ_VECTOR_UNASSIGNED )
             vector = __ioapic_read_entry(apic, pin, TRUE).vector;
 
         *(IO_APIC_BASE(apic)+16) = vector;
@@ -239,42 +232,6 @@ static void __io_apic_eoi(unsigned int a
         struct IO_APIC_route_entry entry;
         bool_t need_to_unmask = 0;
 
-        /* If pin is unknown, search for it */
-        if ( pin == -1 )
-        {
-            unsigned int p;
-            for ( p = 0; p < nr_ioapic_entries[apic]; ++p )
-            {
-                entry = __ioapic_read_entry(apic, p, TRUE);
-                if ( entry.vector == vector )
-                {
-                    pin = p;
-                    /* break; */
-
-                    /* Here should be a break out of the loop, but at the 
-                     * Xen code doesn''t actually prevent multiple
IO-APIC
-                     * entries being assigned the same vector, so EOI all
-                     * pins which have the correct vector.
-                     *
-                     * Remove the following code when the above assertion
-                     * is fulfilled. */
-                    __io_apic_eoi(apic, vector, p);
-                }
-            }
-            
-            /* If search fails, nothing to do */
-
-            /* if ( pin == -1 ) */
-
-            /* Because the loop wasn''t broken out of (see comment
above),
-             * all relevant pins have been EOI, so we can always return.
-             * 
-             * Re-instate the if statement above when the Xen logic has been
-             * fixed.*/
-
-            return;
-        }
-
         entry = __ioapic_read_entry(apic, pin, TRUE);
 
         if ( ! entry.mask )
@@ -1645,7 +1602,6 @@ static void mask_and_ack_level_ioapic_ir
 static void end_level_ioapic_irq(struct irq_desc *desc, u8 vector)
 {
     unsigned long v;
-    int i;
 
     if ( !ioapic_ack_new )
     {
@@ -1689,15 +1645,9 @@ static void end_level_ioapic_irq(struct 
  * operation to prevent an edge-triggered interrupt escaping meanwhile.
  * The idea is from Manfred Spraul.  --macro
  */
-    i = desc->arch.vector;
-
     /* Manually EOI the old vector if we are moving to the new */
-    if ( vector && i != vector )
-    {
-        int ioapic;
-        for (ioapic = 0; ioapic < nr_ioapics; ioapic++)
-            io_apic_eoi_vector(ioapic, i);
-    }
+    if ( vector && desc->arch.vector != vector )
+        eoi_IO_APIC_irq(desc);
 
     v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
 


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Jan Beulich

2011-Nov-14 14:30 UTC

head link

Re: [Xen-devel] [PATCH] x86/IRQ: prevent vector sharing within IO-APICs

>>> On 14.11.11 at 15:20, "Jan Beulich"
<JBeulich@suse.com> wrote:
>>>> On 14.11.11 at 14:59, Andrew Cooper
<andrew.cooper3@citrix.com> wrote:
>> On 14/11/11 08:57, Jan Beulich wrote:
>>>>>> On 11.11.11 at 18:13, Andrew Cooper
<andrew.cooper3@citrix.com> wrote:
>>>> On 11/11/11 16:00, Jan Beulich wrote:
>>>>> Following the prevention of vector sharing for MSIs, this
change
>>>>> enforces the same within IO-APICs: Pin based interrupts use
the IO-APIC
>>>>> as their identifying device under the AMD IOMMU (and just
like for
>>>>> MSIs, only the identifying device is used to remap
interrupts here,
>>>>> with no regard to an interrupt''s destination).
>>>>>
>>>>> Additionally, LAPIC initiated EOIs (for level triggered
interrupts) too
>>>>> use only the vector for identifying which interrupts to
end. While this
>>>>> generally causes no significant problem (at worst an
interrupt would be
>>>>> re-raised without a new interrupt event actually having
occurred)
>>>> At worst, hardware asserts a line interrupt, deasserts it
later, and an
>>>> EOI broadcast gets rid of any record that the IRQ was ever
raised.
>>>> While I would classify this as buggy behavior, I believe I have
seen
>>>> some hardware doing this when investigating the line level IRQ
migration
>>>> bug, as clearing the IRR did not immediately cause another
interrupt to
>>>> be generated.
>>>>
>>>>> , it
>>>>> still seems better to avoid the situation.
>>>>>
>>>>> For this second aspect, a distinction is being made between
the
>>>>> traditional and the directed-EOI cases: In the former,
vectors should
>>>>> not be shared throughout all IO-APICs in the system, while
in the
>>>>> latter case only individual IO-APICs need to be contrained
(or, if the
>>>>> firmware indicates so, sub- groups of them having the same
GSI appear
>>>>> at multiple pins).
>>>>>
>>>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>>> Provisional nack because it is my understanding that under all
>>>> circumstances, you must maintain a vector exclusivity map
across all
>>>> IO-APICs because of the broadcast problem.  Or have I made a
mistake in
>>>> my reasoning?
>>> With directed EOI there''s no broadcasting involved, which
is why
>>> global sharing prevention is not necessary.
>>>
>>> However, after some more thinking over the weekend I think we need
>>> to also/first adjust end_level_ioapic_irq()''s call to
io_apic_eoi_vector():
>>> It shouldn''t really iterate over all IO-APICs, but instead
call
>>> eoi_IO_APIC_irq(). Thoughts? (That would also eliminate the need to
>>> look up pin or vector in __io_apic_eoi(), as all remaining call
sites pass
>>> both.)
>>>
>>> Jan
>>>
>> 
>> I believe that should work.  By the point end_level_ioapic_irq() is
>> called, all the irq_desc information should point to the new vector, so
>> eoi_IO_APIC_irq() should get it correct.  At the time I made that
patch,
>> I was not so familiar with the IO-APIC code so decided that calling
>> io_apic_eoi was the safer bet.
> 
> I''m having some more fundamental problem with this original change
> of yours: The assignment of the new vector happens in the context
> of move_native_irq(), which gets called *after* doing the manual
> EOI (and hence also *after* the vector != desc->arch.vector check).
> How does that do what it is supposed to?
> 
> (Below the [untested] patch that I would propose to do what I described
> above, pending your clarification regarding the original change.)
Here is one that actually compiles (and does even more cleanup, as
pointed out by the compiler).

Jan

--- a/xen/arch/x86/io_apic.c
+++ b/xen/arch/x86/io_apic.c
@@ -69,10 +69,6 @@ int __read_mostly nr_ioapics;
 
 #define ioapic_has_eoi_reg(apic) (mp_ioapics[(apic)].mpc_apicver >= 0x20)
 
-#define io_apic_eoi_vector(apic, vector) io_apic_eoi((apic), (vector), -1)
-#define io_apic_eoi_pin(apic, pin) io_apic_eoi((apic), -1, (pin))
-
-
 /*
  * This is performance-critical, we want to do it O(1)
  *
@@ -213,21 +209,18 @@ static void ioapic_write_entry(
     spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
-/* EOI an IO-APIC entry.  One of vector or pin may be -1, indicating that
- * it should be worked out using the other.  This function expect that the
- * ioapic_lock is taken, and interrupts are disabled (or there is a good reason
- * not to), and that if both pin and vector are passed, that they refer to the
+/* EOI an IO-APIC entry.  Vector may be -1, indicating that it should be
+ * worked out using the pin.  This function expects that the ioapic_lock is
+ * being held, and interrupts are disabled (or there is a good reason not
+ * to), and that if both pin and vector are passed, that they refer to the
  * same redirection entry in the IO-APIC. */
 static void __io_apic_eoi(unsigned int apic, unsigned int vector, unsigned int
pin)
 {
-    /* Ensure some useful information is passed in */
-    BUG_ON( (vector == -1 && pin == -1) );
-    
     /* Prefer the use of the EOI register if available */
     if ( ioapic_has_eoi_reg(apic) )
     {
         /* If vector is unknown, read it from the IO-APIC */
-        if ( vector == -1 )
+        if ( vector == IRQ_VECTOR_UNASSIGNED )
             vector = __ioapic_read_entry(apic, pin, TRUE).vector;
 
         *(IO_APIC_BASE(apic)+16) = vector;
@@ -239,42 +232,6 @@ static void __io_apic_eoi(unsigned int a
         struct IO_APIC_route_entry entry;
         bool_t need_to_unmask = 0;
 
-        /* If pin is unknown, search for it */
-        if ( pin == -1 )
-        {
-            unsigned int p;
-            for ( p = 0; p < nr_ioapic_entries[apic]; ++p )
-            {
-                entry = __ioapic_read_entry(apic, p, TRUE);
-                if ( entry.vector == vector )
-                {
-                    pin = p;
-                    /* break; */
-
-                    /* Here should be a break out of the loop, but at the 
-                     * Xen code doesn''t actually prevent multiple
IO-APIC
-                     * entries being assigned the same vector, so EOI all
-                     * pins which have the correct vector.
-                     *
-                     * Remove the following code when the above assertion
-                     * is fulfilled. */
-                    __io_apic_eoi(apic, vector, p);
-                }
-            }
-            
-            /* If search fails, nothing to do */
-
-            /* if ( pin == -1 ) */
-
-            /* Because the loop wasn''t broken out of (see comment
above),
-             * all relevant pins have been EOI, so we can always return.
-             * 
-             * Re-instate the if statement above when the Xen logic has been
-             * fixed.*/
-
-            return;
-        }
-
         entry = __ioapic_read_entry(apic, pin, TRUE);
 
         if ( ! entry.mask )
@@ -301,17 +258,6 @@ static void __io_apic_eoi(unsigned int a
     }
 }
 
-/* EOI an IO-APIC entry.  One of vector or pin may be -1, indicating that
- * it should be worked out using the other.  This function disables interrupts
- * and takes the ioapic_lock */
-static void io_apic_eoi(unsigned int apic, unsigned int vector, unsigned int
pin)
-{
-    unsigned int flags;
-    spin_lock_irqsave(&ioapic_lock, flags);
-    __io_apic_eoi(apic, vector, pin);
-    spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
 /*
  * Saves all the IO-APIC RTE''s
  */
@@ -1693,11 +1639,7 @@ static void end_level_ioapic_irq(struct 
 
     /* Manually EOI the old vector if we are moving to the new */
     if ( vector && i != vector )
-    {
-        int ioapic;
-        for (ioapic = 0; ioapic < nr_ioapics; ioapic++)
-            io_apic_eoi_vector(ioapic, i);
-    }
+        eoi_IO_APIC_irq(desc);
 
     v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
 


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Andrew Cooper

2011-Nov-14 15:27 UTC

head link

Re: [Xen-devel] [PATCH] x86/IRQ: prevent vector sharing within IO-APICs

On 14/11/11 14:20, Jan Beulich wrote:>>>> On 14.11.11 at 14:59, Andrew Cooper
<andrew.cooper3@citrix.com> wrote:
>> On 14/11/11 08:57, Jan Beulich wrote:
>>>>>> On 11.11.11 at 18:13, Andrew Cooper
<andrew.cooper3@citrix.com> wrote:
>>>> On 11/11/11 16:00, Jan Beulich wrote:
>>>>> Following the prevention of vector sharing for MSIs, this
change
>>>>> enforces the same within IO-APICs: Pin based interrupts use
the IO-APIC
>>>>> as their identifying device under the AMD IOMMU (and just
like for
>>>>> MSIs, only the identifying device is used to remap
interrupts here,
>>>>> with no regard to an interrupt''s destination).
>>>>>
>>>>> Additionally, LAPIC initiated EOIs (for level triggered
interrupts) too
>>>>> use only the vector for identifying which interrupts to
end. While this
>>>>> generally causes no significant problem (at worst an
interrupt would be
>>>>> re-raised without a new interrupt event actually having
occurred)
>>>> At worst, hardware asserts a line interrupt, deasserts it
later, and an
>>>> EOI broadcast gets rid of any record that the IRQ was ever
raised.
>>>> While I would classify this as buggy behavior, I believe I have
seen
>>>> some hardware doing this when investigating the line level IRQ
migration
>>>> bug, as clearing the IRR did not immediately cause another
interrupt to
>>>> be generated.
>>>>
>>>>> , it
>>>>> still seems better to avoid the situation.
>>>>>
>>>>> For this second aspect, a distinction is being made between
the
>>>>> traditional and the directed-EOI cases: In the former,
vectors should
>>>>> not be shared throughout all IO-APICs in the system, while
in the
>>>>> latter case only individual IO-APICs need to be contrained
(or, if the
>>>>> firmware indicates so, sub- groups of them having the same
GSI appear
>>>>> at multiple pins).
>>>>>
>>>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>>> Provisional nack because it is my understanding that under all
>>>> circumstances, you must maintain a vector exclusivity map
across all
>>>> IO-APICs because of the broadcast problem.  Or have I made a
mistake in
>>>> my reasoning?
>>> With directed EOI there''s no broadcasting involved, which
is why
>>> global sharing prevention is not necessary.
>>>
>>> However, after some more thinking over the weekend I think we need
>>> to also/first adjust end_level_ioapic_irq()''s call to
io_apic_eoi_vector():
>>> It shouldn''t really iterate over all IO-APICs, but instead
call
>>> eoi_IO_APIC_irq(). Thoughts? (That would also eliminate the need to
>>> look up pin or vector in __io_apic_eoi(), as all remaining call
sites pass
>>> both.)
>>>
>>> Jan
>>>
>> I believe that should work.  By the point end_level_ioapic_irq() is
>> called, all the irq_desc information should point to the new vector, so
>> eoi_IO_APIC_irq() should get it correct.  At the time I made that
patch,
>> I was not so familiar with the IO-APIC code so decided that calling
>> io_apic_eoi was the safer bet.
> I''m having some more fundamental problem with this original change
> of yours: The assignment of the new vector happens in the context
> of move_native_irq(), which gets called *after* doing the manual
> EOI (and hence also *after* the vector != desc->arch.vector check).
> How does that do what it is supposed to?
>
> (Below the [untested] patch that I would propose to do what I described
> above, pending your clarification regarding the original change.)
>
> Jan
Are you sure?  I was under the impression that for safety, you have to
move the IRQ before ack''ing it at the local APIC, to prevent another
IRQ
coming in while you are updating the structures.

The steps (as far as I remember from debugging this issue) are:

1) Scheduler decides move a vcpu to a different pcpu.  This implies
moving all bound IRQs.  It sets irq_desc->arch->pending_mask to the
pcpu(s) we wish to move to, and sets irq_desc->static |= IRQ_MOVE_PENDING
2) When an irq appears (on the original pcpu) which has IRQ_MOVE_PENDING
set, the ack handler rewrites the RTE to point to the new pcpu:vector,
and updates the irq_desc a bit.  This rewriting necessarily happens
before sending an EOI.
3) When the next irq appears (on the new pcpu), the code cleans up the
old vector_irq reference for the old pcpu, and tweaks some of irq_desc.

An alternative to 3) can occur when using the IRQ_MOVE_CLEANUP_VECTOR
IPI which looks through all pending cleanups on the current pcpu and
cleans them up.

Anyway, it was certainly the case that I saw the RTE being updated
before the EOI was sent, which is why I had to change the
hw_irq_controler.end interface to include the vector from the pending
EOI stack, so end_level_ioapic_irq could EOI the new vector if it was
different from the vector the lapic saw.

I hope this makes sense - it has been a little while since I last looked
at the problem.

~Andrew
> --- a/xen/arch/x86/io_apic.c
> +++ b/xen/arch/x86/io_apic.c
> @@ -69,10 +69,6 @@ int __read_mostly nr_ioapics;
>  
>  #define ioapic_has_eoi_reg(apic) (mp_ioapics[(apic)].mpc_apicver >=
0x20)
>  
> -#define io_apic_eoi_vector(apic, vector) io_apic_eoi((apic), (vector), -1)
> -#define io_apic_eoi_pin(apic, pin) io_apic_eoi((apic), -1, (pin))
> -
> -
>  /*
>   * This is performance-critical, we want to do it O(1)
>   *
> @@ -220,14 +216,11 @@ static void ioapic_write_entry(
>   * same redirection entry in the IO-APIC. */
>  static void __io_apic_eoi(unsigned int apic, unsigned int vector, unsigned
int pin)
>  {
> -    /* Ensure some useful information is passed in */
> -    BUG_ON( (vector == -1 && pin == -1) );
> -    
>      /* Prefer the use of the EOI register if available */
>      if ( ioapic_has_eoi_reg(apic) )
>      {
>          /* If vector is unknown, read it from the IO-APIC */
> -        if ( vector == -1 )
> +        if ( vector == IRQ_VECTOR_UNASSIGNED )
>              vector = __ioapic_read_entry(apic, pin, TRUE).vector;
>  
>          *(IO_APIC_BASE(apic)+16) = vector;
> @@ -239,42 +232,6 @@ static void __io_apic_eoi(unsigned int a
>          struct IO_APIC_route_entry entry;
>          bool_t need_to_unmask = 0;
>  
> -        /* If pin is unknown, search for it */
> -        if ( pin == -1 )
> -        {
> -            unsigned int p;
> -            for ( p = 0; p < nr_ioapic_entries[apic]; ++p )
> -            {
> -                entry = __ioapic_read_entry(apic, p, TRUE);
> -                if ( entry.vector == vector )
> -                {
> -                    pin = p;
> -                    /* break; */
> -
> -                    /* Here should be a break out of the loop, but at the 
> -                     * Xen code doesn''t actually prevent multiple
IO-APIC
> -                     * entries being assigned the same vector, so EOI all
> -                     * pins which have the correct vector.
> -                     *
> -                     * Remove the following code when the above assertion
> -                     * is fulfilled. */
> -                    __io_apic_eoi(apic, vector, p);
> -                }
> -            }
> -            
> -            /* If search fails, nothing to do */
> -
> -            /* if ( pin == -1 ) */
> -
> -            /* Because the loop wasn''t broken out of (see comment
above),
> -             * all relevant pins have been EOI, so we can always return.
> -             * 
> -             * Re-instate the if statement above when the Xen logic has
been
> -             * fixed.*/
> -
> -            return;
> -        }
> -
>          entry = __ioapic_read_entry(apic, pin, TRUE);
>  
>          if ( ! entry.mask )
> @@ -1645,7 +1602,6 @@ static void mask_and_ack_level_ioapic_ir
>  static void end_level_ioapic_irq(struct irq_desc *desc, u8 vector)
>  {
>      unsigned long v;
> -    int i;
>  
>      if ( !ioapic_ack_new )
>      {
> @@ -1689,15 +1645,9 @@ static void end_level_ioapic_irq(struct 
>   * operation to prevent an edge-triggered interrupt escaping meanwhile.
>   * The idea is from Manfred Spraul.  --macro
>   */
> -    i = desc->arch.vector;
> -
>      /* Manually EOI the old vector if we are moving to the new */
> -    if ( vector && i != vector )
> -    {
> -        int ioapic;
> -        for (ioapic = 0; ioapic < nr_ioapics; ioapic++)
> -            io_apic_eoi_vector(ioapic, i);
> -    }
> +    if ( vector && desc->arch.vector != vector )
> +        eoi_IO_APIC_irq(desc);
>  
>      v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
>  
>
-- 
Andrew Cooper - Dom0 Kernel Engineer, Citrix XenServer
T: +44 (0)1223 225 900, http://www.citrix.com


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Jan Beulich

2011-Nov-14 16:29 UTC

head link

Re: [Xen-devel] [PATCH] x86/IRQ: prevent vector sharing within IO-APICs

>>> On 14.11.11 at 16:27, Andrew Cooper
<andrew.cooper3@citrix.com> wrote:
> On 14/11/11 14:20, Jan Beulich wrote:
>> I''m having some more fundamental problem with this original
change
>> of yours: The assignment of the new vector happens in the context
>> of move_native_irq(), which gets called *after* doing the manual
>> EOI (and hence also *after* the vector != desc->arch.vector check).
>> How does that do what it is supposed to?
>>
>> (Below the [untested] patch that I would propose to do what I described
>> above, pending your clarification regarding the original change.)
>>
>> Jan
> 
> Are you sure?  I was under the impression that for safety, you have to
> move the IRQ before ack''ing it at the local APIC, to prevent
another IRQ
> coming in while you are updating the structures.
Just take another look at end_level_ioapic_irq().
> The steps (as far as I remember from debugging this issue) are:
> 
> 1) Scheduler decides move a vcpu to a different pcpu.  This implies
> moving all bound IRQs.  It sets irq_desc->arch->pending_mask to the
> pcpu(s) we wish to move to, and sets irq_desc->static |=
IRQ_MOVE_PENDING
> 2) When an irq appears (on the original pcpu) which has IRQ_MOVE_PENDING
> set, the ack handler rewrites the RTE to point to the new pcpu:vector,
> and updates the irq_desc a bit.  This rewriting necessarily happens
> before sending an EOI.
One would think so, but according to my reading of the code that''s
not the case. And it really can''t, as otherwise
io_apic_level_ack_pending() would always return true (and hence
move_native_irq() would never get called).
> 3) When the next irq appears (on the new pcpu), the code cleans up the
> old vector_irq reference for the old pcpu, and tweaks some of irq_desc.
But the crucial thing is that desc->arch.vector gets written during step 2,
and hence the condition around the EOI can never be true unless an
interrupt surfaces in the window between the EOI in step 2 and the
disabling of it in move_native_irq(). Was it just this special case that
your original change addressed (the change description doesn''t hint in
that direction).
> An alternative to 3) can occur when using the IRQ_MOVE_CLEANUP_VECTOR
> IPI which looks through all pending cleanups on the current pcpu and
> cleans them up.
> 
> Anyway, it was certainly the case that I saw the RTE being updated
> before the EOI was sent, which is why I had to change the
> hw_irq_controler.end interface to include the vector from the pending
> EOI stack, so end_level_ioapic_irq could EOI the new vector if it was
> different from the vector the lapic saw.
That''s another thing I would want to revert: There really
shouldn''t be
a need to pass in the old vector, as it is getting stored in
__assign_irq_vector(). If the value wasn''t valid anymore at the point
we''d need it, we''d have a more severe problem - the vector
gets
marked as not in use by smp_irq_move_cleanup_interrupt(), and
hence it wouldn''t be valid anymore to play with it (read: EOI it) as
it may have got re-used already. As vector_irq[] gets updated there
too, that should result in "No irq handler for vector ...", but it
would
not make it into end_level_ioapic_irq() in such a case.

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Andrew Cooper

2011-Nov-14 17:33 UTC

head link

Re: [Xen-devel] [PATCH] x86/IRQ: prevent vector sharing within IO-APICs

On 14/11/11 16:29, Jan Beulich wrote:>>>> On 14.11.11 at 16:27, Andrew Cooper
<andrew.cooper3@citrix.com> wrote:
>> On 14/11/11 14:20, Jan Beulich wrote:
>>> I''m having some more fundamental problem with this
original change
>>> of yours: The assignment of the new vector happens in the context
>>> of move_native_irq(), which gets called *after* doing the manual
>>> EOI (and hence also *after* the vector != desc->arch.vector
check).
>>> How does that do what it is supposed to?
>>>
>>> (Below the [untested] patch that I would propose to do what I
described
>>> above, pending your clarification regarding the original change.)
>>>
>>> Jan
>> Are you sure?  I was under the impression that for safety, you have to
>> move the IRQ before ack''ing it at the local APIC, to prevent
another IRQ
>> coming in while you are updating the structures.
> Just take another look at end_level_ioapic_irq().
If it makes a difference, I was debugging the issue on boxes without
x2APIC, so there was no direct EOI support.  Given the number of times
the code conditionally returns depending on direct_eoi_enabled, I would
not be surprised if there are two separate codepaths.
>> The steps (as far as I remember from debugging this issue) are:
>>
>> 1) Scheduler decides move a vcpu to a different pcpu.  This implies
>> moving all bound IRQs.  It sets irq_desc->arch->pending_mask to
the
>> pcpu(s) we wish to move to, and sets irq_desc->static |=
IRQ_MOVE_PENDING
>> 2) When an irq appears (on the original pcpu) which has
IRQ_MOVE_PENDING
>> set, the ack handler rewrites the RTE to point to the new pcpu:vector,
>> and updates the irq_desc a bit.  This rewriting necessarily happens
>> before sending an EOI.
> One would think so, but according to my reading of the code that''s
> not the case. And it really can''t, as otherwise
> io_apic_level_ack_pending() would always return true (and hence
> move_native_irq() would never get called).
In mask_and_ack_level_ioapic_irq which is the ack handler for level
triggered interrupts, there seems to be a codepath which masks the irq,
then EOIs the local apic which should clear the Remote IRR in the RTE at
which point the call to move_masked_irq() should go ahead.

I have to admit that I am perplexed as to how the original bug occurred,
but I guarantee that at the time, I was seeing the RTE being updated to
the new vector while the LAPIC still held the vector for the old irq. 
This is why the submitted patch fixed the bug.
>> 3) When the next irq appears (on the new pcpu), the code cleans up the
>> old vector_irq reference for the old pcpu, and tweaks some of irq_desc.
> But the crucial thing is that desc->arch.vector gets written during step
2,
> and hence the condition around the EOI can never be true unless an
> interrupt surfaces in the window between the EOI in step 2 and the
> disabling of it in move_native_irq(). Was it just this special case that
> your original change addressed (the change description doesn''t
hint in
> that direction).
Given that the bug was certainly a timing issue, and most of the time
the IRQ migrated, everything worked fine, It is quite possible that it
was a new IRQ appearing in that window.  Having said that, even if a new
IRQ arrived, its remote IRR should not be set until the LAPIC has acked
it, at which point we should be in the IRQ handler for it which seems
rather hard if we are still in the handler for the previous IRQ. 
Perhaps there is a weird interaction whereby acking a masked level
interrupt doesn''t reset the IRR.

I have to admit that the logic seems a bit broken, and it certainly
seems to be more complicated than it needs to be.
>> An alternative to 3) can occur when using the IRQ_MOVE_CLEANUP_VECTOR
>> IPI which looks through all pending cleanups on the current pcpu and
>> cleans them up.
>>
>> Anyway, it was certainly the case that I saw the RTE being updated
>> before the EOI was sent, which is why I had to change the
>> hw_irq_controler.end interface to include the vector from the pending
>> EOI stack, so end_level_ioapic_irq could EOI the new vector if it was
>> different from the vector the lapic saw.
> That''s another thing I would want to revert: There really
shouldn''t be
> a need to pass in the old vector, as it is getting stored in
> __assign_irq_vector(). If the value wasn''t valid anymore at the
point
> we''d need it, we''d have a more severe problem - the
vector gets
> marked as not in use by smp_irq_move_cleanup_interrupt(), and
> hence it wouldn''t be valid anymore to play with it (read: EOI it)
as
> it may have got re-used already. As vector_irq[] gets updated there
> too, that should result in "No irq handler for vector ...", but
it would
> not make it into end_level_ioapic_irq() in such a case.
>
> Jan
That is fair enough.  When I submitted this patch, irq_cfg did not have
old_vector in it - I added old_vector in a later patch.  I really should
have remembered that I could then revert the change to the interface.

-- 
Andrew Cooper - Dom0 Kernel Engineer, Citrix XenServer
T: +44 (0)1223 225 900, http://www.citrix.com


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Jan Beulich

2011-Nov-15 11:10 UTC

head link

Re: [Xen-devel] [PATCH] x86/IRQ: prevent vector sharing within IO-APICs

>>> On 14.11.11 at 18:33, Andrew Cooper
<andrew.cooper3@citrix.com> wrote:
> On 14/11/11 16:29, Jan Beulich wrote:
>>>>> On 14.11.11 at 16:27, Andrew Cooper
<andrew.cooper3@citrix.com> wrote:
>>> On 14/11/11 14:20, Jan Beulich wrote:
>>>> I''m having some more fundamental problem with this
original change
>>>> of yours: The assignment of the new vector happens in the
context
>>>> of move_native_irq(), which gets called *after* doing the
manual
>>>> EOI (and hence also *after* the vector != desc->arch.vector
check).
>>>> How does that do what it is supposed to?
>>>>
>>>> (Below the [untested] patch that I would propose to do what I
described
>>>> above, pending your clarification regarding the original
change.)
>>>>
>>>> Jan
>>> Are you sure?  I was under the impression that for safety, you have
to
>>> move the IRQ before ack''ing it at the local APIC, to
prevent another IRQ
>>> coming in while you are updating the structures.
>> Just take another look at end_level_ioapic_irq().
> 
> If it makes a difference, I was debugging the issue on boxes without
> x2APIC, so there was no direct EOI support.  Given the number of times
Directed EOI has nothing to do with x2apic.
> the code conditionally returns depending on direct_eoi_enabled, I would
> not be surprised if there are two separate codepaths.
>...
>> But the crucial thing is that desc->arch.vector gets written during
step 2,
>> and hence the condition around the EOI can never be true unless an
>> interrupt surfaces in the window between the EOI in step 2 and the
>> disabling of it in move_native_irq(). Was it just this special case
that
>> your original change addressed (the change description doesn''t
hint in
>> that direction).
> 
> Given that the bug was certainly a timing issue, and most of the time
> the IRQ migrated, everything worked fine, It is quite possible that it
> was a new IRQ appearing in that window.  Having said that, even if a new
> IRQ arrived, its remote IRR should not be set until the LAPIC has acked
> it, at which point we should be in the IRQ handler for it which seems
> rather hard if we are still in the handler for the previous IRQ. 
No - the LAPIC acking the IO-APIC request doesn''t mean the IRQ
is already in the process of being handled by software. In particular,
while the first IRQ is in progress, the IRR bit could get set by a
second interrupt instance between the EOI and the disabling (in
move_native_irq()), bit EFLAGS.IF would still be zero. Once sti (or
an equivalent thereof) would get executed, that second interrupt
would get delivered from the LAPIC to the execution unit, and the
mask-and-ack would invoke irq_complete_move(), which in turn
might send the cleanup IPI (which in turn wouldn''t get delivered
until EFLAGS.IF became 1 again). Depending on whether the IPI
arrives before or after end_level_ioapic_irq() runs for this second
interrupt instance, the vectors may or may appear to be out of
sync (and here the description of your original change seems
wrong - you''re using the new vector, not the old one, as you
want to make sure that the vector written actually matches the
already updated RTE(s)).
> Perhaps there is a weird interaction whereby acking a masked level
> interrupt doesn''t reset the IRR.
> 
> I have to admit that the logic seems a bit broken, and it certainly
> seems to be more complicated than it needs to be.
> 
>>> An alternative to 3) can occur when using the
IRQ_MOVE_CLEANUP_VECTOR
>>> IPI which looks through all pending cleanups on the current pcpu
and
>>> cleans them up.
>>>
>>> Anyway, it was certainly the case that I saw the RTE being updated
>>> before the EOI was sent, which is why I had to change the
>>> hw_irq_controler.end interface to include the vector from the
pending
>>> EOI stack, so end_level_ioapic_irq could EOI the new vector if it
was
>>> different from the vector the lapic saw.
>> That''s another thing I would want to revert: There really
shouldn''t be
>> a need to pass in the old vector, as it is getting stored in
>> __assign_irq_vector(). If the value wasn''t valid anymore at
the point
>> we''d need it, we''d have a more severe problem - the
vector gets
>> marked as not in use by smp_irq_move_cleanup_interrupt(), and
>> hence it wouldn''t be valid anymore to play with it (read: EOI
it) as
>> it may have got re-used already. As vector_irq[] gets updated there
>> too, that should result in "No irq handler for vector ...",
but it would
>> not make it into end_level_ioapic_irq() in such a case.
>>
>> Jan
> 
> That is fair enough.  When I submitted this patch, irq_cfg did not have
> old_vector in it - I added old_vector in a later patch.  I really should
> have remembered that I could then revert the change to the interface.
The point is that we need to be sure that ->arch.old_vector is still valid
at this point. Yesterday I though I convinced myself that it would be,
but today I''m again not certain anymore.

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Andrew Cooper

2011-Nov-15 12:44 UTC

head link

Re: [Xen-devel] [PATCH] x86/IRQ: prevent vector sharing within IO-APICs

On 15/11/11 11:10, Jan Beulich wrote:>>>> On 14.11.11 at 18:33, Andrew Cooper
<andrew.cooper3@citrix.com> wrote:
>> On 14/11/11 16:29, Jan Beulich wrote:
>>>>>> On 14.11.11 at 16:27, Andrew Cooper
<andrew.cooper3@citrix.com> wrote:
>>>> On 14/11/11 14:20, Jan Beulich wrote:
>>>>> I''m having some more fundamental problem with this
original change
>>>>> of yours: The assignment of the new vector happens in the
context
>>>>> of move_native_irq(), which gets called *after* doing the
manual
>>>>> EOI (and hence also *after* the vector !=
desc->arch.vector check).
>>>>> How does that do what it is supposed to?
>>>>>
>>>>> (Below the [untested] patch that I would propose to do what
I described
>>>>> above, pending your clarification regarding the original
change.)
>>>>>
>>>>> Jan
>>>> Are you sure?  I was under the impression that for safety, you
have to
>>>> move the IRQ before ack''ing it at the local APIC, to
prevent another IRQ
>>>> coming in while you are updating the structures.
>>> Just take another look at end_level_ioapic_irq().
>> If it makes a difference, I was debugging the issue on boxes without
>> x2APIC, so there was no direct EOI support.  Given the number of times
> Directed EOI has nothing to do with x2apic.
The manual (http://www.naic.edu/~phil/software/intel/318148.pdf) implies
that only x2APIC hardware has DirectEOI support, even though it is
usable in both xAPIC and x2APIC mode if bit 24 of the LAPIC version
register is set.

Irrespective of the implication, I discovered when profiling the code
that DirectEOI was not supported on the box in question.
>> the code conditionally returns depending on direct_eoi_enabled, I would
>> not be surprised if there are two separate codepaths.
>> ...
>>> But the crucial thing is that desc->arch.vector gets written
during step 2,
>>> and hence the condition around the EOI can never be true unless an
>>> interrupt surfaces in the window between the EOI in step 2 and the
>>> disabling of it in move_native_irq(). Was it just this special case
that
>>> your original change addressed (the change description
doesn''t hint in
>>> that direction).
>> Given that the bug was certainly a timing issue, and most of the time
>> the IRQ migrated, everything worked fine, It is quite possible that it
>> was a new IRQ appearing in that window.  Having said that, even if a
new
>> IRQ arrived, its remote IRR should not be set until the LAPIC has acked
>> it, at which point we should be in the IRQ handler for it which seems
>> rather hard if we are still in the handler for the previous IRQ. 
> No - the LAPIC acking the IO-APIC request doesn''t mean the IRQ
> is already in the process of being handled by software. In particular,
> while the first IRQ is in progress, the IRR bit could get set by a
> second interrupt instance between the EOI and the disabling (in
> move_native_irq()), bit EFLAGS.IF would still be zero. Once sti (or
> an equivalent thereof) would get executed, that second interrupt
> would get delivered from the LAPIC to the execution unit, and the
> mask-and-ack would invoke irq_complete_move(), which in turn
> might send the cleanup IPI (which in turn wouldn''t get delivered
> until EFLAGS.IF became 1 again). Depending on whether the IPI
> arrives before or after end_level_ioapic_irq() runs for this second
> interrupt instance, the vectors may or may appear to be out of
> sync (and here the description of your original change seems
> wrong - you''re using the new vector, not the old one, as you
> want to make sure that the vector written actually matches the
> already updated RTE(s)).
If this is the case, we really should not be sending an EOI to line
level interrupts until we have actually serviced them (and moved them if
relevant)  The IRR should not be cleared until we are able to service a
new interrupt from the source.

Rereading my comment in the patch, it is wrong.  The comment should read
/* Manually EOI the new vector if we have migrated while servicing */ or
something equivalent.
>> Perhaps there is a weird interaction whereby acking a masked level
>> interrupt doesn''t reset the IRR.
>>
>> I have to admit that the logic seems a bit broken, and it certainly
>> seems to be more complicated than it needs to be.
>>
>>>> An alternative to 3) can occur when using the
IRQ_MOVE_CLEANUP_VECTOR
>>>> IPI which looks through all pending cleanups on the current
pcpu and
>>>> cleans them up.
>>>>
>>>> Anyway, it was certainly the case that I saw the RTE being
updated
>>>> before the EOI was sent, which is why I had to change the
>>>> hw_irq_controler.end interface to include the vector from the
pending
>>>> EOI stack, so end_level_ioapic_irq could EOI the new vector if
it was
>>>> different from the vector the lapic saw.
>>> That''s another thing I would want to revert: There really
shouldn''t be
>>> a need to pass in the old vector, as it is getting stored in
>>> __assign_irq_vector(). If the value wasn''t valid anymore
at the point
>>> we''d need it, we''d have a more severe problem -
the vector gets
>>> marked as not in use by smp_irq_move_cleanup_interrupt(), and
>>> hence it wouldn''t be valid anymore to play with it (read:
EOI it) as
>>> it may have got re-used already. As vector_irq[] gets updated there
>>> too, that should result in "No irq handler for vector
...", but it would
>>> not make it into end_level_ioapic_irq() in such a case.
>>>
>>> Jan
>> That is fair enough.  When I submitted this patch, irq_cfg did not have
>> old_vector in it - I added old_vector in a later patch.  I really
should
>> have remembered that I could then revert the change to the interface.
> The point is that we need to be sure that ->arch.old_vector is still
valid
> at this point. Yesterday I though I convinced myself that it would be,
> but today I''m again not certain anymore.
>
> Jan
If your description of the case causing the bug above is correct, then
->arch.old_vector may not be valid at this point because of other
processors taking a new interrupt and calling the cleanup code.  I am
struggling to work out whether that race condition is possible or not.

One option would be to try it and see if the bug reoccurs - I still have
access to the server which would reproduce the Line Level EOI bug.

-- 
Andrew Cooper - Dom0 Kernel Engineer, Citrix XenServer
T: +44 (0)1223 225 900, http://www.citrix.com


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Jan Beulich

2011-Nov-15 13:09 UTC

head link

Re: [Xen-devel] [PATCH] x86/IRQ: prevent vector sharing within IO-APICs

>>> On 15.11.11 at 13:44, Andrew Cooper
<andrew.cooper3@citrix.com> wrote:
> One option would be to try it and see if the bug reoccurs - I still have
> access to the server which would reproduce the Line Level EOI bug.
That would be much appreciated (perhaps after the two pending patches
went in).

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Xen devel - Nov 2011 - [PATCH] x86/IRQ: prevent vector sharing within IO-APICs

[Xen-devel] [PATCH] x86/IRQ: prevent vector sharing within IO-APICs

Re: [Xen-devel] [PATCH] x86/IRQ: prevent vector sharing within IO-APICs

Re: [Xen-devel] [PATCH] x86/IRQ: prevent vector sharing within IO-APICs

Re: [Xen-devel] [PATCH] x86/IRQ: prevent vector sharing within IO-APICs

Re: [Xen-devel] [PATCH] x86/IRQ: prevent vector sharing within IO-APICs

Re: [Xen-devel] [PATCH] x86/IRQ: prevent vector sharing within IO-APICs

Re: [Xen-devel] [PATCH] x86/IRQ: prevent vector sharing within IO-APICs

Re: [Xen-devel] [PATCH] x86/IRQ: prevent vector sharing within IO-APICs

Re: [Xen-devel] [PATCH] x86/IRQ: prevent vector sharing within IO-APICs

Re: [Xen-devel] [PATCH] x86/IRQ: prevent vector sharing within IO-APICs

Re: [Xen-devel] [PATCH] x86/IRQ: prevent vector sharing within IO-APICs

Re: [Xen-devel] [PATCH] x86/IRQ: prevent vector sharing within IO-APICs