Currently xen doesn't support passthrough mask bit in msix entry,
this will conflict with qemu's control to that bit.
Pass it in xen and let device module simulate it.

Signed-off-by: Zhenzhong Duan <zhenzhong.duan@oracle.com>
---
 xen/arch/x86/hvm/vmsi.c |   67 +++++++++++-----------------------------------
 1 files changed, 16 insertions(+), 51 deletions(-)

diff --git a/xen/arch/x86/hvm/vmsi.c b/xen/arch/x86/hvm/vmsi.c
index cfc7c80..845cf4c 100644
--- a/xen/arch/x86/hvm/vmsi.c
+++ b/xen/arch/x86/hvm/vmsi.c
@@ -167,7 +167,7 @@ struct msixtbl_entry
     unsigned long table_flags[BITS_TO_LONGS(MAX_MSIX_TABLE_ENTRIES)];
 #define MAX_MSIX_ACC_ENTRIES 3
     struct {
-        uint32_t msi_ad[3];    /* Shadow of address low, high and data */
+        uint32_t msi_ad[4];    /* Shadow of address low, high, data and control */
     } gentries[MAX_MSIX_ACC_ENTRIES];
     struct rcu_head rcu;
 };
@@ -213,7 +213,6 @@ static int msixtbl_read(
 {
     unsigned long offset;
     struct msixtbl_entry *entry;
-    void *virt;
     unsigned int nr_entry, index;
     int r = X86EMUL_UNHANDLEABLE;

@@ -224,23 +223,14 @@ static int msixtbl_read(

     entry = msixtbl_find_entry(v, address);
     offset = address & (PCI_MSIX_ENTRY_SIZE - 1);
+    nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE;
+
+    if ( nr_entry >= MAX_MSIX_ACC_ENTRIES )
+        goto out;
+
+    index = offset / sizeof(uint32_t);
+    *pval = entry->gentries[nr_entry].msi_ad[index];

-    if ( offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET )
-    {
-        nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE;
-        if ( nr_entry >= MAX_MSIX_ACC_ENTRIES )
-            goto out;
-        index = offset / sizeof(uint32_t);
-        *pval = entry->gentries[nr_entry].msi_ad[index];
-    }
-    else
-    {
-        virt = msixtbl_addr_to_virt(entry, address);
-        if ( !virt )
-            goto out;
-        *pval = readl(virt);
-    }
-
     r = X86EMUL_OKAY;
 out:
     rcu_read_unlock(&msixtbl_rcu_lock);
@@ -252,7 +242,6 @@ static int msixtbl_write(struct vcpu *v, unsigned long address,
 {
     unsigned long offset;
     struct msixtbl_entry *entry;
-    void *virt;
     unsigned int nr_entry, index;
     int r = X86EMUL_UNHANDLEABLE;

@@ -264,42 +253,18 @@ static int msixtbl_write(struct vcpu *v, unsigned long address,
     entry = msixtbl_find_entry(v, address);
     nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE;

-    offset = address & (PCI_MSIX_ENTRY_SIZE - 1);
-    if ( offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET)
-    {
-        if ( nr_entry < MAX_MSIX_ACC_ENTRIES )
-        {
-            index = offset / sizeof(uint32_t);
-            entry->gentries[nr_entry].msi_ad[index] = val;
-        }
-        set_bit(nr_entry, &entry->table_flags);
-        goto out;
-    }
-
-    /* exit to device model if address/data has been modified */
-    if ( test_and_clear_bit(nr_entry, &entry->table_flags) )
+    if ( nr_entry >= MAX_MSIX_ACC_ENTRIES )
         goto out;

-    virt = msixtbl_addr_to_virt(entry, address);
-    if ( !virt )
-        goto out;
+    offset = address & (PCI_MSIX_ENTRY_SIZE - 1);
+    index = offset / sizeof(uint32_t);
+    entry->gentries[nr_entry].msi_ad[index] = val;

-    /* Do not allow the mask bit to be changed. */
-#if 0 /* XXX
-      * As the mask bit is the only defined bit in the word, and as the
-      * host MSI-X code doesn't preserve the other bits anyway, doing
-      * this is pointless. So for now just discard the write (also
-      * saving us from having to determine the matching irq_desc).
-      */
-    spin_lock_irqsave(&desc->lock, flags);
-    orig = readl(virt);
-    val &= ~PCI_MSIX_VECTOR_BITMASK;
-    val |= orig & PCI_MSIX_VECTOR_BITMASK;
-    writel(val, virt);
-    spin_unlock_irqrestore(&desc->lock, flags);
-#endif
+    if ( offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET)
+        set_bit(nr_entry, &entry->table_flags);
+    else
+        clear_bit(nr_entry, &entry->table_flags);

-    r = X86EMUL_OKAY;
 out:
     rcu_read_unlock(&msixtbl_rcu_lock);
     return r;
--
1.7.3
>>> On 21.03.13 at 04:38, Zhenzhong Duan <zhenzhong.duan@oracle.com> wrote:
> Currently xen doesn't support passthrough mask bit in msix entry,
> this will conflict with qemu's control to that bit.
> Pass it in xen and let device module simulate it.
>
> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@oracle.com>
> ---
>  xen/arch/x86/hvm/vmsi.c |   67 +++++++++++-----------------------------------
>  1 files changed, 16 insertions(+), 51 deletions(-)
>
> diff --git a/xen/arch/x86/hvm/vmsi.c b/xen/arch/x86/hvm/vmsi.c
> index cfc7c80..845cf4c 100644
> --- a/xen/arch/x86/hvm/vmsi.c
> +++ b/xen/arch/x86/hvm/vmsi.c
> @@ -167,7 +167,7 @@ struct msixtbl_entry
>      unsigned long table_flags[BITS_TO_LONGS(MAX_MSIX_TABLE_ENTRIES)];
>  #define MAX_MSIX_ACC_ENTRIES 3
>      struct {
> -        uint32_t msi_ad[3];    /* Shadow of address low, high and data */
> +        uint32_t msi_ad[4];    /* Shadow of address low, high, data and control */
>      } gentries[MAX_MSIX_ACC_ENTRIES];
>      struct rcu_head rcu;
>  };
> @@ -213,7 +213,6 @@ static int msixtbl_read(
>  {
>      unsigned long offset;
>      struct msixtbl_entry *entry;
> -    void *virt;
>      unsigned int nr_entry, index;
>      int r = X86EMUL_UNHANDLEABLE;
>
> @@ -224,23 +223,14 @@ static int msixtbl_read(
>
>      entry = msixtbl_find_entry(v, address);
>      offset = address & (PCI_MSIX_ENTRY_SIZE - 1);
> +    nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE;
> +
> +    if ( nr_entry >= MAX_MSIX_ACC_ENTRIES )
> +        goto out;
> +
> +    index = offset / sizeof(uint32_t);
> +    *pval = entry->gentries[nr_entry].msi_ad[index];
>
> -    if ( offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET )
> -    {
> -        nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE;
> -        if ( nr_entry >= MAX_MSIX_ACC_ENTRIES )
> -            goto out;
> -        index = offset / sizeof(uint32_t);
> -        *pval = entry->gentries[nr_entry].msi_ad[index];
> -    }
> -    else
> -    {
> -        virt = msixtbl_addr_to_virt(entry, address);
> -        if ( !virt )
> -            goto out;
> -        *pval = readl(virt);
> -    }
> -
>      r = X86EMUL_OKAY;
> out:
>      rcu_read_unlock(&msixtbl_rcu_lock);
> @@ -252,7 +242,6 @@ static int msixtbl_write(struct vcpu *v, unsigned long address,
>  {
>      unsigned long offset;
>      struct msixtbl_entry *entry;
> -    void *virt;
>      unsigned int nr_entry, index;
>      int r = X86EMUL_UNHANDLEABLE;
>
> @@ -264,42 +253,18 @@ static int msixtbl_write(struct vcpu *v, unsigned long address,
>      entry = msixtbl_find_entry(v, address);
>      nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE;
>
> -    offset = address & (PCI_MSIX_ENTRY_SIZE - 1);
> -    if ( offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET)
> -    {
> -        if ( nr_entry < MAX_MSIX_ACC_ENTRIES )
> -        {
> -            index = offset / sizeof(uint32_t);
> -            entry->gentries[nr_entry].msi_ad[index] = val;
> -        }
> -        set_bit(nr_entry, &entry->table_flags);
> -        goto out;
> -    }
> -
> -    /* exit to device model if address/data has been modified */
> -    if ( test_and_clear_bit(nr_entry, &entry->table_flags) )
> +    if ( nr_entry >= MAX_MSIX_ACC_ENTRIES )
>          goto out;
>
> -    virt = msixtbl_addr_to_virt(entry, address);
> -    if ( !virt )
> -        goto out;
> +    offset = address & (PCI_MSIX_ENTRY_SIZE - 1);
> +    index = offset / sizeof(uint32_t);
> +    entry->gentries[nr_entry].msi_ad[index] = val;
>
> -    /* Do not allow the mask bit to be changed. */
> -#if 0 /* XXX
> -      * As the mask bit is the only defined bit in the word, and as the
> -      * host MSI-X code doesn't preserve the other bits anyway, doing
> -      * this is pointless. So for now just discard the write (also
> -      * saving us from having to determine the matching irq_desc).
> -      */
> -    spin_lock_irqsave(&desc->lock, flags);
> -    orig = readl(virt);
> -    val &= ~PCI_MSIX_VECTOR_BITMASK;
> -    val |= orig & PCI_MSIX_VECTOR_BITMASK;
> -    writel(val, virt);
> -    spin_unlock_irqrestore(&desc->lock, flags);
> -#endif
> +    if ( offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET)
> +        set_bit(nr_entry, &entry->table_flags);
> +    else
> +        clear_bit(nr_entry, &entry->table_flags);
>
> -    r = X86EMUL_OKAY;

With that you pass all writes to the device model. Which means the
acceleration code could as well be dropped altogether, rather than
modifying it in ways that look questionable to me without further
explanation.

Furthermore, without explanation I also don't see how the mask
bit is now being dealt with properly: From an abstract pov you'd
need to merge ("or") the guest-requested mask bit state with what
Xen needs for its own purposes. I don't see anything like that here
or in the qemu side patch.

Jan

> out:
>      rcu_read_unlock(&msixtbl_rcu_lock);
>      return r;
On 2013-03-21 18:12, Jan Beulich wrote:
>>>> On 21.03.13 at 04:38, Zhenzhong Duan <zhenzhong.duan@oracle.com> wrote:
>> Currently xen doesn't support passthrough mask bit in msix entry,
>> this will conflict with qemu's control to that bit.
>> Pass it in xen and let device module simulate it.
>>
>> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@oracle.com>
>> ---
>>  xen/arch/x86/hvm/vmsi.c |   67 +++++++++++-----------------------------------
>>  1 files changed, 16 insertions(+), 51 deletions(-)
>>
>> diff --git a/xen/arch/x86/hvm/vmsi.c b/xen/arch/x86/hvm/vmsi.c
>> index cfc7c80..845cf4c 100644
>> --- a/xen/arch/x86/hvm/vmsi.c
>> +++ b/xen/arch/x86/hvm/vmsi.c
>> @@ -167,7 +167,7 @@ struct msixtbl_entry
>>      unsigned long table_flags[BITS_TO_LONGS(MAX_MSIX_TABLE_ENTRIES)];
>>  #define MAX_MSIX_ACC_ENTRIES 3
>>      struct {
>> -        uint32_t msi_ad[3];    /* Shadow of address low, high and data */
>> +        uint32_t msi_ad[4];    /* Shadow of address low, high, data and control */
>>      } gentries[MAX_MSIX_ACC_ENTRIES];
>>      struct rcu_head rcu;
>>  };
>> @@ -213,7 +213,6 @@ static int msixtbl_read(
>>  {
>>      unsigned long offset;
>>      struct msixtbl_entry *entry;
>> -    void *virt;
>>      unsigned int nr_entry, index;
>>      int r = X86EMUL_UNHANDLEABLE;
>>
>> @@ -224,23 +223,14 @@ static int msixtbl_read(
>>
>>      entry = msixtbl_find_entry(v, address);
>>      offset = address & (PCI_MSIX_ENTRY_SIZE - 1);
>> +    nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE;
>> +
>> +    if ( nr_entry >= MAX_MSIX_ACC_ENTRIES )
>> +        goto out;
>> +
>> +    index = offset / sizeof(uint32_t);
>> +    *pval = entry->gentries[nr_entry].msi_ad[index];
>>
>> -    if ( offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET )
>> -    {
>> -        nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE;
>> -        if ( nr_entry >= MAX_MSIX_ACC_ENTRIES )
>> -            goto out;
>> -        index = offset / sizeof(uint32_t);
>> -        *pval = entry->gentries[nr_entry].msi_ad[index];
>> -    }
>> -    else
>> -    {
>> -        virt = msixtbl_addr_to_virt(entry, address);
>> -        if ( !virt )
>> -            goto out;
>> -        *pval = readl(virt);
>> -    }
>> -
>>      r = X86EMUL_OKAY;
>> out:
>>      rcu_read_unlock(&msixtbl_rcu_lock);
>> @@ -252,7 +242,6 @@ static int msixtbl_write(struct vcpu *v, unsigned long address,
>>  {
>>      unsigned long offset;
>>      struct msixtbl_entry *entry;
>> -    void *virt;
>>      unsigned int nr_entry, index;
>>      int r = X86EMUL_UNHANDLEABLE;
>>
>> @@ -264,42 +253,18 @@ static int msixtbl_write(struct vcpu *v, unsigned long address,
>>      entry = msixtbl_find_entry(v, address);
>>      nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE;
>>
>> -    offset = address & (PCI_MSIX_ENTRY_SIZE - 1);
>> -    if ( offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET)
>> -    {
>> -        if ( nr_entry < MAX_MSIX_ACC_ENTRIES )
>> -        {
>> -            index = offset / sizeof(uint32_t);
>> -            entry->gentries[nr_entry].msi_ad[index] = val;
>> -        }
>> -        set_bit(nr_entry, &entry->table_flags);
>> -        goto out;
>> -    }
>> -
>> -    /* exit to device model if address/data has been modified */
>> -    if ( test_and_clear_bit(nr_entry, &entry->table_flags) )
>> +    if ( nr_entry >= MAX_MSIX_ACC_ENTRIES )
>>          goto out;
>>
>> -    virt = msixtbl_addr_to_virt(entry, address);
>> -    if ( !virt )
>> -        goto out;
>> +    offset = address & (PCI_MSIX_ENTRY_SIZE - 1);
>> +    index = offset / sizeof(uint32_t);
>> +    entry->gentries[nr_entry].msi_ad[index] = val;
>>
>> -    /* Do not allow the mask bit to be changed. */
>> -#if 0 /* XXX
>> -      * As the mask bit is the only defined bit in the word, and as the
>> -      * host MSI-X code doesn't preserve the other bits anyway, doing
>> -      * this is pointless. So for now just discard the write (also
>> -      * saving us from having to determine the matching irq_desc).
>> -      */
>> -    spin_lock_irqsave(&desc->lock, flags);
>> -    orig = readl(virt);
>> -    val &= ~PCI_MSIX_VECTOR_BITMASK;
>> -    val |= orig & PCI_MSIX_VECTOR_BITMASK;
>> -    writel(val, virt);
>> -    spin_unlock_irqrestore(&desc->lock, flags);
>> -#endif
>> +    if ( offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET)
>> +        set_bit(nr_entry, &entry->table_flags);
>> +    else
>> +        clear_bit(nr_entry, &entry->table_flags);
>>
>> -    r = X86EMUL_OKAY;
> With that you pass all writes to the device model. Which means the
> acceleration code could as well be dropped altogether, rather than
> modifying it in ways that look questionable to me without further
> explanation.

It only accelerates msixtbl_read with this change, and that
acceleration needs the shadowing code in msixtbl_write.

> Furthermore, without explanation I also don't see how the mask
> bit is now being dealt with properly: From an abstract pov you'd
> need to merge ("or") the guest-requested mask bit state with what
> Xen needs for its own purposes. I don't see anything like that here
> or in the qemu side patch.

Right, I didn't consider combining xen's masking with the guest's.
My patch just targets making irq affinity work in old hvm guests and
avoiding the occasional "no irq handler" panic.
But I would like to know what the result is if the guest's mask setting
isn't passed to the device: interrupt loss or something else? I see the
current implementation doesn't pass through the mask bit either.

thanks
zduan
>>> On 21.03.13 at 11:50, Zhenzhong Duan <zhenzhong.duan@oracle.com> wrote:
> On 2013-03-21 18:12, Jan Beulich wrote:
>> Furthermore, without explanation I also don't see how the mask
>> bit is now being dealt with properly: From an abstract pov you'd
>> need to merge ("or") the guest-requested mask bit state with what
>> Xen needs for its own purposes. I don't see anything like that here
>> or in the qemu side patch.
> Right, I didn't consider combining xen's masking with the guest's.
> My patch just targets making irq affinity work in old hvm guests and
> avoiding the occasional "no irq handler" panic.
> But I would like to know what the result is if the guest's mask setting
> isn't passed to the device: interrupt loss or something else? I see the
> current implementation doesn't pass through the mask bit either.

The effect of this is unknown, as it depends on the particular
driver behavior and assumptions. Loss of interrupts, however,
doesn't appear to be problematic here; instead, interrupts
getting delivered when the driver doesn't expect them seems
to be the much bigger problem.

Jan
Jan Beulich wrote:
>>>> On 21.03.13 at 11:50, Zhenzhong Duan <zhenzhong.duan@oracle.com> wrote:
>> On 2013-03-21 18:12, Jan Beulich wrote:
>>> Furthermore, without explanation I also don't see how the mask
>>> bit is now being dealt with properly: From an abstract pov you'd
>>> need to merge ("or") the guest-requested mask bit state with what
>>> Xen needs for its own purposes. I don't see anything like that here
>>> or in the qemu side patch.
>> Right, I didn't consider combining xen's masking with the guest's.
>> My patch just targets making irq affinity work in old hvm guests and
>> avoiding the occasional "no irq handler" panic.
>> But I would like to know what the result is if the guest's mask setting
>> isn't passed to the device: interrupt loss or something else? I see the
>> current implementation doesn't pass through the mask bit either.
>
> The effect of this is unknown, as it depends on the particular
> driver behavior and assumptions. Loss of interrupts, however,
> doesn't appear to be problematic here; instead, interrupts
> getting delivered when the driver doesn't expect them seems
> to be the much bigger problem.
>
> Jan

It's hard to merge the guest-requested mask with xen's, because xen
lacks an irq_desc.depth counter like the kernel's for nested irq
disable and enable.

--
Regards
zhenzhong

--
Oracle Building, No.24 Building, Zhongguancun Software Park
Haidian District, Beijing 100193, China