This addresses a number of problems in msixtbl_{read,write}():
- address alignment was not checked, allowing for memory corruption in
the hypervisor (write case) or returning of hypervisor private data
to the guest (read case)
- the interrupt mask bit was permitted to be written by the guest
(while Xen''s interrupt flow control routines need to control it)
- MAX_MSIX_TABLE_{ENTRIES,PAGES} were pointlessly defined to plain
numbers (making it unobvious why they have these values, and making
the latter non-portable)
- MAX_MSIX_TABLE_PAGES was also off by one (failing to account for a
non-zero table offset); this was also affecting host MSI-X code
- struct msixtbl_entry''s table_flags[] was one element larger than
necessary due to improper open-coding of BITS_TO_LONGS()
- msixtbl_read() unconditionally accessed the physical table, even
though the data was only needed in a quarter of all cases
- various calculations were done unnecessarily for both of the rather
distinct code paths in msixtbl_read()
Additionally it is unclear on what basis MAX_MSIX_ACC_ENTRIES was
chosen to be 3.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/arch/x86/hvm/vmsi.c
+++ b/xen/arch/x86/hvm/vmsi.c
@@ -165,7 +165,7 @@ struct msixtbl_entry
struct pci_dev *pdev;
unsigned long gtable; /* gpa of msix table */
unsigned long table_len;
- unsigned long table_flags[MAX_MSIX_TABLE_ENTRIES / BITS_PER_LONG + 1];
+ unsigned long table_flags[BITS_TO_LONGS(MAX_MSIX_TABLE_ENTRIES)];
#define MAX_MSIX_ACC_ENTRIES 3
struct {
uint32_t msi_ad[3]; /* Shadow of address low, high and data */
@@ -192,17 +192,14 @@ static struct msixtbl_entry *msixtbl_fin
static void __iomem *msixtbl_addr_to_virt(
struct msixtbl_entry *entry, unsigned long addr)
{
- int idx, nr_page;
+ unsigned int idx, nr_page;
- if ( !entry )
+ if ( !entry || !entry->pdev )
return NULL;
nr_page = (addr >> PAGE_SHIFT) -
(entry->gtable >> PAGE_SHIFT);
- if ( !entry->pdev )
- return NULL;
-
idx = entry->pdev->msix_table_idx[nr_page];
if ( !idx )
return NULL;
@@ -215,37 +212,34 @@ static int msixtbl_read(
struct vcpu *v, unsigned long address,
unsigned long len, unsigned long *pval)
{
- unsigned long offset, val;
+ unsigned long offset;
struct msixtbl_entry *entry;
void *virt;
- int nr_entry, index;
+ unsigned int nr_entry, index;
int r = X86EMUL_UNHANDLEABLE;
- rcu_read_lock(&msixtbl_rcu_lock);
+ if ( len != 4 || (address & 3) )
+ return r;
- if ( len != 4 )
- goto out;
+ rcu_read_lock(&msixtbl_rcu_lock);
entry = msixtbl_find_entry(v, address);
- virt = msixtbl_addr_to_virt(entry, address);
- if ( !virt )
- goto out;
-
- nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE;
offset = address & (PCI_MSIX_ENTRY_SIZE - 1);
- if ( nr_entry >= MAX_MSIX_ACC_ENTRIES &&
- offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET )
- goto out;
- val = readl(virt);
if ( offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET )
{
+ nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE;
+ if ( nr_entry >= MAX_MSIX_ACC_ENTRIES )
+ goto out;
index = offset / sizeof(uint32_t);
*pval = entry->gentries[nr_entry].msi_ad[index];
}
else
{
- *pval = val;
+ virt = msixtbl_addr_to_virt(entry, address);
+ if ( !virt )
+ goto out;
+ *pval = readl(virt);
}
r = X86EMUL_OKAY;
@@ -260,13 +254,13 @@ static int msixtbl_write(struct vcpu *v,
unsigned long offset;
struct msixtbl_entry *entry;
void *virt;
- int nr_entry, index;
+ unsigned int nr_entry, index;
int r = X86EMUL_UNHANDLEABLE;
- rcu_read_lock(&msixtbl_rcu_lock);
+ if ( len != 4 || (address & 3) )
+ return r;
- if ( len != 4 )
- goto out;
+ rcu_read_lock(&msixtbl_rcu_lock);
entry = msixtbl_find_entry(v, address);
nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE;
@@ -291,9 +285,22 @@ static int msixtbl_write(struct vcpu *v,
if ( !virt )
goto out;
+ /* Do not allow the mask bit to be changed. */
+#if 0 /* XXX
+ * As the mask bit is the only defined bit in the word, and as the
+ * host MSI-X code doesn''t preserve the other bits anyway, doing
+ * this is pointless. So for now just discard the write (also
+ * saving us from having to determine the matching irq_desc).
+ */
+ spin_lock_irqsave(&desc->lock, flags);
+ orig = readl(virt);
+ val &= ~PCI_MSIX_VECTOR_BITMASK;
+ val |= orig & PCI_MSIX_VECTOR_BITMASK;
writel(val, virt);
- r = X86EMUL_OKAY;
+ spin_unlock_irqrestore(&desc->lock, flags);
+#endif
+ r = X86EMUL_OKAY;
out:
rcu_read_unlock(&msixtbl_rcu_lock);
return r;
--- a/xen/include/asm-x86/msi.h
+++ b/xen/include/asm-x86/msi.h
@@ -122,12 +122,6 @@ int msi_free_irq(struct msi_desc *entry)
*/
#define NR_HP_RESERVED_VECTORS 20
-#define PCI_MSIX_ENTRY_SIZE 16
-#define PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET 0
-#define PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET 4
-#define PCI_MSIX_ENTRY_DATA_OFFSET 8
-#define PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET 12
-
#define msi_control_reg(base) (base + PCI_MSI_FLAGS)
#define msi_lower_address_reg(base) (base + PCI_MSI_ADDRESS_LO)
#define msi_upper_address_reg(base) (base + PCI_MSI_ADDRESS_HI)
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -11,6 +11,8 @@
#include <xen/list.h>
#include <xen/spinlock.h>
#include <xen/irq.h>
+#include <xen/pci_regs.h>
+#include <xen/pfn.h>
#include <asm/pci.h>
/*
@@ -30,8 +32,10 @@
#define PCI_BDF(b,d,f) ((((b) & 0xff) << 8) | PCI_DEVFN(d,f))
#define PCI_BDF2(b,df) ((((b) & 0xff) << 8) | ((df) & 0xff))
-#define MAX_MSIX_TABLE_ENTRIES 2048
-#define MAX_MSIX_TABLE_PAGES 8
+#define MAX_MSIX_TABLE_ENTRIES (PCI_MSIX_FLAGS_QSIZE + 1)
+#define MAX_MSIX_TABLE_PAGES PFN_UP(MAX_MSIX_TABLE_ENTRIES * \
+ PCI_MSIX_ENTRY_SIZE + \
+ (~PCI_MSIX_BIRMASK & (PAGE_SIZE -
1)))
struct pci_dev_info {
bool_t is_extfn;
bool_t is_virtfn;
--- a/xen/include/xen/pci_regs.h
+++ b/xen/include/xen/pci_regs.h
@@ -305,6 +305,12 @@
#define PCI_MSIX_PBA 8
#define PCI_MSIX_BIRMASK (7 << 0)
+#define PCI_MSIX_ENTRY_SIZE 16
+#define PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET 0
+#define PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET 4
+#define PCI_MSIX_ENTRY_DATA_OFFSET 8
+#define PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET 12
+
#define PCI_MSIX_VECTOR_BITMASK (1 << 0)
/* CompactPCI Hotswap Register */
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel
On 20/01/12 16:33, Jan Beulich wrote:> This addresses a number of problems in msixtbl_{read,write}(): > - address alignment was not checked, allowing for memory corruption in > the hypervisor (write case) or returning of hypervisor private data > to the guest (read case) > - the interrupt mask bit was permitted to be written by the guest > (while Xen''s interrupt flow control routines need to control it) > - MAX_MSIX_TABLE_{ENTRIES,PAGES} were pointlessly defined to plain > numbers (making it unobvious why they have these values, and making > the latter non-portable) > - MAX_MSIX_TABLE_PAGES was also off by one (failing to account for a > non-zero table offset); this was also affecting host MSI-X code > - struct msixtbl_entry''s table_flags[] was one element larger than > necessary due to improper open-coding of BITS_TO_LONGS() > - msixtbl_read() unconditionally accessed the physical table, even > though the data was only needed in a quarter of all cases > - various calculations were done unnecessarily for both of the rather > distinct code paths in msixtbl_read() > > Additionally it is unclear on what basis MAX_MSIX_ACC_ENTRIES was > chosen to be 3. > > Signed-off-by: Jan Beulich <jbeulich@suse.com> > > --- a/xen/arch/x86/hvm/vmsi.c > +++ b/xen/arch/x86/hvm/vmsi.c > @@ -165,7 +165,7 @@ struct msixtbl_entry > struct pci_dev *pdev; > unsigned long gtable; /* gpa of msix table */ > unsigned long table_len; > - unsigned long table_flags[MAX_MSIX_TABLE_ENTRIES / BITS_PER_LONG + 1]; > + unsigned long table_flags[BITS_TO_LONGS(MAX_MSIX_TABLE_ENTRIES)]; > #define MAX_MSIX_ACC_ENTRIES 3 > struct { > uint32_t msi_ad[3]; /* Shadow of address low, high and data */ > @@ -192,17 +192,14 @@ static struct msixtbl_entry *msixtbl_fin > static void __iomem *msixtbl_addr_to_virt( > struct msixtbl_entry *entry, unsigned long addr) > { > - int idx, nr_page; > + unsigned int idx, nr_page; > > - if ( !entry ) > + if ( !entry || !entry->pdev ) > return NULL; > > nr_page = (addr >> PAGE_SHIFT) - > (entry->gtable >> PAGE_SHIFT); > > - if ( !entry->pdev ) > - return NULL; > - > idx = entry->pdev->msix_table_idx[nr_page]; > if ( !idx ) > return NULL; > @@ -215,37 +212,34 @@ static int msixtbl_read( > struct vcpu *v, unsigned long address, > unsigned long len, unsigned long *pval) > { > - unsigned long offset, val; > + unsigned long offset; > struct msixtbl_entry *entry; > void *virt; > - int nr_entry, index; > + unsigned int nr_entry, index; > int r = X86EMUL_UNHANDLEABLE; > > - rcu_read_lock(&msixtbl_rcu_lock); > + if ( len != 4 || (address & 3) ) > + return r; > > - if ( len != 4 ) > - goto out; > + rcu_read_lock(&msixtbl_rcu_lock); > > entry = msixtbl_find_entry(v, address); > - virt = msixtbl_addr_to_virt(entry, address); > - if ( !virt ) > - goto out; > - > - nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE; > offset = address & (PCI_MSIX_ENTRY_SIZE - 1); > - if ( nr_entry >= MAX_MSIX_ACC_ENTRIES && > - offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET ) > - goto out; > > - val = readl(virt); > if ( offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET ) > { > + nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE; > + if ( nr_entry >= MAX_MSIX_ACC_ENTRIES ) > + goto out; > index = offset / sizeof(uint32_t); > *pval = entry->gentries[nr_entry].msi_ad[index]; > } > else > { > - *pval = val; > + virt = msixtbl_addr_to_virt(entry, address); > + if ( !virt ) > + goto out; > + *pval = readl(virt); > } > > r = X86EMUL_OKAY; > @@ -260,13 +254,13 @@ static int msixtbl_write(struct vcpu *v, > unsigned long offset; > struct msixtbl_entry *entry; > void *virt; > - int nr_entry, index; > + unsigned int nr_entry, index; > int r = X86EMUL_UNHANDLEABLE; > > - rcu_read_lock(&msixtbl_rcu_lock); > + if ( len != 4 || (address & 3) ) > + return r; > > - if ( len != 4 ) > - goto out; > + rcu_read_lock(&msixtbl_rcu_lock); > > entry = msixtbl_find_entry(v, address); > nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE; > @@ -291,9 +285,22 @@ static int msixtbl_write(struct vcpu *v, > if ( !virt ) > goto out; > > + /* Do not allow the mask bit to be changed. */ > +#if 0 /* XXXYou appear to still have some debugging in this patch. ~Andrew> + * As the mask bit is the only defined bit in the word, and as the > + * host MSI-X code doesn''t preserve the other bits anyway, doing > + * this is pointless. So for now just discard the write (also > + * saving us from having to determine the matching irq_desc). > + */ > + spin_lock_irqsave(&desc->lock, flags); > + orig = readl(virt); > + val &= ~PCI_MSIX_VECTOR_BITMASK; > + val |= orig & PCI_MSIX_VECTOR_BITMASK; > writel(val, virt); > - r = X86EMUL_OKAY; > + spin_unlock_irqrestore(&desc->lock, flags); > +#endif > > + r = X86EMUL_OKAY; > out: > rcu_read_unlock(&msixtbl_rcu_lock); > return r; > --- a/xen/include/asm-x86/msi.h > +++ b/xen/include/asm-x86/msi.h > @@ -122,12 +122,6 @@ int msi_free_irq(struct msi_desc *entry) > */ > #define NR_HP_RESERVED_VECTORS 20 > > -#define PCI_MSIX_ENTRY_SIZE 16 > -#define PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET 0 > -#define PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET 4 > -#define PCI_MSIX_ENTRY_DATA_OFFSET 8 > -#define PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET 12 > - > #define msi_control_reg(base) (base + PCI_MSI_FLAGS) > #define msi_lower_address_reg(base) (base + PCI_MSI_ADDRESS_LO) > #define msi_upper_address_reg(base) (base + PCI_MSI_ADDRESS_HI) > --- a/xen/include/xen/pci.h > +++ b/xen/include/xen/pci.h > @@ -11,6 +11,8 @@ > #include <xen/list.h> > #include <xen/spinlock.h> > #include <xen/irq.h> > +#include <xen/pci_regs.h> > +#include <xen/pfn.h> > #include <asm/pci.h> > > /* > @@ -30,8 +32,10 @@ > #define PCI_BDF(b,d,f) ((((b) & 0xff) << 8) | PCI_DEVFN(d,f)) > #define PCI_BDF2(b,df) ((((b) & 0xff) << 8) | ((df) & 0xff)) > > -#define MAX_MSIX_TABLE_ENTRIES 2048 > -#define MAX_MSIX_TABLE_PAGES 8 > +#define MAX_MSIX_TABLE_ENTRIES (PCI_MSIX_FLAGS_QSIZE + 1) > +#define MAX_MSIX_TABLE_PAGES PFN_UP(MAX_MSIX_TABLE_ENTRIES * \ > + PCI_MSIX_ENTRY_SIZE + \ > + (~PCI_MSIX_BIRMASK & (PAGE_SIZE - 1))) > struct pci_dev_info { > bool_t is_extfn; > bool_t is_virtfn; > --- a/xen/include/xen/pci_regs.h > +++ b/xen/include/xen/pci_regs.h > @@ -305,6 +305,12 @@ > #define PCI_MSIX_PBA 8 > #define PCI_MSIX_BIRMASK (7 << 0) > > +#define PCI_MSIX_ENTRY_SIZE 16 > +#define PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET 0 > +#define PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET 4 > +#define PCI_MSIX_ENTRY_DATA_OFFSET 8 > +#define PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET 12 > + > #define PCI_MSIX_VECTOR_BITMASK (1 << 0) > > /* CompactPCI Hotswap Register */ > >-- Andrew Cooper - Dom0 Kernel Engineer, Citrix XenServer T: +44 (0)1223 225 900, http://www.citrix.com
>>> On 20.01.12 at 17:33, "Jan Beulich" <JBeulich@suse.com> wrote: > This addresses a number of problems in msixtbl_{read,write}(): > - address alignment was not checked, allowing for memory corruption in > the hypervisor (write case) or returning of hypervisor private data > to the guest (read case) > - the interrupt mask bit was permitted to be written by the guest > (while Xen''s interrupt flow control routines need to control it) > - MAX_MSIX_TABLE_{ENTRIES,PAGES} were pointlessly defined to plain > numbers (making it unobvious why they have these values, and making > the latter non-portable) > - MAX_MSIX_TABLE_PAGES was also off by one (failing to account for a > non-zero table offset); this was also affecting host MSI-X code > - struct msixtbl_entry''s table_flags[] was one element larger than > necessary due to improper open-coding of BITS_TO_LONGS() > - msixtbl_read() unconditionally accessed the physical table, even > though the data was only needed in a quarter of all cases > - various calculations were done unnecessarily for both of the rather > distinct code paths in msixtbl_read()I should note (as done in the earlier response to the 4.2 todo thread) that I have no way of actually testing this, so I''m relying on others here. (Intel? Or, Ian, would the stage tester find issues with passed through MSI-X capable devices?) Jan> Additionally it is unclear on what basis MAX_MSIX_ACC_ENTRIES was > chosen to be 3. > > Signed-off-by: Jan Beulich <jbeulich@suse.com> > > --- a/xen/arch/x86/hvm/vmsi.c > +++ b/xen/arch/x86/hvm/vmsi.c > @@ -165,7 +165,7 @@ struct msixtbl_entry > struct pci_dev *pdev; > unsigned long gtable; /* gpa of msix table */ > unsigned long table_len; > - unsigned long table_flags[MAX_MSIX_TABLE_ENTRIES / BITS_PER_LONG + 1]; > + unsigned long table_flags[BITS_TO_LONGS(MAX_MSIX_TABLE_ENTRIES)]; > #define MAX_MSIX_ACC_ENTRIES 3 > struct { > uint32_t msi_ad[3]; /* Shadow of address low, high and data */ > @@ -192,17 +192,14 @@ static struct msixtbl_entry *msixtbl_fin > static void __iomem *msixtbl_addr_to_virt( > struct msixtbl_entry *entry, unsigned long addr) > { > - int idx, nr_page; > + unsigned int idx, nr_page; > > - if ( !entry ) > + if ( !entry || !entry->pdev ) > return NULL; > > nr_page = (addr >> PAGE_SHIFT) - > (entry->gtable >> PAGE_SHIFT); > > - if ( !entry->pdev ) > - return NULL; > - > idx = entry->pdev->msix_table_idx[nr_page]; > if ( !idx ) > return NULL; > @@ -215,37 +212,34 @@ static int msixtbl_read( > struct vcpu *v, unsigned long address, > unsigned long len, unsigned long *pval) > { > - unsigned long offset, val; > + unsigned long offset; > struct msixtbl_entry *entry; > void *virt; > - int nr_entry, index; > + unsigned int nr_entry, index; > int r = X86EMUL_UNHANDLEABLE; > > - rcu_read_lock(&msixtbl_rcu_lock); > + if ( len != 4 || (address & 3) ) > + return r; > > - if ( len != 4 ) > - goto out; > + rcu_read_lock(&msixtbl_rcu_lock); > > entry = msixtbl_find_entry(v, address); > - virt = msixtbl_addr_to_virt(entry, address); > - if ( !virt ) > - goto out; > - > - nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE; > offset = address & (PCI_MSIX_ENTRY_SIZE - 1); > - if ( nr_entry >= MAX_MSIX_ACC_ENTRIES && > - offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET ) > - goto out; > > - val = readl(virt); > if ( offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET ) > { > + nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE; > + if ( nr_entry >= MAX_MSIX_ACC_ENTRIES ) > + goto out; > index = offset / sizeof(uint32_t); > *pval = entry->gentries[nr_entry].msi_ad[index]; > } > else > { > - *pval = val; > + virt = msixtbl_addr_to_virt(entry, address); > + if ( !virt ) > + goto out; > + *pval = readl(virt); > } > > r = X86EMUL_OKAY; > @@ -260,13 +254,13 @@ static int msixtbl_write(struct vcpu *v, > unsigned long offset; > struct msixtbl_entry *entry; > void *virt; > - int nr_entry, index; > + unsigned int nr_entry, index; > int r = X86EMUL_UNHANDLEABLE; > > - rcu_read_lock(&msixtbl_rcu_lock); > + if ( len != 4 || (address & 3) ) > + return r; > > - if ( len != 4 ) > - goto out; > + rcu_read_lock(&msixtbl_rcu_lock); > > entry = msixtbl_find_entry(v, address); > nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE; > @@ -291,9 +285,22 @@ static int msixtbl_write(struct vcpu *v, > if ( !virt ) > goto out; > > + /* Do not allow the mask bit to be changed. */ > +#if 0 /* XXX > + * As the mask bit is the only defined bit in the word, and as the > + * host MSI-X code doesn''t preserve the other bits anyway, doing > + * this is pointless. So for now just discard the write (also > + * saving us from having to determine the matching irq_desc). > + */ > + spin_lock_irqsave(&desc->lock, flags); > + orig = readl(virt); > + val &= ~PCI_MSIX_VECTOR_BITMASK; > + val |= orig & PCI_MSIX_VECTOR_BITMASK; > writel(val, virt); > - r = X86EMUL_OKAY; > + spin_unlock_irqrestore(&desc->lock, flags); > +#endif > > + r = X86EMUL_OKAY; > out: > rcu_read_unlock(&msixtbl_rcu_lock); > return r; > --- a/xen/include/asm-x86/msi.h > +++ b/xen/include/asm-x86/msi.h > @@ -122,12 +122,6 @@ int msi_free_irq(struct msi_desc *entry) > */ > #define NR_HP_RESERVED_VECTORS 20 > > -#define PCI_MSIX_ENTRY_SIZE 16 > -#define PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET 0 > -#define PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET 4 > -#define PCI_MSIX_ENTRY_DATA_OFFSET 8 > -#define PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET 12 > - > #define msi_control_reg(base) (base + PCI_MSI_FLAGS) > #define msi_lower_address_reg(base) (base + PCI_MSI_ADDRESS_LO) > #define msi_upper_address_reg(base) (base + PCI_MSI_ADDRESS_HI) > --- a/xen/include/xen/pci.h > +++ b/xen/include/xen/pci.h > @@ -11,6 +11,8 @@ > #include <xen/list.h> > #include <xen/spinlock.h> > #include <xen/irq.h> > +#include <xen/pci_regs.h> > +#include <xen/pfn.h> > #include <asm/pci.h> > > /* > @@ -30,8 +32,10 @@ > #define PCI_BDF(b,d,f) ((((b) & 0xff) << 8) | PCI_DEVFN(d,f)) > #define PCI_BDF2(b,df) ((((b) & 0xff) << 8) | ((df) & 0xff)) > > -#define MAX_MSIX_TABLE_ENTRIES 2048 > -#define MAX_MSIX_TABLE_PAGES 8 > +#define MAX_MSIX_TABLE_ENTRIES (PCI_MSIX_FLAGS_QSIZE + 1) > +#define MAX_MSIX_TABLE_PAGES PFN_UP(MAX_MSIX_TABLE_ENTRIES * \ > + PCI_MSIX_ENTRY_SIZE + \ > + (~PCI_MSIX_BIRMASK & (PAGE_SIZE - 1))) > struct pci_dev_info { > bool_t is_extfn; > bool_t is_virtfn; > --- a/xen/include/xen/pci_regs.h > +++ b/xen/include/xen/pci_regs.h > @@ -305,6 +305,12 @@ > #define PCI_MSIX_PBA 8 > #define PCI_MSIX_BIRMASK (7 << 0) > > +#define PCI_MSIX_ENTRY_SIZE 16 > +#define PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET 0 > +#define PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET 4 > +#define PCI_MSIX_ENTRY_DATA_OFFSET 8 > +#define PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET 12 > + > #define PCI_MSIX_VECTOR_BITMASK (1 << 0) > > /* CompactPCI Hotswap Register */
>>> On 20.01.12 at 17:39, Andrew Cooper <andrew.cooper3@citrix.com> wrote: > On 20/01/12 16:33, Jan Beulich wrote: >> @@ -291,9 +285,22 @@ static int msixtbl_write(struct vcpu *v, >> if ( !virt ) >> goto out; >> >> + /* Do not allow the mask bit to be changed. */ >> +#if 0 /* XXX > > You appear to still have some debugging in this patch.No - that''s why the comment is there. I wanted to keep the unused code to make clear what ought to be taking place here. Jan>> + * As the mask bit is the only defined bit in the word, and as the >> + * host MSI-X code doesn''t preserve the other bits anyway, doing >> + * this is pointless. So for now just discard the write (also >> + * saving us from having to determine the matching irq_desc). >> + */
On 20/01/12 16:44, Jan Beulich wrote:>>>> On 20.01.12 at 17:39, Andrew Cooper <andrew.cooper3@citrix.com> wrote: >> On 20/01/12 16:33, Jan Beulich wrote: >>> @@ -291,9 +285,22 @@ static int msixtbl_write(struct vcpu *v, >>> if ( !virt ) >>> goto out; >>> >>> + /* Do not allow the mask bit to be changed. */ >>> +#if 0 /* XXX >> You appear to still have some debugging in this patch. > No - that''s why the comment is there. I wanted to keep the unused > code to make clear what ought to be taking place here. > > JanAh - that makes a little more sense. That was going to be my next query for clarification. I have some hardware at my disposal which can do SRIOV and PCI passthrough with MSI-X capable network cards. Any specific things you want testing, other than just a basic passthrough and verify the cards are working? ~Andrew>>> + * As the mask bit is the only defined bit in the word, and as the >>> + * host MSI-X code doesn''t preserve the other bits anyway, doing >>> + * this is pointless. So for now just discard the write (also >>> + * saving us from having to determine the matching irq_desc). >>> + */ > >-- Andrew Cooper - Dom0 Kernel Engineer, Citrix XenServer T: +44 (0)1223 225 900, http://www.citrix.com
>>> On 20.01.12 at 17:50, Andrew Cooper <andrew.cooper3@citrix.com> wrote: > I have some hardware at my disposal which can do SRIOV and PCI > passthrough with MSI-X capable network cards. > > Any specific things you want testing, other than just a basic > passthrough and verify the cards are working?No, nothing specific. Thanks for taking on this! Jan
On 20/01/12 17:01, Jan Beulich wrote:>>>> On 20.01.12 at 17:50, Andrew Cooper <andrew.cooper3@citrix.com> wrote: >> I have some hardware at my disposal which can do SRIOV and PCI >> passthrough with MSI-X capable network cards. >> >> Any specific things you want testing, other than just a basic >> passthrough and verify the cards are working? > No, nothing specific. Thanks for taking on this! > > JanNo problem. I will try to get the testing done early next week - I have other hypervisior bugs I am currently working on right at the moment. -- Andrew Cooper - Dom0 Kernel Engineer, Citrix XenServer T: +44 (0)1223 225 900, http://www.citrix.com
On 20/01/12 16:33, Jan Beulich wrote:> This addresses a number of problems in msixtbl_{read,write}(): > - address alignment was not checked, allowing for memory corruption in > the hypervisor (write case) or returning of hypervisor private data > to the guest (read case) > - the interrupt mask bit was permitted to be written by the guest > (while Xen''s interrupt flow control routines need to control it) > - MAX_MSIX_TABLE_{ENTRIES,PAGES} were pointlessly defined to plain > numbers (making it unobvious why they have these values, and making > the latter non-portable) > - MAX_MSIX_TABLE_PAGES was also off by one (failing to account for a > non-zero table offset); this was also affecting host MSI-X code > - struct msixtbl_entry''s table_flags[] was one element larger than > necessary due to improper open-coding of BITS_TO_LONGS() > - msixtbl_read() unconditionally accessed the physical table, even > though the data was only needed in a quarter of all cases > - various calculations were done unnecessarily for both of the rather > distinct code paths in msixtbl_read() > > Additionally it is unclear on what basis MAX_MSIX_ACC_ENTRIES was > chosen to be 3. > > Signed-off-by: Jan Beulich <jbeulich@suse.com>Tested by backport to XenServer 6.0 (Xen-4.1.1) The only conflicts were with the context around adding the two new header files, and <xen/pfn.h> which is a new header file. Tested-and-acked-by: Andrew Cooper <andrew.cooper3@citrix.com>> --- a/xen/arch/x86/hvm/vmsi.c > +++ b/xen/arch/x86/hvm/vmsi.c > @@ -165,7 +165,7 @@ struct msixtbl_entry > struct pci_dev *pdev; > unsigned long gtable; /* gpa of msix table */ > unsigned long table_len; > - unsigned long table_flags[MAX_MSIX_TABLE_ENTRIES / BITS_PER_LONG + 1]; > + unsigned long table_flags[BITS_TO_LONGS(MAX_MSIX_TABLE_ENTRIES)]; > #define MAX_MSIX_ACC_ENTRIES 3 > struct { > uint32_t msi_ad[3]; /* Shadow of address low, high and data */ > @@ -192,17 +192,14 @@ static struct msixtbl_entry *msixtbl_fin > static void __iomem *msixtbl_addr_to_virt( > struct msixtbl_entry *entry, unsigned long addr) > { > - int idx, nr_page; > + unsigned int idx, nr_page; > > - if ( !entry ) > + if ( !entry || !entry->pdev ) > return NULL; > > nr_page = (addr >> PAGE_SHIFT) - > (entry->gtable >> PAGE_SHIFT); > > - if ( !entry->pdev ) > - return NULL; > - > idx = entry->pdev->msix_table_idx[nr_page]; > if ( !idx ) > return NULL; > @@ -215,37 +212,34 @@ static int msixtbl_read( > struct vcpu *v, unsigned long address, > unsigned long len, unsigned long *pval) > { > - unsigned long offset, val; > + unsigned long offset; > struct msixtbl_entry *entry; > void *virt; > - int nr_entry, index; > + unsigned int nr_entry, index; > int r = X86EMUL_UNHANDLEABLE; > > - rcu_read_lock(&msixtbl_rcu_lock); > + if ( len != 4 || (address & 3) ) > + return r; > > - if ( len != 4 ) > - goto out; > + rcu_read_lock(&msixtbl_rcu_lock); > > entry = msixtbl_find_entry(v, address); > - virt = msixtbl_addr_to_virt(entry, address); > - if ( !virt ) > - goto out; > - > - nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE; > offset = address & (PCI_MSIX_ENTRY_SIZE - 1); > - if ( nr_entry >= MAX_MSIX_ACC_ENTRIES && > - offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET ) > - goto out; > > - val = readl(virt); > if ( offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET ) > { > + nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE; > + if ( nr_entry >= MAX_MSIX_ACC_ENTRIES ) > + goto out; > index = offset / sizeof(uint32_t); > *pval = entry->gentries[nr_entry].msi_ad[index]; > } > else > { > - *pval = val; > + virt = msixtbl_addr_to_virt(entry, address); > + if ( !virt ) > + goto out; > + *pval = readl(virt); > } > > r = X86EMUL_OKAY; > @@ -260,13 +254,13 @@ static int msixtbl_write(struct vcpu *v, > unsigned long offset; > struct msixtbl_entry *entry; > void *virt; > - int nr_entry, index; > + unsigned int nr_entry, index; > int r = X86EMUL_UNHANDLEABLE; > > - rcu_read_lock(&msixtbl_rcu_lock); > + if ( len != 4 || (address & 3) ) > + return r; > > - if ( len != 4 ) > - goto out; > + rcu_read_lock(&msixtbl_rcu_lock); > > entry = msixtbl_find_entry(v, address); > nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE; > @@ -291,9 +285,22 @@ static int msixtbl_write(struct vcpu *v, > if ( !virt ) > goto out; > > + /* Do not allow the mask bit to be changed. */ > +#if 0 /* XXX > + * As the mask bit is the only defined bit in the word, and as the > + * host MSI-X code doesn''t preserve the other bits anyway, doing > + * this is pointless. So for now just discard the write (also > + * saving us from having to determine the matching irq_desc). > + */ > + spin_lock_irqsave(&desc->lock, flags); > + orig = readl(virt); > + val &= ~PCI_MSIX_VECTOR_BITMASK; > + val |= orig & PCI_MSIX_VECTOR_BITMASK; > writel(val, virt); > - r = X86EMUL_OKAY; > + spin_unlock_irqrestore(&desc->lock, flags); > +#endif > > + r = X86EMUL_OKAY; > out: > rcu_read_unlock(&msixtbl_rcu_lock); > return r; > --- a/xen/include/asm-x86/msi.h > +++ b/xen/include/asm-x86/msi.h > @@ -122,12 +122,6 @@ int msi_free_irq(struct msi_desc *entry) > */ > #define NR_HP_RESERVED_VECTORS 20 > > -#define PCI_MSIX_ENTRY_SIZE 16 > -#define PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET 0 > -#define PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET 4 > -#define PCI_MSIX_ENTRY_DATA_OFFSET 8 > -#define PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET 12 > - > #define msi_control_reg(base) (base + PCI_MSI_FLAGS) > #define msi_lower_address_reg(base) (base + PCI_MSI_ADDRESS_LO) > #define msi_upper_address_reg(base) (base + PCI_MSI_ADDRESS_HI) > --- a/xen/include/xen/pci.h > +++ b/xen/include/xen/pci.h > @@ -11,6 +11,8 @@ > #include <xen/list.h> > #include <xen/spinlock.h> > #include <xen/irq.h> > +#include <xen/pci_regs.h> > +#include <xen/pfn.h> > #include <asm/pci.h> > > /* > @@ -30,8 +32,10 @@ > #define PCI_BDF(b,d,f) ((((b) & 0xff) << 8) | PCI_DEVFN(d,f)) > #define PCI_BDF2(b,df) ((((b) & 0xff) << 8) | ((df) & 0xff)) > > -#define MAX_MSIX_TABLE_ENTRIES 2048 > -#define MAX_MSIX_TABLE_PAGES 8 > +#define MAX_MSIX_TABLE_ENTRIES (PCI_MSIX_FLAGS_QSIZE + 1) > +#define MAX_MSIX_TABLE_PAGES PFN_UP(MAX_MSIX_TABLE_ENTRIES * \ > + PCI_MSIX_ENTRY_SIZE + \ > + (~PCI_MSIX_BIRMASK & (PAGE_SIZE - 1))) > struct pci_dev_info { > bool_t is_extfn; > bool_t is_virtfn; > --- a/xen/include/xen/pci_regs.h > +++ b/xen/include/xen/pci_regs.h > @@ -305,6 +305,12 @@ > #define PCI_MSIX_PBA 8 > #define PCI_MSIX_BIRMASK (7 << 0) > > +#define PCI_MSIX_ENTRY_SIZE 16 > +#define PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET 0 > +#define PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET 4 > +#define PCI_MSIX_ENTRY_DATA_OFFSET 8 > +#define PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET 12 > + > #define PCI_MSIX_VECTOR_BITMASK (1 << 0) > > /* CompactPCI Hotswap Register */ > >-- Andrew Cooper - Dom0 Kernel Engineer, Citrix XenServer T: +44 (0)1223 225 900, http://www.citrix.com
>>> On 24.01.12 at 16:43, Andrew Cooper <andrew.cooper3@citrix.com> wrote: > On 20/01/12 16:33, Jan Beulich wrote: >> This addresses a number of problems in msixtbl_{read,write}(): >> - address alignment was not checked, allowing for memory corruption in >> the hypervisor (write case) or returning of hypervisor private data >> to the guest (read case) >> - the interrupt mask bit was permitted to be written by the guest >> (while Xen''s interrupt flow control routines need to control it) >> - MAX_MSIX_TABLE_{ENTRIES,PAGES} were pointlessly defined to plain >> numbers (making it unobvious why they have these values, and making >> the latter non-portable) >> - MAX_MSIX_TABLE_PAGES was also off by one (failing to account for a >> non-zero table offset); this was also affecting host MSI-X code >> - struct msixtbl_entry''s table_flags[] was one element larger than >> necessary due to improper open-coding of BITS_TO_LONGS() >> - msixtbl_read() unconditionally accessed the physical table, even >> though the data was only needed in a quarter of all cases >> - various calculations were done unnecessarily for both of the rather >> distinct code paths in msixtbl_read() >> >> Additionally it is unclear on what basis MAX_MSIX_ACC_ENTRIES was >> chosen to be 3. >> >> Signed-off-by: Jan Beulich <jbeulich@suse.com> > > Tested by backport to XenServer 6.0 (Xen-4.1.1) The only conflicts were > with the context around adding the two new header files, and <xen/pfn.h> > which is a new header file. > > Tested-and-acked-by: Andrew Cooper <andrew.cooper3@citrix.com>Thanks a lot, Andrew! (Keir had already committed the patch.) Jan
We''ve tested the commit using standard NIC and SR-IOV NIC. It works fine. Shan Haitao 2012/1/25 Jan Beulich <JBeulich@suse.com>:>>>> On 24.01.12 at 16:43, Andrew Cooper <andrew.cooper3@citrix.com> wrote: >> On 20/01/12 16:33, Jan Beulich wrote: >>> This addresses a number of problems in msixtbl_{read,write}(): >>> - address alignment was not checked, allowing for memory corruption in >>> the hypervisor (write case) or returning of hypervisor private data >>> to the guest (read case) >>> - the interrupt mask bit was permitted to be written by the guest >>> (while Xen''s interrupt flow control routines need to control it) >>> - MAX_MSIX_TABLE_{ENTRIES,PAGES} were pointlessly defined to plain >>> numbers (making it unobvious why they have these values, and making >>> the latter non-portable) >>> - MAX_MSIX_TABLE_PAGES was also off by one (failing to account for a >>> non-zero table offset); this was also affecting host MSI-X code >>> - struct msixtbl_entry''s table_flags[] was one element larger than >>> necessary due to improper open-coding of BITS_TO_LONGS() >>> - msixtbl_read() unconditionally accessed the physical table, even >>> though the data was only needed in a quarter of all cases >>> - various calculations were done unnecessarily for both of the rather >>> distinct code paths in msixtbl_read() >>> >>> Additionally it is unclear on what basis MAX_MSIX_ACC_ENTRIES was >>> chosen to be 3. >>> >>> Signed-off-by: Jan Beulich <jbeulich@suse.com> >> >> Tested by backport to XenServer 6.0 (Xen-4.1.1) The only conflicts were >> with the context around adding the two new header files, and <xen/pfn.h> >> which is a new header file. >> >> Tested-and-acked-by: Andrew Cooper <andrew.cooper3@citrix.com> > > Thanks a lot, Andrew! (Keir had already committed the patch.) > > Jan > > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xensource.com > http://lists.xensource.com/xen-devel