This addresses a number of problems in msixtbl_{read,write}(): - address alignment was not checked, allowing for memory corruption in the hypervisor (write case) or returning of hypervisor private data to the guest (read case) - the interrupt mask bit was permitted to be written by the guest (while Xen''s interrupt flow control routines need to control it) - MAX_MSIX_TABLE_{ENTRIES,PAGES} were pointlessly defined to plain numbers (making it unobvious why they have these values, and making the latter non-portable) - MAX_MSIX_TABLE_PAGES was also off by one (failing to account for a non-zero table offset); this was also affecting host MSI-X code - struct msixtbl_entry''s table_flags[] was one element larger than necessary due to improper open-coding of BITS_TO_LONGS() - msixtbl_read() unconditionally accessed the physical table, even though the data was only needed in a quarter of all cases - various calculations were done unnecessarily for both of the rather distinct code paths in msixtbl_read() Additionally it is unclear on what basis MAX_MSIX_ACC_ENTRIES was chosen to be 3. Signed-off-by: Jan Beulich <jbeulich@suse.com> --- a/xen/arch/x86/hvm/vmsi.c +++ b/xen/arch/x86/hvm/vmsi.c @@ -165,7 +165,7 @@ struct msixtbl_entry struct pci_dev *pdev; unsigned long gtable; /* gpa of msix table */ unsigned long table_len; - unsigned long table_flags[MAX_MSIX_TABLE_ENTRIES / BITS_PER_LONG + 1]; + unsigned long table_flags[BITS_TO_LONGS(MAX_MSIX_TABLE_ENTRIES)]; #define MAX_MSIX_ACC_ENTRIES 3 struct { uint32_t msi_ad[3]; /* Shadow of address low, high and data */ @@ -192,17 +192,14 @@ static struct msixtbl_entry *msixtbl_fin static void __iomem *msixtbl_addr_to_virt( struct msixtbl_entry *entry, unsigned long addr) { - int idx, nr_page; + unsigned int idx, nr_page; - if ( !entry ) + if ( !entry || !entry->pdev ) return NULL; nr_page = (addr >> PAGE_SHIFT) - (entry->gtable >> PAGE_SHIFT); - if ( !entry->pdev ) - return NULL; - idx = entry->pdev->msix_table_idx[nr_page]; if ( !idx ) return NULL; @@ -215,37 +212,34 @@ static int msixtbl_read( struct vcpu *v, unsigned long address, unsigned long len, unsigned long *pval) { - unsigned long offset, val; + unsigned long offset; struct msixtbl_entry *entry; void *virt; - int nr_entry, index; + unsigned int nr_entry, index; int r = X86EMUL_UNHANDLEABLE; - rcu_read_lock(&msixtbl_rcu_lock); + if ( len != 4 || (address & 3) ) + return r; - if ( len != 4 ) - goto out; + rcu_read_lock(&msixtbl_rcu_lock); entry = msixtbl_find_entry(v, address); - virt = msixtbl_addr_to_virt(entry, address); - if ( !virt ) - goto out; - - nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE; offset = address & (PCI_MSIX_ENTRY_SIZE - 1); - if ( nr_entry >= MAX_MSIX_ACC_ENTRIES && - offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET ) - goto out; - val = readl(virt); if ( offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET ) { + nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE; + if ( nr_entry >= MAX_MSIX_ACC_ENTRIES ) + goto out; index = offset / sizeof(uint32_t); *pval = entry->gentries[nr_entry].msi_ad[index]; } else { - *pval = val; + virt = msixtbl_addr_to_virt(entry, address); + if ( !virt ) + goto out; + *pval = readl(virt); } r = X86EMUL_OKAY; @@ -260,13 +254,13 @@ static int msixtbl_write(struct vcpu *v, unsigned long offset; struct msixtbl_entry *entry; void *virt; - int nr_entry, index; + unsigned int nr_entry, index; int r = X86EMUL_UNHANDLEABLE; - rcu_read_lock(&msixtbl_rcu_lock); + if ( len != 4 || (address & 3) ) + return r; - if ( len != 4 ) - goto out; + rcu_read_lock(&msixtbl_rcu_lock); entry = msixtbl_find_entry(v, address); nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE; @@ -291,9 +285,22 @@ static int msixtbl_write(struct vcpu *v, if ( !virt ) goto out; + /* Do not allow the mask bit to be changed. */ +#if 0 /* XXX + * As the mask bit is the only defined bit in the word, and as the + * host MSI-X code doesn''t preserve the other bits anyway, doing + * this is pointless. So for now just discard the write (also + * saving us from having to determine the matching irq_desc). + */ + spin_lock_irqsave(&desc->lock, flags); + orig = readl(virt); + val &= ~PCI_MSIX_VECTOR_BITMASK; + val |= orig & PCI_MSIX_VECTOR_BITMASK; writel(val, virt); - r = X86EMUL_OKAY; + spin_unlock_irqrestore(&desc->lock, flags); +#endif + r = X86EMUL_OKAY; out: rcu_read_unlock(&msixtbl_rcu_lock); return r; --- a/xen/include/asm-x86/msi.h +++ b/xen/include/asm-x86/msi.h @@ -122,12 +122,6 @@ int msi_free_irq(struct msi_desc *entry) */ #define NR_HP_RESERVED_VECTORS 20 -#define PCI_MSIX_ENTRY_SIZE 16 -#define PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET 0 -#define PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET 4 -#define PCI_MSIX_ENTRY_DATA_OFFSET 8 -#define PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET 12 - #define msi_control_reg(base) (base + PCI_MSI_FLAGS) #define msi_lower_address_reg(base) (base + PCI_MSI_ADDRESS_LO) #define msi_upper_address_reg(base) (base + PCI_MSI_ADDRESS_HI) --- a/xen/include/xen/pci.h +++ b/xen/include/xen/pci.h @@ -11,6 +11,8 @@ #include <xen/list.h> #include <xen/spinlock.h> #include <xen/irq.h> +#include <xen/pci_regs.h> +#include <xen/pfn.h> #include <asm/pci.h> /* @@ -30,8 +32,10 @@ #define PCI_BDF(b,d,f) ((((b) & 0xff) << 8) | PCI_DEVFN(d,f)) #define PCI_BDF2(b,df) ((((b) & 0xff) << 8) | ((df) & 0xff)) -#define MAX_MSIX_TABLE_ENTRIES 2048 -#define MAX_MSIX_TABLE_PAGES 8 +#define MAX_MSIX_TABLE_ENTRIES (PCI_MSIX_FLAGS_QSIZE + 1) +#define MAX_MSIX_TABLE_PAGES PFN_UP(MAX_MSIX_TABLE_ENTRIES * \ + PCI_MSIX_ENTRY_SIZE + \ + (~PCI_MSIX_BIRMASK & (PAGE_SIZE - 1))) struct pci_dev_info { bool_t is_extfn; bool_t is_virtfn; --- a/xen/include/xen/pci_regs.h +++ b/xen/include/xen/pci_regs.h @@ -305,6 +305,12 @@ #define PCI_MSIX_PBA 8 #define PCI_MSIX_BIRMASK (7 << 0) +#define PCI_MSIX_ENTRY_SIZE 16 +#define PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET 0 +#define PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET 4 +#define PCI_MSIX_ENTRY_DATA_OFFSET 8 +#define PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET 12 + #define PCI_MSIX_VECTOR_BITMASK (1 << 0) /* CompactPCI Hotswap Register */ _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
On 20/01/12 16:33, Jan Beulich wrote:> This addresses a number of problems in msixtbl_{read,write}(): > - address alignment was not checked, allowing for memory corruption in > the hypervisor (write case) or returning of hypervisor private data > to the guest (read case) > - the interrupt mask bit was permitted to be written by the guest > (while Xen''s interrupt flow control routines need to control it) > - MAX_MSIX_TABLE_{ENTRIES,PAGES} were pointlessly defined to plain > numbers (making it unobvious why they have these values, and making > the latter non-portable) > - MAX_MSIX_TABLE_PAGES was also off by one (failing to account for a > non-zero table offset); this was also affecting host MSI-X code > - struct msixtbl_entry''s table_flags[] was one element larger than > necessary due to improper open-coding of BITS_TO_LONGS() > - msixtbl_read() unconditionally accessed the physical table, even > though the data was only needed in a quarter of all cases > - various calculations were done unnecessarily for both of the rather > distinct code paths in msixtbl_read() > > Additionally it is unclear on what basis MAX_MSIX_ACC_ENTRIES was > chosen to be 3. > > Signed-off-by: Jan Beulich <jbeulich@suse.com> > > --- a/xen/arch/x86/hvm/vmsi.c > +++ b/xen/arch/x86/hvm/vmsi.c > @@ -165,7 +165,7 @@ struct msixtbl_entry > struct pci_dev *pdev; > unsigned long gtable; /* gpa of msix table */ > unsigned long table_len; > - unsigned long table_flags[MAX_MSIX_TABLE_ENTRIES / BITS_PER_LONG + 1]; > + unsigned long table_flags[BITS_TO_LONGS(MAX_MSIX_TABLE_ENTRIES)]; > #define MAX_MSIX_ACC_ENTRIES 3 > struct { > uint32_t msi_ad[3]; /* Shadow of address low, high and data */ > @@ -192,17 +192,14 @@ static struct msixtbl_entry *msixtbl_fin > static void __iomem *msixtbl_addr_to_virt( > struct msixtbl_entry *entry, unsigned long addr) > { > - int idx, nr_page; > + unsigned int idx, nr_page; > > - if ( !entry ) > + if ( !entry || !entry->pdev ) > return NULL; > > nr_page = (addr >> PAGE_SHIFT) - > (entry->gtable >> PAGE_SHIFT); > > - if ( !entry->pdev ) > - return NULL; > - > idx = entry->pdev->msix_table_idx[nr_page]; > if ( !idx ) > return NULL; > @@ -215,37 +212,34 @@ static int msixtbl_read( > struct vcpu *v, unsigned long address, > unsigned long len, unsigned long *pval) > { > - unsigned long offset, val; > + unsigned long offset; > struct msixtbl_entry *entry; > void *virt; > - int nr_entry, index; > + unsigned int nr_entry, index; > int r = X86EMUL_UNHANDLEABLE; > > - rcu_read_lock(&msixtbl_rcu_lock); > + if ( len != 4 || (address & 3) ) > + return r; > > - if ( len != 4 ) > - goto out; > + rcu_read_lock(&msixtbl_rcu_lock); > > entry = msixtbl_find_entry(v, address); > - virt = msixtbl_addr_to_virt(entry, address); > - if ( !virt ) > - goto out; > - > - nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE; > offset = address & (PCI_MSIX_ENTRY_SIZE - 1); > - if ( nr_entry >= MAX_MSIX_ACC_ENTRIES && > - offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET ) > - goto out; > > - val = readl(virt); > if ( offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET ) > { > + nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE; > + if ( nr_entry >= MAX_MSIX_ACC_ENTRIES ) > + goto out; > index = offset / sizeof(uint32_t); > *pval = entry->gentries[nr_entry].msi_ad[index]; > } > else > { > - *pval = val; > + virt = msixtbl_addr_to_virt(entry, address); > + if ( !virt ) > + goto out; > + *pval = readl(virt); > } > > r = X86EMUL_OKAY; > @@ -260,13 +254,13 @@ static int msixtbl_write(struct vcpu *v, > unsigned long offset; > struct msixtbl_entry *entry; > void *virt; > - int nr_entry, index; > + unsigned int nr_entry, index; > int r = X86EMUL_UNHANDLEABLE; > > - rcu_read_lock(&msixtbl_rcu_lock); > + if ( len != 4 || (address & 3) ) > + return r; > > - if ( len != 4 ) > - goto out; > + rcu_read_lock(&msixtbl_rcu_lock); > > entry = msixtbl_find_entry(v, address); > nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE; > @@ -291,9 +285,22 @@ static int msixtbl_write(struct vcpu *v, > if ( !virt ) > goto out; > > + /* Do not allow the mask bit to be changed. */ > +#if 0 /* XXXYou appear to still have some debugging in this patch. ~Andrew> + * As the mask bit is the only defined bit in the word, and as the > + * host MSI-X code doesn''t preserve the other bits anyway, doing > + * this is pointless. So for now just discard the write (also > + * saving us from having to determine the matching irq_desc). > + */ > + spin_lock_irqsave(&desc->lock, flags); > + orig = readl(virt); > + val &= ~PCI_MSIX_VECTOR_BITMASK; > + val |= orig & PCI_MSIX_VECTOR_BITMASK; > writel(val, virt); > - r = X86EMUL_OKAY; > + spin_unlock_irqrestore(&desc->lock, flags); > +#endif > > + r = X86EMUL_OKAY; > out: > rcu_read_unlock(&msixtbl_rcu_lock); > return r; > --- a/xen/include/asm-x86/msi.h > +++ b/xen/include/asm-x86/msi.h > @@ -122,12 +122,6 @@ int msi_free_irq(struct msi_desc *entry) > */ > #define NR_HP_RESERVED_VECTORS 20 > > -#define PCI_MSIX_ENTRY_SIZE 16 > -#define PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET 0 > -#define PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET 4 > -#define PCI_MSIX_ENTRY_DATA_OFFSET 8 > -#define PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET 12 > - > #define msi_control_reg(base) (base + PCI_MSI_FLAGS) > #define msi_lower_address_reg(base) (base + PCI_MSI_ADDRESS_LO) > #define msi_upper_address_reg(base) (base + PCI_MSI_ADDRESS_HI) > --- a/xen/include/xen/pci.h > +++ b/xen/include/xen/pci.h > @@ -11,6 +11,8 @@ > #include <xen/list.h> > #include <xen/spinlock.h> > #include <xen/irq.h> > +#include <xen/pci_regs.h> > +#include <xen/pfn.h> > #include <asm/pci.h> > > /* > @@ -30,8 +32,10 @@ > #define PCI_BDF(b,d,f) ((((b) & 0xff) << 8) | PCI_DEVFN(d,f)) > #define PCI_BDF2(b,df) ((((b) & 0xff) << 8) | ((df) & 0xff)) > > -#define MAX_MSIX_TABLE_ENTRIES 2048 > -#define MAX_MSIX_TABLE_PAGES 8 > +#define MAX_MSIX_TABLE_ENTRIES (PCI_MSIX_FLAGS_QSIZE + 1) > +#define MAX_MSIX_TABLE_PAGES PFN_UP(MAX_MSIX_TABLE_ENTRIES * \ > + PCI_MSIX_ENTRY_SIZE + \ > + (~PCI_MSIX_BIRMASK & (PAGE_SIZE - 1))) > struct pci_dev_info { > bool_t is_extfn; > bool_t is_virtfn; > --- a/xen/include/xen/pci_regs.h > +++ b/xen/include/xen/pci_regs.h > @@ -305,6 +305,12 @@ > #define PCI_MSIX_PBA 8 > #define PCI_MSIX_BIRMASK (7 << 0) > > +#define PCI_MSIX_ENTRY_SIZE 16 > +#define PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET 0 > +#define PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET 4 > +#define PCI_MSIX_ENTRY_DATA_OFFSET 8 > +#define PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET 12 > + > #define PCI_MSIX_VECTOR_BITMASK (1 << 0) > > /* CompactPCI Hotswap Register */ > >-- Andrew Cooper - Dom0 Kernel Engineer, Citrix XenServer T: +44 (0)1223 225 900, http://www.citrix.com
>>> On 20.01.12 at 17:33, "Jan Beulich" <JBeulich@suse.com> wrote: > This addresses a number of problems in msixtbl_{read,write}(): > - address alignment was not checked, allowing for memory corruption in > the hypervisor (write case) or returning of hypervisor private data > to the guest (read case) > - the interrupt mask bit was permitted to be written by the guest > (while Xen''s interrupt flow control routines need to control it) > - MAX_MSIX_TABLE_{ENTRIES,PAGES} were pointlessly defined to plain > numbers (making it unobvious why they have these values, and making > the latter non-portable) > - MAX_MSIX_TABLE_PAGES was also off by one (failing to account for a > non-zero table offset); this was also affecting host MSI-X code > - struct msixtbl_entry''s table_flags[] was one element larger than > necessary due to improper open-coding of BITS_TO_LONGS() > - msixtbl_read() unconditionally accessed the physical table, even > though the data was only needed in a quarter of all cases > - various calculations were done unnecessarily for both of the rather > distinct code paths in msixtbl_read()I should note (as done in the earlier response to the 4.2 todo thread) that I have no way of actually testing this, so I''m relying on others here. (Intel? Or, Ian, would the stage tester find issues with passed through MSI-X capable devices?) Jan> Additionally it is unclear on what basis MAX_MSIX_ACC_ENTRIES was > chosen to be 3. > > Signed-off-by: Jan Beulich <jbeulich@suse.com> > > --- a/xen/arch/x86/hvm/vmsi.c > +++ b/xen/arch/x86/hvm/vmsi.c > @@ -165,7 +165,7 @@ struct msixtbl_entry > struct pci_dev *pdev; > unsigned long gtable; /* gpa of msix table */ > unsigned long table_len; > - unsigned long table_flags[MAX_MSIX_TABLE_ENTRIES / BITS_PER_LONG + 1]; > + unsigned long table_flags[BITS_TO_LONGS(MAX_MSIX_TABLE_ENTRIES)]; > #define MAX_MSIX_ACC_ENTRIES 3 > struct { > uint32_t msi_ad[3]; /* Shadow of address low, high and data */ > @@ -192,17 +192,14 @@ static struct msixtbl_entry *msixtbl_fin > static void __iomem *msixtbl_addr_to_virt( > struct msixtbl_entry *entry, unsigned long addr) > { > - int idx, nr_page; > + unsigned int idx, nr_page; > > - if ( !entry ) > + if ( !entry || !entry->pdev ) > return NULL; > > nr_page = (addr >> PAGE_SHIFT) - > (entry->gtable >> PAGE_SHIFT); > > - if ( !entry->pdev ) > - return NULL; > - > idx = entry->pdev->msix_table_idx[nr_page]; > if ( !idx ) > return NULL; > @@ -215,37 +212,34 @@ static int msixtbl_read( > struct vcpu *v, unsigned long address, > unsigned long len, unsigned long *pval) > { > - unsigned long offset, val; > + unsigned long offset; > struct msixtbl_entry *entry; > void *virt; > - int nr_entry, index; > + unsigned int nr_entry, index; > int r = X86EMUL_UNHANDLEABLE; > > - rcu_read_lock(&msixtbl_rcu_lock); > + if ( len != 4 || (address & 3) ) > + return r; > > - if ( len != 4 ) > - goto out; > + rcu_read_lock(&msixtbl_rcu_lock); > > entry = msixtbl_find_entry(v, address); > - virt = msixtbl_addr_to_virt(entry, address); > - if ( !virt ) > - goto out; > - > - nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE; > offset = address & (PCI_MSIX_ENTRY_SIZE - 1); > - if ( nr_entry >= MAX_MSIX_ACC_ENTRIES && > - offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET ) > - goto out; > > - val = readl(virt); > if ( offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET ) > { > + nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE; > + if ( nr_entry >= MAX_MSIX_ACC_ENTRIES ) > + goto out; > index = offset / sizeof(uint32_t); > *pval = entry->gentries[nr_entry].msi_ad[index]; > } > else > { > - *pval = val; > + virt = msixtbl_addr_to_virt(entry, address); > + if ( !virt ) > + goto out; > + *pval = readl(virt); > } > > r = X86EMUL_OKAY; > @@ -260,13 +254,13 @@ static int msixtbl_write(struct vcpu *v, > unsigned long offset; > struct msixtbl_entry *entry; > void *virt; > - int nr_entry, index; > + unsigned int nr_entry, index; > int r = X86EMUL_UNHANDLEABLE; > > - rcu_read_lock(&msixtbl_rcu_lock); > + if ( len != 4 || (address & 3) ) > + return r; > > - if ( len != 4 ) > - goto out; > + rcu_read_lock(&msixtbl_rcu_lock); > > entry = msixtbl_find_entry(v, address); > nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE; > @@ -291,9 +285,22 @@ static int msixtbl_write(struct vcpu *v, > if ( !virt ) > goto out; > > + /* Do not allow the mask bit to be changed. */ > +#if 0 /* XXX > + * As the mask bit is the only defined bit in the word, and as the > + * host MSI-X code doesn''t preserve the other bits anyway, doing > + * this is pointless. So for now just discard the write (also > + * saving us from having to determine the matching irq_desc). > + */ > + spin_lock_irqsave(&desc->lock, flags); > + orig = readl(virt); > + val &= ~PCI_MSIX_VECTOR_BITMASK; > + val |= orig & PCI_MSIX_VECTOR_BITMASK; > writel(val, virt); > - r = X86EMUL_OKAY; > + spin_unlock_irqrestore(&desc->lock, flags); > +#endif > > + r = X86EMUL_OKAY; > out: > rcu_read_unlock(&msixtbl_rcu_lock); > return r; > --- a/xen/include/asm-x86/msi.h > +++ b/xen/include/asm-x86/msi.h > @@ -122,12 +122,6 @@ int msi_free_irq(struct msi_desc *entry) > */ > #define NR_HP_RESERVED_VECTORS 20 > > -#define PCI_MSIX_ENTRY_SIZE 16 > -#define PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET 0 > -#define PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET 4 > -#define PCI_MSIX_ENTRY_DATA_OFFSET 8 > -#define PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET 12 > - > #define msi_control_reg(base) (base + PCI_MSI_FLAGS) > #define msi_lower_address_reg(base) (base + PCI_MSI_ADDRESS_LO) > #define msi_upper_address_reg(base) (base + PCI_MSI_ADDRESS_HI) > --- a/xen/include/xen/pci.h > +++ b/xen/include/xen/pci.h > @@ -11,6 +11,8 @@ > #include <xen/list.h> > #include <xen/spinlock.h> > #include <xen/irq.h> > +#include <xen/pci_regs.h> > +#include <xen/pfn.h> > #include <asm/pci.h> > > /* > @@ -30,8 +32,10 @@ > #define PCI_BDF(b,d,f) ((((b) & 0xff) << 8) | PCI_DEVFN(d,f)) > #define PCI_BDF2(b,df) ((((b) & 0xff) << 8) | ((df) & 0xff)) > > -#define MAX_MSIX_TABLE_ENTRIES 2048 > -#define MAX_MSIX_TABLE_PAGES 8 > +#define MAX_MSIX_TABLE_ENTRIES (PCI_MSIX_FLAGS_QSIZE + 1) > +#define MAX_MSIX_TABLE_PAGES PFN_UP(MAX_MSIX_TABLE_ENTRIES * \ > + PCI_MSIX_ENTRY_SIZE + \ > + (~PCI_MSIX_BIRMASK & (PAGE_SIZE - 1))) > struct pci_dev_info { > bool_t is_extfn; > bool_t is_virtfn; > --- a/xen/include/xen/pci_regs.h > +++ b/xen/include/xen/pci_regs.h > @@ -305,6 +305,12 @@ > #define PCI_MSIX_PBA 8 > #define PCI_MSIX_BIRMASK (7 << 0) > > +#define PCI_MSIX_ENTRY_SIZE 16 > +#define PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET 0 > +#define PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET 4 > +#define PCI_MSIX_ENTRY_DATA_OFFSET 8 > +#define PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET 12 > + > #define PCI_MSIX_VECTOR_BITMASK (1 << 0) > > /* CompactPCI Hotswap Register */
>>> On 20.01.12 at 17:39, Andrew Cooper <andrew.cooper3@citrix.com> wrote: > On 20/01/12 16:33, Jan Beulich wrote: >> @@ -291,9 +285,22 @@ static int msixtbl_write(struct vcpu *v, >> if ( !virt ) >> goto out; >> >> + /* Do not allow the mask bit to be changed. */ >> +#if 0 /* XXX > > You appear to still have some debugging in this patch.No - that''s why the comment is there. I wanted to keep the unused code to make clear what ought to be taking place here. Jan>> + * As the mask bit is the only defined bit in the word, and as the >> + * host MSI-X code doesn''t preserve the other bits anyway, doing >> + * this is pointless. So for now just discard the write (also >> + * saving us from having to determine the matching irq_desc). >> + */
On 20/01/12 16:44, Jan Beulich wrote:>>>> On 20.01.12 at 17:39, Andrew Cooper <andrew.cooper3@citrix.com> wrote: >> On 20/01/12 16:33, Jan Beulich wrote: >>> @@ -291,9 +285,22 @@ static int msixtbl_write(struct vcpu *v, >>> if ( !virt ) >>> goto out; >>> >>> + /* Do not allow the mask bit to be changed. */ >>> +#if 0 /* XXX >> You appear to still have some debugging in this patch. > No - that''s why the comment is there. I wanted to keep the unused > code to make clear what ought to be taking place here. > > JanAh - that makes a little more sense. That was going to be my next query for clarification. I have some hardware at my disposal which can do SRIOV and PCI passthrough with MSI-X capable network cards. Any specific things you want testing, other than just a basic passthrough and verify the cards are working? ~Andrew>>> + * As the mask bit is the only defined bit in the word, and as the >>> + * host MSI-X code doesn''t preserve the other bits anyway, doing >>> + * this is pointless. So for now just discard the write (also >>> + * saving us from having to determine the matching irq_desc). >>> + */ > >-- Andrew Cooper - Dom0 Kernel Engineer, Citrix XenServer T: +44 (0)1223 225 900, http://www.citrix.com
>>> On 20.01.12 at 17:50, Andrew Cooper <andrew.cooper3@citrix.com> wrote: > I have some hardware at my disposal which can do SRIOV and PCI > passthrough with MSI-X capable network cards. > > Any specific things you want testing, other than just a basic > passthrough and verify the cards are working?No, nothing specific. Thanks for taking on this! Jan
On 20/01/12 17:01, Jan Beulich wrote:>>>> On 20.01.12 at 17:50, Andrew Cooper <andrew.cooper3@citrix.com> wrote: >> I have some hardware at my disposal which can do SRIOV and PCI >> passthrough with MSI-X capable network cards. >> >> Any specific things you want testing, other than just a basic >> passthrough and verify the cards are working? > No, nothing specific. Thanks for taking on this! > > JanNo problem. I will try to get the testing done early next week - I have other hypervisior bugs I am currently working on right at the moment. -- Andrew Cooper - Dom0 Kernel Engineer, Citrix XenServer T: +44 (0)1223 225 900, http://www.citrix.com
On 20/01/12 16:33, Jan Beulich wrote:> This addresses a number of problems in msixtbl_{read,write}(): > - address alignment was not checked, allowing for memory corruption in > the hypervisor (write case) or returning of hypervisor private data > to the guest (read case) > - the interrupt mask bit was permitted to be written by the guest > (while Xen''s interrupt flow control routines need to control it) > - MAX_MSIX_TABLE_{ENTRIES,PAGES} were pointlessly defined to plain > numbers (making it unobvious why they have these values, and making > the latter non-portable) > - MAX_MSIX_TABLE_PAGES was also off by one (failing to account for a > non-zero table offset); this was also affecting host MSI-X code > - struct msixtbl_entry''s table_flags[] was one element larger than > necessary due to improper open-coding of BITS_TO_LONGS() > - msixtbl_read() unconditionally accessed the physical table, even > though the data was only needed in a quarter of all cases > - various calculations were done unnecessarily for both of the rather > distinct code paths in msixtbl_read() > > Additionally it is unclear on what basis MAX_MSIX_ACC_ENTRIES was > chosen to be 3. > > Signed-off-by: Jan Beulich <jbeulich@suse.com>Tested by backport to XenServer 6.0 (Xen-4.1.1) The only conflicts were with the context around adding the two new header files, and <xen/pfn.h> which is a new header file. Tested-and-acked-by: Andrew Cooper <andrew.cooper3@citrix.com>> --- a/xen/arch/x86/hvm/vmsi.c > +++ b/xen/arch/x86/hvm/vmsi.c > @@ -165,7 +165,7 @@ struct msixtbl_entry > struct pci_dev *pdev; > unsigned long gtable; /* gpa of msix table */ > unsigned long table_len; > - unsigned long table_flags[MAX_MSIX_TABLE_ENTRIES / BITS_PER_LONG + 1]; > + unsigned long table_flags[BITS_TO_LONGS(MAX_MSIX_TABLE_ENTRIES)]; > #define MAX_MSIX_ACC_ENTRIES 3 > struct { > uint32_t msi_ad[3]; /* Shadow of address low, high and data */ > @@ -192,17 +192,14 @@ static struct msixtbl_entry *msixtbl_fin > static void __iomem *msixtbl_addr_to_virt( > struct msixtbl_entry *entry, unsigned long addr) > { > - int idx, nr_page; > + unsigned int idx, nr_page; > > - if ( !entry ) > + if ( !entry || !entry->pdev ) > return NULL; > > nr_page = (addr >> PAGE_SHIFT) - > (entry->gtable >> PAGE_SHIFT); > > - if ( !entry->pdev ) > - return NULL; > - > idx = entry->pdev->msix_table_idx[nr_page]; > if ( !idx ) > return NULL; > @@ -215,37 +212,34 @@ static int msixtbl_read( > struct vcpu *v, unsigned long address, > unsigned long len, unsigned long *pval) > { > - unsigned long offset, val; > + unsigned long offset; > struct msixtbl_entry *entry; > void *virt; > - int nr_entry, index; > + unsigned int nr_entry, index; > int r = X86EMUL_UNHANDLEABLE; > > - rcu_read_lock(&msixtbl_rcu_lock); > + if ( len != 4 || (address & 3) ) > + return r; > > - if ( len != 4 ) > - goto out; > + rcu_read_lock(&msixtbl_rcu_lock); > > entry = msixtbl_find_entry(v, address); > - virt = msixtbl_addr_to_virt(entry, address); > - if ( !virt ) > - goto out; > - > - nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE; > offset = address & (PCI_MSIX_ENTRY_SIZE - 1); > - if ( nr_entry >= MAX_MSIX_ACC_ENTRIES && > - offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET ) > - goto out; > > - val = readl(virt); > if ( offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET ) > { > + nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE; > + if ( nr_entry >= MAX_MSIX_ACC_ENTRIES ) > + goto out; > index = offset / sizeof(uint32_t); > *pval = entry->gentries[nr_entry].msi_ad[index]; > } > else > { > - *pval = val; > + virt = msixtbl_addr_to_virt(entry, address); > + if ( !virt ) > + goto out; > + *pval = readl(virt); > } > > r = X86EMUL_OKAY; > @@ -260,13 +254,13 @@ static int msixtbl_write(struct vcpu *v, > unsigned long offset; > struct msixtbl_entry *entry; > void *virt; > - int nr_entry, index; > + unsigned int nr_entry, index; > int r = X86EMUL_UNHANDLEABLE; > > - rcu_read_lock(&msixtbl_rcu_lock); > + if ( len != 4 || (address & 3) ) > + return r; > > - if ( len != 4 ) > - goto out; > + rcu_read_lock(&msixtbl_rcu_lock); > > entry = msixtbl_find_entry(v, address); > nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE; > @@ -291,9 +285,22 @@ static int msixtbl_write(struct vcpu *v, > if ( !virt ) > goto out; > > + /* Do not allow the mask bit to be changed. */ > +#if 0 /* XXX > + * As the mask bit is the only defined bit in the word, and as the > + * host MSI-X code doesn''t preserve the other bits anyway, doing > + * this is pointless. So for now just discard the write (also > + * saving us from having to determine the matching irq_desc). > + */ > + spin_lock_irqsave(&desc->lock, flags); > + orig = readl(virt); > + val &= ~PCI_MSIX_VECTOR_BITMASK; > + val |= orig & PCI_MSIX_VECTOR_BITMASK; > writel(val, virt); > - r = X86EMUL_OKAY; > + spin_unlock_irqrestore(&desc->lock, flags); > +#endif > > + r = X86EMUL_OKAY; > out: > rcu_read_unlock(&msixtbl_rcu_lock); > return r; > --- a/xen/include/asm-x86/msi.h > +++ b/xen/include/asm-x86/msi.h > @@ -122,12 +122,6 @@ int msi_free_irq(struct msi_desc *entry) > */ > #define NR_HP_RESERVED_VECTORS 20 > > -#define PCI_MSIX_ENTRY_SIZE 16 > -#define PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET 0 > -#define PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET 4 > -#define PCI_MSIX_ENTRY_DATA_OFFSET 8 > -#define PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET 12 > - > #define msi_control_reg(base) (base + PCI_MSI_FLAGS) > #define msi_lower_address_reg(base) (base + PCI_MSI_ADDRESS_LO) > #define msi_upper_address_reg(base) (base + PCI_MSI_ADDRESS_HI) > --- a/xen/include/xen/pci.h > +++ b/xen/include/xen/pci.h > @@ -11,6 +11,8 @@ > #include <xen/list.h> > #include <xen/spinlock.h> > #include <xen/irq.h> > +#include <xen/pci_regs.h> > +#include <xen/pfn.h> > #include <asm/pci.h> > > /* > @@ -30,8 +32,10 @@ > #define PCI_BDF(b,d,f) ((((b) & 0xff) << 8) | PCI_DEVFN(d,f)) > #define PCI_BDF2(b,df) ((((b) & 0xff) << 8) | ((df) & 0xff)) > > -#define MAX_MSIX_TABLE_ENTRIES 2048 > -#define MAX_MSIX_TABLE_PAGES 8 > +#define MAX_MSIX_TABLE_ENTRIES (PCI_MSIX_FLAGS_QSIZE + 1) > +#define MAX_MSIX_TABLE_PAGES PFN_UP(MAX_MSIX_TABLE_ENTRIES * \ > + PCI_MSIX_ENTRY_SIZE + \ > + (~PCI_MSIX_BIRMASK & (PAGE_SIZE - 1))) > struct pci_dev_info { > bool_t is_extfn; > bool_t is_virtfn; > --- a/xen/include/xen/pci_regs.h > +++ b/xen/include/xen/pci_regs.h > @@ -305,6 +305,12 @@ > #define PCI_MSIX_PBA 8 > #define PCI_MSIX_BIRMASK (7 << 0) > > +#define PCI_MSIX_ENTRY_SIZE 16 > +#define PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET 0 > +#define PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET 4 > +#define PCI_MSIX_ENTRY_DATA_OFFSET 8 > +#define PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET 12 > + > #define PCI_MSIX_VECTOR_BITMASK (1 << 0) > > /* CompactPCI Hotswap Register */ > >-- Andrew Cooper - Dom0 Kernel Engineer, Citrix XenServer T: +44 (0)1223 225 900, http://www.citrix.com
>>> On 24.01.12 at 16:43, Andrew Cooper <andrew.cooper3@citrix.com> wrote: > On 20/01/12 16:33, Jan Beulich wrote: >> This addresses a number of problems in msixtbl_{read,write}(): >> - address alignment was not checked, allowing for memory corruption in >> the hypervisor (write case) or returning of hypervisor private data >> to the guest (read case) >> - the interrupt mask bit was permitted to be written by the guest >> (while Xen''s interrupt flow control routines need to control it) >> - MAX_MSIX_TABLE_{ENTRIES,PAGES} were pointlessly defined to plain >> numbers (making it unobvious why they have these values, and making >> the latter non-portable) >> - MAX_MSIX_TABLE_PAGES was also off by one (failing to account for a >> non-zero table offset); this was also affecting host MSI-X code >> - struct msixtbl_entry''s table_flags[] was one element larger than >> necessary due to improper open-coding of BITS_TO_LONGS() >> - msixtbl_read() unconditionally accessed the physical table, even >> though the data was only needed in a quarter of all cases >> - various calculations were done unnecessarily for both of the rather >> distinct code paths in msixtbl_read() >> >> Additionally it is unclear on what basis MAX_MSIX_ACC_ENTRIES was >> chosen to be 3. >> >> Signed-off-by: Jan Beulich <jbeulich@suse.com> > > Tested by backport to XenServer 6.0 (Xen-4.1.1) The only conflicts were > with the context around adding the two new header files, and <xen/pfn.h> > which is a new header file. > > Tested-and-acked-by: Andrew Cooper <andrew.cooper3@citrix.com>Thanks a lot, Andrew! (Keir had already committed the patch.) Jan
We''ve tested the commit using standard NIC and SR-IOV NIC. It works fine. Shan Haitao 2012/1/25 Jan Beulich <JBeulich@suse.com>:>>>> On 24.01.12 at 16:43, Andrew Cooper <andrew.cooper3@citrix.com> wrote: >> On 20/01/12 16:33, Jan Beulich wrote: >>> This addresses a number of problems in msixtbl_{read,write}(): >>> - address alignment was not checked, allowing for memory corruption in >>> the hypervisor (write case) or returning of hypervisor private data >>> to the guest (read case) >>> - the interrupt mask bit was permitted to be written by the guest >>> (while Xen''s interrupt flow control routines need to control it) >>> - MAX_MSIX_TABLE_{ENTRIES,PAGES} were pointlessly defined to plain >>> numbers (making it unobvious why they have these values, and making >>> the latter non-portable) >>> - MAX_MSIX_TABLE_PAGES was also off by one (failing to account for a >>> non-zero table offset); this was also affecting host MSI-X code >>> - struct msixtbl_entry''s table_flags[] was one element larger than >>> necessary due to improper open-coding of BITS_TO_LONGS() >>> - msixtbl_read() unconditionally accessed the physical table, even >>> though the data was only needed in a quarter of all cases >>> - various calculations were done unnecessarily for both of the rather >>> distinct code paths in msixtbl_read() >>> >>> Additionally it is unclear on what basis MAX_MSIX_ACC_ENTRIES was >>> chosen to be 3. >>> >>> Signed-off-by: Jan Beulich <jbeulich@suse.com> >> >> Tested by backport to XenServer 6.0 (Xen-4.1.1) The only conflicts were >> with the context around adding the two new header files, and <xen/pfn.h> >> which is a new header file. >> >> Tested-and-acked-by: Andrew Cooper <andrew.cooper3@citrix.com> > > Thanks a lot, Andrew! (Keir had already committed the patch.) > > Jan > > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xensource.com > http://lists.xensource.com/xen-devel