1: AMD IOMMU: allocate IRTE entries instead of using a static mapping
2: AMD IOMMU: untie remap and vector maps
3: VT-d: enable for multi-vector MSI
4: AMD IOMMU: enable for multi-vector MSI
5: x86: enable multi-vector MSI
6: pciif: add multi-vector-MSI command

The first two patches were posted before, and reportedly still don't fully
work (which means the series isn't meant to be applied as is).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Jan Beulich
2013-Apr-19 10:57 UTC
[PATCH 1/6] AMD IOMMU: allocate IRTE entries instead of using a static mapping
For multi-vector MSI, where we surely don''t want to allocate contiguous vectors and be able to set affinities of the individual vectors separately, we need to drop the use of the tuple of vector and delivery mode to determine the IRTE to use, and instead allocate IRTEs (which imo should have been done from the beginning). Signed-off-by: Jan Beulich <jbeulich@suse.com> --- One thing I surely need confirmation on is whether this BUG_ON(get_ivrs_mappings(iommu->seg)[req_id].intremap_table ! get_ivrs_mappings(iommu->seg)[alias_id].intremap_table); in update_intremap_entry_from_msi_msg() is valid. If it isn''t, it''s not clear to me how to properly set up things for affected devices, as we would need an identical index allocated for two different remap table instances (which can hardly be expected to work out well). --- a/xen/drivers/passthrough/amd/iommu_acpi.c +++ b/xen/drivers/passthrough/amd/iommu_acpi.c @@ -72,12 +72,15 @@ static void __init add_ivrs_mapping_entr /* allocate per-device interrupt remapping table */ if ( amd_iommu_perdev_intremap ) ivrs_mappings[alias_id].intremap_table - amd_iommu_alloc_intremap_table(); + amd_iommu_alloc_intremap_table( + &ivrs_mappings[alias_id].intremap_inuse); else { if ( shared_intremap_table == NULL ) - shared_intremap_table = amd_iommu_alloc_intremap_table(); + shared_intremap_table = amd_iommu_alloc_intremap_table( + &shared_intremap_inuse); ivrs_mappings[alias_id].intremap_table = shared_intremap_table; + ivrs_mappings[alias_id].intremap_inuse = shared_intremap_inuse; } } /* assgin iommu hardware */ @@ -671,7 +674,7 @@ static u16 __init parse_ivhd_device_spec if ( IO_APIC_ID(apic) != special->handle ) continue; - if ( ioapic_sbdf[special->handle].pin_setup ) + if ( ioapic_sbdf[special->handle].pin_2_idx ) { if ( ioapic_sbdf[special->handle].bdf == bdf && ioapic_sbdf[special->handle].seg == seg ) @@ -691,14 +694,16 @@ static u16 __init parse_ivhd_device_spec ioapic_sbdf[special->handle].bdf = bdf; ioapic_sbdf[special->handle].seg = seg; - ioapic_sbdf[special->handle].pin_setup = xzalloc_array( - unsigned long, BITS_TO_LONGS(nr_ioapic_entries[apic])); + ioapic_sbdf[special->handle].pin_2_idx = xmalloc_array( + u16, nr_ioapic_entries[apic]); if ( nr_ioapic_entries[apic] && - !ioapic_sbdf[IO_APIC_ID(apic)].pin_setup ) + !ioapic_sbdf[IO_APIC_ID(apic)].pin_2_idx ) { printk(XENLOG_ERR "IVHD Error: Out of memory\n"); return 0; } + memset(ioapic_sbdf[IO_APIC_ID(apic)].pin_2_idx, -1, + nr_ioapic_entries[apic]); } break; } @@ -926,7 +931,7 @@ static int __init parse_ivrs_table(struc for ( apic = 0; !error && iommu_intremap && apic < nr_ioapics; ++apic ) { if ( !nr_ioapic_entries[apic] || - ioapic_sbdf[IO_APIC_ID(apic)].pin_setup ) + ioapic_sbdf[IO_APIC_ID(apic)].pin_2_idx ) continue; printk(XENLOG_ERR "IVHD Error: no information for IO-APIC %#x\n", @@ -935,13 +940,15 @@ static int __init parse_ivrs_table(struc error = -ENXIO; else { - ioapic_sbdf[IO_APIC_ID(apic)].pin_setup = xzalloc_array( - unsigned long, BITS_TO_LONGS(nr_ioapic_entries[apic])); - if ( !ioapic_sbdf[IO_APIC_ID(apic)].pin_setup ) + ioapic_sbdf[IO_APIC_ID(apic)].pin_2_idx = xmalloc_array( + u16, nr_ioapic_entries[apic]); + if ( !ioapic_sbdf[IO_APIC_ID(apic)].pin_2_idx ) { printk(XENLOG_ERR "IVHD Error: Out of memory\n"); error = -ENOMEM; } + memset(ioapic_sbdf[IO_APIC_ID(apic)].pin_2_idx, -1, + nr_ioapic_entries[apic]); } } --- a/xen/drivers/passthrough/amd/iommu_intr.c +++ b/xen/drivers/passthrough/amd/iommu_intr.c @@ -31,6 +31,7 @@ struct ioapic_sbdf ioapic_sbdf[MAX_IO_APICS]; struct 
hpet_sbdf hpet_sbdf; void *shared_intremap_table; +unsigned long *shared_intremap_inuse; static DEFINE_SPINLOCK(shared_intremap_lock); static spinlock_t* get_intremap_lock(int seg, int req_id) @@ -46,30 +47,31 @@ static int get_intremap_requestor_id(int return get_ivrs_mappings(seg)[bdf].dte_requestor_id; } -static int get_intremap_offset(u8 vector, u8 dm) +static unsigned int alloc_intremap_entry(int seg, int bdf) { - int offset = 0; - offset = (dm << INT_REMAP_INDEX_DM_SHIFT) & INT_REMAP_INDEX_DM_MASK; - offset |= (vector << INT_REMAP_INDEX_VECTOR_SHIFT ) & - INT_REMAP_INDEX_VECTOR_MASK; - return offset; + unsigned long *inuse = get_ivrs_mappings(seg)[bdf].intremap_inuse; + unsigned int slot = find_first_zero_bit(inuse, INTREMAP_ENTRIES); + + if ( slot < INTREMAP_ENTRIES ) + __set_bit(slot, inuse); + return slot; } -static u8 *get_intremap_entry(int seg, int bdf, int offset) +static u32 *get_intremap_entry(int seg, int bdf, int offset) { - u8 *table; + u32 *table = get_ivrs_mappings(seg)[bdf].intremap_table; - table = (u8*)get_ivrs_mappings(seg)[bdf].intremap_table; ASSERT( (table != NULL) && (offset < INTREMAP_ENTRIES) ); - return (u8*) (table + offset); + return table + offset; } static void free_intremap_entry(int seg, int bdf, int offset) { - u32* entry; - entry = (u32*)get_intremap_entry(seg, bdf, offset); + u32 *entry = get_intremap_entry(seg, bdf, offset); + memset(entry, 0, sizeof(u32)); + __clear_bit(offset, get_ivrs_mappings(seg)[bdf].intremap_inuse); } static void update_intremap_entry(u32* entry, u8 vector, u8 int_type, @@ -98,18 +100,24 @@ static void update_intremap_entry(u32* e INT_REMAP_ENTRY_VECTOR_SHIFT, entry); } -static void update_intremap_entry_from_ioapic( +static void set_rte_index(struct IO_APIC_route_entry *rte, int offset) +{ + rte->vector = (u8)offset; + rte->delivery_mode = offset >> 8; +} + +static int update_intremap_entry_from_ioapic( int bdf, struct amd_iommu *iommu, - const struct IO_APIC_route_entry *rte, - const struct IO_APIC_route_entry *old_rte) + struct IO_APIC_route_entry *rte, + u16 *index) { unsigned long flags; u32* entry; u8 delivery_mode, dest, vector, dest_mode; int req_id; spinlock_t *lock; - int offset; + unsigned int offset; req_id = get_intremap_requestor_id(iommu->seg, bdf); lock = get_intremap_lock(iommu->seg, req_id); @@ -121,16 +129,20 @@ static void update_intremap_entry_from_i spin_lock_irqsave(lock, flags); - offset = get_intremap_offset(vector, delivery_mode); - if ( old_rte ) + offset = *index; + if ( offset >= INTREMAP_ENTRIES ) { - int old_offset = get_intremap_offset(old_rte->vector, - old_rte->delivery_mode); - - if ( offset != old_offset ) - free_intremap_entry(iommu->seg, bdf, old_offset); + offset = alloc_intremap_entry(iommu->seg, req_id); + if ( offset >= INTREMAP_ENTRIES ) + { + spin_unlock_irqrestore(lock, flags); + rte->mask = 1; + return -ENOSPC; + } + *index = offset; } - entry = (u32*)get_intremap_entry(iommu->seg, req_id, offset); + + entry = get_intremap_entry(iommu->seg, req_id, offset); update_intremap_entry(entry, vector, delivery_mode, dest_mode, dest); spin_unlock_irqrestore(lock, flags); @@ -141,6 +153,10 @@ static void update_intremap_entry_from_i amd_iommu_flush_intremap(iommu, req_id); spin_unlock_irqrestore(&iommu->lock, flags); } + + set_rte_index(rte, offset); + + return 0; } int __init amd_iommu_setup_ioapic_remapping(void) @@ -153,7 +169,7 @@ int __init amd_iommu_setup_ioapic_remapp u16 seg, bdf, req_id; struct amd_iommu *iommu; spinlock_t *lock; - int offset; + unsigned int offset; /* Read ioapic 
entries and update interrupt remapping table accordingly */ for ( apic = 0; apic < nr_ioapics; apic++ ) @@ -184,19 +200,23 @@ int __init amd_iommu_setup_ioapic_remapp dest = rte.dest.logical.logical_dest; spin_lock_irqsave(lock, flags); - offset = get_intremap_offset(vector, delivery_mode); - entry = (u32*)get_intremap_entry(iommu->seg, req_id, offset); + offset = alloc_intremap_entry(seg, req_id); + BUG_ON(offset >= INTREMAP_ENTRIES); + entry = get_intremap_entry(iommu->seg, req_id, offset); update_intremap_entry(entry, vector, delivery_mode, dest_mode, dest); spin_unlock_irqrestore(lock, flags); + set_rte_index(&rte, offset); + ioapic_sbdf[IO_APIC_ID(apic)].pin_2_idx[pin] = offset; + __ioapic_write_entry(apic, pin, 1, rte); + if ( iommu->enabled ) { spin_lock_irqsave(&iommu->lock, flags); amd_iommu_flush_intremap(iommu, req_id); spin_unlock_irqrestore(&iommu->lock, flags); } - set_bit(pin, ioapic_sbdf[IO_APIC_ID(apic)].pin_setup); } } return 0; @@ -209,7 +229,7 @@ void amd_iommu_ioapic_update_ire( struct IO_APIC_route_entry new_rte = { 0 }; unsigned int rte_lo = (reg & 1) ? reg - 1 : reg; unsigned int pin = (reg - 0x10) / 2; - int saved_mask, seg, bdf; + int saved_mask, seg, bdf, rc; struct amd_iommu *iommu; if ( !iommu_intremap ) @@ -247,7 +267,7 @@ void amd_iommu_ioapic_update_ire( } if ( new_rte.mask && - !test_bit(pin, ioapic_sbdf[IO_APIC_ID(apic)].pin_setup) ) + ioapic_sbdf[IO_APIC_ID(apic)].pin_2_idx[pin] >= INTREMAP_ENTRIES ) { ASSERT(saved_mask); __io_apic_write(apic, reg, value); @@ -262,14 +282,19 @@ void amd_iommu_ioapic_update_ire( } /* Update interrupt remapping entry */ - update_intremap_entry_from_ioapic( - bdf, iommu, &new_rte, - test_and_set_bit(pin, - ioapic_sbdf[IO_APIC_ID(apic)].pin_setup) ? &old_rte - : NULL); + rc = update_intremap_entry_from_ioapic( + bdf, iommu, &new_rte, + &ioapic_sbdf[IO_APIC_ID(apic)].pin_2_idx[pin]); - /* Forward write access to IO-APIC RTE */ - __io_apic_write(apic, reg, value); + __io_apic_write(apic, reg, ((u32 *)&new_rte)[reg != rte_lo]); + + if ( rc ) + { + /* Keep the entry masked. 
*/ + printk(XENLOG_ERR "Remapping IO-APIC %#x pin %u failed (%d)\n", + IO_APIC_ID(apic), pin, rc); + return; + } /* For lower bits access, return directly to avoid double writes */ if ( reg == rte_lo ) @@ -283,16 +308,41 @@ void amd_iommu_ioapic_update_ire( } } -static void update_intremap_entry_from_msi_msg( +unsigned int amd_iommu_read_ioapic_from_ire( + unsigned int apic, unsigned int reg) +{ + unsigned int val = __io_apic_read(apic, reg); + + if ( !(reg & 1) ) + { + unsigned int offset = val & (INTREMAP_ENTRIES - 1); + u16 bdf = ioapic_sbdf[IO_APIC_ID(apic)].bdf; + u16 seg = ioapic_sbdf[IO_APIC_ID(apic)].seg; + u16 req_id = get_intremap_requestor_id(seg, bdf); + const u32 *entry = get_intremap_entry(seg, req_id, offset); + + val &= ~(INTREMAP_ENTRIES - 1); + val |= get_field_from_reg_u32(*entry, + INT_REMAP_ENTRY_INTTYPE_MASK, + INT_REMAP_ENTRY_INTTYPE_SHIFT) << 8; + val |= get_field_from_reg_u32(*entry, + INT_REMAP_ENTRY_VECTOR_MASK, + INT_REMAP_ENTRY_VECTOR_SHIFT); + } + + return val; +} + +static int update_intremap_entry_from_msi_msg( struct amd_iommu *iommu, u16 bdf, - int *remap_index, const struct msi_msg *msg) + int *remap_index, const struct msi_msg *msg, u32 *data) { unsigned long flags; u32* entry; u16 req_id, alias_id; u8 delivery_mode, dest, vector, dest_mode; spinlock_t *lock; - int offset; + unsigned int offset; req_id = get_dma_requestor_id(iommu->seg, bdf); alias_id = get_intremap_requestor_id(iommu->seg, bdf); @@ -303,15 +353,6 @@ static void update_intremap_entry_from_m spin_lock_irqsave(lock, flags); free_intremap_entry(iommu->seg, req_id, *remap_index); spin_unlock_irqrestore(lock, flags); - - if ( ( req_id != alias_id ) && - get_ivrs_mappings(iommu->seg)[alias_id].intremap_table != NULL ) - { - lock = get_intremap_lock(iommu->seg, alias_id); - spin_lock_irqsave(lock, flags); - free_intremap_entry(iommu->seg, alias_id, *remap_index); - spin_unlock_irqrestore(lock, flags); - } goto done; } @@ -322,16 +363,24 @@ static void update_intremap_entry_from_m delivery_mode = (msg->data >> MSI_DATA_DELIVERY_MODE_SHIFT) & 0x1; vector = (msg->data >> MSI_DATA_VECTOR_SHIFT) & MSI_DATA_VECTOR_MASK; dest = (msg->address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff; - offset = get_intremap_offset(vector, delivery_mode); - if ( *remap_index < 0) + offset = *remap_index; + if ( offset >= INTREMAP_ENTRIES ) + { + offset = alloc_intremap_entry(iommu->seg, bdf); + if ( offset >= INTREMAP_ENTRIES ) + { + spin_unlock_irqrestore(lock, flags); + return -ENOSPC; + } *remap_index = offset; - else - BUG_ON(*remap_index != offset); + } - entry = (u32*)get_intremap_entry(iommu->seg, req_id, offset); + entry = get_intremap_entry(iommu->seg, req_id, offset); update_intremap_entry(entry, vector, delivery_mode, dest_mode, dest); spin_unlock_irqrestore(lock, flags); + *data = (msg->data & ~(INTREMAP_ENTRIES - 1)) | offset; + /* * In some special cases, a pci-e device(e.g SATA controller in IDE mode) * will use alias id to index interrupt remapping table. 
@@ -343,10 +392,8 @@ static void update_intremap_entry_from_m if ( ( req_id != alias_id ) && get_ivrs_mappings(iommu->seg)[alias_id].intremap_table != NULL ) { - spin_lock_irqsave(lock, flags); - entry = (u32*)get_intremap_entry(iommu->seg, alias_id, offset); - update_intremap_entry(entry, vector, delivery_mode, dest_mode, dest); - spin_unlock_irqrestore(lock, flags); + BUG_ON(get_ivrs_mappings(iommu->seg)[req_id].intremap_table !+ get_ivrs_mappings(iommu->seg)[alias_id].intremap_table); } done: @@ -358,19 +405,22 @@ done: amd_iommu_flush_intremap(iommu, alias_id); spin_unlock_irqrestore(&iommu->lock, flags); } + + return 0; } static struct amd_iommu *_find_iommu_for_device(int seg, int bdf) { - struct amd_iommu *iommu = find_iommu_for_device(seg, bdf); - - if ( iommu ) - return iommu; + struct amd_iommu *iommu; list_for_each_entry ( iommu, &amd_iommu_head, list ) if ( iommu->seg == seg && iommu->bdf == bdf ) return NULL; + iommu = find_iommu_for_device(seg, bdf); + if ( iommu ) + return iommu; + AMD_IOMMU_DEBUG("No IOMMU for MSI dev = %04x:%02x:%02x.%u\n", seg, PCI_BUS(bdf), PCI_SLOT(bdf), PCI_FUNC(bdf)); return ERR_PTR(-EINVAL); @@ -380,8 +430,9 @@ int amd_iommu_msi_msg_update_ire( struct msi_desc *msi_desc, struct msi_msg *msg) { struct pci_dev *pdev = msi_desc->dev; - int bdf, seg; + int bdf, seg, rc; struct amd_iommu *iommu; + u32 data; bdf = pdev ? PCI_BDF2(pdev->bus, pdev->devfn) : hpet_sbdf.bdf; seg = pdev ? pdev->seg : hpet_sbdf.seg; @@ -390,11 +441,12 @@ int amd_iommu_msi_msg_update_ire( if ( IS_ERR_OR_NULL(iommu) ) return PTR_ERR(iommu); - if ( msi_desc->remap_index >= 0 ) + if ( msi_desc->remap_index >= 0 && !msg ) { do { update_intremap_entry_from_msi_msg(iommu, bdf, - &msi_desc->remap_index, NULL); + &msi_desc->remap_index, + NULL, NULL); if ( !pdev || !pdev->phantom_stride ) break; bdf += pdev->phantom_stride; @@ -409,19 +461,39 @@ int amd_iommu_msi_msg_update_ire( return 0; do { - update_intremap_entry_from_msi_msg(iommu, bdf, &msi_desc->remap_index, - msg); - if ( !pdev || !pdev->phantom_stride ) + rc = update_intremap_entry_from_msi_msg(iommu, bdf, + &msi_desc->remap_index, + msg, &data); + if ( rc || !pdev || !pdev->phantom_stride ) break; bdf += pdev->phantom_stride; } while ( PCI_SLOT(bdf) == PCI_SLOT(pdev->devfn) ); - return 0; + msg->data = data; + return rc; } void amd_iommu_read_msi_from_ire( struct msi_desc *msi_desc, struct msi_msg *msg) { + unsigned int offset = msg->data & (INTREMAP_ENTRIES - 1); + const struct pci_dev *pdev = msi_desc->dev; + u16 bdf = pdev ? PCI_BDF2(pdev->bus, pdev->devfn) : hpet_sbdf.bdf; + u16 seg = pdev ? 
pdev->seg : hpet_sbdf.seg; + const u32 *entry; + + if ( IS_ERR_OR_NULL(_find_iommu_for_device(seg, bdf)) ) + return; + + entry = get_intremap_entry(seg, get_dma_requestor_id(seg, bdf), offset); + + msg->data &= ~(INTREMAP_ENTRIES - 1); + msg->data |= get_field_from_reg_u32(*entry, + INT_REMAP_ENTRY_INTTYPE_MASK, + INT_REMAP_ENTRY_INTTYPE_SHIFT) << 8; + msg->data |= get_field_from_reg_u32(*entry, + INT_REMAP_ENTRY_VECTOR_MASK, + INT_REMAP_ENTRY_VECTOR_SHIFT); } int __init amd_iommu_free_intremap_table( @@ -438,12 +510,14 @@ int __init amd_iommu_free_intremap_table return 0; } -void* __init amd_iommu_alloc_intremap_table(void) +void* __init amd_iommu_alloc_intremap_table(unsigned long **inuse_map) { void *tb; tb = __alloc_amd_iommu_tables(INTREMAP_TABLE_ORDER); BUG_ON(tb == NULL); memset(tb, 0, PAGE_SIZE * (1UL << INTREMAP_TABLE_ORDER)); + *inuse_map = xzalloc_array(unsigned long, BITS_TO_LONGS(INTREMAP_ENTRIES)); + BUG_ON(*inuse_map == NULL); return tb; } --- a/xen/drivers/passthrough/amd/pci_amd_iommu.c +++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c @@ -622,7 +622,7 @@ const struct iommu_ops amd_iommu_ops = { .get_device_group_id = amd_iommu_group_id, .update_ire_from_apic = amd_iommu_ioapic_update_ire, .update_ire_from_msi = amd_iommu_msi_msg_update_ire, - .read_apic_from_ire = __io_apic_read, + .read_apic_from_ire = amd_iommu_read_ioapic_from_ire, .read_msi_from_ire = amd_iommu_read_msi_from_ire, .setup_hpet_msi = amd_setup_hpet_msi, .suspend = amd_iommu_suspend, --- a/xen/include/asm-x86/amd-iommu.h +++ b/xen/include/asm-x86/amd-iommu.h @@ -119,6 +119,7 @@ struct ivrs_mappings { /* per device interrupt remapping table */ void *intremap_table; + unsigned long *intremap_inuse; spinlock_t intremap_lock; /* ivhd device data settings */ --- a/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h +++ b/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h @@ -458,10 +458,6 @@ #define MAX_AMD_IOMMUS 32 /* interrupt remapping table */ -#define INT_REMAP_INDEX_DM_MASK 0x1C00 -#define INT_REMAP_INDEX_DM_SHIFT 10 -#define INT_REMAP_INDEX_VECTOR_MASK 0x3FC -#define INT_REMAP_INDEX_VECTOR_SHIFT 2 #define INT_REMAP_ENTRY_REMAPEN_MASK 0x00000001 #define INT_REMAP_ENTRY_REMAPEN_SHIFT 0 #define INT_REMAP_ENTRY_SUPIOPF_MASK 0x00000002 --- a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h +++ b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h @@ -89,10 +89,12 @@ struct amd_iommu *find_iommu_for_device( /* interrupt remapping */ int amd_iommu_setup_ioapic_remapping(void); -void *amd_iommu_alloc_intremap_table(void); +void *amd_iommu_alloc_intremap_table(unsigned long **); int amd_iommu_free_intremap_table(u16 seg, struct ivrs_mappings *); void amd_iommu_ioapic_update_ire( unsigned int apic, unsigned int reg, unsigned int value); +unsigned int amd_iommu_read_ioapic_from_ire( + unsigned int apic, unsigned int reg); int amd_iommu_msi_msg_update_ire( struct msi_desc *msi_desc, struct msi_msg *msg); void amd_iommu_read_msi_from_ire( @@ -101,15 +103,17 @@ int amd_setup_hpet_msi(struct msi_desc * extern struct ioapic_sbdf { u16 bdf, seg; - unsigned long *pin_setup; + u16 *pin_2_idx; } ioapic_sbdf[MAX_IO_APICS]; -extern void *shared_intremap_table; extern struct hpet_sbdf { u16 bdf, seg, id; struct amd_iommu *iommu; } hpet_sbdf; +extern void *shared_intremap_table; +extern unsigned long *shared_intremap_inuse; + /* power management support */ void amd_iommu_resume(void); void amd_iommu_suspend(void); _______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
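The core of the change above is the switch from a computed IRTE index to a
small allocator over a per-table "inuse" bitmap. A minimal, self-contained
sketch of that scheme follows (not the Xen code itself: the real
implementation uses find_first_zero_bit()/__set_bit() on a dynamically
allocated map, runs under the per-table lock, and the table size here is an
assumption for the sketch):

#include <limits.h>

#define INTREMAP_ENTRIES 512                     /* assumed table size */
#define BITS_PER_WORD    (sizeof(unsigned long) * CHAR_BIT)

static unsigned long inuse[INTREMAP_ENTRIES / BITS_PER_WORD];

/* Return a free IRTE slot, or INTREMAP_ENTRIES if the table is full
 * (callers then fail with -ENOSPC and leave the RTE masked). */
static unsigned int alloc_intremap_slot(void)
{
    unsigned int slot;

    for ( slot = 0; slot < INTREMAP_ENTRIES; ++slot )
        if ( !(inuse[slot / BITS_PER_WORD] & (1UL << (slot % BITS_PER_WORD))) )
        {
            inuse[slot / BITS_PER_WORD] |= 1UL << (slot % BITS_PER_WORD);
            return slot;
        }
    return INTREMAP_ENTRIES;
}

static void free_intremap_slot(unsigned int slot)
{
    inuse[slot / BITS_PER_WORD] &= ~(1UL << (slot % BITS_PER_WORD));
}

The allocated index is then folded into the IO-APIC RTE by set_rte_index()
(low 8 bits in the vector field, bit 8 in delivery_mode), which is what
lets the read path recover it later.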
With the specific IRTEs used for an interrupt no longer depending on the
vector, there's no need to tie the remap sharing model to the vector
sharing one.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: George Dunlap <george.dunlap@eu.citrix.com>

--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -207,35 +207,6 @@ int __init amd_iov_detect(void)
     init_done = 1;

-    /*
-     * AMD IOMMUs don't distinguish between vectors destined for
-     * different cpus when doing interrupt remapping.  This means
-     * that interrupts going through the same intremap table
-     * can't share the same vector.
-     *
-     * If irq_vector_map isn't specified, choose a sensible default:
-     *  - If we're using per-device interemap tables, per-device
-     *    vector non-sharing maps
-     *  - If we're using a global interemap table, global vector
-     *    non-sharing map
-     */
-    if ( opt_irq_vector_map == OPT_IRQ_VECTOR_MAP_DEFAULT )
-    {
-        if ( amd_iommu_perdev_intremap )
-        {
-            printk("AMD-Vi: Enabling per-device vector maps\n");
-            opt_irq_vector_map = OPT_IRQ_VECTOR_MAP_PERDEV;
-        }
-        else
-        {
-            printk("AMD-Vi: Enabling global vector map\n");
-            opt_irq_vector_map = OPT_IRQ_VECTOR_MAP_GLOBAL;
-        }
-    }
-    else
-    {
-        printk("AMD-Vi: Not overriding irq_vector_map setting\n");
-    }
     if ( !amd_iommu_perdev_intremap )
         printk(XENLOG_WARNING "AMD-Vi: Using global interrupt remap table is not recommended (see XSA-36)!\n");
     return scan_pci_devices();
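For context on why the tie existed at all: under the old scheme the IRTE
index was derived directly from the vector and delivery mode, using the
INT_REMAP_INDEX_* constants patch 1 deletes, so two interrupts remapped
through the same table could not share a (vector, delivery mode) pair. A
sketch of that old derivation, reconstructed from the removed constants:

/* Byte offset into the table of 4-byte IRTEs under the old scheme.
 * Equal vector and delivery mode meant the same IRTE, hence the old
 * need for per-device or global vector non-sharing maps. */
static unsigned int old_irte_offset(unsigned char vector, unsigned char dm)
{
    unsigned int offset = (dm << 10) & 0x1C00;    /* INT_REMAP_INDEX_DM_* */

    offset |= (vector << 2) & 0x3FC;              /* INT_REMAP_INDEX_VECTOR_* */
    return offset;
}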
The main change being to make alloc_remap_entry() capable of allocating a block of entries. Signed-off-by: Jan Beulich <jbeulich@suse.com> --- a/xen/drivers/passthrough/vtd/intremap.c +++ b/xen/drivers/passthrough/vtd/intremap.c @@ -194,18 +194,18 @@ static void free_remap_entry(struct iomm } /* - * Look for a free intr remap entry. + * Look for a free intr remap entry (or a contiguous set thereof). * Need hold iremap_lock, and setup returned entry before releasing lock. */ -static int alloc_remap_entry(struct iommu *iommu) +static unsigned int alloc_remap_entry(struct iommu *iommu, unsigned int nr) { struct iremap_entry *iremap_entries = NULL; struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu); - int i; + unsigned int i, found; ASSERT( spin_is_locked(&ir_ctrl->iremap_lock) ); - for ( i = 0; i < IREMAP_ENTRY_NR; i++ ) + for ( found = i = 0; i < IREMAP_ENTRY_NR; i++ ) { struct iremap_entry *p; if ( i % (1 << IREMAP_ENTRY_ORDER) == 0 ) @@ -220,7 +220,9 @@ static int alloc_remap_entry(struct iomm else p = &iremap_entries[i % (1 << IREMAP_ENTRY_ORDER)]; - if ( p->lo_val == 0 && p->hi_val == 0 ) /* a free entry */ + if ( p->lo_val || p->hi_val ) /* not a free entry */ + found = 0; + else if ( ++found == nr ) break; } @@ -228,7 +230,7 @@ static int alloc_remap_entry(struct iomm unmap_vtd_domain_page(iremap_entries); if ( i < IREMAP_ENTRY_NR ) - ir_ctrl->iremap_num++; + ir_ctrl->iremap_num += nr; return i; } @@ -293,7 +295,7 @@ static int ioapic_rte_to_remap_entry(str index = apic_pin_2_ir_idx[apic][ioapic_pin]; if ( index < 0 ) { - index = alloc_remap_entry(iommu); + index = alloc_remap_entry(iommu, 1); if ( index < IREMAP_ENTRY_NR ) apic_pin_2_ir_idx[apic][ioapic_pin] = index; } @@ -485,19 +487,18 @@ static void set_msi_source_id(struct pci } static int remap_entry_to_msi_msg( - struct iommu *iommu, struct msi_msg *msg) + struct iommu *iommu, struct msi_msg *msg, unsigned int index) { struct iremap_entry *iremap_entry = NULL, *iremap_entries; struct msi_msg_remap_entry *remap_rte; - int index; unsigned long flags; struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu); remap_rte = (struct msi_msg_remap_entry *) msg; - index = (remap_rte->address_lo.index_15 << 15) | + index += (remap_rte->address_lo.index_15 << 15) | remap_rte->address_lo.index_0_14; - if ( index < 0 || index > IREMAP_ENTRY_NR - 1 ) + if ( index >= IREMAP_ENTRY_NR ) { dprintk(XENLOG_ERR VTDPREFIX, "%s: index (%d) for remap table is invalid !\n", @@ -555,31 +556,29 @@ static int msi_msg_to_remap_entry( struct iremap_entry *iremap_entry = NULL, *iremap_entries; struct iremap_entry new_ire; struct msi_msg_remap_entry *remap_rte; - int index; + unsigned int index, i, nr = 1; unsigned long flags; struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu); - remap_rte = (struct msi_msg_remap_entry *) msg; + if ( msi_desc->msi_attrib.type == PCI_CAP_ID_MSI ) + nr = msi_desc->msi.nvec; + spin_lock_irqsave(&ir_ctrl->iremap_lock, flags); if ( msg == NULL ) { - /* Free specified unused IRTE */ - free_remap_entry(iommu, msi_desc->remap_index); + /* Free specified unused IRTEs */ + for ( i = 0; i < nr; ++i ) + free_remap_entry(iommu, msi_desc->remap_index + i); spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags); return 0; } if ( msi_desc->remap_index < 0 ) { - /* - * TODO: Multiple-vector MSI requires allocating multiple continuous - * entries and configuring addr/data of msi_msg in different way. So - * alloca_remap_entry will be changed if enabling multiple-vector MSI - * in future. 
- */ - index = alloc_remap_entry(iommu); - msi_desc->remap_index = index; + index = alloc_remap_entry(iommu, nr); + for ( i = 0; i < nr; ++i ) + msi_desc[i].remap_index = index + i; } else index = msi_desc->remap_index; @@ -590,7 +589,8 @@ static int msi_msg_to_remap_entry( "%s: intremap index (%d) is larger than" " the maximum index (%d)!\n", __func__, index, IREMAP_ENTRY_NR - 1); - msi_desc->remap_index = -1; + for ( i = 0; i < nr; ++i ) + msi_desc[i].remap_index = -1; spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags); return -EFAULT; } @@ -626,14 +626,18 @@ static int msi_msg_to_remap_entry( new_ire.lo.p = 1; /* finally, set present bit */ /* now construct new MSI/MSI-X rte entry */ + remap_rte = (struct msi_msg_remap_entry *)msg; remap_rte->address_lo.dontcare = 0; - remap_rte->address_lo.index_15 = (index >> 15) & 0x1; - remap_rte->address_lo.index_0_14 = index & 0x7fff; + i = index; + if ( !nr ) + i -= msi_desc->msi_attrib.entry_nr; + remap_rte->address_lo.index_15 = (i >> 15) & 0x1; + remap_rte->address_lo.index_0_14 = i & 0x7fff; remap_rte->address_lo.SHV = 1; remap_rte->address_lo.format = 1; remap_rte->address_hi = 0; - remap_rte->data = 0; + remap_rte->data = index - i; memcpy(iremap_entry, &new_ire, sizeof(struct iremap_entry)); iommu_flush_cache_entry(iremap_entry, sizeof(struct iremap_entry)); @@ -654,7 +658,9 @@ void msi_msg_read_remap_rte( drhd = pdev ? acpi_find_matched_drhd_unit(pdev) : hpet_to_drhd(msi_desc->hpet_id); if ( drhd ) - remap_entry_to_msi_msg(drhd->iommu, msg); + remap_entry_to_msi_msg(drhd->iommu, msg, + msi_desc->msi_attrib.type == PCI_CAP_ID_MSI + ? msi_desc->msi_attrib.entry_nr : 0); } int msi_msg_write_remap_rte( @@ -680,7 +686,7 @@ int __init intel_setup_hpet_msi(struct m return 0; spin_lock_irqsave(&ir_ctrl->iremap_lock, flags); - msi_desc->remap_index = alloc_remap_entry(iommu); + msi_desc->remap_index = alloc_remap_entry(iommu, 1); if ( msi_desc->remap_index >= IREMAP_ENTRY_NR ) { dprintk(XENLOG_ERR VTDPREFIX, _______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
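The block allocation itself reduces to a first-fit scan for nr consecutive
free entries. A simplified model of the scan added to alloc_remap_entry()
(the real code walks the table in page-sized chunks under
ir_ctrl->iremap_lock, and its free/used test is whether both halves of the
128-bit entry are zero; the interface details of the sketch are not the
patch's):

#include <stdbool.h>

/* First-fit search for nr consecutive free entries; returns the first
 * index of such a block, or total if no run of that length exists. */
static unsigned int alloc_block(const bool *used, unsigned int total,
                                unsigned int nr)
{
    unsigned int i, run = 0;

    for ( i = 0; i < total; ++i )
    {
        run = used[i] ? 0 : run + 1;
        if ( run == nr )
            return i - nr + 1;
    }
    return total;
}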
The main change being to make alloc_intremap_entry() capable of allocating a block of entries. Signed-off-by: Jan Beulich <jbeulich@suse.com> --- a/xen/drivers/passthrough/amd/iommu_intr.c +++ b/xen/drivers/passthrough/amd/iommu_intr.c @@ -47,13 +47,33 @@ static int get_intremap_requestor_id(int return get_ivrs_mappings(seg)[bdf].dte_requestor_id; } -static unsigned int alloc_intremap_entry(int seg, int bdf) +static unsigned int alloc_intremap_entry(int seg, int bdf, unsigned int nr) { unsigned long *inuse = get_ivrs_mappings(seg)[bdf].intremap_inuse; unsigned int slot = find_first_zero_bit(inuse, INTREMAP_ENTRIES); - if ( slot < INTREMAP_ENTRIES ) - __set_bit(slot, inuse); + for ( ; ; ) + { + unsigned int end; + + if ( slot >= INTREMAP_ENTRIES ) + break; + end = find_next_bit(inuse, INTREMAP_ENTRIES, slot + 1); + if ( end > INTREMAP_ENTRIES ) + end = INTREMAP_ENTRIES; + slot = (slot + nr - 1) & ~(nr - 1); + if ( slot + nr <= end ) + { + while ( nr-- ) + __set_bit(slot + nr, inuse); + break; + } + slot = (end + nr) & ~(nr - 1); + if ( slot >= INTREMAP_ENTRIES ) + break; + slot = find_next_zero_bit(inuse, INTREMAP_ENTRIES, slot); + } + return slot; } @@ -132,7 +152,7 @@ static int update_intremap_entry_from_io offset = *index; if ( offset >= INTREMAP_ENTRIES ) { - offset = alloc_intremap_entry(iommu->seg, req_id); + offset = alloc_intremap_entry(iommu->seg, req_id, 1); if ( offset >= INTREMAP_ENTRIES ) { spin_unlock_irqrestore(lock, flags); @@ -200,7 +220,7 @@ int __init amd_iommu_setup_ioapic_remapp dest = rte.dest.logical.logical_dest; spin_lock_irqsave(lock, flags); - offset = alloc_intremap_entry(seg, req_id); + offset = alloc_intremap_entry(seg, req_id, 1); BUG_ON(offset >= INTREMAP_ENTRIES); entry = get_intremap_entry(iommu->seg, req_id, offset); update_intremap_entry(entry, vector, @@ -334,7 +354,7 @@ unsigned int amd_iommu_read_ioapic_from_ } static int update_intremap_entry_from_msi_msg( - struct amd_iommu *iommu, u16 bdf, + struct amd_iommu *iommu, u16 bdf, unsigned int nr, int *remap_index, const struct msi_msg *msg, u32 *data) { unsigned long flags; @@ -342,7 +362,7 @@ static int update_intremap_entry_from_ms u16 req_id, alias_id; u8 delivery_mode, dest, vector, dest_mode; spinlock_t *lock; - unsigned int offset; + unsigned int offset, i; req_id = get_dma_requestor_id(iommu->seg, bdf); alias_id = get_intremap_requestor_id(iommu->seg, bdf); @@ -351,7 +371,8 @@ static int update_intremap_entry_from_ms { lock = get_intremap_lock(iommu->seg, req_id); spin_lock_irqsave(lock, flags); - free_intremap_entry(iommu->seg, req_id, *remap_index); + for ( i = 0; i < nr; ++i ) + free_intremap_entry(iommu->seg, req_id, *remap_index + i); spin_unlock_irqrestore(lock, flags); goto done; } @@ -366,7 +387,8 @@ static int update_intremap_entry_from_ms offset = *remap_index; if ( offset >= INTREMAP_ENTRIES ) { - offset = alloc_intremap_entry(iommu->seg, bdf); + ASSERT(nr); + offset = alloc_intremap_entry(iommu->seg, bdf, nr); if ( offset >= INTREMAP_ENTRIES ) { spin_unlock_irqrestore(lock, flags); @@ -432,6 +454,7 @@ int amd_iommu_msi_msg_update_ire( struct pci_dev *pdev = msi_desc->dev; int bdf, seg, rc; struct amd_iommu *iommu; + unsigned int i, nr = 1; u32 data; bdf = pdev ? 
PCI_BDF2(pdev->bus, pdev->devfn) : hpet_sbdf.bdf; @@ -441,10 +464,13 @@ int amd_iommu_msi_msg_update_ire( if ( IS_ERR_OR_NULL(iommu) ) return PTR_ERR(iommu); + if ( msi_desc->msi_attrib.type == PCI_CAP_ID_MSI ) + nr = msi_desc->msi.nvec; + if ( msi_desc->remap_index >= 0 && !msg ) { do { - update_intremap_entry_from_msi_msg(iommu, bdf, + update_intremap_entry_from_msi_msg(iommu, bdf, nr, &msi_desc->remap_index, NULL, NULL); if ( !pdev || !pdev->phantom_stride ) @@ -452,7 +478,8 @@ int amd_iommu_msi_msg_update_ire( bdf += pdev->phantom_stride; } while ( PCI_SLOT(bdf) == PCI_SLOT(pdev->devfn) ); - msi_desc->remap_index = -1; + for ( i = 0; i < nr; ++i ) + msi_desc[i].remap_index = -1; if ( pdev ) bdf = PCI_BDF2(pdev->bus, pdev->devfn); } @@ -461,7 +488,7 @@ int amd_iommu_msi_msg_update_ire( return 0; do { - rc = update_intremap_entry_from_msi_msg(iommu, bdf, + rc = update_intremap_entry_from_msi_msg(iommu, bdf, nr, &msi_desc->remap_index, msg, &data); if ( rc || !pdev || !pdev->phantom_stride ) @@ -469,6 +496,10 @@ int amd_iommu_msi_msg_update_ire( bdf += pdev->phantom_stride; } while ( PCI_SLOT(bdf) == PCI_SLOT(pdev->devfn) ); + if ( !rc ) + for ( i = 1; i < nr; ++i ) + msi_desc[i].remap_index = msi_desc->remap_index + i; + msg->data = data; return rc; } @@ -487,6 +518,14 @@ void amd_iommu_read_msi_from_ire( entry = get_intremap_entry(seg, get_dma_requestor_id(seg, bdf), offset); + if ( msi_desc->msi_attrib.type == PCI_CAP_ID_MSI ) + { + int nr = msi_desc->msi_attrib.entry_nr; + + ASSERT(!(offset & (msi_desc[-nr].msi.nvec - 1))); + offset |= nr; + } + msg->data &= ~(INTREMAP_ENTRIES - 1); msg->data |= get_field_from_reg_u32(*entry, INT_REMAP_ENTRY_INTTYPE_MASK, _______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
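The AMD allocator additionally has to hand out naturally aligned blocks,
because only the low bits of the IRTE index travel in the MSI data (masked
with INTREMAP_ENTRIES - 1) and the individual vectors of a device differ
exactly in those low bits. For a power-of-two nr, the rounding in the new
loop is plain align-up arithmetic; a worked sketch:

/* Round slot up to the next multiple of nr (nr a power of two), as done
 * by "slot = (slot + nr - 1) & ~(nr - 1);" in alloc_intremap_entry().
 * E.g. for nr == 8: 13 -> 16, 16 -> 16, 17 -> 24. */
static unsigned int align_up(unsigned int slot, unsigned int nr)
{
    return (slot + nr - 1) & ~(nr - 1);
}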
This implies - extending the public interface to have a way to request a block of MSIs - allocating a block of contiguous pIRQ-s for the target domain (but note that the Xen IRQs allocated have no need of being contiguous) - repeating certain operations for all involved IRQs - fixing multi_msi_enable() - adjusting the mask bit accesses for maskable MSIs Signed-off-by: Jan Beulich <jbeulich@suse.com> --- a/xen/arch/x86/irq.c +++ b/xen/arch/x86/irq.c @@ -1880,6 +1880,25 @@ int get_free_pirq(struct domain *d, int return -ENOSPC; } +int get_free_pirqs(struct domain *d, unsigned int nr) +{ + unsigned int i, found = 0; + + ASSERT(spin_is_locked(&d->event_lock)); + + for ( i = d->nr_pirqs - 1; i >= nr_irqs_gsi; --i ) + if ( is_free_pirq(d, pirq_info(d, i)) ) + { + pirq_get_info(d, i); + if ( ++found == nr ) + return i; + } + else + found = 0; + + return -ENOSPC; +} + int map_domain_pirq( struct domain *d, int pirq, int irq, int type, void *data) { @@ -1935,11 +1954,12 @@ int map_domain_pirq( desc = irq_to_desc(irq); - if ( type == MAP_PIRQ_TYPE_MSI ) + if ( type == MAP_PIRQ_TYPE_MSI || type == MAP_PIRQ_TYPE_MULTI_MSI ) { struct msi_info *msi = (struct msi_info *)data; struct msi_desc *msi_desc; struct pci_dev *pdev; + unsigned int nr = 0; ASSERT(spin_is_locked(&pcidevs_lock)); @@ -1950,7 +1970,14 @@ int map_domain_pirq( pdev = pci_get_pdev(msi->seg, msi->bus, msi->devfn); ret = pci_enable_msi(msi, &msi_desc); if ( ret ) + { + if ( ret > 0 ) + { + msi->entry_nr = ret; + ret = -ENFILE; + } goto done; + } spin_lock_irqsave(&desc->lock, flags); @@ -1964,25 +1991,73 @@ int map_domain_pirq( goto done; } - ret = setup_msi_irq(desc, msi_desc); - if ( ret ) + while ( !(ret = setup_msi_irq(desc, msi_desc + nr)) ) { + if ( opt_irq_vector_map == OPT_IRQ_VECTOR_MAP_PERDEV && + !desc->arch.used_vectors ) + { + desc->arch.used_vectors = &pdev->arch.used_vectors; + if ( desc->arch.vector != IRQ_VECTOR_UNASSIGNED ) + { + int vector = desc->arch.vector; + + ASSERT(!test_bit(vector, desc->arch.used_vectors)); + set_bit(vector, desc->arch.used_vectors); + } + } + if ( type == MAP_PIRQ_TYPE_MSI || + msi_desc->msi_attrib.type != PCI_CAP_ID_MSI || + ++nr == msi->entry_nr ) + break; + + set_domain_irq_pirq(d, irq, info); spin_unlock_irqrestore(&desc->lock, flags); - pci_disable_msi(msi_desc); - goto done; + + info = NULL; + irq = create_irq(NUMA_NO_NODE); + ret = irq >= 0 ? 
prepare_domain_irq_pirq(d, irq, pirq + nr, &info) + : irq; + if ( ret ) + break; + msi_desc[nr].irq = irq; + + if ( irq_permit_access(d, irq) != 0 ) + printk(XENLOG_G_WARNING + "dom%d: could not permit access to IRQ%d (pirq %d)\n", + d->domain_id, irq, pirq); + + desc = irq_to_desc(irq); + spin_lock_irqsave(&desc->lock, flags); + + if ( desc->handler != &no_irq_type ) + { + dprintk(XENLOG_G_ERR, "dom%d: irq %d (pirq %u) in use (%s)\n", + d->domain_id, irq, pirq + nr, desc->handler->typename); + ret = -EBUSY; + break; + } } - if ( opt_irq_vector_map == OPT_IRQ_VECTOR_MAP_PERDEV - && !desc->arch.used_vectors ) + if ( ret ) { - desc->arch.used_vectors = &pdev->arch.used_vectors; - if ( desc->arch.vector != IRQ_VECTOR_UNASSIGNED ) + spin_unlock_irqrestore(&desc->lock, flags); + while ( nr-- ) { - int vector = desc->arch.vector; - ASSERT(!test_bit(vector, desc->arch.used_vectors)); - - set_bit(vector, desc->arch.used_vectors); + if ( irq >= 0 ) + { + if ( irq_deny_access(d, irq) ) + printk(XENLOG_G_ERR + "dom%d: could not revoke access to IRQ%d (pirq %d)\n", + d->domain_id, irq, pirq); + destroy_irq(irq); + } + if ( info ) + cleanup_domain_irq_pirq(d, irq, info); + info = pirq_info(d, pirq + nr); + irq = info->arch.irq; } + pci_disable_msi(msi_desc); + goto done; } set_domain_irq_pirq(d, irq, info); @@ -2013,7 +2088,8 @@ int unmap_domain_pirq(struct domain *d, { unsigned long flags; struct irq_desc *desc; - int irq, ret = 0; + int irq, ret = 0, rc; + unsigned int i, nr = 1; bool_t forced_unbind; struct pirq *info; struct msi_desc *msi_desc = NULL; @@ -2035,6 +2111,18 @@ int unmap_domain_pirq(struct domain *d, desc = irq_to_desc(irq); msi_desc = desc->msi_desc; + if ( msi_desc && msi_desc->msi_attrib.type == PCI_CAP_ID_MSI ) + { + if ( msi_desc->msi_attrib.entry_nr ) + { + printk(XENLOG_G_ERR + "dom%d: trying to unmap secondary MSI pirq %d\n", + d->domain_id, pirq); + ret = -EBUSY; + goto done; + } + nr = msi_desc->msi.nvec; + } ret = xsm_unmap_domain_irq(XSM_HOOK, d, irq, msi_desc); if ( ret ) @@ -2050,37 +2138,83 @@ int unmap_domain_pirq(struct domain *d, spin_lock_irqsave(&desc->lock, flags); - BUG_ON(irq != domain_pirq_to_irq(d, pirq)); - - if ( !forced_unbind ) - clear_domain_irq_pirq(d, irq, info); - else + for ( i = 0; ; ) { - info->arch.irq = -irq; - radix_tree_replace_slot( - radix_tree_lookup_slot(&d->arch.irq_pirq, irq), - radix_tree_int_to_ptr(-pirq)); + BUG_ON(irq != domain_pirq_to_irq(d, pirq + i)); + + if ( !forced_unbind ) + clear_domain_irq_pirq(d, irq, info); + else + { + info->arch.irq = -irq; + radix_tree_replace_slot( + radix_tree_lookup_slot(&d->arch.irq_pirq, irq), + radix_tree_int_to_ptr(-pirq)); + } + + if ( msi_desc ) + { + desc->handler = &no_irq_type; + desc->msi_desc = NULL; + } + + if ( ++i == nr ) + break; + + spin_unlock_irqrestore(&desc->lock, flags); + + if ( !forced_unbind ) + cleanup_domain_irq_pirq(d, irq, info); + + rc = irq_deny_access(d, irq); + if ( rc ) + { + printk(XENLOG_G_ERR + "dom%d: could not deny access to IRQ%d (pirq %d)\n", + d->domain_id, irq, pirq + i); + ret = rc; + } + + do { + info = pirq_info(d, pirq + i); + if ( info && (irq = info->arch.irq) > 0 ) + break; + printk(XENLOG_G_ERR "dom%d: MSI pirq %d not mapped\n", + d->domain_id, pirq + i); + } while ( ++i < nr ); + + if ( i == nr ) + { + desc = NULL; + break; + } + + desc = irq_to_desc(irq); + BUG_ON(desc->msi_desc != msi_desc + i); + + spin_lock_irqsave(&desc->lock, flags); } - if ( msi_desc ) + if ( desc ) { - desc->handler = &no_irq_type; - desc->msi_desc = NULL; + 
spin_unlock_irqrestore(&desc->lock, flags); + + if ( !forced_unbind ) + cleanup_domain_irq_pirq(d, irq, info); + + rc = irq_deny_access(d, irq); + if ( rc ) + { + printk(XENLOG_G_ERR + "dom%d: could not deny access to IRQ%d (pirq %d)\n", + d->domain_id, irq, pirq + nr - 1); + ret = rc; + } } - spin_unlock_irqrestore(&desc->lock, flags); if (msi_desc) msi_free_irq(msi_desc); - if ( !forced_unbind ) - cleanup_domain_irq_pirq(d, irq, info); - - ret = irq_deny_access(d, irq); - if ( ret ) - printk(XENLOG_G_ERR - "dom%d: could not deny access to IRQ%d (pirq %d)\n", - d->domain_id, irq, pirq); - done: return ret; } --- a/xen/arch/x86/msi.c +++ b/xen/arch/x86/msi.c @@ -238,6 +238,11 @@ static int write_msi_msg(struct msi_desc u8 bus = dev->bus; u8 slot = PCI_SLOT(dev->devfn); u8 func = PCI_FUNC(dev->devfn); + int nr = entry->msi_attrib.entry_nr; + + ASSERT((msg->data & (entry[-nr].msi.nvec - 1)) == nr); + if ( nr ) + return 0; pci_conf_write32(seg, bus, slot, func, msi_lower_address_reg(pos), msg->address_lo); @@ -361,8 +366,8 @@ static void msi_set_mask_bit(struct irq_ u8 func = PCI_FUNC(entry->dev->devfn); mask_bits = pci_conf_read32(seg, bus, slot, func, entry->msi.mpos); - mask_bits &= ~(1); - mask_bits |= flag; + mask_bits &= ~((u32)1 << entry->msi_attrib.entry_nr); + mask_bits |= (u32)flag << entry->msi_attrib.entry_nr; pci_conf_write32(seg, bus, slot, func, entry->msi.mpos, mask_bits); } break; @@ -386,10 +391,11 @@ static int msi_get_mask_bit(const struct case PCI_CAP_ID_MSI: if (!entry->dev || !entry->msi_attrib.maskbit) break; - return pci_conf_read32(entry->dev->seg, entry->dev->bus, - PCI_SLOT(entry->dev->devfn), - PCI_FUNC(entry->dev->devfn), - entry->msi.mpos) & 1; + return (pci_conf_read32(entry->dev->seg, entry->dev->bus, + PCI_SLOT(entry->dev->devfn), + PCI_FUNC(entry->dev->devfn), + entry->msi.mpos) >> + entry->msi_attrib.entry_nr) & 1; case PCI_CAP_ID_MSIX: return readl(entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET) & 1; } @@ -455,17 +461,20 @@ static hw_irq_controller pci_msi_nonmask .set_affinity = set_msi_affinity }; -static struct msi_desc* alloc_msi_entry(void) +static struct msi_desc *alloc_msi_entry(unsigned int nr) { struct msi_desc *entry; - entry = xmalloc(struct msi_desc); + entry = xmalloc_array(struct msi_desc, nr); if ( !entry ) return NULL; INIT_LIST_HEAD(&entry->list); - entry->dev = NULL; - entry->remap_index = -1; + while ( nr-- ) + { + entry[nr].dev = NULL; + entry[nr].remap_index = -1; + } return entry; } @@ -483,17 +492,24 @@ int setup_msi_irq(struct irq_desc *desc, int msi_free_irq(struct msi_desc *entry) { - destroy_irq(entry->irq); + unsigned int nr = entry->msi.nvec; + if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX ) { unsigned long start; start = (unsigned long)entry->mask_base & ~(PAGE_SIZE - 1); msix_put_fixmap(entry->dev, virt_to_fix(start)); + nr = 1; } - /* Free the unused IRTE if intr remap enabled */ - if ( iommu_intremap ) - iommu_update_ire_from_msi(entry, NULL); + while ( nr-- ) + { + destroy_irq(entry[nr].irq); + + /* Free the unused IRTE if intr remap enabled */ + if ( iommu_intremap ) + iommu_update_ire_from_msi(entry + nr, NULL); + } list_del(&entry->list); xfree(entry); @@ -526,11 +542,12 @@ static struct msi_desc *find_msi_entry(s **/ static int msi_capability_init(struct pci_dev *dev, int irq, - struct msi_desc **desc) + struct msi_desc **desc, + unsigned int nvec) { struct msi_desc *entry; int pos; - unsigned int maxvec, mpos; + unsigned int i, maxvec, mpos; u16 control, seg = dev->seg; u8 bus = dev->bus; u8 slot = 
PCI_SLOT(dev->devfn); @@ -540,27 +557,34 @@ static int msi_capability_init(struct pc pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSI); control = pci_conf_read16(seg, bus, slot, func, msi_control_reg(pos)); maxvec = multi_msi_capable(control); + if ( nvec > maxvec ) + return maxvec; control &= ~PCI_MSI_FLAGS_QSIZE; + multi_msi_enable(control, nvec); /* MSI Entry Initialization */ msi_set_enable(dev, 0); /* Ensure msi is disabled as I set it up */ - entry = alloc_msi_entry(); + entry = alloc_msi_entry(nvec); if ( !entry ) return -ENOMEM; - entry->msi_attrib.type = PCI_CAP_ID_MSI; - entry->msi_attrib.is_64 = is_64bit_address(control); - entry->msi_attrib.entry_nr = 0; - entry->msi_attrib.maskbit = is_mask_bit_support(control); - entry->msi_attrib.masked = 1; - entry->msi_attrib.pos = pos; mpos = msi_mask_bits_reg(pos, is_64bit_address(control)); - entry->msi.nvec = 1; + for ( i = 0; i < nvec; ++i ) + { + entry[i].msi_attrib.type = PCI_CAP_ID_MSI; + entry[i].msi_attrib.is_64 = is_64bit_address(control); + entry[i].msi_attrib.entry_nr = i; + entry[i].msi_attrib.maskbit = is_mask_bit_support(control); + entry[i].msi_attrib.masked = 1; + entry[i].msi_attrib.pos = pos; + if ( entry[i].msi_attrib.maskbit ) + entry[i].msi.mpos = mpos; + entry[i].msi.nvec = 0; + entry[i].dev = dev; + } + entry->msi.nvec = nvec; entry->irq = irq; - if ( is_mask_bit_support(control) ) - entry->msi.mpos = mpos; - entry->dev = dev; if ( entry->msi_attrib.maskbit ) { u32 maskbits; @@ -688,7 +712,7 @@ static int msix_capability_init(struct p if ( desc ) { - entry = alloc_msi_entry(); + entry = alloc_msi_entry(1); if ( !entry ) return -ENOMEM; ASSERT(msi); @@ -846,7 +870,6 @@ static int msix_capability_init(struct p static int __pci_enable_msi(struct msi_info *msi, struct msi_desc **desc) { - int status; struct pci_dev *pdev; struct msi_desc *old_desc; @@ -875,8 +898,7 @@ static int __pci_enable_msi(struct msi_i pci_disable_msi(old_desc); } - status = msi_capability_init(pdev, msi->irq, desc); - return status; + return msi_capability_init(pdev, msi->irq, desc, msi->entry_nr); } static void __pci_disable_msi(struct msi_desc *entry) @@ -1096,6 +1118,8 @@ int pci_restore_msi_state(struct pci_dev list_for_each_entry_safe( entry, tmp, &pdev->msi_list, list ) { + unsigned int i = 0, nr = 1; + irq = entry->irq; desc = &irq_desc[irq]; @@ -1105,30 +1129,58 @@ int pci_restore_msi_state(struct pci_dev if (desc->msi_desc != entry) { + bogus: dprintk(XENLOG_ERR, - "Restore MSI for dev %04x:%02x:%02x:%x not set before?\n", + "Restore MSI for %04x:%02x:%02x:%u entry %u not set?\n", pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn), - PCI_FUNC(pdev->devfn)); + PCI_FUNC(pdev->devfn), i); spin_unlock_irqrestore(&desc->lock, flags); return -EINVAL; } if ( entry->msi_attrib.type == PCI_CAP_ID_MSI ) + { msi_set_enable(pdev, 0); + nr = entry->msi.nvec; + } else if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX ) msix_set_enable(pdev, 0); msg = entry->msg; write_msi_msg(entry, &msg); - msi_set_mask_bit(desc, entry->msi_attrib.masked); + for ( i = 0; ; ) + { + msi_set_mask_bit(desc, entry[i].msi_attrib.masked); + spin_unlock_irqrestore(&desc->lock, flags); + + if ( !--nr ) + break; + + desc = &irq_desc[entry[++i].irq]; + spin_lock_irqsave(&desc->lock, flags); + if ( desc->msi_desc != entry + i ) + goto bogus; + } + + spin_unlock_irqrestore(&desc->lock, flags); if ( entry->msi_attrib.type == PCI_CAP_ID_MSI ) + { + unsigned int cpos = msi_control_reg(entry->msi_attrib.pos); + u16 control = pci_conf_read16(pdev->seg, pdev->bus, + 
PCI_SLOT(pdev->devfn), + PCI_FUNC(pdev->devfn), cpos); + + control &= ~PCI_MSI_FLAGS_QSIZE; + multi_msi_enable(control, entry->msi.nvec); + pci_conf_write16(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn), + PCI_FUNC(pdev->devfn), cpos, control); + msi_set_enable(pdev, 1); + } else if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX ) msix_set_enable(pdev, 1); - - spin_unlock_irqrestore(&desc->lock, flags); } return 0; --- a/xen/arch/x86/physdev.c +++ b/xen/arch/x86/physdev.c @@ -140,8 +140,11 @@ int physdev_map_pirq(domid_t domid, int break; case MAP_PIRQ_TYPE_MSI: + if ( !msi->table_base ) + msi->entry_nr = 1; irq = *index; if ( irq == -1 ) + case MAP_PIRQ_TYPE_MULTI_MSI: irq = create_irq(NUMA_NO_NODE); if ( irq < nr_irqs_gsi || irq >= nr_irqs ) @@ -179,6 +182,30 @@ int physdev_map_pirq(domid_t domid, int goto done; } } + else if ( type == MAP_PIRQ_TYPE_MULTI_MSI ) + { + if ( msi->entry_nr <= 0 || msi->entry_nr > 32 ) + ret = -EDOM; + else if ( msi->entry_nr != 1 && !iommu_intremap ) + ret = -EOPNOTSUPP; + else + { + while ( msi->entry_nr & (msi->entry_nr - 1) ) + msi->entry_nr += msi->entry_nr & -msi->entry_nr; + pirq = get_free_pirqs(d, msi->entry_nr); + if ( pirq < 0 ) + { + while ( (msi->entry_nr >>= 1) > 1 ) + if ( get_free_pirqs(d, msi->entry_nr) > 0 ) + break; + dprintk(XENLOG_G_ERR, "dom%d: no block of %d free pirqs\n", + d->domain_id, msi->entry_nr << 1); + ret = pirq; + } + } + if ( ret < 0 ) + goto done; + } else { pirq = get_free_pirq(d, type); @@ -210,8 +237,15 @@ int physdev_map_pirq(domid_t domid, int done: spin_unlock(&d->event_lock); spin_unlock(&pcidevs_lock); - if ( (ret != 0) && (type == MAP_PIRQ_TYPE_MSI) && (*index == -1) ) - destroy_irq(irq); + if ( ret != 0 ) + switch ( type ) + { + case MAP_PIRQ_TYPE_MSI: + if ( *index == -1 ) + case MAP_PIRQ_TYPE_MULTI_MSI: + destroy_irq(irq); + break; + } free_domain: rcu_unlock_domain(d); return ret; @@ -390,14 +424,22 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H if ( copy_from_guest(&map, arg, 1) != 0 ) break; - if ( map.type == MAP_PIRQ_TYPE_MSI_SEG ) + switch ( map.type ) { + case MAP_PIRQ_TYPE_MSI_SEG: map.type = MAP_PIRQ_TYPE_MSI; msi.seg = map.bus >> 16; - } - else - { + break; + + case MAP_PIRQ_TYPE_MULTI_MSI: + if ( map.table_base ) + return -EINVAL; + msi.seg = map.bus >> 16; + break; + + default: msi.seg = 0; + break; } msi.bus = map.bus; msi.devfn = map.devfn; @@ -406,6 +448,8 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H ret = physdev_map_pirq(map.domid, map.type, &map.index, &map.pirq, &msi); + if ( map.type == MAP_PIRQ_TYPE_MULTI_MSI ) + map.entry_nr = msi.entry_nr; if ( __copy_to_guest(arg, &map, 1) ) ret = -EFAULT; break; --- a/xen/include/asm-x86/irq.h +++ b/xen/include/asm-x86/irq.h @@ -141,6 +141,7 @@ int map_domain_pirq(struct domain *d, in void *data); int unmap_domain_pirq(struct domain *d, int pirq); int get_free_pirq(struct domain *d, int type); +int get_free_pirqs(struct domain *, unsigned int nr); void free_domain_pirqs(struct domain *d); int map_domain_emuirq_pirq(struct domain *d, int pirq, int irq); int unmap_domain_pirq_emuirq(struct domain *d, int pirq); --- a/xen/include/asm-x86/msi.h +++ b/xen/include/asm-x86/msi.h @@ -145,7 +145,7 @@ int msi_free_irq(struct msi_desc *entry) #define multi_msi_capable(control) \ (1 << ((control & PCI_MSI_FLAGS_QMASK) >> 1)) #define multi_msi_enable(control, num) \ - control |= (((num >> 1) << 4) & PCI_MSI_FLAGS_QSIZE); + control |= (((fls(num) - 1) << 4) & PCI_MSI_FLAGS_QSIZE); #define is_64bit_address(control) (!!(control & PCI_MSI_FLAGS_64BIT)) #define 
is_mask_bit_support(control) (!!(control & PCI_MSI_FLAGS_MASKBIT))
 #define msi_enable(control, num) multi_msi_enable(control, num); \
--- a/xen/include/public/physdev.h
+++ b/xen/include/public/physdev.h
@@ -151,21 +151,22 @@ DEFINE_XEN_GUEST_HANDLE(physdev_irq_t);
 #define MAP_PIRQ_TYPE_GSI               0x1
 #define MAP_PIRQ_TYPE_UNKNOWN           0x2
 #define MAP_PIRQ_TYPE_MSI_SEG           0x3
+#define MAP_PIRQ_TYPE_MULTI_MSI         0x4

 #define PHYSDEVOP_map_pirq               13
 struct physdev_map_pirq {
     domid_t domid;
     /* IN */
     int type;
-    /* IN */
+    /* IN (ignored for ..._MULTI_MSI) */
     int index;
     /* IN or OUT */
     int pirq;
-    /* IN - high 16 bits hold segment for MAP_PIRQ_TYPE_MSI_SEG */
+    /* IN - high 16 bits hold segment for ..._MSI_SEG and ..._MULTI_MSI */
     int bus;
     /* IN */
     int devfn;
-    /* IN */
+    /* IN (also OUT for ..._MULTI_MSI) */
     int entry_nr;
     /* IN */
     uint64_t table_base;
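Two pieces of arithmetic in this patch are worth spelling out:
physdev_map_pirq() rounds the requested vector count up to a power of two
by repeatedly adding the lowest set bit, and the fixed multi_msi_enable()
puts log2 of the vector count into the QSIZE field (the old num >> 1
encoding only happens to be right for 1, 2 and 4 vectors and breaks for 8,
16 and 32). A sketch with worked values:

/* Round n up to a power of two by adding its lowest set bit until only
 * one bit remains (the loop from physdev_map_pirq()): 5 -> 6 -> 8,
 * 12 -> 16. */
static unsigned int round_up_pow2(unsigned int n)
{
    while ( n & (n - 1) )
        n += n & -n;
    return n;
}

/* QSIZE holds log2(nvec); the corrected macro uses fls(num) - 1. nvec is
 * assumed to be a power of two here. */
static unsigned int qsize_field(unsigned int nvec)
{
    unsigned int log2 = 0;

    while ( nvec >>= 1 )
        ++log2;
    return log2 << 4;        /* bits 6:4 of the MSI message control word */
}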
The requested vector count is to be passed in struct xen_pci_op's info
field. Upon failure, if a smaller vector count might work, the backend
will pass that smaller count in the value field (which so far is always
being set to zero in the error path).

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/include/public/io/pciif.h
+++ b/xen/include/public/io/pciif.h
@@ -46,6 +46,7 @@
 #define XEN_PCI_OP_aer_resume           (7)
 #define XEN_PCI_OP_aer_mmio             (8)
 #define XEN_PCI_OP_aer_slotreset        (9)
+#define XEN_PCI_OP_enable_multi_msi     (10)

 /* xen_pci_op error numbers */
 #define XEN_PCI_ERR_success             (0)
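A hypothetical frontend-side use of the new command might look as follows.
This is a sketch only: do_pci_op() stands in for whatever ring/transport
helper the frontend has, while cmd, err, info and value are the existing
struct xen_pci_op members, used as the description above specifies.

/* Request nvec MSI vectors; on failure, report back the smaller count
 * the backend says might work (0 if none was suggested). */
static int enable_multi_msi(struct xen_pci_op *op, unsigned int nvec,
                            unsigned int *fallback)
{
    op->cmd = XEN_PCI_OP_enable_multi_msi;
    op->info = nvec;               /* requested vector count */
    op->value = 0;

    do_pci_op(op);                 /* hypothetical transport call */

    if ( op->err == XEN_PCI_ERR_success )
        return 0;
    *fallback = op->value;         /* smaller feasible count, if any */
    return -1;
}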
>>> On 19.04.13 at 12:50, "Jan Beulich" <JBeulich@suse.com> wrote:
> 1: AMD IOMMU: allocate IRTE entries instead of using a static mapping
> 2: AMD IOMMU: untie remap and vector maps
> 3: VT-d: enable for multi-vector MSI
> 4: AMD IOMMU: enable for multi-vector MSI
> 5: x86: enable multi-vector MSI
> 6: pciif: add multi-vector-MSI command

For reference I'm also sending our kernel side code, as just discussed on
irc. I'm also including the fragment that I created for pv-ops, which I
stopped when seeing that I'd need to play with xen_bind_pirq_msi_to_irq()
in any case.

And please recall - the hypervisor side works only on VT-d so far.

Jan
On 4/19/2013 5:59 AM, Jan Beulich wrote:
> --- a/xen/arch/x86/msi.c
> +++ b/xen/arch/x86/msi.c
> @@ -238,6 +238,11 @@ static int write_msi_msg(struct msi_desc
>      u8 bus = dev->bus;
>      u8 slot = PCI_SLOT(dev->devfn);
>      u8 func = PCI_FUNC(dev->devfn);
> +    int nr = entry->msi_attrib.entry_nr;
> +
> +    ASSERT((msg->data & (entry[-nr].msi.nvec - 1)) == nr);
> +    if ( nr )
> +        return 0;

Jan,

This logic seems incorrect. Did you mean to write --nr? This causes an
assertion here. Also, investigation shows the value of nr is 0 here.

Suravee
>>> On 23.04.13 at 02:55, Suravee Suthikulanit <suravee.suthikulpanit@amd.com> wrote:
> On 4/19/2013 5:59 AM, Jan Beulich wrote:
>> --- a/xen/arch/x86/msi.c
>> +++ b/xen/arch/x86/msi.c
>> @@ -238,6 +238,11 @@ static int write_msi_msg(struct msi_desc
>>      u8 bus = dev->bus;
>>      u8 slot = PCI_SLOT(dev->devfn);
>>      u8 func = PCI_FUNC(dev->devfn);
>> +    int nr = entry->msi_attrib.entry_nr;
>> +
>> +    ASSERT((msg->data & (entry[-nr].msi.nvec - 1)) == nr);
>> +    if ( nr )
>> +        return 0;
>
> This logic seems incorrect. Did you mean to write --nr?

No, this indeed has to be -nr (i.e. the "master" entry, which is the first
one in the array).

> This causes an assertion here. Also, investigation shows the
> value of nr is 0 here.

nr being 0 here is perfectly fine, meaning this is the first ("master")
entry of a multi-vector device (it can't be a single-vector one, as in
that case entry[0].msi.nvec == 1, i.e. the & yields zero regardless of
msg->data).

And the assertion should hold, due to

    *data = (msg->data & ~(INTREMAP_ENTRIES - 1)) | offset;

in update_intremap_entry_from_msi_msg(), and alloc_intremap_entry()
returning only aligned blocks.

So the question isn't just what value nr there has, but also what the
other involved values are.

Jan
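The invariant Jan describes can be checked numerically: with blocks handed
out nr-aligned and the IRTE index written into the low bits of msg->data,
the low log2(nvec) bits of each entry's data equal its entry_nr. A tiny
standalone check (block start and vector count assumed for illustration):

#include <assert.h>

int main(void)
{
    unsigned int base = 16;   /* assumed aligned block start (multiple of nvec) */
    unsigned int nvec = 4;    /* entry[-nr].msi.nvec of the master entry */

    for ( unsigned int nr = 0; nr < nvec; ++nr )
    {
        unsigned int data = base + nr;  /* low bits of msg->data for entry nr */

        /* the ASSERT() in write_msi_msg() */
        assert( (data & (nvec - 1)) == nr );
    }
    return 0;
}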
Suravee Suthikulanit
2013-Apr-23 13:21 UTC
Re: [PATCH 1/6] AMD IOMMU: allocate IRTE entries instead of using a static mapping
I am now reproducing the issue with the USB devices not working with this
patch again. I'll continue to investigate more.

Suravee

On 4/19/2013 5:57 AM, Jan Beulich wrote:
> For multi-vector MSI, where we surely don't want to allocate contiguous
> vectors and be able to set affinities of the individual vectors
> separately, we need to drop the use of the tuple of vector and delivery
> mode to determine the IRTE to use, and instead allocate IRTEs (which imo
> should have been done from the beginning).
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> [...]
> @@ -343,10 +392,8 @@ static void update_intremap_entry_from_m > if ( ( req_id != alias_id ) && > get_ivrs_mappings(iommu->seg)[alias_id].intremap_table != NULL ) > { > - spin_lock_irqsave(lock, flags); > - entry = (u32*)get_intremap_entry(iommu->seg, alias_id, offset); > - update_intremap_entry(entry, vector, delivery_mode, dest_mode, dest); > - spin_unlock_irqrestore(lock, flags); > + BUG_ON(get_ivrs_mappings(iommu->seg)[req_id].intremap_table !> + get_ivrs_mappings(iommu->seg)[alias_id].intremap_table); > } > > done: > @@ -358,19 +405,22 @@ done: > amd_iommu_flush_intremap(iommu, alias_id); > spin_unlock_irqrestore(&iommu->lock, flags); > } > + > + return 0; > } > > static struct amd_iommu *_find_iommu_for_device(int seg, int bdf) > { > - struct amd_iommu *iommu = find_iommu_for_device(seg, bdf); > - > - if ( iommu ) > - return iommu; > + struct amd_iommu *iommu; > > list_for_each_entry ( iommu, &amd_iommu_head, list ) > if ( iommu->seg == seg && iommu->bdf == bdf ) > return NULL; > > + iommu = find_iommu_for_device(seg, bdf); > + if ( iommu ) > + return iommu; > + > AMD_IOMMU_DEBUG("No IOMMU for MSI dev = %04x:%02x:%02x.%u\n", > seg, PCI_BUS(bdf), PCI_SLOT(bdf), PCI_FUNC(bdf)); > return ERR_PTR(-EINVAL); > @@ -380,8 +430,9 @@ int amd_iommu_msi_msg_update_ire( > struct msi_desc *msi_desc, struct msi_msg *msg) > { > struct pci_dev *pdev = msi_desc->dev; > - int bdf, seg; > + int bdf, seg, rc; > struct amd_iommu *iommu; > + u32 data; > > bdf = pdev ? PCI_BDF2(pdev->bus, pdev->devfn) : hpet_sbdf.bdf; > seg = pdev ? pdev->seg : hpet_sbdf.seg; > @@ -390,11 +441,12 @@ int amd_iommu_msi_msg_update_ire( > if ( IS_ERR_OR_NULL(iommu) ) > return PTR_ERR(iommu); > > - if ( msi_desc->remap_index >= 0 ) > + if ( msi_desc->remap_index >= 0 && !msg ) > { > do { > update_intremap_entry_from_msi_msg(iommu, bdf, > - &msi_desc->remap_index, NULL); > + &msi_desc->remap_index, > + NULL, NULL); > if ( !pdev || !pdev->phantom_stride ) > break; > bdf += pdev->phantom_stride; > @@ -409,19 +461,39 @@ int amd_iommu_msi_msg_update_ire( > return 0; > > do { > - update_intremap_entry_from_msi_msg(iommu, bdf, &msi_desc->remap_index, > - msg); > - if ( !pdev || !pdev->phantom_stride ) > + rc = update_intremap_entry_from_msi_msg(iommu, bdf, > + &msi_desc->remap_index, > + msg, &data); > + if ( rc || !pdev || !pdev->phantom_stride ) > break; > bdf += pdev->phantom_stride; > } while ( PCI_SLOT(bdf) == PCI_SLOT(pdev->devfn) ); > > - return 0; > + msg->data = data; > + return rc; > } > > void amd_iommu_read_msi_from_ire( > struct msi_desc *msi_desc, struct msi_msg *msg) > { > + unsigned int offset = msg->data & (INTREMAP_ENTRIES - 1); > + const struct pci_dev *pdev = msi_desc->dev; > + u16 bdf = pdev ? PCI_BDF2(pdev->bus, pdev->devfn) : hpet_sbdf.bdf; > + u16 seg = pdev ? 
pdev->seg : hpet_sbdf.seg; > + const u32 *entry; > + > + if ( IS_ERR_OR_NULL(_find_iommu_for_device(seg, bdf)) ) > + return; > + > + entry = get_intremap_entry(seg, get_dma_requestor_id(seg, bdf), offset); > + > + msg->data &= ~(INTREMAP_ENTRIES - 1); > + msg->data |= get_field_from_reg_u32(*entry, > + INT_REMAP_ENTRY_INTTYPE_MASK, > + INT_REMAP_ENTRY_INTTYPE_SHIFT) << 8; > + msg->data |= get_field_from_reg_u32(*entry, > + INT_REMAP_ENTRY_VECTOR_MASK, > + INT_REMAP_ENTRY_VECTOR_SHIFT); > } > > int __init amd_iommu_free_intremap_table( > @@ -438,12 +510,14 @@ int __init amd_iommu_free_intremap_table > return 0; > } > > -void* __init amd_iommu_alloc_intremap_table(void) > +void* __init amd_iommu_alloc_intremap_table(unsigned long **inuse_map) > { > void *tb; > tb = __alloc_amd_iommu_tables(INTREMAP_TABLE_ORDER); > BUG_ON(tb == NULL); > memset(tb, 0, PAGE_SIZE * (1UL << INTREMAP_TABLE_ORDER)); > + *inuse_map = xzalloc_array(unsigned long, BITS_TO_LONGS(INTREMAP_ENTRIES)); > + BUG_ON(*inuse_map == NULL); > return tb; > } > > --- a/xen/drivers/passthrough/amd/pci_amd_iommu.c > +++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c > @@ -622,7 +622,7 @@ const struct iommu_ops amd_iommu_ops = { > .get_device_group_id = amd_iommu_group_id, > .update_ire_from_apic = amd_iommu_ioapic_update_ire, > .update_ire_from_msi = amd_iommu_msi_msg_update_ire, > - .read_apic_from_ire = __io_apic_read, > + .read_apic_from_ire = amd_iommu_read_ioapic_from_ire, > .read_msi_from_ire = amd_iommu_read_msi_from_ire, > .setup_hpet_msi = amd_setup_hpet_msi, > .suspend = amd_iommu_suspend, > --- a/xen/include/asm-x86/amd-iommu.h > +++ b/xen/include/asm-x86/amd-iommu.h > @@ -119,6 +119,7 @@ struct ivrs_mappings { > > /* per device interrupt remapping table */ > void *intremap_table; > + unsigned long *intremap_inuse; > spinlock_t intremap_lock; > > /* ivhd device data settings */ > --- a/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h > +++ b/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h > @@ -458,10 +458,6 @@ > #define MAX_AMD_IOMMUS 32 > > /* interrupt remapping table */ > -#define INT_REMAP_INDEX_DM_MASK 0x1C00 > -#define INT_REMAP_INDEX_DM_SHIFT 10 > -#define INT_REMAP_INDEX_VECTOR_MASK 0x3FC > -#define INT_REMAP_INDEX_VECTOR_SHIFT 2 > #define INT_REMAP_ENTRY_REMAPEN_MASK 0x00000001 > #define INT_REMAP_ENTRY_REMAPEN_SHIFT 0 > #define INT_REMAP_ENTRY_SUPIOPF_MASK 0x00000002 > --- a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h > +++ b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h > @@ -89,10 +89,12 @@ struct amd_iommu *find_iommu_for_device( > > /* interrupt remapping */ > int amd_iommu_setup_ioapic_remapping(void); > -void *amd_iommu_alloc_intremap_table(void); > +void *amd_iommu_alloc_intremap_table(unsigned long **); > int amd_iommu_free_intremap_table(u16 seg, struct ivrs_mappings *); > void amd_iommu_ioapic_update_ire( > unsigned int apic, unsigned int reg, unsigned int value); > +unsigned int amd_iommu_read_ioapic_from_ire( > + unsigned int apic, unsigned int reg); > int amd_iommu_msi_msg_update_ire( > struct msi_desc *msi_desc, struct msi_msg *msg); > void amd_iommu_read_msi_from_ire( > @@ -101,15 +103,17 @@ int amd_setup_hpet_msi(struct msi_desc * > > extern struct ioapic_sbdf { > u16 bdf, seg; > - unsigned long *pin_setup; > + u16 *pin_2_idx; > } ioapic_sbdf[MAX_IO_APICS]; > -extern void *shared_intremap_table; > > extern struct hpet_sbdf { > u16 bdf, seg, id; > struct amd_iommu *iommu; > } hpet_sbdf; > > +extern void *shared_intremap_table; > +extern unsigned long *shared_intremap_inuse; > + > /* power management 
support */ > void amd_iommu_resume(void); > void amd_iommu_suspend(void); > >
Suravee Suthikulanit
2013-Apr-23 15:06 UTC
Re: [PATCH 1/6] AMD IOMMU: allocate IRTE entries instead of using a static mapping
On 4/23/2013 8:21 AM, Suravee Suthikulanit wrote:
> I am now reproducing the issue with the USB devices not working with
> this patch again. I'll continue to investigate more.
>
> Suravee

Ok, I have more of an update on the issue. Below, I include the output
from "xl debug-key i". It is showing several IRQs having the same vector
"b0". This is not the case when booting with Xen w/o the patch. On my
system, IRQ 18 is for my USB keyboard.

Suravee

## INTERRUPT BINDING (i)

(XEN) Guest interrupt information:
(XEN) IRQ:  0 affinity:1 vec:f0 type=IO-APIC-edge  status=00000000 timer_interrupt+0/0x18a
(XEN) IRQ:  1 affinity:1 vec:30 type=IO-APIC-edge  status=00000002 mapped, unbound
(XEN) IRQ:  3 affinity:1 vec:38 type=IO-APIC-edge  status=00000002 mapped, unbound
(XEN) IRQ:  4 affinity:1 vec:40 type=IO-APIC-edge  status=00000002 mapped, unbound
(XEN) IRQ:  5 affinity:f vec:48 type=IO-APIC-edge  status=00000002 mapped, unbound
(XEN) IRQ:  6 affinity:1 vec:50 type=IO-APIC-edge  status=00000002 mapped, unbound
(XEN) IRQ:  7 affinity:1 vec:58 type=IO-APIC-edge  status=00000002 mapped, unbound
(XEN) IRQ:  8 affinity:1 vec:60 type=IO-APIC-edge  status=00000010 in-flight=0 domain-list=0:  8(----),
(XEN) IRQ:  9 affinity:1 vec:68 type=IO-APIC-level status=00000010 in-flight=0 domain-list=0:  9(----),
(XEN) IRQ: 10 affinity:1 vec:70 type=IO-APIC-edge  status=00000002 mapped, unbound
(XEN) IRQ: 11 affinity:1 vec:78 type=IO-APIC-edge  status=00000002 mapped, unbound
(XEN) IRQ: 12 affinity:1 vec:88 type=IO-APIC-edge  status=00000002 mapped, unbound
(XEN) IRQ: 13 affinity:f vec:90 type=IO-APIC-edge  status=00000002 mapped, unbound
(XEN) IRQ: 14 affinity:1 vec:98 type=IO-APIC-edge  status=00000002 mapped, unbound
(XEN) IRQ: 15 affinity:1 vec:a0 type=IO-APIC-edge  status=00000002 mapped, unbound
(XEN) IRQ: 16 affinity:1 vec:b0 type=IO-APIC-level status=00000010 in-flight=0 domain-list=0: 16(----),
(XEN) IRQ: 17 affinity:1 vec:b8 type=IO-APIC-level status=00000010 in-flight=0 domain-list=0: 17(----),
(XEN) IRQ: 18 affinity:1 vec:a8 type=IO-APIC-level status=00000010 in-flight=0 domain-list=0: 18(----),
(XEN) IRQ: 19 affinity:f vec:d8 type=IO-APIC-level status=00000002 mapped, unbound
(XEN) IRQ: 24 affinity:1 vec:28 type=AMD-IOMMU-MSI status=00000000 iommu_interrupt_handler+0/0x57
(XEN) IRQ: 25 affinity:1 vec:c0 type=PCI-MSI       status=00000010 in-flight=0 domain-list=0:279(----),
(XEN) IRQ: 26 affinity:1 vec:c8 type=PCI-MSI       status=00000010 in-flight=0 domain-list=0:278(----),
(XEN) IRQ: 27 affinity:1 vec:d0 type=PCI-MSI       status=00000010 in-flight=0 domain-list=0:277(----),
(XEN) IRQ: 28 affinity:1 vec:21 type=PCI-MSI       status=00000010 in-flight=0 domain-list=0:276(----),
(XEN) IRQ: 29 affinity:1 vec:29 type=PCI-MSI/-X    status=00000010 in-flight=0 domain-list=0:275(----),
(XEN) IRQ: 30 affinity:1 vec:31 type=PCI-MSI/-X    status=00000010 in-flight=0 domain-list=0:274(----),
(XEN) IRQ: 31 affinity:1 vec:39 type=PCI-MSI/-X    status=00000010 in-flight=0 domain-list=0:273(----),
(XEN) IRQ: 32 affinity:1 vec:41 type=PCI-MSI/-X    status=00000010 in-flight=0 domain-list=0:272(----),
(XEN) IRQ: 33 affinity:1 vec:49 type=PCI-MSI/-X    status=00000010 in-flight=0 domain-list=0:271(----),
(XEN) IRQ: 34 affinity:1 vec:51 type=PCI-MSI/-X    status=00000010 in-flight=0 domain-list=0:270(----),
(XEN) IRQ: 35 affinity:1 vec:59 type=PCI-MSI       status=00000010 in-flight=0 domain-list=0:269(----),
(XEN) IRQ: 36 affinity:1 vec:61 type=PCI-MSI       status=00000010 in-flight=0 domain-list=0:268(----),
(XEN) IRQ: 37 affinity:1 vec:69 type=PCI-MSI       status=00000010 in-flight=0 domain-list=0:267(----),
(XEN) IRQ: 38 affinity:1 vec:71 type=PCI-MSI       status=00000010 in-flight=0 domain-list=0:266(----),
(XEN) IO-APIC interrupt information:
(XEN)   IRQ  0 Vec240:
(XEN)     Apic 0x00, Pin  2: vec=f0 delivery=Fixed dest=L status=0 polarity=0 irr=0 trig=E mask=0 dest_id:1
(XEN)   IRQ  1 Vec 48:
(XEN)     Apic 0x00, Pin  1: vec=b0 delivery=Fixed dest=L status=0 polarity=0 irr=0 trig=E mask=0 dest_id:1
(XEN)   IRQ  3 Vec 56:
(XEN)     Apic 0x00, Pin  3: vec=38 delivery=Fixed dest=L status=0 polarity=0 irr=0 trig=E mask=0 dest_id:1
(XEN)   IRQ  4 Vec 64:
(XEN)     Apic 0x00, Pin  4: vec=40 delivery=Fixed dest=L status=0 polarity=0 irr=0 trig=E mask=0 dest_id:1
(XEN)   IRQ  5 Vec 72:
(XEN)     Apic 0x00, Pin  5: vec=48 delivery=LoPri dest=L status=0 polarity=0 irr=0 trig=E mask=1 dest_id:15
(XEN)   IRQ  6 Vec 80:
(XEN)     Apic 0x00, Pin  6: vec=50 delivery=Fixed dest=L status=0 polarity=0 irr=0 trig=E mask=0 dest_id:1
(XEN)   IRQ  7 Vec 88:
(XEN)     Apic 0x00, Pin  7: vec=58 delivery=Fixed dest=L status=0 polarity=0 irr=0 trig=E mask=0 dest_id:1
(XEN)   IRQ  8 Vec 96:
(XEN)     Apic 0x00, Pin  8: vec=60 delivery=Fixed dest=L status=0 polarity=0 irr=0 trig=E mask=0 dest_id:1
(XEN)   IRQ  9 Vec104:
(XEN)     Apic 0x00, Pin  9: vec=68 delivery=Fixed dest=L status=0 polarity=1 irr=0 trig=L mask=0 dest_id:1
(XEN)   IRQ 10 Vec112:
(XEN)     Apic 0x00, Pin 10: vec=70 delivery=Fixed dest=L status=0 polarity=0 irr=0 trig=E mask=0 dest_id:1
(XEN)   IRQ 11 Vec120:
(XEN)     Apic 0x00, Pin 11: vec=78 delivery=Fixed dest=L status=0 polarity=0 irr=0 trig=E mask=0 dest_id:1
(XEN)   IRQ 12 Vec136:
(XEN)     Apic 0x00, Pin 12: vec=b0 delivery=Fixed dest=L status=0 polarity=0 irr=0 trig=E mask=0 dest_id:1
(XEN)   IRQ 13 Vec144:
(XEN)     Apic 0x00, Pin 13: vec=b0 delivery=Fixed dest=L status=0 polarity=0 irr=0 trig=E mask=1 dest_id:15
(XEN)   IRQ 14 Vec152:
(XEN)     Apic 0x00, Pin 14: vec=b0 delivery=Fixed dest=L status=0 polarity=0 irr=0 trig=E mask=0 dest_id:1
(XEN)   IRQ 15 Vec160:
(XEN)     Apic 0x00, Pin 15: vec=b0 delivery=Fixed dest=L status=0 polarity=0 irr=0 trig=E mask=0 dest_id:1
(XEN)   IRQ 16 Vec176:
(XEN)     Apic 0x00, Pin 16: vec=b0 delivery=Fixed dest=L status=0 polarity=1 irr=0 trig=L mask=0 dest_id:1
(XEN)   IRQ 17 Vec184:
(XEN)     Apic 0x00, Pin 17: vec=b0 delivery=Fixed dest=L status=0 polarity=1 irr=1 trig=L mask=0 dest_id:1
(XEN)   IRQ 18 Vec168:
(XEN)     Apic 0x00, Pin 18: vec=b0 delivery=Fixed dest=L status=0 polarity=1 irr=1 trig=L mask=0 dest_id:1
(XEN)   IRQ 19 Vec216:
(XEN)     Apic 0x00, Pin 19: vec=b0 delivery=Fixed dest=L status=0 polarity=1 irr=0 trig=L mask=1 dest_id:15
Jan Beulich
2013-Apr-24 13:34 UTC
Re: [PATCH 1/6] AMD IOMMU: allocate IRTE entries instead of using a static mapping
>>> On 23.04.13 at 17:06, Suravee Suthikulanit <suravee.suthikulpanit@amd.com> wrote:
> On 4/23/2013 8:21 AM, Suravee Suthikulanit wrote:
>> I am now reproducing the issue with the USB devices not working with
>> this patch again. I'll continue to investigate more.
>>
>> Suravee
> Ok, I have more of an update on the issue. Below, I include the output
> from "xl debug-key i". It is showing several IRQs having the same
> vector "b0". This is not the case when booting with Xen w/o the patch.

Right, and I spotted a bug in the respective code, but I can't readily
connect that bug to the behavior you observed (i.e. I can't explain why
the bad vector would be the same all the time).

Nevertheless, attached is a fixed version of the first patch of the most
recent series - let's see how much of a difference this makes.

Jan
suravee suthikulpanit
2013-Apr-24 21:52 UTC
Re: [PATCH 1/6] AMD IOMMU: allocate IRTE entries instead of using a static mapping
On 04/24/2013 08:34 AM, Jan Beulich wrote:
>>>> On 23.04.13 at 17:06, Suravee Suthikulanit <suravee.suthikulpanit@amd.com> wrote:
>> On 4/23/2013 8:21 AM, Suravee Suthikulanit wrote:
>>> I am now reproducing the issue with the USB devices not working with
>>> this patch again. I'll continue to investigate more.
>>>
>>> Suravee
>> Ok, I have more of an update on the issue. Below, I include the output
>> from "xl debug-key i". It is showing several IRQs having the same
>> vector "b0". This is not the case when booting with Xen w/o the patch.
> Right, and I spotted a bug in the respective code, but I can't
> readily connect that bug to the behavior you observed (i.e. I
> can't explain why the bad vector would be the same all the time).
>
> Nevertheless, attached is a fixed version of the first patch of
> the most recent series - let's see how much of a difference this
> makes.
>
> Jan

Jan,

I have finally root-caused the issue. Here are the two patches that
should help fix it. The first patch fixes the issue with the
mouse/keyboard not working, while the second patch adds a debug key to
help dump the IRTE, which I used to debug the issue.

I have also included the xl dmesg from both "before" patching and
"after" applying patches 1, 2 and the fixes. These also include the
output from debug keys "M,z,i,j".

Suravee
Jan Beulich
2013-Apr-26 10:39 UTC
Re: [PATCH 1/6] AMD IOMMU: allocate IRTE entries instead of using a static mapping
>>> On 24.04.13 at 23:52, suravee suthikulpanit <suravee.suthikulpanit@amd.com> wrote:
> I have finally root-caused the issue. Here are the two patches that
> should help fix it.
> The first patch fixes the issue with the mouse/keyboard not working, while

That's not looking right: update_intremap_entry_from_ioapic() is very
well doing the necessary allocation if none happened at boot, or at
least the code to do so is there. If that one has a bug, we should aim
at fixing it instead of working around the issue. Did you check that
this

    offset = *index;
    if ( offset >= INTREMAP_ENTRIES )
    {
        offset = alloc_intremap_entry(iommu->seg, req_id, 1);
        if ( offset >= INTREMAP_ENTRIES )

in update_intremap_entry_from_ioapic() doesn't take effect for them?

Also conceptually I don't see why we would want to waste IRTEs for
IO-APIC pins that likely will never get used.

> the second patch adds a debug key to help dump the IRTE, which I used
> to debug the issue.

That one is certainly going to be helpful going forward, but you should
clearly use 'V' as the key, just like VT-d does, instead of occupying
yet another one.

Jan
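The lazy-allocation contract being debated hinges on pin_2_idx[] holding an out-of-range sentinel (0xffff) for pins that have no IRTE yet. A condensed sketch of that convention - simplified, hypothetical signatures rather than the literal patch code:

    u16 *pin_2_idx;   /* one entry per IO-APIC pin, preset to 0xffff */

    static int remap_pin(struct amd_iommu *iommu, u16 req_id, u16 *index)
    {
        unsigned int offset = *index;

        if ( offset >= INTREMAP_ENTRIES )   /* 0xffff sentinel: no IRTE yet */
        {
            offset = alloc_intremap_entry(iommu->seg, req_id);
            if ( offset >= INTREMAP_ENTRIES )
                return -ENOSPC;             /* remap table exhausted */
            *index = offset;                /* remember the slot for this pin */
        }
        /* ... program the IRTE at "offset" ... */
        return 0;
    }

If the sentinel is never written for some pins, the bounds check misfires and the allocation path is skipped - which is exactly what the next message uncovers.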
Suravee Suthikulanit
2013-Apr-26 17:13 UTC
Re: [PATCH 1/6] AMD IOMMU: allocate IRTE entries instead of using a static mapping
On 4/19/2013 5:57 AM, Jan Beulich wrote:
> --- a/xen/drivers/passthrough/amd/iommu_acpi.c
> +++ b/xen/drivers/passthrough/amd/iommu_acpi.c
>
> @@ -691,14 +694,16 @@ static u16 __init parse_ivhd_device_spec
>          ioapic_sbdf[special->handle].bdf = bdf;
>          ioapic_sbdf[special->handle].seg = seg;
>
> -        ioapic_sbdf[special->handle].pin_setup = xzalloc_array(
> -            unsigned long, BITS_TO_LONGS(nr_ioapic_entries[apic]));
> +        ioapic_sbdf[special->handle].pin_2_idx = xmalloc_array(
> +            u16, nr_ioapic_entries[apic]);
>          if ( nr_ioapic_entries[apic] &&
> -             !ioapic_sbdf[IO_APIC_ID(apic)].pin_setup )
> +             !ioapic_sbdf[IO_APIC_ID(apic)].pin_2_idx )
>          {
>              printk(XENLOG_ERR "IVHD Error: Out of memory\n");
>              return 0;
>          }
> +        memset(ioapic_sbdf[IO_APIC_ID(apic)].pin_2_idx, -1,
> +               nr_ioapic_entries[apic]);
>      }

Jan,

Ok, here is why the (offset >= INTREMAP_ENTRIES) check in
update_intremap_entry_from_ioapic() failed.

    memset(ioapic_sbdf[IO_APIC_ID(apic)].pin_2_idx, -1,
           nr_ioapic_entries[apic]);

should have been

    memset(ioapic_sbdf[IO_APIC_ID(apic)].pin_2_idx, -1,
           (nr_ioapic_entries[apic] * sizeof(u16)));

Since nr_ioapic_entries[apic] = 24, only pin_2_idx[0] through
pin_2_idx[11] are set to 0xffff. This causes pin_2_idx[12-23] to fail
the check.

Suravee.
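The underlying pitfall is general: memset() takes a byte count, so sizing it by element count only covers the whole array when elements are one byte wide. A standalone illustration in plain C (the 24-pin size mirrors the report above):

    #include <string.h>

    typedef unsigned short u16;

    #define NR_PINS 24                      /* nr_ioapic_entries[apic] above */

    static u16 pin_2_idx[NR_PINS];

    static void init_pin_map(void)
    {
        /* Wrong: memset() counts bytes, so only pin_2_idx[0..11] become
         * 0xffff; pins 12-23 keep an "allocated-looking" value. */
        memset(pin_2_idx, -1, NR_PINS);

        /* Right: scale by the element size so all 24 entries are set. */
        memset(pin_2_idx, -1, NR_PINS * sizeof(pin_2_idx[0]));
    }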
On 4/23/2013 1:26 AM, Jan Beulich wrote:
>>>> On 23.04.13 at 02:55, Suravee Suthikulanit <suravee.suthikulpanit@amd.com> wrote:
>> On 4/19/2013 5:59 AM, Jan Beulich wrote:
>>> --- a/xen/arch/x86/msi.c
>>> +++ b/xen/arch/x86/msi.c
>>> @@ -238,6 +238,11 @@ static int write_msi_msg(struct msi_desc
>>>      u8 bus = dev->bus;
>>>      u8 slot = PCI_SLOT(dev->devfn);
>>>      u8 func = PCI_FUNC(dev->devfn);
>>> +    int nr = entry->msi_attrib.entry_nr;
>>> +
>>> +    ASSERT((msg->data & (entry[-nr].msi.nvec - 1)) == nr);
>>> +    if ( nr )
>>> +        return 0;
>> This logic seems incorrect. Do you mean to write --nr?
> No, this indeed has to be -nr (i.e. the "master" entry, which is the
> first one in the array).
>
>> This causes an assertion here. Also, investigation shows the
>> value of nr is 0 here.
> nr being 0 here is perfectly fine, meaning this is the first ("master")
> entry of a multi-vector device (it can't be a single-vector one, as in
> that case entry[0].msi.nvec == 1, i.e. the & yields zero regardless
> of msg->data).
>
> And the assertion should hold, due to
>
>     *data = (msg->data & ~(INTREMAP_ENTRIES - 1)) | offset;
>
> in update_intremap_entry_from_msi_msg(), and
> alloc_intremap_entry() returning only aligned blocks.
>
> So the question isn't just what value nr there has, but also what
> the other involved values are.
>
> Jan

Ok, thanks for the explanation. Do you think you could add a comment in
the code? It was not quite clear at the beginning why we need this
assertion.

The problem occurs when the function
"xen/driver/passthrough/amd/iommu_init.c: enable_iommu()" tries to
initialize the IOMMU and calls "set_msi_affinity", which in turn calls
"write_msi_msg". At this point, "nvec" is still zero. So, the following
code should fix it.

    unsigned int nvec = entry[-nr].msi.nvec;

    if ( nvec > 0 )
        ASSERT((msg->data & (nvec - 1)) == nr);

Suravee
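To see why the assertion is expected to hold once nvec is populated: the allocator hands out nvec-aligned blocks, so the low bits of the written data word are exactly the entry number within the block. A self-contained sketch - alloc_block() is a toy stand-in for the aligned allocator, and the table size and msg_data value are illustrative:

    #include <assert.h>

    #define INTREMAP_ENTRIES 2048u   /* illustrative power-of-two table size */

    static unsigned int next_free;

    /* Toy stand-in: returns the base of an "nvec"-slot block with
     * base % nvec == 0 (nvec a power of two per the MSI spec). */
    static unsigned int alloc_block(unsigned int nvec)
    {
        unsigned int base = (next_free + nvec - 1) & ~(nvec - 1);

        next_free = base + nvec;
        return base;
    }

    int main(void)
    {
        unsigned int msg_data = 0xc000;   /* arbitrary example data word */
        unsigned int nvec = 4;            /* a 4-vector MSI block */
        unsigned int base = alloc_block(nvec);
        unsigned int nr;

        for ( nr = 0; nr < nvec; ++nr )
        {
            unsigned int data = (msg_data & ~(INTREMAP_ENTRIES - 1)) |
                                (base + nr);

            /* base contributes nothing below log2(nvec), so masking
             * recovers the entry number - the ASSERT in write_msi_msg(). */
            assert((data & (nvec - 1)) == nr);
        }
        return 0;
    }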
Jan Beulich
2013-Apr-29 07:31 UTC
Re: [PATCH 1/6] AMD IOMMU: allocate IRTE entries instead of using a static mapping
>>> On 26.04.13 at 19:13, Suravee Suthikulanit <suravee.suthikulpanit@amd.com> wrote:
> On 4/19/2013 5:57 AM, Jan Beulich wrote:
>> --- a/xen/drivers/passthrough/amd/iommu_acpi.c
>> +++ b/xen/drivers/passthrough/amd/iommu_acpi.c
>>
>> @@ -691,14 +694,16 @@ static u16 __init parse_ivhd_device_spec
>>          ioapic_sbdf[special->handle].bdf = bdf;
>>          ioapic_sbdf[special->handle].seg = seg;
>>
>> -        ioapic_sbdf[special->handle].pin_setup = xzalloc_array(
>> -            unsigned long, BITS_TO_LONGS(nr_ioapic_entries[apic]));
>> +        ioapic_sbdf[special->handle].pin_2_idx = xmalloc_array(
>> +            u16, nr_ioapic_entries[apic]);
>>          if ( nr_ioapic_entries[apic] &&
>> -             !ioapic_sbdf[IO_APIC_ID(apic)].pin_setup )
>> +             !ioapic_sbdf[IO_APIC_ID(apic)].pin_2_idx )
>>          {
>>              printk(XENLOG_ERR "IVHD Error: Out of memory\n");
>>              return 0;
>>          }
>> +        memset(ioapic_sbdf[IO_APIC_ID(apic)].pin_2_idx, -1,
>> +               nr_ioapic_entries[apic]);
>>      }
>
> Jan,
>
> Ok, here is why the (offset >= INTREMAP_ENTRIES) check in
> update_intremap_entry_from_ioapic() failed.
>
>     memset(ioapic_sbdf[IO_APIC_ID(apic)].pin_2_idx, -1,
>            nr_ioapic_entries[apic]);
>
> should have been
>
>     memset(ioapic_sbdf[IO_APIC_ID(apic)].pin_2_idx, -1,
>            (nr_ioapic_entries[apic] * sizeof(u16)));
>
> Since nr_ioapic_entries[apic] = 24, only pin_2_idx[0] through
> pin_2_idx[11] are set to 0xffff. This causes pin_2_idx[12-23] to fail
> the check.

Ah - so a one-line fix. Will integrate that into the current patch. Are
you saying that with that change, the patch finally works?

Thanks, Jan
Suthikulpanit, Suravee
2013-Apr-29 07:33 UTC
Re: [PATCH 1/6] AMD IOMMU: allocate IRTE entries instead of using a static mapping
Yes. I have tested this and checked the interrupt remapping tables. They
look fine and are working now.

Suravee

PS: Are you going to check in the patch to add the debug key also? (You
could change it to "V" as you pointed out.)

-----Original Message-----
From: Jan Beulich [mailto:JBeulich@suse.com]
Sent: Monday, April 29, 2013 2:31 AM
To: Suthikulpanit, Suravee
Cc: Shin, Jacob; xiantao.zhang@intel.com; xen-devel; Konrad Rzeszutek Wilk
Subject: Re: [PATCH 1/6] AMD IOMMU: allocate IRTE entries instead of using a static mapping
Importance: High
Jan Beulich
2013-Apr-29 07:42 UTC
Re: [PATCH 1/6] AMD IOMMU: allocate IRTE entries instead of using a static mapping
>>> On 29.04.13 at 09:33, "Suthikulpanit, Suravee" <Suravee.Suthikulpanit@amd.com> wrote:
> PS: Are you going to check in the patch to add the debug key also? (You
> could change it to "V" as you pointed out.)

That's the plan - albeit we'll need a word from George regarding doing
so still for 4.3 (for the multi-vector-MSI series we mostly settled on
not doing this for 4.3, considering how late in the game we are now, and
seeing that patch 5 still has at least one issue).

Jan