Jan Beulich
2010-Sep-20 13:23 UTC
[Xen-devel] [PATCH] x86: protect MSI-X table and pending bit array from guest writes
These structures are used by Xen, and hence guests must not be able
to fiddle with them.

qemu-dm currently plays with the MSI-X table, requiring Dom0 to
still have write access. This is broken (explicitly allowing the guest
write access to the mask bit) and should be fixed in qemu-dm, at which
time Dom0 won't need any special casing anymore.

The changes are made under the assumption that p2m_mmio_direct will
only ever be used for order 0 pages.

An open question is whether dealing with pv guests (including the
IOMMU-less case) is necessary, as handling mappings a domain may
already have in place at the time the first interrupt gets set up
would require scanning all of the guest's L1 page table pages.
Currently a hole still remains allowing PV guests to map these ranges
before actually setting up any MSI-X vector for a device.

An alternative would be to determine and insert the address ranges
earlier into mmio_ro_ranges, but that would require a hook in the PCI
config space writes, which is particularly problematic in case
MMCONFIG accesses are being used.

A second alternative would be to require Dom0 to report all devices
(or at least all MSI-X capable ones) regardless of whether they would
be used by that domain, and do so after resources got
determined/assigned for them (i.e. a second notification later than
the one currently happening from the PCI bus scan would be needed).

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Acked-by: Jiang, Yunhong <yunhong.jiang@intel.com>

---
--- 2010-09-20.orig/xen/arch/x86/mm.c  2010-09-20 14:29:04.000000000 +0200
+++ 2010-09-20/xen/arch/x86/mm.c  2010-09-20 10:00:07.000000000 +0200
@@ -822,7 +822,13 @@ get_page_from_l1e(
             return 0;
         }
 
-        return 1;
+        if ( !(l1f & _PAGE_RW) || IS_PRIV(pg_owner) ||
+             !rangeset_contains_singleton(mmio_ro_ranges, mfn) )
+            return 1;
+        dprintk(XENLOG_G_WARNING,
+                "d%d: Forcing read-only access to MFN %lx\n",
+                l1e_owner->domain_id, mfn);
+        return -1;
     }
 
     if ( unlikely(real_pg_owner != pg_owner) )
@@ -1178,9 +1184,15 @@ static int alloc_l1_table(struct page_in
 
     for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
     {
-        if ( is_guest_l1_slot(i) &&
-             unlikely(!get_page_from_l1e(pl1e[i], d, d)) )
-            goto fail;
+        if ( is_guest_l1_slot(i) )
+            switch ( get_page_from_l1e(pl1e[i], d, d) )
+            {
+            case 0:
+                goto fail;
+            case -1:
+                l1e_remove_flags(pl1e[i], _PAGE_RW);
+                break;
+            }
 
         adjust_guest_l1e(pl1e[i], d);
     }
@@ -1764,8 +1776,14 @@ static int mod_l1_entry(l1_pgentry_t *pl
             return rc;
         }
 
-        if ( unlikely(!get_page_from_l1e(nl1e, pt_dom, pg_dom)) )
+        switch ( get_page_from_l1e(nl1e, pt_dom, pg_dom) )
+        {
+        case 0:
             return 0;
+        case -1:
+            l1e_remove_flags(nl1e, _PAGE_RW);
+            break;
+        }
 
         adjust_guest_l1e(nl1e, pt_dom);
         if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu,
@@ -4919,8 +4937,9 @@ static int ptwr_emulated_update(
 
     /* Check the new PTE. */
     nl1e = l1e_from_intpte(val);
-    if ( unlikely(!get_page_from_l1e(nl1e, d, d)) )
+    switch ( get_page_from_l1e(nl1e, d, d) )
     {
+    case 0:
         if ( is_pv_32bit_domain(d) && (bytes == 4) && (unaligned_addr & 4) &&
             !do_cmpxchg && (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
         {
@@ -4939,6 +4958,10 @@ static int ptwr_emulated_update(
             MEM_LOG("ptwr_emulate: could not get_page_from_l1e()");
             return X86EMUL_UNHANDLEABLE;
         }
+        break;
+    case -1:
+        l1e_remove_flags(nl1e, _PAGE_RW);
+        break;
     }
 
     adjust_guest_l1e(nl1e, d);
--- 2010-09-20.orig/xen/arch/x86/mm/hap/p2m-ept.c  2010-09-20 14:29:04.000000000 +0200
+++ 2010-09-20/xen/arch/x86/mm/hap/p2m-ept.c  2010-09-20 10:00:07.000000000 +0200
@@ -72,9 +72,13 @@ static void ept_p2m_type_to_flags(ept_en
             entry->r = entry->w = entry->x = 0;
             return;
         case p2m_ram_rw:
-        case p2m_mmio_direct:
             entry->r = entry->w = entry->x = 1;
             return;
+        case p2m_mmio_direct:
+            entry->r = entry->x = 1;
+            entry->w = !rangeset_contains_singleton(mmio_ro_ranges,
+                                                    entry->mfn);
+            return;
         case p2m_ram_logdirty:
         case p2m_ram_ro:
         case p2m_ram_shared:
@@ -722,6 +726,9 @@ static void ept_change_entry_type_global
     if ( ept_get_asr(d) == 0 )
         return;
 
+    BUG_ON(p2m_is_grant(ot) || p2m_is_grant(nt));
+    BUG_ON(ot != nt && (ot == p2m_mmio_direct || nt == p2m_mmio_direct));
+
     ept_change_entry_type_page(_mfn(ept_get_asr(d)), ept_get_wl(d), ot, nt);
 
     ept_sync_domain(d);
--- 2010-09-20.orig/xen/arch/x86/mm/p2m.c  2010-09-20 14:29:04.000000000 +0200
+++ 2010-09-20/xen/arch/x86/mm/p2m.c  2010-09-20 10:00:07.000000000 +0200
@@ -72,7 +72,7 @@ boolean_param("hap_1gb", opt_hap_1gb);
 #define SUPERPAGE_PAGES (1UL << 9)
 #define superpage_aligned(_x) (((_x)&(SUPERPAGE_PAGES-1))==0)
 
-static unsigned long p2m_type_to_flags(p2m_type_t t)
+static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn)
 {
     unsigned long flags;
 #ifdef __x86_64__
@@ -101,7 +101,9 @@ static unsigned long p2m_type_to_flags(p
     case p2m_mmio_dm:
         return flags;
     case p2m_mmio_direct:
-        return flags | P2M_BASE_FLAGS | _PAGE_RW | _PAGE_PCD;
+        if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn_x(mfn)) )
+            flags |= _PAGE_RW;
+        return flags | P2M_BASE_FLAGS | _PAGE_PCD;
     case p2m_populate_on_demand:
         return flags;
     }
@@ -1299,8 +1301,10 @@ p2m_set_entry(struct p2m_domain *p2m, un
             domain_crash(p2m->domain);
             goto out;
         }
+        ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
         l3e_content = mfn_valid(mfn)
-            ? l3e_from_pfn(mfn_x(mfn), p2m_type_to_flags(p2mt) | _PAGE_PSE)
+            ? l3e_from_pfn(mfn_x(mfn),
+                           p2m_type_to_flags(p2mt, mfn) | _PAGE_PSE)
             : l3e_empty();
         entry_content.l1 = l3e_content.l3;
         paging_write_p2m_entry(p2m->domain, gfn, p2m_entry,
@@ -1334,7 +1338,8 @@ p2m_set_entry(struct p2m_domain *p2m, un
         ASSERT(p2m_entry);
 
         if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) )
-            entry_content = l1e_from_pfn(mfn_x(mfn), p2m_type_to_flags(p2mt));
+            entry_content = l1e_from_pfn(mfn_x(mfn),
+                                         p2m_type_to_flags(p2mt, mfn));
         else
             entry_content = l1e_empty();
 
@@ -1358,9 +1363,11 @@ p2m_set_entry(struct p2m_domain *p2m, un
             goto out;
         }
 
+        ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
         if ( mfn_valid(mfn) || p2m_is_magic(p2mt) )
             l2e_content = l2e_from_pfn(mfn_x(mfn),
-                                       p2m_type_to_flags(p2mt) | _PAGE_PSE);
+                                       p2m_type_to_flags(p2mt, mfn) |
+                                       _PAGE_PSE);
         else
             l2e_content = l2e_empty();
 
@@ -2437,6 +2444,7 @@ void p2m_change_type_global(struct p2m_d
 #endif /* CONFIG_PAGING_LEVELS == 4 */
 
     BUG_ON(p2m_is_grant(ot) || p2m_is_grant(nt));
+    BUG_ON(ot != nt && (ot == p2m_mmio_direct || nt == p2m_mmio_direct));
 
     if ( !paging_mode_translate(p2m->domain) )
         return;
@@ -2478,7 +2486,7 @@ void p2m_change_type_global(struct p2m_d
                     continue;
                 mfn = l3e_get_pfn(l3e[i3]);
                 gfn = get_gpfn_from_mfn(mfn);
-                flags = p2m_type_to_flags(nt);
+                flags = p2m_type_to_flags(nt, _mfn(mfn));
                 l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
                 paging_write_p2m_entry(p2m->domain, gfn,
                                        (l1_pgentry_t *)&l3e[i3],
@@ -2509,7 +2517,7 @@ void p2m_change_type_global(struct p2m_d
 #endif
                        )
                        * L2_PAGETABLE_ENTRIES) * L1_PAGETABLE_ENTRIES;
-                    flags = p2m_type_to_flags(nt);
+                    flags = p2m_type_to_flags(nt, _mfn(mfn));
                     l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
                     paging_write_p2m_entry(p2m->domain, gfn,
                                            (l1_pgentry_t *)&l2e[i2],
@@ -2533,7 +2541,7 @@ void p2m_change_type_global(struct p2m_d
                            )
                        * L2_PAGETABLE_ENTRIES) * L1_PAGETABLE_ENTRIES;
                     /* create a new 1le entry with the new type */
-                    flags = p2m_type_to_flags(nt);
+                    flags = p2m_type_to_flags(nt, _mfn(mfn));
                     l1e_content = l1e_from_pfn(mfn, flags);
                     paging_write_p2m_entry(p2m->domain, gfn, &l1e[i1],
                                            l1mfn, l1e_content, 1);
--- 2010-09-20.orig/xen/arch/x86/mm/shadow/multi.c  2010-09-20 14:29:04.000000000 +0200
+++ 2010-09-20/xen/arch/x86/mm/shadow/multi.c  2010-09-20 10:00:07.000000000 +0200
@@ -674,7 +674,9 @@ _sh_propagate(struct vcpu *v,
     }
 
     /* Read-only memory */
-    if ( p2m_is_readonly(p2mt) )
+    if ( p2m_is_readonly(p2mt) ||
+         (p2mt == p2m_mmio_direct &&
+          rangeset_contains_singleton(mmio_ro_ranges, mfn_x(target_mfn))) )
         sflags &= ~_PAGE_RW;
 
     // protect guest page tables
@@ -1225,15 +1227,19 @@ static int shadow_set_l1e(struct vcpu *v
         /* About to install a new reference */
         if ( shadow_mode_refcounts(d) ) {
             TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SHADOW_L1_GET_REF);
-            if ( shadow_get_page_from_l1e(new_sl1e, d, new_type) == 0 )
+            switch ( shadow_get_page_from_l1e(new_sl1e, d, new_type) )
             {
+            case 0:
                 /* Doesn't look like a pagetable. */
                 flags |= SHADOW_SET_ERROR;
                 new_sl1e = shadow_l1e_empty();
-            }
-            else
-            {
+                break;
+            case -1:
+                shadow_l1e_remove_flags(new_sl1e, _PAGE_RW);
+                /* fall through */
+            default:
                 shadow_vram_get_l1e(new_sl1e, sl1e, sl1mfn, d);
+                break;
             }
         }
     }
--- 2010-09-20.orig/xen/arch/x86/msi.c  2010-09-20 14:29:04.000000000 +0200
+++ 2010-09-20/xen/arch/x86/msi.c  2010-09-20 14:29:34.000000000 +0200
@@ -16,12 +16,14 @@
 #include <xen/errno.h>
 #include <xen/pci.h>
 #include <xen/pci_regs.h>
+#include <xen/iocap.h>
 #include <xen/keyhandler.h>
 #include <asm/io.h>
 #include <asm/smp.h>
 #include <asm/desc.h>
 #include <asm/msi.h>
 #include <asm/fixmap.h>
+#include <asm/p2m.h>
 #include <mach_apic.h>
 #include <io_ports.h>
 #include <public/physdev.h>
@@ -520,6 +522,43 @@ static int msi_capability_init(struct pc
     return 0;
 }
 
+static u64 read_pci_mem_bar(u8 bus, u8 slot, u8 func, u8 bir)
+{
+    u8 limit;
+    u32 addr;
+
+    switch ( pci_conf_read8(bus, slot, func, PCI_HEADER_TYPE) )
+    {
+    case PCI_HEADER_TYPE_NORMAL:
+        limit = 6;
+        break;
+    case PCI_HEADER_TYPE_BRIDGE:
+        limit = 2;
+        break;
+    case PCI_HEADER_TYPE_CARDBUS:
+        limit = 1;
+        break;
+    default:
+        return 0;
+    }
+
+    if ( bir >= limit )
+        return 0;
+    addr = pci_conf_read32(bus, slot, func, PCI_BASE_ADDRESS_0 + bir * 4);
+    if ( (addr & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_IO )
+        return 0;
+    if ( (addr & PCI_BASE_ADDRESS_MEM_TYPE_MASK) == PCI_BASE_ADDRESS_MEM_TYPE_64 )
+    {
+        addr &= ~PCI_BASE_ADDRESS_MEM_MASK;
+        if ( ++bir >= limit )
+            return 0;
+        return addr |
+               ((u64)pci_conf_read32(bus, slot, func,
+                                     PCI_BASE_ADDRESS_0 + bir * 4) << 32);
+    }
+    return addr & ~PCI_BASE_ADDRESS_MEM_MASK;
+}
+
 /**
  * msix_capability_init - configure device's MSI-X capability
  * @dev: pointer to the pci_dev data structure of MSI-X device function
@@ -532,7 +571,8 @@ static int msi_capability_init(struct pc
  **/
 static int msix_capability_init(struct pci_dev *dev,
                                 struct msi_info *msi,
-                                struct msi_desc **desc)
+                                struct msi_desc **desc,
+                                unsigned int nr_entries)
 {
     struct msi_desc *entry;
     int pos;
@@ -587,6 +627,66 @@ static int msix_capability_init(struct p
 
     list_add_tail(&entry->list, &dev->msi_list);
 
+    if ( !dev->msix_nr_entries )
+    {
+        u64 pba_paddr;
+        u32 pba_offset;
+
+        ASSERT(!dev->msix_used_entries);
+        WARN_ON(msi->table_base != read_pci_mem_bar(bus, slot, func, bir));
+
+        dev->msix_nr_entries = nr_entries;
+        dev->msix_table.first = PFN_DOWN(table_paddr);
+        dev->msix_table.last = PFN_DOWN(table_paddr +
+                                        nr_entries * PCI_MSIX_ENTRY_SIZE - 1);
+        WARN_ON(rangeset_overlaps_range(mmio_ro_ranges, dev->msix_table.first,
+                                        dev->msix_table.last));
+
+        pba_offset = pci_conf_read32(bus, slot, func,
+                                     msix_pba_offset_reg(pos));
+        bir = (u8)(pba_offset & PCI_MSIX_BIRMASK);
+        pba_paddr = read_pci_mem_bar(bus, slot, func, bir);
+        WARN_ON(!pba_paddr);
+        pba_paddr += pba_offset & ~PCI_MSIX_BIRMASK;
+
+        dev->msix_pba.first = PFN_DOWN(pba_paddr);
+        dev->msix_pba.last = PFN_DOWN(pba_paddr +
+                                      BITS_TO_LONGS(nr_entries) - 1);
+        WARN_ON(rangeset_overlaps_range(mmio_ro_ranges, dev->msix_pba.first,
+                                        dev->msix_pba.last));
+
+        if ( rangeset_add_range(mmio_ro_ranges, dev->msix_table.first,
+                                dev->msix_table.last) )
+            WARN();
+        if ( rangeset_add_range(mmio_ro_ranges, dev->msix_pba.first,
+                                dev->msix_pba.last) )
+            WARN();
+
+        if ( dev->domain )
+            p2m_change_entry_type_global(p2m_get_hostp2m(dev->domain),
+                                         p2m_mmio_direct, p2m_mmio_direct);
+        if ( !dev->domain || !paging_mode_translate(dev->domain) )
+        {
+            struct domain *d = dev->domain;
+
+            if ( !d )
+                for_each_domain(d)
+                    if ( !paging_mode_translate(d) &&
+                         (iomem_access_permitted(d, dev->msix_table.first,
+                                                 dev->msix_table.last) ||
+                          iomem_access_permitted(d, dev->msix_pba.first,
+                                                 dev->msix_pba.last)) )
+                        break;
+            if ( d )
+            {
+                /* XXX How to deal with existing mappings? */
+            }
+        }
+    }
+    WARN_ON(dev->msix_nr_entries != nr_entries);
+    WARN_ON(dev->msix_table.first != (table_paddr >> PAGE_SHIFT));
+    ++dev->msix_used_entries;
+
     /* Mask interrupt here */
     writel(1, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
 
@@ -717,7 +817,7 @@ static int __pci_enable_msix(struct msi_
 
     }
 
-    status = msix_capability_init(pdev, msi, desc);
+    status = msix_capability_init(pdev, msi, desc, nr_entries);
     return status;
 }
 
@@ -742,6 +842,16 @@ static void __pci_disable_msix(struct ms
     writel(1, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
 
     pci_conf_write16(bus, slot, func, msix_control_reg(pos), control);
+
+    if ( !--dev->msix_used_entries )
+    {
+        if ( rangeset_remove_range(mmio_ro_ranges, dev->msix_table.first,
+                                   dev->msix_table.last) )
+            WARN();
+        if ( rangeset_remove_range(mmio_ro_ranges, dev->msix_pba.first,
+                                   dev->msix_pba.last) )
+            WARN();
+    }
 }
 
 /*
--- 2010-09-20.orig/xen/drivers/passthrough/io.c  2010-09-20 14:29:04.000000000 +0200
+++ 2010-09-20/xen/drivers/passthrough/io.c  2010-09-20 10:00:07.000000000 +0200
@@ -26,6 +26,8 @@
 #include <xen/hvm/irq.h>
 #include <xen/tasklet.h>
 
+struct rangeset *__read_mostly mmio_ro_ranges;
+
 static void hvm_dirq_assist(unsigned long _d);
 
 bool_t pt_irq_need_timer(uint32_t flags)
@@ -565,3 +567,11 @@ void hvm_dpci_eoi(struct domain *d, unsi
 unlock:
     spin_unlock(&d->event_lock);
 }
+
+static int __init setup_mmio_ro_ranges(void)
+{
+    mmio_ro_ranges = rangeset_new(NULL, "r/o mmio ranges",
+                                  RANGESETF_prettyprint_hex);
+    return 0;
+}
+__initcall(setup_mmio_ro_ranges);
--- 2010-09-20.orig/xen/include/xen/iommu.h  2010-09-20 14:29:04.000000000 +0200
+++ 2010-09-20/xen/include/xen/iommu.h  2010-09-20 10:00:07.000000000 +0200
@@ -31,6 +31,8 @@ extern bool_t force_iommu, iommu_verbose
 extern bool_t iommu_workaround_bios_bug, iommu_passthrough;
 extern bool_t iommu_snoop, iommu_qinval, iommu_intremap;
 
+extern struct rangeset *mmio_ro_ranges;
+
 #define domain_hvm_iommu(d)     (&d->arch.hvm_domain.hvm_iommu)
 
 #define MAX_IOMMUS 32
--- 2010-09-20.orig/xen/include/xen/pci.h  2010-09-20 14:29:04.000000000 +0200
+++ 2010-09-20/xen/include/xen/pci.h  2010-09-20 10:00:07.000000000 +0200
@@ -45,6 +45,10 @@ struct pci_dev {
     struct list_head domain_list;
 
     struct list_head msi_list;
+    unsigned int msix_nr_entries, msix_used_entries;
+    struct {
+        unsigned long first, last;
+    } msix_table, msix_pba;
    int msix_table_refcnt[MAX_MSIX_TABLE_PAGES];
    int msix_table_idx[MAX_MSIX_TABLE_PAGES];
    spinlock_t msix_table_lock;
Konrad Rzeszutek Wilk
2010-Sep-21 15:17 UTC
Re: [Xen-devel] [PATCH] x86: protect MSI-X table and pending bit array from guest writes
On Mon, Sep 20, 2010 at 02:23:51PM +0100, Jan Beulich wrote:
> These structures are used by Xen, and hence guests must not be able
> to fiddle with them.
>
> qemu-dm currently plays with the MSI-X table, requiring Dom0 to
> still have write access. This is broken (explicitly allowing the guest
> write access to the mask bit) and should be fixed in qemu-dm, at which
> time Dom0 won't need any special casing anymore.
>
> The changes are made under the assumption that p2m_mmio_direct will
> only ever be used for order 0 pages.
>
> An open question is whether dealing with pv guests (including the
> IOMMU-less case) is necessary, as handling mappings a domain may
> already have in place at the time the first interrupt gets set up
> would require scanning all of the guest's L1 page table pages.

When the PCI passthrough is utilized for PV guests we utilize the
xc_domain_iomem_permission, xc_domain_ioport_permission, and
xc_physdev_map_pirq before we even start the guest.

With your patch, will the MFN regions that are specified by the
iomem_permission still be visible to the PV domain?

I think the answer is yes, and I think the MSI-X regions are not of
any importance to the PV guests as Dom0 is the one setting up the
MSI-X entries and passing on the vector value to the PV guest. But I
just want to be sure about this.

> Currently a hole still remains allowing PV guests to map these ranges
> before actually setting up any MSI-X vector for a device.
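For reference, the toolstack-side flow described above looks roughly like the
minimal sketch below. It is only illustrative: the exact libxc handle type
differs between releases, and bar_mfn, nr_mfns and machine_irq are placeholder
values that a real tool would obtain from PCI config space or sysfs.

```c
#include <xenctrl.h>

/* Sketch only: grant a PV guest access to a passed-through device's MMIO
 * BAR and route its interrupt before the guest is started.  bar_mfn,
 * nr_mfns and machine_irq are hypothetical inputs. */
static int setup_pv_passthrough(xc_interface *xch, uint32_t domid,
                                unsigned long bar_mfn, unsigned long nr_mfns,
                                int machine_irq)
{
    int pirq = machine_irq;
    int rc;

    /* Permit the domain to map the device's MMIO frames. */
    rc = xc_domain_iomem_permission(xch, domid, bar_mfn, nr_mfns, 1);
    if ( rc )
        return rc;

    /* Bind the device's machine IRQ to a pirq the guest can use. */
    return xc_physdev_map_pirq(xch, domid, machine_irq, &pirq);
}
```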
Jan Beulich
2010-Sep-21 16:07 UTC
Re: [Xen-devel] [PATCH] x86: protect MSI-X table and pending bit array from guest writes
>>> On 21.09.10 at 17:17, Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:
> On Mon, Sep 20, 2010 at 02:23:51PM +0100, Jan Beulich wrote:
>> These structures are used by Xen, and hence guests must not be able
>> to fiddle with them.
>>
>> qemu-dm currently plays with the MSI-X table, requiring Dom0 to
>> still have write access. This is broken (explicitly allowing the guest
>> write access to the mask bit) and should be fixed in qemu-dm, at which
>> time Dom0 won't need any special casing anymore.
>>
>> The changes are made under the assumption that p2m_mmio_direct will
>> only ever be used for order 0 pages.
>>
>> An open question is whether dealing with pv guests (including the
>> IOMMU-less case) is necessary, as handling mappings a domain may
>> already have in place at the time the first interrupt gets set up
>> would require scanning all of the guest's L1 page table pages.
>
> When the PCI passthrough is utilized for PV guests we utilize the
> xc_domain_iomem_permission, xc_domain_ioport_permission, and
> xc_physdev_map_pirq before we even start the guest.
> With your patch, will the MFN regions that are specified by the
> iomem_permission still be visible to the PV domain?

Yes, just that the page(s) containing MSI-X table and PBA won't be
writeable anymore (if the guest tries to map them so, they'll get
mapped read-only).

And yes, the MSI-X table should be ignored by pv guests altogether,
and the PBA (afaict) isn't being used by Linux up to now.

Jan
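As an aside on the PBA point above: polling a pending bit only needs reads,
so a read-only mapping remains sufficient should a guest driver ever want to
look at it. A rough sketch using Linux-style MMIO accessors follows; pba and
entry_nr are hypothetical names for a mapping of the pending bit array and
the MSI-X entry of interest, not taken from any existing driver.

```c
/* Sketch: the PBA is a packed little-endian bit array (the PCI spec lays it
 * out as QWORDs); entry n's pending bit ends up as bit (n % 32) of the
 * (n / 32)-th 32-bit word, so a plain 32-bit read suffices -- no write
 * access to the page is ever required. */
static int msix_entry_pending(const void __iomem *pba, unsigned int entry_nr)
{
    u32 word = readl(pba + (entry_nr / 32) * 4);

    return (word >> (entry_nr % 32)) & 1;
}
```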