Userspace patches for the pci-passthrough functionality. The major updates since the last post are: - Loop to add passthrough devices in pc_init1 - Handle errors in read/write calls - Allow invocation without irq number for in-kernel irqchip Other than this, several small things were fixed according to review comments received last time.
Amit Shah
2008-Jun-27 12:58 UTC
[PATCH 1/2] KVM/userspace: Support for assigning PCI devices to guest
From: Or Sagi <ors at tutis.com>
From: Nir Peleg <nir at tutis.com>
From: Amit Shah <amit.shah at qumranet.com>
From: Glauber de Oliveira Costa <gcosta at redhat.com>
We can assign a device from the host machine to a guest. The
original code comes from Neocleus.
A new command-line option, -pcidevice is added.
For example, to invoke it for an Ethernet device sitting at
PCI bus:dev.fn 04:08.0 with host IRQ 18, use this:
-pcidevice Ethernet/04:08.0
The host ethernet driver is to be removed before doing the
passthrough. If not, the device assignment fails but the
guest continues without the assignment.
If kvm uses the in-kernel irqchip, interrupts are routed to the
guest via the kvm module (the accompanying kernel changes are necessary).
If -no-kvm-irqchip is used, the 'irqhook' module, available
separately, is to be used for interrupt injection into the guest. In
this case, an extra parameter, -<intr-number> is to be appended to
the above-mentioned pcidevice parameter.
Signed-off-by: Amit Shah <amit.shah at qumranet.com>
---
libkvm/libkvm-x86.c | 9 +-
libkvm/libkvm.h | 16 ++
qemu/Makefile.target | 1 +
qemu/hw/isa.h | 2 +
qemu/hw/pc.c | 9 +
qemu/hw/pci-passthrough.c | 594 +++++++++++++++++++++++++++++++++++++++++++++
qemu/hw/pci-passthrough.h | 93 +++++++
qemu/hw/pci.c | 12 +
qemu/hw/pci.h | 1 +
qemu/hw/piix_pci.c | 19 ++
qemu/vl.c | 17 ++
11 files changed, 772 insertions(+), 1 deletions(-)
create mode 100644 qemu/hw/pci-passthrough.c
create mode 100644 qemu/hw/pci-passthrough.h
diff --git a/libkvm/libkvm-x86.c b/libkvm/libkvm-x86.c
index ea97bdd..0c4cdbe 100644
--- a/libkvm/libkvm-x86.c
+++ b/libkvm/libkvm-x86.c
@@ -126,6 +126,14 @@ static int kvm_init_tss(kvm_context_t kvm)
return 0;
}
+#ifdef KVM_CAP_PCI_PASSTHROUGH
+int kvm_update_pci_pt_device(kvm_context_t kvm,
+ struct kvm_pci_passthrough_dev *pci_pt_dev)
+{ /* Thin wrapper: hands pci_pt_dev to the kernel through the VM file descriptor. */
+ return ioctl(kvm->vm_fd, KVM_UPDATE_PCI_PT_DEV, pci_pt_dev); /* ioctl() result is returned unchanged */
+}
+#endif /* KVM_CAP_PCI_PASSTHROUGH */
+
int kvm_arch_create_default_phys_mem(kvm_context_t kvm,
unsigned long phys_mem_bytes,
void **vm_mem)
@@ -435,7 +443,6 @@ void kvm_show_code(kvm_context_t kvm, int vcpu)
fprintf(stderr, "code:%s\n", code_str);
}
-
/*
* Returns available msr list. User must free.
*/
diff --git a/libkvm/libkvm.h b/libkvm/libkvm.h
index ad6e26a..ccb086f 100644
--- a/libkvm/libkvm.h
+++ b/libkvm/libkvm.h
@@ -12,6 +12,7 @@
#endif
#include <linux/kvm.h>
+#include <linux/kvm_para.h>
#include <signal.h>
@@ -639,4 +640,19 @@ int kvm_enable_vapic(kvm_context_t kvm, int vcpu, uint64_t vapic);
#endif
+#ifdef KVM_CAP_PCI_PASSTHROUGH
+/*!
+ * \brief Notifies host kernel about changes to a PCI device assigned to guest
+ *
+ * Used for PCI device assignment, this function notifies the host
+ * kernel about the assigning of the physical PCI device and the guest
+ * PCI parameters or updates to the PCI config space from the guest
+ * (mainly the device irq)
+ *
+ * \param kvm Pointer to the current kvm_context
+ * \param pci_pt_dev Parameters like irq, PCI bus, devfn number, etc
+ */
+int kvm_update_pci_pt_device(kvm_context_t kvm,
+ struct kvm_pci_passthrough_dev *pci_pt_dev);
+#endif
#endif
diff --git a/qemu/Makefile.target b/qemu/Makefile.target
index 77b2301..432011f 100644
--- a/qemu/Makefile.target
+++ b/qemu/Makefile.target
@@ -602,6 +602,7 @@ OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o
OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o
OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o
OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o extboot.o
+OBJS+= pci-passthrough.o
ifeq ($(USE_KVM_PIT), 1)
OBJS+= i8254-kvm.o
endif
diff --git a/qemu/hw/isa.h b/qemu/hw/isa.h
index 89b3004..c720f5e 100644
--- a/qemu/hw/isa.h
+++ b/qemu/hw/isa.h
@@ -1,5 +1,7 @@
/* ISA bus */
+#include "hw.h"
+
extern target_phys_addr_t isa_mem_base;
int register_ioport_read(int start, int length, int size,
diff --git a/qemu/hw/pc.c b/qemu/hw/pc.c
index 6334c76..0b0606a 100644
--- a/qemu/hw/pc.c
+++ b/qemu/hw/pc.c
@@ -32,6 +32,7 @@
#include "smbus.h"
#include "boards.h"
#include "console.h"
+#include "pci-passthrough.h"
#include "qemu-kvm.h"
@@ -995,6 +996,14 @@ static void pc_init1(ram_addr_t ram_size, int vga_ram_size,
}
}
+ /* Initialize pass-through */
+ if (pci_enabled) {
+ int r = -1;
+ do {
+ pt_init_device(pci_bus, &r);
+ } while (r >= 0);
+ }
+
rtc_state = rtc_init(0x70, i8259[8]);
qemu_register_boot_set(pc_boot_set, rtc_state);
diff --git a/qemu/hw/pci-passthrough.c b/qemu/hw/pci-passthrough.c
new file mode 100644
index 0000000..250d7ef
--- /dev/null
+++ b/qemu/hw/pci-passthrough.c
@@ -0,0 +1,594 @@
+/*
+ * Copyright (c) 2007, Neocleus Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ *
+ * Pass a PCI device from the host to a guest VM.
+ *
+ * Adapted for KVM by Qumranet.
+ *
+ * Copyright (c) 2007, Neocleus, Alex Novik (alex at neocleus.com)
+ * Copyright (c) 2007, Neocleus, Guy Zana (guy at neocleus.com)
+ * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah at qumranet.com)
+ */
+#include <stdio.h>
+#include <pthread.h>
+#include <sys/io.h>
+#include <sys/ioctl.h>
+#include <linux/types.h>
+
+/* From linux/ioport.h */
+#define IORESOURCE_IO 0x00000100 /* Resource type */
+#define IORESOURCE_MEM 0x00000200
+#define IORESOURCE_IRQ 0x00000400
+#define IORESOURCE_DMA 0x00000800
+#define IORESOURCE_PREFETCH 0x00001000 /* No side effects */
+
+#include "pci-passthrough.h"
+#include "irq.h"
+
+#include "qemu-kvm.h"
+#include <linux/kvm_para.h>
+extern FILE *logfile;
+
+//#define PT_DEBUG
+
+#ifdef PT_DEBUG
+#define DEBUG(fmt, args...) fprintf(stderr, "%s: " fmt, __func__ , ## args)
+#else
+#define DEBUG(fmt, args...)
+#endif
+
+/*
+ * Generate pt_mmio_write{b,w,l}(): store a guest MMIO write into the
+ * mmap()ed host region.  'opaque' is the pt_region_t of the region; the
+ * target address is the region's host mapping plus the offset of e_phys
+ * from the region's guest base.  The access is logged first when
+ * PT_DEBUG_MMIO is set in the region's debug mask.
+ */
+#define pt_mmio_write(suffix, type) \
+static void pt_mmio_write##suffix(void *opaque, target_phys_addr_t e_phys, \
+				  uint32_t value) \
+{ \
+	pt_region_t *region = (pt_region_t *)opaque; \
+	target_phys_addr_t offset = e_phys - region->e_physbase; \
+	type *dst = (type *)((uint8_t *)region->r_virtbase + offset); \
+	if (region->debug & PT_DEBUG_MMIO) \
+		fprintf(logfile, "pt_mmio_write" #suffix \
+			": e_physbase=%p e_phys=%p r_virt=%p value=%08x\n", \
+			(void *)region->e_physbase, (void *)e_phys, \
+			(void *)dst, value); \
+	*dst = (type)value; \
+}
+
+pt_mmio_write(b, uint8_t)
+pt_mmio_write(w, uint16_t)
+pt_mmio_write(l, uint32_t)
+
+/*
+ * Generate pt_mmio_read{b,w,l}(): satisfy a guest MMIO read from the
+ * mmap()ed host region.  'opaque' is the pt_region_t of the region; the
+ * source address is the region's host mapping plus the offset of e_phys
+ * from the region's guest base.  The value is widened to 32 bits and is
+ * logged when PT_DEBUG_MMIO is set in the region's debug mask.
+ */
+#define pt_mmio_read(suffix, type) \
+static uint32_t pt_mmio_read##suffix(void *opaque, target_phys_addr_t e_phys) \
+{ \
+	pt_region_t *region = (pt_region_t *)opaque; \
+	target_phys_addr_t offset = e_phys - region->e_physbase; \
+	type *src = (type *)((uint8_t *)region->r_virtbase + offset); \
+	uint32_t value = (uint32_t)*src; \
+	if (region->debug & PT_DEBUG_MMIO) \
+		fprintf(logfile, \
+			"pt_mmio_read" #suffix ": e_physbase=%p " \
+			"e_phys=%p r_virt=%p value=%08x\n", \
+			(void *)region->e_physbase, \
+			(void *)e_phys, (void *)src, value); \
+	return value; \
+}
+
+pt_mmio_read(b, uint8_t)
+pt_mmio_read(w, uint16_t)
+pt_mmio_read(l, uint32_t)
+
+/* MMIO read handlers; slots 0, 1, 2 hold the b, w and l variants. */
+CPUReadMemoryFunc *pt_mmio_read_cb[3] = {
+	[0] = pt_mmio_readb,
+	[1] = pt_mmio_readw,
+	[2] = pt_mmio_readl,
+};
+
+/* MMIO write handlers, laid out like pt_mmio_read_cb. */
+CPUWriteMemoryFunc *pt_mmio_write_cb[3] = {
+	[0] = pt_mmio_writeb,
+	[1] = pt_mmio_writew,
+	[2] = pt_mmio_writel,
+};
+
+/*
+ * Generate pt_ioport_write{b,w,l}(): forward a guest port write to the
+ * host.  'opaque' is the pt_region_t of the region; for port I/O regions
+ * r_virtbase holds the host port base (not a pointer), so the host port
+ * is that base plus the offset of addr from the region's guest base.
+ * The access is logged when PT_DEBUG_PIO is set in the region's debug
+ * mask.
+ *
+ * Every line of the macro must end in a backslash; the signature line
+ * had been split without one, which terminated the definition early.
+ */
+#define pt_ioport_write(suffix) \
+static void pt_ioport_write##suffix(void *opaque, uint32_t addr, \
+				    uint32_t value) \
+{ \
+	pt_region_t *r_access = (pt_region_t *)opaque; \
+	uint32_t r_pio = (unsigned long)r_access->r_virtbase \
+		+ (addr - r_access->e_physbase); \
+	if (r_access->debug & PT_DEBUG_PIO) { \
+		fprintf(logfile, "pt_ioport_write" #suffix \
+			": r_pio=%08x e_physbase=%08x" \
+			" r_virtbase=%08lx value=%08x\n", \
+			r_pio, (int)r_access->e_physbase, \
+			(unsigned long)r_access->r_virtbase, value); \
+	} \
+	out##suffix(value, r_pio); \
+}
+
+pt_ioport_write(b)
+pt_ioport_write(w)
+pt_ioport_write(l)
+
+/*
+ * Generate pt_ioport_read{b,w,l}(): satisfy a guest port read from the
+ * host.  'opaque' is the pt_region_t of the region; the host port is the
+ * offset of addr from the region's guest base added to the host port
+ * base kept in r_virtbase.  The value is logged when PT_DEBUG_PIO is set
+ * in the region's debug mask.
+ */
+#define pt_ioport_read(suffix) \
+static uint32_t pt_ioport_read##suffix(void *opaque, uint32_t addr) \
+{ \
+	pt_region_t *region = (pt_region_t *)opaque; \
+	uint32_t host_port = (addr - region->e_physbase) \
+		+ (unsigned long)region->r_virtbase; \
+	uint32_t value = in##suffix(host_port); \
+	if (region->debug & PT_DEBUG_PIO) \
+		fprintf(logfile, "pt_ioport_read" #suffix \
+			": r_pio=%08x e_physbase=%08x r_virtbase=%08lx "\
+			"value=%08x\n", \
+			host_port, (int)region->e_physbase, \
+			(unsigned long)region->r_virtbase, value); \
+	return value; \
+}
+
+pt_ioport_read(b)
+pt_ioport_read(w)
+pt_ioport_read(l)
+
+/*
+ * Map callback for a memory BAR: record the guest-physical base the BAR
+ * was placed at and bind that range to the region's I/O memory handlers
+ * (memory_index).  The length comes from the emulated device's
+ * io_regions[] entry; e_size and type are only used for the debug trace.
+ */
+static void pt_iomem_map(PCIDevice *d, int region_num,
+			 uint32_t e_phys, uint32_t e_size, int type)
+{
+	pt_dev_t *ptdev = (pt_dev_t *) d;
+	pt_region_t *region = &ptdev->v_addrs[region_num];
+
+	region->e_physbase = e_phys;
+
+	DEBUG("e_phys=%08x r_virt=%p type=%d len=%08x region_num=%d \n",
+	      e_phys, region->r_virtbase, type, e_size, region_num);
+
+	cpu_register_physical_memory(e_phys,
+				     ptdev->dev.io_regions[region_num].size,
+				     region->memory_index);
+}
+
+/*
+ * Map callback for a port I/O BAR: record the guest port base the BAR
+ * was placed at and install the byte/word/long passthrough handlers for
+ * [addr, addr + size).  Each handler receives the region's pt_region_t
+ * as its opaque pointer.
+ */
+static void pt_ioport_map(PCIDevice *pci_dev, int region_num,
+			  uint32_t addr, uint32_t size, int type)
+{
+	pt_dev_t *r_dev = (pt_dev_t *) pci_dev;
+	int i;
+	uint32_t ((*rf[])(void *, uint32_t)) = { pt_ioport_readb,
+						  pt_ioport_readw,
+						  pt_ioport_readl
+						};
+	void ((*wf[])(void *, uint32_t, uint32_t)) = { pt_ioport_writeb,
+						       pt_ioport_writew,
+						       pt_ioport_writel
+						     };
+
+	r_dev->v_addrs[region_num].e_physbase = addr;
+	/* DEBUG() already prefixes the function name; the separating space
+	 * before "region_num" was missing from the concatenated literal. */
+	DEBUG("address=0x%x type=0x%x len=%d region_num=%d\n",
+	      addr, type, size, region_num);
+
+	/* Access widths 1, 2 and 4 bytes map to handler slots 0, 1 and 2. */
+	for (i = 0; i < 3; i++) {
+		register_ioport_write(addr, size, 1<<i, wf[i],
+				      (void *) (r_dev->v_addrs + region_num));
+		register_ioport_read(addr, size, 1<<i, rf[i],
+				     (void *) (r_dev->v_addrs + region_num));
+	}
+}
+
+/*
+ * Config-space write handler for a passed-through device.
+ *
+ * - Writes at offset 0x4 update the emulated config space and are then
+ *   also forwarded to the host device.
+ * - Writes to 0x10..0x24, 0x34, 0x3c and 0x3d are handled purely by the
+ *   emulated config space (used for BAR emulation) and never reach the
+ *   host device.
+ * - Everything else is written to the host device's sysfs config file.
+ *
+ * Failures are reported on stderr; the handler has no way to return them.
+ */
+static void pt_pci_write_config(PCIDevice *d, uint32_t address, uint32_t val,
+				int len)
+{
+	int fd, r;
+
+	DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
+	      ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7), (uint16_t) address,
+	      val, len);
+
+	if (address == 0x4)
+		pci_default_write_config(d, address, val, len);
+
+	if ((address >= 0x10 && address <= 0x24) || address == 0x34 ||
+	    address == 0x3c || address == 0x3d) {
+		/* used for update-mappings (BAR emulation) */
+		pci_default_write_config(d, address, val, len);
+		return;
+	}
+
+	DEBUG("NON BAR (%x.%x): address=%04x val=0x%08x len=%d\n",
+	      ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7), (uint16_t) address,
+	      val, len);
+	fd = ((pt_dev_t *)d)->real_device.config_fd;
+	/* Do not write at a stale offset if the seek itself failed. */
+	if (lseek(fd, address, SEEK_SET) == (off_t) -1) {
+		fprintf(stderr, "%s: lseek failed, errno = %d\n", __func__,
+			errno);
+		return;
+	}
+again:
+	r = write(fd, &val, len);
+	if (r < 0) {
+		if (errno == EINTR || errno == EAGAIN)
+			goto again;
+		fprintf(stderr, "%s: write failed, errno = %d\n", __func__,
+			errno);
+	} else if (r != len) {
+		fprintf(stderr, "%s: short write (%d of %d bytes)\n",
+			__func__, r, len);
+	}
+}
+
+/*
+ * Config-space read handler for a passed-through device.
+ *
+ * - Reads of 0x10..0x24, 0x34, 0x3c and 0x3d come from the emulated
+ *   config space.
+ * - Offset 0xFC is never read from the host and yields 0.
+ * - Everything else is read from the host device's sysfs config file;
+ *   on failure the error is reported and whatever is in val (0 unless
+ *   partially filled) is used.
+ *
+ * Before returning, bit 0x10 of the word at offset 6 is cleared (bit
+ * 0x100000 for a 4-byte read at offset 4), hiding the device's special
+ * capabilities from the guest.
+ */
+static uint32_t pt_pci_read_config(PCIDevice *d, uint32_t address, int len)
+{
+	uint32_t val = 0;
+	int fd, r;
+
+	if ((address >= 0x10 && address <= 0x24) || address == 0x34 ||
+	    address == 0x3c || address == 0x3d) {
+		val = pci_default_read_config(d, address, len);
+		DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
+		      (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val,
+		      len);
+		return val;
+	}
+
+	/* vga specific, remove later */
+	if (address == 0xFC)
+		goto do_log;
+
+	fd = ((pt_dev_t *)d)->real_device.config_fd;
+	/* Do not read from a stale offset if the seek itself failed. */
+	if (lseek(fd, address, SEEK_SET) == (off_t) -1) {
+		fprintf(stderr, "%s: lseek failed, errno = %d\n", __func__,
+			errno);
+		goto do_log;
+	}
+again:
+	r = read(fd, &val, len);
+	if (r < 0) {
+		if (errno == EINTR || errno == EAGAIN)
+			goto again;
+		fprintf(stderr, "%s: read failed, errno = %d\n", __func__,
+			errno);
+	}
+
+do_log:
+	DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
+	      (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len);
+
+	/* kill the special capabilities */
+	if (address == 4 && len == 4)
+		val &= ~0x100000;
+	else if (address == 6)
+		val &= ~0x10;
+
+	return val;
+}
+
+/*
+ * Register every valid host region of the device as a BAR of the
+ * emulated device.
+ *
+ * Memory regions are mmap()ed from their sysfs resource file and get
+ * MMIO handlers; all other valid regions are treated as port I/O, with
+ * the host port base stored in r_virtbase.
+ *
+ * \param io_regions   regions discovered by pt_get_real_device()
+ * \param regions_num  number of entries in io_regions
+ * \param pci_dev      passthrough device whose v_addrs[] is filled in
+ * \return 0 on success, -1 if a memory region could not be mapped
+ */
+static int pt_register_regions(pci_region_t *io_regions,
+			       unsigned long regions_num, pt_dev_t *pci_dev)
+{
+	uint32_t i;
+	pci_region_t *cur_region = io_regions;
+
+	/* v_addrs[] has PCI_NUM_REGIONS slots; never index past it. */
+	for (i = 0; i < regions_num && i < PCI_NUM_REGIONS;
+	     i++, cur_region++) {
+		if (!cur_region->valid)
+			continue;
+#ifdef PT_DEBUG
+		pci_dev->v_addrs[i].debug |= PT_DEBUG_MMIO | PT_DEBUG_PIO;
+#endif
+		pci_dev->v_addrs[i].num = i;
+
+		/* handle memory io regions */
+		if (cur_region->type & IORESOURCE_MEM) {
+			int t = cur_region->type & IORESOURCE_PREFETCH
+				? PCI_ADDRESS_SPACE_MEM_PREFETCH
+				: PCI_ADDRESS_SPACE_MEM;
+
+			/* map physical memory, length rounded up to 4 KiB */
+			pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
+			pci_dev->v_addrs[i].r_virtbase =
+				mmap(NULL,
+				     (cur_region->size + 0xFFF) & 0xFFFFF000,
+				     PROT_WRITE | PROT_READ, MAP_SHARED,
+				     cur_region->resource_fd, (off_t) 0);
+
+			if (MAP_FAILED == pci_dev->v_addrs[i].r_virtbase) {
+				fprintf(stderr, "Error: Couldn't mmap 0x%x!\n",
+					(uint32_t) (cur_region->base_addr));
+				return -1;
+			}
+
+			/* add the region's offset within its page */
+			pci_dev->v_addrs[i].r_virtbase =
+				(uint8_t *) pci_dev->v_addrs[i].r_virtbase +
+				(cur_region->base_addr & 0xFFF);
+
+			pci_register_io_region((PCIDevice *) pci_dev, i,
+					       cur_region->size, t,
+					       pt_iomem_map);
+
+			pci_dev->v_addrs[i].memory_index =
+				cpu_register_io_memory(0, pt_mmio_read_cb,
+						       pt_mmio_write_cb,
+						       (void *) &(pci_dev->v_addrs[i]));
+
+			continue;
+		}
+		/* handle port io regions */
+
+		pci_register_io_region((PCIDevice *) pci_dev, i,
+				       cur_region->size, PCI_ADDRESS_SPACE_IO,
+				       pt_ioport_map);
+
+		pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
+		pci_dev->v_addrs[i].r_virtbase =
+			(void *)(long)cur_region->base_addr;
+		/* not relevant for port io */
+		pci_dev->v_addrs[i].memory_index = 0;
+	}
+
+	/* success */
+	return 0;
+
+}
+
+/*
+ * Open the host device's sysfs entry and describe it in
+ * pci_dev->real_device.
+ *
+ * The sysfs "config" file is opened read/write (the descriptor is kept
+ * in config_fd) and its contents are copied into the emulated config
+ * space.  Each line of the sysfs "resource" file (start, end, flags) is
+ * turned into one pci_region_t; memory regions additionally get their
+ * "resourceN" file opened for a later mmap().
+ *
+ * \param pci_dev  passthrough device to fill in
+ * \param r_bus    host bus number
+ * \param r_dev    host device (slot) number
+ * \param r_func   host function number
+ * \return 0 on success, 1 if the config or resource file cannot be opened
+ */
+static int pt_get_real_device(pt_dev_t *pci_dev, uint8_t r_bus, uint8_t r_dev,
+			      uint8_t r_func)
+{
+	char dir[128], name[128];
+	int fd, r = 0;
+	FILE *f;
+	unsigned long long start, end, size, flags;
+	pci_region_t *rp;
+	pci_dev_t *dev = &pci_dev->real_device;
+
+	dev->region_number = 0;
+
+	snprintf(dir, sizeof(dir), "/sys/bus/pci/devices/0000:%02x:%02x.%x/",
+		 r_bus, r_dev, r_func);
+	snprintf(name, sizeof(name), "%sconfig", dir);
+	if ((fd = open(name, O_RDWR)) == -1) {
+		fprintf(stderr, "%s: %m\n", name);
+		return 1;
+	}
+	dev->config_fd = fd;
+again:
+	r = read(fd, pci_dev->dev.config, sizeof pci_dev->dev.config);
+	if (r < 0) {
+		if (errno == EINTR || errno == EAGAIN)
+			goto again;
+		fprintf(stderr, "%s: read failed, errno = %d\n", __func__,
+			errno);
+	}
+
+	snprintf(name, sizeof(name), "%sresource", dir);
+	if ((f = fopen(name, "r")) == NULL) {
+		fprintf(stderr, "%s: %m\n", name);
+		return 1;
+	}
+
+	/*
+	 * The resource file may list more entries than regions[] can hold,
+	 * so stop at MAX_IO_REGIONS instead of writing past the array.
+	 * %llx matches the unsigned long long arguments and accepts the
+	 * "0x" prefix.
+	 */
+	for (r = 0; r < MAX_IO_REGIONS &&
+	     fscanf(f, "%llx %llx %llx\n", &start, &end, &flags) == 3;
+	     r++) {
+		rp = dev->regions + r;
+		rp->valid = 0;
+		size = end - start + 1;
+		flags &= IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH;
+		/* skip empty slots and anything that is neither IO nor MEM */
+		if (size == 0 || (flags & ~IORESOURCE_PREFETCH) == 0)
+			continue;
+		if (flags & IORESOURCE_MEM) {
+			flags &= ~IORESOURCE_IO;
+			snprintf(name, sizeof(name), "%sresource%d", dir, r);
+			if ((fd = open(name, O_RDWR)) == -1)
+				continue;	/* probably ROM */
+			rp->resource_fd = fd;
+		} else
+			flags &= ~IORESOURCE_PREFETCH;
+
+		rp->type = flags;
+		rp->valid = 1;
+		rp->base_addr = start;
+		rp->size = size;
+		DEBUG("region %d size %u start 0x%llx type %d "
+		      "resource_fd %d\n", r, (unsigned) rp->size, start,
+		      rp->type, rp->resource_fd);
+	}
+	fclose(f);
+
+	dev->region_number = r;
+	return 0;
+}
+
+/*
+ * Create the emulated PCI device that fronts a host device.
+ *
+ * The device is registered on e_bus with the passthrough config-space
+ * handlers, the host device is opened through sysfs, its regions are
+ * registered as BARs and the interrupt bookkeeping is initialised.  When
+ * KVM is enabled the kernel is told about the host/guest bus and devfn
+ * pairing; the guest irq is left at zero here.
+ *
+ * \return the new device, or NULL if any step fails
+ */
+static pt_dev_t *register_real_device(PCIBus *e_bus, const char *e_dev_name,
+				      int e_devfn, uint8_t r_bus, uint8_t r_dev,
+				      uint8_t r_func)
+{
+	pt_dev_t *ptdev;
+	uint8_t guest_slot, intx;
+
+	DEBUG("register_real_device: Registering real physical "
+	      "device %s (devfn=0x%x)\n", e_dev_name, e_devfn);
+
+	ptdev = (pt_dev_t *) pci_register_device(e_bus, e_dev_name,
+						 sizeof(pt_dev_t), e_devfn,
+						 pt_pci_read_config,
+						 pt_pci_write_config);
+	if (ptdev == NULL) {
+		fprintf(stderr, "register_real_device: Error: Couldn't "
+			"register real device %s\n", e_dev_name);
+		return NULL;
+	}
+
+	if (pt_get_real_device(ptdev, r_bus, r_dev, r_func) != 0) {
+		fprintf(stderr, "register_real_device: Error: Couldn't get "
+			"real device (%s)!\n", e_dev_name);
+		return NULL;
+	}
+
+	/* Expose the host device's MMIO/PIO regions as BARs. */
+	if (pt_register_regions(ptdev->real_device.regions,
+				ptdev->real_device.region_number, ptdev) != 0)
+		return NULL;
+
+	/* Interrupt routing state: config[0x3d] is 1-based, intpin is 0-based. */
+	guest_slot = (ptdev->dev.devfn >> 3) & 0x1f;
+	intx = ptdev->dev.config[0x3d] - 1;
+	ptdev->intpin = intx;
+	ptdev->run = 0;
+	ptdev->girq = 0;
+	ptdev->h_busnr = r_bus;
+	ptdev->h_devfn = PCI_DEVFN(r_dev, r_func);
+
+#ifdef KVM_CAP_PCI_PASSTHROUGH
+	if (kvm_enabled()) {
+		struct kvm_pci_passthrough_dev assignment;
+
+		memset(&assignment, 0, sizeof(assignment));
+		assignment.guest.busnr = pci_bus_num(e_bus);
+		assignment.guest.devfn = PCI_DEVFN(guest_slot, r_func);
+		assignment.host.busnr = ptdev->h_busnr;
+		assignment.host.devfn = ptdev->h_devfn;
+
+		/* The guest irq is filled in later, once the piix config
+		 * is updated (see pci_pt_update_irq); the host irq field
+		 * is never used.
+		 */
+		if (kvm_update_pci_pt_device(kvm_context, &assignment) < 0) {
+			fprintf(stderr, "Could not notify kernel about "
+				"passthrough device\n");
+			perror("pt-ioctl");
+			return NULL;
+		}
+	}
+#endif
+
+	fprintf(logfile, "Registered host PCI device %02x:%02x.%1x "
+		"as guest device %02x:%02x.%1x\n",
+		r_bus, r_dev, r_func,
+		pci_bus_num(e_bus), guest_slot, r_func);
+
+	return ptdev;
+}
+
+/* Upper bound on the number of -pcidevice options that are accepted. */
+#define MAX_PTDEVS 4
+
+/* One slot per requested passthrough device, filled from the command line. */
+struct {
+	char name[128];		/* label given before the '/' */
+	int bus, dev, func;	/* host bus:dev.func */
+	pt_dev_t *ptdev;	/* set by pt_init_device(); NULL on failure */
+} ptdevs[MAX_PTDEVS];
+
+/* Number of ptdevs[] slots in use. */
+int nptdevs = 0;
+extern int piix_get_irq(int pin);
+
+#ifdef KVM_CAP_PCI_PASSTHROUGH
+/*
+ * Called after a PCI config-space update.  For every assigned device,
+ * recompute the guest irq (interrupt pin -> bus irq line -> piix
+ * routing) and, if it differs from the last value reported, tell the
+ * kernel about the new one.  The argument d is not used.
+ */
+void pci_pt_update_irq(PCIDevice *d)
+{
+	int slot;
+
+	for (slot = 0; slot < nptdevs; slot++) {
+		struct kvm_pci_passthrough_dev update;
+		pt_dev_t *ptdev = ptdevs[slot].ptdev;
+		int girq;
+
+		if (ptdev == NULL)
+			continue;
+
+		girq = pci_map_irq(&ptdev->dev, ptdev->intpin);
+		girq = piix_get_irq(girq);
+		if (girq == ptdev->girq)
+			continue;
+
+		memset(&update, 0, sizeof(update));
+		update.guest.irq = girq;
+		update.host.busnr = ptdev->h_busnr;
+		update.host.devfn = ptdev->h_devfn;
+		if (kvm_update_pci_pt_device(kvm_context, &update) < 0) {
+			perror("pci_pt_update_irq");
+			continue;
+		}
+		/* Only remember the irq once the kernel has accepted it. */
+		ptdev->girq = girq;
+	}
+}
+#endif
+
+/*
+ * One-time setup before any device is assigned.
+ *
+ * Raises the process I/O privilege level so the port I/O passthrough
+ * handlers can issue in/out instructions.  A failure is reported but is
+ * not fatal here.
+ *
+ * \return -1 if no passthrough devices were requested, 0 otherwise
+ */
+int pt_init_system(void)
+{
+	/* Do we have any devices to be assigned? */
+	if (nptdevs == 0)
+		return -1;
+
+	if (iopl(3) < 0)
+		fprintf(stderr, "%s: iopl(3) failed, errno = %d; "
+			"port I/O passthrough will not work\n",
+			__func__, errno);
+
+	return 0;
+}
+
+/*
+ * Register one requested passthrough device on 'bus'.
+ *
+ * *index is an iterator owned by the caller.  Passing -1 starts the
+ * iteration: the system is initialised and the last ptdevs[] slot is
+ * selected.  After each call *index is decremented, so the caller is
+ * done once it drops below zero.
+ *
+ * \return 0 on success; -1 if there is nothing to assign (in which case
+ *         *index is left at -1) or if the selected device could not be
+ *         registered
+ */
+int pt_init_device(PCIBus *bus, int *index)
+{
+	pt_dev_t *ptdev;
+	int slot;
+	int status = 0;
+
+	if (*index == -1) {
+		if (pt_init_system() < 0)
+			return -1;
+
+		*index = nptdevs - 1;
+	}
+	slot = *index;
+
+	ptdev = register_real_device(bus, ptdevs[slot].name, -1,
+				     ptdevs[slot].bus, ptdevs[slot].dev,
+				     ptdevs[slot].func);
+	if (ptdev == NULL) {
+		fprintf(stderr, "Error: Couldn't register device %s\n",
+			ptdevs[slot].name);
+		status = -1;
+	}
+	/* Stored even on failure, so the slot then holds NULL. */
+	ptdevs[slot].ptdev = ptdev;
+
+	*index = slot - 1;
+	return status;
+}
+
+/*
+ * Parse one -pcidevice argument of the form name/bus:dev.func (numbers
+ * in hex) and append it to ptdevs[].
+ *
+ * The argument is rejected with a message on stderr, leaving nptdevs
+ * unchanged, when the table is full, the argument does not fit in the
+ * name buffer, the syntax is wrong, a number is missing, or a number is
+ * out of range for a PCI address (bus > 0xff, dev > 0x1f, func > 7).
+ */
+void add_pci_passthrough_device(const char *arg)
+{
+	/* name/bus:dev.func */
+	char *cp, *cp1;
+	unsigned long bus, dev, func;
+
+	if (nptdevs >= MAX_PTDEVS) {
+		fprintf(stderr, "Too many passthrough devices (max %d)\n",
+			MAX_PTDEVS);
+		return;
+	}
+	/* The argument comes from the command line; never overrun name[]. */
+	if (strlen(arg) >= sizeof(ptdevs[nptdevs].name)) {
+		fprintf(stderr, "passthrough arg too long (max %d chars)\n",
+			(int) sizeof(ptdevs[nptdevs].name) - 1);
+		return;
+	}
+	strcpy(ptdevs[nptdevs].name, arg);
+	cp = strchr(ptdevs[nptdevs].name, '/');
+	if (cp == NULL)
+		goto bad;
+	*cp++ = 0;
+
+	bus = strtoul(cp, &cp1, 16);
+	if (cp1 == cp || *cp1 != ':' || bus > 0xff)
+		goto bad;
+	cp = cp1 + 1;
+
+	dev = strtoul(cp, &cp1, 16);
+	if (cp1 == cp || *cp1 != '.' || dev > 0x1f)
+		goto bad;
+	cp = cp1 + 1;
+
+	func = strtoul(cp, &cp1, 16);
+	if (cp1 == cp || *cp1 != 0 || func > 0x7)
+		goto bad;
+
+	ptdevs[nptdevs].bus = bus;
+	ptdevs[nptdevs].dev = dev;
+	ptdevs[nptdevs].func = func;
+	nptdevs++;
+	return;
+bad:
+	fprintf(stderr, "passthrough arg (%s) not in the form of "
+		"name/bus:dev.func\n", arg);
+}
diff --git a/qemu/hw/pci-passthrough.h b/qemu/hw/pci-passthrough.h
new file mode 100644
index 0000000..60df017
--- /dev/null
+++ b/qemu/hw/pci-passthrough.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2007, Neocleus Corporation.
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Data structures for storing PCI state
+ *
+ * Adapted to kvm by Qumranet
+ *
+ * Copyright (c) 2007, Neocleus, Alex Novik (alex at neocleus.com)
+ * Copyright (c) 2007, Neocleus, Guy Zana (guy at neocleus.com)
+ * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah at qumranet.com)
+ */
+
+#ifndef __PCI_PASSTHROUGH_H__
+#define __PCI_PASSTHROUGH_H__
+
+#include <sys/mman.h>
+#include "qemu-common.h"
+#include "pci.h"
+#include <linux/types.h>
+
+#define PT_DEBUG_PIO (0x01)
+#define PT_DEBUG_MMIO (0x02)
+
+/* From include/linux/pci.h in the kernel sources */
+#define PCI_DEVFN(slot,func) ((((slot) & 0x1f) << 3) | ((func) & 0x07))
+
+typedef uint32_t pciaddr_t;
+
+#define MAX_IO_REGIONS (6)
+
+typedef struct pci_region_s { /* one host region, parsed from a line of the sysfs "resource" file */
+ int type; /* IORESOURCE_* flags: memory (optionally prefetchable) or port I/O */
+ int valid; /* non-zero once the region has been recognised as usable */
+ pciaddr_t base_addr; /* host start address of the region */
+ pciaddr_t size; /* size of the region */
+ int resource_fd; /* fd of the sysfs "resourceN" file; only opened for memory regions */
+} pci_region_t;
+
+typedef struct pci_dev_s { /* host-side description of an assigned device */
+ uint8_t bus, dev, func; /* Bus inside domain, device and function */
+ int irq; /* IRQ number */
+ uint16_t region_number; /* number of "resource" lines parsed, i.e. entries used in regions[] (valid or not) */
+
+ /* Port I/O or MMIO Regions */
+ pci_region_t regions[MAX_IO_REGIONS];
+ int config_fd; /* fd of the device's sysfs "config" file, opened read/write */
+} pci_dev_t;
+
+typedef struct pt_region_s { /* per-BAR state handed to the MMIO / port I/O handlers as 'opaque' */
+ target_phys_addr_t e_physbase; /* base of the region as seen by the guest; updated by the map callbacks */
+ uint32_t memory_index; /* value returned by cpu_register_io_memory(); 0 for port I/O */
+ void *r_virtbase; /* mmapped access address; for port I/O the host port base cast to a pointer */
+ int num; /* our index within v_addrs[] */
+ uint32_t debug; /* mask of PT_DEBUG_PIO / PT_DEBUG_MMIO enabling access logging */
+} pt_region_t;
+
+typedef struct pt_dev_s { /* emulated PCI device fronting a host device */
+ PCIDevice dev; /* must stay first: PCIDevice pointers are cast to pt_dev_t pointers */
+ int intpin; /* interrupt pin, 0-based: config[0x3d] - 1 */
+ uint8_t debug_flags;
+ pt_region_t v_addrs[PCI_NUM_REGIONS]; /* one entry per BAR, indexed by region number */
+ pci_dev_t real_device; /* host-side description (sysfs fds, regions) */
+ int run;
+ int girq; /* guest irq last reported to the kernel; 0 until the first update */
+ char sirq[4];
+ unsigned char h_busnr; /* host bus number */
+ unsigned int h_devfn; /* host devfn, built with PCI_DEVFN() */
+ int bound;
+} pt_dev_t;
+
+/* Initialization functions */
+int pt_init_device(PCIBus *bus, int *index);
+void add_pci_passthrough_device(const char *arg);
+void pt_set_vector(int irq, int vector);
+void pt_ack_mirq(int vector);
+
+#define logfile stderr
+
+#endif /* __PCI_PASSTHROUGH_H__ */
diff --git a/qemu/hw/pci.c b/qemu/hw/pci.c
index 92683d1..ff21b83 100644
--- a/qemu/hw/pci.c
+++ b/qemu/hw/pci.c
@@ -50,6 +50,7 @@ struct PCIBus {
static void pci_update_mappings(PCIDevice *d);
static void pci_set_irq(void *opaque, int irq_num, int level);
+void pci_pt_update_irq(PCIDevice *d);
target_phys_addr_t pci_mem_base;
static int pci_irq_index;
@@ -453,6 +454,12 @@ void pci_default_write_config(PCIDevice *d,
val >>= 8;
}
+#ifdef KVM_CAP_PCI_PASSTHROUGH
+ if (kvm_enabled() && qemu_kvm_irqchip_in_kernel() &&
+ address >= 0x60 && address <= 0x63)
+ pci_pt_update_irq(d);
+#endif
+
end = address + len;
if (end > PCI_COMMAND && address < (PCI_COMMAND + 2)) {
/* if the command register is modified, we must modify the mappings */
@@ -555,6 +562,11 @@ static void pci_set_irq(void *opaque, int irq_num, int level)
bus->set_irq(bus->irq_opaque, irq_num, bus->irq_count[irq_num] != 0);
}
+int pci_map_irq(PCIDevice *pci_dev, int pin) /* exposes the bus's map_irq hook to code outside pci.c */
+{
+ return pci_dev->bus->map_irq(pci_dev, pin); /* result is whatever the bus's map_irq callback returns for this device and pin */
+}
+
/***********************************************************/
/* monitor info on PCI */
diff --git a/qemu/hw/pci.h b/qemu/hw/pci.h
index 60e4094..e11fbbf 100644
--- a/qemu/hw/pci.h
+++ b/qemu/hw/pci.h
@@ -81,6 +81,7 @@ void pci_register_io_region(PCIDevice *pci_dev, int region_num,
uint32_t size, int type,
PCIMapIORegionFunc *map_func);
+int pci_map_irq(PCIDevice *pci_dev, int pin);
uint32_t pci_default_read_config(PCIDevice *d,
uint32_t address, int len);
void pci_default_write_config(PCIDevice *d,
diff --git a/qemu/hw/piix_pci.c b/qemu/hw/piix_pci.c
index 90cb3a6..112381a 100644
--- a/qemu/hw/piix_pci.c
+++ b/qemu/hw/piix_pci.c
@@ -237,6 +237,25 @@ static void piix3_set_irq(qemu_irq *pic, int irq_num, int level)
}
}
+int piix3_get_pin(int pic_irq) /* reverse lookup: which of the four 0x60..0x63 config bytes holds pic_irq */
+{
+ int i;
+ for (i = 0; i < 4; i++)
+ if (piix3_dev->config[0x60+i] == pic_irq) /* NOTE(review): piix3_dev is dereferenced without the NULL check piix_get_irq() performs */
+ return i; /* first matching index wins */
+ return -1; /* no config byte matches pic_irq */
+}
+
+int piix_get_irq(int pin) /* returns config byte 0x60+pin of the piix3 device if present, else of the piix4 device */
+{
+ if (piix3_dev)
+ return piix3_dev->config[0x60+pin]; /* NOTE(review): pin is not range-checked here; callers presumably pass 0..3 */
+ if (piix4_dev)
+ return piix4_dev->config[0x60+pin];
+
+ return 0; /* neither device exists */
+}
+
static void piix3_reset(PCIDevice *d)
{
uint8_t *pci_conf = d->config;
diff --git a/qemu/vl.c b/qemu/vl.c
index 3032eaf..4946e9a 100644
--- a/qemu/vl.c
+++ b/qemu/vl.c
@@ -37,6 +37,7 @@
#include "qemu-char.h"
#include "block.h"
#include "audio/audio.h"
+#include "hw/pci-passthrough.h"
#include "migration.h"
#include "qemu-kvm.h"
@@ -7786,6 +7787,11 @@ static void help(int exitcode)
#endif
"-no-kvm-irqchip disable KVM kernel mode PIC/IOAPIC/LAPIC\n"
"-no-kvm-pit disable KVM kernel mode PIT\n"
+#if defined(TARGET_I386) || defined(TARGET_X86_64)
+ "-pcidevice name/bus:dev.func\n"
+ " expose a PCI device to the guest OS.\n"
+ " 'name' is just used for debug logs.\n"
+#endif
#endif
#ifdef TARGET_I386
"-std-vga simulate a standard VGA card with VESA Bochs Extensions\n"
@@ -7909,6 +7915,9 @@ enum {
QEMU_OPTION_no_kvm,
QEMU_OPTION_no_kvm_irqchip,
QEMU_OPTION_no_kvm_pit,
+#if defined(TARGET_I386) || defined(TARGET_X86_64)
+ QEMU_OPTION_pcidevice,
+#endif
QEMU_OPTION_no_reboot,
QEMU_OPTION_no_shutdown,
QEMU_OPTION_show_cursor,
@@ -7997,6 +8006,9 @@ const QEMUOption qemu_options[] = {
#endif
{ "no-kvm-irqchip", 0, QEMU_OPTION_no_kvm_irqchip },
{ "no-kvm-pit", 0, QEMU_OPTION_no_kvm_pit },
+#if defined(TARGET_I386) || defined(TARGET_X86_64)
+ { "pcidevice", HAS_ARG, QEMU_OPTION_pcidevice },
+#endif
#endif
#if defined(TARGET_PPC) || defined(TARGET_SPARC)
{ "g", 1, QEMU_OPTION_g },
@@ -8909,6 +8921,11 @@ int main(int argc, char **argv)
kvm_pit = 0;
break;
}
+#if defined(TARGET_I386) || defined(TARGET_X86_64)
+ case QEMU_OPTION_pcidevice:
+ add_pci_passthrough_device(optarg);
+ break;
+#endif
#endif
case QEMU_OPTION_usb:
usb_enabled = 1;
--
1.5.4.3
Amit Shah
2008-Jun-27 12:58 UTC
[PATCH 2/2] KVM: PCIPT: irqhook module for interrupt injection into guests with
From: Nir Peleg <nir at tutis.com>
From: Or Sagi <ors at tutis.com>
When using the --no-kvm-irqchip option, this irqhook module injects
interrupts into the guests for assigned devices.
This module is not well-supported and only exists for debugging and
for legacy / non-x86 support.
Signed-off-by: Amit Shah <amit.shah at qumranet.com>
---
Makefile | 10 ++-
irqhook/Kbuild | 3 +
irqhook/Makefile | 25 +++++
irqhook/irqhook_main.c | 215 +++++++++++++++++++++++++++++++++++++++++++++
qemu/hw/apic.c | 4 +
qemu/hw/pci-passthrough.c | 171 ++++++++++++++++++++++++++++++++++--
qemu/hw/pci-passthrough.h | 1 +
qemu/vl.c | 4 +-
8 files changed, 421 insertions(+), 12 deletions(-)
create mode 100644 irqhook/Kbuild
create mode 100644 irqhook/Makefile
create mode 100644 irqhook/irqhook_main.c
diff --git a/Makefile b/Makefile
index 48a8dff..d4246fd 100644
--- a/Makefile
+++ b/Makefile
@@ -7,16 +7,16 @@ rpmrelease = devel
sane-arch = $(subst i386,x86,$(subst x86_64,x86,$(ARCH)))
-.PHONY: kernel user libkvm qemu bios vgabios extboot clean libfdt
+.PHONY: kernel irqhook user libkvm qemu bios vgabios extboot clean libfdt
all: libkvm qemu
ifneq '$(filter $(ARCH), x86_64 i386 ia64)' ''
- all: $(if $(WANT_MODULE), kernel) user
+ all: $(if $(WANT_MODULE), kernel irqhook) user
endif
kcmd = $(if $(WANT_MODULE),,@\#)
-qemu kernel user libkvm:
+qemu kernel user irqhook libkvm:
$(MAKE) -C $@
qemu: libkvm
@@ -77,6 +77,7 @@ install-rpm:
install:
$(kcmd)make -C kernel DESTDIR="$(DESTDIR)" install
+ $(kcmd)make -C irqhook DESTDIR="$(DESTDIR)" install
make -C libkvm DESTDIR="$(DESTDIR)" install
make -C qemu DESTDIR="$(DESTDIR)" install
@@ -97,6 +98,7 @@ srpm:
tar czf $(RPMTOPDIR)/SOURCES/user.tar.gz user
tar czf $(RPMTOPDIR)/SOURCES/libkvm.tar.gz libkvm
tar czf $(RPMTOPDIR)/SOURCES/kernel.tar.gz kernel
+ tar czf $(RPMTOPDIR)/SOURCES/irqhook.tar.gz irqhook
tar czf $(RPMTOPDIR)/SOURCES/scripts.tar.gz scripts
tar czf $(RPMTOPDIR)/SOURCES/extboot.tar.gz extboot
cp Makefile configure kvm_stat $(RPMTOPDIR)/SOURCES
@@ -104,7 +106,7 @@ srpm:
$(RM) $(tmpspec)
clean:
- for i in $(if $(WANT_MODULE), kernel) user libkvm qemu libfdt; do \
+ for i in $(if $(WANT_MODULE), kernel irqhook) user libkvm qemu libfdt; do \
make -C $$i clean; \
done
diff --git a/irqhook/Kbuild b/irqhook/Kbuild
new file mode 100644
index 0000000..9af75a4
--- /dev/null
+++ b/irqhook/Kbuild
@@ -0,0 +1,3 @@
+EXTRA_CFLAGS := -I$(src)/include
+obj-m := irqhook.o
+irqhook-objs := irqhook_main.o
diff --git a/irqhook/Makefile b/irqhook/Makefile
new file mode 100644
index 0000000..3b1d851
--- /dev/null
+++ b/irqhook/Makefile
@@ -0,0 +1,25 @@
+include ../config.mak
+
+KVERREL = $(patsubst /lib/modules/%/build,%,$(KERNELDIR))
+
+DESTDIR=
+
+INSTALLDIR = $(patsubst %/build,%/extra,$(KERNELDIR))
+
+rpmrelease = devel
+
+LINUX = ../linux-2.6
+
+all::
+ $(MAKE) -C $(KERNELDIR) M=`pwd` "$$@"
+
+#sync:
+# rsync --exclude='*.mod.c' "$(LINUX)"/drivers/irqhook/*.[ch] .
+
+install:
+ mkdir -p $(DESTDIR)/$(INSTALLDIR)
+ cp *.ko $(DESTDIR)/$(INSTALLDIR)
+ /sbin/depmod -a
+
+clean:
+ $(MAKE) -C $(KERNELDIR) M=`pwd` $@
diff --git a/irqhook/irqhook_main.c b/irqhook/irqhook_main.c
new file mode 100644
index 0000000..0f93d17
--- /dev/null
+++ b/irqhook/irqhook_main.c
@@ -0,0 +1,215 @@
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/bitmap.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/miscdevice.h>
+#include <linux/pci.h>
+
+#include <asm/uaccess.h>
+
+#define irqh_VERSION "0.0.1"
+#define irqh_MODULE_NAME "irqhook"
+#define irqh_DRIVER_NAME irqh_MODULE_NAME " HW IRQ hook " irqh_VERSION
+
+/*
+ * Based on earlier proprietary Tutis code; this modified version goes under
+ * the GPL.
+ */
+MODULE_AUTHOR("Nir Peleg - Tutis");
+MODULE_DESCRIPTION("IRQ hook driver");
+MODULE_LICENSE("GPL");
+
+//#define irqh_DEBUG /* define to enable copious debugging info */
+
+/*
+ * DPRINTK: debug trace prefixed with the calling function's name.  When
+ * irqh_DEBUG is not defined it expands to an empty statement, so it stays
+ * safe as the sole body of an if/else.
+ */
+#ifdef irqh_DEBUG
+#define DPRINTK(fmt, args...) printk(KERN_ALERT "%s: " fmt, __FUNCTION__ , ## args)
+#else
+#define DPRINTK(fmt, args...) do { } while (0)
+#endif
+
+/* ERROR: always-on message prefixed with the calling function's name. */
+#define ERROR(fmt, args...) printk(KERN_ALERT "%s: " fmt, __FUNCTION__ , ## args)
+
+/* Taken by the interrupt handler and by readers around the 'pending' scan. */
+static spinlock_t irqh_lock;
+/* Readers sleep here until the interrupt handler marks an IRQ pending. */
+static wait_queue_head_t irqh_proc_list;
+
+/* pending: IRQs that fired and have not yet been reported to a reader. */
+static DECLARE_BITMAP(pending, NR_IRQS);
+/* handled: IRQs on which irqh_interrupt is currently installed. */
+static DECLARE_BITMAP(handled, NR_IRQS);
+
+/* Thin wrappers so callers name the bitmap first and the IRQ second. */
+#define irqh_on(which, bit) test_bit(bit, which)
+#define irqh_set(which, bit) set_bit(bit, which)
+#define irqh_clear(which, bit) clear_bit(bit, which)
+#define irqh_ffs(which) find_first_bit(which, NR_IRQS)
+
+/*
+ * irqh_interrupt - handler installed (IRQF_SHARED) on every bound IRQ.
+ *
+ * Marks the IRQ pending, wakes any reader sleeping in irqh_dev_read() and
+ * masks the line with disable_irq_nosync(); the line is unmasked again when
+ * userspace writes the bare IRQ number back (see irqh_dev_write()).
+ *
+ * If the IRQ is not marked in 'handled' nothing is done, so the handler
+ * reports IRQ_NONE instead of claiming an interrupt it did not service.
+ */
+static irqreturn_t
+irqh_interrupt(int irq, void *p)
+{
+	unsigned long flags;
+
+	DPRINTK("interrupt: %d\n", irq);
+	if (!irqh_on(handled, irq))
+		return IRQ_NONE;
+	spin_lock_irqsave(&irqh_lock, flags);
+	irqh_set(pending, irq);
+	wake_up_interruptible(&irqh_proc_list);
+	spin_unlock_irqrestore(&irqh_lock, flags);
+	/* Keep the line quiet until userspace acknowledges the interrupt. */
+	disable_irq_nosync(irq);
+	return IRQ_HANDLED;
+}
+
+/*
+ * irqh_dev_write - control interface of /dev/irqhook.
+ *
+ * The written text is one of
+ *   "+<irq>" or "+<bus>:<dev>.<fn>"  bind: install irqh_interrupt on the IRQ
+ *   "-<irq>" or "-<bus>:<dev>.<fn>"  unbind: remove the handler again
+ *   "<irq>"                          acknowledge: re-enable a bound IRQ
+ * Numbers are parsed with base auto-detection (simple_strtol base 0).  For
+ * the PCI form the IRQ is taken from the matching struct pci_dev.
+ *
+ * Returns 'size' on success, or a negative errno:
+ *   -EINVAL  not opened for writing, input too long, malformed PCI address,
+ *            IRQ out of range, or unbind/ack of an IRQ that is not bound
+ *   -EFAULT  user buffer could not be copied
+ *   -ENOENT  no PCI device at the given address
+ *   -EBUSY   IRQ already bound
+ *   -EIO     request_irq() failed
+ */
+static ssize_t
+irqh_dev_write(struct file *fp, const char __user *buf, size_t size, loff_t *offp)
+{
+	int n, device, func, devfn;
+	int enabled = 0;	/* pci_enable_device() succeeded in this call */
+	ssize_t ret = size;
+	char arg[32], *cp, *cp1;
+	struct pci_dev *pdp = NULL;
+
+	DPRINTK("ENTER\n");
+	if ((fp->f_mode & FMODE_WRITE) == 0 || size > sizeof arg)
+		return -EINVAL;
+
+	if (size >= sizeof arg || copy_from_user(arg, buf, size))
+		return -EFAULT;
+	arg[size] = 0;
+	cp = arg + (arg[0] == '+' || arg[0] == '-');
+	n = simple_strtol(cp, &cp1, 0);
+	if (*cp1 == ':') {
+		device = simple_strtol(cp1 + 1, &cp1, 0);
+		/* Without the '.' the next parse would start past the NUL. */
+		if (*cp1 != '.')
+			return -EINVAL;
+		func = simple_strtol(cp1 + 1, NULL, 0);
+		DPRINTK("PCI dev %d:%d.%d\n", n, device, func);
+		devfn = PCI_DEVFN(device, func);
+		/*
+		 * Leaving the iteration early keeps the reference that the
+		 * iterator took on pdp; it is dropped at 'done'.
+		 */
+		for_each_pci_dev(pdp) {
+			if (pdp->bus->number == n && pdp->devfn == devfn)
+				break;
+		}
+		if (!pdp) {
+			ERROR("PCI device not found\n");
+			return -ENOENT;
+		}
+		n = pdp->irq;
+	}
+
+	DPRINTK("IRQ %d\n", n);
+	if (arg[0] == '+') {
+		if (pdp) {
+			if (pci_enable_device(pdp))
+				ERROR("device not enabled\n");
+			else
+				enabled = 1;
+			/* Re-read: the IRQ is looked up again after enabling. */
+			n = pdp->irq;
+		}
+		/* Range-check before n is used as a bitmap index. */
+		if ((unsigned)n >= NR_IRQS) {
+			if (pdp)
+				ERROR("device has invalid IRQ set\n");
+			ret = -EINVAL;
+			goto fail;
+		}
+		if (irqh_on(handled, n)) {
+			ret = -EBUSY;
+			goto fail;
+		}
+		if (request_irq(n, irqh_interrupt, IRQF_SHARED, irqh_MODULE_NAME,
+				(void *)irqh_interrupt)) {
+			ERROR("request_irq failed\n");
+			ret = -EIO;
+			goto fail;
+		}
+		printk("Bound machine irq %d\n", n);
+		irqh_set(handled, n);
+		goto done;
+	}
+
+	/* Unbind and acknowledge only make sense for an IRQ we have bound. */
+	if ((unsigned)n >= NR_IRQS || !irqh_on(handled, n)) {
+		ret = -EINVAL;
+		goto done;
+	}
+	if (arg[0] == '-') {
+		if (pdp)
+			pci_disable_device(pdp);
+		free_irq(n, (void *)irqh_interrupt);
+		irqh_clear(handled, n);
+	} else {
+		/* Undo the disable_irq_nosync() done by irqh_interrupt(). */
+		enable_irq(n);
+	}
+	goto done;
+
+fail:
+	/* Binding failed: do not leave the device enabled on our behalf. */
+	if (enabled)
+		pci_disable_device(pdp);
+done:
+	pci_dev_put(pdp);	/* accepts NULL */
+	DPRINTK("DONE\n");
+	return ret;
+}
+
+/*
+ * irqh_dev_read - report the next pending IRQ to userspace.
+ *
+ * Blocks (unless O_NONBLOCK is set) until some bound IRQ is pending, then
+ * copies its number to 'buf' as a NUL-terminated decimal string, truncated
+ * to 'size' bytes, and clears its pending bit.
+ *
+ * Returns the number of bytes copied, or a negative errno:
+ *   -EINVAL       not opened for reading
+ *   -EWOULDBLOCK  nothing pending and O_NONBLOCK is set
+ *   -ERESTARTSYS  interrupted by a signal
+ *   -EFAULT       'buf' could not be written (the IRQ stays pending)
+ */
+static ssize_t
+irqh_dev_read(struct file *fp, char __user *buf, size_t size, loff_t *offp)
+{
+	char b[20];
+	int m = -ERESTARTSYS, n = NR_IRQS;
+
+	DECLARE_WAITQUEUE(wait, current);
+
+	DPRINTK("ENTER\n");
+	if ((fp->f_mode & FMODE_READ) == 0)
+		return -EINVAL;
+	spin_lock_irq(&irqh_lock);
+	while (!signal_pending(current)) {
+		if ((n = irqh_ffs(pending)) < NR_IRQS) {
+			/* Claim it under the lock; copy to userspace later. */
+			irqh_clear(pending, n);
+			break;
+		}
+		if (fp->f_flags & O_NONBLOCK) {
+			m = -EWOULDBLOCK;
+			break;
+		}
+		add_wait_queue(&irqh_proc_list, &wait);
+		set_current_state(TASK_INTERRUPTIBLE);
+		spin_unlock_irq(&irqh_lock);
+		schedule();
+		spin_lock_irq(&irqh_lock);
+		__set_current_state(TASK_RUNNING);
+		remove_wait_queue(&irqh_proc_list, &wait);
+	}
+	spin_unlock_irq(&irqh_lock);
+
+	/* No IRQ claimed: m holds the reason (-ERESTARTSYS or -EWOULDBLOCK). */
+	if (n >= NR_IRQS)
+		return m;
+
+	/*
+	 * copy_to_user() may fault and sleep, so it must run without the
+	 * spinlock held and with interrupts enabled.
+	 */
+	m = sprintf(b, "%d", n) + 1;
+	if ((size_t)m > size)
+		m = size;
+	if (copy_to_user(buf, b, m)) {
+		/* Give the IRQ back so that it is not lost. */
+		irqh_set(pending, n);
+		wake_up_interruptible(&irqh_proc_list);
+		return -EFAULT;
+	}
+	return m;
+}
+
+/* File operations of the irqhook device: only read and write are provided. */
+static struct file_operations irqh_chrdev_ops = {
+	.owner = THIS_MODULE,
+	.read  = irqh_dev_read,
+	.write = irqh_dev_write,
+};
+
+/* Let the misc subsystem pick the minor number. */
+#define irqh_MISCDEV_MINOR MISC_DYNAMIC_MINOR
+
+/* Misc device through which userspace binds, reads and acknowledges IRQs. */
+static struct miscdevice irqh_miscdev = {
+	.minor = irqh_MISCDEV_MINOR,
+	.name  = irqh_MODULE_NAME,
+	.fops  = &irqh_chrdev_ops,
+};
+
+/*
+ * irqh_init - module entry point.
+ *
+ * Initialises the lock and the wait queue first and registers the misc
+ * device last: once misc_register() returns the device can be opened, and
+ * its read/write paths use both objects.
+ *
+ * Returns 0 on success or the error code from misc_register().
+ */
+static int __init
+irqh_init(void)
+{
+	int rc;
+
+	DPRINTK("ENTER\n");
+
+	init_waitqueue_head(&irqh_proc_list);
+	spin_lock_init(&irqh_lock);
+
+	rc = misc_register(&irqh_miscdev);
+	if (rc) {
+		printk(KERN_ERR irqh_MODULE_NAME ": " "cannot register misc device\n");
+		DPRINTK("EXIT, returning %d\n", rc);
+		return rc;
+	}
+
+	printk(KERN_INFO irqh_DRIVER_NAME " loaded\n");
+
+	DPRINTK("EXIT, returning 0\n");
+	return 0;
+}
+
+/*
+ * irqh_cleanup - module exit: release every IRQ still marked in 'handled',
+ * then unregister the misc device.
+ */
+static void __exit
+irqh_cleanup(void)
+{
+	int irq;
+
+	DPRINTK("ENTER\n");
+
+	for (;;) {
+		irq = irqh_ffs(handled);
+		if (irq >= NR_IRQS)
+			break;
+		irqh_clear(handled, irq);
+		free_irq(irq, (void *)irqh_interrupt);
+	}
+	misc_deregister(&irqh_miscdev);
+
+	DPRINTK("EXIT\n");
+}
+
+module_init (irqh_init);
+module_exit (irqh_cleanup);
diff --git a/qemu/hw/apic.c b/qemu/hw/apic.c
index 4ebf1ff..7d45385 100644
--- a/qemu/hw/apic.c
+++ b/qemu/hw/apic.c
@@ -23,6 +23,8 @@
#include "qemu-kvm.h"
+#include "pci-passthrough.h"
+
//#define DEBUG_APIC
//#define DEBUG_IOAPIC
@@ -389,6 +391,7 @@ static void apic_eoi(APICState *s)
/* XXX: send the EOI packet to the APIC bus to allow the I/O APIC to
set the remote IRR bit for level triggered interrupts. */
apic_update_irq(s);
+ pt_ack_mirq(isrv);
}
static void apic_get_delivery_bitmask(uint32_t *deliver_bitmask,
@@ -1144,6 +1147,7 @@ static void ioapic_mem_writel(void *opaque,
target_phys_addr_t addr, uint32_t va
} else {
s->ioredtbl[index] &= ~0xffffffffULL;
s->ioredtbl[index] |= val;
+ pt_set_vector(index, (val << 24) >> 24);
}
ioapic_service(s);
}
diff --git a/qemu/hw/pci-passthrough.c b/qemu/hw/pci-passthrough.c
index 250d7ef..1cf1d0f 100644
--- a/qemu/hw/pci-passthrough.c
+++ b/qemu/hw/pci-passthrough.c
@@ -398,9 +398,11 @@ again:
return 0;
}
+static int pt_bind_mirq(int bus, int dev, int fn);
+
static pt_dev_t *register_real_device(PCIBus *e_bus, const char *e_dev_name,
int e_devfn, uint8_t r_bus, uint8_t r_dev,
- uint8_t r_func)
+ uint8_t r_func, uint32_t machine_irq)
{
int rc;
pt_dev_t *pci_dev;
@@ -435,10 +437,24 @@ static pt_dev_t *register_real_device(PCIBus *e_bus, const
char *e_dev_name,
e_intx = pci_dev->dev.config[0x3d] - 1;
pci_dev->intpin = e_intx;
pci_dev->run = 0;
+ pci_dev->mirq = machine_irq;
pci_dev->girq = 0;
pci_dev->h_busnr = r_bus;
pci_dev->h_devfn = PCI_DEVFN(r_dev, r_func);
+ /* bind machine_irq to device */
+ if (machine_irq && (!kvm_enabled() || !qemu_kvm_irqchip_in_kernel()))
{
+ DEBUG(logfile, "Binding mirq %u to device=0x%x intpin=0x%x\n",
+ machine_irq, e_device, pci_dev->intpin);
+ rc = pt_bind_mirq(r_bus, r_dev, r_func);
+ if (rc) {
+ fprintf(stderr, "pt_bind %d failed rc=%d\n",
+ pci_dev->mirq, rc);
+ return NULL;
+ }
+ sprintf(pci_dev->sirq, "%d", pci_dev->mirq);
+ }
+
#ifdef KVM_CAP_PCI_PASSTHROUGH
if (kvm_enabled()) {
struct kvm_pci_passthrough_dev pci_pt_dev;
@@ -464,9 +480,9 @@ static pt_dev_t *register_real_device(PCIBus *e_bus, const
char *e_dev_name,
}
#endif
- fprintf(logfile, "Registered host PCI device %02x:%02x.%1x "
+ fprintf(logfile, "Registered host PCI device %02x:%02x.%1x-%u "
"as guest device %02x:%02x.%1x\n",
- r_bus, r_dev, r_func,
+ r_bus, r_dev, r_func, machine_irq,
pci_bus_num(e_bus), e_device, r_func);
return pci_dev;
@@ -478,6 +494,7 @@ struct {
int bus;
int dev;
int func;
+ int irq;
pt_dev_t *ptdev;
} ptdevs[MAX_PTDEVS];
@@ -518,6 +535,62 @@ void pci_pt_update_irq(PCIDevice *d)
}
#endif
+/* Bottom half scheduled by the IRQ thread; it runs pt_bh(). */
+static QEMUBH *ptbh;
+/* Descriptor of IRQHOOK_DEV, opened in pt_init_system(). */
+static int irqfd;
+/*
+ * Per-vector, NULL-terminated lists of passthrough devices, filled in by
+ * pt_set_vector() and walked by pt_ack_mirq().  Valid vectors are
+ * 0x10 - 0xfe according to the Intel IOAPIC spec; the table has one slot
+ * for every 8-bit vector so that any vector value can index it safely.
+ */
+static pt_dev_t **apicv[0x100];
+#define IRQHOOK_DEV "/dev/irqhook"
+/* Thread running pt_irq(). */
+static pthread_t irqthread;
+
+/*
+ * pt_irq - body of the IRQ thread.
+ *
+ * Blocks every signal in this thread, then loops reading host IRQ numbers
+ * (decimal text) from the irqhook device.  For each one it flags every
+ * passthrough device whose machine IRQ matches and schedules the bottom
+ * half, which raises the guest interrupt.
+ *
+ * Exits the process if the irqhook device is not open; returns NULL when a
+ * read fails with anything other than EINTR.
+ */
+static void *pt_irq(void *arg)
+{
+    char buf[20];
+    ssize_t len;
+    int irq;
+    int i;
+    pt_dev_t *dev;
+    sigset_t signals;
+
+    /* Per-thread mask: sigprocmask() is unspecified in threaded programs. */
+    sigfillset(&signals);
+    pthread_sigmask(SIG_BLOCK, &signals, NULL);
+
+    /* 0 means "never opened" (static default), negative a failed open(). */
+    if (irqfd <= 0) {
+        fprintf(stderr, "pt_irq: irqfd %d, exiting\n", irqfd);
+        exit(-1);
+    }
+
+    for (;;) {
+        /* Leave room for the terminator that atoi() relies on. */
+        len = read(irqfd, buf, sizeof(buf) - 1);
+        if (len == -1) {
+            if (errno == EINTR)
+                continue;
+            perror("irq read");
+            break;
+        }
+        buf[len] = '\0';
+
+        irq = atoi(buf);
+        DEBUG("read irq %d\n", irq);
+        if (!irq)
+            continue;
+
+        for (i = 0; i < nptdevs; i++)
+            if ((dev = ptdevs[i].ptdev) && dev->mirq == irq)
+                dev->run = 1;
+        qemu_bh_schedule(ptbh);
+    }
+    return NULL;
+}
+
+/*
+ * pt_bh - bottom half scheduled by pt_irq().
+ *
+ * For every passthrough device flagged as 'run', raise its guest interrupt
+ * line, clear the flag and, if a CPU is currently executing, ask it to exit.
+ */
+static void pt_bh(void *p)
+{
+    int idx;
+
+    for (idx = 0; idx < nptdevs; idx++) {
+        pt_dev_t *ptd = ptdevs[idx].ptdev;
+
+        if (!ptd || !ptd->run)
+            continue;
+
+        qemu_set_irq(ptd->dev.irq[ptd->intpin], 1);
+        ptd->run = 0;
+        if (cpu_single_env)
+            cpu_interrupt(cpu_single_env, CPU_INTERRUPT_EXIT);
+    }
+}
+
int pt_init_system(void)
{
/* Do we have any devices to be assigned? */
@@ -526,6 +599,17 @@ int pt_init_system(void)
iopl(3);
+    /*
+     * Without the in-kernel irqchip, host interrupts are delivered through
+     * the irqhook device and injected from a bottom half.
+     */
+    if (!kvm_enabled() || !qemu_kvm_irqchip_in_kernel()) {
+        ptbh = qemu_bh_new(pt_bh, NULL);
+        if (!ptbh) {
+            fprintf(stderr, "Couldn't register PT callback\n");
+            return -1;
+        }
+        /* open() signals failure with -1; 0 is a valid descriptor. */
+        irqfd = open(IRQHOOK_DEV, O_RDWR);
+        if (irqfd < 0) {
+            fprintf(stderr, "Couldn't open PT irqhook dev, make "
+                    "sure the irqhook module is loaded\n");
+            return -1;
+        }
+    }
return 0;
}
@@ -544,7 +628,7 @@ int pt_init_device(PCIBus *bus, int *index)
dev = register_real_device(bus, ptdevs[i].name, -1,
ptdevs[i].bus, ptdevs[i].dev,
- ptdevs[i].func);
+ ptdevs[i].func, ptdevs[i].irq);
if (dev == NULL) {
fprintf(stderr, "Error: Couldn't register device %s\n",
ptdevs[i].name);
@@ -552,13 +636,23 @@ int pt_init_device(PCIBus *bus, int *index)
}
ptdevs[i].ptdev = dev;
+ if (!*index && kvm_enabled() && !qemu_kvm_irqchip_in_kernel())
{
+ if (ptdevs[i].irq == 0) {
+ fprintf(stderr, "Please specify the irq for the device\n");
+ return -1;
+ }
+ if (pthread_create(&irqthread, 0, pt_irq, dev)) {
+ fprintf(stderr, "Couldn't create IRQ thread\n");
+ return -1;
+ }
+ }
--*index;
return ret;
}
void add_pci_passthrough_device(const char *arg)
{
- /* name/bus:dev.func */
+ /* name/bus:dev.func-intr */
char *cp, *cp1;
if (nptdevs >= MAX_PTDEVS) {
@@ -583,12 +677,75 @@ void add_pci_passthrough_device(const char *arg)
cp = cp1 + 1;
ptdevs[nptdevs].func = strtoul(cp, &cp1, 16);
- if (*cp1 != 0)
+
+ /* In case of irqchip_in_kernel, we don't want the next param */
+ if (*cp1 == 0) {
+ ptdevs[nptdevs].irq = 0;
+ goto skip_irq;
+ }
+ if (*cp1 != '-')
goto bad;
+ cp = cp1 + 1;
+ ptdevs[nptdevs].irq = strtoul(cp, &cp1, 0);
+ if (*cp1 != 0)
+ goto bad;
+skip_irq:
nptdevs++;
return;
bad:
fprintf(stderr, "passthrough arg (%s) not in the form of "
- "name/bus:dev.func\n", arg);
+ "name/bus:dev.func-intr\n", arg);
+}
+
+/*
+ * pt_ack_mirq - acknowledge the host IRQs behind a guest APIC vector.
+ *
+ * For every passthrough device registered on 'vector', write the device's
+ * IRQ string to the irqhook device and lower the device's guest interrupt
+ * line.  Vectors outside the table, or with no devices, are ignored.
+ */
+void pt_ack_mirq(int vector)
+{
+    pt_dev_t **p;
+
+    if (vector < 0 || vector >= (int)(sizeof(apicv) / sizeof(apicv[0])))
+        return;
+
+    p = apicv[vector];
+    if (!p)
+        return;
+
+    for (; *p; p++) {
+        size_t len = strlen((*p)->sirq);
+
+        /* A lost ack leaves the host IRQ masked, so at least report it. */
+        if (write(irqfd, (*p)->sirq, len) != (ssize_t)len)
+            perror("pt_ack_mirq");
+        qemu_set_irq((*p)->dev.irq[(*p)->intpin], 0);
+    }
+}
+
+/*
+ * pt_bind_mirq - ask the irqhook driver to bind the IRQ of a host device.
+ *
+ * Writes "+<bus>:<dev>.<fn>" (decimal) to the irqhook device.  A failed or
+ * short write terminates the process; otherwise 0 is returned.
+ */
+static int pt_bind_mirq(int bus, int dev, int fn)
+{
+    char cmd[64];
+    size_t cmd_len;
+    ssize_t written;
+
+    sprintf(cmd, "+%d:%d.%d", bus, dev, fn);
+    cmd_len = strlen(cmd);
+
+    written = write(irqfd, cmd, cmd_len);
+    if (written < 0 || (size_t)written != cmd_len) {
+        perror("pt_bind_mirq");
+        fprintf(stderr, "Make sure the irqhook module is loaded\n");
+        exit(-1);
+    }
+    return 0;
+}
+
+int piix3_get_pin(int pic_irq);
+
+/*
+ * pt_set_vector - record which passthrough devices sit behind an APIC vector.
+ *
+ * Called with an IRQ number and the vector programmed for it.  Every
+ * passthrough device that is not yet bound and whose interrupt pin maps to
+ * the same pin as 'irq' is appended to the NULL-terminated list
+ * apicv[vector] and marked bound.  Vectors outside the table are ignored.
+ */
+void pt_set_vector(int irq, int vector)
+{
+    int i;
+    size_t count;
+    int pin = piix3_get_pin(irq);
+    pt_dev_t *pt, **list;
+
+    DEBUG("irq %d vector %d\n", irq, vector);
+    if (vector < 0 || vector >= (int)(sizeof(apicv) / sizeof(apicv[0])))
+        return;
+    for (i = 0; i < nptdevs; i++) {
+        pt = ptdevs[i].ptdev;
+        if (!pt || pt->bound)
+            continue;
+        if (pci_map_irq(&pt->dev, pt->intpin) != pin)
+            continue;
+
+        /* Count the entries already on this vector's list. */
+        count = 0;
+        if (apicv[vector])
+            while (apicv[vector][count])
+                count++;
+
+        /* Room for the existing entries, the new one and the terminator. */
+        list = realloc(apicv[vector], (count + 2) * sizeof(*list));
+        if (!list) {
+            /* The old list is untouched and still valid. */
+            fprintf(stderr, "pt_set_vector: out of memory\n");
+            return;
+        }
+        list[count] = pt;
+        list[count + 1] = NULL;
+        apicv[vector] = list;
+        pt->bound = 1;
+    }
+    DEBUG("done\n");
+}
diff --git a/qemu/hw/pci-passthrough.h b/qemu/hw/pci-passthrough.h
index 60df017..cd63482 100644
--- a/qemu/hw/pci-passthrough.h
+++ b/qemu/hw/pci-passthrough.h
@@ -75,6 +75,7 @@ typedef struct pt_dev_s {
pt_region_t v_addrs[PCI_NUM_REGIONS];
pci_dev_t real_device;
int run;
+ int mirq;
int girq;
char sirq[4];
unsigned char h_busnr;
diff --git a/qemu/vl.c b/qemu/vl.c
index 4946e9a..33decf5 100644
--- a/qemu/vl.c
+++ b/qemu/vl.c
@@ -7788,9 +7788,11 @@ static void help(int exitcode)
"-no-kvm-irqchip disable KVM kernel mode PIC/IOAPIC/LAPIC\n"
"-no-kvm-pit disable KVM kernel mode PIT\n"
#if defined(TARGET_I386) || defined(TARGET_X86_64)
- "-pcidevice name/bus:dev.func\n"
+ "-pcidevice name/bus:dev.func[-intr] \n"
" expose a PCI device to the guest OS.\n"
" 'name' is just used for debug logs.\n"
+ " [-intr] is the interrupt (from the lspci -v
output),\n"
+ " in case you use the irqhook module for interrupt
routing.\n"
#endif
#endif
#ifdef TARGET_I386
--
1.5.4.3