Message ID | 1397057589-11779-5-git-send-email-eric.auger@linaro.org |
---|---|
State | New |
Headers | show |
On Wed, 9 Apr 2014 16:33:07 +0100 Eric Auger <eric.auger@linaro.org> wrote: > This work is inspired of PCI INTx. The code was prepared to support > multiple IRQs but this was not tested at that stage. Similarly to what > is done on PCI, the device register space is RAM unmapped on IRQ hit > in order to trap the end of interrupt (EOI). On mmap timer hit, if the > EOI was trapped, the mmapping is restored. the physical interrupt is > unmasked on the first read/write with the MMIO register space. > > Tested on Calxeda Midway xgmac which can be directly assigned to one > virt VM. > > Acknowledgements: > - vfio device name, guest physical address and IRQ indexes are > hardcoded. This will be improved in another patch > - currently tested on a single IRQ (xgmac main IRQ) > - a KVM patch is required to invalidate stage2 entries on RAM memory > region destruction (see [PATCH] ARM: KVM: Handle IPA unmapping on > memory region deletion) > > Signed-off-by: Eric Auger <eric.auger@linaro.org> > --- thanks for this, Eric. > hw/arm/virt.c | 13 ++- > hw/vfio/platform.c | 316 +++++++++++++++++++++++++++++++++++++++++++++++++---- > 2 files changed, 304 insertions(+), 25 deletions(-) > > diff --git a/hw/arm/virt.c b/hw/arm/virt.c > index 5d43cf0..31ae7d2 100644 > --- a/hw/arm/virt.c > +++ b/hw/arm/virt.c > @@ -108,12 +108,13 @@ static const MemMapEntry a15memmap[] = { > /* ...repeating for a total of NUM_VIRTIO_TRANSPORTS, each of that size */ > /* 0x10000000 .. 0x40000000 reserved for PCI */ > [VIRT_MEM] = { 0x40000000, 1ULL * 1024 * 1024 * 1024 }, > - [VIRT_ETHERNET] = { 0xfff51000, 0x1000 }, > + [VIRT_ETHERNET] = { 0xfff41000, 0x1000 }, this change isn't explained (the device is at physical 0xfff51000, not 0xfff41000), and looks like it belongs in the first patch of the series anyway. Note: see e.g., commit f5fdcd6e5 "hw/arm: Add 'virt' platform" for an example of how to re-compose commit message text for patches that undergo a change of author. > }; > > static const int a15irqmap[] = { > [VIRT_UART] = 1, > [VIRT_MMIO] = 16, /* ...to 16 + NUM_VIRTIO_TRANSPORTS - 1 */ > + [VIRT_ETHERNET] = 77, > }; > > static VirtBoardInfo machines[] = { > @@ -299,8 +300,12 @@ static void create_ethernet(const VirtBoardInfo *vbi, qemu_irq *pic) > hwaddr base = vbi->memmap[VIRT_ETHERNET].base; > hwaddr size = vbi->memmap[VIRT_ETHERNET].size; > const char compat[] = "calxeda,hb-xgmac"; > + int irqm = vbi->irqmap[VIRT_ETHERNET]; > + int irqp = irqm+1; > + int irqlp = irqm+2; > > - sysbus_create_simple("vfio-platform", base, NULL); > + sysbus_create_varargs("vfio-platform", base, > + pic[irqm], pic[irqp], pic[irqlp], NULL); > > nodename = g_strdup_printf("/ethernet@%" PRIx64, base); > qemu_fdt_add_subnode(vbi->fdt, nodename); > @@ -308,6 +313,10 @@ static void create_ethernet(const VirtBoardInfo *vbi, qemu_irq *pic) > /* Note that we can't use setprop_string because of the embedded NUL */ > qemu_fdt_setprop(vbi->fdt, nodename, "compatible", compat, sizeof(compat)); > qemu_fdt_setprop_sized_cells(vbi->fdt, nodename, "reg", 2, base, 2, size); > + qemu_fdt_setprop_cells(vbi->fdt, nodename, "interrupts", > + 0x0, irqm, 0x4, > + 0x0, irqp, 0x4, > + 0x0, irqlp, 0x4); fwiw, it would have been nice to reuse the irq variable names used in hw/net/xgmac.c, but whatever... > g_free(nodename); > } > diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c > index 138fb13..f148edd 100644 > --- a/hw/vfio/platform.c > +++ b/hw/vfio/platform.c > @@ -24,6 +24,7 @@ > #include "config.h" > #include "exec/address-spaces.h" > #include "exec/memory.h" > + > #include "qemu-common.h" > #include "qemu/error-report.h" > #include "qemu/event_notifier.h" > @@ -31,6 +32,7 @@ > #include "qemu/range.h" > #include "sysemu/kvm.h" > #include "sysemu/sysemu.h" > + > #include "hw/qdev-properties.h" > #include "migration/vmstate.h" > #include "hw/hw.h" > @@ -41,12 +43,15 @@ > #define DEBUG_VFIO > #ifdef DEBUG_VFIO > #define DPRINTF(fmt, ...) \ > - do { fprintf(stderr, "vfio: " fmt, ## __VA_ARGS__); } while (0) > + do { fprintf(stderr, "vfio: %s: " fmt, __func__, ## __VA_ARGS__); } \ > + while (0) these changes are unrelated to the subject of this "add IRQ" patch - either make them their own patch (in between the last one and this one), or merge them with the prior patch (that introduces the code), but since their nature starts to drift from the corresponding PCI code counterpart, I can't help think why they shouldn't be applied there, too. Probably best dropping them for now - these and cleaning up more of the superfluous DPRINTFs added in this series. > #else > #define DPRINTF(fmt, ...) \ > do { } while (0) > #endif > > +#define PLATFORM_NUM_REGIONS 10 > + this is a regression from the third patch in this series - see below. > /* Extra debugging, trap acceleration paths for more logging */ > #define VFIO_ALLOW_MMAP 1 > > @@ -63,16 +68,240 @@ typedef struct VFIORegion { > uint8_t nr; /* cache the region number for debug */ > } VFIORegion; > > + > +#define VFIO_INT_INTp 4 I've just cloned your qemu tree, vfio-dev-integ branch, and did a grep for where this is used, and got no occurrences at all, so this must be being removed in a later patch in the series...which makes a proper review, well, difficult. > +typedef struct VFIOINTp { > + QLIST_ENTRY(VFIOINTp) next; > + EventNotifier interrupt; /* eventfd triggered on interrupt */ > + EventNotifier unmask; /* eventfd for unmask on QEMU bypass */ > + qemu_irq qemuirq; > + struct VFIODevice *vdev; /* back pointer to device */ > + bool pending; /* interrupt pending */ > + bool kvm_accel; /* set when QEMU bypass through KVM enabled */ > + uint8_t pin; /* index */ > + uint32_t mmap_timeout; /* delay to re-enable mmaps after interrupt */ > + QEMUTimer *mmap_timer; /* enable mmaps after periods w/o interrupts */ > +} VFIOINTp; > + please look at merging this with hw/vfio/pci.c's VFIOINTx definition and putting the result in hw/vfio/vfio-common.h. > + > typedef struct VFIODevice { > SysBusDevice sbdev; > int fd; > int num_regions; > - VFIORegion *regions; > + int num_irqs; > + int interrupt; /* type of the interrupt, might disappear */ sigh, and it does disappear, presumably in a later patch :( > + char *name; > + uint32_t mmap_timeout; /* mmap timeout value in ms */ > + VFIORegion regions[PLATFORM_NUM_REGIONS]; this is a regression over the last version of the third patch I sent you; 'regions' should remain dynamically allocated - here, they're being fixed. > QLIST_ENTRY(VFIODevice) next; > struct VFIOGroup *group; > - char *name; > + QLIST_HEAD(, VFIOINTp) intp_list; > } VFIODevice; > > + > + > +static void vfio_unmask_intp(VFIODevice *vdev, int index) > +{ > + struct vfio_irq_set irq_set = { > + .argsz = sizeof(irq_set), > + .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK, > + .index = index, > + .start = 0, > + .count = 1, > + }; > + > + ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); > +} > + > + > + > + extra whitespace > +static void vfio_intp_mmap_enable(void *opaque) > +{ > + VFIOINTp * intp = (VFIOINTp *)opaque; > + VFIODevice *vdev = intp->vdev; > + > + if (intp->pending) { > + DPRINTF("IRQ still pending, re-schedule the mmap timer\n"); > + timer_mod(intp->mmap_timer, > + qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + intp->mmap_timeout); > + return; > + } > + > + DPRINTF("IRQ EOI'ed sets mmap again\n"); > + VFIORegion *region = &vdev->regions[0]; > + memory_region_set_enabled(®ion->mmap_mem, true); > +} > + > + > + > +static void vfio_intp_interrupt(void *opaque) > +{ > + int ret; > + VFIOINTp *intp = (VFIOINTp *)opaque; > + VFIODevice *vdev = intp->vdev; > + > + DPRINTF("pin = %d fd = %d\n", > + intp->pin, event_notifier_get_fd(&intp->interrupt)); > + > + ret = event_notifier_test_and_clear(&intp->interrupt); > + if (!ret) { > + DPRINTF("Error when clearing fd=%d\n", > + event_notifier_get_fd(&intp->interrupt)); > + } > + > + intp->pending = true; > + > + /* TODO: fix this number of regions, > + * currently a single region is handled > + */ please do :) > + VFIORegion *region = &vdev->regions[0]; > + memory_region_set_enabled(®ion->mmap_mem, false); > + > + qemu_set_irq(intp->qemuirq, 1); > + > + /* schedule the mmap timer which will restote mmap path after EOI*/ restore > + if (intp->mmap_timeout) { > + timer_mod(intp->mmap_timer, > + qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + intp->mmap_timeout); > + } > + > +} > + > + > + > +static void vfio_irq_eoi(VFIODevice *vdev) > +{ > + > + VFIOINTp *intp; > + bool eoi_done = false; > + > + QLIST_FOREACH(intp, &vdev->intp_list, next) { > + if (intp->pending) { > + if (eoi_done) { > + DPRINTF("several IRQ pending simultaneously: \ > + this is not a supported case yet\n"); > + } If the thing to do in this case is not remap MMIO to the 'fast path', then we should do that. Otherwise, I think I'd protect this with DEBUG in the meantime.. > + DPRINTF("EOI IRQ #%d fd=%d\n", > + intp->pin, event_notifier_get_fd(&intp->interrupt)); > + intp->pending = false; > + qemu_set_irq(intp->qemuirq, 0); > + vfio_unmask_intp(vdev, intp->pin); > + eoi_done = true; > + } > + } > + > + return; > + > +} > + > + > + > +#if 0 please, no dead code. > +static void vfio_list_intp(VFIODevice *vdev) > +{ > + VFIOINTp *intp; > + int i = 0; > + QLIST_FOREACH(intp, &vdev->intp_list, next) { > + DPRINTF("IRQ #%d\n", i); > + DPRINTF("- pin = %d\n", intp->pin); > + DPRINTF("- fd = %d\n", event_notifier_get_fd(&intp->interrupt)); > + DPRINTF("- pending = %d\n", (int)intp->pending); > + DPRINTF("- kvm_accel = %d\n", (int)intp->kvm_accel); > + i++; > + } > +} > +#endif > + > +static int vfio_enable_intp(VFIODevice *vdev, unsigned int index) > +{ > + struct vfio_irq_set *irq_set; /* irq structure passed to vfio kernel */ > + int32_t *pfd; /* pointer to the eventfd */ > + int ret, argsz; > + > + int device = vdev->fd; > + SysBusDevice *sbdev = SYS_BUS_DEVICE(vdev); > + > + vdev->interrupt = VFIO_INT_INTp; > + > + /* allocate and populate a new VFIOINTp structure put in a queue list */ > + VFIOINTp *intp = g_malloc0(sizeof(*intp)); mixed declarations - here and elsewhere - have you read the qemu/CODING_STYLE file? > + intp->vdev = vdev; > + intp->pin = index; > + intp->pending = false; > + intp->mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, > + vfio_intp_mmap_enable, intp); > + intp->mmap_timeout = 1100; > + /* TO DO: timeout as parameter */ please do :) > + /* incr sysbus num_irq and sets sysbus->irqp[n] = &intp->qemuirq > + * only the address of the qemu_irq is set here > + */ /* * multi-line comments are * formatted like this */ > + sysbus_init_irq(sbdev, &intp->qemuirq); > + > + /* content is set in sysbus_connect_irq (currently in machine definition) */ > + > + ret = event_notifier_init(&intp->interrupt, 0); > + if (ret) { > + error_report("vfio: Error: event_notifier_init failed "); > + return ret; > + } > + > + /* build the irq_set to be passed to the vfio kernel driver */ > + > + argsz = sizeof(*irq_set) + sizeof(*pfd); > + > + irq_set = g_malloc0(argsz); > + > + if (!irq_set) { > + DPRINTF("failure while allocating memory for irq\n"); > + qemu_log_mask(LOG_GUEST_ERROR, > + "VFIO platform: failure while allocating memory for irq"); > + return -errno; > + } > + > + irq_set->argsz = argsz; > + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; > + irq_set->index = index; > + irq_set->start = 0; > + irq_set->count = 1; > + pfd = (int32_t *)&irq_set->data; > + > + *pfd = event_notifier_get_fd(&intp->interrupt); > + > + DPRINTF("register fd=%d/irq index=%d to kernel\n", *pfd, index); > + > + qemu_set_fd_handler(*pfd, vfio_intp_interrupt, NULL, intp); > + > + /* pass the index/fd binding to the kernel driver so that it > + * triggers this fd on HW IRQ > + */ > + ret = ioctl(device, VFIO_DEVICE_SET_IRQS, irq_set); > + > + g_free(irq_set); > + > + if (ret) { > + error_report("vfio: Error: Failed to pass Int fd to the driver: %m"); > + qemu_set_fd_handler(*pfd, NULL, NULL, vdev); > + close(*pfd); /* TO DO : replace by event_notifier_cleanup */ > + return -errno; > + } > + > + /* store the new intp in qlist */ > + > + QLIST_INSERT_HEAD(&vdev->intp_list, intp, next); > + > + return 0; > + > +} > + > + > + > + > static int vfio_mmap_region(VFIODevice *vdev, VFIORegion *region, > MemoryRegion *mem, MemoryRegion *submem, > void **map, size_t size, off_t offset, > @@ -112,6 +341,7 @@ error: > /* > * IO Port/MMIO - Beware of the endians, VFIO is always little endian > */ > + > static void vfio_region_write(void *opaque, hwaddr addr, > uint64_t data, unsigned size) > { > @@ -139,12 +369,15 @@ static void vfio_region_write(void *opaque, hwaddr addr, > } > > if (pwrite(region->fd, &buf, size, region->fd_offset + addr) != size) { > - error_report("%s(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m", > - __func__, addr, data, size); > + error_report("(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m", > + addr, data, size); > } > > - DPRINTF("%s(region %d+0x%"HWADDR_PRIx", 0x%"PRIx64", %d)\n", > - __func__, region->nr, addr, data, size); > + DPRINTF("(region %d+0x%"HWADDR_PRIx", 0x%"PRIx64", %d)\n", > + region->nr, addr, data, size); > + > + vfio_irq_eoi(container_of(region, VFIODevice, regions[region->nr])); > + > } > > static uint64_t vfio_region_read(void *opaque, hwaddr addr, unsigned size) > @@ -159,8 +392,8 @@ static uint64_t vfio_region_read(void *opaque, hwaddr addr, unsigned size) > uint64_t data = 0; > > if (pread(region->fd, &buf, size, region->fd_offset + addr) != size) { > - error_report("%s(,0x%"HWADDR_PRIx", %d) failed: %m", > - __func__, addr, size); > + error_report("(,0x%"HWADDR_PRIx", %d) failed: %m", > + addr, size); > return (uint64_t)-1; > } > > @@ -179,18 +412,22 @@ static uint64_t vfio_region_read(void *opaque, hwaddr addr, unsigned size) > break; > } > > - DPRINTF("%s(region %d+0x%"HWADDR_PRIx", %d) = 0x%"PRIx64"\n", > - __func__, region->nr, addr, size, data); > + DPRINTF("(region %d+0x%"HWADDR_PRIx", %d) = 0x%"PRIx64"\n", > + region->nr, addr, size, data); > + > + vfio_irq_eoi(container_of(region, VFIODevice, regions[region->nr])); > > return data; > } > > + > static const MemoryRegionOps vfio_region_ops = { > .read = vfio_region_read, > .write = vfio_region_write, > .endianness = DEVICE_NATIVE_ENDIAN, > }; > > + > static void vfio_map_region(VFIODevice *vdev, int nr) > { > VFIORegion *region = &vdev->regions[nr]; unrelated whitespace changes > @@ -242,14 +479,6 @@ static int vfio_get_device(VFIOGroup *group, const char *name, > DPRINTF("Device %s flags: %u, regions: %u, irqs: %u\n", name, > dev_info.flags, dev_info.num_regions, dev_info.num_irqs); > > - vdev->regions = g_malloc0(sizeof(VFIORegion) * dev_info.num_regions); > - if (!vdev->regions) { > - error_report("vfio: Error allocating space for %d regions", > - dev_info.num_regions); > - ret = -ENOMEM; > - goto error; > - } > - > vdev->num_regions = dev_info.num_regions; > > for (i = 0; i < dev_info.num_regions; i++) { > @@ -273,6 +502,38 @@ static int vfio_get_device(VFIOGroup *group, const char *name, > vdev->regions[i].nr = i; > } > > + /* IRQ */ > + > + DPRINTF("Num IRQS: %d\n", dev_info.num_irqs); > + > + vdev->num_irqs = dev_info.num_irqs; > + > + for (i = 0; i < dev_info.num_irqs; i++) { > + > + struct vfio_irq_info irq = { .argsz = sizeof(irq) }; > + > + irq.index = i; > + > + DPRINTF("Retrieve IRQ info from vfio platform driver ...\n"); > + > + ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq); > + > + if (ret) { why is everything in doublespace here? group statements by relevance, e.g., there needs not be a blank line in between a "ret =" and its successive "if (ret)...". > + error_report("vfio: error getting device %s irq info", > + name); > + error_printf("vfio: error getting device %s irq info", > + name); pick one :) > + } > + > + DPRINTF("- IRQ index %d: count %d, flags=0x%x\n", > + irq.index, > + irq.count, > + irq.flags); > + > + vfio_enable_intp(vdev, irq.index); > + > + } > + that DPRINTF needn't occupy four lines. Please try and restrict PRINTFs to approximately the same level as how hw/vfio/pci.c does. Try to stay close to its coding style, too. I'm stopping my review here - please clean up and resubmit. Thanks, Kim
On 04/11/2014 03:34 AM, Kim Phillips wrote: > On Wed, 9 Apr 2014 16:33:07 +0100 > Eric Auger <eric.auger@linaro.org> wrote: > >> This work is inspired of PCI INTx. The code was prepared to support >> multiple IRQs but this was not tested at that stage. Similarly to what >> is done on PCI, the device register space is RAM unmapped on IRQ hit >> in order to trap the end of interrupt (EOI). On mmap timer hit, if the >> EOI was trapped, the mmapping is restored. the physical interrupt is >> unmasked on the first read/write with the MMIO register space. >> >> Tested on Calxeda Midway xgmac which can be directly assigned to one >> virt VM. >> >> Acknowledgements: >> - vfio device name, guest physical address and IRQ indexes are >> hardcoded. This will be improved in another patch >> - currently tested on a single IRQ (xgmac main IRQ) >> - a KVM patch is required to invalidate stage2 entries on RAM memory >> region destruction (see [PATCH] ARM: KVM: Handle IPA unmapping on >> memory region deletion) >> >> Signed-off-by: Eric Auger <eric.auger@linaro.org> >> --- > > thanks for this, Eric. > >> hw/arm/virt.c | 13 ++- >> hw/vfio/platform.c | 316 +++++++++++++++++++++++++++++++++++++++++++++++++---- >> 2 files changed, 304 insertions(+), 25 deletions(-) >> >> diff --git a/hw/arm/virt.c b/hw/arm/virt.c >> index 5d43cf0..31ae7d2 100644 >> --- a/hw/arm/virt.c >> +++ b/hw/arm/virt.c >> @@ -108,12 +108,13 @@ static const MemMapEntry a15memmap[] = { >> /* ...repeating for a total of NUM_VIRTIO_TRANSPORTS, each of that size */ >> /* 0x10000000 .. 0x40000000 reserved for PCI */ >> [VIRT_MEM] = { 0x40000000, 1ULL * 1024 * 1024 * 1024 }, >> - [VIRT_ETHERNET] = { 0xfff51000, 0x1000 }, >> + [VIRT_ETHERNET] = { 0xfff41000, 0x1000 }, > > this change isn't explained (the device is at physical 0xfff51000, > not 0xfff41000), and looks like it belongs in the first patch of the > series anyway. Note: see e.g., commit f5fdcd6e5 "hw/arm: Add > 'virt' platform" for an example of how to re-compose commit message > text for patches that undergo a change of author. > Hi Kim, I acknowledge the change is not justified in the context of IRQ support introduction. I will remove it. I changed this because the address used in the prior patch was misleading to me, as I reported in one comment. Indeed 0xFFF51000 is the host physical address of the device but here we specify the guest physical address which in general does not relate at all with the host physical address. >> }; >> >> static const int a15irqmap[] = { >> [VIRT_UART] = 1, >> [VIRT_MMIO] = 16, /* ...to 16 + NUM_VIRTIO_TRANSPORTS - 1 */ >> + [VIRT_ETHERNET] = 77, >> }; >> >> static VirtBoardInfo machines[] = { >> @@ -299,8 +300,12 @@ static void create_ethernet(const VirtBoardInfo *vbi, qemu_irq *pic) >> hwaddr base = vbi->memmap[VIRT_ETHERNET].base; >> hwaddr size = vbi->memmap[VIRT_ETHERNET].size; >> const char compat[] = "calxeda,hb-xgmac"; >> + int irqm = vbi->irqmap[VIRT_ETHERNET]; >> + int irqp = irqm+1; >> + int irqlp = irqm+2; >> >> - sysbus_create_simple("vfio-platform", base, NULL); >> + sysbus_create_varargs("vfio-platform", base, >> + pic[irqm], pic[irqp], pic[irqlp], NULL); >> >> nodename = g_strdup_printf("/ethernet@%" PRIx64, base); >> qemu_fdt_add_subnode(vbi->fdt, nodename); >> @@ -308,6 +313,10 @@ static void create_ethernet(const VirtBoardInfo *vbi, qemu_irq *pic) >> /* Note that we can't use setprop_string because of the embedded NUL */ >> qemu_fdt_setprop(vbi->fdt, nodename, "compatible", compat, sizeof(compat)); >> qemu_fdt_setprop_sized_cells(vbi->fdt, nodename, "reg", 2, base, 2, size); >> + qemu_fdt_setprop_cells(vbi->fdt, nodename, "interrupts", >> + 0x0, irqm, 0x4, >> + 0x0, irqp, 0x4, >> + 0x0, irqlp, 0x4); > > fwiw, it would have been nice to reuse the irq variable names used > in hw/net/xgmac.c, but whatever... OK, I used irqM for Main, irqP for Power and irqLP for Low Power. The driver only uses 2 of them: main and power. I can rename. Anyway this code is bound to disappear. See next patch. > >> g_free(nodename); >> } >> diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c >> index 138fb13..f148edd 100644 >> --- a/hw/vfio/platform.c >> +++ b/hw/vfio/platform.c >> @@ -24,6 +24,7 @@ >> #include "config.h" >> #include "exec/address-spaces.h" >> #include "exec/memory.h" >> + >> #include "qemu-common.h" >> #include "qemu/error-report.h" >> #include "qemu/event_notifier.h" >> @@ -31,6 +32,7 @@ >> #include "qemu/range.h" >> #include "sysemu/kvm.h" >> #include "sysemu/sysemu.h" >> + >> #include "hw/qdev-properties.h" >> #include "migration/vmstate.h" >> #include "hw/hw.h" >> @@ -41,12 +43,15 @@ >> #define DEBUG_VFIO >> #ifdef DEBUG_VFIO >> #define DPRINTF(fmt, ...) \ >> - do { fprintf(stderr, "vfio: " fmt, ## __VA_ARGS__); } while (0) >> + do { fprintf(stderr, "vfio: %s: " fmt, __func__, ## __VA_ARGS__); } \ >> + while (0) > > these changes are unrelated to the subject of this "add IRQ" patch - > either make them their own patch (in between the last one and this > one), or merge them with the prior patch (that introduces the code), > but since their nature starts to drift from the corresponding PCI > code counterpart, I can't help think why they shouldn't be applied > there, too. Probably best dropping them for now - these and > cleaning up more of the superfluous DPRINTFs added in this series. > agreed. I can drop them. >> #else >> #define DPRINTF(fmt, ...) \ >> do { } while (0) >> #endif >> >> +#define PLATFORM_NUM_REGIONS 10 >> + > > this is a regression from the third patch in this series - see below. > OK, see related comment below. >> /* Extra debugging, trap acceleration paths for more logging */ >> #define VFIO_ALLOW_MMAP 1 >> >> @@ -63,16 +68,240 @@ typedef struct VFIORegion { >> uint8_t nr; /* cache the region number for debug */ >> } VFIORegion; >> >> + >> +#define VFIO_INT_INTp 4 > > I've just cloned your qemu tree, vfio-dev-integ branch, and did a > grep for where this is used, and got no occurrences at all, so this > must be being removed in a later patch in the series...which makes > a proper review, well, difficult. > Indeed this disappeared in the last patch of the serie since I do not see any interest in keeping the interrupt type field. I will squash last commit with that one. >> +typedef struct VFIOINTp { >> + QLIST_ENTRY(VFIOINTp) next; >> + EventNotifier interrupt; /* eventfd triggered on interrupt */ >> + EventNotifier unmask; /* eventfd for unmask on QEMU bypass */ >> + qemu_irq qemuirq; >> + struct VFIODevice *vdev; /* back pointer to device */ >> + bool pending; /* interrupt pending */ >> + bool kvm_accel; /* set when QEMU bypass through KVM enabled */ >> + uint8_t pin; /* index */ >> + uint32_t mmap_timeout; /* delay to re-enable mmaps after interrupt */ >> + QEMUTimer *mmap_timer; /* enable mmaps after periods w/o interrupts */ >> +} VFIOINTp; >> + > > please look at merging this with hw/vfio/pci.c's VFIOINTx definition > and putting the result in hw/vfio/vfio-common.h. OK I will think about it. Need to complete the work and see how much the structs will defer at the end. > >> + >> typedef struct VFIODevice { >> SysBusDevice sbdev; >> int fd; >> int num_regions; >> - VFIORegion *regions; >> + int num_irqs; >> + int interrupt; /* type of the interrupt, might disappear */ > > sigh, and it does disappear, presumably in a later patch :( OK I will squash last commit with that one > >> + char *name; >> + uint32_t mmap_timeout; /* mmap timeout value in ms */ >> + VFIORegion regions[PLATFORM_NUM_REGIONS]; > > this is a regression over the last version of the third patch I sent > you; 'regions' should remain dynamically allocated - here, they're > being fixed. OK I did that way to reuse vfio_irq_eoi(container_of(region, VFIODevice, regions[region->nr])) in vfio_region_write/read. But this definitively can be improved. > >> QLIST_ENTRY(VFIODevice) next; >> struct VFIOGroup *group; >> - char *name; >> + QLIST_HEAD(, VFIOINTp) intp_list; >> } VFIODevice; >> >> + >> + >> +static void vfio_unmask_intp(VFIODevice *vdev, int index) >> +{ >> + struct vfio_irq_set irq_set = { >> + .argsz = sizeof(irq_set), >> + .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK, >> + .index = index, >> + .start = 0, >> + .count = 1, >> + }; >> + >> + ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); >> +} >> + >> + >> + >> + > > extra whitespace OK > >> +static void vfio_intp_mmap_enable(void *opaque) >> +{ >> + VFIOINTp * intp = (VFIOINTp *)opaque; >> + VFIODevice *vdev = intp->vdev; >> + >> + if (intp->pending) { >> + DPRINTF("IRQ still pending, re-schedule the mmap timer\n"); >> + timer_mod(intp->mmap_timer, >> + qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + intp->mmap_timeout); >> + return; >> + } >> + >> + DPRINTF("IRQ EOI'ed sets mmap again\n"); >> + VFIORegion *region = &vdev->regions[0]; >> + memory_region_set_enabled(®ion->mmap_mem, true); >> +} >> + >> + >> + >> +static void vfio_intp_interrupt(void *opaque) >> +{ >> + int ret; >> + VFIOINTp *intp = (VFIOINTp *)opaque; >> + VFIODevice *vdev = intp->vdev; >> + >> + DPRINTF("pin = %d fd = %d\n", >> + intp->pin, event_notifier_get_fd(&intp->interrupt)); >> + >> + ret = event_notifier_test_and_clear(&intp->interrupt); >> + if (!ret) { >> + DPRINTF("Error when clearing fd=%d\n", >> + event_notifier_get_fd(&intp->interrupt)); >> + } >> + >> + intp->pending = true; >> + >> + /* TODO: fix this number of regions, >> + * currently a single region is handled >> + */ > > please do :) Can't we image to have a separate patch for this story of number of regions, overall? > >> + VFIORegion *region = &vdev->regions[0]; >> + memory_region_set_enabled(®ion->mmap_mem, false); >> + >> + qemu_set_irq(intp->qemuirq, 1); >> + >> + /* schedule the mmap timer which will restote mmap path after EOI*/ > > restore > OK >> + if (intp->mmap_timeout) { >> + timer_mod(intp->mmap_timer, >> + qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + intp->mmap_timeout); >> + } >> + >> +} >> + >> + >> + >> +static void vfio_irq_eoi(VFIODevice *vdev) >> +{ >> + >> + VFIOINTp *intp; >> + bool eoi_done = false; >> + >> + QLIST_FOREACH(intp, &vdev->intp_list, next) { >> + if (intp->pending) { >> + if (eoi_done) { >> + DPRINTF("several IRQ pending simultaneously: \ >> + this is not a supported case yet\n"); >> + } > > If the thing to do in this case is not remap MMIO to the 'fast > path', then we should do that. Otherwise, I think I'd protect this > with DEBUG in the meantime.. I did not understand that comment > >> + DPRINTF("EOI IRQ #%d fd=%d\n", >> + intp->pin, event_notifier_get_fd(&intp->interrupt)); >> + intp->pending = false; >> + qemu_set_irq(intp->qemuirq, 0); >> + vfio_unmask_intp(vdev, intp->pin); >> + eoi_done = true; >> + } >> + } >> + >> + return; >> + >> +} >> + >> + >> + >> +#if 0 > > please, no dead code. OK > >> +static void vfio_list_intp(VFIODevice *vdev) >> +{ >> + VFIOINTp *intp; >> + int i = 0; >> + QLIST_FOREACH(intp, &vdev->intp_list, next) { >> + DPRINTF("IRQ #%d\n", i); >> + DPRINTF("- pin = %d\n", intp->pin); >> + DPRINTF("- fd = %d\n", event_notifier_get_fd(&intp->interrupt)); >> + DPRINTF("- pending = %d\n", (int)intp->pending); >> + DPRINTF("- kvm_accel = %d\n", (int)intp->kvm_accel); >> + i++; >> + } >> +} >> +#endif >> + >> +static int vfio_enable_intp(VFIODevice *vdev, unsigned int index) >> +{ >> + struct vfio_irq_set *irq_set; /* irq structure passed to vfio kernel */ >> + int32_t *pfd; /* pointer to the eventfd */ >> + int ret, argsz; >> + >> + int device = vdev->fd; >> + SysBusDevice *sbdev = SYS_BUS_DEVICE(vdev); >> + >> + vdev->interrupt = VFIO_INT_INTp; >> + >> + /* allocate and populate a new VFIOINTp structure put in a queue list */ >> + VFIOINTp *intp = g_malloc0(sizeof(*intp)); > > mixed declarations - here and elsewhere - have you read the > qemu/CODING_STYLE file? > I did but checkpatch did not argue and I skipped that. >> + intp->vdev = vdev; >> + intp->pin = index; >> + intp->pending = false; >> + intp->mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, >> + vfio_intp_mmap_enable, intp); >> + intp->mmap_timeout = 1100; >> + /* TO DO: timeout as parameter */ > > please do :) fixed in next patch where I take care of cmd line arguments. > >> + /* incr sysbus num_irq and sets sysbus->irqp[n] = &intp->qemuirq >> + * only the address of the qemu_irq is set here >> + */ > > /* > * multi-line comments are > * formatted like this > */ > OK > >> + sysbus_init_irq(sbdev, &intp->qemuirq); >> + >> + /* content is set in sysbus_connect_irq (currently in machine definition) */ >> + >> + ret = event_notifier_init(&intp->interrupt, 0); >> + if (ret) { >> + error_report("vfio: Error: event_notifier_init failed "); >> + return ret; >> + } >> + >> + /* build the irq_set to be passed to the vfio kernel driver */ >> + >> + argsz = sizeof(*irq_set) + sizeof(*pfd); >> + >> + irq_set = g_malloc0(argsz); >> + >> + if (!irq_set) { >> + DPRINTF("failure while allocating memory for irq\n"); >> + qemu_log_mask(LOG_GUEST_ERROR, >> + "VFIO platform: failure while allocating memory for irq"); >> + return -errno; >> + } >> + >> + irq_set->argsz = argsz; >> + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; >> + irq_set->index = index; >> + irq_set->start = 0; >> + irq_set->count = 1; >> + pfd = (int32_t *)&irq_set->data; >> + >> + *pfd = event_notifier_get_fd(&intp->interrupt); >> + >> + DPRINTF("register fd=%d/irq index=%d to kernel\n", *pfd, index); >> + >> + qemu_set_fd_handler(*pfd, vfio_intp_interrupt, NULL, intp); >> + >> + /* pass the index/fd binding to the kernel driver so that it >> + * triggers this fd on HW IRQ >> + */ >> + ret = ioctl(device, VFIO_DEVICE_SET_IRQS, irq_set); >> + >> + g_free(irq_set); >> + >> + if (ret) { >> + error_report("vfio: Error: Failed to pass Int fd to the driver: %m"); >> + qemu_set_fd_handler(*pfd, NULL, NULL, vdev); >> + close(*pfd); /* TO DO : replace by event_notifier_cleanup */ >> + return -errno; >> + } >> + >> + /* store the new intp in qlist */ >> + >> + QLIST_INSERT_HEAD(&vdev->intp_list, intp, next); >> + >> + return 0; >> + >> +} >> + >> + >> + >> + >> static int vfio_mmap_region(VFIODevice *vdev, VFIORegion *region, >> MemoryRegion *mem, MemoryRegion *submem, >> void **map, size_t size, off_t offset, >> @@ -112,6 +341,7 @@ error: >> /* >> * IO Port/MMIO - Beware of the endians, VFIO is always little endian >> */ >> + >> static void vfio_region_write(void *opaque, hwaddr addr, >> uint64_t data, unsigned size) >> { >> @@ -139,12 +369,15 @@ static void vfio_region_write(void *opaque, hwaddr addr, >> } >> >> if (pwrite(region->fd, &buf, size, region->fd_offset + addr) != size) { >> - error_report("%s(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m", >> - __func__, addr, data, size); >> + error_report("(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m", >> + addr, data, size); >> } >> >> - DPRINTF("%s(region %d+0x%"HWADDR_PRIx", 0x%"PRIx64", %d)\n", >> - __func__, region->nr, addr, data, size); >> + DPRINTF("(region %d+0x%"HWADDR_PRIx", 0x%"PRIx64", %d)\n", >> + region->nr, addr, data, size); >> + >> + vfio_irq_eoi(container_of(region, VFIODevice, regions[region->nr])); >> + >> } >> >> static uint64_t vfio_region_read(void *opaque, hwaddr addr, unsigned size) >> @@ -159,8 +392,8 @@ static uint64_t vfio_region_read(void *opaque, hwaddr addr, unsigned size) >> uint64_t data = 0; >> >> if (pread(region->fd, &buf, size, region->fd_offset + addr) != size) { >> - error_report("%s(,0x%"HWADDR_PRIx", %d) failed: %m", >> - __func__, addr, size); >> + error_report("(,0x%"HWADDR_PRIx", %d) failed: %m", >> + addr, size); > > > >> return (uint64_t)-1; >> } >> >> @@ -179,18 +412,22 @@ static uint64_t vfio_region_read(void *opaque, hwaddr addr, unsigned size) >> break; >> } >> >> - DPRINTF("%s(region %d+0x%"HWADDR_PRIx", %d) = 0x%"PRIx64"\n", >> - __func__, region->nr, addr, size, data); >> + DPRINTF("(region %d+0x%"HWADDR_PRIx", %d) = 0x%"PRIx64"\n", >> + region->nr, addr, size, data); >> + >> + vfio_irq_eoi(container_of(region, VFIODevice, regions[region->nr])); >> >> return data; >> } >> >> + >> static const MemoryRegionOps vfio_region_ops = { >> .read = vfio_region_read, >> .write = vfio_region_write, >> .endianness = DEVICE_NATIVE_ENDIAN, >> }; >> >> + >> static void vfio_map_region(VFIODevice *vdev, int nr) >> { >> VFIORegion *region = &vdev->regions[nr]; > > unrelated whitespace changes OK, will clean > >> @@ -242,14 +479,6 @@ static int vfio_get_device(VFIOGroup *group, const char *name, >> DPRINTF("Device %s flags: %u, regions: %u, irqs: %u\n", name, >> dev_info.flags, dev_info.num_regions, dev_info.num_irqs); >> >> - vdev->regions = g_malloc0(sizeof(VFIORegion) * dev_info.num_regions); >> - if (!vdev->regions) { >> - error_report("vfio: Error allocating space for %d regions", >> - dev_info.num_regions); >> - ret = -ENOMEM; >> - goto error; >> - } >> - >> vdev->num_regions = dev_info.num_regions; >> >> for (i = 0; i < dev_info.num_regions; i++) { >> @@ -273,6 +502,38 @@ static int vfio_get_device(VFIOGroup *group, const char *name, >> vdev->regions[i].nr = i; >> } >> >> + /* IRQ */ >> + >> + DPRINTF("Num IRQS: %d\n", dev_info.num_irqs); >> + >> + vdev->num_irqs = dev_info.num_irqs; >> + >> + for (i = 0; i < dev_info.num_irqs; i++) { >> + >> + struct vfio_irq_info irq = { .argsz = sizeof(irq) }; >> + >> + irq.index = i; >> + >> + DPRINTF("Retrieve IRQ info from vfio platform driver ...\n"); >> + >> + ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq); >> + >> + if (ret) { > > why is everything in doublespace here? group statements by > relevance, e.g., there needs not be a blank line in between a "ret > =" and its successive "if (ret)...". OK > >> + error_report("vfio: error getting device %s irq info", >> + name); >> + error_printf("vfio: error getting device %s irq info", >> + name); > > pick one :) OK > >> + } >> + >> + DPRINTF("- IRQ index %d: count %d, flags=0x%x\n", >> + irq.index, >> + irq.count, >> + irq.flags); >> + >> + vfio_enable_intp(vdev, irq.index); >> + >> + } >> + > > that DPRINTF needn't occupy four lines. Please try and restrict > PRINTFs to approximately the same level as how hw/vfio/pci.c does. > Try to stay close to its coding style, too. > > I'm stopping my review here - please clean up and resubmit. OK Thanks for your time BR Eric > > Thanks, > > Kim >
On Wed, 16 Apr 2014 15:29:35 +0200 Eric Auger <eric.auger@linaro.org> wrote: > On 04/11/2014 03:34 AM, Kim Phillips wrote: > > On Wed, 9 Apr 2014 16:33:07 +0100 > > Eric Auger <eric.auger@linaro.org> wrote: > >> @@ -108,12 +108,13 @@ static const MemMapEntry a15memmap[] = { > >> /* ...repeating for a total of NUM_VIRTIO_TRANSPORTS, each of that size */ > >> /* 0x10000000 .. 0x40000000 reserved for PCI */ > >> [VIRT_MEM] = { 0x40000000, 1ULL * 1024 * 1024 * 1024 }, > >> - [VIRT_ETHERNET] = { 0xfff51000, 0x1000 }, > >> + [VIRT_ETHERNET] = { 0xfff41000, 0x1000 }, > > > > this change isn't explained (the device is at physical 0xfff51000, > > not 0xfff41000), and looks like it belongs in the first patch of the > > series anyway. Note: see e.g., commit f5fdcd6e5 "hw/arm: Add > > 'virt' platform" for an example of how to re-compose commit message > > text for patches that undergo a change of author. > > > Hi Kim, Hi Eric, > I acknowledge the change is not justified in the context of IRQ support > introduction. I will remove it. > I changed this because the address used in the prior patch was > misleading to me, as I reported in one comment. Indeed 0xFFF51000 is the > host physical address of the device but here we specify the guest > physical address which in general does not relate at all with the host > physical address. I see there's churn in other threads in this area...good. > >> + char *name; > >> + uint32_t mmap_timeout; /* mmap timeout value in ms */ > >> + VFIORegion regions[PLATFORM_NUM_REGIONS]; > > > > this is a regression over the last version of the third patch I sent > > you; 'regions' should remain dynamically allocated - here, they're > > being fixed. > OK I did that way to reuse > vfio_irq_eoi(container_of(region, VFIODevice, regions[region->nr])) > in vfio_region_write/read. > > But this definitively can be improved. it has to, in order to support a variable number of regions. Maybe use the VFIODevice itself as the opaque pointer instead of 'fd' in struct VFIORegion? > >> + /* TODO: fix this number of regions, > >> + * currently a single region is handled > >> + */ > > > > please do :) > Can't we image to have a separate patch for this story of number of > regions, overall? well I'd expect the next revision of this series to blend better with the the way it was done in "vfio: add vfio-platform support" (the existing 3rd patch), i.e., add interrupt support without breaking support for a variable number of regions. > >> +static void vfio_irq_eoi(VFIODevice *vdev) > >> +{ > >> + > >> + VFIOINTp *intp; > >> + bool eoi_done = false; > >> + > >> + QLIST_FOREACH(intp, &vdev->intp_list, next) { > >> + if (intp->pending) { > >> + if (eoi_done) { > >> + DPRINTF("several IRQ pending simultaneously: \ > >> + this is not a supported case yet\n"); > >> + } > > > > If the thing to do in this case is not remap MMIO to the 'fast > > path', then we should do that. Otherwise, I think I'd protect this > > with DEBUG in the meantime.. > I did not understand that comment As it stands, that code is only DPRINTF'ing in the irq->pending case, and it's adding a linked list to the VFIOINT struct which otherwise is equal to that of the vfio-pci implementation. All we need to do in the irq pending case is *not* switch back to the 'fast path' (direct MMIO access, ie., still use vfio_region_{read,write} ()), right? In which case, all we may need is an interrupt pending counter in the VFIODevice struct, that this code should check. Looking at the code again, I noticed vfio_disable_irqindex(), vfio_unmask_int{x,p}() are almost verbatim copies - can they be merged into common.c? I also noticed the platform code doesn't have a vfio_mask_intx() equivalent but can't tell if it needs it ATM. > BR > > Eric Thanks Eric, Kim
diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 5d43cf0..31ae7d2 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -108,12 +108,13 @@ static const MemMapEntry a15memmap[] = { /* ...repeating for a total of NUM_VIRTIO_TRANSPORTS, each of that size */ /* 0x10000000 .. 0x40000000 reserved for PCI */ [VIRT_MEM] = { 0x40000000, 1ULL * 1024 * 1024 * 1024 }, - [VIRT_ETHERNET] = { 0xfff51000, 0x1000 }, + [VIRT_ETHERNET] = { 0xfff41000, 0x1000 }, }; static const int a15irqmap[] = { [VIRT_UART] = 1, [VIRT_MMIO] = 16, /* ...to 16 + NUM_VIRTIO_TRANSPORTS - 1 */ + [VIRT_ETHERNET] = 77, }; static VirtBoardInfo machines[] = { @@ -299,8 +300,12 @@ static void create_ethernet(const VirtBoardInfo *vbi, qemu_irq *pic) hwaddr base = vbi->memmap[VIRT_ETHERNET].base; hwaddr size = vbi->memmap[VIRT_ETHERNET].size; const char compat[] = "calxeda,hb-xgmac"; + int irqm = vbi->irqmap[VIRT_ETHERNET]; + int irqp = irqm+1; + int irqlp = irqm+2; - sysbus_create_simple("vfio-platform", base, NULL); + sysbus_create_varargs("vfio-platform", base, + pic[irqm], pic[irqp], pic[irqlp], NULL); nodename = g_strdup_printf("/ethernet@%" PRIx64, base); qemu_fdt_add_subnode(vbi->fdt, nodename); @@ -308,6 +313,10 @@ static void create_ethernet(const VirtBoardInfo *vbi, qemu_irq *pic) /* Note that we can't use setprop_string because of the embedded NUL */ qemu_fdt_setprop(vbi->fdt, nodename, "compatible", compat, sizeof(compat)); qemu_fdt_setprop_sized_cells(vbi->fdt, nodename, "reg", 2, base, 2, size); + qemu_fdt_setprop_cells(vbi->fdt, nodename, "interrupts", + 0x0, irqm, 0x4, + 0x0, irqp, 0x4, + 0x0, irqlp, 0x4); g_free(nodename); } diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c index 138fb13..f148edd 100644 --- a/hw/vfio/platform.c +++ b/hw/vfio/platform.c @@ -24,6 +24,7 @@ #include "config.h" #include "exec/address-spaces.h" #include "exec/memory.h" + #include "qemu-common.h" #include "qemu/error-report.h" #include "qemu/event_notifier.h" @@ -31,6 +32,7 @@ #include "qemu/range.h" #include "sysemu/kvm.h" #include "sysemu/sysemu.h" + #include "hw/qdev-properties.h" #include "migration/vmstate.h" #include "hw/hw.h" @@ -41,12 +43,15 @@ #define DEBUG_VFIO #ifdef DEBUG_VFIO #define DPRINTF(fmt, ...) \ - do { fprintf(stderr, "vfio: " fmt, ## __VA_ARGS__); } while (0) + do { fprintf(stderr, "vfio: %s: " fmt, __func__, ## __VA_ARGS__); } \ + while (0) #else #define DPRINTF(fmt, ...) \ do { } while (0) #endif +#define PLATFORM_NUM_REGIONS 10 + /* Extra debugging, trap acceleration paths for more logging */ #define VFIO_ALLOW_MMAP 1 @@ -63,16 +68,240 @@ typedef struct VFIORegion { uint8_t nr; /* cache the region number for debug */ } VFIORegion; + +#define VFIO_INT_INTp 4 + +typedef struct VFIOINTp { + QLIST_ENTRY(VFIOINTp) next; + EventNotifier interrupt; /* eventfd triggered on interrupt */ + EventNotifier unmask; /* eventfd for unmask on QEMU bypass */ + qemu_irq qemuirq; + struct VFIODevice *vdev; /* back pointer to device */ + bool pending; /* interrupt pending */ + bool kvm_accel; /* set when QEMU bypass through KVM enabled */ + uint8_t pin; /* index */ + uint32_t mmap_timeout; /* delay to re-enable mmaps after interrupt */ + QEMUTimer *mmap_timer; /* enable mmaps after periods w/o interrupts */ +} VFIOINTp; + + + typedef struct VFIODevice { SysBusDevice sbdev; int fd; int num_regions; - VFIORegion *regions; + int num_irqs; + int interrupt; /* type of the interrupt, might disappear */ + char *name; + uint32_t mmap_timeout; /* mmap timeout value in ms */ + VFIORegion regions[PLATFORM_NUM_REGIONS]; QLIST_ENTRY(VFIODevice) next; struct VFIOGroup *group; - char *name; + QLIST_HEAD(, VFIOINTp) intp_list; } VFIODevice; + + +static void vfio_unmask_intp(VFIODevice *vdev, int index) +{ + struct vfio_irq_set irq_set = { + .argsz = sizeof(irq_set), + .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK, + .index = index, + .start = 0, + .count = 1, + }; + + ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); +} + + + + +static void vfio_intp_mmap_enable(void *opaque) +{ + VFIOINTp * intp = (VFIOINTp *)opaque; + VFIODevice *vdev = intp->vdev; + + if (intp->pending) { + DPRINTF("IRQ still pending, re-schedule the mmap timer\n"); + timer_mod(intp->mmap_timer, + qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + intp->mmap_timeout); + return; + } + + DPRINTF("IRQ EOI'ed sets mmap again\n"); + VFIORegion *region = &vdev->regions[0]; + memory_region_set_enabled(®ion->mmap_mem, true); +} + + + +static void vfio_intp_interrupt(void *opaque) +{ + int ret; + VFIOINTp *intp = (VFIOINTp *)opaque; + VFIODevice *vdev = intp->vdev; + + DPRINTF("pin = %d fd = %d\n", + intp->pin, event_notifier_get_fd(&intp->interrupt)); + + ret = event_notifier_test_and_clear(&intp->interrupt); + if (!ret) { + DPRINTF("Error when clearing fd=%d\n", + event_notifier_get_fd(&intp->interrupt)); + } + + intp->pending = true; + + /* TODO: fix this number of regions, + * currently a single region is handled + */ + + VFIORegion *region = &vdev->regions[0]; + memory_region_set_enabled(®ion->mmap_mem, false); + + qemu_set_irq(intp->qemuirq, 1); + + /* schedule the mmap timer which will restote mmap path after EOI*/ + if (intp->mmap_timeout) { + timer_mod(intp->mmap_timer, + qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + intp->mmap_timeout); + } + +} + + + +static void vfio_irq_eoi(VFIODevice *vdev) +{ + + VFIOINTp *intp; + bool eoi_done = false; + + QLIST_FOREACH(intp, &vdev->intp_list, next) { + if (intp->pending) { + if (eoi_done) { + DPRINTF("several IRQ pending simultaneously: \ + this is not a supported case yet\n"); + } + DPRINTF("EOI IRQ #%d fd=%d\n", + intp->pin, event_notifier_get_fd(&intp->interrupt)); + intp->pending = false; + qemu_set_irq(intp->qemuirq, 0); + vfio_unmask_intp(vdev, intp->pin); + eoi_done = true; + } + } + + return; + +} + + + +#if 0 +static void vfio_list_intp(VFIODevice *vdev) +{ + VFIOINTp *intp; + int i = 0; + QLIST_FOREACH(intp, &vdev->intp_list, next) { + DPRINTF("IRQ #%d\n", i); + DPRINTF("- pin = %d\n", intp->pin); + DPRINTF("- fd = %d\n", event_notifier_get_fd(&intp->interrupt)); + DPRINTF("- pending = %d\n", (int)intp->pending); + DPRINTF("- kvm_accel = %d\n", (int)intp->kvm_accel); + i++; + } +} +#endif + +static int vfio_enable_intp(VFIODevice *vdev, unsigned int index) +{ + struct vfio_irq_set *irq_set; /* irq structure passed to vfio kernel */ + int32_t *pfd; /* pointer to the eventfd */ + int ret, argsz; + + int device = vdev->fd; + SysBusDevice *sbdev = SYS_BUS_DEVICE(vdev); + + vdev->interrupt = VFIO_INT_INTp; + + /* allocate and populate a new VFIOINTp structure put in a queue list */ + VFIOINTp *intp = g_malloc0(sizeof(*intp)); + intp->vdev = vdev; + intp->pin = index; + intp->pending = false; + intp->mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, + vfio_intp_mmap_enable, intp); + intp->mmap_timeout = 1100; + /* TO DO: timeout as parameter */ + + /* incr sysbus num_irq and sets sysbus->irqp[n] = &intp->qemuirq + * only the address of the qemu_irq is set here + */ + + sysbus_init_irq(sbdev, &intp->qemuirq); + + /* content is set in sysbus_connect_irq (currently in machine definition) */ + + ret = event_notifier_init(&intp->interrupt, 0); + if (ret) { + error_report("vfio: Error: event_notifier_init failed "); + return ret; + } + + /* build the irq_set to be passed to the vfio kernel driver */ + + argsz = sizeof(*irq_set) + sizeof(*pfd); + + irq_set = g_malloc0(argsz); + + if (!irq_set) { + DPRINTF("failure while allocating memory for irq\n"); + qemu_log_mask(LOG_GUEST_ERROR, + "VFIO platform: failure while allocating memory for irq"); + return -errno; + } + + irq_set->argsz = argsz; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = index; + irq_set->start = 0; + irq_set->count = 1; + pfd = (int32_t *)&irq_set->data; + + *pfd = event_notifier_get_fd(&intp->interrupt); + + DPRINTF("register fd=%d/irq index=%d to kernel\n", *pfd, index); + + qemu_set_fd_handler(*pfd, vfio_intp_interrupt, NULL, intp); + + /* pass the index/fd binding to the kernel driver so that it + * triggers this fd on HW IRQ + */ + ret = ioctl(device, VFIO_DEVICE_SET_IRQS, irq_set); + + g_free(irq_set); + + if (ret) { + error_report("vfio: Error: Failed to pass Int fd to the driver: %m"); + qemu_set_fd_handler(*pfd, NULL, NULL, vdev); + close(*pfd); /* TO DO : replace by event_notifier_cleanup */ + return -errno; + } + + /* store the new intp in qlist */ + + QLIST_INSERT_HEAD(&vdev->intp_list, intp, next); + + return 0; + +} + + + + static int vfio_mmap_region(VFIODevice *vdev, VFIORegion *region, MemoryRegion *mem, MemoryRegion *submem, void **map, size_t size, off_t offset, @@ -112,6 +341,7 @@ error: /* * IO Port/MMIO - Beware of the endians, VFIO is always little endian */ + static void vfio_region_write(void *opaque, hwaddr addr, uint64_t data, unsigned size) { @@ -139,12 +369,15 @@ static void vfio_region_write(void *opaque, hwaddr addr, } if (pwrite(region->fd, &buf, size, region->fd_offset + addr) != size) { - error_report("%s(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m", - __func__, addr, data, size); + error_report("(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m", + addr, data, size); } - DPRINTF("%s(region %d+0x%"HWADDR_PRIx", 0x%"PRIx64", %d)\n", - __func__, region->nr, addr, data, size); + DPRINTF("(region %d+0x%"HWADDR_PRIx", 0x%"PRIx64", %d)\n", + region->nr, addr, data, size); + + vfio_irq_eoi(container_of(region, VFIODevice, regions[region->nr])); + } static uint64_t vfio_region_read(void *opaque, hwaddr addr, unsigned size) @@ -159,8 +392,8 @@ static uint64_t vfio_region_read(void *opaque, hwaddr addr, unsigned size) uint64_t data = 0; if (pread(region->fd, &buf, size, region->fd_offset + addr) != size) { - error_report("%s(,0x%"HWADDR_PRIx", %d) failed: %m", - __func__, addr, size); + error_report("(,0x%"HWADDR_PRIx", %d) failed: %m", + addr, size); return (uint64_t)-1; } @@ -179,18 +412,22 @@ static uint64_t vfio_region_read(void *opaque, hwaddr addr, unsigned size) break; } - DPRINTF("%s(region %d+0x%"HWADDR_PRIx", %d) = 0x%"PRIx64"\n", - __func__, region->nr, addr, size, data); + DPRINTF("(region %d+0x%"HWADDR_PRIx", %d) = 0x%"PRIx64"\n", + region->nr, addr, size, data); + + vfio_irq_eoi(container_of(region, VFIODevice, regions[region->nr])); return data; } + static const MemoryRegionOps vfio_region_ops = { .read = vfio_region_read, .write = vfio_region_write, .endianness = DEVICE_NATIVE_ENDIAN, }; + static void vfio_map_region(VFIODevice *vdev, int nr) { VFIORegion *region = &vdev->regions[nr]; @@ -242,14 +479,6 @@ static int vfio_get_device(VFIOGroup *group, const char *name, DPRINTF("Device %s flags: %u, regions: %u, irqs: %u\n", name, dev_info.flags, dev_info.num_regions, dev_info.num_irqs); - vdev->regions = g_malloc0(sizeof(VFIORegion) * dev_info.num_regions); - if (!vdev->regions) { - error_report("vfio: Error allocating space for %d regions", - dev_info.num_regions); - ret = -ENOMEM; - goto error; - } - vdev->num_regions = dev_info.num_regions; for (i = 0; i < dev_info.num_regions; i++) { @@ -273,6 +502,38 @@ static int vfio_get_device(VFIOGroup *group, const char *name, vdev->regions[i].nr = i; } + /* IRQ */ + + DPRINTF("Num IRQS: %d\n", dev_info.num_irqs); + + vdev->num_irqs = dev_info.num_irqs; + + for (i = 0; i < dev_info.num_irqs; i++) { + + struct vfio_irq_info irq = { .argsz = sizeof(irq) }; + + irq.index = i; + + DPRINTF("Retrieve IRQ info from vfio platform driver ...\n"); + + ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq); + + if (ret) { + error_report("vfio: error getting device %s irq info", + name); + error_printf("vfio: error getting device %s irq info", + name); + } + + DPRINTF("- IRQ index %d: count %d, flags=0x%x\n", + irq.index, + irq.count, + irq.flags); + + vfio_enable_intp(vdev, irq.index); + + } + error: if (ret) { g_free(vdev->regions); @@ -286,13 +547,16 @@ error: static void vfio_platform_realize(DeviceState *dev, Error **errp) { SysBusDevice *sbdev = SYS_BUS_DEVICE(dev); - VFIODevice *pvdev, *vdev = DO_UPCAST(VFIODevice, sbdev, sbdev); + VFIODevice *vdev = DO_UPCAST(VFIODevice, sbdev, sbdev); + VFIODevice *pvdev; + VFIOGroup *group; char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name; ssize_t len; struct stat st; int groupid, i, ret; + /* TODO: pass device name on command line */ vdev->name = malloc(PATH_MAX); strcpy(vdev->name, "fff51000.ethernet"); @@ -320,24 +584,29 @@ static void vfio_platform_realize(DeviceState *dev, Error **errp) return; } - DPRINTF("%s(%s) group %d\n", __func__, vdev->name, groupid); + DPRINTF("%s belongs to VFIO group %d\n", vdev->name, groupid); group = vfio_get_group(groupid, NULL); if (!group) { error_report("vfio: failed to get group %d", groupid); return; } - snprintf(path, sizeof(path), "%s", vdev->name); QLIST_FOREACH(pvdev, &group->device_list, next) { + DPRINTF("compare %s versus %s\n", pvdev->name, vdev->name); if (strcmp(pvdev->name, vdev->name) == 0) { - error_report("vfio: error: device %s is already attached", path); + + DPRINTF("vfio device %s already is attached to group %d\n", + vdev->name, groupid); + vfio_put_group(group, NULL); return; } } + DPRINTF("Calling vfio_get_device ...\n"); + ret = vfio_get_device(group, path, vdev); if (ret) { error_report("vfio: failed to get device %s", path); @@ -363,6 +632,7 @@ static void vfio_platform_dev_class_init(ObjectClass *klass, void *data) dc->realize = vfio_platform_realize; dc->vmsd = &vfio_platform_vmstate; dc->desc = "VFIO-based platform device assignment"; + dc->cannot_instantiate_with_device_add_yet = true; set_bit(DEVICE_CATEGORY_MISC, dc->categories); }
This work is inspired of PCI INTx. The code was prepared to support multiple IRQs but this was not tested at that stage. Similarly to what is done on PCI, the device register space is RAM unmapped on IRQ hit in order to trap the end of interrupt (EOI). On mmap timer hit, if the EOI was trapped, the mmapping is restored. the physical interrupt is unmasked on the first read/write with the MMIO register space. Tested on Calxeda Midway xgmac which can be directly assigned to one virt VM. Acknowledgements: - vfio device name, guest physical address and IRQ indexes are hardcoded. This will be improved in another patch - currently tested on a single IRQ (xgmac main IRQ) - a KVM patch is required to invalidate stage2 entries on RAM memory region destruction (see [PATCH] ARM: KVM: Handle IPA unmapping on memory region deletion) Signed-off-by: Eric Auger <eric.auger@linaro.org> --- hw/arm/virt.c | 13 ++- hw/vfio/platform.c | 316 +++++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 304 insertions(+), 25 deletions(-)