diff mbox series

[RFC,POC] Supporting >255 guest CPUs without interrupt remapping

Message ID 38b94080aa2d616a0ecb98d5afcd7cbe9f69f9e8.camel@infradead.org
State New
Headers show
Series [RFC,POC] Supporting >255 guest CPUs without interrupt remapping | expand

Commit Message

David Woodhouse Oct. 2, 2020, 12:36 p.m. UTC
AFAICT there's not actually any good reason why guests can't use x2apic
and have more than 255 CPUs today, even without exposing interrupt
remapping to the guest.

The only issue is that guests can't direct external IOAPIC and MSI
interrupts at the higher APIC IDs. So what? A guest might have a
workload where it makes plenty of sense to use those extra CPUs and
just refrain from targeting external interrupts at them.

In fact, if you take a close look at the hyperv-iommu driver in the
Linux guest kernel, you'll note that it doesn't actually do any
remapping at all; all it does is return -EINVAL if asked to set
affinity to a CPU which can't be targeted.

For Linux at least, it should be fairly simple to have a per-IRQ
controller affinity limit, so it doesn't attempt to target CPUs it
can't reach.

But actually, it's really simple to extend the limit of reachable APICs
even without the complexity of adding a full vIOMMU.

There are 8 bits of extended destination ID in the IOAPIC RTE, which
map to bits 11-4 of the MSI address. These were historically not used on
bare metal, but IRQ remapping now uses the lowest bit to indicate a
remappable format interrupt.

A VMM can use the other 7 bits to allow guests to target 15 bits of
APIC ID, which gives support for 32Ki vCPUs without needing to expose
IRQ remapping to the guest.

Here's a proof-of-concept hack, which I've tested with a Linux guest
that knows where to put the additional 7 bits in the IOAPIC RTE and MSI
message. At least IOAPIC and emulated AHCI (MSI) are working; I haven't
tested assigned PCI devices yet.
diff mbox series

Patch

diff --git a/hw/i386/kvm/apic.c b/hw/i386/kvm/apic.c
index 4eb2d77b87..b0f4b1a630 100644
--- a/hw/i386/kvm/apic.c
+++ b/hw/i386/kvm/apic.c
@@ -14,6 +14,7 @@ 
 #include "qemu/module.h"
 #include "cpu.h"
 #include "hw/i386/apic_internal.h"
+#include "hw/i386/apic-msidef.h"
 #include "hw/pci/msi.h"
 #include "sysemu/hw_accel.h"
 #include "sysemu/kvm.h"
@@ -183,6 +184,13 @@  static void kvm_send_msi(MSIMessage *msg)
 {
     int ret;
 
+    /*
+     * The message has already passed through interrupt remapping if enabled,
+     * but the legacy extended destination ID in low bits still needs to be
+     * handled.
+     */
+    msg->address = apic_convert_ext_dest_id(msg->address);
+
     ret = kvm_irqchip_send_msi(kvm_state, *msg);
     if (ret < 0) {
         fprintf(stderr, "KVM: injection failed, MSI lost (%s)\n",
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index e87be5d29a..eb4901d6b7 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -807,7 +807,7 @@  void pc_machine_done(Notifier *notifier, void *data)
         fw_cfg_modify_i16(x86ms->fw_cfg, FW_CFG_NB_CPUS, x86ms->boot_cpus);
     }
 
-    if (x86ms->apic_id_limit > 255 && !xen_enabled()) {
+    if (0 && x86ms->apic_id_limit > 255 && !xen_enabled()) {
         IntelIOMMUState *iommu = INTEL_IOMMU_DEVICE(x86_iommu_get_default());
 
         if (!iommu || !x86_iommu_ir_supported(X86_IOMMU_DEVICE(iommu)) ||
diff --git a/include/hw/i386/apic-msidef.h b/include/hw/i386/apic-msidef.h
index 420b41167d..b3e0da64a5 100644
--- a/include/hw/i386/apic-msidef.h
+++ b/include/hw/i386/apic-msidef.h
@@ -28,4 +28,20 @@ 
 #define MSI_ADDR_DEST_IDX_SHIFT         4
 #define  MSI_ADDR_DEST_ID_MASK          0x000ff000
 
+static inline uint64_t apic_convert_ext_dest_id(uint64_t address)
+{
+        uint64_t ext_id = address & (0xff << MSI_ADDR_DEST_IDX_SHIFT);
+        /*
+         * Relocate a legacy extended destination ID into address_hi.
+         * Do nothing if bits 11-4 are all clear, if bit 4 (remappable
+         * format) is set, or if the upper 32 bits are already non-zero.
+         */
+        if (!ext_id || (ext_id & (1 << MSI_ADDR_DEST_IDX_SHIFT)) ||
+            (address >> 32))
+                return address;
+        /* Move bits 11-5 up by 35, i.e. to address bits 46-40. */
+        address &= ~ext_id;
+        address |= ext_id << 35;
+        return address;
+}
 #endif /* HW_APIC_MSIDEF_H */
diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index f6dae4cfb6..547a2faf72 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -4589,13 +4589,11 @@  int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route,
     X86IOMMUState *iommu = x86_iommu_get_default();
 
     if (iommu) {
-        int ret;
-        MSIMessage src, dst;
         X86IOMMUClass *class = X86_IOMMU_DEVICE_GET_CLASS(iommu);
 
-        if (!class->int_remap) {
-            return 0;
-        }
+        if (class->int_remap) {
+            int ret;
+            MSIMessage src, dst;
 
             src.address = route->u.msi.address_hi;
             src.address <<= VTD_MSI_ADDR_HI_SHIFT;
@@ -4610,11 +4608,21 @@  int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route,
                 return 1;
             }
 
+            /*
+             * Handle untranslated compatibility format interrupt with
+             * extended destination ID in the low bits 11-5. */
+            dst.address = apic_convert_ext_dest_id(dst.address);
+
             route->u.msi.address_hi = dst.address >> VTD_MSI_ADDR_HI_SHIFT;
             route->u.msi.address_lo = dst.address & VTD_MSI_ADDR_LO_MASK;
             route->u.msi.data = dst.data;
+            return 0;
+        }
     }
 
+    address = apic_convert_ext_dest_id(address);
+    route->u.msi.address_hi = address >> VTD_MSI_ADDR_HI_SHIFT;
+    route->u.msi.address_lo = address & VTD_MSI_ADDR_LO_MASK;
     return 0;
 }