diff mbox

[PART1,RFC,v4,08/11] svm: Add VMEXIT handlers for AVIC

Message ID 1460017232-17429-9-git-send-email-Suravee.Suthikulpanit@amd.com
State New
Headers show

Commit Message

Suthikulpanit, Suravee April 7, 2016, 8:20 a.m. UTC
From: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>


This patch introduces VMEXIT handlers, avic_incomplete_ipi_interception()
and avic_unaccelerated_access_interception() along with two trace points
(trace_kvm_avic_incomplete_ipi and trace_kvm_avic_unaccelerated_access).

Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>

---
 arch/x86/include/uapi/asm/svm.h |   9 +-
 arch/x86/kvm/lapic.h            |   3 +
 arch/x86/kvm/svm.c              | 246 ++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/trace.h            |  57 ++++++++++
 arch/x86/kvm/x86.c              |   2 +
 5 files changed, 316 insertions(+), 1 deletion(-)

-- 
1.9.1

Comments

Suthikulpanit, Suravee April 28, 2016, 10:08 p.m. UTC | #1
Hi Radim / Paolo,

Sorry for delay in response.

On 4/12/2016 11:22 AM, Radim Krčmář wrote:
> 2016-04-07 03:20-0500, Suravee Suthikulpanit:

>> From: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>

>>

>> This patch introduces VMEXIT handlers, avic_incomplete_ipi_interception()

>> and avic_unaccelerated_access_interception() along with two trace points

>> (trace_kvm_avic_incomplete_ipi and trace_kvm_avic_unaccelerated_access).

>>

>> Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>

>> ---

>> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c

>> @@ -3515,6 +3515,250 @@ static int mwait_interception(struct vcpu_svm *svm)

>> +static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u8 mda, bool flat)

>> +{

>> +	struct kvm_arch *vm_data = &vcpu->kvm->arch;

>> +	int index;

>> +	u32 *logical_apic_id_table;

>> +

>> +	if (flat) { /* flat */

>> +		if (mda > 7)

>

> Don't you want to check that just one bit it set?

>

>> +			return NULL;

>> +		index = mda;

>> +	} else { /* cluster */

>> +		int apic_id = mda & 0xf;

>> +		int cluster_id = (mda & 0xf0) >> 8;

>

> ">> 4".

>

>> +

>> +		if (apic_id > 4 || cluster_id >= 0xf)

>> +			return NULL;

>> +		index = (cluster_id << 2) + apic_id;

>

> ffs(apic_id), because 'apic_id' must be compacted into 2 bits.

>

>> +	}

>> +	logical_apic_id_table = (u32 *) page_address(vm_data->avic_logical_id_table_page);

>> +

>> +	return &logical_apic_id_table[index];

>> +}


I have quite messed up in the logic here for handling the logical 
cluster ID. Sorry for not catching this earlier. I'm rewriting this 
function altogether to simplify it in the V5.

>> [...]

>> +		lid = ffs(dlid) - 1;

>> +		ret = avic_handle_ldr_write(&svm->vcpu, svm->vcpu.vcpu_id, lid);

>> +		if (ret)

>> +			return 0;

>

> OS can actually change LDR, so the old one should be invalidated.

>

> (No OS does, but that is not an important factor for the hypervisor.)

>


By validating the old one, are you suggesting that we should disable the 
logical APIC table entry previously used by this vcpu? If so, that means 
we would need to cached the previous LDR value since the one in vAPIC 
backing page would already be updated.

>> [...]

>

>> +		if (vm_data->ldr_mode != mod) {

>> +			clear_page(page_address(vm_data->avic_logical_id_table_page));

>> +			vm_data->ldr_mode = mod;

>> +		}

>> +		break;

>> +	}

>

> All these cases need to be called on KVM_SET_LAPIC -- the userspace can

> provide completely new set of APIC registers and AVIC should build its

> maps with them.  Test with save/restore or migration.


Hm.. This means we might need to introduce a new hook which is called 
from the arch/x86/kvm/lapic.c: kvm_apic_post_state_restore(). Probably 
something like kvm_x86_ops->apic_post_state_restore(), which sets up the 
new physical and logical APIC id tables for AVIC.

If this works, I'll add support to handle this and test with the 
migration stuff in the V5.

>> +	if (offset >= 0x400) {

>> +		WARN(1, "Unsupported APIC offset %#x\n", offset);

>

> "printk_ratelimited(KERN_INFO " is the most severe message you could

> give.  I think that not printing anything is best,

>

>> +		return ret;

>

> because we should not return, but continue to emulate the access.


Then, this would continue as if we are handling the normal fault access.

>

>> +	}

>> +

>> +	if (trap) {

>> +		/* Handling Trap */

>> +		if (!write) /* Trap read should never happens */

>> +			BUG();

>

> (BUG_ON(!write) is shorter, though I would avoid BUG -- only guests are

>   going to fail, so we don't need to kill the host.)


Ok. What about just WARN_ONCE(!write, "svm: Handling trap read.\n");

Thanks,
Suravee
diff mbox

Patch

diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h
index 8a4add8..b9e9bb2 100644
--- a/arch/x86/include/uapi/asm/svm.h
+++ b/arch/x86/include/uapi/asm/svm.h
@@ -73,6 +73,8 @@ 
 #define SVM_EXIT_MWAIT_COND    0x08c
 #define SVM_EXIT_XSETBV        0x08d
 #define SVM_EXIT_NPF           0x400
+#define SVM_EXIT_AVIC_INCOMPLETE_IPI		0x401
+#define SVM_EXIT_AVIC_UNACCELERATED_ACCESS	0x402
 
 #define SVM_EXIT_ERR           -1
 
@@ -107,8 +109,10 @@ 
 	{ SVM_EXIT_SMI,         "smi" }, \
 	{ SVM_EXIT_INIT,        "init" }, \
 	{ SVM_EXIT_VINTR,       "vintr" }, \
+	{ SVM_EXIT_CR0_SEL_WRITE, "cr0_sel_write" }, \
 	{ SVM_EXIT_CPUID,       "cpuid" }, \
 	{ SVM_EXIT_INVD,        "invd" }, \
+	{ SVM_EXIT_PAUSE,       "pause" }, \
 	{ SVM_EXIT_HLT,         "hlt" }, \
 	{ SVM_EXIT_INVLPG,      "invlpg" }, \
 	{ SVM_EXIT_INVLPGA,     "invlpga" }, \
@@ -127,7 +131,10 @@ 
 	{ SVM_EXIT_MONITOR,     "monitor" }, \
 	{ SVM_EXIT_MWAIT,       "mwait" }, \
 	{ SVM_EXIT_XSETBV,      "xsetbv" }, \
-	{ SVM_EXIT_NPF,         "npf" }
+	{ SVM_EXIT_NPF,         "npf" }, \
+	{ SVM_EXIT_RSM,         "rsm" }, \
+	{ SVM_EXIT_AVIC_INCOMPLETE_IPI,		"avic_incomplete_ipi" }, \
+	{ SVM_EXIT_AVIC_UNACCELERATED_ACCESS,   "avic_unaccelerated_access" }
 
 
 #endif /* _UAPI__SVM_H */
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index a70cb62..2fc86b7 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -9,6 +9,9 @@ 
 #define KVM_APIC_SIPI		1
 #define KVM_APIC_LVT_NUM	6
 
+#define KVM_APIC_SHORT_MASK	0xc0000
+#define KVM_APIC_DEST_MASK	0x800
+
 struct kvm_timer {
 	struct hrtimer timer;
 	s64 period; 				/* unit: ns */
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f9547bc..13fba3b 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3515,6 +3515,250 @@  static int mwait_interception(struct vcpu_svm *svm)
 	return nop_interception(svm);
 }
 
+enum avic_ipi_failure_cause {
+	AVIC_IPI_FAILURE_INVALID_INT_TYPE,
+	AVIC_IPI_FAILURE_TARGET_NOT_RUNNING,
+	AVIC_IPI_FAILURE_INVALID_TARGET,
+	AVIC_IPI_FAILURE_INVALID_BACKING_PAGE,
+};
+
+static int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
+{
+	u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
+	u32 icrl = svm->vmcb->control.exit_info_1;
+	u32 id = svm->vmcb->control.exit_info_2 >> 32;
+	u32 index = svm->vmcb->control.exit_info_2 && 0xFF;
+	struct kvm_lapic *apic = svm->vcpu.arch.apic;
+
+	trace_kvm_avic_incomplete_ipi(svm->vcpu.vcpu_id, icrh, icrl, id, index);
+
+	switch (id) {
+	case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
+		/*
+		 * AVIC hardware handles the generation of
+		 * IPIs when the specified Message Type is Fixed
+		 * (also known as fixed delivery mode) and
+		 * the Trigger Mode is edge-triggered. The hardware
+		 * also supports self and broadcast delivery modes
+		 * specified via the Destination Shorthand(DSH)
+		 * field of the ICRL. Logical and physical APIC ID
+		 * formats are supported. All other IPI types cause
+		 * a #VMEXIT, which needs to emulated.
+		 */
+		kvm_lapic_reg_write(apic, APIC_ICR2, icrh);
+		kvm_lapic_reg_write(apic, APIC_ICR, icrl);
+		break;
+	case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: {
+		int i;
+		struct kvm_vcpu *vcpu;
+		struct kvm *kvm = svm->vcpu.kvm;
+		struct kvm_lapic *apic = svm->vcpu.arch.apic;
+
+		/*
+		 * At this point, we expect that the AVIC HW has already
+		 * set the appropriate IRR bits on the valid target
+		 * vcpus. So, we just need to kick the appropriate vcpu.
+		 */
+		kvm_for_each_vcpu(i, vcpu, kvm) {
+			bool m = kvm_apic_match_dest(vcpu, apic,
+						     icrl & KVM_APIC_SHORT_MASK,
+						     GET_APIC_DEST_FIELD(icrh),
+						     icrl & KVM_APIC_DEST_MASK);
+
+			if (m && !avic_vcpu_is_running(vcpu))
+				kvm_vcpu_wake_up(vcpu);
+		}
+		break;
+	}
+	case AVIC_IPI_FAILURE_INVALID_TARGET:
+		break;
+	case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
+		WARN_ONCE(1, "Invalid backing page\n");
+		break;
+	default:
+		pr_err("Unknown IPI interception\n");
+	}
+
+	return 1;
+}
+
+static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u8 mda, bool flat)
+{
+	struct kvm_arch *vm_data = &vcpu->kvm->arch;
+	int index;
+	u32 *logical_apic_id_table;
+
+	if (flat) { /* flat */
+		if (mda > 7)
+			return NULL;
+		index = mda;
+	} else { /* cluster */
+		int apic_id = mda & 0xf;
+		int cluster_id = (mda & 0xf0) >> 8;
+
+		if (apic_id > 4 || cluster_id >= 0xf)
+			return NULL;
+		index = (cluster_id << 2) + apic_id;
+	}
+	logical_apic_id_table = (u32 *) page_address(vm_data->avic_logical_id_table_page);
+
+	return &logical_apic_id_table[index];
+}
+
+static int avic_handle_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id,
+				 u8 logical_id)
+{
+	u32 mod;
+	u32 *entry, new_entry;
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	if (!svm)
+		return -EINVAL;
+
+	mod = (kvm_apic_get_reg(svm->vcpu.arch.apic, APIC_DFR) >> 28) & 0xf;
+	entry = avic_get_logical_id_entry(vcpu, logical_id, (mod == 0xf));
+	if (!entry)
+		return -EINVAL;
+
+	new_entry = READ_ONCE(*entry);
+	new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
+	new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
+	new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
+	WRITE_ONCE(*entry, new_entry);
+
+	return 0;
+}
+
+static int avic_unaccel_trap_write(struct vcpu_svm *svm)
+{
+	u32 offset = svm->vmcb->control.exit_info_1 & 0xFF0;
+	struct kvm_lapic *apic = svm->vcpu.arch.apic;
+	u32 reg = kvm_apic_get_reg(apic, offset);
+
+	switch (offset) {
+	case APIC_ID: {
+		u32 aid = (reg >> 24) & 0xff;
+		u64 *o_ent = avic_get_physical_id_entry(&svm->vcpu,
+							svm->vcpu.vcpu_id);
+		u64 *n_ent = avic_get_physical_id_entry(&svm->vcpu, aid);
+
+		if (!n_ent || !o_ent)
+			return 0;
+
+		/* We need to move physical_id_entry to new offset */
+		*n_ent = *o_ent;
+		*o_ent = 0ULL;
+		svm->avic_physical_id_cache = n_ent;
+		break;
+	}
+	case APIC_LDR: {
+		int ret, lid;
+		int dlid = (reg >> 24) & 0xff;
+
+		if (!dlid)
+			return 0;
+
+		lid = ffs(dlid) - 1;
+		ret = avic_handle_ldr_write(&svm->vcpu, svm->vcpu.vcpu_id, lid);
+		if (ret)
+			return 0;
+
+		break;
+	}
+	case APIC_DFR: {
+		struct kvm_arch *vm_data = &svm->vcpu.kvm->arch;
+		u32 mod = (reg >> 28) & 0xf;
+
+		/*
+		 * We assume that all local APICs are using the same type.
+		 * If this changes, we need to rebuild the AVIC logical
+		 * APID id table with subsequent write to APIC_LDR.
+		 */
+		if (vm_data->ldr_mode != mod) {
+			clear_page(page_address(vm_data->avic_logical_id_table_page));
+			vm_data->ldr_mode = mod;
+		}
+		break;
+	}
+	default:
+		break;
+	}
+
+	kvm_lapic_reg_write(apic, offset, reg);
+
+	return 1;
+}
+
+static bool is_avic_unaccelerated_access_trap(u32 offset)
+{
+	bool ret = false;
+
+	switch (offset) {
+	case APIC_ID:
+	case APIC_EOI:
+	case APIC_RRR:
+	case APIC_LDR:
+	case APIC_DFR:
+	case APIC_SPIV:
+	case APIC_ESR:
+	case APIC_ICR:
+	case APIC_LVTT:
+	case APIC_LVTTHMR:
+	case APIC_LVTPC:
+	case APIC_LVT0:
+	case APIC_LVT1:
+	case APIC_LVTERR:
+	case APIC_TMICT:
+	case APIC_TDCR:
+		ret = true;
+		break;
+	default:
+		break;
+	}
+	return ret;
+}
+
+#define AVIC_UNACCEL_ACCESS_WRITE_MASK		1
+#define AVIC_UNACCEL_ACCESS_OFFSET_MASK		0xFF0
+#define AVIC_UNACCEL_ACCESS_VECTOR_MASK		0xFFFFFFFF
+
+static int avic_unaccelerated_access_interception(struct vcpu_svm *svm)
+{
+	int ret = 0;
+	u32 offset = svm->vmcb->control.exit_info_1 &
+		     AVIC_UNACCEL_ACCESS_OFFSET_MASK;
+	u32 vector = svm->vmcb->control.exit_info_2 &
+		     AVIC_UNACCEL_ACCESS_VECTOR_MASK;
+	bool write = (svm->vmcb->control.exit_info_1 >> 32) &
+		     AVIC_UNACCEL_ACCESS_WRITE_MASK;
+	bool trap = is_avic_unaccelerated_access_trap(offset);
+
+	trace_kvm_avic_unaccelerated_access(svm->vcpu.vcpu_id, offset,
+					    trap, write, vector);
+
+	/**
+	 * AVIC does not support x2APIC registers, and we only advertise
+	 * xAPIC when enable AVIC. Therefore, access to these registers
+	 * will not be supported.
+	 */
+	if (offset >= 0x400) {
+		WARN(1, "Unsupported APIC offset %#x\n", offset);
+		return ret;
+	}
+
+	if (trap) {
+		/* Handling Trap */
+		if (!write) /* Trap read should never happens */
+			BUG();
+		ret = avic_unaccel_trap_write(svm);
+	} else {
+		/* Handling Fault */
+		ret = (emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE);
+	}
+
+	return ret;
+}
+
 static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
 	[SVM_EXIT_READ_CR0]			= cr_interception,
 	[SVM_EXIT_READ_CR3]			= cr_interception,
@@ -3578,6 +3822,8 @@  static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
 	[SVM_EXIT_XSETBV]			= xsetbv_interception,
 	[SVM_EXIT_NPF]				= pf_interception,
 	[SVM_EXIT_RSM]                          = emulate_on_interception,
+	[SVM_EXIT_AVIC_INCOMPLETE_IPI]		= avic_incomplete_ipi_interception,
+	[SVM_EXIT_AVIC_UNACCELERATED_ACCESS]	= avic_unaccelerated_access_interception,
 };
 
 static void dump_vmcb(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 2f1ea2f..39f264c 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -1292,6 +1292,63 @@  TRACE_EVENT(kvm_hv_stimer_cleanup,
 		  __entry->vcpu_id, __entry->timer_index)
 );
 
+/*
+ * Tracepoint for AMD AVIC
+ */
+TRACE_EVENT(kvm_avic_incomplete_ipi,
+	    TP_PROTO(u32 vcpu, u32 icrh, u32 icrl, u32 id, u32 index),
+	    TP_ARGS(vcpu, icrh, icrl, id, index),
+
+	TP_STRUCT__entry(
+		__field(u32, vcpu)
+		__field(u32, icrh)
+		__field(u32, icrl)
+		__field(u32, id)
+		__field(u32, index)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu = vcpu;
+		__entry->icrh = icrh;
+		__entry->icrl = icrl;
+		__entry->id = id;
+		__entry->index = index;
+	),
+
+	TP_printk("vcpu=%u, icrh:icrl=%#010x:%08x, id=%u, index=%u\n",
+		  __entry->vcpu, __entry->icrh, __entry->icrl,
+		  __entry->id, __entry->index)
+);
+
+TRACE_EVENT(kvm_avic_unaccelerated_access,
+	    TP_PROTO(u32 vcpu, u32 offset, bool ft, bool rw, u32 vec),
+	    TP_ARGS(vcpu, offset, ft, rw, vec),
+
+	TP_STRUCT__entry(
+		__field(u32, vcpu)
+		__field(u32, offset)
+		__field(bool, ft)
+		__field(bool, rw)
+		__field(u32, vec)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu = vcpu;
+		__entry->offset = offset;
+		__entry->ft = ft;
+		__entry->rw = rw;
+		__entry->vec = vec;
+	),
+
+	TP_printk("vcpu=%u, offset=%#x(%s), %s, %s, vec=%#x\n",
+		  __entry->vcpu,
+		  __entry->offset,
+		  __print_symbolic(__entry->offset, kvm_trace_symbol_apic),
+		  __entry->ft ? "trap" : "fault",
+		  __entry->rw ? "write" : "read",
+		  __entry->vec)
+);
+
 #endif /* _TRACE_KVM_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d12583e..b0f211c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8437,3 +8437,5 @@  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);