[PART1,RFC,v2,05/10] KVM: x86: Detect and Initialize AVIC support

Message ID 1457124368-2025-6-git-send-email-Suravee.Suthikulpanit@amd.com
State New

Commit Message

Suthikulpanit, Suravee March 4, 2016, 8:46 p.m. UTC
This patch introduces the AVIC-related data structures and
AVIC initialization code.

There are three main data structures for AVIC:
    * Virtual APIC (vAPIC) backing page (per-VCPU)
    * Physical APIC ID table (per-VM)
    * Logical APIC ID table (per-VM)

In order to accommodate the new per-VM tables, we introduce
a new per-VM arch-specific void pointer, struct kvm_arch.arch_data.
This will point to the newly introduced struct svm_vm_data.

Currently, AVIC is disabled by default. Users can enable it
via the kernel boot option kvm-amd.avic=1 or by loading the
kvm-amd module with the parameter avic=1.
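
For example (illustrative command lines):
    * Boot option:  append kvm-amd.avic=1 to the kernel command line
    * Module load:  modprobe kvm-amd avic=1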

Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>

---
 arch/x86/include/asm/kvm_host.h |   2 +
 arch/x86/kvm/svm.c              | 416 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 417 insertions(+), 1 deletion(-)

-- 
1.9.1

Comments

Suthikulpanit, Suravee March 15, 2016, 5:09 p.m. UTC | #1
Hi

On 03/07/2016 11:41 PM, Paolo Bonzini wrote:
> On 04/03/2016 21:46, Suravee Suthikulpanit wrote:
>> [....]
>> +/* Note: This structure is per VM */
>> +struct svm_vm_data {
>> +	atomic_t count;
>> +	u32 ldr_mode;
>> +	u32 avic_max_vcpu_id;
>> +	u32 avic_tag;
>> +
>> +	struct page *avic_log_ait_page;
>> +	struct page *avic_phy_ait_page;
>
> You can put these directly in kvm_arch.  Do not use abbreviations:
>
> 	struct page *avic_logical_apic_id_table_page;
> 	struct page *avic_physical_apic_id_table_page;
>

Actually, the reason I would like to introduce this per-arch-specific 
structure is that I find it easier to manage these processor-specific 
variables and data structures this way. If we add all of them directly 
into kvm_arch, which is shared b/w SVM and VMX, it is harder to tell 
which ones are used by which code base.
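
For example (a sketch only; avic_get_vm_data() is a made-up helper, not
part of this patch):

	/* SVM-specific per-VM state stays behind one opaque pointer,
	 * instead of adding avic_* fields to the kvm_arch structure
	 * that VMX code also sees.
	 */
	static struct svm_vm_data *avic_get_vm_data(struct kvm *kvm)
	{
		return kvm->arch.arch_data; /* NULL until avic_vm_init() */
	}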

>> [...]
>> +	memcpy(vapic_bkpg, svm->in_kernel_lapic_regs, PAGE_SIZE);
>> +	svm->vcpu.arch.apic->regs = vapic_bkpg;
>
> Can you explain the flipping logic, and why you cannot just use the
> existing apic.regs?


Please see "explanation 1" below.

>> [...]
>> +static struct svm_avic_phy_ait_entry *
>> +avic_get_phy_ait_entry(struct kvm_vcpu *vcpu, int index)
>> +{
>> +    [.....]
>> +}
>> +
>> +struct svm_avic_log_ait_entry *
>> +avic_get_log_ait_entry(struct kvm_vcpu *vcpu, u8 mda, bool is_flat)
>> +{
>> +	[.....]
>> +}
>
> Instead of these functions, create a complete function to handle APIC_ID
> and APIC_LDR writes.  Then use kmap/kunmap instead of page_address.
>


Ok. May I ask why we are against using page_address?  I have see that 
used in several places in the code.
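
Just to check my understanding of the suggestion, something along these
lines (a sketch only; avic_read_bk_reg() is a made-up name):

	/* kmap-based register read for the vAPIC backing page.
	 * Unlike page_address(), kmap() is also valid for highmem
	 * pages, which have no permanent kernel mapping.
	 */
	static u32 avic_read_bk_reg(struct page *bk_page, u32 reg_off)
	{
		u32 *regs = kmap(bk_page);
		u32 val = regs[reg_off / 4];

		kunmap(bk_page);
		return val;
	}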

>> [...]
>> +static int avic_alloc_bk_page(struct vcpu_svm *svm, int id)
>> +{
>> +	int ret = 0, i;
>> +	bool realloc = false;
>> +	struct kvm_vcpu *vcpu;
>> +	struct kvm *kvm = svm->vcpu.kvm;
>> +	struct svm_vm_data *vm_data = kvm->arch.arch_data;
>> +
>> +	mutex_lock(&kvm->slots_lock);
>> +
>> +	/* Check if we have already allocated a vAPIC backing
>> +	 * page for this vCPU. If not, we need to realloc
>> +	 * a new one and re-assign all other vCPUs.
>> +	 */
>> +	if (kvm->arch.apic_access_page_done &&
>> +	    (id > vm_data->avic_max_vcpu_id)) {
>> +		kvm_for_each_vcpu(i, vcpu, kvm)
>> +			avic_unalloc_bk_page(vcpu);
>> +
>> +		__x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
>> +					0, 0);
>> +		realloc = true;
>> +		vm_data->avic_max_vcpu_id = 0;
>> +	}
>> +
>> +	/*
>> +	 * We are allocating vAPIC backing pages
>> +	 * up to the max vCPU ID.
>> +	 */
>> +	if (id >= vm_data->avic_max_vcpu_id) {
>> +		ret = __x86_set_memory_region(kvm,
>> +					      APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
>> +					      APIC_DEFAULT_PHYS_BASE,
>> +					      PAGE_SIZE * (id + 1));
>
> Why is this necessary?  The APIC access page is a peculiarity of Intel
> processors (and the special memslot for it only needs to map 0xfee00000 to
> 0xfee00fff; after that there is the MSI area).
>


Please see "explanation 1" below.

>> [...]
>> +		if (ret)
>> +			goto out;
>> +
>> +		vm_data->avic_max_vcpu_id = id;
>> +	}
>> +
>> +	/* Reinit vAPIC backing page for existing vcpus */
>> +	if (realloc)
>> +		kvm_for_each_vcpu(i, vcpu, kvm)
>> +			avic_init_bk_page(vcpu);
>
> Why is this necessary?


Explanation 1:

The current lapic regs page is allocated using get_zeroed_page(), which 
can be paged out. If I use these pages for AVIC backing pages, it seems 
to cause the VM to slow down quite a bit due to a lot of page faults.

Currently, the AVIC backing pages are acquired from 
__x86_set_memory_region() with APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, which 
maps the pages at address 0xfee00000 and above for the VM to use. I 
mostly grabbed this from the VMX implementation in 
alloc_apic_access_page().

However, the memslot requires the size to be specified at the time 
__x86_set_memory_region() is called, and I can't seem to figure out 
where to get the number of vcpus at the time the VM is created. 
Therefore, I have to track vcpu creation, and re-acquire a larger 
memslot every time vcpu_create() is called.

I am not sure if this is the right approach. Any suggestions for this part?

Thanks,
Suravee
Suthikulpanit, Suravee March 16, 2016, 6:22 a.m. UTC | #2
Hi,

On 03/16/2016 12:22 AM, Paolo Bonzini wrote:
>>> Why is this necessary?  The APIC access page is a peculiarity of Intel
>>> processors (and the special memslot for it only needs to map 0xfee00000 to
>>> 0xfee00fff; after that there is the MSI area).
>>
>> The current lapic regs page is allocated using get_zeroed_page(), which
>> can be paged out. If I use these pages for AVIC backing pages, it seems
>> to cause the VM to slow down quite a bit due to a lot of page faults.
>
> What causes the lapic regs page to be paged out?
>


This mainly causes a large number of VMEXITs due to NPF. My test runs 
hackbench in the guest. The following are perf results profiling for 
10 seconds on the host in two cases:

CASE1: Using x86_set_memory_region() for AVIC backing page

# ./perf-vmexit.sh 10
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 2.813 MB perf.data.guest (30356 samples) ]


Analyze events for all VMs, all VCPUs:

              VM-EXIT    Samples  Samples%     Time%    Min Time    Max Time         Avg time

            interrupt      10042    66.30%    81.33%      0.43us    202.50us      7.43us ( +-   1.20% )
                  msr       5004    33.04%    15.76%      0.73us     12.21us      2.89us ( +-   0.43% )
                pause         58     0.38%     0.18%      0.56us      5.88us      2.92us ( +-   6.43% )
                  npf         35     0.23%     2.01%      6.41us    207.78us     52.70us ( +-  23.67% )
                  nmi          4     0.03%     0.02%      2.31us      4.67us      3.49us ( +-  14.26% )
                   io          3     0.02%     0.70%     82.75us    360.90us    214.28us ( +-  37.64% )
      avic_incomp_ipi          1     0.01%     0.00%      2.17us      2.17us      2.17us ( +-   0.00% )

Total Samples:15147, Total events handled time:91715.78us.


CASE2: Using the lapic regs page for AVIC backing page.

# ./perf-vmexit.sh 10
[ perf record: Woken up 255 times to write data ]
[ perf record: Captured and wrote 509.202 MB perf.data.guest (5718856 samples) ]


Analyze events for all VMs, all VCPUs:

              VM-EXIT    Samples  Samples%     Time%    Min Time    Max Time         Avg time

                  npf    1897710    99.33%    98.08%      1.09us    243.22us      1.67us ( +-   0.04% )
            interrupt       7818     0.41%     1.44%      0.44us    216.55us      5.97us ( +-   1.92% )
                  msr       5001     0.26%     0.45%      0.68us     12.58us      2.89us ( +-   0.50% )
                pause         25     0.00%     0.00%      0.71us      4.23us      2.03us ( +-  10.76% )
                   io          4     0.00%     0.03%     73.91us    337.29us    206.74us ( +-  26.38% )
                  nmi          1     0.00%     0.00%      5.92us      5.92us      5.92us ( +-   0.00% )

Total Samples:1910559, Total events handled time:3229214.64us.

Thanks,
Suravee
Suthikulpanit, Suravee March 16, 2016, 8:21 a.m. UTC | #3
Hi,

On 03/16/2016 02:20 PM, Paolo Bonzini wrote:
> On 16/03/2016 07:22, Suravee Suthikulpanit wrote:
>> This mainly causes a large number of VMEXITs due to NPF.
>
> Got it, it's here in the manual: "System software is responsible for
> setting up a translation in the nested page table granting guest read
> and write permissions for accesses to the vAPIC Backing Page in SPA
> space. AVIC hardware walks the nested page table to check permissions,
> but does not use the SPA address specified in the leaf page table entry.
> Instead, AVIC hardware finds this address in the AVIC_BACKING_PAGE
> pointer field of the VMCB".
>
> Strictly speaking the address of the 0xFEE00000 translation is
> unnecessary and it could be all zeroes, but I suggest that you set up an
> APIC access page like Intel does (4k only), using the special memslot.
> The AVIC backing page can then point to lapic->regs.
>
> Thanks for the explanation!
>
> Paolo


Ahh... you are right, this works too. Thanks for the pointer. I'm 
fixing this, doing some more testing, and cleaning up the code. This 
has simplified the init logic quite a bit.
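
To sketch where this is heading (not the final code; avic_init_backing_page()
is a placeholder name, the other names are the ones used in this patch):

	/*
	 * Set up a single 4K APIC access page (like Intel) once per VM,
	 * then point the VMCB backing-page pointer at the in-kernel
	 * lapic regs. AVIC takes the SPA from the AVIC_BACKING_PAGE
	 * pointer, so the memslot only needs to cover 0xfee00000-0xfee00fff.
	 */
	static int avic_init_backing_page(struct kvm_vcpu *vcpu)
	{
		struct vcpu_svm *svm = to_svm(vcpu);
		struct kvm *kvm = vcpu->kvm;
		int ret = 0;

		mutex_lock(&kvm->slots_lock);
		if (!kvm->arch.apic_access_page_done) {
			ret = __x86_set_memory_region(kvm,
					APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
					APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
			if (!ret)
				kvm->arch.apic_access_page_done = true;
		}
		mutex_unlock(&kvm->slots_lock);
		if (ret)
			return ret;

		svm->vmcb->control.avic_bk_page =
			__pa(vcpu->arch.apic->regs) & AVIC_HPA_MASK;
		return 0;
	}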

Thanks for the suggestion,
Suravee

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9a61669..15f6cd8 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -754,6 +754,8 @@  struct kvm_arch {
 
 	bool irqchip_split;
 	u8 nr_reserved_ioapic_pins;
+
+	void *arch_data;
 };
 
 struct kvm_vm_stat {
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 6ab66d0..c703149 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -14,6 +14,9 @@ 
  * the COPYING file in the top-level directory.
  *
  */
+
+#define pr_fmt(fmt) "SVM: " fmt
+
 #include <linux/kvm_host.h>
 
 #include "irq.h"
@@ -78,6 +81,11 @@  MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
 #define TSC_RATIO_MIN		0x0000000000000001ULL
 #define TSC_RATIO_MAX		0x000000ffffffffffULL
 
+#define AVIC_HPA_MASK	~((0xFFFULL << 52) | 0xFFF)
+
+/* NOTE: Current max index allowed for physical APIC ID table is 255 */
+#define AVIC_PHY_APIC_ID_MAX	0xFF
+
 static bool erratum_383_found __read_mostly;
 
 static const u32 host_save_user_msrs[] = {
@@ -162,6 +170,37 @@  struct vcpu_svm {
 
 	/* cached guest cpuid flags for faster access */
 	bool nrips_enabled	: 1;
+
+	struct page *avic_bk_page;
+	void *in_kernel_lapic_regs;
+};
+
+struct __attribute__ ((__packed__))
+svm_avic_log_ait_entry {
+	u32 guest_phy_apic_id	: 8,
+	    res			: 23,
+	    valid		: 1;
+};
+
+struct __attribute__ ((__packed__))
+svm_avic_phy_ait_entry {
+	u64 host_phy_apic_id	: 8,
+	    res1		: 4,
+	    bk_pg_ptr		: 40,
+	    res2		: 10,
+	    is_running		: 1,
+	    valid		: 1;
+};
+
+/* Note: This structure is per VM */
+struct svm_vm_data {
+	atomic_t count;
+	u32 ldr_mode;
+	u32 avic_max_vcpu_id;
+	u32 avic_tag;
+
+	struct page *avic_log_ait_page;
+	struct page *avic_phy_ait_page;
 };
 
 static DEFINE_PER_CPU(u64, current_tsc_ratio);
@@ -205,6 +244,10 @@  module_param(npt, int, S_IRUGO);
 static int nested = true;
 module_param(nested, int, S_IRUGO);
 
+/* enable / disable AVIC */
+static int avic;
+module_param(avic, int, S_IRUGO);
+
 static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
 static void svm_flush_tlb(struct kvm_vcpu *vcpu);
 static void svm_complete_interrupts(struct vcpu_svm *svm);
@@ -234,6 +277,13 @@  enum {
 /* TPR and CR2 are always written before VMRUN */
 #define VMCB_ALWAYS_DIRTY_MASK	((1U << VMCB_INTR) | (1U << VMCB_CR2))
 
+#define VMCB_AVIC_APIC_BAR_MASK		0xFFFFFFFFFF000ULL
+
+static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data)
+{
+	svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK;
+}
+
 static inline void mark_all_dirty(struct vmcb *vmcb)
 {
 	vmcb->control.clean = 0;
@@ -923,6 +973,13 @@  static __init int svm_hardware_setup(void)
 	} else
 		kvm_disable_tdp();
 
+	if (avic && (!npt_enabled || !boot_cpu_has(X86_FEATURE_AVIC)))
+		avic = false;
+
+	if (avic) {
+		printk(KERN_INFO "kvm: AVIC enabled\n");
+	}
+
 	return 0;
 
 err:
@@ -1000,6 +1057,41 @@  static void svm_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment)
 	mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
 }
 
+static void avic_init_vmcb(struct vcpu_svm *svm)
+{
+	void *vapic_bkpg;
+	struct vmcb *vmcb = svm->vmcb;
+	struct svm_vm_data *vm_data = svm->vcpu.kvm->arch.arch_data;
+	phys_addr_t bpa = PFN_PHYS(page_to_pfn(svm->avic_bk_page));
+	phys_addr_t lpa = PFN_PHYS(page_to_pfn(vm_data->avic_log_ait_page));
+	phys_addr_t ppa = PFN_PHYS(page_to_pfn(vm_data->avic_phy_ait_page));
+
+	if (!vmcb)
+		return;
+
+	pr_debug("%s: bpa=%#llx, lpa=%#llx, ppa=%#llx\n",
+		 __func__, (unsigned long long)bpa,
+		 (unsigned long long)lpa, (unsigned long long)ppa);
+
+
+	/*
+	 * Here we copy the values from the emulated LAPIC registers
+	 * to the vAPIC backing page, and then flip the regs frame.
+	 */
+	if (!svm->in_kernel_lapic_regs)
+		svm->in_kernel_lapic_regs = svm->vcpu.arch.apic->regs;
+
+	vapic_bkpg = pfn_to_kaddr(page_to_pfn(svm->avic_bk_page));
+	memcpy(vapic_bkpg, svm->in_kernel_lapic_regs, PAGE_SIZE);
+	svm->vcpu.arch.apic->regs = vapic_bkpg;
+
+	vmcb->control.avic_bk_page = bpa & AVIC_HPA_MASK;
+	vmcb->control.avic_log_apic_id = lpa & AVIC_HPA_MASK;
+	vmcb->control.avic_phy_apic_id = ppa & AVIC_HPA_MASK;
+	vmcb->control.avic_phy_apic_id |= AVIC_PHY_APIC_ID_MAX;
+	vmcb->control.avic_enable = 1;
+}
+
 static void init_vmcb(struct vcpu_svm *svm)
 {
 	struct vmcb_control_area *control = &svm->vmcb->control;
@@ -1113,6 +1205,293 @@  static void init_vmcb(struct vcpu_svm *svm)
 	mark_all_dirty(svm->vmcb);
 
 	enable_gif(svm);
+
+	if (avic)
+		avic_init_vmcb(svm);
+}
+
+static struct svm_avic_phy_ait_entry *
+avic_get_phy_ait_entry(struct kvm_vcpu *vcpu, int index)
+{
+	struct svm_avic_phy_ait_entry *avic_phy_ait;
+	struct svm_vm_data *vm_data = vcpu->kvm->arch.arch_data;
+
+	if (!vm_data)
+		return NULL;
+
+	/* Note: APIC ID = 0xff is used for broadcast.
+	 *       APIC ID > 0xff is reserved.
+	 */
+	if (index >= 0xff)
+		return NULL;
+
+	avic_phy_ait = page_address(vm_data->avic_phy_ait_page);
+
+	return &avic_phy_ait[index];
+}
+
+struct svm_avic_log_ait_entry *
+avic_get_log_ait_entry(struct kvm_vcpu *vcpu, u8 mda, bool is_flat)
+{
+	struct svm_vm_data *vm_data = vcpu->kvm->arch.arch_data;
+	int index;
+	struct svm_avic_log_ait_entry *avic_log_ait;
+
+	if (!vm_data)
+		return NULL;
+
+	if (is_flat) { /* flat */
+		if (mda > 7)
+			return NULL;
+		index = mda;
+	} else { /* cluster */
+		int apic_id = mda & 0xf;
+		int cluster_id = (mda & 0xf0) >> 4;
+
+		if (apic_id > 4 || cluster_id >= 0xf)
+			return NULL;
+		index = (cluster_id << 2) + apic_id;
+	}
+	avic_log_ait = (struct svm_avic_log_ait_entry *)
+				page_address(vm_data->avic_log_ait_page);
+
+	return &avic_log_ait[index];
+}
+
+static inline void
+avic_set_bk_page_entry(struct vcpu_svm *svm, int reg_off, u32 val)
+{
+	void *avic_bk = page_address(svm->avic_bk_page);
+
+	*((u32 *) (avic_bk + reg_off)) = val;
+}
+
+static inline u32 *avic_get_bk_page_entry(struct vcpu_svm *svm, u32 offset)
+{
+	char *tmp = (char *)page_address(svm->avic_bk_page);
+
+	return (u32 *)(tmp+offset);
+}
+
+static int avic_init_log_apic_entry(struct kvm_vcpu *vcpu, u8 g_phy_apic_id,
+				    u8 log_apic_id)
+{
+	u32 mod;
+	struct svm_avic_log_ait_entry *entry;
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	if (!svm)
+		return -EINVAL;
+
+	mod = (*avic_get_bk_page_entry(svm, APIC_DFR) >> 28) & 0xf;
+	entry = avic_get_log_ait_entry(vcpu, log_apic_id, (mod == 0xf));
+	if (!entry)
+		return -EINVAL;
+	entry->guest_phy_apic_id = g_phy_apic_id;
+	entry->valid = 1;
+
+	return 0;
+}
+
+static int avic_init_bk_page(struct kvm_vcpu *vcpu)
+{
+	u64 addr;
+	struct page *page;
+	int id = vcpu->vcpu_id;
+	struct kvm *kvm = vcpu->kvm;
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	addr = APIC_DEFAULT_PHYS_BASE + (id * PAGE_SIZE);
+	page = gfn_to_page(kvm, addr >> PAGE_SHIFT);
+	if (is_error_page(page))
+		return -EFAULT;
+
+	/*
+	 * Do not pin the page in memory, so that memory hot-unplug
+	 * is able to migrate it.
+	 */
+	put_page(page);
+
+	/* Setting up AVIC Backing Page */
+	svm->avic_bk_page = page;
+
+	clear_page(kmap(page));
+	pr_debug("%s: vAPIC bk page: cpu=%u, addr=%#llx, pa=%#llx\n",
+		 __func__, id, addr,
+		 (unsigned long long) PFN_PHYS(page_to_pfn(page)));
+
+	avic_init_vmcb(svm);
+
+	return 0;
+}
+
+static inline void avic_unalloc_bk_page(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	if (svm->avic_bk_page)
+		kunmap(svm->avic_bk_page);
+}
+
+static int avic_alloc_bk_page(struct vcpu_svm *svm, int id)
+{
+	int ret = 0, i;
+	bool realloc = false;
+	struct kvm_vcpu *vcpu;
+	struct kvm *kvm = svm->vcpu.kvm;
+	struct svm_vm_data *vm_data = kvm->arch.arch_data;
+
+	mutex_lock(&kvm->slots_lock);
+
+	/* Check if we have already allocated a vAPIC backing
+	 * page for this vCPU. If not, we need to realloc
+	 * a new one and re-assign all other vCPUs.
+	 */
+	if (kvm->arch.apic_access_page_done &&
+	    (id > vm_data->avic_max_vcpu_id)) {
+		kvm_for_each_vcpu(i, vcpu, kvm)
+			avic_unalloc_bk_page(vcpu);
+
+		__x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
+					0, 0);
+		realloc = true;
+		vm_data->avic_max_vcpu_id = 0;
+	}
+
+	/*
+	 * We are allocating vAPIC backing pages
+	 * up to the max vCPU ID.
+	 */
+	if (id >= vm_data->avic_max_vcpu_id) {
+		ret = __x86_set_memory_region(kvm,
+					      APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
+					      APIC_DEFAULT_PHYS_BASE,
+					      PAGE_SIZE * (id + 1));
+		if (ret)
+			goto out;
+
+		vm_data->avic_max_vcpu_id = id;
+	}
+
+	/* Reinit vAPIC backing page for existing vcpus */
+	if (realloc)
+		kvm_for_each_vcpu(i, vcpu, kvm)
+			avic_init_bk_page(vcpu);
+
+	avic_init_bk_page(&svm->vcpu);
+
+	kvm->arch.apic_access_page_done = true;
+
+out:
+	mutex_unlock(&kvm->slots_lock);
+	return ret;
+}
+
+static void avic_vm_uninit(struct kvm *kvm)
+{
+	struct svm_vm_data *vm_data = kvm->arch.arch_data;
+
+	if (!vm_data)
+		return;
+
+	if (vm_data->avic_log_ait_page)
+		__free_page(vm_data->avic_log_ait_page);
+	if (vm_data->avic_phy_ait_page)
+		__free_page(vm_data->avic_phy_ait_page);
+	kfree(vm_data);
+	kvm->arch.arch_data = NULL;
+}
+
+static void avic_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	struct svm_vm_data *vm_data = vcpu->kvm->arch.arch_data;
+
+	if (!avic)
+		return;
+
+	avic_unalloc_bk_page(vcpu);
+	svm->vcpu.arch.apic->regs = svm->in_kernel_lapic_regs;
+	svm->in_kernel_lapic_regs = NULL;
+
+	if (vm_data &&
+	    (atomic_read(&vm_data->count) == 0 ||
+	     atomic_dec_and_test(&vm_data->count)))
+		avic_vm_uninit(vcpu->kvm);
+}
+
+static atomic_t avic_tag_gen = ATOMIC_INIT(1);
+
+static inline u32 avic_get_next_tag(void)
+{
+	u32 tag = atomic_read(&avic_tag_gen);
+
+	atomic_inc(&avic_tag_gen);
+	return tag;
+}
+
+static int avic_vm_init(struct kvm *kvm)
+{
+	int err = -ENOMEM;
+	struct svm_vm_data *vm_data;
+	struct page *avic_phy_ait_page;
+	struct page *avic_log_ait_page;
+
+	vm_data = kzalloc(sizeof(struct svm_vm_data),
+				      GFP_KERNEL);
+	if (!vm_data)
+		return err;
+
+	kvm->arch.arch_data = vm_data;
+	atomic_set(&vm_data->count, 0);
+
+	/* Allocating physical APIC ID table (4KB) */
+	avic_phy_ait_page = alloc_page(GFP_KERNEL);
+	if (!avic_phy_ait_page)
+		goto free_avic;
+
+	vm_data->avic_phy_ait_page = avic_phy_ait_page;
+	clear_page(page_address(avic_phy_ait_page));
+
+	/* Allocating logical APIC ID table (4KB) */
+	avic_log_ait_page = alloc_page(GFP_KERNEL);
+	if (!avic_log_ait_page)
+		goto free_avic;
+
+	vm_data->avic_log_ait_page = avic_log_ait_page;
+	clear_page(page_address(avic_log_ait_page));
+
+	vm_data->avic_tag = avic_get_next_tag();
+
+	return 0;
+
+free_avic:
+	avic_vm_uninit(kvm);
+	return err;
+}
+
+static int avic_vcpu_init(struct kvm *kvm, struct vcpu_svm *svm, int id)
+{
+	int err;
+	struct svm_vm_data *vm_data = NULL;
+
+	/* Note: svm_vm_data is per VM */
+	if (!kvm->arch.arch_data) {
+		err = avic_vm_init(kvm);
+		if (err)
+			return err;
+	}
+
+	err = avic_alloc_bk_page(svm, id);
+	if (err) {
+		avic_vcpu_uninit(&svm->vcpu);
+		return err;
+	}
+
+	vm_data = kvm->arch.arch_data;
+	atomic_inc(&vm_data->count);
+
+	return 0;
 }
 
 static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -1131,6 +1510,9 @@  static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 
 	kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
 	kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
+
+	if (avic && !init_event)
+		avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
 }
 
 static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
@@ -1169,6 +1551,12 @@  static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 	if (!hsave_page)
 		goto free_page3;
 
+	if (avic) {
+		err = avic_vcpu_init(kvm, svm, id);
+		if (err)
+			goto free_page4;
+	}
+
 	svm->nested.hsave = page_address(hsave_page);
 
 	svm->msrpm = page_address(msrpm_pages);
@@ -1187,6 +1575,8 @@  static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 
 	return &svm->vcpu;
 
+free_page4:
+	__free_page(hsave_page);
 free_page3:
 	__free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
 free_page2:
@@ -1209,6 +1599,7 @@  static void svm_free_vcpu(struct kvm_vcpu *vcpu)
 	__free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
 	__free_page(virt_to_page(svm->nested.hsave));
 	__free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
+	avic_vcpu_uninit(vcpu);
 	kvm_vcpu_uninit(vcpu);
 	kmem_cache_free(kvm_vcpu_cache, svm);
 }
@@ -3382,6 +3773,7 @@  static void dump_vmcb(struct kvm_vcpu *vcpu)
 	pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
 	pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
 	pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
+	pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
 	pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
 	pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
 	pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl);
@@ -3613,11 +4005,31 @@  static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
 
 static bool svm_get_enable_apicv(void)
 {
-	return false;
+	return avic;
+}
+
+static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
+{
 }
 
+static void svm_hwapic_isr_update(struct kvm *kvm, int isr)
+{
+}
+
+/* Note: Currently only used by Hyper-V. */
 static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
 {
+	struct vcpu_svm *svm = to_svm(vcpu);
+	struct vmcb *vmcb = svm->vmcb;
+
+	if (!avic)
+		return;
+
+	if (!svm->in_kernel_lapic_regs)
+		return;
+
+	svm->vcpu.arch.apic->regs = svm->in_kernel_lapic_regs;
+	vmcb->control.avic_enable = 0;
 }
 
 static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
@@ -4387,6 +4799,8 @@  static struct kvm_x86_ops svm_x86_ops = {
 	.refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
 	.load_eoi_exitmap = svm_load_eoi_exitmap,
 	.sync_pir_to_irr = svm_sync_pir_to_irr,
+	.hwapic_irr_update = svm_hwapic_irr_update,
+	.hwapic_isr_update = svm_hwapic_isr_update,
 
 	.set_tss_addr = svm_set_tss_addr,
 	.get_tdp_level = get_npt_level,