
[v3,04/10] iommu/vt-d: Add helper to setup pasid nested translation

Message ID 20230511145110.27707-5-yi.l.liu@intel.com
State New
Series: Add Intel VT-d nested translation

Commit Message

Yi Liu May 11, 2023, 2:51 p.m. UTC
From: Lu Baolu <baolu.lu@linux.intel.com>

The configurations are passed in from the user when the user domain is
allocated. This helper interprets these configurations according to the
data structure defined in uapi/linux/iommufd.h. -EINVAL is returned if
any of the configurations is incompatible with the hardware capabilities;
the caller can then retry with another compatible user domain. The
encoding of the fields of each PASID entry is defined in Section 9.6 of
the VT-d spec.
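
To illustrate that contract, here is a minimal caller-side sketch. Only
intel_pasid_setup_nested() comes from this patch; the wrapper function
and the use of PASID_RID2PASID are illustrative assumptions, not part
of the series:

	/* Illustrative sketch only, not part of this patch. */
	static int example_attach_nested(struct intel_iommu *iommu,
					 struct device *dev,
					 struct dmar_domain *user_domain)
	{
		int ret;

		/* Set up nested translation for requests without PASID. */
		ret = intel_pasid_setup_nested(iommu, dev, PASID_RID2PASID,
					       user_domain);
		/*
		 * -EINVAL means the user-provided stage-1 configuration is
		 * incompatible with this IOMMU; userspace can allocate a
		 * compatible user domain and retry the attach.
		 */
		return ret;
	}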

Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
---
 drivers/iommu/intel/pasid.c | 151 ++++++++++++++++++++++++++++++++++++
 drivers/iommu/intel/pasid.h |   2 +
 2 files changed, 153 insertions(+)

Comments

Baolu Lu May 26, 2023, 4:16 a.m. UTC | #1
On 5/24/23 3:16 PM, Tian, Kevin wrote:
>> From: Yi Liu <yi.l.liu@intel.com>
>> Sent: Thursday, May 11, 2023 10:51 PM
>>
>> +
>> +/**
>> + * intel_pasid_setup_nested() - Set up PASID entry for nested translation.
>> + * This could be used for guest shared virtual address. In this case, the
>> + * first level page tables are used for GVA-GPA translation in the guest,
>> + * second level page tables are used for GPA-HPA translation.
> 
> it's not just for guest SVA. Actually in this series it's RID_PASID nested
> translation.

Yes.

>> + *
>> + * @iommu:      IOMMU which the device belong to
>> + * @dev:        Device to be set up for translation
>> + * @pasid:      PASID to be programmed in the device PASID table
>> + * @domain:     User domain nested on a s2 domain
> 
> "User stage-1 domain"

Yes.

>> + */
>> +int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev,
>> +			     u32 pasid, struct dmar_domain *domain)
>> +{
>> +	struct iommu_hwpt_intel_vtd *s1_cfg = &domain->s1_cfg;
>> +	pgd_t *s1_gpgd = (pgd_t *)(uintptr_t)domain->s1_pgtbl;
>> +	struct dmar_domain *s2_domain = domain->s2_domain;
>> +	u16 did = domain_id_iommu(domain, iommu);
>> +	struct dma_pte *pgd = s2_domain->pgd;
>> +	struct pasid_entry *pte;
>> +	int agaw;
>> +
>> +	if (!ecap_nest(iommu->ecap)) {
>> +		pr_err_ratelimited("%s: No nested translation support\n",
>> +				   iommu->name);
>> +		return -ENODEV;
>> +	}
>> +
>> +	/*
>> +	 * Sanity checking performed by caller to make sure address width
> 
> "by caller"? it's checked in this function.

This comment needs to be updated.

>> +	 * matching in two dimensions: CPU vs. IOMMU, guest vs. host.
>> +	 */
>> +	switch (s1_cfg->addr_width) {
>> +	case ADDR_WIDTH_4LEVEL:
>> +		break;
>> +#ifdef CONFIG_X86
>> +	case ADDR_WIDTH_5LEVEL:
>> +		if (!cpu_feature_enabled(X86_FEATURE_LA57) ||
>> +		    !cap_fl5lp_support(iommu->cap)) {
>> +			dev_err_ratelimited(dev,
>> +					    "5-level paging not supported\n");
>> +			return -EINVAL;
>> +		}
>> +		break;
>> +#endif
>> +	default:
>> +		dev_err_ratelimited(dev, "Invalid guest address width %d\n",
>> +				    s1_cfg->addr_width);
>> +		return -EINVAL;
>> +	}
>> +
>> +	if ((s1_cfg->flags & IOMMU_VTD_PGTBL_SRE) && !ecap_srs(iommu->ecap)) {
>> +		pr_err_ratelimited("No supervisor request support on %s\n",
>> +				   iommu->name);
>> +		return -EINVAL;
>> +	}
>> +
>> +	if ((s1_cfg->flags & IOMMU_VTD_PGTBL_EAFE) && !ecap_eafs(iommu->ecap)) {
>> +		pr_err_ratelimited("No extended access flag support on %s\n",
>> +				   iommu->name);
>> +		return -EINVAL;
>> +	}
>> +
>> +	/*
>> +	 * Memory type is only applicable to devices inside processor coherent
>> +	 * domain. Will add MTS support once coherent devices are available.
>> +	 */
>> +	if (s1_cfg->flags & IOMMU_VTD_PGTBL_MTS_MASK) {
>> +		pr_warn_ratelimited("No memory type support %s\n",
>> +				    iommu->name);
>> +		return -EINVAL;
>> +	}
> 
> If it's unsupported why exposing them in the uAPI at this point?

Agreed. We can remove this flag for now.

>> +
>> +	agaw = iommu_skip_agaw(s2_domain, iommu, &pgd);
>> +	if (agaw < 0) {
>> +		dev_err_ratelimited(dev, "Invalid domain page table\n");
>> +		return -EINVAL;
>> +	}
> 
> this looks problematic.
> 
> static inline int iommu_skip_agaw(struct dmar_domain *domain,
>                                    struct intel_iommu *iommu,
>                                    struct dma_pte **pgd)
> {
> 	int agaw;
> 
> 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
> 		*pgd = phys_to_virt(dma_pte_addr(*pgd));
> 		if (!dma_pte_present(*pgd))
> 			return -EINVAL;
> 	}
> 
> 	return agaw;
> }
> 
> why is it safe to change the pgd level of the s2 domain when it's used as
> the parent? this s2 pgtbl might be used by other devices behind
> other iommus which already map GPAs at a level which this
> iommu doesn't support...
> 
> shouldn't we simply fail it as another incompatible condition?

You are right. We can change it to this:

	if (domain->agaw > iommu->agaw)
		return -EINVAL;
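
A fuller sketch of how that check could sit in intel_pasid_setup_nested()
(illustrative; it assumes the s2 domain's agaw is both the value compared
here and the value later programmed via pasid_set_address_width()):

	/*
	 * Fail instead of walking the s2 pgd down to this IOMMU's level:
	 * the s2 page table may be shared with devices behind other
	 * IOMMUs and must not be reinterpreted at a shallower level.
	 */
	if (s2_domain->agaw > iommu->agaw) {
		dev_err_ratelimited(dev, "Incompatible s2 address width\n");
		return -EINVAL;
	}
	agaw = s2_domain->agaw;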

> 
>> +
>> +	/* First level PGD (in GPA) must be supported by the second level. */
>> +	if ((uintptr_t)s1_gpgd > (1ULL << s2_domain->gaw)) {
>> +		dev_err_ratelimited(dev,
>> +				    "Guest PGD %lx not supported, max %llx\n",
>> +				    (uintptr_t)s1_gpgd, s2_domain->max_addr);
>> +		return -EINVAL;
>> +	}
> 
> I'm not sure how useful this check is. Even if the pgd is sane the
> lower level PTEs could include unsupported GPA's. If a guest really
> doesn't want to follow the GPA restriction which vIOMMU reports,
> it can easily cause IOMMU fault in many ways.

You are right.

> Then why treating pgd specially?

I don't recall the reason for this check. Yi, any thoughts?

Best regards,
baolu
Yi Liu June 7, 2023, 8:34 a.m. UTC | #2
> From: Baolu Lu <baolu.lu@linux.intel.com>
> Sent: Friday, May 26, 2023 12:16 PM

> >> +
> >> +	/* First level PGD (in GPA) must be supported by the second level. */
> >> +	if ((uintptr_t)s1_gpgd > (1ULL << s2_domain->gaw)) {
> >> +		dev_err_ratelimited(dev,
> >> +				    "Guest PGD %lx not supported, max %llx\n",
> >> +				    (uintptr_t)s1_gpgd, s2_domain->max_addr);
> >> +		return -EINVAL;
> >> +	}
> >
> > I'm not sure how useful this check is. Even if the pgd is sane the
> > lower level PTEs could include unsupported GPA's. If a guest really
> > doesn't want to follow the GPA restriction which vIOMMU reports,
> > it can easily cause IOMMU fault in many ways.
> 
> You are right.
> 
> > Then why treating pgd specially?
> 
> > I don't recall the reason for this check. Yi, any thoughts?

I don't think there was any special reason, just to ensure that the page table
base address follows the GPA restriction. But as Kevin mentioned, hw can detect
it anyway. So, I think this check can be dropped.

Regards,
Yi Liu
Yi Liu June 8, 2023, 3:32 a.m. UTC | #3
> From: Liu, Yi L <yi.l.liu@intel.com>
> Sent: Wednesday, June 7, 2023 4:34 PM
> 
> > From: Baolu Lu <baolu.lu@linux.intel.com>
> > Sent: Friday, May 26, 2023 12:16 PM
> 
> > >> +
> > >> +	/* First level PGD (in GPA) must be supported by the second level. */
> > >> +	if ((uintptr_t)s1_gpgd > (1ULL << s2_domain->gaw)) {
> > >> +		dev_err_ratelimited(dev,
> > >> +				    "Guest PGD %lx not supported, max %llx\n",
> > >> +				    (uintptr_t)s1_gpgd, s2_domain->max_addr);
> > >> +		return -EINVAL;
> > >> +	}
> > >
> > > I'm not sure how useful this check is. Even if the pgd is sane the
> > > lower level PTEs could include unsupported GPA's. If a guest really
> > > doesn't want to follow the GPA restriction which vIOMMU reports,
> > > it can easily cause IOMMU fault in many ways.
> >
> > You are right.
> >
> > > Then why treating pgd specially?
> >
> > I don't recall the reason for this check. Yi, any thoughts?
> 
> I don't think there was any special reason, just to ensure that the page table
> base address follows the GPA restriction. But as Kevin mentioned, hw can detect
> it anyway. So, I think this check can be dropped.

Then we also need to remove the words below from the uapi header.

+ Hence the
+ * stage-1 page table base address value should not be higher than the
+ * maximum untranslated address of stage-2 page table.

Regards,
Yi Liu
Yi Liu June 8, 2023, 3:35 a.m. UTC | #4
> From: Baolu Lu <baolu.lu@linux.intel.com>
> Sent: Friday, May 26, 2023 12:16 PM
> On 5/24/23 3:16 PM, Tian, Kevin wrote:
> >> From: Yi Liu <yi.l.liu@intel.com>
> >> Sent: Thursday, May 11, 2023 10:51 PM
> >>
> >> +	/*
> >> +	 * Memory type is only applicable to devices inside processor coherent
> >> +	 * domain. Will add MTS support once coherent devices are available.
> >> +	 */
> >> +	if (s1_cfg->flags & IOMMU_VTD_PGTBL_MTS_MASK) {
> >> +		pr_warn_ratelimited("No memory type support %s\n",
> >> +				    iommu->name);
> >> +		return -EINVAL;
> >> +	}
> >
> > If it's unsupported why exposing them in the uAPI at this point?
> 
> Agreed. We can remove this flag for now.

So we shall remove the flags below from the uapi as well, right?

+#define IOMMU_VTD_PGTBL_MTS_MASK	(IOMMU_VTD_PGTBL_CD | \
+					 IOMMU_VTD_PGTBL_EMTE | \
+					 IOMMU_VTD_PGTBL_PCD | \
+					 IOMMU_VTD_PGTBL_PWT)

Regards,
Yi Liu
Baolu Lu June 8, 2023, 3:37 a.m. UTC | #5
On 6/8/23 11:35 AM, Liu, Yi L wrote:
>> From: Baolu Lu<baolu.lu@linux.intel.com>
>> Sent: Friday, May 26, 2023 12:16 PM
>> On 5/24/23 3:16 PM, Tian, Kevin wrote:
>>>> From: Yi Liu<yi.l.liu@intel.com>
>>>> Sent: Thursday, May 11, 2023 10:51 PM
>>>>
>>>> +	/*
>>>> +	 * Memory type is only applicable to devices inside processor coherent
>>>> +	 * domain. Will add MTS support once coherent devices are available.
>>>> +	 */
>>>> +	if (s1_cfg->flags & IOMMU_VTD_PGTBL_MTS_MASK) {
>>>> +		pr_warn_ratelimited("No memory type support %s\n",
>>>> +				    iommu->name);
>>>> +		return -EINVAL;
>>>> +	}
>>> If it's unsupported why exposing them in the uAPI at this point?
>> Agreed. We can remove this flag for now.
> So we shall remove the flags below from the uapi as well, right?
> 
> +#define IOMMU_VTD_PGTBL_MTS_MASK	(IOMMU_VTD_PGTBL_CD | \
> +					 IOMMU_VTD_PGTBL_EMTE | \
> +					 IOMMU_VTD_PGTBL_PCD | \
> +					 IOMMU_VTD_PGTBL_PWT)

I suppose so.

Best regards,
baolu

Patch

diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c
index c5d479770e12..ee4c7b69c6c6 100644
--- a/drivers/iommu/intel/pasid.c
+++ b/drivers/iommu/intel/pasid.c
@@ -21,6 +21,11 @@ 
 #include "iommu.h"
 #include "pasid.h"
 
+#define IOMMU_VTD_PGTBL_MTS_MASK	(IOMMU_VTD_PGTBL_CD | \
+					 IOMMU_VTD_PGTBL_EMTE | \
+					 IOMMU_VTD_PGTBL_PCD | \
+					 IOMMU_VTD_PGTBL_PWT)
+
 /*
  * Intel IOMMU system wide PASID name space:
  */
@@ -335,6 +340,15 @@  static inline void pasid_set_fault_enable(struct pasid_entry *pe)
 	pasid_set_bits(&pe->val[0], 1 << 1, 0);
 }
 
+/*
+ * Setup the SRE(Supervisor Request Enable) field (Bit 128) of a
+ * scalable mode PASID entry.
+ */
+static inline void pasid_set_sre(struct pasid_entry *pe)
+{
+	pasid_set_bits(&pe->val[2], 1 << 0, 1);
+}
+
 /*
  * Setup the WPE(Write Protect Enable) field (Bit 132) of a
  * scalable mode PASID entry.
@@ -402,6 +416,15 @@  pasid_set_flpm(struct pasid_entry *pe, u64 value)
 	pasid_set_bits(&pe->val[2], GENMASK_ULL(3, 2), value << 2);
 }
 
+/*
+ * Setup the Extended Access Flag Enable (EAFE) field (Bit 135)
+ * of a scalable mode PASID entry.
+ */
+static inline void pasid_set_eafe(struct pasid_entry *pe)
+{
+	pasid_set_bits(&pe->val[2], 1 << 7, 1 << 7);
+}
+
 static void
 pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu,
 				    u16 did, u32 pasid)
@@ -713,3 +736,131 @@  void intel_pasid_setup_page_snoop_control(struct intel_iommu *iommu,
 	if (!cap_caching_mode(iommu->cap))
 		devtlb_invalidation_with_pasid(iommu, dev, pasid);
 }
+
+/**
+ * intel_pasid_setup_nested() - Set up PASID entry for nested translation.
+ * This could be used for guest shared virtual address. In this case, the
+ * first level page tables are used for GVA-GPA translation in the guest,
+ * second level page tables are used for GPA-HPA translation.
+ *
+ * @iommu:      IOMMU which the device belongs to
+ * @dev:        Device to be set up for translation
+ * @pasid:      PASID to be programmed in the device PASID table
+ * @domain:     User domain nested on a s2 domain
+ */
+int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev,
+			     u32 pasid, struct dmar_domain *domain)
+{
+	struct iommu_hwpt_intel_vtd *s1_cfg = &domain->s1_cfg;
+	pgd_t *s1_gpgd = (pgd_t *)(uintptr_t)domain->s1_pgtbl;
+	struct dmar_domain *s2_domain = domain->s2_domain;
+	u16 did = domain_id_iommu(domain, iommu);
+	struct dma_pte *pgd = s2_domain->pgd;
+	struct pasid_entry *pte;
+	int agaw;
+
+	if (!ecap_nest(iommu->ecap)) {
+		pr_err_ratelimited("%s: No nested translation support\n",
+				   iommu->name);
+		return -ENODEV;
+	}
+
+	/*
+	 * Sanity checks below make sure the address width matches in two
+	 * dimensions: CPU vs. IOMMU, guest vs. host.
+	 */
+	switch (s1_cfg->addr_width) {
+	case ADDR_WIDTH_4LEVEL:
+		break;
+#ifdef CONFIG_X86
+	case ADDR_WIDTH_5LEVEL:
+		if (!cpu_feature_enabled(X86_FEATURE_LA57) ||
+		    !cap_fl5lp_support(iommu->cap)) {
+			dev_err_ratelimited(dev,
+					    "5-level paging not supported\n");
+			return -EINVAL;
+		}
+		break;
+#endif
+	default:
+		dev_err_ratelimited(dev, "Invalid guest address width %d\n",
+				    s1_cfg->addr_width);
+		return -EINVAL;
+	}
+
+	if ((s1_cfg->flags & IOMMU_VTD_PGTBL_SRE) && !ecap_srs(iommu->ecap)) {
+		pr_err_ratelimited("No supervisor request support on %s\n",
+				   iommu->name);
+		return -EINVAL;
+	}
+
+	if ((s1_cfg->flags & IOMMU_VTD_PGTBL_EAFE) && !ecap_eafs(iommu->ecap)) {
+		pr_err_ratelimited("No extended access flag support on %s\n",
+				   iommu->name);
+		return -EINVAL;
+	}
+
+	/*
+	 * Memory type is only applicable to devices inside processor coherent
+	 * domain. Will add MTS support once coherent devices are available.
+	 */
+	if (s1_cfg->flags & IOMMU_VTD_PGTBL_MTS_MASK) {
+		pr_warn_ratelimited("No memory type support %s\n",
+				    iommu->name);
+		return -EINVAL;
+	}
+
+	agaw = iommu_skip_agaw(s2_domain, iommu, &pgd);
+	if (agaw < 0) {
+		dev_err_ratelimited(dev, "Invalid domain page table\n");
+		return -EINVAL;
+	}
+
+	/* First level PGD (in GPA) must be supported by the second level. */
+	if ((uintptr_t)s1_gpgd > (1ULL << s2_domain->gaw)) {
+		dev_err_ratelimited(dev,
+				    "Guest PGD %lx not supported, max %llx\n",
+				    (uintptr_t)s1_gpgd, s2_domain->max_addr);
+		return -EINVAL;
+	}
+
+	spin_lock(&iommu->lock);
+	pte = intel_pasid_get_entry(dev, pasid);
+	if (!pte) {
+		spin_unlock(&iommu->lock);
+		return -ENODEV;
+	}
+	if (pasid_pte_is_present(pte)) {
+		spin_unlock(&iommu->lock);
+		return -EBUSY;
+	}
+
+	pasid_clear_entry(pte);
+
+	if (s1_cfg->addr_width == ADDR_WIDTH_5LEVEL)
+		pasid_set_flpm(pte, 1);
+
+	pasid_set_flptr(pte, (uintptr_t)s1_gpgd);
+
+	if (s1_cfg->flags & IOMMU_VTD_PGTBL_SRE) {
+		pasid_set_sre(pte);
+		if (s1_cfg->flags & IOMMU_VTD_PGTBL_WPE)
+			pasid_set_wpe(pte);
+	}
+
+	if (s1_cfg->flags & IOMMU_VTD_PGTBL_EAFE)
+		pasid_set_eafe(pte);
+
+	pasid_set_slptr(pte, virt_to_phys(pgd));
+	pasid_set_fault_enable(pte);
+	pasid_set_domain_id(pte, did);
+	pasid_set_address_width(pte, agaw);
+	pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
+	pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED);
+	pasid_set_present(pte);
+	spin_unlock(&iommu->lock);
+
+	pasid_flush_caches(iommu, pte, pasid, did);
+
+	return 0;
+}
diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h
index d6b7d21244b1..864b12848392 100644
--- a/drivers/iommu/intel/pasid.h
+++ b/drivers/iommu/intel/pasid.h
@@ -111,6 +111,8 @@  int intel_pasid_setup_second_level(struct intel_iommu *iommu,
 int intel_pasid_setup_pass_through(struct intel_iommu *iommu,
 				   struct dmar_domain *domain,
 				   struct device *dev, u32 pasid);
+int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev,
+			     u32 pasid, struct dmar_domain *domain);
 void intel_pasid_tear_down_entry(struct intel_iommu *iommu,
 				 struct device *dev, u32 pasid,
 				 bool fault_ignore);
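
For readers tracing the new pasid_set_sre()/pasid_set_eafe() helpers: they
build on pasid_set_bits(), which is defined earlier in pasid.c and is not
visible in this diff. A sketch of that helper (shown here for context, not
added by this patch); it is a masked read-modify-write on one 64-bit word
of the PASID entry:

	static inline void pasid_set_bits(u64 *ptr, u64 mask, u64 bits)
	{
		u64 old;

		old = READ_ONCE(*ptr);
		WRITE_ONCE(*ptr, (old & ~mask) | bits);
	}

Each pasid_entry is an array of 64-bit words (pe->val[]), so entry bit 135
lands in val[2] (which covers entry bits 128-191) at bit offset 7, hence
pasid_set_eafe() passing mask and value 1 << 7, while pasid_set_sre() uses
bit 0 of val[2] for entry bit 128.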