[v5,10/29] iommufd: Abstract iopt_pin_pages and iopt_unpin_pages helpers

Message ID 49f7143c1b513049fd8158278a11d9f8b6c837d3.1747537752.git.nicolinc@nvidia.com
State New
Series iommufd: Add vIOMMU infrastructure (Part-4 HW QUEUE)

Commit Message

Nicolin Chen May 18, 2025, 3:21 a.m. UTC
A new HW QUEUE object will be added to allow HW to access a guest queue
for the HW-accelerated virtualization feature. Some HW QUEUEs are designed
to access the guest queue via a host physical address, without doing a
translation through the nesting parent IO page table, while others can use
the guest physical address. For the former case, the kernel, working with
a VMM, needs to pin the physical pages backing the guest memory, both to
lock them while the HW QUEUE is accessing them and to ensure that those
physical pages are contiguous in the physical address space.

This is very similar to the existing iommufd_access_pin_pages(), which
outputs the pinned page list for the caller to test its contiguity.
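
For illustration, a HW QUEUE caller could use the new helpers roughly as
below. This is a hypothetical sketch, not part of this patch: the function
name is made up, iova/length are assumed to be page-aligned, and passing
is_owner=true for the HW QUEUE case is an assumption.

	/* Hypothetical: pin a guest queue and verify physical contiguity */
	static int hw_queue_pin_contig(struct io_pagetable *iopt,
				       unsigned long iova,
				       unsigned long length,
				       phys_addr_t *out_pa)
	{
		size_t npages = DIV_ROUND_UP(length, PAGE_SIZE);
		struct page **pages;
		size_t i;
		int rc;

		pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
		if (!pages)
			return -ENOMEM;

		rc = iopt_pin_pages(iopt, iova, length, pages,
				    IOMMUFD_ACCESS_RW_WRITE, true);
		if (rc)
			goto out_free;

		/* Reject the queue unless all pinned pages are contiguous */
		for (i = 1; i < npages; i++) {
			if (page_to_pfn(pages[i]) !=
			    page_to_pfn(pages[i - 1]) + 1) {
				iopt_unpin_pages(iopt, iova, length, true);
				rc = -EINVAL;
				goto out_free;
			}
		}
		*out_pa = page_to_phys(pages[0]);
	out_free:
		/* Only the array is freed; a successful pin is held until
		 * iopt_unpin_pages()
		 */
		kfree(pages);
		return rc;
	}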

Move that code out of iommufd_access_pin/unpin_pages() and the related
functions into a pair of iopt helpers that can be shared with the HW QUEUE
allocator.

Rename check_area_prot() to align with the existing iopt_area helpers, and
inline it in the header since iommufd_access_rw() still uses it.

Reviewed-by: Pranjal Shrivastava <praan@google.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
 drivers/iommu/iommufd/io_pagetable.h    |   8 ++
 drivers/iommu/iommufd/iommufd_private.h |   6 ++
 drivers/iommu/iommufd/device.c          | 119 ++----------------------
 drivers/iommu/iommufd/io_pagetable.c    |  97 +++++++++++++++++++
 4 files changed, 119 insertions(+), 111 deletions(-)

Comments

Nicolin Chen June 5, 2025, 4:11 a.m. UTC | #1
On Wed, May 28, 2025 at 02:17:54PM -0300, Jason Gunthorpe wrote:
> On Sat, May 17, 2025 at 08:21:27PM -0700, Nicolin Chen wrote:
> > A new HW QUEUE object will be added to allow HW to access a guest queue
> > for the HW-accelerated virtualization feature. Some HW QUEUEs are designed
> > to access the guest queue via a host physical address, without doing a
> > translation through the nesting parent IO page table, while others can use
> > the guest physical address. For the former case, the kernel, working with
> > a VMM, needs to pin the physical pages backing the guest memory, both to
> > lock them while the HW QUEUE is accessing them and to ensure that those
> > physical pages are contiguous in the physical address space.
> > 
> > This is very similar to the existing iommufd_access_pin_pages(), which
> > outputs the pinned page list for the caller to test its contiguity.
> > 
> > Move that code out of iommufd_access_pin/unpin_pages() and the related
> > functions into a pair of iopt helpers that can be shared with the HW QUEUE
> > allocator.
> > 
> > Rename check_area_prot() to align with the existing iopt_area helpers, and
> > inline it in the header since iommufd_access_rw() still uses it.
> > 
> > Reviewed-by: Pranjal Shrivastava <praan@google.com>
> > Reviewed-by: Kevin Tian <kevin.tian@intel.com>
> > Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
> > Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> > ---
> >  drivers/iommu/iommufd/io_pagetable.h    |   8 ++
> >  drivers/iommu/iommufd/iommufd_private.h |   6 ++
> >  drivers/iommu/iommufd/device.c          | 119 ++----------------------
> >  drivers/iommu/iommufd/io_pagetable.c    |  97 +++++++++++++++++++
> >  4 files changed, 119 insertions(+), 111 deletions(-)
> 
> And if you do what was suggested do we need this patch at all? Just
> use the normal access sequence:
> 
>  iommufd_access_create(ops=NULL)
>  iommufd_access_attach(viommu->hwpt->ioas)
>  iommufd_access_pin_pages()
> 
> And store a viommu->access pointer to undo it all.

I found that the entire ictx would be held by iommufd_access_create(),
so the release fop couldn't even get invoked to destroy objects.

I added a new flag to address this:
-----------------------------------------------------------------
diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
index f25e272ae378c..a3e0ace583a66 100644
--- a/drivers/iommu/iommufd/device.c
+++ b/drivers/iommu/iommufd/device.c
@@ -1085,7 +1085,8 @@ void iommufd_access_destroy_object(struct iommufd_object *obj)
        if (access->ioas)
                WARN_ON(iommufd_access_change_ioas(access, NULL));
        mutex_unlock(&access->ioas_lock);
-       iommufd_ctx_put(access->ictx);
+       if (!access->ops->internal_use)
+               iommufd_ctx_put(access->ictx);
 }
 
 /**
@@ -1126,7 +1127,8 @@ iommufd_access_create(struct iommufd_ctx *ictx,
        /* The calling driver is a user until iommufd_access_destroy() */
        refcount_inc(&access->obj.users);
        access->ictx = ictx;
-       iommufd_ctx_get(ictx);
+       if (!ops->internal_use)
+               iommufd_ctx_get(ictx);
        iommufd_object_finalize(ictx, &access->obj);
        *id = access->obj.id;
        mutex_init(&access->ioas_lock);
-----------------------------------------------------------------

Btw, I think we can still have an ops, just leaving unmap set to NULL:
 static const struct iommufd_access_ops hw_queue_access_ops = {
        .needs_pin_pages = 1,
+       .internal_use = 1,
        /* NULL unmap to reject IOMMUFD_CMD_IOAS_UNMAP */
 };

Having two flags makes the code slightly more readable. After all,
HW queue does need to pin pages.

Thanks
Nicolin
Jason Gunthorpe June 5, 2025, 3:16 p.m. UTC | #2
On Wed, Jun 04, 2025 at 09:11:07PM -0700, Nicolin Chen wrote:

> I found that the entire ictx would be held by iommufd_access_create(),
> so the release fop couldn't even get invoked to destroy objects.

Yes, that makes sense..

It looks to me like you can safely leave ictx as NULL instead of
adding a flag? That would be nicer than leaving an unrefcounted
pointer floating around..

Jason
Nicolin Chen June 5, 2025, 5:04 p.m. UTC | #3
On Thu, Jun 05, 2025 at 12:16:48PM -0300, Jason Gunthorpe wrote:
> On Wed, Jun 04, 2025 at 09:11:07PM -0700, Nicolin Chen wrote:
> 
> > I found that the entire ictx would be held by iommufd_access_create(),
> > so the release fop couldn't even get invoked to destroy objects.
> 
> Yes, that makes sense..
> 
> It looks to me like you can safely leave ictx as NULL instead of
> adding a flag? That would be nicer than leaving an unrefcounted
> pointer floating around..

Hmm, there are a few iommufd_get_object calls using access->ictx
in iommufd_access_attach() and iommufd_access_destroy().

We could have a set of internal access APIs that leave access->ictx
as NULL, since an internal caller has an ictx to pass in. It's going
to be a larger change, though.
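
Something like this, perhaps (a hypothetical sketch; the names are
made up):

-----------------------------------------------------------------
struct iommufd_access *
iommufd_access_create_internal(struct iommufd_ctx *ictx);
void iommufd_access_destroy_internal(struct iommufd_ctx *ictx,
				     struct iommufd_access *access);
int iommufd_access_attach_internal(struct iommufd_access *access,
				   struct iommufd_ioas *ioas);
-----------------------------------------------------------------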

Thanks
Nicolin
Jason Gunthorpe June 5, 2025, 7:40 p.m. UTC | #4
On Thu, Jun 05, 2025 at 10:04:35AM -0700, Nicolin Chen wrote:
> On Thu, Jun 05, 2025 at 12:16:48PM -0300, Jason Gunthorpe wrote:
> > On Wed, Jun 04, 2025 at 09:11:07PM -0700, Nicolin Chen wrote:
> > 
> > > I found that the entire ictx would be held by iommufd_access_create(),
> > > so the release fop couldn't even get invoked to destroy objects.
> > 
> > Yes, that makes sense..
> > 
> > It looks to me like you can safely leave ictx as NULL instead of
> > adding a flag? That would be nicer than leaving an unrefcounted
> > pointer floating around..
> 
> Hmm, there are a few iommufd_get_object calls using access->ictx
> in iommufd_access_attach() and iommufd_access_destroy().

I counted:

iommufd_access_change_ioas_id
 * Don't call this
iommufd_access_destroy_object
 * Don't put if null
iommufd_access_create
 * Don't set it
iommufd_access_destroy
 * Call iommufd_object_destroy_user directly
iommufd_access_notify_unmap
 * Check for null access->ops->unmap and skip the lock_obj/put_obj
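
e.g. for iommufd_access_destroy_object, something like this (an
untested sketch of the "don't put if null" item, based on the hunk
quoted earlier in the thread):

-----------------------------------------------------------------
@@ iommufd_access_destroy_object @@
        mutex_unlock(&access->ioas_lock);
-       iommufd_ctx_put(access->ictx);
+       if (access->ictx)
+               iommufd_ctx_put(access->ictx);
-----------------------------------------------------------------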

Jason

Patch

diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h
index 2f2a47a2f9ee..e04dcc79d309 100644
--- a/drivers/iommu/iommufd/io_pagetable.h
+++ b/drivers/iommu/iommufd/io_pagetable.h
@@ -115,6 +115,14 @@  static inline unsigned long iopt_area_iova_to_index(struct iopt_area *area,
 	return iopt_area_start_byte(area, iova) / PAGE_SIZE;
 }
 
+static inline bool iopt_area_check_prot(struct iopt_area *area,
+					unsigned int flags)
+{
+	if (flags & IOMMUFD_ACCESS_RW_WRITE)
+		return area->iommu_prot & IOMMU_WRITE;
+	return area->iommu_prot & IOMMU_READ;
+}
+
 #define __make_iopt_iter(name)                                                 \
 	static inline struct iopt_##name *iopt_##name##_iter_first(            \
 		struct io_pagetable *iopt, unsigned long start,                \
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 16767c231580..cef3e0e0bbb2 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -130,6 +130,12 @@  int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas,
 void iopt_enable_large_pages(struct io_pagetable *iopt);
 int iopt_disable_large_pages(struct io_pagetable *iopt);
 
+int iopt_pin_pages(struct io_pagetable *iopt, unsigned long iova,
+		   unsigned long length, struct page **out_pages,
+		   unsigned int flags, bool is_owner);
+void iopt_unpin_pages(struct io_pagetable *iopt, unsigned long iova,
+		      unsigned long length, bool is_owner);
+
 struct iommufd_ucmd {
 	struct iommufd_ctx *ictx;
 	void __user *ubuffer;
diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
index 2b1ed5c8dc5b..0f2bda9e9d84 100644
--- a/drivers/iommu/iommufd/device.c
+++ b/drivers/iommu/iommufd/device.c
@@ -1239,59 +1239,17 @@  void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova,
 void iommufd_access_unpin_pages(struct iommufd_access *access,
 				unsigned long iova, unsigned long length)
 {
-	struct iopt_area_contig_iter iter;
-	struct io_pagetable *iopt;
-	unsigned long last_iova;
-	struct iopt_area *area;
-
-	if (WARN_ON(!length) ||
-	    WARN_ON(check_add_overflow(iova, length - 1, &last_iova)))
-		return;
-
-	mutex_lock(&access->ioas_lock);
+	guard(mutex)(&access->ioas_lock);
 	/*
 	 * The driver must be doing something wrong if it calls this before an
 	 * iommufd_access_attach() or after an iommufd_access_detach().
 	 */
-	if (WARN_ON(!access->ioas_unpin)) {
-		mutex_unlock(&access->ioas_lock);
+	if (WARN_ON(!access->ioas_unpin))
 		return;
-	}
-	iopt = &access->ioas_unpin->iopt;
-
-	down_read(&iopt->iova_rwsem);
-	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova)
-		iopt_area_remove_access(
-			area, iopt_area_iova_to_index(area, iter.cur_iova),
-			iopt_area_iova_to_index(
-				area,
-				min(last_iova, iopt_area_last_iova(area))),
-			false);
-	WARN_ON(!iopt_area_contig_done(&iter));
-	up_read(&iopt->iova_rwsem);
-	mutex_unlock(&access->ioas_lock);
+	iopt_unpin_pages(&access->ioas_unpin->iopt, iova, length, false);
 }
 EXPORT_SYMBOL_NS_GPL(iommufd_access_unpin_pages, "IOMMUFD");
 
-static bool iopt_area_contig_is_aligned(struct iopt_area_contig_iter *iter)
-{
-	if (iopt_area_start_byte(iter->area, iter->cur_iova) % PAGE_SIZE)
-		return false;
-
-	if (!iopt_area_contig_done(iter) &&
-	    (iopt_area_start_byte(iter->area, iopt_area_last_iova(iter->area)) %
-	     PAGE_SIZE) != (PAGE_SIZE - 1))
-		return false;
-	return true;
-}
-
-static bool check_area_prot(struct iopt_area *area, unsigned int flags)
-{
-	if (flags & IOMMUFD_ACCESS_RW_WRITE)
-		return area->iommu_prot & IOMMU_WRITE;
-	return area->iommu_prot & IOMMU_READ;
-}
-
 /**
  * iommufd_access_pin_pages() - Return a list of pages under the iova
  * @access: IOAS access to act on
@@ -1315,77 +1273,16 @@  int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova,
 			     unsigned long length, struct page **out_pages,
 			     unsigned int flags)
 {
-	struct iopt_area_contig_iter iter;
-	struct io_pagetable *iopt;
-	unsigned long last_iova;
-	struct iopt_area *area;
-	int rc;
-
 	/* Driver's ops don't support pin_pages */
 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
 	    WARN_ON(access->iova_alignment != PAGE_SIZE || !access->ops->unmap))
 		return -EINVAL;
 
-	if (!length)
-		return -EINVAL;
-	if (check_add_overflow(iova, length - 1, &last_iova))
-		return -EOVERFLOW;
-
-	mutex_lock(&access->ioas_lock);
-	if (!access->ioas) {
-		mutex_unlock(&access->ioas_lock);
+	guard(mutex)(&access->ioas_lock);
+	if (!access->ioas)
 		return -ENOENT;
-	}
-	iopt = &access->ioas->iopt;
-
-	down_read(&iopt->iova_rwsem);
-	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
-		unsigned long last = min(last_iova, iopt_area_last_iova(area));
-		unsigned long last_index = iopt_area_iova_to_index(area, last);
-		unsigned long index =
-			iopt_area_iova_to_index(area, iter.cur_iova);
-
-		if (area->prevent_access ||
-		    !iopt_area_contig_is_aligned(&iter)) {
-			rc = -EINVAL;
-			goto err_remove;
-		}
-
-		if (!check_area_prot(area, flags)) {
-			rc = -EPERM;
-			goto err_remove;
-		}
-
-		rc = iopt_area_add_access(area, index, last_index, out_pages,
-					  flags, false);
-		if (rc)
-			goto err_remove;
-		out_pages += last_index - index + 1;
-	}
-	if (!iopt_area_contig_done(&iter)) {
-		rc = -ENOENT;
-		goto err_remove;
-	}
-
-	up_read(&iopt->iova_rwsem);
-	mutex_unlock(&access->ioas_lock);
-	return 0;
-
-err_remove:
-	if (iova < iter.cur_iova) {
-		last_iova = iter.cur_iova - 1;
-		iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova)
-			iopt_area_remove_access(
-				area,
-				iopt_area_iova_to_index(area, iter.cur_iova),
-				iopt_area_iova_to_index(
-					area, min(last_iova,
-						  iopt_area_last_iova(area))),
-				false);
-	}
-	up_read(&iopt->iova_rwsem);
-	mutex_unlock(&access->ioas_lock);
-	return rc;
+	return iopt_pin_pages(&access->ioas->iopt, iova, length, out_pages,
+			      flags, false);
 }
 EXPORT_SYMBOL_NS_GPL(iommufd_access_pin_pages, "IOMMUFD");
 
@@ -1432,7 +1329,7 @@  int iommufd_access_rw(struct iommufd_access *access, unsigned long iova,
 			goto err_out;
 		}
 
-		if (!check_area_prot(area, flags)) {
+		if (!iopt_area_check_prot(area, flags)) {
 			rc = -EPERM;
 			goto err_out;
 		}
diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
index 51efebb1c6ed..3b164d364e53 100644
--- a/drivers/iommu/iommufd/io_pagetable.c
+++ b/drivers/iommu/iommufd/io_pagetable.c
@@ -1477,3 +1477,100 @@  int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt,
 	up_write(&iopt->iova_rwsem);
 	return rc;
 }
+
+static bool iopt_area_contig_is_aligned(struct iopt_area_contig_iter *iter)
+{
+	if (iopt_area_start_byte(iter->area, iter->cur_iova) % PAGE_SIZE)
+		return false;
+
+	if (!iopt_area_contig_done(iter) &&
+	    (iopt_area_start_byte(iter->area, iopt_area_last_iova(iter->area)) %
+	     PAGE_SIZE) != (PAGE_SIZE - 1))
+		return false;
+	return true;
+}
+
+int iopt_pin_pages(struct io_pagetable *iopt, unsigned long iova,
+		   unsigned long length, struct page **out_pages,
+		   unsigned int flags, bool is_owner)
+{
+	struct iopt_area_contig_iter iter;
+	unsigned long last_iova;
+	struct iopt_area *area;
+	int rc;
+
+	if (!length)
+		return -EINVAL;
+	if (check_add_overflow(iova, length - 1, &last_iova))
+		return -EOVERFLOW;
+
+	down_read(&iopt->iova_rwsem);
+	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
+		unsigned long last = min(last_iova, iopt_area_last_iova(area));
+		unsigned long last_index = iopt_area_iova_to_index(area, last);
+		unsigned long index =
+			iopt_area_iova_to_index(area, iter.cur_iova);
+
+		if (area->prevent_access ||
+		    !iopt_area_contig_is_aligned(&iter)) {
+			rc = -EINVAL;
+			goto err_remove;
+		}
+
+		if (!iopt_area_check_prot(area, flags)) {
+			rc = -EPERM;
+			goto err_remove;
+		}
+
+		rc = iopt_area_add_access(area, index, last_index, out_pages,
+					  flags, is_owner);
+		if (rc)
+			goto err_remove;
+		out_pages += last_index - index + 1;
+	}
+	if (!iopt_area_contig_done(&iter)) {
+		rc = -ENOENT;
+		goto err_remove;
+	}
+
+	up_read(&iopt->iova_rwsem);
+	return 0;
+
+err_remove:
+	if (iova < iter.cur_iova) {
+		last_iova = iter.cur_iova - 1;
+		iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova)
+			iopt_area_remove_access(
+				area,
+				iopt_area_iova_to_index(area, iter.cur_iova),
+				iopt_area_iova_to_index(
+					area, min(last_iova,
+						  iopt_area_last_iova(area))),
+				is_owner);
+	}
+	up_read(&iopt->iova_rwsem);
+	return rc;
+}
+
+void iopt_unpin_pages(struct io_pagetable *iopt, unsigned long iova,
+		      unsigned long length, bool is_owner)
+{
+	struct iopt_area_contig_iter iter;
+	unsigned long last_iova;
+	struct iopt_area *area;
+
+	if (WARN_ON(!length) ||
+	    WARN_ON(check_add_overflow(iova, length - 1, &last_iova)))
+		return;
+
+	down_read(&iopt->iova_rwsem);
+	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova)
+		iopt_area_remove_access(
+			area, iopt_area_iova_to_index(area, iter.cur_iova),
+			iopt_area_iova_to_index(
+				area,
+				min(last_iova, iopt_area_last_iova(area))),
+			is_owner);
+	WARN_ON(!iopt_area_contig_done(&iter));
+	up_read(&iopt->iova_rwsem);
+}