Message ID: 5c509f092ba61d4c0852ba57b530888ffb864ccb.1747537752.git.nicolinc@nvidia.com
State: New
Series: iommufd: Add vIOMMU infrastructure (Part-4 HW QUEUE)
On Sat, May 17, 2025 at 08:21:31PM -0700, Nicolin Chen wrote:
> +int iommufd_hw_queue_alloc_ioctl(struct iommufd_ucmd *ucmd)
> +{
> +        struct iommu_hw_queue_alloc *cmd = ucmd->cmd;
> +        struct iommufd_hw_queue *hw_queue;
> +        struct iommufd_viommu *viommu;
> +        struct page **pages;
> +        int max_npages, i;
> +        u64 last, offset;
> +        int rc;
> +
> +        if (cmd->flags || cmd->type == IOMMU_HW_QUEUE_TYPE_DEFAULT)
> +                return -EOPNOTSUPP;
> +        if (!cmd->nesting_parent_iova || !cmd->length)
> +                return -EINVAL;

0 is a legal nesting_parent_iova

> +        if (check_add_overflow(cmd->nesting_parent_iova, cmd->length - 1,
> +                               &last))
> +                return -EOVERFLOW;
> +
> +        viommu = iommufd_get_viommu(ucmd, cmd->viommu_id);
> +        if (IS_ERR(viommu))
> +                return PTR_ERR(viommu);
> +
> +        if (!viommu->ops || !viommu->ops->hw_queue_alloc) {
> +                rc = -EOPNOTSUPP;
> +                goto out_put_viommu;
> +        }
> +
> +        offset =
> +                cmd->nesting_parent_iova - PAGE_ALIGN(cmd->nesting_parent_iova);
> +        max_npages = DIV_ROUND_UP(offset + cmd->length, PAGE_SIZE);

This should probably be capped to PAGE_SIZE/sizeof(void *), return
EINVAL if not

And we don't need the allocation if we are not doing an access.

> +        hw_queue = viommu->ops->hw_queue_alloc(ucmd, viommu, cmd->type,
> +                                               cmd->index,
> +                                               cmd->nesting_parent_iova,
> +                                               cmd->length);
> +        if (IS_ERR(hw_queue)) {
> +                rc = PTR_ERR(hw_queue);
> +                goto out_unpin;
> +        }
> +
> +        /* The iommufd_hw_queue_alloc helper saves ictx in hw_queue->ictx */
> +        if (WARN_ON_ONCE(hw_queue->ictx != ucmd->ictx)) {
> +                rc = -EINVAL;
> +                goto out_unpin;
> +        }

There is another technique from RDMA which may actually be very
helpful here considering how things are getting split up..

Put a size_t in the driver ops:

 size_t size_viommu;
 size_t size_hw_queue;

Have the driver set it via a macro like INIT_RDMA_OBJ_SIZE

#define INIT_RDMA_OBJ_SIZE(ib_struct, drv_struct, member)                \
        .size_##ib_struct =                                              \
                (sizeof(struct drv_struct) +                             \
                 BUILD_BUG_ON_ZERO(offsetof(struct drv_struct, member)) +\
                 BUILD_BUG_ON_ZERO(                                      \
                         !__same_type(((struct drv_struct *)NULL)->member,\
                                      struct ib_struct)))

Which proves the core structure is at the front.

Then the core code can allocate the object along with enough space for
the driver and call a driver function to init the driver portion of
the already allocated object.

Then you don't need these dances where the driver helper has to do
things like set uctx, or the core structure is partially initialized:

> +        hw_queue->viommu = viommu;
> +        refcount_inc(&viommu->obj.users);
> +        hw_queue->length = cmd->length;
> +        hw_queue->base_addr = cmd->nesting_parent_iova;

When the driver is running, which can be a source of bugs.

This would be useful for the existing ops too.

May reduce the size of the linked in code.

Jason
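To make the suggested flow concrete, here is a minimal sketch of what the core-side allocation could look like, assuming a new size_hw_queue ops field (set via an INIT_RDMA_OBJ_SIZE-style macro) and a hw_queue_init op; the function name, the new ops members, and the bare kzalloc() are illustrative, not the posted code:

/*
 * Sketch only: one allocation covers the core struct plus the driver
 * tail, and the driver initializes only its portion of an object
 * whose core fields are already valid.
 */
static struct iommufd_hw_queue *
iommufd_hw_queue_alloc_core(struct iommufd_ucmd *ucmd,
                            struct iommufd_viommu *viommu,
                            struct iommu_hw_queue_alloc *cmd)
{
        struct iommufd_hw_queue *hw_queue;
        int rc;

        /* Driver-reported size; assumed new member on the viommu ops */
        hw_queue = kzalloc(viommu->ops->size_hw_queue, GFP_KERNEL);
        if (!hw_queue)
                return ERR_PTR(-ENOMEM);

        /* Core fields are fully set before the driver ever runs */
        hw_queue->ictx = ucmd->ictx;
        hw_queue->viommu = viommu;
        hw_queue->length = cmd->length;
        hw_queue->base_addr = cmd->nesting_parent_iova;

        /* Assumed new op: initialize the driver tail only */
        rc = viommu->ops->hw_queue_init(hw_queue, cmd->index);
        if (rc) {
                kfree(hw_queue);
                return ERR_PTR(rc);
        }
        return hw_queue;
}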
On Fri, May 30, 2025 at 01:14:55PM -0300, Jason Gunthorpe wrote:
> On Sat, May 17, 2025 at 08:21:31PM -0700, Nicolin Chen wrote:
> > +        offset =
> > +                cmd->nesting_parent_iova - PAGE_ALIGN(cmd->nesting_parent_iova);
> > +        max_npages = DIV_ROUND_UP(offset + cmd->length, PAGE_SIZE);
>
> This should probably be capped to PAGE_SIZE/sizeof(void *), return
> EINVAL if not

Hmm, mind elaborating where this PAGE_SIZE/sizeof comes from?

> > +        hw_queue = viommu->ops->hw_queue_alloc(ucmd, viommu, cmd->type,
> > +                                               cmd->index,
> > +                                               cmd->nesting_parent_iova,
> > +                                               cmd->length);
> > +        if (IS_ERR(hw_queue)) {
> > +                rc = PTR_ERR(hw_queue);
> > +                goto out_unpin;
> > +        }
> > +
> > +        /* The iommufd_hw_queue_alloc helper saves ictx in hw_queue->ictx */
> > +        if (WARN_ON_ONCE(hw_queue->ictx != ucmd->ictx)) {
> > +                rc = -EINVAL;
> > +                goto out_unpin;
> > +        }
>
> There is another technique from RDMA which may actually be very
> helpful here considering how things are getting split up..
>
> Put a size_t in the driver ops:
>
>  size_t size_viommu;
>  size_t size_hw_queue;
>
> Have the driver set it via a macro like INIT_RDMA_OBJ_SIZE
>
> #define INIT_RDMA_OBJ_SIZE(ib_struct, drv_struct, member)                \
>         .size_##ib_struct =                                              \
>                 (sizeof(struct drv_struct) +                             \
>                  BUILD_BUG_ON_ZERO(offsetof(struct drv_struct, member)) +\
>                  BUILD_BUG_ON_ZERO(                                      \
>                          !__same_type(((struct drv_struct *)NULL)->member,\
>                                       struct ib_struct)))
>
> Which proves the core structure is at the front.
>
> Then the core code can allocate the object along with enough space for
> the driver and call a driver function to init the driver portion of
> the already allocated object.

That's interesting! Then all "_alloc" ops would be "_init" instead.

> Then you don't need these dances where the driver helper has to do
> things like set uctx, or the core structure is partially initialized:
>
> > +        hw_queue->viommu = viommu;
> > +        refcount_inc(&viommu->obj.users);
> > +        hw_queue->length = cmd->length;
> > +        hw_queue->base_addr = cmd->nesting_parent_iova;
>
> When the driver is running, which can be a source of bugs.

Hmm, I don't quite follow the "bugs" here. Any example?

> This would be useful for the existing ops too.
>
> May reduce the size of the linked in code.

Yes! Haven't tried that yet, but sounds like we could move quite a few
things back to the private header. Perhaps a smaller cleanup series
first to the existing code.

Thanks!
Nicolin
On Fri, May 30, 2025 at 10:38:24AM -0700, Nicolin Chen wrote:
> On Fri, May 30, 2025 at 01:14:55PM -0300, Jason Gunthorpe wrote:
> > On Sat, May 17, 2025 at 08:21:31PM -0700, Nicolin Chen wrote:
> > > +        offset =
> > > +                cmd->nesting_parent_iova - PAGE_ALIGN(cmd->nesting_parent_iova);
> > > +        max_npages = DIV_ROUND_UP(offset + cmd->length, PAGE_SIZE);
> >
> > This should probably be capped to PAGE_SIZE/sizeof(void *), return
> > EINVAL if not
>
> Hmm, mind elaborating where this PAGE_SIZE/sizeof comes from?

We can usually allocate up to a PAGE_SIZE without too much
trouble. Beyond that it gets more likely to fail.

> > > +        hw_queue->viommu = viommu;
> > > +        refcount_inc(&viommu->obj.users);
> > > +        hw_queue->length = cmd->length;
> > > +        hw_queue->base_addr = cmd->nesting_parent_iova;
> >
> > When the driver is running, which can be a source of bugs.
>
> Hmm, I don't quite follow the "bugs" here. Any example?

Like if the driver thinks that hw_queue->length should be valid during
init, it turns out it isn't.

Jason
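In code, the suggested cap amounts to one bound check right after the max_npages calculation; a sketch against the posted function (the exact errno and placement are assumed):

/*
 * Keep the pages[] array within one page worth of pointers; a
 * kcalloc() larger than PAGE_SIZE is much more likely to fail under
 * memory fragmentation. Sketch of the suggested check, not the
 * posted code.
 */
if (max_npages > PAGE_SIZE / sizeof(*pages)) {
        rc = -EINVAL;
        goto out_put_viommu;
}

With 4K pages and 8-byte pointers this allows up to 512 entries, which is where the 2MB figure in the follow-up below comes from.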
On Fri, May 30, 2025 at 02:40:37PM -0300, Jason Gunthorpe wrote:
> On Fri, May 30, 2025 at 10:38:24AM -0700, Nicolin Chen wrote:
> > On Fri, May 30, 2025 at 01:14:55PM -0300, Jason Gunthorpe wrote:
> > > On Sat, May 17, 2025 at 08:21:31PM -0700, Nicolin Chen wrote:
> > > > +        offset =
> > > > +                cmd->nesting_parent_iova - PAGE_ALIGN(cmd->nesting_parent_iova);
> > > > +        max_npages = DIV_ROUND_UP(offset + cmd->length, PAGE_SIZE);
> > >
> > > This should probably be capped to PAGE_SIZE/sizeof(void *), return
> > > EINVAL if not
> >
> > Hmm, mind elaborating where this PAGE_SIZE/sizeof comes from?
>
> We can usually allocate up to a PAGE_SIZE without too much
> trouble. Beyond that it gets more likely to fail.

If PAGE_SIZE=4096, the upper limit for max_npages is 512, i.e. the
max size of a guest queue is 2MB? It seems to be too small, as the
VMM can use a larger huge page size to back the guest queue?

> > > > +        hw_queue->viommu = viommu;
> > > > +        refcount_inc(&viommu->obj.users);
> > > > +        hw_queue->length = cmd->length;
> > > > +        hw_queue->base_addr = cmd->nesting_parent_iova;
> > >
> > > When the driver is running, which can be a source of bugs.
> >
> > Hmm, I don't quite follow the "bugs" here. Any example?
>
> Like if the driver thinks that hw_queue->length should be valid during
> init, it turns out it isn't.

Ah, I see. Yes.

Nicolin
On Fri, May 30, 2025 at 11:23:02AM -0700, Nicolin Chen wrote:
> On Fri, May 30, 2025 at 02:40:37PM -0300, Jason Gunthorpe wrote:
> > On Fri, May 30, 2025 at 10:38:24AM -0700, Nicolin Chen wrote:
> > > On Fri, May 30, 2025 at 01:14:55PM -0300, Jason Gunthorpe wrote:
> > > > On Sat, May 17, 2025 at 08:21:31PM -0700, Nicolin Chen wrote:
> > > > > +        offset =
> > > > > +                cmd->nesting_parent_iova - PAGE_ALIGN(cmd->nesting_parent_iova);
> > > > > +        max_npages = DIV_ROUND_UP(offset + cmd->length, PAGE_SIZE);
> > > >
> > > > This should probably be capped to PAGE_SIZE/sizeof(void *), return
> > > > EINVAL if not
> > >
> > > Hmm, mind elaborating where this PAGE_SIZE/sizeof comes from?
> >
> > We can usually allocate up to a PAGE_SIZE without too much
> > trouble. Beyond that it gets more likely to fail.
>
> If PAGE_SIZE=4096, the upper limit for max_npages is 512, i.e. the
> max size of a guest queue is 2MB? It seems to be too small, as the
> VMM can use a larger huge page size to back the guest queue?

May need to make a new API that returns a bio_vec or something else
more efficient then :\

Jason
On Fri, May 30, 2025 at 03:25:19PM -0300, Jason Gunthorpe wrote:
> On Fri, May 30, 2025 at 11:23:02AM -0700, Nicolin Chen wrote:
> > On Fri, May 30, 2025 at 02:40:37PM -0300, Jason Gunthorpe wrote:
> > > On Fri, May 30, 2025 at 10:38:24AM -0700, Nicolin Chen wrote:
> > > > On Fri, May 30, 2025 at 01:14:55PM -0300, Jason Gunthorpe wrote:
> > > > > On Sat, May 17, 2025 at 08:21:31PM -0700, Nicolin Chen wrote:
> > > > > > +        offset =
> > > > > > +                cmd->nesting_parent_iova - PAGE_ALIGN(cmd->nesting_parent_iova);
> > > > > > +        max_npages = DIV_ROUND_UP(offset + cmd->length, PAGE_SIZE);
> > > > >
> > > > > This should probably be capped to PAGE_SIZE/sizeof(void *), return
> > > > > EINVAL if not
> > > >
> > > > Hmm, mind elaborating where this PAGE_SIZE/sizeof comes from?
> > >
> > > We can usually allocate up to a PAGE_SIZE without too much
> > > trouble. Beyond that it gets more likely to fail.
> >
> > If PAGE_SIZE=4096, the upper limit for max_npages is 512, i.e. the
> > max size of a guest queue is 2MB? It seems to be too small, as the
> > VMM can use a larger huge page size to back the guest queue?
>
> May need to make a new API that returns a bio_vec or something else
> more efficient then :\

Hmm, that sounds like a rabbit hole :-\

Let me leave a FIXME at this max_npages calculation instead..

Thanks
Nicolin
On Fri, May 30, 2025 at 01:14:55PM -0300, Jason Gunthorpe wrote:
> Put a size_t in the driver ops:
>
>  size_t size_viommu;
>  size_t size_hw_queue;
>
> Have the driver set it via a macro like INIT_RDMA_OBJ_SIZE
>
> #define INIT_RDMA_OBJ_SIZE(ib_struct, drv_struct, member)                \
>         .size_##ib_struct =                                              \
>                 (sizeof(struct drv_struct) +                             \
>                  BUILD_BUG_ON_ZERO(offsetof(struct drv_struct, member)) +\
>                  BUILD_BUG_ON_ZERO(                                      \
>                          !__same_type(((struct drv_struct *)NULL)->member,\
>                                       struct ib_struct)))
>
> Which proves the core structure is at the front.
>
> Then the core code can allocate the object along with enough space for
> the driver and call a driver function to init the driver portion of
> the already allocated object.

I found that the size_viommu or size_hw_queue might not work using
a static macro as that RDMA one does:

 - The size in the vIOMMU case is type dependent. E.g. the smmuv3
   driver uses one iommu_ops to support two types: vSMMU and vCMDQ

 - Changing to a type-indexed size array would eventually result in
   some driver having a big size array, as the type number grows

I came up with two alternatives:

1) Define a get_viommu_size(unsigned int type) op: use a similar
   macro in the driver function to return with:

   #define VIOMMU_STRUCT_SIZE(ib_struct, drv_struct, member)            \
           (sizeof(drv_struct) +                                        \
            BUILD_BUG_ON_ZERO(offsetof(drv_struct, member)) +           \
            BUILD_BUG_ON_ZERO(!__same_type(((drv_struct *)NULL)->member,\
                                           ib_struct)))

   if (type == SMMU)
           return VIOMMU_STRUCT_SIZE(
                   struct arm_vsmmu, struct iommufd_viommu, core);
   return 0;

2) Let core allocate with sizeof(struct iommufd_viommu), then let
   driver krealloc during the viommu_init op call:

   viommu = kzalloc(sizeof(struct iommufd_viommu), ...);
   ...
   viommu = ops->viommu_init(viommu, dev, parent_dom, type);

I am guessing that you may prefer 1 over 2? Or any better idea?

Thanks
Nicolin
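To sketch how the core side would consume option 1 (names assumed; viommu_init is taken to return an errno here, since the core would own the allocation):

struct iommufd_viommu *viommu;
size_t viommu_size;
int rc;

/* Driver reports its full object size for this type; 0 = unsupported */
viommu_size = ops->get_viommu_size(cmd->type);
if (!viommu_size)
        return -EOPNOTSUPP;

/* One allocation: core struct at the front, driver data behind it */
viommu = kzalloc(viommu_size, GFP_KERNEL);
if (!viommu)
        return -ENOMEM;

/* Core fields can be initialized before the driver sees the object */
viommu->ictx = ucmd->ictx;

rc = ops->viommu_init(viommu, dev, parent_dom, cmd->type);
if (rc)
        kfree(viommu);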
On Mon, Jun 02, 2025 at 10:41:05PM -0700, Nicolin Chen wrote:
> I found that the size_viommu or size_hw_queue might not work using
> a static macro as that RDMA one does:
>
> - The size in the vIOMMU case is type dependent. E.g. the smmuv3
>   driver uses one iommu_ops to support two types: vSMMU and vCMDQ

Maybe they can just be max()'d?

> 1) Define a get_viommu_size(unsigned int type) op: use a similar
>    macro in the driver function to return with:
>
>    #define VIOMMU_STRUCT_SIZE(ib_struct, drv_struct, member)            \
>            (sizeof(drv_struct) +                                        \
>             BUILD_BUG_ON_ZERO(offsetof(drv_struct, member)) +           \
>             BUILD_BUG_ON_ZERO(!__same_type(((drv_struct *)NULL)->member,\
>                                            ib_struct)))
>
>    if (type == SMMU)
>            return VIOMMU_STRUCT_SIZE(
>                    struct arm_vsmmu, struct iommufd_viommu, core);
>    return 0;

I guess this is best?

> 2) Let core allocate with sizeof(struct iommufd_viommu), then let
>    driver krealloc during the viommu_init op call:

No.. memmoving things like locks doesn't work.

Jason
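Spelling out why krealloc() in option 2 is unsafe: it is free to move the allocation, leaving stale any address already taken into the object. A minimal illustration with a hypothetical driver struct (the list-head field is only an example of core state initialized before driver init):

struct my_viommu {                      /* hypothetical driver object */
        struct iommufd_viommu core;     /* must stay at offset 0 */
        spinlock_t lock;
};

struct iommufd_viommu *old = kzalloc(sizeof(*old), GFP_KERNEL);

/* Core init takes addresses into the object, e.g. a list head... */
INIT_LIST_HEAD(&old->veventqs);

/* ...then the driver grows the object inside its viommu_init op: */
struct my_viommu *drv = krealloc(old, sizeof(*drv), GFP_KERNEL);
/*
 * If krealloc() moved the memory, anything pointing through the old
 * address (linked list entries, a lock class registered with lockdep)
 * now references freed memory. Hence the object must be allocated at
 * its final size up front.
 */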
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 35088478cb07..a94d04ab0d2c 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -603,6 +603,8 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd);
 void iommufd_viommu_destroy(struct iommufd_object *obj);
 int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd);
 void iommufd_vdevice_destroy(struct iommufd_object *obj);
+int iommufd_hw_queue_alloc_ioctl(struct iommufd_ucmd *ucmd);
+void iommufd_hw_queue_destroy(struct iommufd_object *obj);
 
 #ifdef CONFIG_IOMMUFD_TEST
 int iommufd_test(struct iommufd_ucmd *ucmd);
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 272da7324a2b..aeed0127553e 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -56,6 +56,7 @@ enum {
 	IOMMUFD_CMD_VDEVICE_ALLOC = 0x91,
 	IOMMUFD_CMD_IOAS_CHANGE_PROCESS = 0x92,
 	IOMMUFD_CMD_VEVENTQ_ALLOC = 0x93,
+	IOMMUFD_CMD_HW_QUEUE_ALLOC = 0x94,
 };
 
 /**
@@ -1147,4 +1148,48 @@ struct iommu_veventq_alloc {
 	__u32 __reserved;
 };
 #define IOMMU_VEVENTQ_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VEVENTQ_ALLOC)
+
+/**
+ * enum iommu_hw_queue_type - HW Queue Type
+ * @IOMMU_HW_QUEUE_TYPE_DEFAULT: Reserved for future use
+ */
+enum iommu_hw_queue_type {
+	IOMMU_HW_QUEUE_TYPE_DEFAULT = 0,
+};
+
+/**
+ * struct iommu_hw_queue_alloc - ioctl(IOMMU_HW_QUEUE_ALLOC)
+ * @size: sizeof(struct iommu_hw_queue_alloc)
+ * @flags: Must be 0
+ * @viommu_id: Virtual IOMMU ID to associate the HW queue with
+ * @type: One of enum iommu_hw_queue_type
+ * @index: The logical index to the HW queue per virtual IOMMU for a
+ *         multi-queue model
+ * @out_hw_queue_id: The ID of the new HW queue
+ * @nesting_parent_iova: Base address of the queue memory in the guest
+ *                       physical address space
+ * @length: Length of the queue memory
+ *
+ * Allocate a HW queue object for a vIOMMU-specific HW-accelerated queue,
+ * which allows HW to access a guest queue memory described using
+ * @nesting_parent_iova and @length.
+ *
+ * Upon success, the underlying physical pages of the guest queue memory
+ * will be pinned to prevent VMM from unmapping them in the IOAS until the
+ * HW queue gets destroyed.
+ *
+ * A vIOMMU can allocate multiple queues, but it must use a different
+ * @index to separate each allocation, e.g. HW queue0, HW queue1, ...
+ */
+struct iommu_hw_queue_alloc {
+	__u32 size;
+	__u32 flags;
+	__u32 viommu_id;
+	__u32 type;
+	__u32 index;
+	__u32 out_hw_queue_id;
+	__aligned_u64 nesting_parent_iova;
+	__aligned_u64 length;
+};
+#define IOMMU_HW_QUEUE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HW_QUEUE_ALLOC)
 #endif
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index c6d0b446e632..0750e740fa1d 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -304,6 +304,7 @@ union ucmd_buffer {
 	struct iommu_destroy destroy;
 	struct iommu_fault_alloc fault;
 	struct iommu_hw_info info;
+	struct iommu_hw_queue_alloc hw_queue;
 	struct iommu_hwpt_alloc hwpt;
 	struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap;
 	struct iommu_hwpt_invalidate cache;
@@ -346,6 +347,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
 		 struct iommu_fault_alloc, out_fault_fd),
 	IOCTL_OP(IOMMU_GET_HW_INFO, iommufd_get_hw_info, struct iommu_hw_info,
 		 __reserved),
+	IOCTL_OP(IOMMU_HW_QUEUE_ALLOC, iommufd_hw_queue_alloc_ioctl,
+		 struct iommu_hw_queue_alloc, length),
 	IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct iommu_hwpt_alloc,
 		 __reserved),
 	IOCTL_OP(IOMMU_HWPT_GET_DIRTY_BITMAP, iommufd_hwpt_get_dirty_bitmap,
@@ -509,6 +512,9 @@ static const struct iommufd_object_ops iommufd_object_ops[] = {
 	[IOMMUFD_OBJ_FAULT] = {
 		.destroy = iommufd_fault_destroy,
 	},
+	[IOMMUFD_OBJ_HW_QUEUE] = {
+		.destroy = iommufd_hw_queue_destroy,
+	},
 	[IOMMUFD_OBJ_HWPT_PAGING] = {
 		.destroy = iommufd_hwpt_paging_destroy,
 		.abort = iommufd_hwpt_paging_abort,
diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c
index 7248fb7c7baf..9b4e99babdb4 100644
--- a/drivers/iommu/iommufd/viommu.c
+++ b/drivers/iommu/iommufd/viommu.c
@@ -164,3 +164,108 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
 	iommufd_put_object(ucmd->ictx, &viommu->obj);
 	return rc;
 }
+
+void iommufd_hw_queue_destroy(struct iommufd_object *obj)
+{
+	struct iommufd_hw_queue *hw_queue =
+		container_of(obj, struct iommufd_hw_queue, obj);
+	struct iommufd_viommu *viommu = hw_queue->viommu;
+
+	if (viommu->ops->hw_queue_destroy)
+		viommu->ops->hw_queue_destroy(hw_queue);
+	iopt_unpin_pages(&viommu->hwpt->ioas->iopt, hw_queue->base_addr,
+			 hw_queue->length, true);
+	refcount_dec(&viommu->obj.users);
+}
+
+int iommufd_hw_queue_alloc_ioctl(struct iommufd_ucmd *ucmd)
+{
+	struct iommu_hw_queue_alloc *cmd = ucmd->cmd;
+	struct iommufd_hw_queue *hw_queue;
+	struct iommufd_viommu *viommu;
+	struct page **pages;
+	int max_npages, i;
+	u64 last, offset;
+	int rc;
+
+	if (cmd->flags || cmd->type == IOMMU_HW_QUEUE_TYPE_DEFAULT)
+		return -EOPNOTSUPP;
+	if (!cmd->nesting_parent_iova || !cmd->length)
+		return -EINVAL;
+	if (check_add_overflow(cmd->nesting_parent_iova, cmd->length - 1,
+			       &last))
+		return -EOVERFLOW;
+
+	viommu = iommufd_get_viommu(ucmd, cmd->viommu_id);
+	if (IS_ERR(viommu))
+		return PTR_ERR(viommu);
+
+	if (!viommu->ops || !viommu->ops->hw_queue_alloc) {
+		rc = -EOPNOTSUPP;
+		goto out_put_viommu;
+	}
+
+	offset =
+		cmd->nesting_parent_iova - PAGE_ALIGN(cmd->nesting_parent_iova);
+	max_npages = DIV_ROUND_UP(offset + cmd->length, PAGE_SIZE);
+	pages = kcalloc(max_npages, sizeof(*pages), GFP_KERNEL);
+	if (!pages) {
+		rc = -ENOMEM;
+		goto out_put_viommu;
+	}
+
+	/*
+	 * The underlying physical pages must be pinned to prevent them from
+	 * being unmapped (via IOMMUFD_CMD_IOAS_UNMAP) during the life cycle
+	 * of the HW QUEUE object. And the pages should be contiguous too.
+	 */
+	if (viommu->ops->flags & IOMMUFD_VIOMMU_FLAG_HW_QUEUE_READS_PA) {
+		rc = iopt_pin_pages(&viommu->hwpt->ioas->iopt,
+				    cmd->nesting_parent_iova, cmd->length,
+				    pages, 0, true);
+		if (rc)
+			goto out_free;
+
+		/* Validate if the underlying physical pages are contiguous */
+		for (i = 1; i < max_npages && pages[i]; i++) {
+			if (page_to_pfn(pages[i]) ==
+			    page_to_pfn(pages[i - 1]) + 1)
+				continue;
+			rc = -EFAULT;
+			goto out_unpin;
+		}
+	}
+
+	hw_queue = viommu->ops->hw_queue_alloc(ucmd, viommu, cmd->type,
+					       cmd->index,
+					       cmd->nesting_parent_iova,
+					       cmd->length);
+	if (IS_ERR(hw_queue)) {
+		rc = PTR_ERR(hw_queue);
+		goto out_unpin;
+	}
+
+	/* The iommufd_hw_queue_alloc helper saves ictx in hw_queue->ictx */
+	if (WARN_ON_ONCE(hw_queue->ictx != ucmd->ictx)) {
+		rc = -EINVAL;
+		goto out_unpin;
+	}
+
+	hw_queue->viommu = viommu;
+	refcount_inc(&viommu->obj.users);
+	hw_queue->length = cmd->length;
+	hw_queue->base_addr = cmd->nesting_parent_iova;
+	cmd->out_hw_queue_id = hw_queue->obj.id;
+	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+	goto out_put_viommu;
+
+out_unpin:
+	if (viommu->ops->flags & IOMMUFD_VIOMMU_FLAG_HW_QUEUE_READS_PA)
+		iopt_unpin_pages(&viommu->hwpt->ioas->iopt,
+				 cmd->nesting_parent_iova, cmd->length, true);
+out_free:
+	kfree(pages);
+out_put_viommu:
+	iommufd_put_object(ucmd->ictx, &viommu->obj);
+	return rc;
+}
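For completeness, a userspace caller of the new ioctl would look roughly like the sketch below. This is a hypothetical VMM-side helper, and the type value must be a driver-specific one, since IOMMU_HW_QUEUE_TYPE_DEFAULT is rejected by the posted code:

#include <sys/ioctl.h>
#include <linux/iommufd.h>

/* Hypothetical VMM-side wrapper around IOMMU_HW_QUEUE_ALLOC */
static int vmm_hw_queue_alloc(int iommufd, __u32 viommu_id, __u32 type,
                              __u32 index, __u64 base_iova, __u64 length,
                              __u32 *out_hw_queue_id)
{
        struct iommu_hw_queue_alloc cmd = {
                .size = sizeof(cmd),    /* flags left at 0, as required */
                .viommu_id = viommu_id,
                .type = type,           /* driver-specific queue type */
                .index = index,         /* logical queue index on the vIOMMU */
                .nesting_parent_iova = base_iova, /* guest PA of the queue */
                .length = length,
        };

        if (ioctl(iommufd, IOMMU_HW_QUEUE_ALLOC, &cmd))
                return -1;              /* errno carries the kernel rc */
        *out_hw_queue_id = cmd.out_hw_queue_id;
        return 0;
}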