Message ID | 11-v5-4001c2997bd0+30c-iommufd_jgg@nvidia.com |
---|---|
State | Accepted |
Commit | aad37e71d5c4dc1d3c25734f0bcd51c324f94b5e |
Headers | show |
Series | IOMMUFD Generic interface | expand |
Hi Jason, On 11/16/22 22:00, Jason Gunthorpe wrote: > Connect the IOAS to its IOCTL interface. This exposes most of the > functionality in the io_pagetable to userspace. > > This is intended to be the core of the generic interface that IOMMUFD will > provide. Every IOMMU driver should be able to implement an iommu_domain > that is compatible with this generic mechanism. > > It is also designed to be easy to use for simple non virtual machine > monitor users, like DPDK: > - Universal simple support for all IOMMUs (no PPC special path) > - An IOVA allocator that considers the aperture and the allowed/reserved > ranges > - io_pagetable allows any number of iommu_domains to be connected to the > IOAS > - Automatic allocation and re-use of iommu_domains > > Along with room in the design to add non-generic features to cater to > specific HW functionality. > > Tested-by: Nicolin Chen <nicolinc@nvidia.com> > Tested-by: Yi Liu <yi.l.liu@intel.com> > Tested-by: Lixiao Yang <lixiao.yang@intel.com> > Tested-by: Matthew Rosato <mjrosato@linux.ibm.com> > Reviewed-by: Kevin Tian <kevin.tian@intel.com> > Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> > Signed-off-by: Nicolin Chen <nicolinc@nvidia.com> > --- > drivers/iommu/iommufd/Makefile | 1 + > drivers/iommu/iommufd/ioas.c | 384 ++++++++++++++++++++++++ > drivers/iommu/iommufd/iommufd_private.h | 33 ++ > drivers/iommu/iommufd/main.c | 45 +++ > include/uapi/linux/iommufd.h | 246 ++++++++++++++- > 5 files changed, 708 insertions(+), 1 deletion(-) > create mode 100644 drivers/iommu/iommufd/ioas.c > > diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile > index b66a8c47ff55ec..2b4f36f1b72f9d 100644 > --- a/drivers/iommu/iommufd/Makefile > +++ b/drivers/iommu/iommufd/Makefile > @@ -1,6 +1,7 @@ > # SPDX-License-Identifier: GPL-2.0-only > iommufd-y := \ > io_pagetable.o \ > + ioas.o \ > main.o \ > pages.o > > diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c > new file mode 100644 > 
index 00000000000000..7671456e86413a > --- /dev/null > +++ b/drivers/iommu/iommufd/ioas.c > @@ -0,0 +1,384 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +/* > + * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES > + */ > +#include <linux/interval_tree.h> > +#include <linux/iommufd.h> > +#include <linux/iommu.h> > +#include <uapi/linux/iommufd.h> > + > +#include "io_pagetable.h" > + > +void iommufd_ioas_destroy(struct iommufd_object *obj) > +{ > + struct iommufd_ioas *ioas = container_of(obj, struct iommufd_ioas, obj); > + int rc; > + > + rc = iopt_unmap_all(&ioas->iopt, NULL); > + WARN_ON(rc && rc != -ENOENT); > + iopt_destroy_table(&ioas->iopt); > +} > + > +struct iommufd_ioas *iommufd_ioas_alloc(struct iommufd_ctx *ictx) > +{ > + struct iommufd_ioas *ioas; > + > + ioas = iommufd_object_alloc(ictx, ioas, IOMMUFD_OBJ_IOAS); > + if (IS_ERR(ioas)) > + return ioas; > + > + iopt_init_table(&ioas->iopt); > + return ioas; > +} > + > +int iommufd_ioas_alloc_ioctl(struct iommufd_ucmd *ucmd) > +{ > + struct iommu_ioas_alloc *cmd = ucmd->cmd; > + struct iommufd_ioas *ioas; > + int rc; > + > + if (cmd->flags) > + return -EOPNOTSUPP; > + > + ioas = iommufd_ioas_alloc(ucmd->ictx); > + if (IS_ERR(ioas)) > + return PTR_ERR(ioas); > + > + cmd->out_ioas_id = ioas->obj.id; > + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); > + if (rc) > + goto out_table; > + iommufd_object_finalize(ucmd->ictx, &ioas->obj); > + return 0; > + > +out_table: > + iommufd_object_abort_and_destroy(ucmd->ictx, &ioas->obj); > + return rc; > +} > + > +int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd) > +{ > + struct iommu_iova_range __user *ranges; > + struct iommu_ioas_iova_ranges *cmd = ucmd->cmd; > + struct iommufd_ioas *ioas; > + struct interval_tree_span_iter span; > + u32 max_iovas; > + int rc; > + > + if (cmd->__reserved) > + return -EOPNOTSUPP; > + > + ioas = iommufd_get_ioas(ucmd, cmd->ioas_id); > + if (IS_ERR(ioas)) > + return PTR_ERR(ioas); > + > + 
down_read(&ioas->iopt.iova_rwsem); > + max_iovas = cmd->num_iovas; > + ranges = u64_to_user_ptr(cmd->allowed_iovas); > + cmd->num_iovas = 0; > + cmd->out_iova_alignment = ioas->iopt.iova_alignment; > + interval_tree_for_each_span(&span, &ioas->iopt.reserved_itree, 0, > + ULONG_MAX) { > + if (!span.is_hole) > + continue; > + if (cmd->num_iovas < max_iovas) { > + struct iommu_iova_range elm = { > + .start = span.start_hole, > + .last = span.last_hole, > + }; > + > + if (copy_to_user(&ranges[cmd->num_iovas], &elm, > + sizeof(elm))) { > + rc = -EFAULT; > + goto out_put; > + } > + } > + cmd->num_iovas++; > + } > + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); > + if (rc) > + goto out_put; > + if (cmd->num_iovas > max_iovas) > + rc = -EMSGSIZE; > +out_put: > + up_read(&ioas->iopt.iova_rwsem); > + iommufd_put_object(&ioas->obj); > + return rc; > +} > + > +static int iommufd_ioas_load_iovas(struct rb_root_cached *itree, > + struct iommu_iova_range __user *ranges, > + u32 num) > +{ > + u32 i; > + > + for (i = 0; i != num; i++) { shouldn't it be < ? 
> + struct iommu_iova_range range; > + struct iopt_allowed *allowed; > + > + if (copy_from_user(&range, ranges + i, sizeof(range))) > + return -EFAULT; > + > + if (range.start >= range.last) > + return -EINVAL; > + > + if (interval_tree_iter_first(itree, range.start, range.last)) > + return -EINVAL; > + > + allowed = kzalloc(sizeof(*allowed), GFP_KERNEL_ACCOUNT); > + if (!allowed) > + return -ENOMEM; > + allowed->node.start = range.start; > + allowed->node.last = range.last; > + > + interval_tree_insert(&allowed->node, itree); > + } > + return 0; > +} > + > +int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd) > +{ > + struct iommu_ioas_allow_iovas *cmd = ucmd->cmd; > + struct rb_root_cached allowed_iova = RB_ROOT_CACHED; > + struct interval_tree_node *node; > + struct iommufd_ioas *ioas; > + struct io_pagetable *iopt; > + int rc = 0; > + > + if (cmd->__reserved) > + return -EOPNOTSUPP; > + > + ioas = iommufd_get_ioas(ucmd, cmd->ioas_id); > + if (IS_ERR(ioas)) > + return PTR_ERR(ioas); > + iopt = &ioas->iopt; > + > + rc = iommufd_ioas_load_iovas(&allowed_iova, > + u64_to_user_ptr(cmd->allowed_iovas), > + cmd->num_iovas); > + if (rc) > + goto out_free; > + > + rc = iopt_set_allow_iova(iopt, &allowed_iova); Please can you add a comment about why you need to proceed in 2 steps, ie. add the ranges in a first tree and then 'swap' to the iopt->allowed_tree (and eventually delete the first tree)? > +out_free: > + while ((node = interval_tree_iter_first(&allowed_iova, 0, ULONG_MAX))) { > + interval_tree_remove(node, &allowed_iova); > + kfree(container_of(node, struct iopt_allowed, node)); > + } > + iommufd_put_object(&ioas->obj); > + return rc; > +} > + > +static int conv_iommu_prot(u32 map_flags) > +{ > + int iommu_prot; > + > + /* > + * We provide no manual cache coherency ioctls to userspace and most > + * architectures make the CPU ops for cache flushing privileged. > + * Therefore we require the underlying IOMMU to support CPU coherent > + * operation. 
Support for IOMMU_CACHE is enforced by the > + * IOMMU_CAP_CACHE_COHERENCY test during bind. > + */ > + iommu_prot = IOMMU_CACHE; at init? > + if (map_flags & IOMMU_IOAS_MAP_WRITEABLE) > + iommu_prot |= IOMMU_WRITE; > + if (map_flags & IOMMU_IOAS_MAP_READABLE) > + iommu_prot |= IOMMU_READ; > + return iommu_prot; > +} > + > +int iommufd_ioas_map(struct iommufd_ucmd *ucmd) > +{ > + struct iommu_ioas_map *cmd = ucmd->cmd; > + struct iommufd_ioas *ioas; > + unsigned int flags = 0; > + unsigned long iova; > + int rc; > + > + if ((cmd->flags & > + ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE | > + IOMMU_IOAS_MAP_READABLE)) || > + cmd->__reserved) > + return -EOPNOTSUPP; > + if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX) > + return -EOVERFLOW; > + > + ioas = iommufd_get_ioas(ucmd, cmd->ioas_id); > + if (IS_ERR(ioas)) > + return PTR_ERR(ioas); > + > + if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA)) > + flags = IOPT_ALLOC_IOVA; > + iova = cmd->iova; can be done either at initialization or only if MAP_FIXED_IOVA. 
> + rc = iopt_map_user_pages(ucmd->ictx, &ioas->iopt, &iova, > + u64_to_user_ptr(cmd->user_va), cmd->length, > + conv_iommu_prot(cmd->flags), flags); > + if (rc) > + goto out_put; > + > + cmd->iova = iova; > + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); > +out_put: > + iommufd_put_object(&ioas->obj); > + return rc; > +} > + > +int iommufd_ioas_copy(struct iommufd_ucmd *ucmd) > +{ > + struct iommu_ioas_copy *cmd = ucmd->cmd; > + struct iommufd_ioas *src_ioas; > + struct iommufd_ioas *dst_ioas; > + unsigned int flags = 0; > + LIST_HEAD(pages_list); > + unsigned long iova; > + int rc; > + > + if ((cmd->flags & > + ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE | > + IOMMU_IOAS_MAP_READABLE))) > + return -EOPNOTSUPP; > + if (cmd->length >= ULONG_MAX || cmd->src_iova >= ULONG_MAX || > + cmd->dst_iova >= ULONG_MAX) > + return -EOVERFLOW; > + > + src_ioas = iommufd_get_ioas(ucmd, cmd->src_ioas_id); > + if (IS_ERR(src_ioas)) > + return PTR_ERR(src_ioas); > + rc = iopt_get_pages(&src_ioas->iopt, cmd->src_iova, cmd->length, > + &pages_list); > + iommufd_put_object(&src_ioas->obj); > + if (rc) > + return rc; > + > + dst_ioas = iommufd_get_ioas(ucmd, cmd->dst_ioas_id); > + if (IS_ERR(dst_ioas)) { > + rc = PTR_ERR(dst_ioas); > + goto out_pages; > + } > + > + if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA)) > + flags = IOPT_ALLOC_IOVA; > + iova = cmd->dst_iova; > + rc = iopt_map_pages(&dst_ioas->iopt, &pages_list, cmd->length, &iova, > + conv_iommu_prot(cmd->flags), flags); > + if (rc) > + goto out_put_dst; > + > + cmd->dst_iova = iova; > + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); > +out_put_dst: > + iommufd_put_object(&dst_ioas->obj); > +out_pages: > + iopt_free_pages_list(&pages_list); > + return rc; > +} > + > +int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd) > +{ > + struct iommu_ioas_unmap *cmd = ucmd->cmd; > + struct iommufd_ioas *ioas; > + unsigned long unmapped = 0; > + int rc; > + > + ioas = iommufd_get_ioas(ucmd, cmd->ioas_id); > + if (IS_ERR(ioas)) > + 
return PTR_ERR(ioas); > + > + if (cmd->iova == 0 && cmd->length == U64_MAX) { > + rc = iopt_unmap_all(&ioas->iopt, &unmapped); > + if (rc) > + goto out_put; > + } else { > + if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX) { > + rc = -EOVERFLOW; > + goto out_put; > + } > + rc = iopt_unmap_iova(&ioas->iopt, cmd->iova, cmd->length, > + &unmapped); > + if (rc) > + goto out_put; > + } > + > + cmd->length = unmapped; > + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); > + > +out_put: > + iommufd_put_object(&ioas->obj); > + return rc; > +} > + > +int iommufd_option_rlimit_mode(struct iommu_option *cmd, > + struct iommufd_ctx *ictx) > +{ *object_id and __reserved should be checked as per the uapi doc* > + if (cmd->op == IOMMU_OPTION_OP_GET) { > + cmd->val64 = ictx->account_mode == IOPT_PAGES_ACCOUNT_MM; > + return 0; > + } > + if (cmd->op == IOMMU_OPTION_OP_SET) { > + int rc = 0; > + > + if (!capable(CAP_SYS_RESOURCE)) > + return -EPERM; > + > + xa_lock(&ictx->objects); > + if (!xa_empty(&ictx->objects)) { > + rc = -EBUSY; > + } else { > + if (cmd->val64 == 0) > + ictx->account_mode = IOPT_PAGES_ACCOUNT_USER; > + else if (cmd->val64 == 1) > + ictx->account_mode = IOPT_PAGES_ACCOUNT_MM; > + else > + rc = -EINVAL; > + } > + xa_unlock(&ictx->objects); > + > + return rc; > + } > + return -EOPNOTSUPP; > +} > + > +static int iommufd_ioas_option_huge_pages(struct iommu_option *cmd, > + struct iommufd_ioas *ioas) > +{ > + if (cmd->op == IOMMU_OPTION_OP_GET) { > + cmd->val64 = !ioas->iopt.disable_large_pages; > + return 0; > + } > + if (cmd->op == IOMMU_OPTION_OP_SET) { > + if (cmd->val64 == 0) > + return iopt_disable_large_pages(&ioas->iopt); > + if (cmd->val64 == 1) { > + iopt_enable_large_pages(&ioas->iopt); > + return 0; > + } > + return -EINVAL; > + } > + return -EOPNOTSUPP; > +} > + > +int iommufd_ioas_option(struct iommufd_ucmd *ucmd) > +{ > + struct iommu_option *cmd = ucmd->cmd; > + struct iommufd_ioas *ioas; > + int rc = 0; > + > + if (cmd->__reserved) > + return 
-EOPNOTSUPP; > + > + ioas = iommufd_get_ioas(ucmd, cmd->object_id); > + if (IS_ERR(ioas)) > + return PTR_ERR(ioas); > + > + switch (cmd->option_id) { > + case IOMMU_OPTION_HUGE_PAGES: > + rc = iommufd_ioas_option_huge_pages(cmd, ioas); > + break; > + default: > + rc = -EOPNOTSUPP; > + } > + > + iommufd_put_object(&ioas->obj); > + return rc; > +} > diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h > index dadd90cae543ba..6721332dbbba03 100644 > --- a/drivers/iommu/iommufd/iommufd_private.h > +++ b/drivers/iommu/iommufd/iommufd_private.h > @@ -11,6 +11,7 @@ > > struct iommu_domain; > struct iommu_group; > +struct iommu_option; > > struct iommufd_ctx { > struct file *file; > @@ -102,6 +103,7 @@ static inline int iommufd_ucmd_respond(struct iommufd_ucmd *ucmd, > enum iommufd_object_type { > IOMMUFD_OBJ_NONE, > IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE, > + IOMMUFD_OBJ_IOAS, > }; > > /* Base struct for all objects with a userspace ID handle. */ > @@ -174,6 +176,37 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, > type), \ > typeof(*(ptr)), obj) > > +/* > + * The IO Address Space (IOAS) pagetable is a virtual page table backed by the > + * io_pagetable object. It is a user controlled mapping of IOVA -> PFNs. The > + * mapping is copied into all of the associated domains and made available to > + * in-kernel users. 
> + */ > +struct iommufd_ioas { > + struct iommufd_object obj; > + struct io_pagetable iopt; > +}; > + > +static inline struct iommufd_ioas *iommufd_get_ioas(struct iommufd_ucmd *ucmd, > + u32 id) > +{ > + return container_of(iommufd_get_object(ucmd->ictx, id, > + IOMMUFD_OBJ_IOAS), > + struct iommufd_ioas, obj); > +} > + > +struct iommufd_ioas *iommufd_ioas_alloc(struct iommufd_ctx *ictx); > +int iommufd_ioas_alloc_ioctl(struct iommufd_ucmd *ucmd); > +void iommufd_ioas_destroy(struct iommufd_object *obj); > +int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd); > +int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd); > +int iommufd_ioas_map(struct iommufd_ucmd *ucmd); > +int iommufd_ioas_copy(struct iommufd_ucmd *ucmd); > +int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd); > +int iommufd_ioas_option(struct iommufd_ucmd *ucmd); > +int iommufd_option_rlimit_mode(struct iommu_option *cmd, > + struct iommufd_ctx *ictx); > + > struct iommufd_access { > unsigned long iova_alignment; > u32 iopt_access_list_id; > diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c > index 3a705cadb85020..266109045537ed 100644 > --- a/drivers/iommu/iommufd/main.c > +++ b/drivers/iommu/iommufd/main.c > @@ -202,8 +202,36 @@ static int iommufd_fops_release(struct inode *inode, struct file *filp) > return 0; > } > > +static int iommufd_option(struct iommufd_ucmd *ucmd) > +{ > + struct iommu_option *cmd = ucmd->cmd; > + int rc; __reserved can be checked here instead. 
> + switch (cmd->option_id) { > + case IOMMU_OPTION_RLIMIT_MODE: > + rc = iommufd_option_rlimit_mode(cmd, ucmd->ictx); > + break; > + case IOMMU_OPTION_HUGE_PAGES: > + rc = iommufd_ioas_option(ucmd); > + break; > + default: > + return -EOPNOTSUPP; > + } > + if (rc) > + return rc; > + if (copy_to_user(&((struct iommu_option __user *)ucmd->ubuffer)->val64, > + &cmd->val64, sizeof(cmd->val64))) > + return -EFAULT; > + return 0; > +} > + > union ucmd_buffer { > struct iommu_destroy destroy; > + struct iommu_ioas_alloc alloc; > + struct iommu_ioas_allow_iovas allow_iovas; > + struct iommu_ioas_iova_ranges iova_ranges; > + struct iommu_ioas_map map; > + struct iommu_ioas_unmap unmap; > }; > > struct iommufd_ioctl_op { > @@ -224,6 +252,20 @@ struct iommufd_ioctl_op { > } > static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { > IOCTL_OP(IOMMU_DESTROY, iommufd_destroy, struct iommu_destroy, id), > + IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl, > + struct iommu_ioas_alloc, out_ioas_id), > + IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas, > + struct iommu_ioas_allow_iovas, allowed_iovas), > + IOCTL_OP(IOMMU_IOAS_COPY, iommufd_ioas_copy, struct iommu_ioas_copy, > + src_iova), > + IOCTL_OP(IOMMU_IOAS_IOVA_RANGES, iommufd_ioas_iova_ranges, > + struct iommu_ioas_iova_ranges, out_iova_alignment), > + IOCTL_OP(IOMMU_IOAS_MAP, iommufd_ioas_map, struct iommu_ioas_map, > + iova), > + IOCTL_OP(IOMMU_IOAS_UNMAP, iommufd_ioas_unmap, struct iommu_ioas_unmap, > + length), > + IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option, > + val64), > }; > > static long iommufd_fops_ioctl(struct file *filp, unsigned int cmd, > @@ -310,6 +352,9 @@ void iommufd_ctx_put(struct iommufd_ctx *ictx) > EXPORT_SYMBOL_NS_GPL(iommufd_ctx_put, IOMMUFD); > > static const struct iommufd_object_ops iommufd_object_ops[] = { > + [IOMMUFD_OBJ_IOAS] = { > + .destroy = iommufd_ioas_destroy, > + }, > }; > > static struct miscdevice iommu_misc_dev = { > diff --git 
a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h > index 2ad06b27a35fe5..9e9250dfc4fb1b 100644 > --- a/include/uapi/linux/iommufd.h > +++ b/include/uapi/linux/iommufd.h > @@ -37,12 +37,19 @@ > enum { > IOMMUFD_CMD_BASE = 0x80, > IOMMUFD_CMD_DESTROY = IOMMUFD_CMD_BASE, > + IOMMUFD_CMD_IOAS_ALLOC, > + IOMMUFD_CMD_IOAS_ALLOW_IOVAS, > + IOMMUFD_CMD_IOAS_COPY, > + IOMMUFD_CMD_IOAS_IOVA_RANGES, > + IOMMUFD_CMD_IOAS_MAP, > + IOMMUFD_CMD_IOAS_UNMAP, > + IOMMUFD_CMD_OPTION, > }; > > /** > * struct iommu_destroy - ioctl(IOMMU_DESTROY) > * @size: sizeof(struct iommu_destroy) > - * @id: iommufd object ID to destroy. Can by any destroyable object type. > + * @id: iommufd object ID to destroy. Can be any destroyable object type. > * > * Destroy any object held within iommufd. > */ > @@ -52,4 +59,241 @@ struct iommu_destroy { > }; > #define IOMMU_DESTROY _IO(IOMMUFD_TYPE, IOMMUFD_CMD_DESTROY) > > +/** > + * struct iommu_ioas_alloc - ioctl(IOMMU_IOAS_ALLOC) > + * @size: sizeof(struct iommu_ioas_alloc) > + * @flags: Must be 0 > + * @out_ioas_id: Output IOAS ID for the allocated object > + * > + * Allocate an IO Address Space (IOAS) which holds an IO Virtual Address (IOVA) > + * to memory mapping. > + */ > +struct iommu_ioas_alloc { > + __u32 size; > + __u32 flags; > + __u32 out_ioas_id; > +}; > +#define IOMMU_IOAS_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_ALLOC) > + > +/** > + * struct iommu_iova_range - ioctl(IOMMU_IOVA_RANGE) > + * @start: First IOVA > + * @last: Inclusive last IOVA > + * > + * An interval in IOVA space. 
> + */ > +struct iommu_iova_range { > + __aligned_u64 start; > + __aligned_u64 last; > +}; > + > +/** > + * struct iommu_ioas_iova_ranges - ioctl(IOMMU_IOAS_IOVA_RANGES) > + * @size: sizeof(struct iommu_ioas_iova_ranges) > + * @ioas_id: IOAS ID to read ranges from > + * @num_iovas: Input/Output total number of ranges in the IOAS > + * @__reserved: Must be 0 > + * @allowed_iovas: Pointer to the output array of struct iommu_iova_range > + * @out_iova_alignment: Minimum alignment required for mapping IOVA > + * > + * Query an IOAS for ranges of allowed IOVAs. Mapping IOVA outside these ranges > + * is not allowed. num_iovas will be set to the total number of iovas and > + * the allowed_iovas[] will be filled in as space permits. > + * > + * The allowed ranges are dependent on the HW path the DMA operation takes, and > + * can change during the lifetime of the IOAS. A fresh empty IOAS will have a > + * full range, and each attached device will narrow the ranges based on that > + * device's HW restrictions. Detatching a device can widen the ranges. Userspace detaching > + * should query ranges after every attach/detatch to know what IOVAs are valid detach > + * for mapping. > + * > + * On input num_iovas is the length of the allowed_iovas array. On output it is > + * the total number of iovas filled in. The ioctl will return -EMSGSIZE and set > + * num_iovas to the required value if num_iovas is too small. In this case the > + * caller should allocate a larger output array and re-issue the ioctl. > + */ > +struct iommu_ioas_iova_ranges { > + __u32 size; > + __u32 ioas_id; > + __u32 num_iovas; > + __u32 __reserved; > + __aligned_u64 allowed_iovas; > + __aligned_u64 out_iova_alignment; document @out_iova_alignment? 
> +}; > +#define IOMMU_IOAS_IOVA_RANGES _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_IOVA_RANGES) > + > +/** > + * struct iommu_ioas_allow_iovas - ioctl(IOMMU_IOAS_ALLOW_IOVAS) > + * @size: sizeof(struct iommu_ioas_allow_iovas) > + * @ioas_id: IOAS ID to allow IOVAs from > + * @num_iovas: Input/Output total number of ranges in the IOAS > + * @__reserved: Must be 0 > + * @allowed_iovas: Pointer to array of struct iommu_iova_range > + * > + * Ensure a range of IOVAs are always available for allocation. If this call > + * succeeds then IOMMU_IOAS_IOVA_RANGES will never return a list of IOVA ranges > + * that are narrower than the ranges provided here. This call will fail if > + * IOMMU_IOAS_IOVA_RANGES is currently narrower than the given ranges. > + * > + * When an IOAS is first created the IOVA_RANGES will be maximally sized, and as > + * devices are attached the IOVA will narrow based on the device restrictions. > + * When an allowed range is specified any narrowing will be refused, ie device > + * attachment can fail if the device requires limiting within the allowed range. > + * > + * Automatic IOVA allocation is also impacted by this call. MAP will only > + * allocate within the allowed IOVAs if they are present. > + * > + * This call replaces the entire allowed list with the given list. 
> + */ > +struct iommu_ioas_allow_iovas { > + __u32 size; > + __u32 ioas_id; > + __u32 num_iovas; > + __u32 __reserved; > + __aligned_u64 allowed_iovas; > +}; > +#define IOMMU_IOAS_ALLOW_IOVAS _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_ALLOW_IOVAS) > + > +/** > + * enum iommufd_ioas_map_flags - Flags for map and copy > + * @IOMMU_IOAS_MAP_FIXED_IOVA: If clear the kernel will compute an appropriate > + * IOVA to place the mapping at > + * @IOMMU_IOAS_MAP_WRITEABLE: DMA is allowed to write to this mapping > + * @IOMMU_IOAS_MAP_READABLE: DMA is allowed to read from this mapping > + */ > +enum iommufd_ioas_map_flags { > + IOMMU_IOAS_MAP_FIXED_IOVA = 1 << 0, > + IOMMU_IOAS_MAP_WRITEABLE = 1 << 1, > + IOMMU_IOAS_MAP_READABLE = 1 << 2, > +}; > + > +/** > + * struct iommu_ioas_map - ioctl(IOMMU_IOAS_MAP) > + * @size: sizeof(struct iommu_ioas_map) > + * @flags: Combination of enum iommufd_ioas_map_flags > + * @ioas_id: IOAS ID to change the mapping of > + * @__reserved: Must be 0 > + * @user_va: Userspace pointer to start mapping from > + * @length: Number of bytes to map > + * @iova: IOVA the mapping was placed at. If IOMMU_IOAS_MAP_FIXED_IOVA is set > + * then this must be provided as input. > + * > + * Set an IOVA mapping from a user pointer. If FIXED_IOVA is specified then the > + * mapping will be established at iova, otherwise a suitable location based on > + * the reserved and allowed lists will be automatically selected and returned in > + * iova. You do not mention anything about the fact the IOCTL cannot be called twice for a given @user_va w/ FIXED_IOVA Refering to VFIO_DMA_MAP_FLAG_VADDR. 
> + */ > +struct iommu_ioas_map { > + __u32 size; > + __u32 flags; > + __u32 ioas_id; > + __u32 __reserved; > + __aligned_u64 user_va; > + __aligned_u64 length; > + __aligned_u64 iova; > +}; > +#define IOMMU_IOAS_MAP _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_MAP) > + > +/** > + * struct iommu_ioas_copy - ioctl(IOMMU_IOAS_COPY) > + * @size: sizeof(struct iommu_ioas_copy) > + * @flags: Combination of enum iommufd_ioas_map_flags > + * @dst_ioas_id: IOAS ID to change the mapping of > + * @src_ioas_id: IOAS ID to copy from > + * @length: Number of bytes to copy and map > + * @dst_iova: IOVA the mapping was placed at. If IOMMU_IOAS_MAP_FIXED_IOVA is > + * set then this must be provided as input. > + * @src_iova: IOVA to start the copy > + * > + * Copy an already existing mapping from src_ioas_id and establish it in > + * dst_ioas_id. The src iova/length must exactly match a range used with > + * IOMMU_IOAS_MAP. > + * > + * This may be used to efficiently clone a subset of an IOAS to another, or as a > + * kind of 'cache' to speed up mapping. Copy has an effciency advantage over efficiency > + * establishing equivalent new mappings, as internal resources are shared, and > + * the kernel will pin the user memory only once. > + */ > +struct iommu_ioas_copy { > + __u32 size; > + __u32 flags; > + __u32 dst_ioas_id; > + __u32 src_ioas_id; is src_ioas_id == dst_ioas_id allowed? > + __aligned_u64 length; > + __aligned_u64 dst_iova; > + __aligned_u64 src_iova; > +}; > +#define IOMMU_IOAS_COPY _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_COPY) > + > +/** > + * struct iommu_ioas_unmap - ioctl(IOMMU_IOAS_UNMAP) > + * @size: sizeof(struct iommu_ioas_unmap) > + * @ioas_id: IOAS ID to change the mapping of > + * @iova: IOVA to start the unmapping at > + * @length: Number of bytes to unmap, and return back the bytes unmapped > + * > + * Unmap an IOVA range. The iova/length must be a superset of a previously > + * mapped range used with IOMMU_IOAS_MAP or IOMMU_IOAS_COPY. 
Splitting or > + * truncating ranges is not allowed. The values 0 to U64_MAX will unmap > + * everything. > + */ > +struct iommu_ioas_unmap { > + __u32 size; > + __u32 ioas_id; > + __aligned_u64 iova; > + __aligned_u64 length; > +}; > +#define IOMMU_IOAS_UNMAP _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_UNMAP) > + > +/** > + * enum iommufd_option - ioctl(IOMMU_OPTION_RLIMIT_MODE) and > + * ioctl(IOMMU_OPTION_HUGE_PAGES) > + * @IOMMU_OPTION_RLIMIT_MODE: > + * Change how RLIMIT_MEMLOCK accounting works. The caller must have privilege > + * to invoke this. Value 0 (default) is user based accouting, 1 uses process > + * based accounting. Global option, object_id must be 0 > + * @IOMMU_OPTION_HUGE_PAGES: > + * Value 1 (default) allows contiguous pages to be combined when generating > + * iommu mappings. Value 0 disables combining, everything is mapped to > + * PAGE_SIZE. This can be useful for benchmarking. This is a per-IOAS > + * option, the object_id must be the IOAS ID. > + */ > +enum iommufd_option { > + IOMMU_OPTION_RLIMIT_MODE = 0, > + IOMMU_OPTION_HUGE_PAGES = 1, > +}; > + > +/** > + * enum iommufd_option_ops - ioctl(IOMMU_OPTION_OP_SET) and > + * ioctl(IOMMU_OPTION_OP_GET) > + * @IOMMU_OPTION_OP_SET: Set the option's value > + * @IOMMU_OPTION_OP_GET: Get the option's value > + */ > +enum iommufd_option_ops { > + IOMMU_OPTION_OP_SET = 0, > + IOMMU_OPTION_OP_GET = 1, > +}; > + > +/** > + * struct iommu_option - iommu option multiplexer > + * @size: sizeof(struct iommu_option) > + * @option_id: One of enum iommufd_option > + * @op: One of enum iommufd_option_ops > + * @__reserved: Must be 0 > + * @object_id: ID of the object if required > + * @val64: Option value to set or value returned on get > + * > + * Change a simple option value. This multiplexor allows controlling a options s/a options/options > + * on objects. IOMMU_OPTION_OP_SET will load an option and IOMMU_OPTION_OP_GET > + * will return the current value. 
> + */ > +struct iommu_option { > + __u32 size; > + __u32 option_id; > + __u16 op; > + __u16 __reserved; > + __u32 object_id; > + __aligned_u64 val64; > +}; > +#define IOMMU_OPTION _IO(IOMMUFD_TYPE, IOMMUFD_CMD_OPTION) > #endif Thanks Eric
> From: Eric Auger <eric.auger@redhat.com> > Sent: Monday, November 28, 2022 1:49 AM > > +struct iommu_ioas_copy { > > + __u32 size; > > + __u32 flags; > > + __u32 dst_ioas_id; > > + __u32 src_ioas_id; > is src_ioas_id == dst_ioas_id allowed? I suppose so. iommufd_ioas_copy() simply gets a reference to the underlying iopt_pages according to [src_ioas_id, src_iova] and then maps it into [dst_ioas_id, dst_iova]. It doesn’t matter whether the iopt_pages comes from the same or a different ioas. The only restriction is that the src/dst ranges don't overlap.
On Mon, Nov 28, 2022 at 09:05:48AM +0000, Tian, Kevin wrote: > > From: Eric Auger <eric.auger@redhat.com> > > Sent: Monday, November 28, 2022 1:49 AM > > > +struct iommu_ioas_copy { > > > + __u32 size; > > > + __u32 flags; > > > + __u32 dst_ioas_id; > > > + __u32 src_ioas_id; > > is src_ioas_id == dst_ioas_id allowed? > > I suppose so. iommufd_ioas_copy() simply gets a reference to > underlying iopt_pages according to [src_ioas_id, src_iova] and > then map it into [dst_ioas_id, dst_iova]. It doesn’t matter > whether iopt_pages comes from a same or different ioas. > > The only restriction is that the src/dst ranges don't overlap. Right Jason
On Sun, Nov 27, 2022 at 06:49:29PM +0100, Eric Auger wrote: > > +static int iommufd_ioas_load_iovas(struct rb_root_cached *itree, > > + struct iommu_iova_range __user *ranges, > > + u32 num) > > +{ > > + u32 i; > > + > > + for (i = 0; i != num; i++) { > shouldn't it be < ? It is logically equivalent > > +int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd) > > +{ > > + struct iommu_ioas_allow_iovas *cmd = ucmd->cmd; > > + struct rb_root_cached allowed_iova = RB_ROOT_CACHED; > > + struct interval_tree_node *node; > > + struct iommufd_ioas *ioas; > > + struct io_pagetable *iopt; > > + int rc = 0; > > + > > + if (cmd->__reserved) > > + return -EOPNOTSUPP; > > + > > + ioas = iommufd_get_ioas(ucmd, cmd->ioas_id); > > + if (IS_ERR(ioas)) > > + return PTR_ERR(ioas); > > + iopt = &ioas->iopt; > > + > > + rc = iommufd_ioas_load_iovas(&allowed_iova, > > + u64_to_user_ptr(cmd->allowed_iovas), > > + cmd->num_iovas); > > + if (rc) > > + goto out_free; > > + > > + rc = iopt_set_allow_iova(iopt, &allowed_iova); > Please can you add a comment about why you need to proceed in 2 steps, > ie. add the ranges in a first tree and then 'swap' to the > iopt->allowed_tree (and eventually delete the first tree)? Sure /* * We want the allowed tree update to be atomic, so we have to keep the * original nodes around, and keep track of the new nodes as we allocate * memory for them. The simplest solution is to have a new/old tree and * then swap new for old. On success we free the old tree, on failure we * free the new tree. */ > > +static int conv_iommu_prot(u32 map_flags) > > +{ > > + int iommu_prot; > > + > > + /* > > + * We provide no manual cache coherency ioctls to userspace and most > > + * architectures make the CPU ops for cache flushing privileged. > > + * Therefore we require the underlying IOMMU to support CPU coherent > > + * operation. Support for IOMMU_CACHE is enforced by the > > + * IOMMU_CAP_CACHE_COHERENCY test during bind. 
> > + */ > > + iommu_prot = IOMMU_CACHE; > at init? done > > +int iommufd_ioas_map(struct iommufd_ucmd *ucmd) > > +{ > > + struct iommu_ioas_map *cmd = ucmd->cmd; > > + struct iommufd_ioas *ioas; > > + unsigned int flags = 0; > > + unsigned long iova; > > + int rc; > > + > > + if ((cmd->flags & > > + ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE | > > + IOMMU_IOAS_MAP_READABLE)) || > > + cmd->__reserved) > > + return -EOPNOTSUPP; > > + if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX) > > + return -EOVERFLOW; > > + > > + ioas = iommufd_get_ioas(ucmd, cmd->ioas_id); > > + if (IS_ERR(ioas)) > > + return PTR_ERR(ioas); > > + > > + if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA)) > > + flags = IOPT_ALLOC_IOVA; > > + iova = cmd->iova; > can be done either at initialization or only if MAP_FIXED_IOVA. Done > > +int iommufd_option_rlimit_mode(struct iommu_option *cmd, > > + struct iommufd_ctx *ictx) > > +{ > *object_id and __reserved should be checked as per the uapi doc* Ohh, yes, thanks: @@ -317,6 +322,9 @@ int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd) int iommufd_option_rlimit_mode(struct iommu_option *cmd, struct iommufd_ctx *ictx) { + if (cmd->object_id) + return -EOPNOTSUPP; + if (cmd->op == IOMMU_OPTION_OP_GET) { cmd->val64 = ictx->account_mode == IOPT_PAGES_ACCOUNT_MM; return 0; diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index de5cc01023c0c5..bcb463e581009c 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -215,6 +215,9 @@ static int iommufd_option(struct iommufd_ucmd *ucmd) struct iommu_option *cmd = ucmd->cmd; int rc; + if (cmd->__reserved) + return -EOPNOTSUPP; + switch (cmd->option_id) { case IOMMU_OPTION_RLIMIT_MODE: rc = iommufd_option_rlimit_mode(cmd, ucmd->ictx); > > +/** > > + * struct iommu_ioas_iova_ranges - ioctl(IOMMU_IOAS_IOVA_RANGES) > > + * @size: sizeof(struct iommu_ioas_iova_ranges) > > + * @ioas_id: IOAS ID to read ranges from > > + * @num_iovas: Input/Output total 
number of ranges in the IOAS > > + * @__reserved: Must be 0 > > + * @allowed_iovas: Pointer to the output array of struct iommu_iova_range > > + * @out_iova_alignment: Minimum alignment required for mapping IOVA > > + * > > + * Query an IOAS for ranges of allowed IOVAs. Mapping IOVA outside these ranges > > + * is not allowed. num_iovas will be set to the total number of iovas and > > + * the allowed_iovas[] will be filled in as space permits. > > + * > > + * The allowed ranges are dependent on the HW path the DMA operation takes, and > > + * can change during the lifetime of the IOAS. A fresh empty IOAS will have a > > + * full range, and each attached device will narrow the ranges based on that > > + * device's HW restrictions. Detatching a device can widen the ranges. Userspace > detaching > > + * should query ranges after every attach/detatch to know what IOVAs are valid > detach Done > > + * for mapping. > > + * > > + * On input num_iovas is the length of the allowed_iovas array. On output it is > > + * the total number of iovas filled in. The ioctl will return -EMSGSIZE and set > > + * num_iovas to the required value if num_iovas is too small. In this case the > > + * caller should allocate a larger output array and re-issue the ioctl. > > + */ > > +struct iommu_ioas_iova_ranges { > > + __u32 size; > > + __u32 ioas_id; > > + __u32 num_iovas; > > + __u32 __reserved; > > + __aligned_u64 allowed_iovas; > > + __aligned_u64 out_iova_alignment; > document @out_iova_alignment? * out_iova_alignment returns the minimum IOVA alignment that can be given * to IOMMU_IOAS_MAP/COPY. IOVA's must satisfy: * starting_iova % out_iova_alignment == 0 * (starting_iova + length) % out_iova_alignment == 0 * out_iova_alignment can be 1 indicating any IOVA is allowed. It cannot * be higher than the system PAGE_SIZE. 
> > +/** > > + * struct iommu_ioas_map - ioctl(IOMMU_IOAS_MAP) > > + * @size: sizeof(struct iommu_ioas_map) > > + * @flags: Combination of enum iommufd_ioas_map_flags > > + * @ioas_id: IOAS ID to change the mapping of > > + * @__reserved: Must be 0 > > + * @user_va: Userspace pointer to start mapping from > > + * @length: Number of bytes to map > > + * @iova: IOVA the mapping was placed at. If IOMMU_IOAS_MAP_FIXED_IOVA is set > > + * then this must be provided as input. > > + * > > + * Set an IOVA mapping from a user pointer. If FIXED_IOVA is specified then the > > + * mapping will be established at iova, otherwise a suitable location based on > > + * the reserved and allowed lists will be automatically selected and returned in > > + * iova. > You do not mention anything about the fact the IOCTL cannot be called > twice for a given @user_va w/ FIXED_IOVA > Refering to VFIO_DMA_MAP_FLAG_VADDR. * If IOMMU_IOAS_MAP_FIXED_IOVA is specified then the iova range must currently * be unused, existing IOVA cannot be replaced. > > +/** > > + * struct iommu_ioas_copy - ioctl(IOMMU_IOAS_COPY) > > + * @size: sizeof(struct iommu_ioas_copy) > > + * @flags: Combination of enum iommufd_ioas_map_flags > > + * @dst_ioas_id: IOAS ID to change the mapping of > > + * @src_ioas_id: IOAS ID to copy from > > + * @length: Number of bytes to copy and map > > + * @dst_iova: IOVA the mapping was placed at. If IOMMU_IOAS_MAP_FIXED_IOVA is > > + * set then this must be provided as input. > > + * @src_iova: IOVA to start the copy > > + * > > + * Copy an already existing mapping from src_ioas_id and establish it in > > + * dst_ioas_id. The src iova/length must exactly match a range used with > > + * IOMMU_IOAS_MAP. > > + * > > + * This may be used to efficiently clone a subset of an IOAS to another, or as a > > + * kind of 'cache' to speed up mapping. 
Copy has an effciency advantage over > efficiency > > + * establishing equivalent new mappings, as internal resources are shared, and > > + * the kernel will pin the user memory only once. > > + */ > > +struct iommu_ioas_copy { > > + __u32 size; > > + __u32 flags; > > + __u32 dst_ioas_id; > > + __u32 src_ioas_id; > is src_ioas_id == dst_ioas_id allowed? Yes > > +/** > > + * struct iommu_option - iommu option multiplexer > > + * @size: sizeof(struct iommu_option) > > + * @option_id: One of enum iommufd_option > > + * @op: One of enum iommufd_option_ops > > + * @__reserved: Must be 0 > > + * @object_id: ID of the object if required > > + * @val64: Option value to set or value returned on get > > + * > > + * Change a simple option value. This multiplexor allows controlling a options > s/a options/options Done Thanks, Jason
On 11/28/22 19:27, Jason Gunthorpe wrote: > On Sun, Nov 27, 2022 at 06:49:29PM +0100, Eric Auger wrote: > >>> +static int iommufd_ioas_load_iovas(struct rb_root_cached *itree, >>> + struct iommu_iova_range __user *ranges, >>> + u32 num) >>> +{ >>> + u32 i; >>> + >>> + for (i = 0; i != num; i++) { >> shouldn't it be < ? > It is logically equivalent damn. That sometimes happens to me when staring at so much code ;-) > >>> +int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd) >>> +{ >>> + struct iommu_ioas_allow_iovas *cmd = ucmd->cmd; >>> + struct rb_root_cached allowed_iova = RB_ROOT_CACHED; >>> + struct interval_tree_node *node; >>> + struct iommufd_ioas *ioas; >>> + struct io_pagetable *iopt; >>> + int rc = 0; >>> + >>> + if (cmd->__reserved) >>> + return -EOPNOTSUPP; >>> + >>> + ioas = iommufd_get_ioas(ucmd, cmd->ioas_id); >>> + if (IS_ERR(ioas)) >>> + return PTR_ERR(ioas); >>> + iopt = &ioas->iopt; >>> + >>> + rc = iommufd_ioas_load_iovas(&allowed_iova, >>> + u64_to_user_ptr(cmd->allowed_iovas), >>> + cmd->num_iovas); >>> + if (rc) >>> + goto out_free; >>> + >>> + rc = iopt_set_allow_iova(iopt, &allowed_iova); >> Please can you add a comment about why you need to proceed in 2 steps, >> ie. add the ranges in a first tree and then 'swap' to the >> iopt->allowed_tree (and eventually delete the first tree)? > Sure > > /* > * We want the allowed tree update to be atomic, so we have to keep the > * original nodes around, and keep track of the new nodes as we allocate > * memory for them. The simplest solution is to have a new/old tree and > * then swap new for old. On success we free the old tree, on failure we > * free the new tree. > */ > >>> +static int conv_iommu_prot(u32 map_flags) >>> +{ >>> + int iommu_prot; >>> + >>> + /* >>> + * We provide no manual cache coherency ioctls to userspace and most >>> + * architectures make the CPU ops for cache flushing privileged. >>> + * Therefore we require the underlying IOMMU to support CPU coherent >>> + * operation. 
Support for IOMMU_CACHE is enforced by the >>> + * IOMMU_CAP_CACHE_COHERENCY test during bind. >>> + */ >>> + iommu_prot = IOMMU_CACHE; >> at init? > done > >>> +int iommufd_ioas_map(struct iommufd_ucmd *ucmd) >>> +{ >>> + struct iommu_ioas_map *cmd = ucmd->cmd; >>> + struct iommufd_ioas *ioas; >>> + unsigned int flags = 0; >>> + unsigned long iova; >>> + int rc; >>> + >>> + if ((cmd->flags & >>> + ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE | >>> + IOMMU_IOAS_MAP_READABLE)) || >>> + cmd->__reserved) >>> + return -EOPNOTSUPP; >>> + if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX) >>> + return -EOVERFLOW; >>> + >>> + ioas = iommufd_get_ioas(ucmd, cmd->ioas_id); >>> + if (IS_ERR(ioas)) >>> + return PTR_ERR(ioas); >>> + >>> + if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA)) >>> + flags = IOPT_ALLOC_IOVA; >>> + iova = cmd->iova; >> can be done either at initialization or only if MAP_FIXED_IOVA. > Done > > >>> +int iommufd_option_rlimit_mode(struct iommu_option *cmd, >>> + struct iommufd_ctx *ictx) >>> +{ >> *object_id and __reserved should be checked as per the uapi doc* > Ohh, yes, thanks: > > @@ -317,6 +322,9 @@ int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd) > int iommufd_option_rlimit_mode(struct iommu_option *cmd, > struct iommufd_ctx *ictx) > { > + if (cmd->object_id) > + return -EOPNOTSUPP; > + > if (cmd->op == IOMMU_OPTION_OP_GET) { > cmd->val64 = ictx->account_mode == IOPT_PAGES_ACCOUNT_MM; > return 0; > diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c > index de5cc01023c0c5..bcb463e581009c 100644 > --- a/drivers/iommu/iommufd/main.c > +++ b/drivers/iommu/iommufd/main.c > @@ -215,6 +215,9 @@ static int iommufd_option(struct iommufd_ucmd *ucmd) > struct iommu_option *cmd = ucmd->cmd; > int rc; > > + if (cmd->__reserved) > + return -EOPNOTSUPP; > + > switch (cmd->option_id) { > case IOMMU_OPTION_RLIMIT_MODE: > rc = iommufd_option_rlimit_mode(cmd, ucmd->ictx); > >>> +/** >>> + * struct iommu_ioas_iova_ranges - 
ioctl(IOMMU_IOAS_IOVA_RANGES) >>> + * @size: sizeof(struct iommu_ioas_iova_ranges) >>> + * @ioas_id: IOAS ID to read ranges from >>> + * @num_iovas: Input/Output total number of ranges in the IOAS >>> + * @__reserved: Must be 0 >>> + * @allowed_iovas: Pointer to the output array of struct iommu_iova_range >>> + * @out_iova_alignment: Minimum alignment required for mapping IOVA >>> + * >>> + * Query an IOAS for ranges of allowed IOVAs. Mapping IOVA outside these ranges >>> + * is not allowed. num_iovas will be set to the total number of iovas and >>> + * the allowed_iovas[] will be filled in as space permits. >>> + * >>> + * The allowed ranges are dependent on the HW path the DMA operation takes, and >>> + * can change during the lifetime of the IOAS. A fresh empty IOAS will have a >>> + * full range, and each attached device will narrow the ranges based on that >>> + * device's HW restrictions. Detatching a device can widen the ranges. Userspace >> detaching >>> + * should query ranges after every attach/detatch to know what IOVAs are valid >> detach > Done > >>> + * for mapping. >>> + * >>> + * On input num_iovas is the length of the allowed_iovas array. On output it is >>> + * the total number of iovas filled in. The ioctl will return -EMSGSIZE and set >>> + * num_iovas to the required value if num_iovas is too small. In this case the >>> + * caller should allocate a larger output array and re-issue the ioctl. >>> + */ >>> +struct iommu_ioas_iova_ranges { >>> + __u32 size; >>> + __u32 ioas_id; >>> + __u32 num_iovas; >>> + __u32 __reserved; >>> + __aligned_u64 allowed_iovas; >>> + __aligned_u64 out_iova_alignment; >> document @out_iova_alignment? > * out_iova_alignment returns the minimum IOVA alignment that can be given > * to IOMMU_IOAS_MAP/COPY. IOVA's must satisfy: > * starting_iova % out_iova_alignment == 0 > * (starting_iova + length) % out_iova_alignment == 0 > * out_iova_alignment can be 1 indicating any IOVA is allowed. 
It cannot > * be higher than the system PAGE_SIZE. > >>> +/** >>> + * struct iommu_ioas_map - ioctl(IOMMU_IOAS_MAP) >>> + * @size: sizeof(struct iommu_ioas_map) >>> + * @flags: Combination of enum iommufd_ioas_map_flags >>> + * @ioas_id: IOAS ID to change the mapping of >>> + * @__reserved: Must be 0 >>> + * @user_va: Userspace pointer to start mapping from >>> + * @length: Number of bytes to map >>> + * @iova: IOVA the mapping was placed at. If IOMMU_IOAS_MAP_FIXED_IOVA is set >>> + * then this must be provided as input. >>> + * >>> + * Set an IOVA mapping from a user pointer. If FIXED_IOVA is specified then the >>> + * mapping will be established at iova, otherwise a suitable location based on >>> + * the reserved and allowed lists will be automatically selected and returned in >>> + * iova. >> You do not mention anything about the fact the IOCTL cannot be called >> twice for a given @user_va w/ FIXED_IOVA >> Refering to VFIO_DMA_MAP_FLAG_VADDR. > * If IOMMU_IOAS_MAP_FIXED_IOVA is specified then the iova range must currently > * be unused, existing IOVA cannot be replaced. > >>> +/** >>> + * struct iommu_ioas_copy - ioctl(IOMMU_IOAS_COPY) >>> + * @size: sizeof(struct iommu_ioas_copy) >>> + * @flags: Combination of enum iommufd_ioas_map_flags >>> + * @dst_ioas_id: IOAS ID to change the mapping of >>> + * @src_ioas_id: IOAS ID to copy from >>> + * @length: Number of bytes to copy and map >>> + * @dst_iova: IOVA the mapping was placed at. If IOMMU_IOAS_MAP_FIXED_IOVA is >>> + * set then this must be provided as input. >>> + * @src_iova: IOVA to start the copy >>> + * >>> + * Copy an already existing mapping from src_ioas_id and establish it in >>> + * dst_ioas_id. The src iova/length must exactly match a range used with >>> + * IOMMU_IOAS_MAP. >>> + * >>> + * This may be used to efficiently clone a subset of an IOAS to another, or as a >>> + * kind of 'cache' to speed up mapping. 
Copy has an effciency advantage over >> efficiency >>> + * establishing equivalent new mappings, as internal resources are shared, and >>> + * the kernel will pin the user memory only once. >>> + */ >>> +struct iommu_ioas_copy { >>> + __u32 size; >>> + __u32 flags; >>> + __u32 dst_ioas_id; >>> + __u32 src_ioas_id; >> is src_ioas_id == dst_ioas_id allowed? > Yes > >>> +/** >>> + * struct iommu_option - iommu option multiplexer >>> + * @size: sizeof(struct iommu_option) >>> + * @option_id: One of enum iommufd_option >>> + * @op: One of enum iommufd_option_ops >>> + * @__reserved: Must be 0 >>> + * @object_id: ID of the object if required >>> + * @val64: Option value to set or value returned on get >>> + * >>> + * Change a simple option value. This multiplexor allows controlling a options >> s/a options/options > Done > > Thanks, > Jason > Eric
diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile index b66a8c47ff55ec..2b4f36f1b72f9d 100644 --- a/drivers/iommu/iommufd/Makefile +++ b/drivers/iommu/iommufd/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only iommufd-y := \ io_pagetable.o \ + ioas.o \ main.o \ pages.o diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c new file mode 100644 index 00000000000000..7671456e86413a --- /dev/null +++ b/drivers/iommu/iommufd/ioas.c @@ -0,0 +1,384 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES + */ +#include <linux/interval_tree.h> +#include <linux/iommufd.h> +#include <linux/iommu.h> +#include <uapi/linux/iommufd.h> + +#include "io_pagetable.h" + +void iommufd_ioas_destroy(struct iommufd_object *obj) +{ + struct iommufd_ioas *ioas = container_of(obj, struct iommufd_ioas, obj); + int rc; + + rc = iopt_unmap_all(&ioas->iopt, NULL); + WARN_ON(rc && rc != -ENOENT); + iopt_destroy_table(&ioas->iopt); +} + +struct iommufd_ioas *iommufd_ioas_alloc(struct iommufd_ctx *ictx) +{ + struct iommufd_ioas *ioas; + + ioas = iommufd_object_alloc(ictx, ioas, IOMMUFD_OBJ_IOAS); + if (IS_ERR(ioas)) + return ioas; + + iopt_init_table(&ioas->iopt); + return ioas; +} + +int iommufd_ioas_alloc_ioctl(struct iommufd_ucmd *ucmd) +{ + struct iommu_ioas_alloc *cmd = ucmd->cmd; + struct iommufd_ioas *ioas; + int rc; + + if (cmd->flags) + return -EOPNOTSUPP; + + ioas = iommufd_ioas_alloc(ucmd->ictx); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + cmd->out_ioas_id = ioas->obj.id; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + goto out_table; + iommufd_object_finalize(ucmd->ictx, &ioas->obj); + return 0; + +out_table: + iommufd_object_abort_and_destroy(ucmd->ictx, &ioas->obj); + return rc; +} + +int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd) +{ + struct iommu_iova_range __user *ranges; + struct iommu_ioas_iova_ranges *cmd = ucmd->cmd; + struct iommufd_ioas 
*ioas; + struct interval_tree_span_iter span; + u32 max_iovas; + int rc; + + if (cmd->__reserved) + return -EOPNOTSUPP; + + ioas = iommufd_get_ioas(ucmd, cmd->ioas_id); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + down_read(&ioas->iopt.iova_rwsem); + max_iovas = cmd->num_iovas; + ranges = u64_to_user_ptr(cmd->allowed_iovas); + cmd->num_iovas = 0; + cmd->out_iova_alignment = ioas->iopt.iova_alignment; + interval_tree_for_each_span(&span, &ioas->iopt.reserved_itree, 0, + ULONG_MAX) { + if (!span.is_hole) + continue; + if (cmd->num_iovas < max_iovas) { + struct iommu_iova_range elm = { + .start = span.start_hole, + .last = span.last_hole, + }; + + if (copy_to_user(&ranges[cmd->num_iovas], &elm, + sizeof(elm))) { + rc = -EFAULT; + goto out_put; + } + } + cmd->num_iovas++; + } + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + goto out_put; + if (cmd->num_iovas > max_iovas) + rc = -EMSGSIZE; +out_put: + up_read(&ioas->iopt.iova_rwsem); + iommufd_put_object(&ioas->obj); + return rc; +} + +static int iommufd_ioas_load_iovas(struct rb_root_cached *itree, + struct iommu_iova_range __user *ranges, + u32 num) +{ + u32 i; + + for (i = 0; i != num; i++) { + struct iommu_iova_range range; + struct iopt_allowed *allowed; + + if (copy_from_user(&range, ranges + i, sizeof(range))) + return -EFAULT; + + if (range.start >= range.last) + return -EINVAL; + + if (interval_tree_iter_first(itree, range.start, range.last)) + return -EINVAL; + + allowed = kzalloc(sizeof(*allowed), GFP_KERNEL_ACCOUNT); + if (!allowed) + return -ENOMEM; + allowed->node.start = range.start; + allowed->node.last = range.last; + + interval_tree_insert(&allowed->node, itree); + } + return 0; +} + +int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd) +{ + struct iommu_ioas_allow_iovas *cmd = ucmd->cmd; + struct rb_root_cached allowed_iova = RB_ROOT_CACHED; + struct interval_tree_node *node; + struct iommufd_ioas *ioas; + struct io_pagetable *iopt; + int rc = 0; + + if (cmd->__reserved) + return 
-EOPNOTSUPP; + + ioas = iommufd_get_ioas(ucmd, cmd->ioas_id); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + iopt = &ioas->iopt; + + rc = iommufd_ioas_load_iovas(&allowed_iova, + u64_to_user_ptr(cmd->allowed_iovas), + cmd->num_iovas); + if (rc) + goto out_free; + + rc = iopt_set_allow_iova(iopt, &allowed_iova); +out_free: + while ((node = interval_tree_iter_first(&allowed_iova, 0, ULONG_MAX))) { + interval_tree_remove(node, &allowed_iova); + kfree(container_of(node, struct iopt_allowed, node)); + } + iommufd_put_object(&ioas->obj); + return rc; +} + +static int conv_iommu_prot(u32 map_flags) +{ + int iommu_prot; + + /* + * We provide no manual cache coherency ioctls to userspace and most + * architectures make the CPU ops for cache flushing privileged. + * Therefore we require the underlying IOMMU to support CPU coherent + * operation. Support for IOMMU_CACHE is enforced by the + * IOMMU_CAP_CACHE_COHERENCY test during bind. + */ + iommu_prot = IOMMU_CACHE; + if (map_flags & IOMMU_IOAS_MAP_WRITEABLE) + iommu_prot |= IOMMU_WRITE; + if (map_flags & IOMMU_IOAS_MAP_READABLE) + iommu_prot |= IOMMU_READ; + return iommu_prot; +} + +int iommufd_ioas_map(struct iommufd_ucmd *ucmd) +{ + struct iommu_ioas_map *cmd = ucmd->cmd; + struct iommufd_ioas *ioas; + unsigned int flags = 0; + unsigned long iova; + int rc; + + if ((cmd->flags & + ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE | + IOMMU_IOAS_MAP_READABLE)) || + cmd->__reserved) + return -EOPNOTSUPP; + if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX) + return -EOVERFLOW; + + ioas = iommufd_get_ioas(ucmd, cmd->ioas_id); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA)) + flags = IOPT_ALLOC_IOVA; + iova = cmd->iova; + rc = iopt_map_user_pages(ucmd->ictx, &ioas->iopt, &iova, + u64_to_user_ptr(cmd->user_va), cmd->length, + conv_iommu_prot(cmd->flags), flags); + if (rc) + goto out_put; + + cmd->iova = iova; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); +out_put: + 
iommufd_put_object(&ioas->obj); + return rc; +} + +int iommufd_ioas_copy(struct iommufd_ucmd *ucmd) +{ + struct iommu_ioas_copy *cmd = ucmd->cmd; + struct iommufd_ioas *src_ioas; + struct iommufd_ioas *dst_ioas; + unsigned int flags = 0; + LIST_HEAD(pages_list); + unsigned long iova; + int rc; + + if ((cmd->flags & + ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE | + IOMMU_IOAS_MAP_READABLE))) + return -EOPNOTSUPP; + if (cmd->length >= ULONG_MAX || cmd->src_iova >= ULONG_MAX || + cmd->dst_iova >= ULONG_MAX) + return -EOVERFLOW; + + src_ioas = iommufd_get_ioas(ucmd, cmd->src_ioas_id); + if (IS_ERR(src_ioas)) + return PTR_ERR(src_ioas); + rc = iopt_get_pages(&src_ioas->iopt, cmd->src_iova, cmd->length, + &pages_list); + iommufd_put_object(&src_ioas->obj); + if (rc) + return rc; + + dst_ioas = iommufd_get_ioas(ucmd, cmd->dst_ioas_id); + if (IS_ERR(dst_ioas)) { + rc = PTR_ERR(dst_ioas); + goto out_pages; + } + + if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA)) + flags = IOPT_ALLOC_IOVA; + iova = cmd->dst_iova; + rc = iopt_map_pages(&dst_ioas->iopt, &pages_list, cmd->length, &iova, + conv_iommu_prot(cmd->flags), flags); + if (rc) + goto out_put_dst; + + cmd->dst_iova = iova; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); +out_put_dst: + iommufd_put_object(&dst_ioas->obj); +out_pages: + iopt_free_pages_list(&pages_list); + return rc; +} + +int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd) +{ + struct iommu_ioas_unmap *cmd = ucmd->cmd; + struct iommufd_ioas *ioas; + unsigned long unmapped = 0; + int rc; + + ioas = iommufd_get_ioas(ucmd, cmd->ioas_id); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + if (cmd->iova == 0 && cmd->length == U64_MAX) { + rc = iopt_unmap_all(&ioas->iopt, &unmapped); + if (rc) + goto out_put; + } else { + if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX) { + rc = -EOVERFLOW; + goto out_put; + } + rc = iopt_unmap_iova(&ioas->iopt, cmd->iova, cmd->length, + &unmapped); + if (rc) + goto out_put; + } + + cmd->length = unmapped; + rc = 
iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + +out_put: + iommufd_put_object(&ioas->obj); + return rc; +} + +int iommufd_option_rlimit_mode(struct iommu_option *cmd, + struct iommufd_ctx *ictx) +{ + if (cmd->op == IOMMU_OPTION_OP_GET) { + cmd->val64 = ictx->account_mode == IOPT_PAGES_ACCOUNT_MM; + return 0; + } + if (cmd->op == IOMMU_OPTION_OP_SET) { + int rc = 0; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + xa_lock(&ictx->objects); + if (!xa_empty(&ictx->objects)) { + rc = -EBUSY; + } else { + if (cmd->val64 == 0) + ictx->account_mode = IOPT_PAGES_ACCOUNT_USER; + else if (cmd->val64 == 1) + ictx->account_mode = IOPT_PAGES_ACCOUNT_MM; + else + rc = -EINVAL; + } + xa_unlock(&ictx->objects); + + return rc; + } + return -EOPNOTSUPP; +} + +static int iommufd_ioas_option_huge_pages(struct iommu_option *cmd, + struct iommufd_ioas *ioas) +{ + if (cmd->op == IOMMU_OPTION_OP_GET) { + cmd->val64 = !ioas->iopt.disable_large_pages; + return 0; + } + if (cmd->op == IOMMU_OPTION_OP_SET) { + if (cmd->val64 == 0) + return iopt_disable_large_pages(&ioas->iopt); + if (cmd->val64 == 1) { + iopt_enable_large_pages(&ioas->iopt); + return 0; + } + return -EINVAL; + } + return -EOPNOTSUPP; +} + +int iommufd_ioas_option(struct iommufd_ucmd *ucmd) +{ + struct iommu_option *cmd = ucmd->cmd; + struct iommufd_ioas *ioas; + int rc = 0; + + if (cmd->__reserved) + return -EOPNOTSUPP; + + ioas = iommufd_get_ioas(ucmd, cmd->object_id); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + switch (cmd->option_id) { + case IOMMU_OPTION_HUGE_PAGES: + rc = iommufd_ioas_option_huge_pages(cmd, ioas); + break; + default: + rc = -EOPNOTSUPP; + } + + iommufd_put_object(&ioas->obj); + return rc; +} diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index dadd90cae543ba..6721332dbbba03 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -11,6 +11,7 @@ struct iommu_domain; struct iommu_group; +struct 
iommu_option; struct iommufd_ctx { struct file *file; @@ -102,6 +103,7 @@ static inline int iommufd_ucmd_respond(struct iommufd_ucmd *ucmd, enum iommufd_object_type { IOMMUFD_OBJ_NONE, IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE, + IOMMUFD_OBJ_IOAS, }; /* Base struct for all objects with a userspace ID handle. */ @@ -174,6 +176,37 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, type), \ typeof(*(ptr)), obj) +/* + * The IO Address Space (IOAS) pagetable is a virtual page table backed by the + * io_pagetable object. It is a user controlled mapping of IOVA -> PFNs. The + * mapping is copied into all of the associated domains and made available to + * in-kernel users. + */ +struct iommufd_ioas { + struct iommufd_object obj; + struct io_pagetable iopt; +}; + +static inline struct iommufd_ioas *iommufd_get_ioas(struct iommufd_ucmd *ucmd, + u32 id) +{ + return container_of(iommufd_get_object(ucmd->ictx, id, + IOMMUFD_OBJ_IOAS), + struct iommufd_ioas, obj); +} + +struct iommufd_ioas *iommufd_ioas_alloc(struct iommufd_ctx *ictx); +int iommufd_ioas_alloc_ioctl(struct iommufd_ucmd *ucmd); +void iommufd_ioas_destroy(struct iommufd_object *obj); +int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd); +int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd); +int iommufd_ioas_map(struct iommufd_ucmd *ucmd); +int iommufd_ioas_copy(struct iommufd_ucmd *ucmd); +int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd); +int iommufd_ioas_option(struct iommufd_ucmd *ucmd); +int iommufd_option_rlimit_mode(struct iommu_option *cmd, + struct iommufd_ctx *ictx); + struct iommufd_access { unsigned long iova_alignment; u32 iopt_access_list_id; diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 3a705cadb85020..266109045537ed 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -202,8 +202,36 @@ static int iommufd_fops_release(struct inode *inode, struct file *filp) return 0; } +static int iommufd_option(struct iommufd_ucmd 
*ucmd) +{ + struct iommu_option *cmd = ucmd->cmd; + int rc; + + switch (cmd->option_id) { + case IOMMU_OPTION_RLIMIT_MODE: + rc = iommufd_option_rlimit_mode(cmd, ucmd->ictx); + break; + case IOMMU_OPTION_HUGE_PAGES: + rc = iommufd_ioas_option(ucmd); + break; + default: + return -EOPNOTSUPP; + } + if (rc) + return rc; + if (copy_to_user(&((struct iommu_option __user *)ucmd->ubuffer)->val64, + &cmd->val64, sizeof(cmd->val64))) + return -EFAULT; + return 0; +} + union ucmd_buffer { struct iommu_destroy destroy; + struct iommu_ioas_alloc alloc; + struct iommu_ioas_allow_iovas allow_iovas; + struct iommu_ioas_iova_ranges iova_ranges; + struct iommu_ioas_map map; + struct iommu_ioas_unmap unmap; }; struct iommufd_ioctl_op { @@ -224,6 +252,20 @@ struct iommufd_ioctl_op { } static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { IOCTL_OP(IOMMU_DESTROY, iommufd_destroy, struct iommu_destroy, id), + IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl, + struct iommu_ioas_alloc, out_ioas_id), + IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas, + struct iommu_ioas_allow_iovas, allowed_iovas), + IOCTL_OP(IOMMU_IOAS_COPY, iommufd_ioas_copy, struct iommu_ioas_copy, + src_iova), + IOCTL_OP(IOMMU_IOAS_IOVA_RANGES, iommufd_ioas_iova_ranges, + struct iommu_ioas_iova_ranges, out_iova_alignment), + IOCTL_OP(IOMMU_IOAS_MAP, iommufd_ioas_map, struct iommu_ioas_map, + iova), + IOCTL_OP(IOMMU_IOAS_UNMAP, iommufd_ioas_unmap, struct iommu_ioas_unmap, + length), + IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option, + val64), }; static long iommufd_fops_ioctl(struct file *filp, unsigned int cmd, @@ -310,6 +352,9 @@ void iommufd_ctx_put(struct iommufd_ctx *ictx) EXPORT_SYMBOL_NS_GPL(iommufd_ctx_put, IOMMUFD); static const struct iommufd_object_ops iommufd_object_ops[] = { + [IOMMUFD_OBJ_IOAS] = { + .destroy = iommufd_ioas_destroy, + }, }; static struct miscdevice iommu_misc_dev = { diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 
2ad06b27a35fe5..9e9250dfc4fb1b 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -37,12 +37,19 @@ enum { IOMMUFD_CMD_BASE = 0x80, IOMMUFD_CMD_DESTROY = IOMMUFD_CMD_BASE, + IOMMUFD_CMD_IOAS_ALLOC, + IOMMUFD_CMD_IOAS_ALLOW_IOVAS, + IOMMUFD_CMD_IOAS_COPY, + IOMMUFD_CMD_IOAS_IOVA_RANGES, + IOMMUFD_CMD_IOAS_MAP, + IOMMUFD_CMD_IOAS_UNMAP, + IOMMUFD_CMD_OPTION, }; /** * struct iommu_destroy - ioctl(IOMMU_DESTROY) * @size: sizeof(struct iommu_destroy) - * @id: iommufd object ID to destroy. Can by any destroyable object type. + * @id: iommufd object ID to destroy. Can be any destroyable object type. * * Destroy any object held within iommufd. */ @@ -52,4 +59,241 @@ struct iommu_destroy { }; #define IOMMU_DESTROY _IO(IOMMUFD_TYPE, IOMMUFD_CMD_DESTROY) +/** + * struct iommu_ioas_alloc - ioctl(IOMMU_IOAS_ALLOC) + * @size: sizeof(struct iommu_ioas_alloc) + * @flags: Must be 0 + * @out_ioas_id: Output IOAS ID for the allocated object + * + * Allocate an IO Address Space (IOAS) which holds an IO Virtual Address (IOVA) + * to memory mapping. + */ +struct iommu_ioas_alloc { + __u32 size; + __u32 flags; + __u32 out_ioas_id; +}; +#define IOMMU_IOAS_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_ALLOC) + +/** + * struct iommu_iova_range - ioctl(IOMMU_IOVA_RANGE) + * @start: First IOVA + * @last: Inclusive last IOVA + * + * An interval in IOVA space. + */ +struct iommu_iova_range { + __aligned_u64 start; + __aligned_u64 last; +}; + +/** + * struct iommu_ioas_iova_ranges - ioctl(IOMMU_IOAS_IOVA_RANGES) + * @size: sizeof(struct iommu_ioas_iova_ranges) + * @ioas_id: IOAS ID to read ranges from + * @num_iovas: Input/Output total number of ranges in the IOAS + * @__reserved: Must be 0 + * @allowed_iovas: Pointer to the output array of struct iommu_iova_range + * @out_iova_alignment: Minimum alignment required for mapping IOVA + * + * Query an IOAS for ranges of allowed IOVAs. Mapping IOVA outside these ranges + * is not allowed. 
num_iovas will be set to the total number of iovas and + * the allowed_iovas[] will be filled in as space permits. + * + * The allowed ranges are dependent on the HW path the DMA operation takes, and + * can change during the lifetime of the IOAS. A fresh empty IOAS will have a + * full range, and each attached device will narrow the ranges based on that + * device's HW restrictions. Detatching a device can widen the ranges. Userspace + * should query ranges after every attach/detatch to know what IOVAs are valid + * for mapping. + * + * On input num_iovas is the length of the allowed_iovas array. On output it is + * the total number of iovas filled in. The ioctl will return -EMSGSIZE and set + * num_iovas to the required value if num_iovas is too small. In this case the + * caller should allocate a larger output array and re-issue the ioctl. + */ +struct iommu_ioas_iova_ranges { + __u32 size; + __u32 ioas_id; + __u32 num_iovas; + __u32 __reserved; + __aligned_u64 allowed_iovas; + __aligned_u64 out_iova_alignment; +}; +#define IOMMU_IOAS_IOVA_RANGES _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_IOVA_RANGES) + +/** + * struct iommu_ioas_allow_iovas - ioctl(IOMMU_IOAS_ALLOW_IOVAS) + * @size: sizeof(struct iommu_ioas_allow_iovas) + * @ioas_id: IOAS ID to allow IOVAs from + * @num_iovas: Input/Output total number of ranges in the IOAS + * @__reserved: Must be 0 + * @allowed_iovas: Pointer to array of struct iommu_iova_range + * + * Ensure a range of IOVAs are always available for allocation. If this call + * succeeds then IOMMU_IOAS_IOVA_RANGES will never return a list of IOVA ranges + * that are narrower than the ranges provided here. This call will fail if + * IOMMU_IOAS_IOVA_RANGES is currently narrower than the given ranges. + * + * When an IOAS is first created the IOVA_RANGES will be maximally sized, and as + * devices are attached the IOVA will narrow based on the device restrictions. 
+ * When an allowed range is specified any narrowing will be refused, ie device + * attachment can fail if the device requires limiting within the allowed range. + * + * Automatic IOVA allocation is also impacted by this call. MAP will only + * allocate within the allowed IOVAs if they are present. + * + * This call replaces the entire allowed list with the given list. + */ +struct iommu_ioas_allow_iovas { + __u32 size; + __u32 ioas_id; + __u32 num_iovas; + __u32 __reserved; + __aligned_u64 allowed_iovas; +}; +#define IOMMU_IOAS_ALLOW_IOVAS _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_ALLOW_IOVAS) + +/** + * enum iommufd_ioas_map_flags - Flags for map and copy + * @IOMMU_IOAS_MAP_FIXED_IOVA: If clear the kernel will compute an appropriate + * IOVA to place the mapping at + * @IOMMU_IOAS_MAP_WRITEABLE: DMA is allowed to write to this mapping + * @IOMMU_IOAS_MAP_READABLE: DMA is allowed to read from this mapping + */ +enum iommufd_ioas_map_flags { + IOMMU_IOAS_MAP_FIXED_IOVA = 1 << 0, + IOMMU_IOAS_MAP_WRITEABLE = 1 << 1, + IOMMU_IOAS_MAP_READABLE = 1 << 2, +}; + +/** + * struct iommu_ioas_map - ioctl(IOMMU_IOAS_MAP) + * @size: sizeof(struct iommu_ioas_map) + * @flags: Combination of enum iommufd_ioas_map_flags + * @ioas_id: IOAS ID to change the mapping of + * @__reserved: Must be 0 + * @user_va: Userspace pointer to start mapping from + * @length: Number of bytes to map + * @iova: IOVA the mapping was placed at. If IOMMU_IOAS_MAP_FIXED_IOVA is set + * then this must be provided as input. + * + * Set an IOVA mapping from a user pointer. If FIXED_IOVA is specified then the + * mapping will be established at iova, otherwise a suitable location based on + * the reserved and allowed lists will be automatically selected and returned in + * iova. 
+ */ +struct iommu_ioas_map { + __u32 size; + __u32 flags; + __u32 ioas_id; + __u32 __reserved; + __aligned_u64 user_va; + __aligned_u64 length; + __aligned_u64 iova; +}; +#define IOMMU_IOAS_MAP _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_MAP) + +/** + * struct iommu_ioas_copy - ioctl(IOMMU_IOAS_COPY) + * @size: sizeof(struct iommu_ioas_copy) + * @flags: Combination of enum iommufd_ioas_map_flags + * @dst_ioas_id: IOAS ID to change the mapping of + * @src_ioas_id: IOAS ID to copy from + * @length: Number of bytes to copy and map + * @dst_iova: IOVA the mapping was placed at. If IOMMU_IOAS_MAP_FIXED_IOVA is + * set then this must be provided as input. + * @src_iova: IOVA to start the copy + * + * Copy an already existing mapping from src_ioas_id and establish it in + * dst_ioas_id. The src iova/length must exactly match a range used with + * IOMMU_IOAS_MAP. + * + * This may be used to efficiently clone a subset of an IOAS to another, or as a + * kind of 'cache' to speed up mapping. Copy has an efficiency advantage over + * establishing equivalent new mappings, as internal resources are shared, and + * the kernel will pin the user memory only once. + */ +struct iommu_ioas_copy { + __u32 size; + __u32 flags; + __u32 dst_ioas_id; + __u32 src_ioas_id; + __aligned_u64 length; + __aligned_u64 dst_iova; + __aligned_u64 src_iova; +}; +#define IOMMU_IOAS_COPY _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_COPY) + +/** + * struct iommu_ioas_unmap - ioctl(IOMMU_IOAS_UNMAP) + * @size: sizeof(struct iommu_ioas_unmap) + * @ioas_id: IOAS ID to change the mapping of + * @iova: IOVA to start the unmapping at + * @length: Number of bytes to unmap, and return back the bytes unmapped + * + * Unmap an IOVA range. The iova/length must be a superset of a previously + * mapped range used with IOMMU_IOAS_MAP or IOMMU_IOAS_COPY. Splitting or + * truncating ranges is not allowed. The values 0 to U64_MAX will unmap + * everything.
+ */ +struct iommu_ioas_unmap { + __u32 size; + __u32 ioas_id; + __aligned_u64 iova; + __aligned_u64 length; +}; +#define IOMMU_IOAS_UNMAP _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_UNMAP) + +/** + * enum iommufd_option - ioctl(IOMMU_OPTION_RLIMIT_MODE) and + * ioctl(IOMMU_OPTION_HUGE_PAGES) + * @IOMMU_OPTION_RLIMIT_MODE: + * Change how RLIMIT_MEMLOCK accounting works. The caller must have privilege + * to invoke this. Value 0 (default) is user based accounting, 1 uses process + * based accounting. Global option, object_id must be 0 + * @IOMMU_OPTION_HUGE_PAGES: + * Value 1 (default) allows contiguous pages to be combined when generating + * iommu mappings. Value 0 disables combining, everything is mapped to + * PAGE_SIZE. This can be useful for benchmarking. This is a per-IOAS + * option, the object_id must be the IOAS ID. + */ +enum iommufd_option { + IOMMU_OPTION_RLIMIT_MODE = 0, + IOMMU_OPTION_HUGE_PAGES = 1, +}; + +/** + * enum iommufd_option_ops - ioctl(IOMMU_OPTION_OP_SET) and + * ioctl(IOMMU_OPTION_OP_GET) + * @IOMMU_OPTION_OP_SET: Set the option's value + * @IOMMU_OPTION_OP_GET: Get the option's value + */ +enum iommufd_option_ops { + IOMMU_OPTION_OP_SET = 0, + IOMMU_OPTION_OP_GET = 1, +}; + +/** + * struct iommu_option - iommu option multiplexer + * @size: sizeof(struct iommu_option) + * @option_id: One of enum iommufd_option + * @op: One of enum iommufd_option_ops + * @__reserved: Must be 0 + * @object_id: ID of the object if required + * @val64: Option value to set or value returned on get + * + * Change a simple option value. This multiplexer allows controlling options + * on objects. IOMMU_OPTION_OP_SET will load an option and IOMMU_OPTION_OP_GET + * will return the current value. + */ +struct iommu_option { + __u32 size; + __u32 option_id; + __u16 op; + __u16 __reserved; + __u32 object_id; + __aligned_u64 val64; +}; +#define IOMMU_OPTION _IO(IOMMUFD_TYPE, IOMMUFD_CMD_OPTION) #endif