diff mbox series

[v5,11/19] iommufd: IOCTLs for the io_pagetable

Message ID 11-v5-4001c2997bd0+30c-iommufd_jgg@nvidia.com
State Accepted
Commit aad37e71d5c4dc1d3c25734f0bcd51c324f94b5e
Headers show
Series IOMMUFD Generic interface | expand

Commit Message

Jason Gunthorpe Nov. 16, 2022, 9 p.m. UTC
Connect the IOAS to its IOCTL interface. This exposes most of the
functionality in the io_pagetable to userspace.

This is intended to be the core of the generic interface that IOMMUFD will
provide. Every IOMMU driver should be able to implement an iommu_domain
that is compatible with this generic mechanism.

It is also designed to be easy to use for simple non virtual machine
monitor users, like DPDK:
 - Universal simple support for all IOMMUs (no PPC special path)
 - An IOVA allocator that considers the aperture and the allowed/reserved
   ranges
 - io_pagetable allows any number of iommu_domains to be connected to the
   IOAS
 - Automatic allocation and re-use of iommu_domains

Along with room in the design to add non-generic features to cater to
specific HW functionality.

Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Tested-by: Yi Liu <yi.l.liu@intel.com>
Tested-by: Lixiao Yang <lixiao.yang@intel.com>
Tested-by: Matthew Rosato <mjrosato@linux.ibm.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
 drivers/iommu/iommufd/Makefile          |   1 +
 drivers/iommu/iommufd/ioas.c            | 384 ++++++++++++++++++++++++
 drivers/iommu/iommufd/iommufd_private.h |  33 ++
 drivers/iommu/iommufd/main.c            |  45 +++
 include/uapi/linux/iommufd.h            | 246 ++++++++++++++-
 5 files changed, 708 insertions(+), 1 deletion(-)
 create mode 100644 drivers/iommu/iommufd/ioas.c

Comments

Eric Auger Nov. 27, 2022, 5:49 p.m. UTC | #1
Hi Jason,

On 11/16/22 22:00, Jason Gunthorpe wrote:
> Connect the IOAS to its IOCTL interface. This exposes most of the
> functionality in the io_pagetable to userspace.
>
> This is intended to be the core of the generic interface that IOMMUFD will
> provide. Every IOMMU driver should be able to implement an iommu_domain
> that is compatible with this generic mechanism.
>
> It is also designed to be easy to use for simple non virtual machine
> monitor users, like DPDK:
>  - Universal simple support for all IOMMUs (no PPC special path)
>  - An IOVA allocator that considers the aperture and the allowed/reserved
>    ranges
>  - io_pagetable allows any number of iommu_domains to be connected to the
>    IOAS
>  - Automatic allocation and re-use of iommu_domains
>
> Along with room in the design to add non-generic features to cater to
> specific HW functionality.
>
> Tested-by: Nicolin Chen <nicolinc@nvidia.com>
> Tested-by: Yi Liu <yi.l.liu@intel.com>
> Tested-by: Lixiao Yang <lixiao.yang@intel.com>
> Tested-by: Matthew Rosato <mjrosato@linux.ibm.com>
> Reviewed-by: Kevin Tian <kevin.tian@intel.com>
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> ---
>  drivers/iommu/iommufd/Makefile          |   1 +
>  drivers/iommu/iommufd/ioas.c            | 384 ++++++++++++++++++++++++
>  drivers/iommu/iommufd/iommufd_private.h |  33 ++
>  drivers/iommu/iommufd/main.c            |  45 +++
>  include/uapi/linux/iommufd.h            | 246 ++++++++++++++-
>  5 files changed, 708 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/iommu/iommufd/ioas.c
>
> diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile
> index b66a8c47ff55ec..2b4f36f1b72f9d 100644
> --- a/drivers/iommu/iommufd/Makefile
> +++ b/drivers/iommu/iommufd/Makefile
> @@ -1,6 +1,7 @@
>  # SPDX-License-Identifier: GPL-2.0-only
>  iommufd-y := \
>  	io_pagetable.o \
> +	ioas.o \
>  	main.o \
>  	pages.o
>  
> diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c
> new file mode 100644
> index 00000000000000..7671456e86413a
> --- /dev/null
> +++ b/drivers/iommu/iommufd/ioas.c
> @@ -0,0 +1,384 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
> + */
> +#include <linux/interval_tree.h>
> +#include <linux/iommufd.h>
> +#include <linux/iommu.h>
> +#include <uapi/linux/iommufd.h>
> +
> +#include "io_pagetable.h"
> +
> +void iommufd_ioas_destroy(struct iommufd_object *obj)
> +{
> +	struct iommufd_ioas *ioas = container_of(obj, struct iommufd_ioas, obj);
> +	int rc;
> +
> +	rc = iopt_unmap_all(&ioas->iopt, NULL);
> +	WARN_ON(rc && rc != -ENOENT);
> +	iopt_destroy_table(&ioas->iopt);
> +}
> +
> +struct iommufd_ioas *iommufd_ioas_alloc(struct iommufd_ctx *ictx)
> +{
> +	struct iommufd_ioas *ioas;
> +
> +	ioas = iommufd_object_alloc(ictx, ioas, IOMMUFD_OBJ_IOAS);
> +	if (IS_ERR(ioas))
> +		return ioas;
> +
> +	iopt_init_table(&ioas->iopt);
> +	return ioas;
> +}
> +
> +int iommufd_ioas_alloc_ioctl(struct iommufd_ucmd *ucmd)
> +{
> +	struct iommu_ioas_alloc *cmd = ucmd->cmd;
> +	struct iommufd_ioas *ioas;
> +	int rc;
> +
> +	if (cmd->flags)
> +		return -EOPNOTSUPP;
> +
> +	ioas = iommufd_ioas_alloc(ucmd->ictx);
> +	if (IS_ERR(ioas))
> +		return PTR_ERR(ioas);
> +
> +	cmd->out_ioas_id = ioas->obj.id;
> +	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
> +	if (rc)
> +		goto out_table;
> +	iommufd_object_finalize(ucmd->ictx, &ioas->obj);
> +	return 0;
> +
> +out_table:
> +	iommufd_object_abort_and_destroy(ucmd->ictx, &ioas->obj);
> +	return rc;
> +}
> +
> +int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd)
> +{
> +	struct iommu_iova_range __user *ranges;
> +	struct iommu_ioas_iova_ranges *cmd = ucmd->cmd;
> +	struct iommufd_ioas *ioas;
> +	struct interval_tree_span_iter span;
> +	u32 max_iovas;
> +	int rc;
> +
> +	if (cmd->__reserved)
> +		return -EOPNOTSUPP;
> +
> +	ioas = iommufd_get_ioas(ucmd, cmd->ioas_id);
> +	if (IS_ERR(ioas))
> +		return PTR_ERR(ioas);
> +
> +	down_read(&ioas->iopt.iova_rwsem);
> +	max_iovas = cmd->num_iovas;
> +	ranges = u64_to_user_ptr(cmd->allowed_iovas);
> +	cmd->num_iovas = 0;
> +	cmd->out_iova_alignment = ioas->iopt.iova_alignment;
> +	interval_tree_for_each_span(&span, &ioas->iopt.reserved_itree, 0,
> +				    ULONG_MAX) {
> +		if (!span.is_hole)
> +			continue;
> +		if (cmd->num_iovas < max_iovas) {
> +			struct iommu_iova_range elm = {
> +				.start = span.start_hole,
> +				.last = span.last_hole,
> +			};
> +
> +			if (copy_to_user(&ranges[cmd->num_iovas], &elm,
> +					 sizeof(elm))) {
> +				rc = -EFAULT;
> +				goto out_put;
> +			}
> +		}
> +		cmd->num_iovas++;
> +	}
> +	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
> +	if (rc)
> +		goto out_put;
> +	if (cmd->num_iovas > max_iovas)
> +		rc = -EMSGSIZE;
> +out_put:
> +	up_read(&ioas->iopt.iova_rwsem);
> +	iommufd_put_object(&ioas->obj);
> +	return rc;
> +}
> +
> +static int iommufd_ioas_load_iovas(struct rb_root_cached *itree,
> +				   struct iommu_iova_range __user *ranges,
> +				   u32 num)
> +{
> +	u32 i;
> +
> +	for (i = 0; i != num; i++) {
shouldn't it be < ?
> +		struct iommu_iova_range range;
> +		struct iopt_allowed *allowed;
> +
> +		if (copy_from_user(&range, ranges + i, sizeof(range)))
> +			return -EFAULT;
> +
> +		if (range.start >= range.last)
> +			return -EINVAL;
> +
> +		if (interval_tree_iter_first(itree, range.start, range.last))
> +			return -EINVAL;
> +
> +		allowed = kzalloc(sizeof(*allowed), GFP_KERNEL_ACCOUNT);
> +		if (!allowed)
> +			return -ENOMEM;
> +		allowed->node.start = range.start;
> +		allowed->node.last = range.last;
> +
> +		interval_tree_insert(&allowed->node, itree);
> +	}
> +	return 0;
> +}
> +
> +int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd)
> +{
> +	struct iommu_ioas_allow_iovas *cmd = ucmd->cmd;
> +	struct rb_root_cached allowed_iova = RB_ROOT_CACHED;
> +	struct interval_tree_node *node;
> +	struct iommufd_ioas *ioas;
> +	struct io_pagetable *iopt;
> +	int rc = 0;
> +
> +	if (cmd->__reserved)
> +		return -EOPNOTSUPP;
> +
> +	ioas = iommufd_get_ioas(ucmd, cmd->ioas_id);
> +	if (IS_ERR(ioas))
> +		return PTR_ERR(ioas);
> +	iopt = &ioas->iopt;
> +
> +	rc = iommufd_ioas_load_iovas(&allowed_iova,
> +				     u64_to_user_ptr(cmd->allowed_iovas),
> +				     cmd->num_iovas);
> +	if (rc)
> +		goto out_free;
> +
> +	rc = iopt_set_allow_iova(iopt, &allowed_iova);
Please can you add a comment about why you need to proceed in 2 steps,
ie. add the ranges in a first tree and then 'swap' to the
iopt->allowed_tree (and eventually delete the first tree)?
> +out_free:
> +	while ((node = interval_tree_iter_first(&allowed_iova, 0, ULONG_MAX))) {
> +		interval_tree_remove(node, &allowed_iova);
> +		kfree(container_of(node, struct iopt_allowed, node));
> +	}
> +	iommufd_put_object(&ioas->obj);
> +	return rc;
> +}
> +
> +static int conv_iommu_prot(u32 map_flags)
> +{
> +	int iommu_prot;
> +
> +	/*
> +	 * We provide no manual cache coherency ioctls to userspace and most
> +	 * architectures make the CPU ops for cache flushing privileged.
> +	 * Therefore we require the underlying IOMMU to support CPU coherent
> +	 * operation. Support for IOMMU_CACHE is enforced by the
> +	 * IOMMU_CAP_CACHE_COHERENCY test during bind.
> +	 */
> +	iommu_prot = IOMMU_CACHE;
at init?
> +	if (map_flags & IOMMU_IOAS_MAP_WRITEABLE)
> +		iommu_prot |= IOMMU_WRITE;
> +	if (map_flags & IOMMU_IOAS_MAP_READABLE)
> +		iommu_prot |= IOMMU_READ;
> +	return iommu_prot;
> +}
> +
> +int iommufd_ioas_map(struct iommufd_ucmd *ucmd)
> +{
> +	struct iommu_ioas_map *cmd = ucmd->cmd;
> +	struct iommufd_ioas *ioas;
> +	unsigned int flags = 0;
> +	unsigned long iova;
> +	int rc;
> +
> +	if ((cmd->flags &
> +	     ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE |
> +	       IOMMU_IOAS_MAP_READABLE)) ||
> +	    cmd->__reserved)
> +		return -EOPNOTSUPP;
> +	if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX)
> +		return -EOVERFLOW;
> +
> +	ioas = iommufd_get_ioas(ucmd, cmd->ioas_id);
> +	if (IS_ERR(ioas))
> +		return PTR_ERR(ioas);
> +
> +	if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA))
> +		flags = IOPT_ALLOC_IOVA;
> +	iova = cmd->iova;
can be done either at initialization or only if MAP_FIXED_IOVA.
> +	rc = iopt_map_user_pages(ucmd->ictx, &ioas->iopt, &iova,
> +				 u64_to_user_ptr(cmd->user_va), cmd->length,
> +				 conv_iommu_prot(cmd->flags), flags);
> +	if (rc)
> +		goto out_put;
> +
> +	cmd->iova = iova;
> +	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
> +out_put:
> +	iommufd_put_object(&ioas->obj);
> +	return rc;
> +}
> +
> +int iommufd_ioas_copy(struct iommufd_ucmd *ucmd)
> +{
> +	struct iommu_ioas_copy *cmd = ucmd->cmd;
> +	struct iommufd_ioas *src_ioas;
> +	struct iommufd_ioas *dst_ioas;
> +	unsigned int flags = 0;
> +	LIST_HEAD(pages_list);
> +	unsigned long iova;
> +	int rc;
> +
> +	if ((cmd->flags &
> +	     ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE |
> +	       IOMMU_IOAS_MAP_READABLE)))
> +		return -EOPNOTSUPP;
> +	if (cmd->length >= ULONG_MAX || cmd->src_iova >= ULONG_MAX ||
> +	    cmd->dst_iova >= ULONG_MAX)
> +		return -EOVERFLOW;
> +
> +	src_ioas = iommufd_get_ioas(ucmd, cmd->src_ioas_id);
> +	if (IS_ERR(src_ioas))
> +		return PTR_ERR(src_ioas);
> +	rc = iopt_get_pages(&src_ioas->iopt, cmd->src_iova, cmd->length,
> +			    &pages_list);
> +	iommufd_put_object(&src_ioas->obj);
> +	if (rc)
> +		return rc;
> +
> +	dst_ioas = iommufd_get_ioas(ucmd, cmd->dst_ioas_id);
> +	if (IS_ERR(dst_ioas)) {
> +		rc = PTR_ERR(dst_ioas);
> +		goto out_pages;
> +	}
> +
> +	if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA))
> +		flags = IOPT_ALLOC_IOVA;
> +	iova = cmd->dst_iova;
> +	rc = iopt_map_pages(&dst_ioas->iopt, &pages_list, cmd->length, &iova,
> +			    conv_iommu_prot(cmd->flags), flags);
> +	if (rc)
> +		goto out_put_dst;
> +
> +	cmd->dst_iova = iova;
> +	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
> +out_put_dst:
> +	iommufd_put_object(&dst_ioas->obj);
> +out_pages:
> +	iopt_free_pages_list(&pages_list);
> +	return rc;
> +}
> +
> +int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd)
> +{
> +	struct iommu_ioas_unmap *cmd = ucmd->cmd;
> +	struct iommufd_ioas *ioas;
> +	unsigned long unmapped = 0;
> +	int rc;
> +
> +	ioas = iommufd_get_ioas(ucmd, cmd->ioas_id);
> +	if (IS_ERR(ioas))
> +		return PTR_ERR(ioas);
> +
> +	if (cmd->iova == 0 && cmd->length == U64_MAX) {
> +		rc = iopt_unmap_all(&ioas->iopt, &unmapped);
> +		if (rc)
> +			goto out_put;
> +	} else {
> +		if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX) {
> +			rc = -EOVERFLOW;
> +			goto out_put;
> +		}
> +		rc = iopt_unmap_iova(&ioas->iopt, cmd->iova, cmd->length,
> +				     &unmapped);
> +		if (rc)
> +			goto out_put;
> +	}
> +
> +	cmd->length = unmapped;
> +	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
> +
> +out_put:
> +	iommufd_put_object(&ioas->obj);
> +	return rc;
> +}
> +
> +int iommufd_option_rlimit_mode(struct iommu_option *cmd,
> +			       struct iommufd_ctx *ictx)
> +{
*object_id  and __reserved should be checked as per the uapi doc*
> +	if (cmd->op == IOMMU_OPTION_OP_GET) {
> +		cmd->val64 = ictx->account_mode == IOPT_PAGES_ACCOUNT_MM;
> +		return 0;
> +	}
> +	if (cmd->op == IOMMU_OPTION_OP_SET) {
> +		int rc = 0;
> +
> +		if (!capable(CAP_SYS_RESOURCE))
> +			return -EPERM;
> +
> +		xa_lock(&ictx->objects);
> +		if (!xa_empty(&ictx->objects)) {
> +			rc = -EBUSY;
> +		} else {
> +			if (cmd->val64 == 0)
> +				ictx->account_mode = IOPT_PAGES_ACCOUNT_USER;
> +			else if (cmd->val64 == 1)
> +				ictx->account_mode = IOPT_PAGES_ACCOUNT_MM;
> +			else
> +				rc = -EINVAL;
> +		}
> +		xa_unlock(&ictx->objects);
> +
> +		return rc;
> +	}
> +	return -EOPNOTSUPP;
> +}
> +
> +static int iommufd_ioas_option_huge_pages(struct iommu_option *cmd,
> +					  struct iommufd_ioas *ioas)
> +{
> +	if (cmd->op == IOMMU_OPTION_OP_GET) {
> +		cmd->val64 = !ioas->iopt.disable_large_pages;
> +		return 0;
> +	}
> +	if (cmd->op == IOMMU_OPTION_OP_SET) {
> +		if (cmd->val64 == 0)
> +			return iopt_disable_large_pages(&ioas->iopt);
> +		if (cmd->val64 == 1) {
> +			iopt_enable_large_pages(&ioas->iopt);
> +			return 0;
> +		}
> +		return -EINVAL;
> +	}
> +	return -EOPNOTSUPP;
> +}
> +
> +int iommufd_ioas_option(struct iommufd_ucmd *ucmd)
> +{
> +	struct iommu_option *cmd = ucmd->cmd;
> +	struct iommufd_ioas *ioas;
> +	int rc = 0;
> +
> +	if (cmd->__reserved)
> +		return -EOPNOTSUPP;
> +
> +	ioas = iommufd_get_ioas(ucmd, cmd->object_id);
> +	if (IS_ERR(ioas))
> +		return PTR_ERR(ioas);
> +
> +	switch (cmd->option_id) {
> +	case IOMMU_OPTION_HUGE_PAGES:
> +		rc = iommufd_ioas_option_huge_pages(cmd, ioas);
> +		break;
> +	default:
> +		rc = -EOPNOTSUPP;
> +	}
> +
> +	iommufd_put_object(&ioas->obj);
> +	return rc;
> +}
> diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
> index dadd90cae543ba..6721332dbbba03 100644
> --- a/drivers/iommu/iommufd/iommufd_private.h
> +++ b/drivers/iommu/iommufd/iommufd_private.h
> @@ -11,6 +11,7 @@
>  
>  struct iommu_domain;
>  struct iommu_group;
> +struct iommu_option;
>  
>  struct iommufd_ctx {
>  	struct file *file;
> @@ -102,6 +103,7 @@ static inline int iommufd_ucmd_respond(struct iommufd_ucmd *ucmd,
>  enum iommufd_object_type {
>  	IOMMUFD_OBJ_NONE,
>  	IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE,
> +	IOMMUFD_OBJ_IOAS,
>  };
>  
>  /* Base struct for all objects with a userspace ID handle. */
> @@ -174,6 +176,37 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
>  			     type),                                            \
>  		     typeof(*(ptr)), obj)
>  
> +/*
> + * The IO Address Space (IOAS) pagetable is a virtual page table backed by the
> + * io_pagetable object. It is a user controlled mapping of IOVA -> PFNs. The
> + * mapping is copied into all of the associated domains and made available to
> + * in-kernel users.
> + */
> +struct iommufd_ioas {
> +	struct iommufd_object obj;
> +	struct io_pagetable iopt;
> +};
> +
> +static inline struct iommufd_ioas *iommufd_get_ioas(struct iommufd_ucmd *ucmd,
> +						    u32 id)
> +{
> +	return container_of(iommufd_get_object(ucmd->ictx, id,
> +					       IOMMUFD_OBJ_IOAS),
> +			    struct iommufd_ioas, obj);
> +}
> +
> +struct iommufd_ioas *iommufd_ioas_alloc(struct iommufd_ctx *ictx);
> +int iommufd_ioas_alloc_ioctl(struct iommufd_ucmd *ucmd);
> +void iommufd_ioas_destroy(struct iommufd_object *obj);
> +int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd);
> +int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd);
> +int iommufd_ioas_map(struct iommufd_ucmd *ucmd);
> +int iommufd_ioas_copy(struct iommufd_ucmd *ucmd);
> +int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd);
> +int iommufd_ioas_option(struct iommufd_ucmd *ucmd);
> +int iommufd_option_rlimit_mode(struct iommu_option *cmd,
> +			       struct iommufd_ctx *ictx);
> +
>  struct iommufd_access {
>  	unsigned long iova_alignment;
>  	u32 iopt_access_list_id;
> diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
> index 3a705cadb85020..266109045537ed 100644
> --- a/drivers/iommu/iommufd/main.c
> +++ b/drivers/iommu/iommufd/main.c
> @@ -202,8 +202,36 @@ static int iommufd_fops_release(struct inode *inode, struct file *filp)
>  	return 0;
>  }
>  
> +static int iommufd_option(struct iommufd_ucmd *ucmd)
> +{
> +	struct iommu_option *cmd = ucmd->cmd;
> +	int rc;
__reserved can be checked here instead.
> +	switch (cmd->option_id) {
> +	case IOMMU_OPTION_RLIMIT_MODE:
> +		rc = iommufd_option_rlimit_mode(cmd, ucmd->ictx);
> +		break;
> +	case IOMMU_OPTION_HUGE_PAGES:
> +		rc = iommufd_ioas_option(ucmd);
> +		break;
> +	default:
> +		return -EOPNOTSUPP;
> +	}
> +	if (rc)
> +		return rc;
> +	if (copy_to_user(&((struct iommu_option __user *)ucmd->ubuffer)->val64,
> +			 &cmd->val64, sizeof(cmd->val64)))
> +		return -EFAULT;
> +	return 0;
> +}
> +
>  union ucmd_buffer {
>  	struct iommu_destroy destroy;
> +	struct iommu_ioas_alloc alloc;
> +	struct iommu_ioas_allow_iovas allow_iovas;
> +	struct iommu_ioas_iova_ranges iova_ranges;
> +	struct iommu_ioas_map map;
> +	struct iommu_ioas_unmap unmap;
>  };
>  
>  struct iommufd_ioctl_op {
> @@ -224,6 +252,20 @@ struct iommufd_ioctl_op {
>  	}
>  static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
>  	IOCTL_OP(IOMMU_DESTROY, iommufd_destroy, struct iommu_destroy, id),
> +	IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl,
> +		 struct iommu_ioas_alloc, out_ioas_id),
> +	IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas,
> +		 struct iommu_ioas_allow_iovas, allowed_iovas),
> +	IOCTL_OP(IOMMU_IOAS_COPY, iommufd_ioas_copy, struct iommu_ioas_copy,
> +		 src_iova),
> +	IOCTL_OP(IOMMU_IOAS_IOVA_RANGES, iommufd_ioas_iova_ranges,
> +		 struct iommu_ioas_iova_ranges, out_iova_alignment),
> +	IOCTL_OP(IOMMU_IOAS_MAP, iommufd_ioas_map, struct iommu_ioas_map,
> +		 iova),
> +	IOCTL_OP(IOMMU_IOAS_UNMAP, iommufd_ioas_unmap, struct iommu_ioas_unmap,
> +		 length),
> +	IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option,
> +		 val64),
>  };
>  
>  static long iommufd_fops_ioctl(struct file *filp, unsigned int cmd,
> @@ -310,6 +352,9 @@ void iommufd_ctx_put(struct iommufd_ctx *ictx)
>  EXPORT_SYMBOL_NS_GPL(iommufd_ctx_put, IOMMUFD);
>  
>  static const struct iommufd_object_ops iommufd_object_ops[] = {
> +	[IOMMUFD_OBJ_IOAS] = {
> +		.destroy = iommufd_ioas_destroy,
> +	},
>  };
>  
>  static struct miscdevice iommu_misc_dev = {
> diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
> index 2ad06b27a35fe5..9e9250dfc4fb1b 100644
> --- a/include/uapi/linux/iommufd.h
> +++ b/include/uapi/linux/iommufd.h
> @@ -37,12 +37,19 @@
>  enum {
>  	IOMMUFD_CMD_BASE = 0x80,
>  	IOMMUFD_CMD_DESTROY = IOMMUFD_CMD_BASE,
> +	IOMMUFD_CMD_IOAS_ALLOC,
> +	IOMMUFD_CMD_IOAS_ALLOW_IOVAS,
> +	IOMMUFD_CMD_IOAS_COPY,
> +	IOMMUFD_CMD_IOAS_IOVA_RANGES,
> +	IOMMUFD_CMD_IOAS_MAP,
> +	IOMMUFD_CMD_IOAS_UNMAP,
> +	IOMMUFD_CMD_OPTION,
>  };
>  
>  /**
>   * struct iommu_destroy - ioctl(IOMMU_DESTROY)
>   * @size: sizeof(struct iommu_destroy)
> - * @id: iommufd object ID to destroy. Can by any destroyable object type.
> + * @id: iommufd object ID to destroy. Can be any destroyable object type.
>   *
>   * Destroy any object held within iommufd.
>   */
> @@ -52,4 +59,241 @@ struct iommu_destroy {
>  };
>  #define IOMMU_DESTROY _IO(IOMMUFD_TYPE, IOMMUFD_CMD_DESTROY)
>  
> +/**
> + * struct iommu_ioas_alloc - ioctl(IOMMU_IOAS_ALLOC)
> + * @size: sizeof(struct iommu_ioas_alloc)
> + * @flags: Must be 0
> + * @out_ioas_id: Output IOAS ID for the allocated object
> + *
> + * Allocate an IO Address Space (IOAS) which holds an IO Virtual Address (IOVA)
> + * to memory mapping.
> + */
> +struct iommu_ioas_alloc {
> +	__u32 size;
> +	__u32 flags;
> +	__u32 out_ioas_id;
> +};
> +#define IOMMU_IOAS_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_ALLOC)
> +
> +/**
> + * struct iommu_iova_range - ioctl(IOMMU_IOVA_RANGE)
> + * @start: First IOVA
> + * @last: Inclusive last IOVA
> + *
> + * An interval in IOVA space.
> + */
> +struct iommu_iova_range {
> +	__aligned_u64 start;
> +	__aligned_u64 last;
> +};
> +
> +/**
> + * struct iommu_ioas_iova_ranges - ioctl(IOMMU_IOAS_IOVA_RANGES)
> + * @size: sizeof(struct iommu_ioas_iova_ranges)
> + * @ioas_id: IOAS ID to read ranges from
> + * @num_iovas: Input/Output total number of ranges in the IOAS
> + * @__reserved: Must be 0
> + * @allowed_iovas: Pointer to the output array of struct iommu_iova_range
> + * @out_iova_alignment: Minimum alignment required for mapping IOVA
> + *
> + * Query an IOAS for ranges of allowed IOVAs. Mapping IOVA outside these ranges
> + * is not allowed. num_iovas will be set to the total number of iovas and
> + * the allowed_iovas[] will be filled in as space permits.
> + *
> + * The allowed ranges are dependent on the HW path the DMA operation takes, and
> + * can change during the lifetime of the IOAS. A fresh empty IOAS will have a
> + * full range, and each attached device will narrow the ranges based on that
> + * device's HW restrictions. Detatching a device can widen the ranges. Userspace
detaching
> + * should query ranges after every attach/detatch to know what IOVAs are valid
detach
> + * for mapping.
> + *
> + * On input num_iovas is the length of the allowed_iovas array. On output it is
> + * the total number of iovas filled in. The ioctl will return -EMSGSIZE and set
> + * num_iovas to the required value if num_iovas is too small. In this case the
> + * caller should allocate a larger output array and re-issue the ioctl.
> + */
> +struct iommu_ioas_iova_ranges {
> +	__u32 size;
> +	__u32 ioas_id;
> +	__u32 num_iovas;
> +	__u32 __reserved;
> +	__aligned_u64 allowed_iovas;
> +	__aligned_u64 out_iova_alignment;
document @out_iova_alignment?
> +};
> +#define IOMMU_IOAS_IOVA_RANGES _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_IOVA_RANGES)
> +
> +/**
> + * struct iommu_ioas_allow_iovas - ioctl(IOMMU_IOAS_ALLOW_IOVAS)
> + * @size: sizeof(struct iommu_ioas_allow_iovas)
> + * @ioas_id: IOAS ID to allow IOVAs from
> + * @num_iovas: Input/Output total number of ranges in the IOAS
> + * @__reserved: Must be 0
> + * @allowed_iovas: Pointer to array of struct iommu_iova_range
> + *
> + * Ensure a range of IOVAs are always available for allocation. If this call
> + * succeeds then IOMMU_IOAS_IOVA_RANGES will never return a list of IOVA ranges
> + * that are narrower than the ranges provided here. This call will fail if
> + * IOMMU_IOAS_IOVA_RANGES is currently narrower than the given ranges.
> + *
> + * When an IOAS is first created the IOVA_RANGES will be maximally sized, and as
> + * devices are attached the IOVA will narrow based on the device restrictions.
> + * When an allowed range is specified any narrowing will be refused, ie device
> + * attachment can fail if the device requires limiting within the allowed range.
> + *
> + * Automatic IOVA allocation is also impacted by this call. MAP will only
> + * allocate within the allowed IOVAs if they are present.
> + *
> + * This call replaces the entire allowed list with the given list.
> + */
> +struct iommu_ioas_allow_iovas {
> +	__u32 size;
> +	__u32 ioas_id;
> +	__u32 num_iovas;
> +	__u32 __reserved;
> +	__aligned_u64 allowed_iovas;
> +};
> +#define IOMMU_IOAS_ALLOW_IOVAS _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_ALLOW_IOVAS)
> +
> +/**
> + * enum iommufd_ioas_map_flags - Flags for map and copy
> + * @IOMMU_IOAS_MAP_FIXED_IOVA: If clear the kernel will compute an appropriate
> + *                             IOVA to place the mapping at
> + * @IOMMU_IOAS_MAP_WRITEABLE: DMA is allowed to write to this mapping
> + * @IOMMU_IOAS_MAP_READABLE: DMA is allowed to read from this mapping
> + */
> +enum iommufd_ioas_map_flags {
> +	IOMMU_IOAS_MAP_FIXED_IOVA = 1 << 0,
> +	IOMMU_IOAS_MAP_WRITEABLE = 1 << 1,
> +	IOMMU_IOAS_MAP_READABLE = 1 << 2,
> +};
> +
> +/**
> + * struct iommu_ioas_map - ioctl(IOMMU_IOAS_MAP)
> + * @size: sizeof(struct iommu_ioas_map)
> + * @flags: Combination of enum iommufd_ioas_map_flags
> + * @ioas_id: IOAS ID to change the mapping of
> + * @__reserved: Must be 0
> + * @user_va: Userspace pointer to start mapping from
> + * @length: Number of bytes to map
> + * @iova: IOVA the mapping was placed at. If IOMMU_IOAS_MAP_FIXED_IOVA is set
> + *        then this must be provided as input.
> + *
> + * Set an IOVA mapping from a user pointer. If FIXED_IOVA is specified then the
> + * mapping will be established at iova, otherwise a suitable location based on
> + * the reserved and allowed lists will be automatically selected and returned in
> + * iova.
You do not mention anything about the fact the IOCTL cannot be called
twice for a given @user_va w/ FIXED_IOVA
Refering to VFIO_DMA_MAP_FLAG_VADDR.
> + */
> +struct iommu_ioas_map {
> +	__u32 size;
> +	__u32 flags;
> +	__u32 ioas_id;
> +	__u32 __reserved;
> +	__aligned_u64 user_va;
> +	__aligned_u64 length;
> +	__aligned_u64 iova;
> +};
> +#define IOMMU_IOAS_MAP _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_MAP)
> +
> +/**
> + * struct iommu_ioas_copy - ioctl(IOMMU_IOAS_COPY)
> + * @size: sizeof(struct iommu_ioas_copy)
> + * @flags: Combination of enum iommufd_ioas_map_flags
> + * @dst_ioas_id: IOAS ID to change the mapping of
> + * @src_ioas_id: IOAS ID to copy from
> + * @length: Number of bytes to copy and map
> + * @dst_iova: IOVA the mapping was placed at. If IOMMU_IOAS_MAP_FIXED_IOVA is
> + *            set then this must be provided as input.
> + * @src_iova: IOVA to start the copy
> + *
> + * Copy an already existing mapping from src_ioas_id and establish it in
> + * dst_ioas_id. The src iova/length must exactly match a range used with
> + * IOMMU_IOAS_MAP.
> + *
> + * This may be used to efficiently clone a subset of an IOAS to another, or as a
> + * kind of 'cache' to speed up mapping. Copy has an effciency advantage over
efficiency
> + * establishing equivalent new mappings, as internal resources are shared, and
> + * the kernel will pin the user memory only once.
> + */
> +struct iommu_ioas_copy {
> +	__u32 size;
> +	__u32 flags;
> +	__u32 dst_ioas_id;
> +	__u32 src_ioas_id;
is src_ioas_id == dst_ioas_id allowed?
> +	__aligned_u64 length;
> +	__aligned_u64 dst_iova;
> +	__aligned_u64 src_iova;
> +};
> +#define IOMMU_IOAS_COPY _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_COPY)
> +
> +/**
> + * struct iommu_ioas_unmap - ioctl(IOMMU_IOAS_UNMAP)
> + * @size: sizeof(struct iommu_ioas_unmap)
> + * @ioas_id: IOAS ID to change the mapping of
> + * @iova: IOVA to start the unmapping at
> + * @length: Number of bytes to unmap, and return back the bytes unmapped
> + *
> + * Unmap an IOVA range. The iova/length must be a superset of a previously
> + * mapped range used with IOMMU_IOAS_MAP or IOMMU_IOAS_COPY. Splitting or
> + * truncating ranges is not allowed. The values 0 to U64_MAX will unmap
> + * everything.
> + */
> +struct iommu_ioas_unmap {
> +	__u32 size;
> +	__u32 ioas_id;
> +	__aligned_u64 iova;
> +	__aligned_u64 length;
> +};
> +#define IOMMU_IOAS_UNMAP _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_UNMAP)
> +
> +/**
> + * enum iommufd_option - ioctl(IOMMU_OPTION_RLIMIT_MODE) and
> + *                       ioctl(IOMMU_OPTION_HUGE_PAGES)
> + * @IOMMU_OPTION_RLIMIT_MODE:
> + *    Change how RLIMIT_MEMLOCK accounting works. The caller must have privilege
> + *    to invoke this. Value 0 (default) is user based accouting, 1 uses process
> + *    based accounting. Global option, object_id must be 0
> + * @IOMMU_OPTION_HUGE_PAGES:
> + *    Value 1 (default) allows contiguous pages to be combined when generating
> + *    iommu mappings. Value 0 disables combining, everything is mapped to
> + *    PAGE_SIZE. This can be useful for benchmarking.  This is a per-IOAS
> + *    option, the object_id must be the IOAS ID.
> + */
> +enum iommufd_option {
> +	IOMMU_OPTION_RLIMIT_MODE = 0,
> +	IOMMU_OPTION_HUGE_PAGES = 1,
> +};
> +
> +/**
> + * enum iommufd_option_ops - ioctl(IOMMU_OPTION_OP_SET) and
> + *                           ioctl(IOMMU_OPTION_OP_GET)
> + * @IOMMU_OPTION_OP_SET: Set the option's value
> + * @IOMMU_OPTION_OP_GET: Get the option's value
> + */
> +enum iommufd_option_ops {
> +	IOMMU_OPTION_OP_SET = 0,
> +	IOMMU_OPTION_OP_GET = 1,
> +};
> +
> +/**
> + * struct iommu_option - iommu option multiplexer
> + * @size: sizeof(struct iommu_option)
> + * @option_id: One of enum iommufd_option
> + * @op: One of enum iommufd_option_ops
> + * @__reserved: Must be 0
> + * @object_id: ID of the object if required
> + * @val64: Option value to set or value returned on get
> + *
> + * Change a simple option value. This multiplexor allows controlling a options
s/a options/options
> + * on objects. IOMMU_OPTION_OP_SET will load an option and IOMMU_OPTION_OP_GET
> + * will return the current value.
> + */
> +struct iommu_option {
> +	__u32 size;
> +	__u32 option_id;
> +	__u16 op;
> +	__u16 __reserved;
> +	__u32 object_id;
> +	__aligned_u64 val64;
> +};
> +#define IOMMU_OPTION _IO(IOMMUFD_TYPE, IOMMUFD_CMD_OPTION)
>  #endif
Thanks

Eric
Tian, Kevin Nov. 28, 2022, 9:05 a.m. UTC | #2
> From: Eric Auger <eric.auger@redhat.com>
> Sent: Monday, November 28, 2022 1:49 AM
> > +struct iommu_ioas_copy {
> > +	__u32 size;
> > +	__u32 flags;
> > +	__u32 dst_ioas_id;
> > +	__u32 src_ioas_id;
> is src_ioas_id == dst_ioas_id allowed?

I suppose so. iommufd_ioas_copy() simply gets a reference to
underlying iopt_pages according to [src_ioas_id, src_iova] and
then map it into [dst_ioas_id, dst_iova]. It doesn’t matter
whether iopt_pages comes from a same or different ioas.

The only restriction is that the src/dst ranges don't overlap.
Jason Gunthorpe Nov. 28, 2022, 6:11 p.m. UTC | #3
On Mon, Nov 28, 2022 at 09:05:48AM +0000, Tian, Kevin wrote:
> > From: Eric Auger <eric.auger@redhat.com>
> > Sent: Monday, November 28, 2022 1:49 AM
> > > +struct iommu_ioas_copy {
> > > +	__u32 size;
> > > +	__u32 flags;
> > > +	__u32 dst_ioas_id;
> > > +	__u32 src_ioas_id;
> > is src_ioas_id == dst_ioas_id allowed?
> 
> I suppose so. iommufd_ioas_copy() simply gets a reference to
> underlying iopt_pages according to [src_ioas_id, src_iova] and
> then map it into [dst_ioas_id, dst_iova]. It doesn’t matter
> whether iopt_pages comes from a same or different ioas.
> 
> The only restriction is that the src/dst ranges don't overlap.

Right

Jason
Jason Gunthorpe Nov. 28, 2022, 6:27 p.m. UTC | #4
On Sun, Nov 27, 2022 at 06:49:29PM +0100, Eric Auger wrote:

> > +static int iommufd_ioas_load_iovas(struct rb_root_cached *itree,
> > +				   struct iommu_iova_range __user *ranges,
> > +				   u32 num)
> > +{
> > +	u32 i;
> > +
> > +	for (i = 0; i != num; i++) {

> shouldn't it be < ?

It is logically equivalent

> > +int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd)
> > +{
> > +	struct iommu_ioas_allow_iovas *cmd = ucmd->cmd;
> > +	struct rb_root_cached allowed_iova = RB_ROOT_CACHED;
> > +	struct interval_tree_node *node;
> > +	struct iommufd_ioas *ioas;
> > +	struct io_pagetable *iopt;
> > +	int rc = 0;
> > +
> > +	if (cmd->__reserved)
> > +		return -EOPNOTSUPP;
> > +
> > +	ioas = iommufd_get_ioas(ucmd, cmd->ioas_id);
> > +	if (IS_ERR(ioas))
> > +		return PTR_ERR(ioas);
> > +	iopt = &ioas->iopt;
> > +
> > +	rc = iommufd_ioas_load_iovas(&allowed_iova,
> > +				     u64_to_user_ptr(cmd->allowed_iovas),
> > +				     cmd->num_iovas);
> > +	if (rc)
> > +		goto out_free;
> > +
> > +	rc = iopt_set_allow_iova(iopt, &allowed_iova);
> Please can you add a comment about why you need to proceed in 2 steps,
> ie. add the ranges in a first tree and then 'swap' to the
> iopt->allowed_tree (and eventually delete the first tree)?

Sure

	/*
	 * We want the allowed tree update to be atomic, so we have to keep the
	 * original nodes around, and keep track of the new nodes as we allocate
	 * memory for them. The simplest solution is to have a new/old tree and
	 * then swap new for old. On success we free the old tree, on failure we
	 * free the new tree.
	 */

> > +static int conv_iommu_prot(u32 map_flags)
> > +{
> > +	int iommu_prot;
> > +
> > +	/*
> > +	 * We provide no manual cache coherency ioctls to userspace and most
> > +	 * architectures make the CPU ops for cache flushing privileged.
> > +	 * Therefore we require the underlying IOMMU to support CPU coherent
> > +	 * operation. Support for IOMMU_CACHE is enforced by the
> > +	 * IOMMU_CAP_CACHE_COHERENCY test during bind.
> > +	 */
> > +	iommu_prot = IOMMU_CACHE;
> at init?

done

> > +int iommufd_ioas_map(struct iommufd_ucmd *ucmd)
> > +{
> > +	struct iommu_ioas_map *cmd = ucmd->cmd;
> > +	struct iommufd_ioas *ioas;
> > +	unsigned int flags = 0;
> > +	unsigned long iova;
> > +	int rc;
> > +
> > +	if ((cmd->flags &
> > +	     ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE |
> > +	       IOMMU_IOAS_MAP_READABLE)) ||
> > +	    cmd->__reserved)
> > +		return -EOPNOTSUPP;
> > +	if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX)
> > +		return -EOVERFLOW;
> > +
> > +	ioas = iommufd_get_ioas(ucmd, cmd->ioas_id);
> > +	if (IS_ERR(ioas))
> > +		return PTR_ERR(ioas);
> > +
> > +	if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA))
> > +		flags = IOPT_ALLOC_IOVA;
> > +	iova = cmd->iova;
> can be done either at initialization or only if MAP_FIXED_IOVA.

Done


> > +int iommufd_option_rlimit_mode(struct iommu_option *cmd,
> > +			       struct iommufd_ctx *ictx)
> > +{
> *object_id  and __reserved should be checked as per the uapi doc*

Ohh, yes, thanks:

@@ -317,6 +322,9 @@ int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd)
 int iommufd_option_rlimit_mode(struct iommu_option *cmd,
                               struct iommufd_ctx *ictx)
 {
+       if (cmd->object_id)
+               return -EOPNOTSUPP;
+
        if (cmd->op == IOMMU_OPTION_OP_GET) {
                cmd->val64 = ictx->account_mode == IOPT_PAGES_ACCOUNT_MM;
                return 0;
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index de5cc01023c0c5..bcb463e581009c 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -215,6 +215,9 @@ static int iommufd_option(struct iommufd_ucmd *ucmd)
        struct iommu_option *cmd = ucmd->cmd;
        int rc;
 
+       if (cmd->__reserved)
+               return -EOPNOTSUPP;
+
        switch (cmd->option_id) {
        case IOMMU_OPTION_RLIMIT_MODE:
                rc = iommufd_option_rlimit_mode(cmd, ucmd->ictx);

> > +/**
> > + * struct iommu_ioas_iova_ranges - ioctl(IOMMU_IOAS_IOVA_RANGES)
> > + * @size: sizeof(struct iommu_ioas_iova_ranges)
> > + * @ioas_id: IOAS ID to read ranges from
> > + * @num_iovas: Input/Output total number of ranges in the IOAS
> > + * @__reserved: Must be 0
> > + * @allowed_iovas: Pointer to the output array of struct iommu_iova_range
> > + * @out_iova_alignment: Minimum alignment required for mapping IOVA
> > + *
> > + * Query an IOAS for ranges of allowed IOVAs. Mapping IOVA outside these ranges
> > + * is not allowed. num_iovas will be set to the total number of iovas and
> > + * the allowed_iovas[] will be filled in as space permits.
> > + *
> > + * The allowed ranges are dependent on the HW path the DMA operation takes, and
> > + * can change during the lifetime of the IOAS. A fresh empty IOAS will have a
> > + * full range, and each attached device will narrow the ranges based on that
> > + * device's HW restrictions. Detatching a device can widen the ranges. Userspace
> detaching
> > + * should query ranges after every attach/detatch to know what IOVAs are valid
> detach

Done

> > + * for mapping.
> > + *
> > + * On input num_iovas is the length of the allowed_iovas array. On output it is
> > + * the total number of iovas filled in. The ioctl will return -EMSGSIZE and set
> > + * num_iovas to the required value if num_iovas is too small. In this case the
> > + * caller should allocate a larger output array and re-issue the ioctl.
> > + */
> > +struct iommu_ioas_iova_ranges {
> > +	__u32 size;
> > +	__u32 ioas_id;
> > +	__u32 num_iovas;
> > +	__u32 __reserved;
> > +	__aligned_u64 allowed_iovas;
> > +	__aligned_u64 out_iova_alignment;
> document @out_iova_alignment?

 * out_iova_alignment returns the minimum IOVA alignment that can be given
 * to IOMMU_IOAS_MAP/COPY. IOVA's must satisfy:
 *   starting_iova % out_iova_alignment == 0
 *   (starting_iova + length) % out_iova_alignment == 0
 * out_iova_alignment can be 1 indicating any IOVA is allowed. It cannot
 * be higher than the system PAGE_SIZE.

> > +/**
> > + * struct iommu_ioas_map - ioctl(IOMMU_IOAS_MAP)
> > + * @size: sizeof(struct iommu_ioas_map)
> > + * @flags: Combination of enum iommufd_ioas_map_flags
> > + * @ioas_id: IOAS ID to change the mapping of
> > + * @__reserved: Must be 0
> > + * @user_va: Userspace pointer to start mapping from
> > + * @length: Number of bytes to map
> > + * @iova: IOVA the mapping was placed at. If IOMMU_IOAS_MAP_FIXED_IOVA is set
> > + *        then this must be provided as input.
> > + *
> > + * Set an IOVA mapping from a user pointer. If FIXED_IOVA is specified then the
> > + * mapping will be established at iova, otherwise a suitable location based on
> > + * the reserved and allowed lists will be automatically selected and returned in
> > + * iova.
> You do not mention anything about the fact the IOCTL cannot be called
> twice for a given @user_va w/ FIXED_IOVA
> Refering to VFIO_DMA_MAP_FLAG_VADDR.

 * If IOMMU_IOAS_MAP_FIXED_IOVA is specified then the iova range must currently
 * be unused, existing IOVA cannot be replaced.

> > +/**
> > + * struct iommu_ioas_copy - ioctl(IOMMU_IOAS_COPY)
> > + * @size: sizeof(struct iommu_ioas_copy)
> > + * @flags: Combination of enum iommufd_ioas_map_flags
> > + * @dst_ioas_id: IOAS ID to change the mapping of
> > + * @src_ioas_id: IOAS ID to copy from
> > + * @length: Number of bytes to copy and map
> > + * @dst_iova: IOVA the mapping was placed at. If IOMMU_IOAS_MAP_FIXED_IOVA is
> > + *            set then this must be provided as input.
> > + * @src_iova: IOVA to start the copy
> > + *
> > + * Copy an already existing mapping from src_ioas_id and establish it in
> > + * dst_ioas_id. The src iova/length must exactly match a range used with
> > + * IOMMU_IOAS_MAP.
> > + *
> > + * This may be used to efficiently clone a subset of an IOAS to another, or as a
> > + * kind of 'cache' to speed up mapping. Copy has an effciency advantage over
> efficiency
> > + * establishing equivalent new mappings, as internal resources are shared, and
> > + * the kernel will pin the user memory only once.
> > + */
> > +struct iommu_ioas_copy {
> > +	__u32 size;
> > +	__u32 flags;
> > +	__u32 dst_ioas_id;
> > +	__u32 src_ioas_id;
> is src_ioas_id == dst_ioas_id allowed?

Yes

> > +/**
> > + * struct iommu_option - iommu option multiplexer
> > + * @size: sizeof(struct iommu_option)
> > + * @option_id: One of enum iommufd_option
> > + * @op: One of enum iommufd_option_ops
> > + * @__reserved: Must be 0
> > + * @object_id: ID of the object if required
> > + * @val64: Option value to set or value returned on get
> > + *
> > + * Change a simple option value. This multiplexor allows controlling a options
> s/a options/options

Done

Thanks,
Jason
Eric Auger Nov. 28, 2022, 8:09 p.m. UTC | #5
On 11/28/22 19:27, Jason Gunthorpe wrote:
> On Sun, Nov 27, 2022 at 06:49:29PM +0100, Eric Auger wrote:
>
>>> +static int iommufd_ioas_load_iovas(struct rb_root_cached *itree,
>>> +				   struct iommu_iova_range __user *ranges,
>>> +				   u32 num)
>>> +{
>>> +	u32 i;
>>> +
>>> +	for (i = 0; i != num; i++) {
>> shouldn't it be < ?
> It is logically equivalent
damn. That sometimes happens to me when staring at so much code ;-)
>
>>> +int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd)
>>> +{
>>> +	struct iommu_ioas_allow_iovas *cmd = ucmd->cmd;
>>> +	struct rb_root_cached allowed_iova = RB_ROOT_CACHED;
>>> +	struct interval_tree_node *node;
>>> +	struct iommufd_ioas *ioas;
>>> +	struct io_pagetable *iopt;
>>> +	int rc = 0;
>>> +
>>> +	if (cmd->__reserved)
>>> +		return -EOPNOTSUPP;
>>> +
>>> +	ioas = iommufd_get_ioas(ucmd, cmd->ioas_id);
>>> +	if (IS_ERR(ioas))
>>> +		return PTR_ERR(ioas);
>>> +	iopt = &ioas->iopt;
>>> +
>>> +	rc = iommufd_ioas_load_iovas(&allowed_iova,
>>> +				     u64_to_user_ptr(cmd->allowed_iovas),
>>> +				     cmd->num_iovas);
>>> +	if (rc)
>>> +		goto out_free;
>>> +
>>> +	rc = iopt_set_allow_iova(iopt, &allowed_iova);
>> Please can you add a comment about why you need to proceed in 2 steps,
>> ie. add the ranges in a first tree and then 'swap' to the
>> iopt->allowed_tree (and eventually delete the first tree)?
> Sure
>
> 	/*
> 	 * We want the allowed tree update to be atomic, so we have to keep the
> 	 * original nodes around, and keep track of the new nodes as we allocate
> 	 * memory for them. The simplest solution is to have a new/old tree and
> 	 * then swap new for old. On success we free the old tree, on failure we
> 	 * free the new tree.
> 	 */
>
>>> +static int conv_iommu_prot(u32 map_flags)
>>> +{
>>> +	int iommu_prot;
>>> +
>>> +	/*
>>> +	 * We provide no manual cache coherency ioctls to userspace and most
>>> +	 * architectures make the CPU ops for cache flushing privileged.
>>> +	 * Therefore we require the underlying IOMMU to support CPU coherent
>>> +	 * operation. Support for IOMMU_CACHE is enforced by the
>>> +	 * IOMMU_CAP_CACHE_COHERENCY test during bind.
>>> +	 */
>>> +	iommu_prot = IOMMU_CACHE;
>> at init?
> done
>
>>> +int iommufd_ioas_map(struct iommufd_ucmd *ucmd)
>>> +{
>>> +	struct iommu_ioas_map *cmd = ucmd->cmd;
>>> +	struct iommufd_ioas *ioas;
>>> +	unsigned int flags = 0;
>>> +	unsigned long iova;
>>> +	int rc;
>>> +
>>> +	if ((cmd->flags &
>>> +	     ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE |
>>> +	       IOMMU_IOAS_MAP_READABLE)) ||
>>> +	    cmd->__reserved)
>>> +		return -EOPNOTSUPP;
>>> +	if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX)
>>> +		return -EOVERFLOW;
>>> +
>>> +	ioas = iommufd_get_ioas(ucmd, cmd->ioas_id);
>>> +	if (IS_ERR(ioas))
>>> +		return PTR_ERR(ioas);
>>> +
>>> +	if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA))
>>> +		flags = IOPT_ALLOC_IOVA;
>>> +	iova = cmd->iova;
>> can be done either at initialization or only if MAP_FIXED_IOVA.
> Done
>
>
>>> +int iommufd_option_rlimit_mode(struct iommu_option *cmd,
>>> +			       struct iommufd_ctx *ictx)
>>> +{
>> *object_id  and __reserved should be checked as per the uapi doc*
> Ohh, yes, thanks:
>
> @@ -317,6 +322,9 @@ int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd)
>  int iommufd_option_rlimit_mode(struct iommu_option *cmd,
>                                struct iommufd_ctx *ictx)
>  {
> +       if (cmd->object_id)
> +               return -EOPNOTSUPP;
> +
>         if (cmd->op == IOMMU_OPTION_OP_GET) {
>                 cmd->val64 = ictx->account_mode == IOPT_PAGES_ACCOUNT_MM;
>                 return 0;
> diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
> index de5cc01023c0c5..bcb463e581009c 100644
> --- a/drivers/iommu/iommufd/main.c
> +++ b/drivers/iommu/iommufd/main.c
> @@ -215,6 +215,9 @@ static int iommufd_option(struct iommufd_ucmd *ucmd)
>         struct iommu_option *cmd = ucmd->cmd;
>         int rc;
>  
> +       if (cmd->__reserved)
> +               return -EOPNOTSUPP;
> +
>         switch (cmd->option_id) {
>         case IOMMU_OPTION_RLIMIT_MODE:
>                 rc = iommufd_option_rlimit_mode(cmd, ucmd->ictx);
>
>>> +/**
>>> + * struct iommu_ioas_iova_ranges - ioctl(IOMMU_IOAS_IOVA_RANGES)
>>> + * @size: sizeof(struct iommu_ioas_iova_ranges)
>>> + * @ioas_id: IOAS ID to read ranges from
>>> + * @num_iovas: Input/Output total number of ranges in the IOAS
>>> + * @__reserved: Must be 0
>>> + * @allowed_iovas: Pointer to the output array of struct iommu_iova_range
>>> + * @out_iova_alignment: Minimum alignment required for mapping IOVA
>>> + *
>>> + * Query an IOAS for ranges of allowed IOVAs. Mapping IOVA outside these ranges
>>> + * is not allowed. num_iovas will be set to the total number of iovas and
>>> + * the allowed_iovas[] will be filled in as space permits.
>>> + *
>>> + * The allowed ranges are dependent on the HW path the DMA operation takes, and
>>> + * can change during the lifetime of the IOAS. A fresh empty IOAS will have a
>>> + * full range, and each attached device will narrow the ranges based on that
>>> + * device's HW restrictions. Detatching a device can widen the ranges. Userspace
>> detaching
>>> + * should query ranges after every attach/detatch to know what IOVAs are valid
>> detach
> Done
>
>>> + * for mapping.
>>> + *
>>> + * On input num_iovas is the length of the allowed_iovas array. On output it is
>>> + * the total number of iovas filled in. The ioctl will return -EMSGSIZE and set
>>> + * num_iovas to the required value if num_iovas is too small. In this case the
>>> + * caller should allocate a larger output array and re-issue the ioctl.
>>> + */
>>> +struct iommu_ioas_iova_ranges {
>>> +	__u32 size;
>>> +	__u32 ioas_id;
>>> +	__u32 num_iovas;
>>> +	__u32 __reserved;
>>> +	__aligned_u64 allowed_iovas;
>>> +	__aligned_u64 out_iova_alignment;
>> document @out_iova_alignment?
>  * out_iova_alignment returns the minimum IOVA alignment that can be given
>  * to IOMMU_IOAS_MAP/COPY. IOVA's must satisfy:
>  *   starting_iova % out_iova_alignment == 0
>  *   (starting_iova + length) % out_iova_alignment == 0
>  * out_iova_alignment can be 1 indicating any IOVA is allowed. It cannot
>  * be higher than the system PAGE_SIZE.
>
>>> +/**
>>> + * struct iommu_ioas_map - ioctl(IOMMU_IOAS_MAP)
>>> + * @size: sizeof(struct iommu_ioas_map)
>>> + * @flags: Combination of enum iommufd_ioas_map_flags
>>> + * @ioas_id: IOAS ID to change the mapping of
>>> + * @__reserved: Must be 0
>>> + * @user_va: Userspace pointer to start mapping from
>>> + * @length: Number of bytes to map
>>> + * @iova: IOVA the mapping was placed at. If IOMMU_IOAS_MAP_FIXED_IOVA is set
>>> + *        then this must be provided as input.
>>> + *
>>> + * Set an IOVA mapping from a user pointer. If FIXED_IOVA is specified then the
>>> + * mapping will be established at iova, otherwise a suitable location based on
>>> + * the reserved and allowed lists will be automatically selected and returned in
>>> + * iova.
>> You do not mention anything about the fact the IOCTL cannot be called
>> twice for a given @user_va w/ FIXED_IOVA
>> Refering to VFIO_DMA_MAP_FLAG_VADDR.
>  * If IOMMU_IOAS_MAP_FIXED_IOVA is specified then the iova range must currently
>  * be unused, existing IOVA cannot be replaced.
>
>>> +/**
>>> + * struct iommu_ioas_copy - ioctl(IOMMU_IOAS_COPY)
>>> + * @size: sizeof(struct iommu_ioas_copy)
>>> + * @flags: Combination of enum iommufd_ioas_map_flags
>>> + * @dst_ioas_id: IOAS ID to change the mapping of
>>> + * @src_ioas_id: IOAS ID to copy from
>>> + * @length: Number of bytes to copy and map
>>> + * @dst_iova: IOVA the mapping was placed at. If IOMMU_IOAS_MAP_FIXED_IOVA is
>>> + *            set then this must be provided as input.
>>> + * @src_iova: IOVA to start the copy
>>> + *
>>> + * Copy an already existing mapping from src_ioas_id and establish it in
>>> + * dst_ioas_id. The src iova/length must exactly match a range used with
>>> + * IOMMU_IOAS_MAP.
>>> + *
>>> + * This may be used to efficiently clone a subset of an IOAS to another, or as a
>>> + * kind of 'cache' to speed up mapping. Copy has an effciency advantage over
>> efficiency
>>> + * establishing equivalent new mappings, as internal resources are shared, and
>>> + * the kernel will pin the user memory only once.
>>> + */
>>> +struct iommu_ioas_copy {
>>> +	__u32 size;
>>> +	__u32 flags;
>>> +	__u32 dst_ioas_id;
>>> +	__u32 src_ioas_id;
>> is src_ioas_id == dst_ioas_id allowed?
> Yes
>
>>> +/**
>>> + * struct iommu_option - iommu option multiplexer
>>> + * @size: sizeof(struct iommu_option)
>>> + * @option_id: One of enum iommufd_option
>>> + * @op: One of enum iommufd_option_ops
>>> + * @__reserved: Must be 0
>>> + * @object_id: ID of the object if required
>>> + * @val64: Option value to set or value returned on get
>>> + *
>>> + * Change a simple option value. This multiplexor allows controlling a options
>> s/a options/options
> Done
>
> Thanks,
> Jason
>
Eric
diff mbox series

Patch

diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile
index b66a8c47ff55ec..2b4f36f1b72f9d 100644
--- a/drivers/iommu/iommufd/Makefile
+++ b/drivers/iommu/iommufd/Makefile
@@ -1,6 +1,7 @@ 
 # SPDX-License-Identifier: GPL-2.0-only
 iommufd-y := \
 	io_pagetable.o \
+	ioas.o \
 	main.o \
 	pages.o
 
diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c
new file mode 100644
index 00000000000000..7671456e86413a
--- /dev/null
+++ b/drivers/iommu/iommufd/ioas.c
@@ -0,0 +1,384 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
+ */
+#include <linux/interval_tree.h>
+#include <linux/iommufd.h>
+#include <linux/iommu.h>
+#include <uapi/linux/iommufd.h>
+
+#include "io_pagetable.h"
+
+void iommufd_ioas_destroy(struct iommufd_object *obj)
+{
+	struct iommufd_ioas *ioas = container_of(obj, struct iommufd_ioas, obj);
+	int rc;
+
+	rc = iopt_unmap_all(&ioas->iopt, NULL);
+	WARN_ON(rc && rc != -ENOENT);
+	iopt_destroy_table(&ioas->iopt);
+}
+
+struct iommufd_ioas *iommufd_ioas_alloc(struct iommufd_ctx *ictx)
+{
+	struct iommufd_ioas *ioas;
+
+	ioas = iommufd_object_alloc(ictx, ioas, IOMMUFD_OBJ_IOAS);
+	if (IS_ERR(ioas))
+		return ioas;
+
+	iopt_init_table(&ioas->iopt);
+	return ioas;
+}
+
+int iommufd_ioas_alloc_ioctl(struct iommufd_ucmd *ucmd)
+{
+	struct iommu_ioas_alloc *cmd = ucmd->cmd;
+	struct iommufd_ioas *ioas;
+	int rc;
+
+	if (cmd->flags)
+		return -EOPNOTSUPP;
+
+	ioas = iommufd_ioas_alloc(ucmd->ictx);
+	if (IS_ERR(ioas))
+		return PTR_ERR(ioas);
+
+	cmd->out_ioas_id = ioas->obj.id;
+	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+	if (rc)
+		goto out_table;
+	iommufd_object_finalize(ucmd->ictx, &ioas->obj);
+	return 0;
+
+out_table:
+	iommufd_object_abort_and_destroy(ucmd->ictx, &ioas->obj);
+	return rc;
+}
+
+int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd)
+{
+	struct iommu_iova_range __user *ranges;
+	struct iommu_ioas_iova_ranges *cmd = ucmd->cmd;
+	struct iommufd_ioas *ioas;
+	struct interval_tree_span_iter span;
+	u32 max_iovas;
+	int rc;
+
+	if (cmd->__reserved)
+		return -EOPNOTSUPP;
+
+	ioas = iommufd_get_ioas(ucmd, cmd->ioas_id);
+	if (IS_ERR(ioas))
+		return PTR_ERR(ioas);
+
+	down_read(&ioas->iopt.iova_rwsem);
+	max_iovas = cmd->num_iovas;
+	ranges = u64_to_user_ptr(cmd->allowed_iovas);
+	cmd->num_iovas = 0;
+	cmd->out_iova_alignment = ioas->iopt.iova_alignment;
+	interval_tree_for_each_span(&span, &ioas->iopt.reserved_itree, 0,
+				    ULONG_MAX) {
+		if (!span.is_hole)
+			continue;
+		if (cmd->num_iovas < max_iovas) {
+			struct iommu_iova_range elm = {
+				.start = span.start_hole,
+				.last = span.last_hole,
+			};
+
+			if (copy_to_user(&ranges[cmd->num_iovas], &elm,
+					 sizeof(elm))) {
+				rc = -EFAULT;
+				goto out_put;
+			}
+		}
+		cmd->num_iovas++;
+	}
+	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+	if (rc)
+		goto out_put;
+	if (cmd->num_iovas > max_iovas)
+		rc = -EMSGSIZE;
+out_put:
+	up_read(&ioas->iopt.iova_rwsem);
+	iommufd_put_object(&ioas->obj);
+	return rc;
+}
+
+static int iommufd_ioas_load_iovas(struct rb_root_cached *itree,
+				   struct iommu_iova_range __user *ranges,
+				   u32 num)
+{
+	u32 i;
+
+	for (i = 0; i != num; i++) {
+		struct iommu_iova_range range;
+		struct iopt_allowed *allowed;
+
+		if (copy_from_user(&range, ranges + i, sizeof(range)))
+			return -EFAULT;
+
+		if (range.start >= range.last)
+			return -EINVAL;
+
+		if (interval_tree_iter_first(itree, range.start, range.last))
+			return -EINVAL;
+
+		allowed = kzalloc(sizeof(*allowed), GFP_KERNEL_ACCOUNT);
+		if (!allowed)
+			return -ENOMEM;
+		allowed->node.start = range.start;
+		allowed->node.last = range.last;
+
+		interval_tree_insert(&allowed->node, itree);
+	}
+	return 0;
+}
+
+int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd)
+{
+	struct iommu_ioas_allow_iovas *cmd = ucmd->cmd;
+	struct rb_root_cached allowed_iova = RB_ROOT_CACHED;
+	struct interval_tree_node *node;
+	struct iommufd_ioas *ioas;
+	struct io_pagetable *iopt;
+	int rc = 0;
+
+	if (cmd->__reserved)
+		return -EOPNOTSUPP;
+
+	ioas = iommufd_get_ioas(ucmd, cmd->ioas_id);
+	if (IS_ERR(ioas))
+		return PTR_ERR(ioas);
+	iopt = &ioas->iopt;
+
+	rc = iommufd_ioas_load_iovas(&allowed_iova,
+				     u64_to_user_ptr(cmd->allowed_iovas),
+				     cmd->num_iovas);
+	if (rc)
+		goto out_free;
+
+	rc = iopt_set_allow_iova(iopt, &allowed_iova);
+out_free:
+	while ((node = interval_tree_iter_first(&allowed_iova, 0, ULONG_MAX))) {
+		interval_tree_remove(node, &allowed_iova);
+		kfree(container_of(node, struct iopt_allowed, node));
+	}
+	iommufd_put_object(&ioas->obj);
+	return rc;
+}
+
+static int conv_iommu_prot(u32 map_flags)
+{
+	int iommu_prot;
+
+	/*
+	 * We provide no manual cache coherency ioctls to userspace and most
+	 * architectures make the CPU ops for cache flushing privileged.
+	 * Therefore we require the underlying IOMMU to support CPU coherent
+	 * operation. Support for IOMMU_CACHE is enforced by the
+	 * IOMMU_CAP_CACHE_COHERENCY test during bind.
+	 */
+	iommu_prot = IOMMU_CACHE;
+	if (map_flags & IOMMU_IOAS_MAP_WRITEABLE)
+		iommu_prot |= IOMMU_WRITE;
+	if (map_flags & IOMMU_IOAS_MAP_READABLE)
+		iommu_prot |= IOMMU_READ;
+	return iommu_prot;
+}
+
+int iommufd_ioas_map(struct iommufd_ucmd *ucmd)
+{
+	struct iommu_ioas_map *cmd = ucmd->cmd;
+	struct iommufd_ioas *ioas;
+	unsigned int flags = 0;
+	unsigned long iova;
+	int rc;
+
+	if ((cmd->flags &
+	     ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE |
+	       IOMMU_IOAS_MAP_READABLE)) ||
+	    cmd->__reserved)
+		return -EOPNOTSUPP;
+	if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX)
+		return -EOVERFLOW;
+
+	ioas = iommufd_get_ioas(ucmd, cmd->ioas_id);
+	if (IS_ERR(ioas))
+		return PTR_ERR(ioas);
+
+	if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA))
+		flags = IOPT_ALLOC_IOVA;
+	iova = cmd->iova;
+	rc = iopt_map_user_pages(ucmd->ictx, &ioas->iopt, &iova,
+				 u64_to_user_ptr(cmd->user_va), cmd->length,
+				 conv_iommu_prot(cmd->flags), flags);
+	if (rc)
+		goto out_put;
+
+	cmd->iova = iova;
+	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+out_put:
+	iommufd_put_object(&ioas->obj);
+	return rc;
+}
+
+int iommufd_ioas_copy(struct iommufd_ucmd *ucmd)
+{
+	struct iommu_ioas_copy *cmd = ucmd->cmd;
+	struct iommufd_ioas *src_ioas;
+	struct iommufd_ioas *dst_ioas;
+	unsigned int flags = 0;
+	LIST_HEAD(pages_list);
+	unsigned long iova;
+	int rc;
+
+	if ((cmd->flags &
+	     ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE |
+	       IOMMU_IOAS_MAP_READABLE)))
+		return -EOPNOTSUPP;
+	if (cmd->length >= ULONG_MAX || cmd->src_iova >= ULONG_MAX ||
+	    cmd->dst_iova >= ULONG_MAX)
+		return -EOVERFLOW;
+
+	src_ioas = iommufd_get_ioas(ucmd, cmd->src_ioas_id);
+	if (IS_ERR(src_ioas))
+		return PTR_ERR(src_ioas);
+	rc = iopt_get_pages(&src_ioas->iopt, cmd->src_iova, cmd->length,
+			    &pages_list);
+	iommufd_put_object(&src_ioas->obj);
+	if (rc)
+		return rc;
+
+	dst_ioas = iommufd_get_ioas(ucmd, cmd->dst_ioas_id);
+	if (IS_ERR(dst_ioas)) {
+		rc = PTR_ERR(dst_ioas);
+		goto out_pages;
+	}
+
+	if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA))
+		flags = IOPT_ALLOC_IOVA;
+	iova = cmd->dst_iova;
+	rc = iopt_map_pages(&dst_ioas->iopt, &pages_list, cmd->length, &iova,
+			    conv_iommu_prot(cmd->flags), flags);
+	if (rc)
+		goto out_put_dst;
+
+	cmd->dst_iova = iova;
+	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+out_put_dst:
+	iommufd_put_object(&dst_ioas->obj);
+out_pages:
+	iopt_free_pages_list(&pages_list);
+	return rc;
+}
+
+int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd)
+{
+	struct iommu_ioas_unmap *cmd = ucmd->cmd;
+	struct iommufd_ioas *ioas;
+	unsigned long unmapped = 0;
+	int rc;
+
+	ioas = iommufd_get_ioas(ucmd, cmd->ioas_id);
+	if (IS_ERR(ioas))
+		return PTR_ERR(ioas);
+
+	if (cmd->iova == 0 && cmd->length == U64_MAX) {
+		rc = iopt_unmap_all(&ioas->iopt, &unmapped);
+		if (rc)
+			goto out_put;
+	} else {
+		if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX) {
+			rc = -EOVERFLOW;
+			goto out_put;
+		}
+		rc = iopt_unmap_iova(&ioas->iopt, cmd->iova, cmd->length,
+				     &unmapped);
+		if (rc)
+			goto out_put;
+	}
+
+	cmd->length = unmapped;
+	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+
+out_put:
+	iommufd_put_object(&ioas->obj);
+	return rc;
+}
+
+int iommufd_option_rlimit_mode(struct iommu_option *cmd,
+			       struct iommufd_ctx *ictx)
+{
+	if (cmd->op == IOMMU_OPTION_OP_GET) {
+		cmd->val64 = ictx->account_mode == IOPT_PAGES_ACCOUNT_MM;
+		return 0;
+	}
+	if (cmd->op == IOMMU_OPTION_OP_SET) {
+		int rc = 0;
+
+		if (!capable(CAP_SYS_RESOURCE))
+			return -EPERM;
+
+		xa_lock(&ictx->objects);
+		if (!xa_empty(&ictx->objects)) {
+			rc = -EBUSY;
+		} else {
+			if (cmd->val64 == 0)
+				ictx->account_mode = IOPT_PAGES_ACCOUNT_USER;
+			else if (cmd->val64 == 1)
+				ictx->account_mode = IOPT_PAGES_ACCOUNT_MM;
+			else
+				rc = -EINVAL;
+		}
+		xa_unlock(&ictx->objects);
+
+		return rc;
+	}
+	return -EOPNOTSUPP;
+}
+
+static int iommufd_ioas_option_huge_pages(struct iommu_option *cmd,
+					  struct iommufd_ioas *ioas)
+{
+	if (cmd->op == IOMMU_OPTION_OP_GET) {
+		cmd->val64 = !ioas->iopt.disable_large_pages;
+		return 0;
+	}
+	if (cmd->op == IOMMU_OPTION_OP_SET) {
+		if (cmd->val64 == 0)
+			return iopt_disable_large_pages(&ioas->iopt);
+		if (cmd->val64 == 1) {
+			iopt_enable_large_pages(&ioas->iopt);
+			return 0;
+		}
+		return -EINVAL;
+	}
+	return -EOPNOTSUPP;
+}
+
+int iommufd_ioas_option(struct iommufd_ucmd *ucmd)
+{
+	struct iommu_option *cmd = ucmd->cmd;
+	struct iommufd_ioas *ioas;
+	int rc = 0;
+
+	if (cmd->__reserved)
+		return -EOPNOTSUPP;
+
+	ioas = iommufd_get_ioas(ucmd, cmd->object_id);
+	if (IS_ERR(ioas))
+		return PTR_ERR(ioas);
+
+	switch (cmd->option_id) {
+	case IOMMU_OPTION_HUGE_PAGES:
+		rc = iommufd_ioas_option_huge_pages(cmd, ioas);
+		break;
+	default:
+		rc = -EOPNOTSUPP;
+	}
+
+	iommufd_put_object(&ioas->obj);
+	return rc;
+}
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index dadd90cae543ba..6721332dbbba03 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -11,6 +11,7 @@ 
 
 struct iommu_domain;
 struct iommu_group;
+struct iommu_option;
 
 struct iommufd_ctx {
 	struct file *file;
@@ -102,6 +103,7 @@  static inline int iommufd_ucmd_respond(struct iommufd_ucmd *ucmd,
 enum iommufd_object_type {
 	IOMMUFD_OBJ_NONE,
 	IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE,
+	IOMMUFD_OBJ_IOAS,
 };
 
 /* Base struct for all objects with a userspace ID handle. */
@@ -174,6 +176,37 @@  struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
 			     type),                                            \
 		     typeof(*(ptr)), obj)
 
+/*
+ * The IO Address Space (IOAS) pagetable is a virtual page table backed by the
+ * io_pagetable object. It is a user controlled mapping of IOVA -> PFNs. The
+ * mapping is copied into all of the associated domains and made available to
+ * in-kernel users.
+ */
+struct iommufd_ioas {
+	struct iommufd_object obj;
+	struct io_pagetable iopt;
+};
+
+static inline struct iommufd_ioas *iommufd_get_ioas(struct iommufd_ucmd *ucmd,
+						    u32 id)
+{
+	return container_of(iommufd_get_object(ucmd->ictx, id,
+					       IOMMUFD_OBJ_IOAS),
+			    struct iommufd_ioas, obj);
+}
+
+struct iommufd_ioas *iommufd_ioas_alloc(struct iommufd_ctx *ictx);
+int iommufd_ioas_alloc_ioctl(struct iommufd_ucmd *ucmd);
+void iommufd_ioas_destroy(struct iommufd_object *obj);
+int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd);
+int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd);
+int iommufd_ioas_map(struct iommufd_ucmd *ucmd);
+int iommufd_ioas_copy(struct iommufd_ucmd *ucmd);
+int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd);
+int iommufd_ioas_option(struct iommufd_ucmd *ucmd);
+int iommufd_option_rlimit_mode(struct iommu_option *cmd,
+			       struct iommufd_ctx *ictx);
+
 struct iommufd_access {
 	unsigned long iova_alignment;
 	u32 iopt_access_list_id;
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 3a705cadb85020..266109045537ed 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -202,8 +202,36 @@  static int iommufd_fops_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+static int iommufd_option(struct iommufd_ucmd *ucmd)
+{
+	struct iommu_option *cmd = ucmd->cmd;
+	int rc;
+
+	switch (cmd->option_id) {
+	case IOMMU_OPTION_RLIMIT_MODE:
+		rc = iommufd_option_rlimit_mode(cmd, ucmd->ictx);
+		break;
+	case IOMMU_OPTION_HUGE_PAGES:
+		rc = iommufd_ioas_option(ucmd);
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+	if (rc)
+		return rc;
+	if (copy_to_user(&((struct iommu_option __user *)ucmd->ubuffer)->val64,
+			 &cmd->val64, sizeof(cmd->val64)))
+		return -EFAULT;
+	return 0;
+}
+
 union ucmd_buffer {
 	struct iommu_destroy destroy;
+	struct iommu_ioas_alloc alloc;
+	struct iommu_ioas_allow_iovas allow_iovas;
+	struct iommu_ioas_iova_ranges iova_ranges;
+	struct iommu_ioas_map map;
+	struct iommu_ioas_unmap unmap;
 };
 
 struct iommufd_ioctl_op {
@@ -224,6 +252,20 @@  struct iommufd_ioctl_op {
 	}
 static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
 	IOCTL_OP(IOMMU_DESTROY, iommufd_destroy, struct iommu_destroy, id),
+	IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl,
+		 struct iommu_ioas_alloc, out_ioas_id),
+	IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas,
+		 struct iommu_ioas_allow_iovas, allowed_iovas),
+	IOCTL_OP(IOMMU_IOAS_COPY, iommufd_ioas_copy, struct iommu_ioas_copy,
+		 src_iova),
+	IOCTL_OP(IOMMU_IOAS_IOVA_RANGES, iommufd_ioas_iova_ranges,
+		 struct iommu_ioas_iova_ranges, out_iova_alignment),
+	IOCTL_OP(IOMMU_IOAS_MAP, iommufd_ioas_map, struct iommu_ioas_map,
+		 iova),
+	IOCTL_OP(IOMMU_IOAS_UNMAP, iommufd_ioas_unmap, struct iommu_ioas_unmap,
+		 length),
+	IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option,
+		 val64),
 };
 
 static long iommufd_fops_ioctl(struct file *filp, unsigned int cmd,
@@ -310,6 +352,9 @@  void iommufd_ctx_put(struct iommufd_ctx *ictx)
 EXPORT_SYMBOL_NS_GPL(iommufd_ctx_put, IOMMUFD);
 
 static const struct iommufd_object_ops iommufd_object_ops[] = {
+	[IOMMUFD_OBJ_IOAS] = {
+		.destroy = iommufd_ioas_destroy,
+	},
 };
 
 static struct miscdevice iommu_misc_dev = {
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 2ad06b27a35fe5..9e9250dfc4fb1b 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -37,12 +37,19 @@ 
 enum {
 	IOMMUFD_CMD_BASE = 0x80,
 	IOMMUFD_CMD_DESTROY = IOMMUFD_CMD_BASE,
+	IOMMUFD_CMD_IOAS_ALLOC,
+	IOMMUFD_CMD_IOAS_ALLOW_IOVAS,
+	IOMMUFD_CMD_IOAS_COPY,
+	IOMMUFD_CMD_IOAS_IOVA_RANGES,
+	IOMMUFD_CMD_IOAS_MAP,
+	IOMMUFD_CMD_IOAS_UNMAP,
+	IOMMUFD_CMD_OPTION,
 };
 
 /**
  * struct iommu_destroy - ioctl(IOMMU_DESTROY)
  * @size: sizeof(struct iommu_destroy)
- * @id: iommufd object ID to destroy. Can by any destroyable object type.
+ * @id: iommufd object ID to destroy. Can be any destroyable object type.
  *
  * Destroy any object held within iommufd.
  */
@@ -52,4 +59,241 @@  struct iommu_destroy {
 };
 #define IOMMU_DESTROY _IO(IOMMUFD_TYPE, IOMMUFD_CMD_DESTROY)
 
+/**
+ * struct iommu_ioas_alloc - ioctl(IOMMU_IOAS_ALLOC)
+ * @size: sizeof(struct iommu_ioas_alloc)
+ * @flags: Must be 0
+ * @out_ioas_id: Output IOAS ID for the allocated object
+ *
+ * Allocate an IO Address Space (IOAS) which holds an IO Virtual Address (IOVA)
+ * to memory mapping.
+ */
+struct iommu_ioas_alloc {
+	__u32 size;
+	__u32 flags;
+	__u32 out_ioas_id;
+};
+#define IOMMU_IOAS_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_ALLOC)
+
+/**
+ * struct iommu_iova_range - ioctl(IOMMU_IOVA_RANGE)
+ * @start: First IOVA
+ * @last: Inclusive last IOVA
+ *
+ * An interval in IOVA space.
+ */
+struct iommu_iova_range {
+	__aligned_u64 start;
+	__aligned_u64 last;
+};
+
+/**
+ * struct iommu_ioas_iova_ranges - ioctl(IOMMU_IOAS_IOVA_RANGES)
+ * @size: sizeof(struct iommu_ioas_iova_ranges)
+ * @ioas_id: IOAS ID to read ranges from
+ * @num_iovas: Input/Output total number of ranges in the IOAS
+ * @__reserved: Must be 0
+ * @allowed_iovas: Pointer to the output array of struct iommu_iova_range
+ * @out_iova_alignment: Minimum alignment required for mapping IOVA
+ *
+ * Query an IOAS for ranges of allowed IOVAs. Mapping IOVA outside these ranges
+ * is not allowed. num_iovas will be set to the total number of iovas and
+ * the allowed_iovas[] will be filled in as space permits.
+ *
+ * The allowed ranges are dependent on the HW path the DMA operation takes, and
+ * can change during the lifetime of the IOAS. A fresh empty IOAS will have a
+ * full range, and each attached device will narrow the ranges based on that
+ * device's HW restrictions. Detatching a device can widen the ranges. Userspace
+ * should query ranges after every attach/detatch to know what IOVAs are valid
+ * for mapping.
+ *
+ * On input num_iovas is the length of the allowed_iovas array. On output it is
+ * the total number of iovas filled in. The ioctl will return -EMSGSIZE and set
+ * num_iovas to the required value if num_iovas is too small. In this case the
+ * caller should allocate a larger output array and re-issue the ioctl.
+ */
+struct iommu_ioas_iova_ranges {
+	__u32 size;
+	__u32 ioas_id;
+	__u32 num_iovas;
+	__u32 __reserved;
+	__aligned_u64 allowed_iovas;
+	__aligned_u64 out_iova_alignment;
+};
+#define IOMMU_IOAS_IOVA_RANGES _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_IOVA_RANGES)
+
+/**
+ * struct iommu_ioas_allow_iovas - ioctl(IOMMU_IOAS_ALLOW_IOVAS)
+ * @size: sizeof(struct iommu_ioas_allow_iovas)
+ * @ioas_id: IOAS ID to allow IOVAs from
+ * @num_iovas: Input/Output total number of ranges in the IOAS
+ * @__reserved: Must be 0
+ * @allowed_iovas: Pointer to array of struct iommu_iova_range
+ *
+ * Ensure a range of IOVAs are always available for allocation. If this call
+ * succeeds then IOMMU_IOAS_IOVA_RANGES will never return a list of IOVA ranges
+ * that are narrower than the ranges provided here. This call will fail if
+ * IOMMU_IOAS_IOVA_RANGES is currently narrower than the given ranges.
+ *
+ * When an IOAS is first created the IOVA_RANGES will be maximally sized, and as
+ * devices are attached the IOVA will narrow based on the device restrictions.
+ * When an allowed range is specified any narrowing will be refused, ie device
+ * attachment can fail if the device requires limiting within the allowed range.
+ *
+ * Automatic IOVA allocation is also impacted by this call. MAP will only
+ * allocate within the allowed IOVAs if they are present.
+ *
+ * This call replaces the entire allowed list with the given list.
+ */
+struct iommu_ioas_allow_iovas {
+	__u32 size;
+	__u32 ioas_id;
+	__u32 num_iovas;
+	__u32 __reserved;
+	__aligned_u64 allowed_iovas;
+};
+#define IOMMU_IOAS_ALLOW_IOVAS _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_ALLOW_IOVAS)
+
+/**
+ * enum iommufd_ioas_map_flags - Flags for map and copy
+ * @IOMMU_IOAS_MAP_FIXED_IOVA: If clear the kernel will compute an appropriate
+ *                             IOVA to place the mapping at
+ * @IOMMU_IOAS_MAP_WRITEABLE: DMA is allowed to write to this mapping
+ * @IOMMU_IOAS_MAP_READABLE: DMA is allowed to read from this mapping
+ */
+enum iommufd_ioas_map_flags {
+	IOMMU_IOAS_MAP_FIXED_IOVA = 1 << 0,
+	IOMMU_IOAS_MAP_WRITEABLE = 1 << 1,
+	IOMMU_IOAS_MAP_READABLE = 1 << 2,
+};
+
+/**
+ * struct iommu_ioas_map - ioctl(IOMMU_IOAS_MAP)
+ * @size: sizeof(struct iommu_ioas_map)
+ * @flags: Combination of enum iommufd_ioas_map_flags
+ * @ioas_id: IOAS ID to change the mapping of
+ * @__reserved: Must be 0
+ * @user_va: Userspace pointer to start mapping from
+ * @length: Number of bytes to map
+ * @iova: IOVA the mapping was placed at. If IOMMU_IOAS_MAP_FIXED_IOVA is set
+ *        then this must be provided as input.
+ *
+ * Set an IOVA mapping from a user pointer. If FIXED_IOVA is specified then the
+ * mapping will be established at iova, otherwise a suitable location based on
+ * the reserved and allowed lists will be automatically selected and returned in
+ * iova.
+ */
+struct iommu_ioas_map {
+	__u32 size;
+	__u32 flags;
+	__u32 ioas_id;
+	__u32 __reserved;
+	__aligned_u64 user_va;
+	__aligned_u64 length;
+	__aligned_u64 iova;
+};
+#define IOMMU_IOAS_MAP _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_MAP)
+
+/**
+ * struct iommu_ioas_copy - ioctl(IOMMU_IOAS_COPY)
+ * @size: sizeof(struct iommu_ioas_copy)
+ * @flags: Combination of enum iommufd_ioas_map_flags
+ * @dst_ioas_id: IOAS ID to change the mapping of
+ * @src_ioas_id: IOAS ID to copy from
+ * @length: Number of bytes to copy and map
+ * @dst_iova: IOVA the mapping was placed at. If IOMMU_IOAS_MAP_FIXED_IOVA is
+ *            set then this must be provided as input.
+ * @src_iova: IOVA to start the copy
+ *
+ * Copy an already existing mapping from src_ioas_id and establish it in
+ * dst_ioas_id. The src iova/length must exactly match a range used with
+ * IOMMU_IOAS_MAP.
+ *
+ * This may be used to efficiently clone a subset of an IOAS to another, or as a
+ * kind of 'cache' to speed up mapping. Copy has an effciency advantage over
+ * establishing equivalent new mappings, as internal resources are shared, and
+ * the kernel will pin the user memory only once.
+ */
+struct iommu_ioas_copy {
+	__u32 size;
+	__u32 flags;
+	__u32 dst_ioas_id;
+	__u32 src_ioas_id;
+	__aligned_u64 length;
+	__aligned_u64 dst_iova;
+	__aligned_u64 src_iova;
+};
+#define IOMMU_IOAS_COPY _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_COPY)
+
+/**
+ * struct iommu_ioas_unmap - ioctl(IOMMU_IOAS_UNMAP)
+ * @size: sizeof(struct iommu_ioas_unmap)
+ * @ioas_id: IOAS ID to change the mapping of
+ * @iova: IOVA to start the unmapping at
+ * @length: Number of bytes to unmap, and return back the bytes unmapped
+ *
+ * Unmap an IOVA range. The iova/length must be a superset of a previously
+ * mapped range used with IOMMU_IOAS_MAP or IOMMU_IOAS_COPY. Splitting or
+ * truncating ranges is not allowed. The values 0 to U64_MAX will unmap
+ * everything.
+ */
+struct iommu_ioas_unmap {
+	__u32 size;
+	__u32 ioas_id;
+	__aligned_u64 iova;
+	__aligned_u64 length;
+};
+#define IOMMU_IOAS_UNMAP _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_UNMAP)
+
+/**
+ * enum iommufd_option - ioctl(IOMMU_OPTION_RLIMIT_MODE) and
+ *                       ioctl(IOMMU_OPTION_HUGE_PAGES)
+ * @IOMMU_OPTION_RLIMIT_MODE:
+ *    Change how RLIMIT_MEMLOCK accounting works. The caller must have privilege
+ *    to invoke this. Value 0 (default) is user based accouting, 1 uses process
+ *    based accounting. Global option, object_id must be 0
+ * @IOMMU_OPTION_HUGE_PAGES:
+ *    Value 1 (default) allows contiguous pages to be combined when generating
+ *    iommu mappings. Value 0 disables combining, everything is mapped to
+ *    PAGE_SIZE. This can be useful for benchmarking.  This is a per-IOAS
+ *    option, the object_id must be the IOAS ID.
+ */
+enum iommufd_option {
+	IOMMU_OPTION_RLIMIT_MODE = 0,
+	IOMMU_OPTION_HUGE_PAGES = 1,
+};
+
+/**
+ * enum iommufd_option_ops - ioctl(IOMMU_OPTION_OP_SET) and
+ *                           ioctl(IOMMU_OPTION_OP_GET)
+ * @IOMMU_OPTION_OP_SET: Set the option's value
+ * @IOMMU_OPTION_OP_GET: Get the option's value
+ */
+enum iommufd_option_ops {
+	IOMMU_OPTION_OP_SET = 0,
+	IOMMU_OPTION_OP_GET = 1,
+};
+
+/**
+ * struct iommu_option - iommu option multiplexer
+ * @size: sizeof(struct iommu_option)
+ * @option_id: One of enum iommufd_option
+ * @op: One of enum iommufd_option_ops
+ * @__reserved: Must be 0
+ * @object_id: ID of the object if required
+ * @val64: Option value to set or value returned on get
+ *
+ * Change a simple option value. This multiplexor allows controlling a options
+ * on objects. IOMMU_OPTION_OP_SET will load an option and IOMMU_OPTION_OP_GET
+ * will return the current value.
+ */
+struct iommu_option {
+	__u32 size;
+	__u32 option_id;
+	__u16 op;
+	__u16 __reserved;
+	__u32 object_id;
+	__aligned_u64 val64;
+};
+#define IOMMU_OPTION _IO(IOMMUFD_TYPE, IOMMUFD_CMD_OPTION)
 #endif