diff mbox series

[2/3] drm/msm: add DRM_MSM_GEM_SYNC_CACHE for non-coherent cache maintenance

Message ID 20201001002709.21361-3-jonathan@marek.ca
State New
Headers show
Series drm/msm: support for host-cached BOs | expand

Commit Message

Jonathan Marek Oct. 1, 2020, 12:27 a.m. UTC
This makes it possible to use the non-coherent cached MSM_BO_CACHED mode,
which otherwise doesn't provide any method for cleaning/invalidating the
cache to sync with the device.

Signed-off-by: Jonathan Marek <jonathan@marek.ca>
---
 drivers/gpu/drm/msm/msm_drv.c | 21 +++++++++++++++++++++
 drivers/gpu/drm/msm/msm_drv.h |  2 ++
 drivers/gpu/drm/msm/msm_gem.c | 15 +++++++++++++++
 include/uapi/drm/msm_drm.h    | 20 ++++++++++++++++++++
 4 files changed, 58 insertions(+)

Comments

Jordan Crouse Oct. 1, 2020, 11:29 p.m. UTC | #1
On Wed, Sep 30, 2020 at 08:27:05PM -0400, Jonathan Marek wrote:
> This makes it possible to use the non-coherent cached MSM_BO_CACHED mode,
> which otherwise doesn't provide any method for cleaning/invalidating the
> cache to sync with the device.
> 
> Signed-off-by: Jonathan Marek <jonathan@marek.ca>
> ---
>  drivers/gpu/drm/msm/msm_drv.c | 21 +++++++++++++++++++++
>  drivers/gpu/drm/msm/msm_drv.h |  2 ++
>  drivers/gpu/drm/msm/msm_gem.c | 15 +++++++++++++++
>  include/uapi/drm/msm_drm.h    | 20 ++++++++++++++++++++
>  4 files changed, 58 insertions(+)
> 
> diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c
> index 9716210495fc..305db1db1064 100644
> --- a/drivers/gpu/drm/msm/msm_drv.c
> +++ b/drivers/gpu/drm/msm/msm_drv.c
> @@ -964,6 +964,26 @@ static int msm_ioctl_submitqueue_close(struct drm_device *dev, void *data,
>  	return msm_submitqueue_remove(file->driver_priv, id);
>  }
>  
> +static int msm_ioctl_gem_sync_cache(struct drm_device *dev, void *data,
> +		struct drm_file *file)
> +{
> +	struct drm_msm_gem_sync_cache *args = data;
> +	struct drm_gem_object *obj;
> +
> +	if (args->flags & ~MSM_GEM_SYNC_CACHE_FLAGS)
> +		return -EINVAL;
> +
> +	obj = drm_gem_object_lookup(file, args->handle);
> +	if (!obj)
> +		return -ENOENT;
> +
> +	msm_gem_sync_cache(obj, args->flags, args->offset, args->end);
> +
> +	drm_gem_object_put(obj);
> +
> +	return 0;
> +}
> +
>  static const struct drm_ioctl_desc msm_ioctls[] = {
>  	DRM_IOCTL_DEF_DRV(MSM_GET_PARAM,    msm_ioctl_get_param,    DRM_RENDER_ALLOW),
>  	DRM_IOCTL_DEF_DRV(MSM_GEM_NEW,      msm_ioctl_gem_new,      DRM_RENDER_ALLOW),
> @@ -976,6 +996,7 @@ static const struct drm_ioctl_desc msm_ioctls[] = {
>  	DRM_IOCTL_DEF_DRV(MSM_SUBMITQUEUE_NEW,   msm_ioctl_submitqueue_new,   DRM_RENDER_ALLOW),
>  	DRM_IOCTL_DEF_DRV(MSM_SUBMITQUEUE_CLOSE, msm_ioctl_submitqueue_close, DRM_RENDER_ALLOW),
>  	DRM_IOCTL_DEF_DRV(MSM_SUBMITQUEUE_QUERY, msm_ioctl_submitqueue_query, DRM_RENDER_ALLOW),
> +	DRM_IOCTL_DEF_DRV(MSM_GEM_SYNC_CACHE,    msm_ioctl_gem_sync_cache,    DRM_RENDER_ALLOW),
>  };
>  
>  static const struct vm_operations_struct vm_ops = {
> diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h
> index 6384844b1696..5e932dae453f 100644
> --- a/drivers/gpu/drm/msm/msm_drv.h
> +++ b/drivers/gpu/drm/msm/msm_drv.h
> @@ -314,6 +314,8 @@ void msm_gem_move_to_active(struct drm_gem_object *obj,
>  void msm_gem_move_to_inactive(struct drm_gem_object *obj);
>  int msm_gem_cpu_prep(struct drm_gem_object *obj, uint32_t op, ktime_t *timeout);
>  int msm_gem_cpu_fini(struct drm_gem_object *obj);
> +void msm_gem_sync_cache(struct drm_gem_object *obj, uint32_t flags,
> +		size_t range_start, size_t range_end);
>  void msm_gem_free_object(struct drm_gem_object *obj);
>  int msm_gem_new_handle(struct drm_device *dev, struct drm_file *file,
>  		uint32_t size, uint32_t flags, uint32_t *handle, char *name);
> diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c
> index ad9a627493ae..93da88b3fc50 100644
> --- a/drivers/gpu/drm/msm/msm_gem.c
> +++ b/drivers/gpu/drm/msm/msm_gem.c
> @@ -8,6 +8,7 @@
>  #include <linux/shmem_fs.h>
>  #include <linux/dma-buf.h>
>  #include <linux/pfn_t.h>
> +#include <linux/dma-noncoherent.h>
>  
>  #include <drm/drm_prime.h>
>  
> @@ -808,6 +809,20 @@ int msm_gem_cpu_fini(struct drm_gem_object *obj)
>  	return 0;
>  }
>  
> +void msm_gem_sync_cache(struct drm_gem_object *obj, uint32_t flags,
> +		size_t range_start, size_t range_end)
> +{
> +	struct msm_gem_object *msm_obj = to_msm_bo(obj);
> +
> +	/* TODO: sync only the required range, and don't invalidate on clean */
> +
> +	if (flags & MSM_GEM_SYNC_CACHE_CLEAN)

Curious why you would rename these - I feel like the to_device / to_cpu model is
pretty well baked into our thought process. I know from personal experience that
I have to stop and think to remember which direction is which.

Jordan

> +		sync_for_device(msm_obj);
> +
> +	if (flags & MSM_GEM_SYNC_CACHE_INVALIDATE)
> +		sync_for_cpu(msm_obj);
> +}
> +
>  #ifdef CONFIG_DEBUG_FS
>  static void describe_fence(struct dma_fence *fence, const char *type,
>  		struct seq_file *m)
> diff --git a/include/uapi/drm/msm_drm.h b/include/uapi/drm/msm_drm.h
> index 474497e8743a..1dfafa71fc94 100644
> --- a/include/uapi/drm/msm_drm.h
> +++ b/include/uapi/drm/msm_drm.h
> @@ -319,6 +319,24 @@ struct drm_msm_submitqueue_query {
>  	__u32 pad;
>  };
>  
> +/*
> + * Host cache maintenance (relevant for MSM_BO_CACHED)
> + * driver may both clean/invalidate (flush) for clean
> + */
> +
> +#define MSM_GEM_SYNC_CACHE_CLEAN	0x1
> +#define MSM_GEM_SYNC_CACHE_INVALIDATE	0x2
> +
> +#define MSM_GEM_SYNC_CACHE_FLAGS	(MSM_GEM_SYNC_CACHE_CLEAN | \
> +					 MSM_GEM_SYNC_CACHE_INVALIDATE)
> +
> +struct drm_msm_gem_sync_cache {
> +	__u32 handle;
> +	__u32 flags;
> +	__u64 offset;
> +	__u64 end;      /* offset + size */
> +};
> +
>  #define DRM_MSM_GET_PARAM              0x00
>  /* placeholder:
>  #define DRM_MSM_SET_PARAM              0x01
> @@ -336,6 +354,7 @@ struct drm_msm_submitqueue_query {
>  #define DRM_MSM_SUBMITQUEUE_NEW        0x0A
>  #define DRM_MSM_SUBMITQUEUE_CLOSE      0x0B
>  #define DRM_MSM_SUBMITQUEUE_QUERY      0x0C
> +#define DRM_MSM_GEM_SYNC_CACHE         0x0D
>  
>  #define DRM_IOCTL_MSM_GET_PARAM        DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_GET_PARAM, struct drm_msm_param)
>  #define DRM_IOCTL_MSM_GEM_NEW          DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_GEM_NEW, struct drm_msm_gem_new)
> @@ -348,6 +367,7 @@ struct drm_msm_submitqueue_query {
>  #define DRM_IOCTL_MSM_SUBMITQUEUE_NEW    DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_SUBMITQUEUE_NEW, struct drm_msm_submitqueue)
>  #define DRM_IOCTL_MSM_SUBMITQUEUE_CLOSE  DRM_IOW (DRM_COMMAND_BASE + DRM_MSM_SUBMITQUEUE_CLOSE, __u32)
>  #define DRM_IOCTL_MSM_SUBMITQUEUE_QUERY  DRM_IOW (DRM_COMMAND_BASE + DRM_MSM_SUBMITQUEUE_QUERY, struct drm_msm_submitqueue_query)
> +#define DRM_IOCTL_MSM_GEM_SYNC_CACHE     DRM_IOW (DRM_COMMAND_BASE + DRM_MSM_GEM_SYNC_CACHE, struct drm_msm_gem_sync_cache)
>  
>  #if defined(__cplusplus)
>  }
> -- 
> 2.26.1
> 
> _______________________________________________
> Freedreno mailing list
> Freedreno@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/freedreno
'Christoph Hellwig' Oct. 2, 2020, 7:53 a.m. UTC | #2
> @@ -8,6 +8,7 @@
>  #include <linux/shmem_fs.h>
>  #include <linux/dma-buf.h>
>  #include <linux/pfn_t.h>
> +#include <linux/dma-noncoherent.h>

NAK, dma-noncoherent.h is not for driver use.  And will in fact go
away in 5.10.

>  
>  #include <drm/drm_prime.h>
>  
> @@ -808,6 +809,20 @@ int msm_gem_cpu_fini(struct drm_gem_object *obj)
>  	return 0;
>  }
>  
> +void msm_gem_sync_cache(struct drm_gem_object *obj, uint32_t flags,
> +		size_t range_start, size_t range_end)
> +{
> +	struct msm_gem_object *msm_obj = to_msm_bo(obj);
> +
> +	/* TODO: sync only the required range, and don't invalidate on clean */
> +
> +	if (flags & MSM_GEM_SYNC_CACHE_CLEAN)
> +		sync_for_device(msm_obj);
> +
> +	if (flags & MSM_GEM_SYNC_CACHE_INVALIDATE)
> +		sync_for_cpu(msm_obj);

And make to these ones as well.  They are complete abuses of the DMA
API, and while we had to live with that for now to not cause regressions
they absoutely must not be exposed in a userspace ABI like this.
Jonathan Marek Oct. 2, 2020, 12:46 p.m. UTC | #3
On 10/2/20 3:53 AM, Christoph Hellwig wrote:
>> @@ -8,6 +8,7 @@
>>   #include <linux/shmem_fs.h>
>>   #include <linux/dma-buf.h>
>>   #include <linux/pfn_t.h>
>> +#include <linux/dma-noncoherent.h>
> 
> NAK, dma-noncoherent.h is not for driver use.  And will in fact go
> away in 5.10.
> 

Not actually used, so can be removed.

>>   
>>   #include <drm/drm_prime.h>
>>   
>> @@ -808,6 +809,20 @@ int msm_gem_cpu_fini(struct drm_gem_object *obj)
>>   	return 0;
>>   }
>>   
>> +void msm_gem_sync_cache(struct drm_gem_object *obj, uint32_t flags,
>> +		size_t range_start, size_t range_end)
>> +{
>> +	struct msm_gem_object *msm_obj = to_msm_bo(obj);
>> +
>> +	/* TODO: sync only the required range, and don't invalidate on clean */
>> +
>> +	if (flags & MSM_GEM_SYNC_CACHE_CLEAN)
>> +		sync_for_device(msm_obj);
>> +
>> +	if (flags & MSM_GEM_SYNC_CACHE_INVALIDATE)
>> +		sync_for_cpu(msm_obj);
> 
> And make to these ones as well.  They are complete abuses of the DMA
> API, and while we had to live with that for now to not cause regressions
> they absoutely must not be exposed in a userspace ABI like this.
> 

How do you propose that cached non-coherent memory be implemented? It is 
a useful feature for userspace.
'Christoph Hellwig' Oct. 5, 2020, 8:29 a.m. UTC | #4
On Fri, Oct 02, 2020 at 08:46:35AM -0400, Jonathan Marek wrote:
> > > +void msm_gem_sync_cache(struct drm_gem_object *obj, uint32_t flags,
> > > +		size_t range_start, size_t range_end)
> > > +{
> > > +	struct msm_gem_object *msm_obj = to_msm_bo(obj);
> > > +
> > > +	/* TODO: sync only the required range, and don't invalidate on clean */
> > > +
> > > +	if (flags & MSM_GEM_SYNC_CACHE_CLEAN)
> > > +		sync_for_device(msm_obj);
> > > +
> > > +	if (flags & MSM_GEM_SYNC_CACHE_INVALIDATE)
> > > +		sync_for_cpu(msm_obj);
> > 
> > And make to these ones as well.  They are complete abuses of the DMA
> > API, and while we had to live with that for now to not cause regressions
> > they absoutely must not be exposed in a userspace ABI like this.
> > 
> 
> How do you propose that cached non-coherent memory be implemented? It is a
> useful feature for userspace.

If the driver is using the DMA API you need to use dma_alloc_noncoherent
and friends as of 5.10 (see the iommu list for the discussion).

If you use the raw IOMMU API (which I think the msm drm driver does) you
need to work with the maintainers to implement a cache synchronization
API that is not tied to the DMA API.
Jonathan Marek Oct. 5, 2020, 2:35 p.m. UTC | #5
On 10/5/20 4:29 AM, Christoph Hellwig wrote:
> On Fri, Oct 02, 2020 at 08:46:35AM -0400, Jonathan Marek wrote:
>>>> +void msm_gem_sync_cache(struct drm_gem_object *obj, uint32_t flags,
>>>> +		size_t range_start, size_t range_end)
>>>> +{
>>>> +	struct msm_gem_object *msm_obj = to_msm_bo(obj);
>>>> +
>>>> +	/* TODO: sync only the required range, and don't invalidate on clean */
>>>> +
>>>> +	if (flags & MSM_GEM_SYNC_CACHE_CLEAN)
>>>> +		sync_for_device(msm_obj);
>>>> +
>>>> +	if (flags & MSM_GEM_SYNC_CACHE_INVALIDATE)
>>>> +		sync_for_cpu(msm_obj);
>>>
>>> And make to these ones as well.  They are complete abuses of the DMA
>>> API, and while we had to live with that for now to not cause regressions
>>> they absoutely must not be exposed in a userspace ABI like this.
>>>
>>
>> How do you propose that cached non-coherent memory be implemented? It is a
>> useful feature for userspace.
> 
> If the driver is using the DMA API you need to use dma_alloc_noncoherent
> and friends as of 5.10 (see the iommu list for the discussion).
> 
> If you use the raw IOMMU API (which I think the msm drm driver does) you
> need to work with the maintainers to implement a cache synchronization
> API that is not tied to the DMA API.
> 

The cache synchronization doesn't have anything to do with IOMMU (for 
example: cache synchronization would be useful in cases where drm/msm 
doesn't use IOMMU).

What is needed is to call arch_sync_dma_for_{cpu,device} (which is what 
I went with initially, but then decided to re-use drm/msm's 
sync_for_{cpu,device}). But you are also saying those functions aren't 
for driver use, and I doubt IOMMU maintainers will want to add wrappers 
for these functions just to satisfy this "not for driver use" requirement.
'Christoph Hellwig' Oct. 6, 2020, 7:23 a.m. UTC | #6
On Mon, Oct 05, 2020 at 10:35:43AM -0400, Jonathan Marek wrote:
> The cache synchronization doesn't have anything to do with IOMMU (for
> example: cache synchronization would be useful in cases where drm/msm
> doesn't use IOMMU).

It has to do with doing DMA.  And we have two frameworks for doing DMA:
either the DMA API which is for general driver use, and which as part of
the design includes cache maintainance hidden behind the concept of
ownership transfers.  And we have the much more bare bones IOMMU API.

If people want to use the "raw" IOMMU API with not cache coherent
devices we'll need a cache maintainance API that goes along with it.
It could either be formally part of the IOMMU API or be separate.

> What is needed is to call arch_sync_dma_for_{cpu,device} (which is what I
> went with initially, but then decided to re-use drm/msm's
> sync_for_{cpu,device}). But you are also saying those functions aren't for
> driver use, and I doubt IOMMU maintainers will want to add wrappers for
> these functions just to satisfy this "not for driver use" requirement.

arch_sync_dma_for_{cpu,device} are low-level helpers (and not very
great ones at that).  The definitively should not be used by drivers.
They would be very useful buildblocks for a IOMMU cache maintainance
API.

Of course the best outcome would be if we could find a way for the MSM
drm driver to just use DMA API and not deal with the lower level
abstractions.  Do you remember why the driver went for use of the IOMMU
API?
Jonathan Marek Oct. 6, 2020, 1:19 p.m. UTC | #7
On 10/6/20 3:23 AM, Christoph Hellwig wrote:
> On Mon, Oct 05, 2020 at 10:35:43AM -0400, Jonathan Marek wrote:
>> The cache synchronization doesn't have anything to do with IOMMU (for
>> example: cache synchronization would be useful in cases where drm/msm
>> doesn't use IOMMU).
> 
> It has to do with doing DMA.  And we have two frameworks for doing DMA:
> either the DMA API which is for general driver use, and which as part of
> the design includes cache maintainance hidden behind the concept of
> ownership transfers.  And we have the much more bare bones IOMMU API.
> 
> If people want to use the "raw" IOMMU API with not cache coherent
> devices we'll need a cache maintainance API that goes along with it.
> It could either be formally part of the IOMMU API or be separate.
> 
>> What is needed is to call arch_sync_dma_for_{cpu,device} (which is what I
>> went with initially, but then decided to re-use drm/msm's
>> sync_for_{cpu,device}). But you are also saying those functions aren't for
>> driver use, and I doubt IOMMU maintainers will want to add wrappers for
>> these functions just to satisfy this "not for driver use" requirement.
> 
> arch_sync_dma_for_{cpu,device} are low-level helpers (and not very
> great ones at that).  The definitively should not be used by drivers.
> They would be very useful buildblocks for a IOMMU cache maintainance
> API.
> 
> Of course the best outcome would be if we could find a way for the MSM
> drm driver to just use DMA API and not deal with the lower level
> abstractions.  Do you remember why the driver went for use of the IOMMU
> API?
> 

One example why drm/msm can't use DMA API is multiple page table support 
(that is landing in 5.10), which is something that definitely couldn't 
work with DMA API.

Another one is being able to choose the address for mappings, which 
AFAIK DMA API can't do (somewhat related to this: qcom hardware often 
has ranges of allowed addresses, which the dma_mask mechanism fails to 
represent, what I see is drivers using dma_mask as a "maximum address", 
and since addresses are allocated from the top it generally works)

But let us imagine drm/msm switches to using DMA API. a2xx GPUs have 
their own very basic MMU (implemented by msm_gpummu.c), that will need 
to implement dma_map_ops, which will have to call 
arch_sync_dma_for_{cpu,device}. So drm/msm still needs to call 
arch_sync_dma_for_{cpu,device} in that scenario.
'Christoph Hellwig' Oct. 7, 2020, 6:25 a.m. UTC | #8
On Tue, Oct 06, 2020 at 09:19:32AM -0400, Jonathan Marek wrote:
> One example why drm/msm can't use DMA API is multiple page table support
> (that is landing in 5.10), which is something that definitely couldn't work
> with DMA API.
> 
> Another one is being able to choose the address for mappings, which AFAIK
> DMA API can't do (somewhat related to this: qcom hardware often has ranges
> of allowed addresses, which the dma_mask mechanism fails to represent, what
> I see is drivers using dma_mask as a "maximum address", and since addresses
> are allocated from the top it generally works)

That sounds like a good enough rason to use the IOMMU API.  I just
wanted to make sure this really makes sense.
Joerg Roedel Oct. 8, 2020, 8:27 a.m. UTC | #9
On Tue, Oct 06, 2020 at 08:23:06AM +0100, Christoph Hellwig wrote:
> If people want to use the "raw" IOMMU API with not cache coherent
> devices we'll need a cache maintainance API that goes along with it.
> It could either be formally part of the IOMMU API or be separate.

The IOMMU-API does not care about the caching effects of DMA, is manages
IO address spaces for devices. I also don't know how this would be going
to be implemented, the IOMMU-API does not have the concept of handles
for mapped ranges and does not care about CPU virtual addresses (which
are needed for cache flushes) of the memory it maps into IO page-tables.

So I think a cache management API should be separate from the IOMMU-API.

Regards,

	Joerg
Robin Murphy Oct. 13, 2020, 1:42 p.m. UTC | #10
On 2020-10-07 07:25, Christoph Hellwig wrote:
> On Tue, Oct 06, 2020 at 09:19:32AM -0400, Jonathan Marek wrote:
>> One example why drm/msm can't use DMA API is multiple page table support
>> (that is landing in 5.10), which is something that definitely couldn't work
>> with DMA API.
>>
>> Another one is being able to choose the address for mappings, which AFAIK
>> DMA API can't do (somewhat related to this: qcom hardware often has ranges
>> of allowed addresses, which the dma_mask mechanism fails to represent, what
>> I see is drivers using dma_mask as a "maximum address", and since addresses
>> are allocated from the top it generally works)
> 
> That sounds like a good enough rason to use the IOMMU API.  I just
> wanted to make sure this really makes sense.

I still think this situation would be best handled with a variant of 
dma_ops_bypass that also guarantees to bypass SWIOTLB, and can be set 
automatically when attaching to an unmanaged IOMMU domain. That way the 
device driver can make DMA API calls in the appropriate places that do 
the right thing either way, and only needs logic to decide whether to 
use the returned DMA addresses directly or ignore them if it knows 
they're overridden by its own IOMMU mapping.

Robin.
Rob Clark Oct. 13, 2020, 4:11 p.m. UTC | #11
On Tue, Oct 13, 2020 at 6:42 AM Robin Murphy <robin.murphy@arm.com> wrote:
>
> On 2020-10-07 07:25, Christoph Hellwig wrote:
> > On Tue, Oct 06, 2020 at 09:19:32AM -0400, Jonathan Marek wrote:
> >> One example why drm/msm can't use DMA API is multiple page table support
> >> (that is landing in 5.10), which is something that definitely couldn't work
> >> with DMA API.
> >>
> >> Another one is being able to choose the address for mappings, which AFAIK
> >> DMA API can't do (somewhat related to this: qcom hardware often has ranges
> >> of allowed addresses, which the dma_mask mechanism fails to represent, what
> >> I see is drivers using dma_mask as a "maximum address", and since addresses
> >> are allocated from the top it generally works)
> >
> > That sounds like a good enough rason to use the IOMMU API.  I just
> > wanted to make sure this really makes sense.
>
> I still think this situation would be best handled with a variant of
> dma_ops_bypass that also guarantees to bypass SWIOTLB, and can be set
> automatically when attaching to an unmanaged IOMMU domain. That way the
> device driver can make DMA API calls in the appropriate places that do
> the right thing either way, and only needs logic to decide whether to
> use the returned DMA addresses directly or ignore them if it knows
> they're overridden by its own IOMMU mapping.
>

That would be pretty ideal from my PoV

BR,
-R
'Christoph Hellwig' Oct. 15, 2020, 6:55 a.m. UTC | #12
On Tue, Oct 13, 2020 at 02:42:38PM +0100, Robin Murphy wrote:
> I still think this situation would be best handled with a variant of
> dma_ops_bypass that also guarantees to bypass SWIOTLB, and can be set
> automatically when attaching to an unmanaged IOMMU domain.

dma_ops_bypass should mostly do the right thing as-is.  swiotlb bouncing
is triggered of two things:

 1) the dma_mask.  This is under control of the driver, and obviously
    if it is too small for a legit reason we can't just proceed
 2) force_dma_unencrypted() - we'd need to do an opt-out here, either
    by a flag or by being smart and looking for an attached iommu on
    the device

> That way the
> device driver can make DMA API calls in the appropriate places that do the
> right thing either way, and only needs logic to decide whether to use the
> returned DMA addresses directly or ignore them if it knows they're
> overridden by its own IOMMU mapping.

I'd be happy to review patches for this.
Daniel Vetter Oct. 15, 2020, 3:33 p.m. UTC | #13
On Thu, Oct 15, 2020 at 07:55:32AM +0100, Christoph Hellwig wrote:
> On Tue, Oct 13, 2020 at 02:42:38PM +0100, Robin Murphy wrote:
> > I still think this situation would be best handled with a variant of
> > dma_ops_bypass that also guarantees to bypass SWIOTLB, and can be set
> > automatically when attaching to an unmanaged IOMMU domain.
> 
> dma_ops_bypass should mostly do the right thing as-is.  swiotlb bouncing
> is triggered of two things:
> 
>  1) the dma_mask.  This is under control of the driver, and obviously
>     if it is too small for a legit reason we can't just proceed

Somewhat related, but is there a way to tell the dma-api to fail instead
of falling back to swiotlb? In many case for gpu drivers it's much better
if we fall back to dma_alloc_coherent and manage the copying ourselves
instead of abstracting this away in the dma-api. Currently that's "solved"
rather pessimistically by always allocating from dma_alloc_coherent if
swiotlb could be in the picture (at least for ttm based drivers, i915 just
falls over).
-Daniel

>  2) force_dma_unencrypted() - we'd need to do an opt-out here, either
>     by a flag or by being smart and looking for an attached iommu on
>     the device
> 
> > That way the
> > device driver can make DMA API calls in the appropriate places that do the
> > right thing either way, and only needs logic to decide whether to use the
> > returned DMA addresses directly or ignore them if it knows they're
> > overridden by its own IOMMU mapping.
> 
> I'd be happy to review patches for this.
'Christoph Hellwig' Oct. 15, 2020, 3:43 p.m. UTC | #14
On Thu, Oct 15, 2020 at 05:33:34PM +0200, Daniel Vetter wrote:
> On Thu, Oct 15, 2020 at 07:55:32AM +0100, Christoph Hellwig wrote:
> > On Tue, Oct 13, 2020 at 02:42:38PM +0100, Robin Murphy wrote:
> > > I still think this situation would be best handled with a variant of
> > > dma_ops_bypass that also guarantees to bypass SWIOTLB, and can be set
> > > automatically when attaching to an unmanaged IOMMU domain.
> > 
> > dma_ops_bypass should mostly do the right thing as-is.  swiotlb bouncing
> > is triggered of two things:
> > 
> >  1) the dma_mask.  This is under control of the driver, and obviously
> >     if it is too small for a legit reason we can't just proceed
> 
> Somewhat related, but is there a way to tell the dma-api to fail instead
> of falling back to swiotlb? In many case for gpu drivers it's much better
> if we fall back to dma_alloc_coherent and manage the copying ourselves
> instead of abstracting this away in the dma-api. Currently that's "solved"
> rather pessimistically by always allocating from dma_alloc_coherent if
> swiotlb could be in the picture (at least for ttm based drivers, i915 just
> falls over).

Is this for the alloc_pages plus manually map logic in various drivers?

They should switch to the new dma_alloc_pages API that I'll send to
Linus for 5.10 soon.
'Christoph Hellwig' Oct. 23, 2020, 6:48 a.m. UTC | #15
On Thu, Oct 15, 2020 at 04:43:01PM +0100, Christoph Hellwig wrote:
> > Somewhat related, but is there a way to tell the dma-api to fail instead
> > of falling back to swiotlb? In many case for gpu drivers it's much better
> > if we fall back to dma_alloc_coherent and manage the copying ourselves
> > instead of abstracting this away in the dma-api. Currently that's "solved"
> > rather pessimistically by always allocating from dma_alloc_coherent if
> > swiotlb could be in the picture (at least for ttm based drivers, i915 just
> > falls over).
> 
> Is this for the alloc_pages plus manually map logic in various drivers?
> 
> They should switch to the new dma_alloc_pages API that I'll send to
> Linus for 5.10 soon.

Daniel, can you clarify this?
diff mbox series

Patch

diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c
index 9716210495fc..305db1db1064 100644
--- a/drivers/gpu/drm/msm/msm_drv.c
+++ b/drivers/gpu/drm/msm/msm_drv.c
@@ -964,6 +964,26 @@  static int msm_ioctl_submitqueue_close(struct drm_device *dev, void *data,
 	return msm_submitqueue_remove(file->driver_priv, id);
 }
 
+static int msm_ioctl_gem_sync_cache(struct drm_device *dev, void *data,
+		struct drm_file *file)
+{
+	struct drm_msm_gem_sync_cache *args = data;
+	struct drm_gem_object *obj;
+
+	if (args->flags & ~MSM_GEM_SYNC_CACHE_FLAGS)
+		return -EINVAL;
+
+	obj = drm_gem_object_lookup(file, args->handle);
+	if (!obj)
+		return -ENOENT;
+
+	msm_gem_sync_cache(obj, args->flags, args->offset, args->end);
+
+	drm_gem_object_put(obj);
+
+	return 0;
+}
+
 static const struct drm_ioctl_desc msm_ioctls[] = {
 	DRM_IOCTL_DEF_DRV(MSM_GET_PARAM,    msm_ioctl_get_param,    DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(MSM_GEM_NEW,      msm_ioctl_gem_new,      DRM_RENDER_ALLOW),
@@ -976,6 +996,7 @@  static const struct drm_ioctl_desc msm_ioctls[] = {
 	DRM_IOCTL_DEF_DRV(MSM_SUBMITQUEUE_NEW,   msm_ioctl_submitqueue_new,   DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(MSM_SUBMITQUEUE_CLOSE, msm_ioctl_submitqueue_close, DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(MSM_SUBMITQUEUE_QUERY, msm_ioctl_submitqueue_query, DRM_RENDER_ALLOW),
+	DRM_IOCTL_DEF_DRV(MSM_GEM_SYNC_CACHE,    msm_ioctl_gem_sync_cache,    DRM_RENDER_ALLOW),
 };
 
 static const struct vm_operations_struct vm_ops = {
diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h
index 6384844b1696..5e932dae453f 100644
--- a/drivers/gpu/drm/msm/msm_drv.h
+++ b/drivers/gpu/drm/msm/msm_drv.h
@@ -314,6 +314,8 @@  void msm_gem_move_to_active(struct drm_gem_object *obj,
 void msm_gem_move_to_inactive(struct drm_gem_object *obj);
 int msm_gem_cpu_prep(struct drm_gem_object *obj, uint32_t op, ktime_t *timeout);
 int msm_gem_cpu_fini(struct drm_gem_object *obj);
+void msm_gem_sync_cache(struct drm_gem_object *obj, uint32_t flags,
+		size_t range_start, size_t range_end);
 void msm_gem_free_object(struct drm_gem_object *obj);
 int msm_gem_new_handle(struct drm_device *dev, struct drm_file *file,
 		uint32_t size, uint32_t flags, uint32_t *handle, char *name);
diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c
index ad9a627493ae..93da88b3fc50 100644
--- a/drivers/gpu/drm/msm/msm_gem.c
+++ b/drivers/gpu/drm/msm/msm_gem.c
@@ -8,6 +8,7 @@ 
 #include <linux/shmem_fs.h>
 #include <linux/dma-buf.h>
 #include <linux/pfn_t.h>
+#include <linux/dma-noncoherent.h>
 
 #include <drm/drm_prime.h>
 
@@ -808,6 +809,20 @@  int msm_gem_cpu_fini(struct drm_gem_object *obj)
 	return 0;
 }
 
+void msm_gem_sync_cache(struct drm_gem_object *obj, uint32_t flags,
+		size_t range_start, size_t range_end)
+{
+	struct msm_gem_object *msm_obj = to_msm_bo(obj);
+
+	/* TODO: sync only the required range, and don't invalidate on clean */
+
+	if (flags & MSM_GEM_SYNC_CACHE_CLEAN)
+		sync_for_device(msm_obj);
+
+	if (flags & MSM_GEM_SYNC_CACHE_INVALIDATE)
+		sync_for_cpu(msm_obj);
+}
+
 #ifdef CONFIG_DEBUG_FS
 static void describe_fence(struct dma_fence *fence, const char *type,
 		struct seq_file *m)
diff --git a/include/uapi/drm/msm_drm.h b/include/uapi/drm/msm_drm.h
index 474497e8743a..1dfafa71fc94 100644
--- a/include/uapi/drm/msm_drm.h
+++ b/include/uapi/drm/msm_drm.h
@@ -319,6 +319,24 @@  struct drm_msm_submitqueue_query {
 	__u32 pad;
 };
 
+/*
+ * Host cache maintenance (relevant for MSM_BO_CACHED)
+ * driver may both clean/invalidate (flush) for clean
+ */
+
+#define MSM_GEM_SYNC_CACHE_CLEAN	0x1
+#define MSM_GEM_SYNC_CACHE_INVALIDATE	0x2
+
+#define MSM_GEM_SYNC_CACHE_FLAGS	(MSM_GEM_SYNC_CACHE_CLEAN | \
+					 MSM_GEM_SYNC_CACHE_INVALIDATE)
+
+struct drm_msm_gem_sync_cache {
+	__u32 handle;
+	__u32 flags;
+	__u64 offset;
+	__u64 end;      /* offset + size */
+};
+
 #define DRM_MSM_GET_PARAM              0x00
 /* placeholder:
 #define DRM_MSM_SET_PARAM              0x01
@@ -336,6 +354,7 @@  struct drm_msm_submitqueue_query {
 #define DRM_MSM_SUBMITQUEUE_NEW        0x0A
 #define DRM_MSM_SUBMITQUEUE_CLOSE      0x0B
 #define DRM_MSM_SUBMITQUEUE_QUERY      0x0C
+#define DRM_MSM_GEM_SYNC_CACHE         0x0D
 
 #define DRM_IOCTL_MSM_GET_PARAM        DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_GET_PARAM, struct drm_msm_param)
 #define DRM_IOCTL_MSM_GEM_NEW          DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_GEM_NEW, struct drm_msm_gem_new)
@@ -348,6 +367,7 @@  struct drm_msm_submitqueue_query {
 #define DRM_IOCTL_MSM_SUBMITQUEUE_NEW    DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_SUBMITQUEUE_NEW, struct drm_msm_submitqueue)
 #define DRM_IOCTL_MSM_SUBMITQUEUE_CLOSE  DRM_IOW (DRM_COMMAND_BASE + DRM_MSM_SUBMITQUEUE_CLOSE, __u32)
 #define DRM_IOCTL_MSM_SUBMITQUEUE_QUERY  DRM_IOW (DRM_COMMAND_BASE + DRM_MSM_SUBMITQUEUE_QUERY, struct drm_msm_submitqueue_query)
+#define DRM_IOCTL_MSM_GEM_SYNC_CACHE     DRM_IOW (DRM_COMMAND_BASE + DRM_MSM_GEM_SYNC_CACHE, struct drm_msm_gem_sync_cache)
 
 #if defined(__cplusplus)
 }