diff mbox series

[v3,5/6] media: uvcvideo: Use dma_alloc_noncontiguos API

Message ID 20201125221917.150463-1-ribalda@chromium.org
State New
Headers show
Series None | expand

Commit Message

Ricardo Ribalda Nov. 25, 2020, 10:19 p.m. UTC
On architectures where the is no coherent caching such as ARM use the
dma_alloc_noncontiguos API and handle manually the cache flushing using
dma_sync_sgtable().

With this patch on the affected architectures we can measure up to 20x
performance improvement in uvc_video_copy_data_work().

Signed-off-by: Ricardo Ribalda <ribalda@chromium.org>
---

v3: Thanks to Marek Szyprowski

Use dma_sync_sgtable and _for_cpu()

 drivers/media/usb/uvc/uvc_video.c | 93 +++++++++++++++++++++++++++----
 drivers/media/usb/uvc/uvcvideo.h  |  2 +
 2 files changed, 83 insertions(+), 12 deletions(-)

Comments

Sergey Senozhatsky Nov. 26, 2020, 11:44 a.m. UTC | #1
On (20/11/25 23:19), Ricardo Ribalda wrote:
[..]
> +	if (uvc_urb->pages)

> +		dma_sync_sgtable_for_device(stream_to_dmadev(uvc_urb->stream),

> +					    &uvc_urb->sgt, DMA_FROM_DEVICE);


[..]

> +	if (uvc_urb->pages)

> +		dma_sync_sgtable_for_cpu(stream_to_dmadev(stream),

> +					 &uvc_urb->sgt, DMA_FROM_DEVICE);


[..]

> +	uvc_urb->pages = dma_alloc_noncontiguous(dma_dev, stream->urb_size,

> +						 &uvc_urb->dma,

> +						 gfp_flags | __GFP_NOWARN, 0);


Do we need to pass __GFP_NOWARN? It seems that

dma_alloc_noncontiguous()
  __iommu_dma_alloc_noncontiguous()
    __iommu_dma_alloc_pages()

does this internally.

> +	if (!uvc_urb->pages)

> +		return false;

> +

> +	uvc_urb->buffer = vmap(uvc_urb->pages,

> +			       PAGE_ALIGN(stream->urb_size) >> PAGE_SHIFT,

> +			       VM_DMA_COHERENT, PAGE_KERNEL);


This is not related to Ricardo's patch, just a side note:

  I think VM_DMA_COHERENT needs to be renamed. I found it a bit confusing
  to see DMA_COHERENT mapping being dma_sync-ed. It turned out that the
  flag has different meaning.

	-ss
Christoph Hellwig Nov. 30, 2020, 8:31 a.m. UTC | #2
On Thu, Nov 26, 2020 at 08:44:50PM +0900, Sergey Senozhatsky wrote:
> > +	uvc_urb->buffer = vmap(uvc_urb->pages,

> > +			       PAGE_ALIGN(stream->urb_size) >> PAGE_SHIFT,

> > +			       VM_DMA_COHERENT, PAGE_KERNEL);

> 

> This is not related to Ricardo's patch, just a side note:

> 

>   I think VM_DMA_COHERENT needs to be renamed. I found it a bit confusing

>   to see DMA_COHERENT mapping being dma_sync-ed. It turned out that the

>   flag has different meaning.


s/renamed/removed/.  This is a normal VM_MAP mapping as far as the
core vmalloc/vmap code is concerned.  VM_DMA_COHERENT is only for
internal use in the core DMA code.
Christoph Hellwig Nov. 30, 2020, 8:34 a.m. UTC | #3
> +#ifndef CONFIG_DMA_NONCOHERENT


I think you need to drop this ifdef.  This code should work just fine
on noncoherent mips and sh platforms.

> +	uvc_urb->pages = dma_alloc_noncontiguous(dma_dev, stream->urb_size,

> +						 &uvc_urb->dma,

> +						 gfp_flags | __GFP_NOWARN, 0);

> +	if (!uvc_urb->pages)

> +		return false;

> +

> +	uvc_urb->buffer = vmap(uvc_urb->pages,

> +			       PAGE_ALIGN(stream->urb_size) >> PAGE_SHIFT,

> +			       VM_DMA_COHERENT, PAGE_KERNEL);

> +	if (!uvc_urb->buffer) {

> +		dma_free_noncontiguous(dma_dev, stream->urb_size,

> +				       uvc_urb->pages, uvc_urb->dma);

> +		return false;

> +	}

> +

> +	if (sg_alloc_table_from_pages(&uvc_urb->sgt, uvc_urb->pages,

> +				PAGE_ALIGN(stream->urb_size) >> PAGE_SHIFT, 0,

> +				stream->urb_size, GFP_KERNEL)) {

> +		vunmap(uvc_urb->buffer);

> +		dma_free_noncontiguous(dma_dev, stream->urb_size,

> +				       uvc_urb->pages, uvc_urb->dma);

> +		return false;

> +	}

> +

> +	return true;

> +}


I wonder if we should lift this into a helper.  On the one hand I had
proliferating struct scatterlist usage, on the other hand it is all over
the media and drm code anyway, and duplicating this doesn't help anyone.

Possibly including the fallback to the coherent allocating.
Ricardo Ribalda Nov. 30, 2020, 10:49 a.m. UTC | #4
Hi Christoph

On Mon, Nov 30, 2020 at 9:34 AM Christoph Hellwig <hch@lst.de> wrote:
>

> > +#ifndef CONFIG_DMA_NONCOHERENT

>

> I think you need to drop this ifdef.  This code should work just fine

> on noncoherent mips and sh platforms.

>

> > +     uvc_urb->pages = dma_alloc_noncontiguous(dma_dev, stream->urb_size,

> > +                                              &uvc_urb->dma,

> > +                                              gfp_flags | __GFP_NOWARN, 0);

> > +     if (!uvc_urb->pages)

> > +             return false;

> > +

> > +     uvc_urb->buffer = vmap(uvc_urb->pages,

> > +                            PAGE_ALIGN(stream->urb_size) >> PAGE_SHIFT,

> > +                            VM_DMA_COHERENT, PAGE_KERNEL);

> > +     if (!uvc_urb->buffer) {

> > +             dma_free_noncontiguous(dma_dev, stream->urb_size,

> > +                                    uvc_urb->pages, uvc_urb->dma);

> > +             return false;

> > +     }

> > +

> > +     if (sg_alloc_table_from_pages(&uvc_urb->sgt, uvc_urb->pages,

> > +                             PAGE_ALIGN(stream->urb_size) >> PAGE_SHIFT, 0,

> > +                             stream->urb_size, GFP_KERNEL)) {

> > +             vunmap(uvc_urb->buffer);

> > +             dma_free_noncontiguous(dma_dev, stream->urb_size,

> > +                                    uvc_urb->pages, uvc_urb->dma);

> > +             return false;

> > +     }

> > +

> > +     return true;

> > +}

>

> I wonder if we should lift this into a helper.  On the one hand I had

> proliferating struct scatterlist usage, on the other hand it is all over

> the media and drm code anyway, and duplicating this doesn't help anyone.

>

> Possibly including the fallback to the coherent allocating.


Probably Sergey has best opinion of this than mine. I only had to look
into one driver, he has been working with the vb2, which uses the API
much more.



-- 
Ricardo Ribalda
Sergey Senozhatsky Dec. 1, 2020, 3:36 a.m. UTC | #5
On (20/11/30 09:34), Christoph Hellwig wrote:
> 

> > +	uvc_urb->pages = dma_alloc_noncontiguous(dma_dev, stream->urb_size,

> > +						 &uvc_urb->dma,

> > +						 gfp_flags | __GFP_NOWARN, 0);

> > +	if (!uvc_urb->pages)

> > +		return false;

> > +

> > +	uvc_urb->buffer = vmap(uvc_urb->pages,

> > +			       PAGE_ALIGN(stream->urb_size) >> PAGE_SHIFT,

> > +			       VM_DMA_COHERENT, PAGE_KERNEL);

> > +	if (!uvc_urb->buffer) {

> > +		dma_free_noncontiguous(dma_dev, stream->urb_size,

> > +				       uvc_urb->pages, uvc_urb->dma);

> > +		return false;

> > +	}

> > +

> > +	if (sg_alloc_table_from_pages(&uvc_urb->sgt, uvc_urb->pages,

> > +				PAGE_ALIGN(stream->urb_size) >> PAGE_SHIFT, 0,

> > +				stream->urb_size, GFP_KERNEL)) {

> > +		vunmap(uvc_urb->buffer);

> > +		dma_free_noncontiguous(dma_dev, stream->urb_size,

> > +				       uvc_urb->pages, uvc_urb->dma);

> > +		return false;

> > +	}

> 

> I wonder if we should lift this into a helper.  On the one hand I had

> proliferating struct scatterlist usage, on the other hand it is all over

> the media and drm code anyway, and duplicating this doesn't help anyone.


Not that I have any sound experience in this area, but the helper
probably won't hurt. Do you also plan to add vmap() to that helper
or dma_alloc_noncontiguous()/sg_alloc_table_from_pages() only?

	helper()
	{
		dma_alloc_noncontiguous();
		sg_alloc_table_from_pages();

		if ((dma_attrs & DMA_ATTR_NO_KERNEL_MAPPING) == 0)
			vmap();
	}

videobuf2-dma-contig still has to carry around two versions: one that
deals with the noncontig pages and sgt (new API); and the current one.
But if the helper will include fallback to coherent allocations then
this may change, depending on the helper implementation.

	-ss
Christoph Hellwig Dec. 1, 2020, 2:49 p.m. UTC | #6
On Tue, Dec 01, 2020 at 12:36:58PM +0900, Sergey Senozhatsky wrote:
> Not that I have any sound experience in this area, but the helper

> probably won't hurt. Do you also plan to add vmap() to that helper

> or dma_alloc_noncontiguous()/sg_alloc_table_from_pages() only?


Yes, I think adding the vmap is useful, and it probably makes sense
to do that unconditionally.  I'd also include the fallback to
dma_alloc_pages when the noncontig version isn't supported in the
helper.
Tomasz Figa Dec. 8, 2020, 4:54 a.m. UTC | #7
Hi Christoph,

On Tue, Dec 1, 2020 at 11:49 PM Christoph Hellwig <hch@lst.de> wrote:
>

> On Tue, Dec 01, 2020 at 12:36:58PM +0900, Sergey Senozhatsky wrote:

> > Not that I have any sound experience in this area, but the helper

> > probably won't hurt. Do you also plan to add vmap() to that helper

> > or dma_alloc_noncontiguous()/sg_alloc_table_from_pages() only?

>

> Yes, I think adding the vmap is useful, and it probably makes sense

> to do that unconditionally.  I'd also include the fallback to

> dma_alloc_pages when the noncontig version isn't supported in the

> helper.


From the media perspective, it would be good to have the vmap
optional, similarly to the DMA_ATTR_NO_KERNEL_MAPPING attribute for
coherent allocations. Actually, in the media drivers, the need to have
a kernel mapping of the DMA buffers corresponds to a minority of the
drivers. Most of them only need to map them to the userspace.

Nevertheless, that minority actually happens to be quite widely used,
e.g. the uvcvideo driver, so we can't go to the other extreme and just
drop the vmap at all.

In any case, Sergey is going to share a preliminary patch on how the
current API would be used in the V4L2 videobuf2 framework. That should
give us more input on how such a helper could look.

Other than that, again, thanks a lot for helping with this.

Best regards,
Tomasz
Sergey Senozhatsky Dec. 8, 2020, 7:13 a.m. UTC | #8
On (20/12/08 13:54), Tomasz Figa wrote:
> 

> In any case, Sergey is going to share a preliminary patch on how the

> current API would be used in the V4L2 videobuf2 framework. That should

> give us more input on how such a helper could look.


HUGE apologies for the previous screw up! I replied in the
gmail web-interface and that did not work out as expected
(at all, big times).

I'm sorry about that!

Take two, no html this time around.

---

My current WIP (deep WIP) series can be found at [0]. The patch that adds new
DMA API support is at the head of the series [1]. New DMA API requires us to
have several internal videobuf2 API changes before we can proceed [2]: v4l2 and
videobuf2 core do not pass enough information to the vb2 allocators now. Previously,
if user space requests non-coherent allocation v4l2 would set queue dma_attr bit,
videobuf2 core would pass queue->dma_attrs to vb2 allocator, which would 
those dma_attrs for dma_alloc(). So everything was transparent (sort of). Since we
don't have that dma_attr flag anymore, there is no way for v4l2 to pass the request
information (coherent or non-coherent) to the vb2 allocator. Hence we need to rework
the vb2 allocator API. I currently pass vb2 pointer, but we decided to rework it again
and to pass dedicated VB2_ALLOC_FLAGS from the videobuf2 core to the 
allocator. This is still in my private tree and not completely ready, will push those
patches to github later.

Another thing to notice is that the new API requires us to have two execution branches
in allocators - one for the current API; and one for the new API (if it's supported and
if user-space requested non-coherent allocation).

[0] https://github.com/sergey-senozhatsky/linux-next-ss/commits/master
[1] https://github.com/sergey-senozhatsky/linux-next-ss/commit/a45f48b483daee59594c62e4aaf01790aab960c8
[2] https://github.com/sergey-senozhatsky/linux-next-ss/commit/b784145101c398da7fe9e2608b6001e58e05a9b5

	-ss
Christoph Hellwig Dec. 9, 2020, 11:12 a.m. UTC | #9
On Tue, Dec 08, 2020 at 01:54:00PM +0900, Tomasz Figa wrote:
> >From the media perspective, it would be good to have the vmap

> optional, similarly to the DMA_ATTR_NO_KERNEL_MAPPING attribute for

> coherent allocations. Actually, in the media drivers, the need to have

> a kernel mapping of the DMA buffers corresponds to a minority of the

> drivers. Most of them only need to map them to the userspace.

> 

> Nevertheless, that minority actually happens to be quite widely used,

> e.g. the uvcvideo driver, so we can't go to the other extreme and just

> drop the vmap at all.


My main problem is that the DMA_ATTR_NO_KERNEL_MAPPING makes a mess
of an API.  I'd much rather have low-level API that returns the
discontiguous allocations and another one that vmaps them rather
than starting to overload arguments like in dma_alloc_attrs with
DMA_ATTR_NO_KERNEL_MAPPING.

> 

> In any case, Sergey is going to share a preliminary patch on how the

> current API would be used in the V4L2 videobuf2 framework. That should

> give us more input on how such a helper could look.


Awesome!
Christoph Hellwig Dec. 9, 2020, 11:16 a.m. UTC | #10
On Tue, Dec 08, 2020 at 04:13:20PM +0900, Sergey Senozhatsky wrote:
> On (20/12/08 13:54), Tomasz Figa wrote:

> > 

> > In any case, Sergey is going to share a preliminary patch on how the

> > current API would be used in the V4L2 videobuf2 framework. That should

> > give us more input on how such a helper could look.

> 

> HUGE apologies for the previous screw up! I replied in the

> gmail web-interface and that did not work out as expected

> (at all, big times).


Actually the previous mail was a mime multipart one, and the plain text
version displayed just fine here.  My the gmail engineers finally learned
something after all.

> Another thing to notice is that the new API requires us to have two execution branches

> in allocators - one for the current API; and one for the new API (if it's supported and

> if user-space requested non-coherent allocation).


So I think we do want these branches for coherent vs non-coherent as they
have very different semantics and I do not think that hiding them under
the same API helps people to understand those vastly different semantics.

OTOH we should look into a fallback for DMA API instances that do not
support the discontigous allocations.

I think between your comments and those from Ricardo I have a good idea
for a somewhat updated API.  I'll try to post something in the next days.
Robin Murphy Dec. 9, 2020, 1:05 p.m. UTC | #11
On 2020-12-09 11:12, Christoph Hellwig wrote:
> On Tue, Dec 08, 2020 at 01:54:00PM +0900, Tomasz Figa wrote:

>> >From the media perspective, it would be good to have the vmap

>> optional, similarly to the DMA_ATTR_NO_KERNEL_MAPPING attribute for

>> coherent allocations. Actually, in the media drivers, the need to have

>> a kernel mapping of the DMA buffers corresponds to a minority of the

>> drivers. Most of them only need to map them to the userspace.

>>

>> Nevertheless, that minority actually happens to be quite widely used,

>> e.g. the uvcvideo driver, so we can't go to the other extreme and just

>> drop the vmap at all.

> 

> My main problem is that the DMA_ATTR_NO_KERNEL_MAPPING makes a mess

> of an API.  I'd much rather have low-level API that returns the

> discontiguous allocations and another one that vmaps them rather

> than starting to overload arguments like in dma_alloc_attrs with

> DMA_ATTR_NO_KERNEL_MAPPING.


Agreed - if iommu-dma's dma_alloc_coherent() ends up as little more than 
a thin wrapper around those two functions I think that would be a good 
sign. It also seems like it might be a good idea for this API to use 
scatterlists rather than page arrays as it's fundamental format, to help 
reduce impedance with dma-buf - if we can end up with a wider redesign 
that also gets rid of dma_get_sgtable(), all the better!

Robin.
Tomasz Figa Dec. 10, 2020, 5:08 a.m. UTC | #12
On Wed, Dec 9, 2020 at 10:05 PM Robin Murphy <robin.murphy@arm.com> wrote:
>

> On 2020-12-09 11:12, Christoph Hellwig wrote:

> > On Tue, Dec 08, 2020 at 01:54:00PM +0900, Tomasz Figa wrote:

> >> >From the media perspective, it would be good to have the vmap

> >> optional, similarly to the DMA_ATTR_NO_KERNEL_MAPPING attribute for

> >> coherent allocations. Actually, in the media drivers, the need to have

> >> a kernel mapping of the DMA buffers corresponds to a minority of the

> >> drivers. Most of them only need to map them to the userspace.

> >>

> >> Nevertheless, that minority actually happens to be quite widely used,

> >> e.g. the uvcvideo driver, so we can't go to the other extreme and just

> >> drop the vmap at all.

> >

> > My main problem is that the DMA_ATTR_NO_KERNEL_MAPPING makes a mess

> > of an API.  I'd much rather have low-level API that returns the

> > discontiguous allocations and another one that vmaps them rather

> > than starting to overload arguments like in dma_alloc_attrs with

> > DMA_ATTR_NO_KERNEL_MAPPING.


Okay, makes sense. Actually, a separate mapping function makes it
possible to defer creating the mapping to when (and if) it is really
needed.

>

> Agreed - if iommu-dma's dma_alloc_coherent() ends up as little more than

> a thin wrapper around those two functions I think that would be a good

> sign. It also seems like it might be a good idea for this API to use

> scatterlists rather than page arrays as it's fundamental format, to help

> reduce impedance with dma-buf -


True.

> if we can end up with a wider redesign

> that also gets rid of dma_get_sgtable(), all the better!


That would also require taking care of the old dma_alloc_attrs() API.
I guess it could return an already mapped sgtable, with only 1 mapped
entry and as many following entries as needed to describe the physical
pages. To be honest, I'd say this is far out of scope of this
discussion, though.

Best regards,
Tomasz
Ricardo Ribalda Jan. 7, 2021, 2:14 p.m. UTC | #13
Hi Christoph

Happy new year!

On Wed, Dec 9, 2020 at 12:16 PM . Christoph Hellwig <hch@lst.de> wrote:
>

> On Tue, Dec 08, 2020 at 04:13:20PM +0900, Sergey Senozhatsky wrote:

> > On (20/12/08 13:54), Tomasz Figa wrote:

> > >

> > > In any case, Sergey is going to share a preliminary patch on how the

> > > current API would be used in the V4L2 videobuf2 framework. That should

> > > give us more input on how such a helper could look.

> >

> > HUGE apologies for the previous screw up! I replied in the

> > gmail web-interface and that did not work out as expected

> > (at all, big times).

>

> Actually the previous mail was a mime multipart one, and the plain text

> version displayed just fine here.  My the gmail engineers finally learned

> something after all.

>

> > Another thing to notice is that the new API requires us to have two execution branches

> > in allocators - one for the current API; and one for the new API (if it's supported and

> > if user-space requested non-coherent allocation).

>

> So I think we do want these branches for coherent vs non-coherent as they

> have very different semantics and I do not think that hiding them under

> the same API helps people to understand those vastly different semantics.

>

> OTOH we should look into a fallback for DMA API instances that do not

> support the discontigous allocations.

>

> I think between your comments and those from Ricardo I have a good idea

> for a somewhat updated API.  I'll try to post something in the next days.


Did you have time to look into this?

No hurry, I just want to make sure that I didn't miss anything ;)

Best regards!



-- 
Ricardo Ribalda
Christoph Hellwig Jan. 11, 2021, 8:36 a.m. UTC | #14
On Thu, Jan 07, 2021 at 03:14:08PM +0100, Ricardo Ribalda wrote:
> > So I think we do want these branches for coherent vs non-coherent as they

> > have very different semantics and I do not think that hiding them under

> > the same API helps people to understand those vastly different semantics.

> >

> > OTOH we should look into a fallback for DMA API instances that do not

> > support the discontigous allocations.

> >

> > I think between your comments and those from Ricardo I have a good idea

> > for a somewhat updated API.  I'll try to post something in the next days.

> 

> Did you have time to look into this?

> 

> No hurry, I just want to make sure that I didn't miss anything ;)


Haven't managed to get to it, sorry.

> 

> Best regards!

> 

> 

> 

> -- 

> Ricardo Ribalda

---end quoted text---
Ricardo Ribalda Jan. 15, 2021, 1:08 p.m. UTC | #15
On Mon, Jan 11, 2021 at 9:36 AM . Christoph Hellwig <hch@lst.de> wrote:
>

> On Thu, Jan 07, 2021 at 03:14:08PM +0100, Ricardo Ribalda wrote:

> > > So I think we do want these branches for coherent vs non-coherent as they

> > > have very different semantics and I do not think that hiding them under

> > > the same API helps people to understand those vastly different semantics.

> > >

> > > OTOH we should look into a fallback for DMA API instances that do not

> > > support the discontigous allocations.

> > >

> > > I think between your comments and those from Ricardo I have a good idea

> > > for a somewhat updated API.  I'll try to post something in the next days.

> >

> > Did you have time to look into this?

> >

> > No hurry, I just want to make sure that I didn't miss anything ;)

>

> Haven't managed to get to it, sorry.


No worries!, is there something we can do to help you with this?

>

> >

> > Best regards!

> >

> >

> >

> > --

> > Ricardo Ribalda

> ---end quoted text---




-- 
Ricardo Ribalda
Christoph Hellwig Jan. 20, 2021, 5:17 p.m. UTC | #16
On Fri, Jan 15, 2021 at 02:08:14PM +0100, Ricardo Ribalda wrote:
> > > Did you have time to look into this?

> > >

> > > No hurry, I just want to make sure that I didn't miss anything ;)

> >

> > Haven't managed to get to it, sorry.

> 

> No worries!, is there something we can do to help you with this?


Not yet.  I hope to get to it in the next days, and then soon I'll
need all the help you can give me for feedback and testing.
Christoph Hellwig Jan. 26, 2021, 5:06 p.m. UTC | #17
Please take a quick look at this branch:

http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/dma_alloc_noncontiguous

Warning: hot off the press, and only with the v4l conversion as that
seemed at little easier than uvcvideo.
Ricardo Ribalda Jan. 26, 2021, 11:29 p.m. UTC | #18
Hi Christoph

Thanks for the series!

I have a couple of questions:

- Is there any platform where dma_alloc_noncontiguos can fail?
This is, !ops->alloc_noncontiguous and !dev->coherent_dma_mask
If yes then we need to add a function to let the driver know in
advance that it has to use the coherent allocator (usb_alloc_coherent
for uvc)

- In dma_alloc_noncontiguos, on the dma_alloc_pages fallback. If we
have a device where the dma happens in only one direction, could not
get more performance with DMA_FROM/TO_DEVICE instead of
DMA_BIDIRECTIONAL ?


Then I have tried to use the API, and I have encountered a problem: on
uvcvideo the device passed to the memory allocator is different for
DMA_PAGES and NON_CONTIGUOUS:
https://github.com/ribalda/linux/blob/042cd497739f71c8d4a83a67ee970369e2baca4a/drivers/media/usb/uvc/uvc_video.c#L1236

I need to dig a bit tomorrow to figure out why this is, I have
hardware to test both paths, so it should not be too difficult.


Thanks again






On Tue, Jan 26, 2021 at 6:07 PM . Christoph Hellwig <hch@lst.de> wrote:
>

> Please take a quick look at this branch:

>

> http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/dma_alloc_noncontiguous

>

> Warning: hot off the press, and only with the v4l conversion as that

> seemed at little easier than uvcvideo.




--
Ricardo Ribalda
Christoph Hellwig Jan. 27, 2021, 3:56 p.m. UTC | #19
On Wed, Jan 27, 2021 at 12:29:08AM +0100, Ricardo Ribalda wrote:
> - Is there any platform where dma_alloc_noncontiguos can fail?

> This is, !ops->alloc_noncontiguous and !dev->coherent_dma_mask

> If yes then we need to add a function to let the driver know in

> advance that it has to use the coherent allocator (usb_alloc_coherent

> for uvc)


dev->coherent_dma_mask is set by the driver.  So the only reason why
dma_alloc_noncontiguos will fail is because is because it can't
allocate any memory.

> - In dma_alloc_noncontiguos, on the dma_alloc_pages fallback. If we

> have a device where the dma happens in only one direction, could not

> get more performance with DMA_FROM/TO_DEVICE instead of

> DMA_BIDIRECTIONAL ?


Yes, we could probably do that.

> 

> 

> Then I have tried to use the API, and I have encountered a problem: on

> uvcvideo the device passed to the memory allocator is different for

> DMA_PAGES and NON_CONTIGUOUS:

> https://github.com/ribalda/linux/blob/042cd497739f71c8d4a83a67ee970369e2baca4a/drivers/media/usb/uvc/uvc_video.c#L1236

> 

> I need to dig a bit tomorrow to figure out why this is, I have

> hardware to test both paths, so it should not be too difficult.


I always found the USB dma alloc API a little weird, but we might have
to follow the scheme of the usb coherent wrappers there.
Ricardo Ribalda Jan. 27, 2021, 9:35 p.m. UTC | #20
Hi Christoph

On Wed, Jan 27, 2021 at 4:56 PM . Christoph Hellwig <hch@lst.de> wrote:
>

> On Wed, Jan 27, 2021 at 12:29:08AM +0100, Ricardo Ribalda wrote:

> > - Is there any platform where dma_alloc_noncontiguos can fail?

> > This is, !ops->alloc_noncontiguous and !dev->coherent_dma_mask

> > If yes then we need to add a function to let the driver know in

> > advance that it has to use the coherent allocator (usb_alloc_coherent

> > for uvc)

>

> dev->coherent_dma_mask is set by the driver.  So the only reason why

> dma_alloc_noncontiguos will fail is because is because it can't

> allocate any memory.

>

> > - In dma_alloc_noncontiguos, on the dma_alloc_pages fallback. If we

> > have a device where the dma happens in only one direction, could not

> > get more performance with DMA_FROM/TO_DEVICE instead of

> > DMA_BIDIRECTIONAL ?

>

> Yes, we could probably do that.

>

> >

> >

> > Then I have tried to use the API, and I have encountered a problem: on

> > uvcvideo the device passed to the memory allocator is different for

> > DMA_PAGES and NON_CONTIGUOUS:

> > https://github.com/ribalda/linux/blob/042cd497739f71c8d4a83a67ee970369e2baca4a/drivers/media/usb/uvc/uvc_video.c#L1236

> >

> > I need to dig a bit tomorrow to figure out why this is, I have

> > hardware to test both paths, so it should not be too difficult.

>

> I always found the USB dma alloc API a little weird, but we might have

> to follow the scheme of the usb coherent wrappers there.


I have used the current API here:

https://git.kernel.org/pub/scm/linux/kernel/git/ribalda/linux.git/log/?h=uvc-noncontiguous

And I think the result is very clean. Great work!

I have tested it in X86 and in arm64, with similar performance as the
previous patchset.

Maybe you want to cherry pick that commit into your series I can also
send the patch to the list for review if you prefer so.

At least in 5.11 rc5 I the same dmadev worked in arm64 and x86.

Best regards!


-- 
Ricardo Ribalda
Christoph Hellwig Jan. 28, 2021, 7:57 a.m. UTC | #21
On Wed, Jan 27, 2021 at 10:35:02PM +0100, Ricardo Ribalda wrote:
> I have used the current API here:

> 

> https://git.kernel.org/pub/scm/linux/kernel/git/ribalda/linux.git/log/?h=uvc-noncontiguous

> 

> And I think the result is very clean. Great work!

> 

> I have tested it in X86 and in arm64, with similar performance as the

> previous patchset.

> 

> Maybe you want to cherry pick that commit into your series I can also

> send the patch to the list for review if you prefer so.

> 

> At least in 5.11 rc5 I the same dmadev worked in arm64 and x86.


Given that Sergey mentioned the v4l patchset needs more work I'll
use your commit as the example for sending out my series.

Thanks for all the work!
diff mbox series

Patch

diff --git a/drivers/media/usb/uvc/uvc_video.c b/drivers/media/usb/uvc/uvc_video.c
index a6a441d92b94..06ebd7a3877b 100644
--- a/drivers/media/usb/uvc/uvc_video.c
+++ b/drivers/media/usb/uvc/uvc_video.c
@@ -1097,6 +1097,11 @@  static int uvc_video_decode_start(struct uvc_streaming *stream,
 	return data[0];
 }
 
+static inline struct device *stream_to_dmadev(struct uvc_streaming *stream)
+{
+	return stream->dev->udev->bus->controller->parent;
+}
+
 /*
  * uvc_video_decode_data_work: Asynchronous memcpy processing
  *
@@ -1118,6 +1123,9 @@  static void uvc_video_copy_data_work(struct work_struct *work)
 		uvc_queue_buffer_release(op->buf);
 	}
 
+	if (uvc_urb->pages)
+		dma_sync_sgtable_for_device(stream_to_dmadev(uvc_urb->stream),
+					    &uvc_urb->sgt, DMA_FROM_DEVICE);
 	ret = usb_submit_urb(uvc_urb->urb, GFP_KERNEL);
 	if (ret < 0)
 		uvc_printk(KERN_ERR, "Failed to resubmit video URB (%d).\n",
@@ -1539,10 +1547,17 @@  static void uvc_video_complete(struct urb *urb)
 	 * Process the URB headers, and optionally queue expensive memcpy tasks
 	 * to be deferred to a work queue.
 	 */
+	if (uvc_urb->pages)
+		dma_sync_sgtable_for_cpu(stream_to_dmadev(stream),
+					 &uvc_urb->sgt, DMA_FROM_DEVICE);
 	stream->decode(uvc_urb, buf, buf_meta);
 
 	/* If no async work is needed, resubmit the URB immediately. */
 	if (!uvc_urb->async_operations) {
+		if (uvc_urb->pages)
+			dma_sync_sgtable_for_device(stream_to_dmadev(stream),
+						    &uvc_urb->sgt,
+						    DMA_FROM_DEVICE);
 		ret = usb_submit_urb(uvc_urb->urb, GFP_ATOMIC);
 		if (ret < 0)
 			uvc_printk(KERN_ERR,
@@ -1566,8 +1581,16 @@  static void uvc_free_urb_buffers(struct uvc_streaming *stream)
 			continue;
 
 #ifndef CONFIG_DMA_NONCOHERENT
-		usb_free_coherent(stream->dev->udev, stream->urb_size,
-				  uvc_urb->buffer, uvc_urb->dma);
+		if (uvc_urb->pages) {
+			sg_free_table(&uvc_urb->sgt);
+			vunmap(uvc_urb->buffer);
+			dma_free_noncontiguous(stream_to_dmadev(stream),
+					       stream->urb_size,
+					       uvc_urb->pages, uvc_urb->dma);
+		} else {
+			usb_free_coherent(stream->dev->udev, stream->urb_size,
+					  uvc_urb->buffer, uvc_urb->dma);
+		}
 #else
 		kfree(uvc_urb->buffer);
 #endif
@@ -1577,6 +1600,56 @@  static void uvc_free_urb_buffers(struct uvc_streaming *stream)
 	stream->urb_size = 0;
 }
 
+#ifndef CONFIG_DMA_NONCOHERENT
+static bool uvc_alloc_urb_buffer(struct uvc_streaming *stream,
+				 struct uvc_urb *uvc_urb, gfp_t gfp_flags)
+{
+	struct device *dma_dev = dma_dev = stream_to_dmadev(stream);
+
+	if (!dma_can_alloc_noncontiguous(dma_dev)) {
+		uvc_urb->buffer = usb_alloc_coherent(stream->dev->udev,
+						     stream->urb_size,
+						     gfp_flags | __GFP_NOWARN,
+						     &uvc_urb->dma);
+		return uvc_urb->buffer != NULL;
+	}
+
+	uvc_urb->pages = dma_alloc_noncontiguous(dma_dev, stream->urb_size,
+						 &uvc_urb->dma,
+						 gfp_flags | __GFP_NOWARN, 0);
+	if (!uvc_urb->pages)
+		return false;
+
+	uvc_urb->buffer = vmap(uvc_urb->pages,
+			       PAGE_ALIGN(stream->urb_size) >> PAGE_SHIFT,
+			       VM_DMA_COHERENT, PAGE_KERNEL);
+	if (!uvc_urb->buffer) {
+		dma_free_noncontiguous(dma_dev, stream->urb_size,
+				       uvc_urb->pages, uvc_urb->dma);
+		return false;
+	}
+
+	if (sg_alloc_table_from_pages(&uvc_urb->sgt, uvc_urb->pages,
+				PAGE_ALIGN(stream->urb_size) >> PAGE_SHIFT, 0,
+				stream->urb_size, GFP_KERNEL)) {
+		vunmap(uvc_urb->buffer);
+		dma_free_noncontiguous(dma_dev, stream->urb_size,
+				       uvc_urb->pages, uvc_urb->dma);
+		return false;
+	}
+
+	return true;
+}
+#else
+static bool uvc_alloc_urb_buffer(struct uvc_streaming *stream,
+				 struct uvc_urb *uvc_urb, gfp_t gfp_flags)
+{
+	uvc_urb->buffer = kmalloc(stream->urb_size, gfp_flags | __GFP_NOWARN);
+
+	return uvc_urb->buffer != NULL;
+}
+#endif
+
 /*
  * Allocate transfer buffers. This function can be called with buffers
  * already allocated when resuming from suspend, in which case it will
@@ -1607,19 +1680,11 @@  static int uvc_alloc_urb_buffers(struct uvc_streaming *stream,
 
 	/* Retry allocations until one succeed. */
 	for (; npackets > 1; npackets /= 2) {
+		stream->urb_size = psize * npackets;
 		for (i = 0; i < UVC_URBS; ++i) {
 			struct uvc_urb *uvc_urb = &stream->uvc_urb[i];
 
-			stream->urb_size = psize * npackets;
-#ifndef CONFIG_DMA_NONCOHERENT
-			uvc_urb->buffer = usb_alloc_coherent(
-				stream->dev->udev, stream->urb_size,
-				gfp_flags | __GFP_NOWARN, &uvc_urb->dma);
-#else
-			uvc_urb->buffer =
-			    kmalloc(stream->urb_size, gfp_flags | __GFP_NOWARN);
-#endif
-			if (!uvc_urb->buffer) {
+			if (!uvc_alloc_urb_buffer(stream, uvc_urb, gfp_flags)) {
 				uvc_free_urb_buffers(stream);
 				break;
 			}
@@ -1891,6 +1956,10 @@  static int uvc_video_start_transfer(struct uvc_streaming *stream,
 
 	/* Submit the URBs. */
 	for_each_uvc_urb(uvc_urb, stream) {
+		if (uvc_urb->pages)
+			dma_sync_sgtable_for_device(stream_to_dmadev(stream),
+						    &uvc_urb->sgt,
+						    DMA_FROM_DEVICE);
 		ret = usb_submit_urb(uvc_urb->urb, gfp_flags);
 		if (ret < 0) {
 			uvc_printk(KERN_ERR, "Failed to submit URB %u (%d).\n",
diff --git a/drivers/media/usb/uvc/uvcvideo.h b/drivers/media/usb/uvc/uvcvideo.h
index a3dfacf069c4..3e6618a2ac82 100644
--- a/drivers/media/usb/uvc/uvcvideo.h
+++ b/drivers/media/usb/uvc/uvcvideo.h
@@ -532,6 +532,8 @@  struct uvc_urb {
 
 	char *buffer;
 	dma_addr_t dma;
+	struct page **pages;
+	struct sg_table sgt;
 
 	unsigned int async_operations;
 	struct uvc_copy_op copy_operations[UVC_MAX_PACKETS];