diff mbox series

usbnet: jump to rx_cleanup case instead of calling skb_queue_tail

Message ID 20221217161851.829497-1-lsahn@ooseel.net
State New
Headers show
Series usbnet: jump to rx_cleanup case instead of calling skb_queue_tail | expand

Commit Message

Leesoo Ahn Dec. 17, 2022, 4:18 p.m. UTC
The current source pushes skb into dev->done queue by calling
skb_queue_tail() and then, call skb_dequeue() to pop for rx_cleanup state
to free urb and skb next in usbnet_bh().
It wastes CPU resource with extra instructions. Instead, use return values
jumping to rx_cleanup case directly to free them. Therefore calling
skb_queue_tail() and skb_dequeue() is not necessary.

The follows are just showing difference between calling skb_queue_tail()
and using return values jumping to rx_cleanup state directly in usbnet_bh()
in Arm64 instructions with perf tool.

----------- calling skb_queue_tail() -----------
       │     if (!(dev->driver_info->flags & FLAG_RX_ASSEMBLE))
  7.58 │248:   ldr     x0, [x20, #16]
  2.46 │24c:   ldr     w0, [x0, #8]
  1.64 │250: ↑ tbnz    w0, #14, 16c
       │     dev->net->stats.rx_errors++;
  0.57 │254:   ldr     x1, [x20, #184]
  1.64 │258:   ldr     x0, [x1, #336]
  2.65 │25c:   add     x0, x0, #0x1
       │260:   str     x0, [x1, #336]
       │     skb_queue_tail(&dev->done, skb);
  0.38 │264:   mov     x1, x19
       │268:   mov     x0, x21
  2.27 │26c: → bl      skb_queue_tail
  0.57 │270: ↑ b       44    // branch to call skb_dequeue()

----------- jumping to rx_cleanup state -----------
       │     if (!(dev->driver_info->flags & FLAG_RX_ASSEMBLE))
  1.69 │25c:   ldr     x0, [x21, #16]
  4.78 │260:   ldr     w0, [x0, #8]
  3.28 │264: ↑ tbnz    w0, #14, e4    // jump to 'rx_cleanup' state
       │     dev->net->stats.rx_errors++;
  0.09 │268:   ldr     x1, [x21, #184]
  2.72 │26c:   ldr     x0, [x1, #336]
  3.37 │270:   add     x0, x0, #0x1
  0.09 │274:   str     x0, [x1, #336]
  0.66 │278: ↑ b       e4    // branch to 'rx_cleanup' state

Signed-off-by: Leesoo Ahn <lsahn@ooseel.net>
---
 drivers/net/usb/usbnet.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

Comments

Greg KH Dec. 18, 2022, 8:55 a.m. UTC | #1
On Sun, Dec 18, 2022 at 01:18:51AM +0900, Leesoo Ahn wrote:
> The current source pushes skb into dev->done queue by calling
> skb_queue_tail() and then, call skb_dequeue() to pop for rx_cleanup state
> to free urb and skb next in usbnet_bh().
> It wastes CPU resource with extra instructions. Instead, use return values
> jumping to rx_cleanup case directly to free them. Therefore calling
> skb_queue_tail() and skb_dequeue() is not necessary.
> 
> The follows are just showing difference between calling skb_queue_tail()
> and using return values jumping to rx_cleanup state directly in usbnet_bh()
> in Arm64 instructions with perf tool.
> 
> ----------- calling skb_queue_tail() -----------
>        │     if (!(dev->driver_info->flags & FLAG_RX_ASSEMBLE))
>   7.58 │248:   ldr     x0, [x20, #16]
>   2.46 │24c:   ldr     w0, [x0, #8]
>   1.64 │250: ↑ tbnz    w0, #14, 16c
>        │     dev->net->stats.rx_errors++;
>   0.57 │254:   ldr     x1, [x20, #184]
>   1.64 │258:   ldr     x0, [x1, #336]
>   2.65 │25c:   add     x0, x0, #0x1
>        │260:   str     x0, [x1, #336]
>        │     skb_queue_tail(&dev->done, skb);
>   0.38 │264:   mov     x1, x19
>        │268:   mov     x0, x21
>   2.27 │26c: → bl      skb_queue_tail
>   0.57 │270: ↑ b       44    // branch to call skb_dequeue()
> 
> ----------- jumping to rx_cleanup state -----------
>        │     if (!(dev->driver_info->flags & FLAG_RX_ASSEMBLE))
>   1.69 │25c:   ldr     x0, [x21, #16]
>   4.78 │260:   ldr     w0, [x0, #8]
>   3.28 │264: ↑ tbnz    w0, #14, e4    // jump to 'rx_cleanup' state
>        │     dev->net->stats.rx_errors++;
>   0.09 │268:   ldr     x1, [x21, #184]
>   2.72 │26c:   ldr     x0, [x1, #336]
>   3.37 │270:   add     x0, x0, #0x1
>   0.09 │274:   str     x0, [x1, #336]
>   0.66 │278: ↑ b       e4    // branch to 'rx_cleanup' state

Interesting, but does this even really matter given the slow speed of
the USB hardware?

> Signed-off-by: Leesoo Ahn <lsahn@ooseel.net>
> ---
>  drivers/net/usb/usbnet.c | 11 ++++++-----
>  1 file changed, 6 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
> index 64a9a80b2309..924392a37297 100644
> --- a/drivers/net/usb/usbnet.c
> +++ b/drivers/net/usb/usbnet.c
> @@ -555,7 +555,7 @@ static int rx_submit (struct usbnet *dev, struct urb *urb, gfp_t flags)
>  
>  /*-------------------------------------------------------------------------*/
>  
> -static inline void rx_process (struct usbnet *dev, struct sk_buff *skb)
> +static inline int rx_process(struct usbnet *dev, struct sk_buff *skb)
>  {
>  	if (dev->driver_info->rx_fixup &&
>  	    !dev->driver_info->rx_fixup (dev, skb)) {
> @@ -576,11 +576,11 @@ static inline void rx_process (struct usbnet *dev, struct sk_buff *skb)
>  		netif_dbg(dev, rx_err, dev->net, "rx length %d\n", skb->len);
>  	} else {
>  		usbnet_skb_return(dev, skb);
> -		return;
> +		return 0;
>  	}
>  
>  done:
> -	skb_queue_tail(&dev->done, skb);
> +	return -1;

Don't make up error numbers, this makes it look like this failed, not
succeeded.  And if this failed, give it a real error value.

thanks,

greg k-h
Leesoo Ahn Dec. 19, 2022, 7:41 a.m. UTC | #2
On 22. 12. 18. 17:55, Greg KH wrote:
> On Sun, Dec 18, 2022 at 01:18:51AM +0900, Leesoo Ahn wrote:
>> The current source pushes skb into dev->done queue by calling
>> skb_queue_tail() and then, call skb_dequeue() to pop for rx_cleanup state
>> to free urb and skb next in usbnet_bh().
>> It wastes CPU resource with extra instructions. Instead, use return values
>> jumping to rx_cleanup case directly to free them. Therefore calling
>> skb_queue_tail() and skb_dequeue() is not necessary.
>>
>> The follows are just showing difference between calling skb_queue_tail()
>> and using return values jumping to rx_cleanup state directly in usbnet_bh()
>> in Arm64 instructions with perf tool.
>>
>> ----------- calling skb_queue_tail() -----------
>>         │     if (!(dev->driver_info->flags & FLAG_RX_ASSEMBLE))
>>    7.58 │248:   ldr     x0, [x20, #16]
>>    2.46 │24c:   ldr     w0, [x0, #8]
>>    1.64 │250: ↑ tbnz    w0, #14, 16c
>>         │     dev->net->stats.rx_errors++;
>>    0.57 │254:   ldr     x1, [x20, #184]
>>    1.64 │258:   ldr     x0, [x1, #336]
>>    2.65 │25c:   add     x0, x0, #0x1
>>         │260:   str     x0, [x1, #336]
>>         │     skb_queue_tail(&dev->done, skb);
>>    0.38 │264:   mov     x1, x19
>>         │268:   mov     x0, x21
>>    2.27 │26c: → bl      skb_queue_tail
>>    0.57 │270: ↑ b       44    // branch to call skb_dequeue()
>>
>> ----------- jumping to rx_cleanup state -----------
>>         │     if (!(dev->driver_info->flags & FLAG_RX_ASSEMBLE))
>>    1.69 │25c:   ldr     x0, [x21, #16]
>>    4.78 │260:   ldr     w0, [x0, #8]
>>    3.28 │264: ↑ tbnz    w0, #14, e4    // jump to 'rx_cleanup' state
>>         │     dev->net->stats.rx_errors++;
>>    0.09 │268:   ldr     x1, [x21, #184]
>>    2.72 │26c:   ldr     x0, [x1, #336]
>>    3.37 │270:   add     x0, x0, #0x1
>>    0.09 │274:   str     x0, [x1, #336]
>>    0.66 │278: ↑ b       e4    // branch to 'rx_cleanup' state
> Interesting, but does this even really matter given the slow speed of
> the USB hardware?

It doesn't if USB hardware has slow speed but in software view, it's 
still worth avoiding calling skb_queue_tail() and skb_dequeue() which 
work with spinlock, if possible.


>> Signed-off-by: Leesoo Ahn <lsahn@ooseel.net>
>> ---
>>   drivers/net/usb/usbnet.c | 11 ++++++-----
>>   1 file changed, 6 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
>> index 64a9a80b2309..924392a37297 100644
>> --- a/drivers/net/usb/usbnet.c
>> +++ b/drivers/net/usb/usbnet.c
>> @@ -555,7 +555,7 @@ static int rx_submit (struct usbnet *dev, struct urb *urb, gfp_t flags)
>>   
>>   /*-------------------------------------------------------------------------*/
>>   
>> -static inline void rx_process (struct usbnet *dev, struct sk_buff *skb)
>> +static inline int rx_process(struct usbnet *dev, struct sk_buff *skb)
>>   {
>>   	if (dev->driver_info->rx_fixup &&
>>   	    !dev->driver_info->rx_fixup (dev, skb)) {
>> @@ -576,11 +576,11 @@ static inline void rx_process (struct usbnet *dev, struct sk_buff *skb)
>>   		netif_dbg(dev, rx_err, dev->net, "rx length %d\n", skb->len);
>>   	} else {
>>   		usbnet_skb_return(dev, skb);
>> -		return;
>> +		return 0;
>>   	}
>>   
>>   done:
>> -	skb_queue_tail(&dev->done, skb);
>> +	return -1;
> Don't make up error numbers, this makes it look like this failed, not
> succeeded.  And if this failed, give it a real error value.
>
> thanks,
>
> greg k-h

Best regards,
Leesoo
Leesoo Ahn Dec. 19, 2022, 8:09 a.m. UTC | #3
On 22. 12. 19. 16:50, Greg KH wrote:
> On Mon, Dec 19, 2022 at 04:41:16PM +0900, Leesoo Ahn wrote:
>> On 22. 12. 18. 17:55, Greg KH wrote:
>>> On Sun, Dec 18, 2022 at 01:18:51AM +0900, Leesoo Ahn wrote:
>>>> The current source pushes skb into dev->done queue by calling
>>>> skb_queue_tail() and then, call skb_dequeue() to pop for rx_cleanup state
>>>> to free urb and skb next in usbnet_bh().
>>>> It wastes CPU resource with extra instructions. Instead, use return values
>>>> jumping to rx_cleanup case directly to free them. Therefore calling
>>>> skb_queue_tail() and skb_dequeue() is not necessary.
>>>>
>>>> The follows are just showing difference between calling skb_queue_tail()
>>>> and using return values jumping to rx_cleanup state directly in usbnet_bh()
>>>> in Arm64 instructions with perf tool.
>>>>
>>>> ----------- calling skb_queue_tail() -----------
>>>>          │     if (!(dev->driver_info->flags & FLAG_RX_ASSEMBLE))
>>>>     7.58 │248:   ldr     x0, [x20, #16]
>>>>     2.46 │24c:   ldr     w0, [x0, #8]
>>>>     1.64 │250: ↑ tbnz    w0, #14, 16c
>>>>          │     dev->net->stats.rx_errors++;
>>>>     0.57 │254:   ldr     x1, [x20, #184]
>>>>     1.64 │258:   ldr     x0, [x1, #336]
>>>>     2.65 │25c:   add     x0, x0, #0x1
>>>>          │260:   str     x0, [x1, #336]
>>>>          │     skb_queue_tail(&dev->done, skb);
>>>>     0.38 │264:   mov     x1, x19
>>>>          │268:   mov     x0, x21
>>>>     2.27 │26c: → bl      skb_queue_tail
>>>>     0.57 │270: ↑ b       44    // branch to call skb_dequeue()
>>>>
>>>> ----------- jumping to rx_cleanup state -----------
>>>>          │     if (!(dev->driver_info->flags & FLAG_RX_ASSEMBLE))
>>>>     1.69 │25c:   ldr     x0, [x21, #16]
>>>>     4.78 │260:   ldr     w0, [x0, #8]
>>>>     3.28 │264: ↑ tbnz    w0, #14, e4    // jump to 'rx_cleanup' state
>>>>          │     dev->net->stats.rx_errors++;
>>>>     0.09 │268:   ldr     x1, [x21, #184]
>>>>     2.72 │26c:   ldr     x0, [x1, #336]
>>>>     3.37 │270:   add     x0, x0, #0x1
>>>>     0.09 │274:   str     x0, [x1, #336]
>>>>     0.66 │278: ↑ b       e4    // branch to 'rx_cleanup' state
>>> Interesting, but does this even really matter given the slow speed of
>>> the USB hardware?
>> It doesn't if USB hardware has slow speed but in software view, it's still
>> worth avoiding calling skb_queue_tail() and skb_dequeue() which work with
>> spinlock, if possible.
> But can you actually measure that in either CPU load or in increased
> transfer speeds?
>
> thanks,
>
> greg k-h

I think the follows are maybe what you would be interested in. I have 
tested both case with perf on the same machine and environments, also 
modified driver code a bit to go to rx_cleanup case, not to net stack in 
a specific packet.

----- calling skb_queue_tail() -----
-   11.58%     0.26%  swapper          [k] usbnet_bh
    - 11.32% usbnet_bh
       - 6.43% skb_dequeue
            6.34% _raw_spin_unlock_irqrestore
       - 2.21% skb_queue_tail
            2.19% _raw_spin_unlock_irqrestore
       - 1.68% consume_skb
          - 0.97% kfree_skbmem
               0.80% kmem_cache_free
            0.53% skb_release_data

----- jump to rx_cleanup directly -----
-    7.62%     0.18%  swapper          [k] usbnet_bh
    - 7.44% usbnet_bh
       - 4.63% skb_dequeue
            4.57% _raw_spin_unlock_irqrestore
       - 1.76% consume_skb
          - 1.03% kfree_skbmem
               0.86% kmem_cache_free
            0.56% skb_release_data
         0.54% smsc95xx_rx_fixup

The first case takes CPU resource a bit much by the result.

Thank you for reviewing, by the way.

Best regards,
Leesoo
diff mbox series

Patch

diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
index 64a9a80b2309..924392a37297 100644
--- a/drivers/net/usb/usbnet.c
+++ b/drivers/net/usb/usbnet.c
@@ -555,7 +555,7 @@  static int rx_submit (struct usbnet *dev, struct urb *urb, gfp_t flags)
 
 /*-------------------------------------------------------------------------*/
 
-static inline void rx_process (struct usbnet *dev, struct sk_buff *skb)
+static inline int rx_process(struct usbnet *dev, struct sk_buff *skb)
 {
 	if (dev->driver_info->rx_fixup &&
 	    !dev->driver_info->rx_fixup (dev, skb)) {
@@ -576,11 +576,11 @@  static inline void rx_process (struct usbnet *dev, struct sk_buff *skb)
 		netif_dbg(dev, rx_err, dev->net, "rx length %d\n", skb->len);
 	} else {
 		usbnet_skb_return(dev, skb);
-		return;
+		return 0;
 	}
 
 done:
-	skb_queue_tail(&dev->done, skb);
+	return -1;
 }
 
 /*-------------------------------------------------------------------------*/
@@ -1528,13 +1528,14 @@  static void usbnet_bh (struct timer_list *t)
 		entry = (struct skb_data *) skb->cb;
 		switch (entry->state) {
 		case rx_done:
-			entry->state = rx_cleanup;
-			rx_process (dev, skb);
+			if (rx_process(dev, skb))
+				goto cleanup;
 			continue;
 		case tx_done:
 			kfree(entry->urb->sg);
 			fallthrough;
 		case rx_cleanup:
+cleanup:
 			usb_free_urb (entry->urb);
 			dev_kfree_skb (skb);
 			continue;