[PATCHv2,net,0/3] bond: fix xfrm offload issues

Message ID 20250225094049.20142-1-liuhangbin@gmail.com

Message

Hangbin Liu Feb. 25, 2025, 9:40 a.m. UTC
The first patch fixes the incorrect lock usage in the bond driver.
The second patch fixes the xfrm offload feature setup in active-backup
mode. The third patch adds an ipsec offload test.

v2: move the mutex lock to a work queue (Cosmin Ratiu)

Hangbin Liu (3):
  bonding: move mutex lock to a work queue for XFRM GC tasks
  bonding: fix xfrm offload feature setup on active-backup mode
  selftests: bonding: add ipsec offload test

 drivers/net/bonding/bond_main.c               |  43 +++--
 drivers/net/bonding/bond_netlink.c            |  16 +-
 include/net/bonding.h                         |   7 +
 .../selftests/drivers/net/bonding/Makefile    |   3 +-
 .../drivers/net/bonding/bond_ipsec_offload.sh | 155 ++++++++++++++++++
 .../selftests/drivers/net/bonding/config      |   4 +
 6 files changed, 209 insertions(+), 19 deletions(-)
 create mode 100755 tools/testing/selftests/drivers/net/bonding/bond_ipsec_offload.sh

Comments

Nikolay Aleksandrov Feb. 25, 2025, 11:05 a.m. UTC | #1
On 2/25/25 11:40, Hangbin Liu wrote:
> The fixed commit placed mutex_lock() inside spin_lock_bh(), which triggers
> a warning like:
> 
> BUG: sleeping function called from invalid context at...
> 
> Fix this by moving the mutex_lock() operation to a work queue.
> 
> Fixes: 2aeeef906d5a ("bonding: change ipsec_lock from spin lock to mutex")
> Reported-by: Jakub Kicinski <kuba@kernel.org>
> Closes: https://lore.kernel.org/netdev/20241212062734.182a0164@kernel.org
> Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
> ---
>  drivers/net/bonding/bond_main.c | 41 +++++++++++++++++++++++++--------
>  include/net/bonding.h           |  6 +++++
>  2 files changed, 37 insertions(+), 10 deletions(-)
> 

Hi,
I think there are a few issues with this solution, comments below.

> diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
> index e45bba240cbc..cc7064aa4b35 100644
> --- a/drivers/net/bonding/bond_main.c
> +++ b/drivers/net/bonding/bond_main.c
> @@ -551,6 +551,25 @@ static void bond_ipsec_add_sa_all(struct bonding *bond)
>  	mutex_unlock(&bond->ipsec_lock);
>  }
>  
> +static void bond_xfrm_state_gc_work(struct work_struct *work)
> +{
> +	struct bond_xfrm_work *xfrm_work = container_of(work, struct bond_xfrm_work, work);
> +	struct bonding *bond = xfrm_work->bond;
> +	struct xfrm_state *xs = xfrm_work->xs;
> +	struct bond_ipsec *ipsec;
> +
> +	mutex_lock(&bond->ipsec_lock);
> +	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> +		if (ipsec->xs == xs) {
> +			list_del(&ipsec->list);
> +			kfree(ipsec);
> +			xfrm_state_put(xs);
> +			break;
> +		}
> +	}
> +	mutex_unlock(&bond->ipsec_lock);
> +}
> +
>  /**
>   * bond_ipsec_del_sa - clear out this specific SA
>   * @xs: pointer to transformer state struct
> @@ -558,9 +577,9 @@ static void bond_ipsec_add_sa_all(struct bonding *bond)
>  static void bond_ipsec_del_sa(struct xfrm_state *xs)
>  {
>  	struct net_device *bond_dev = xs->xso.dev;
> +	struct bond_xfrm_work *xfrm_work;
>  	struct net_device *real_dev;
>  	netdevice_tracker tracker;
> -	struct bond_ipsec *ipsec;
>  	struct bonding *bond;
>  	struct slave *slave;
>  
> @@ -592,15 +611,17 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs)
>  	real_dev->xfrmdev_ops->xdo_dev_state_delete(xs);
>  out:
>  	netdev_put(real_dev, &tracker);
> -	mutex_lock(&bond->ipsec_lock);
> -	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> -		if (ipsec->xs == xs) {
> -			list_del(&ipsec->list);
> -			kfree(ipsec);
> -			break;
> -		}
> -	}
> -	mutex_unlock(&bond->ipsec_lock);
> +
> +	xfrm_work = kmalloc(sizeof(*xfrm_work), GFP_ATOMIC);
> +	if (!xfrm_work)
> +		return;
> +

What happens if this allocation fails? I think you'll leak memory and
potentially call the xdo_dev callbacks for this xs again because it's
still in the list. Also this xfrm_work memory doesn't get freed anywhere, so
you're leaking it as well.

Perhaps you can do this allocation in add_sa; it seems you can sleep
there and potentially return an error if it fails, so this can never
fail later. You'll have to be careful with the freeing dance though.
Alternatively, make the work a part of struct bond so it doesn't need
memory management, but then you need a mechanism to queue these items (e.g.
a separate list with a spinlock) and would have more complexity with freeing
in parallel.
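
For illustration, a minimal sketch of the first option, where the work
travels with the entry that add_sa already allocates (the gc_work and
bond fields are hypothetical additions, not part of the current struct):

	/* Sketch only: allocate in bond_ipsec_add_sa(), where sleeping is
	 * allowed, so bond_ipsec_del_sa() just queues the embedded work.
	 */
	struct bond_ipsec {
		struct list_head list;
		struct xfrm_state *xs;
		struct work_struct gc_work;	/* hypothetical per-entry GC work */
		struct bonding *bond;		/* hypothetical back-pointer for the worker */
	};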

> +	INIT_WORK(&xfrm_work->work, bond_xfrm_state_gc_work);
> +	xfrm_work->bond = bond;
> +	xfrm_work->xs = xs;
> +	xfrm_state_hold(xs);
> +
> +	queue_work(bond->wq, &xfrm_work->work);

Note that nothing waits for this work anywhere and .ndo_uninit runs before
bond's .priv_destructor which means ipsec_lock will be destroyed and will be
used afterwards when destroying bond->wq from the destructor if there were
any queued works.
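
One hedged sketch of plugging that hole, assuming the GC items stay on
bond->wq (the helper name is made up):

	/* Sketch: drain all pending bond_xfrm_state_gc_work() items while
	 * ipsec_lock is still alive, e.g. from the .ndo_uninit path before
	 * the destructor tears down bond->wq.
	 */
	static void bond_xfrm_gc_flush(struct bonding *bond)
	{
		flush_workqueue(bond->wq);
	}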

[snip]

Cheers,
 Nik
Hangbin Liu Feb. 25, 2025, 1:13 p.m. UTC | #2
On Tue, Feb 25, 2025 at 01:05:24PM +0200, Nikolay Aleksandrov wrote:
> > @@ -592,15 +611,17 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs)
> >  	real_dev->xfrmdev_ops->xdo_dev_state_delete(xs);
> >  out:
> >  	netdev_put(real_dev, &tracker);
> > -	mutex_lock(&bond->ipsec_lock);
> > -	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> > -		if (ipsec->xs == xs) {
> > -			list_del(&ipsec->list);
> > -			kfree(ipsec);
> > -			break;
> > -		}
> > -	}
> > -	mutex_unlock(&bond->ipsec_lock);
> > +
> > +	xfrm_work = kmalloc(sizeof(*xfrm_work), GFP_ATOMIC);
> > +	if (!xfrm_work)
> > +		return;
> > +
> 
> What happens if this allocation fails? I think you'll leak memory and
> potentially call the xdo_dev callbacks for this xs again because it's
> still in the list. Also this xfrm_work memory doesn't get freed anywhere, so
> you're leaking it as well.

Yes, I thought about this too simply and forgot to free the memory.
> 
> Perhaps you can do this allocation in add_sa; it seems you can sleep
> there and potentially return an error if it fails, so this can never
> fail later. You'll have to be careful with the freeing dance though.

Hmm, if we allocate this in add_sa, how do we get the xfrm_work
in del_sa? Adding the xfrm_work to another list would mean sleeping again
to find it in del_sa.

> Alternatively, make the work a part of struct bond so it doesn't need
> memory management, but then you need a mechanism to queue these items (e.g.
> a separate list with a spinlock) and would have more complexity with freeing
> in parallel.

I used a delayed work queue in bond for my draft patch. As you said,
it needs another list to queue the xs. And during the gc work, we need
to use a spinlock again to get the xs out...
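
Roughly, a sketch of that shape (the xfrm_gc_* names are invented here
for illustration):

	/* Sketch: del_sa pushes entries onto a dedicated list under a
	 * spinlock; one delayed work drains the list and frees the entries
	 * under ipsec_lock as before.
	 */
	static void bond_xfrm_gc_work(struct work_struct *work)
	{
		struct bonding *bond = container_of(to_delayed_work(work),
						    struct bonding, xfrm_gc_dwork);
		LIST_HEAD(gc_list);

		spin_lock_bh(&bond->xfrm_gc_lock);
		list_splice_init(&bond->xfrm_gc_list, &gc_list);
		spin_unlock_bh(&bond->xfrm_gc_lock);

		/* walk gc_list and free the bond_ipsec entries here */
	}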

> 
> > +	INIT_WORK(&xfrm_work->work, bond_xfrm_state_gc_work);
> > +	xfrm_work->bond = bond;
> > +	xfrm_work->xs = xs;
> > +	xfrm_state_hold(xs);
> > +
> > +	queue_work(bond->wq, &xfrm_work->work);
> 
> Note that nothing waits for this work anywhere and .ndo_uninit runs before
> bond's .priv_destructor which means ipsec_lock will be destroyed and will be
> used afterwards when destroying bond->wq from the destructor if there were
> any queued works.

Do you mean we need to register the work queue in bond_init and cancel
it in bond_work_cancel_all()?

Thanks
Hangbin
Nikolay Aleksandrov Feb. 25, 2025, 1:30 p.m. UTC | #3
On 2/25/25 15:13, Hangbin Liu wrote:
> On Tue, Feb 25, 2025 at 01:05:24PM +0200, Nikolay Aleksandrov wrote:
>>> @@ -592,15 +611,17 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs)
>>>  	real_dev->xfrmdev_ops->xdo_dev_state_delete(xs);
>>>  out:
>>>  	netdev_put(real_dev, &tracker);
>>> -	mutex_lock(&bond->ipsec_lock);
>>> -	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
>>> -		if (ipsec->xs == xs) {
>>> -			list_del(&ipsec->list);
>>> -			kfree(ipsec);
>>> -			break;
>>> -		}
>>> -	}
>>> -	mutex_unlock(&bond->ipsec_lock);
>>> +
>>> +	xfrm_work = kmalloc(sizeof(*xfrm_work), GFP_ATOMIC);
>>> +	if (!xfrm_work)
>>> +		return;
>>> +
>>
>> What happens if this allocation fails? I think you'll leak memory and
>> potentially call the xdo_dev callbacks for this xs again because it's
>> still in the list. Also this xfrm_work memory doesn't get freed anywhere, so
>> you're leaking it as well.
> 
> Yes, I thought about this too simply and forgot to free the memory.
>>
>> Perhaps you can do this allocation in add_sa; it seems you can sleep
>> there and potentially return an error if it fails, so this can never
>> fail later. You'll have to be careful with the freeing dance though.
> 
> Hmm, if we allocate this in add_sa, how do we get the xfrm_work
> in del_sa? Adding the xfrm_work to another list would mean sleeping again
> to find it in del_sa.
> 

Well, you have struct bond_ipsec and it is tied to the work's lifetime,
so you can stick the work there. :)
I haven't looked closely at how feasible that is.

>> Alternatively, make the work a part of struct bond so it doesn't need
>> memory management, but then you need a mechanism to queue these items (e.g.
>> a separate list with a spinlock) and would have more complexity with freeing
>> in parallel.
> 
> I used a delayed work queue in bond for my draft patch. As you said,
> it needs another list to queue the xs. And during the gc work, we need
> to use a spinlock again to get the xs out...
> 

Correct, it's a different kind of mess. :)

>>
>>> +	INIT_WORK(&xfrm_work->work, bond_xfrm_state_gc_work);
>>> +	xfrm_work->bond = bond;
>>> +	xfrm_work->xs = xs;
>>> +	xfrm_state_hold(xs);
>>> +
>>> +	queue_work(bond->wq, &xfrm_work->work);
>>
>> Note that nothing waits for this work anywhere and .ndo_uninit runs before
>> bond's .priv_destructor which means ipsec_lock will be destroyed and will be
>> used afterwards when destroying bond->wq from the destructor if there were
>> any queued works.
> 
> Do you mean we need to register the work queue in bond_init and cancel
> it in bond_work_cancel_all()?
> 
> Thanks
> Hangbin

That is one way; the other, if you have access to the work queue items, is to
cancel them directly, which should be easier (i.e. cancel_delayed_work_sync).

Regardless of which way you choose to solve this (gc or work in bond_ipsec), there will
be some dance to get the sequence of events right, and it will not be straightforward.
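
For instance, with the single delayed work from the draft approach, the
teardown side could be as simple as this sketch (xfrm_gc_dwork is a
hypothetical delayed-work field in struct bonding):

	/* Sketch: cancel any pending GC work before bond->wq is destroyed;
	 * must run without ipsec_lock held, since the work acquires it.
	 */
	cancel_delayed_work_sync(&bond->xfrm_gc_dwork);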

Cheers,
 Nik
Nikolay Aleksandrov Feb. 25, 2025, 2:27 p.m. UTC | #4
On 2/25/25 16:00, Cosmin Ratiu wrote:
> On Tue, 2025-02-25 at 09:40 +0000, Hangbin Liu wrote:
>> The fixed commit placed mutex_lock() inside spin_lock_bh(), which
>> triggers
>> a warning like:
>>
>> BUG: sleeping function called from invalid context at...
>>
>> Fix this by moving the mutex_lock() operation to a work queue.
>>
>> Fixes: 2aeeef906d5a ("bonding: change ipsec_lock from spin lock to
>> mutex")
>> Reported-by: Jakub Kicinski <kuba@kernel.org>
>> Closes:
>> https://lore.kernel.org/netdev/20241212062734.182a0164@kernel.org
>> Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
>> ---
>>  drivers/net/bonding/bond_main.c | 41 +++++++++++++++++++++++++--------
>>  include/net/bonding.h           |  6 +++++
>>  2 files changed, 37 insertions(+), 10 deletions(-)
>>
>> diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
>> index e45bba240cbc..cc7064aa4b35 100644
>> --- a/drivers/net/bonding/bond_main.c
>> +++ b/drivers/net/bonding/bond_main.c
>> @@ -551,6 +551,25 @@ static void bond_ipsec_add_sa_all(struct bonding *bond)
>>  	mutex_unlock(&bond->ipsec_lock);
>>  }
>>  
>> +static void bond_xfrm_state_gc_work(struct work_struct *work)
>> +{
>> +	struct bond_xfrm_work *xfrm_work = container_of(work, struct bond_xfrm_work, work);
>> +	struct bonding *bond = xfrm_work->bond;
>> +	struct xfrm_state *xs = xfrm_work->xs;
>> +	struct bond_ipsec *ipsec;
>> +
>> +	mutex_lock(&bond->ipsec_lock);
>> +	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
>> +		if (ipsec->xs == xs) {
>> +			list_del(&ipsec->list);
>> +			kfree(ipsec);
>> +			xfrm_state_put(xs);
> 
> I would expect xfrm_state_put to be called from outside the loop,
> regardless of whether an entry is found in the list or not, because it
> was unconditionally referenced when the work was created.
> 
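> In other words, a sketch of the placement being suggested:
> 
> 	mutex_lock(&bond->ipsec_lock);
> 	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> 		if (ipsec->xs == xs) {
> 			list_del(&ipsec->list);
> 			kfree(ipsec);
> 			break;
> 		}
> 	}
> 	mutex_unlock(&bond->ipsec_lock);
> 	/* balance the xfrm_state_hold() taken when queueing the work,
> 	 * whether or not the entry was found
> 	 */
> 	xfrm_state_put(xs);
> 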
>> +			break;
>> +		}
>> +	}
>> +	mutex_unlock(&bond->ipsec_lock);
>> +}
>> +
>>  /**
>>   * bond_ipsec_del_sa - clear out this specific SA
>>   * @xs: pointer to transformer state struct
>> @@ -558,9 +577,9 @@ static void bond_ipsec_add_sa_all(struct bonding *bond)
>>  static void bond_ipsec_del_sa(struct xfrm_state *xs)
>>  {
>>  	struct net_device *bond_dev = xs->xso.dev;
>> +	struct bond_xfrm_work *xfrm_work;
>>  	struct net_device *real_dev;
>>  	netdevice_tracker tracker;
>> -	struct bond_ipsec *ipsec;
>>  	struct bonding *bond;
>>  	struct slave *slave;
>>  
>> @@ -592,15 +611,17 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs)
>>  	real_dev->xfrmdev_ops->xdo_dev_state_delete(xs);
>>  out:
>>  	netdev_put(real_dev, &tracker);
>> -	mutex_lock(&bond->ipsec_lock);
>> -	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
>> -		if (ipsec->xs == xs) {
>> -			list_del(&ipsec->list);
>> -			kfree(ipsec);
>> -			break;
>> -		}
>> -	}
>> -	mutex_unlock(&bond->ipsec_lock);
>> +
>> +	xfrm_work = kmalloc(sizeof(*xfrm_work), GFP_ATOMIC);
>> +	if (!xfrm_work)
>> +		return;
>> +
>> +	INIT_WORK(&xfrm_work->work, bond_xfrm_state_gc_work);
>> +	xfrm_work->bond = bond;
>> +	xfrm_work->xs = xs;
>> +	xfrm_state_hold(xs);
>> +
>> +	queue_work(bond->wq, &xfrm_work->work);
>>  }
>>  
>>  static void bond_ipsec_del_sa_all(struct bonding *bond)
>> diff --git a/include/net/bonding.h b/include/net/bonding.h
>> index 8bb5f016969f..d54ba5e3affb 100644
>> --- a/include/net/bonding.h
>> +++ b/include/net/bonding.h
>> @@ -209,6 +209,12 @@ struct bond_ipsec {
>>  	struct xfrm_state *xs;
>>  };
>>  
>> +struct bond_xfrm_work {
>> +	struct work_struct work;
>> +	struct bonding *bond;
>> +	struct xfrm_state *xs;
>> +};
> 
> Also, like Nikolay said, something needs to wait on all in-flight work
> items.
> 
> This got me to stare at the code again. What if we move the removal of
> the xs from bond->ipsec from bond_ipsec_del_sa to bond_ipsec_free_sa?
> bond_ipsec_free_sa, unlike bond_ipsec_del_sa, is not called with
> x->lock held. It is called from the xfrm gc task or directly via
> xfrm_state_put_sync and therefore wouldn't suffer from the locking
> issue.
> 
> The tricky part is to make sure that inactive bond->ipsec entries
> (after bond_ipsec_del_sa calls) do not cause issues if there's a
> migration (bond_ipsec_del_sa_all is called) happening before
> bond_ipsec_free_sa. Perhaps filtering by x->km.state != XFRM_STATE_DEAD
> in bond_ipsec_del_sa_all.
> 
> What do you think about this idea?
> 
> Cosmin.

I know the question was for Hangbin, but I do like this solution. I missed
the xdo_dev_state_free callback; it could lead to a much simpler solution
with some care.

Cheers,
 Nik
Hangbin Liu Feb. 26, 2025, 9:48 a.m. UTC | #5
Hi Cosmin,
On Tue, Feb 25, 2025 at 02:00:05PM +0000, Cosmin Ratiu wrote:
> This got me to stare at the code again. What if we move the removal of
> the xs from bond->ipsec from bond_ipsec_del_sa to bond_ipsec_free_sa?
> bond_ipsec_free_sa, unlike bond_ipsec_del_sa, is not called with
> x->lock held. It is called from the xfrm gc task or directly via
> xfrm_state_put_sync and therefore wouldn't suffer from the locking
> issue.
> 
> The tricky part is to make sure that inactive bond->ipsec entries
> (after bond_ipsec_del_sa calls) do not cause issues if there's a
> migration (bond_ipsec_del_sa_all is called) happening before
> bond_ipsec_free_sa. Perhaps filtering by x->km.state != XFRM_STATE_DEAD
> in bond_ipsec_del_sa_all.
> 
> What do you think about this idea?

Thanks a lot for the comments. I also skipped the DEAD xs in add_sa_all.
What about a patch like this:

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index e45bba240cbc..0e4db43a833a 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -537,6 +537,12 @@ static void bond_ipsec_add_sa_all(struct bonding *bond)
 	}
 
 	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+		/* No need to handle DEAD XFRM, as it has already been
+		 * deleted and will be freed later.
+		 */
+		if (ipsec->xs->km.state == XFRM_STATE_DEAD)
+			continue;
+
 		/* If new state is added before ipsec_lock acquired */
 		if (ipsec->xs->xso.real_dev == real_dev)
 			continue;
@@ -592,15 +598,6 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs)
 	real_dev->xfrmdev_ops->xdo_dev_state_delete(xs);
 out:
 	netdev_put(real_dev, &tracker);
-	mutex_lock(&bond->ipsec_lock);
-	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
-		if (ipsec->xs == xs) {
-			list_del(&ipsec->list);
-			kfree(ipsec);
-			break;
-		}
-	}
-	mutex_unlock(&bond->ipsec_lock);
 }
 
 static void bond_ipsec_del_sa_all(struct bonding *bond)
@@ -617,6 +614,12 @@ static void bond_ipsec_del_sa_all(struct bonding *bond)
 
 	mutex_lock(&bond->ipsec_lock);
 	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+		/* No need to handle DEAD XFRM, as it has already been
+		 * deleted and will be freed later.
+		 */
+		if (ipsec->xs->km.state == XFRM_STATE_DEAD)
+			continue;
+
 		if (!ipsec->xs->xso.real_dev)
 			continue;
 
@@ -666,6 +669,16 @@ static void bond_ipsec_free_sa(struct xfrm_state *xs)
 		real_dev->xfrmdev_ops->xdo_dev_state_free(xs);
 out:
 	netdev_put(real_dev, &tracker);
+
+	mutex_lock(&bond->ipsec_lock);
+	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+		if (ipsec->xs == xs) {
+			list_del(&ipsec->list);
+			kfree(ipsec);
+			break;
+		}
+	}
+	mutex_unlock(&bond->ipsec_lock);
 }
 
 /**