diff mbox series

[v5,bpf-next,07/11] tcp: Migrate TCP_NEW_SYN_RECV requests at receiving the final ACK.

Message ID 20210510034433.52818-8-kuniyu@amazon.co.jp
State New
Headers show
Series Socket migration for SO_REUSEPORT. | expand

Commit Message

Kuniyuki Iwashima May 10, 2021, 3:44 a.m. UTC
This patch also changes the code to call reuseport_migrate_sock() and
reqsk_clone(), but unlike the other cases, we do not call reqsk_clone()
right after reuseport_migrate_sock().

Currently, in the receive path for TCP_NEW_SYN_RECV sockets, its listener
has three kinds of refcnt:

  (A) for listener itself
  (B) carried by request_sock
  (C) sock_hold() in tcp_v[46]_rcv()

While processing the req, (A) may disappear by close(listener). Also, (B)
can disappear by accept(listener) once we put the req into the accept
queue. So, we have to hold another refcnt (C) for the listener to prevent
use-after-free.

For socket migration, we call reuseport_migrate_sock() to select a listener
with (A) and to increment the new listener's refcnt in tcp_v[46]_rcv().
This refcnt corresponds to (C) and is cleaned up later in tcp_v[46]_rcv().
Thus we have to take another refcnt (B) for the newly cloned request_sock.

In inet_csk_complete_hashdance(), we hold the refcnt (B), clone the req, and
try to put the new req into the accept queue. By migrating the req only after
winning the "own_req" race, we can avoid a worst-case situation like this:

  CPU 1 looks up req1
  CPU 2 looks up req1, unhashes it, then CPU 1 loses the race
  CPU 3 looks up req2, unhashes it, then CPU 2 loses the race
  ...

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
---
 net/ipv4/inet_connection_sock.c | 30 +++++++++++++++++++++++++++++-
 net/ipv4/tcp_ipv4.c             | 20 ++++++++++++++------
 net/ipv6/tcp_ipv6.c             | 14 +++++++++++---
 3 files changed, 54 insertions(+), 10 deletions(-)

Comments

Martin KaFai Lau May 15, 2021, 1:13 a.m. UTC | #1
On Mon, May 10, 2021 at 12:44:29PM +0900, Kuniyuki Iwashima wrote:
> diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c

> index e690d1cff36e..fe666dc5c621 100644

> --- a/net/ipv4/inet_connection_sock.c

> +++ b/net/ipv4/inet_connection_sock.c

> @@ -1075,10 +1075,38 @@ struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,

>  	if (own_req) {

>  		inet_csk_reqsk_queue_drop(sk, req);

>  		reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);

In the migration case 'sk != req->rsk_listener', is sk the right
one to pass in the above two functions?

> -		if (inet_csk_reqsk_queue_add(sk, req, child))

> +

> +		if (sk != req->rsk_listener) {

> +			/* another listening sk has been selected,

> +			 * migrate the req to it.

> +			 */

> +			struct request_sock *nreq;

> +

> +			/* hold a refcnt for the nreq->rsk_listener

> +			 * which is assigned in reqsk_clone()

> +			 */

> +			sock_hold(sk);

> +			nreq = reqsk_clone(req, sk);

> +			if (!nreq) {

> +				inet_child_forget(sk, req, child);

> +				goto child_put;

> +			}

> +

> +			refcount_set(&nreq->rsk_refcnt, 1);

> +			if (inet_csk_reqsk_queue_add(sk, nreq, child)) {

> +				reqsk_migrate_reset(req);

> +				reqsk_put(req);

> +				return child;

> +			}

> +

> +			reqsk_migrate_reset(nreq);

> +			__reqsk_free(nreq);

> +		} else if (inet_csk_reqsk_queue_add(sk, req, child)) {

>  			return child;

> +		}

>  	}

>  	/* Too bad, another child took ownership of the request, undo. */

> +child_put:

>  	bh_unlock_sock(child);

>  	sock_put(child);

>  	return NULL;
Kuniyuki Iwashima May 15, 2021, 4:18 a.m. UTC | #2
From:   Martin KaFai Lau <kafai@fb.com>

Date:   Fri, 14 May 2021 18:13:05 -0700
> On Mon, May 10, 2021 at 12:44:29PM +0900, Kuniyuki Iwashima wrote:

> > diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c

> > index e690d1cff36e..fe666dc5c621 100644

> > --- a/net/ipv4/inet_connection_sock.c

> > +++ b/net/ipv4/inet_connection_sock.c

> > @@ -1075,10 +1075,38 @@ struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,

> >  	if (own_req) {

> >  		inet_csk_reqsk_queue_drop(sk, req);

> >  		reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);

> In the migration case 'sk != req->rsk_listener', is sk the right

> one to pass in the above two functions?


Good catch, 'sk' should be 'req->rsk_listener' here.
Thank you!


> 

> > -		if (inet_csk_reqsk_queue_add(sk, req, child))

> > +

> > +		if (sk != req->rsk_listener) {

> > +			/* another listening sk has been selected,

> > +			 * migrate the req to it.

> > +			 */

> > +			struct request_sock *nreq;

> > +

> > +			/* hold a refcnt for the nreq->rsk_listener

> > +			 * which is assigned in reqsk_clone()

> > +			 */

> > +			sock_hold(sk);

> > +			nreq = reqsk_clone(req, sk);

> > +			if (!nreq) {

> > +				inet_child_forget(sk, req, child);

> > +				goto child_put;

> > +			}

> > +

> > +			refcount_set(&nreq->rsk_refcnt, 1);

> > +			if (inet_csk_reqsk_queue_add(sk, nreq, child)) {

> > +				reqsk_migrate_reset(req);

> > +				reqsk_put(req);

> > +				return child;

> > +			}

> > +

> > +			reqsk_migrate_reset(nreq);

> > +			__reqsk_free(nreq);

> > +		} else if (inet_csk_reqsk_queue_add(sk, req, child)) {

> >  			return child;

> > +		}

> >  	}

> >  	/* Too bad, another child took ownership of the request, undo. */

> > +child_put:

> >  	bh_unlock_sock(child);

> >  	sock_put(child);

> >  	return NULL;
diff mbox series

Patch

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index e690d1cff36e..fe666dc5c621 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -1075,10 +1075,38 @@  struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
 	if (own_req) {
 		inet_csk_reqsk_queue_drop(sk, req);
 		reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
-		if (inet_csk_reqsk_queue_add(sk, req, child))
+
+		if (sk != req->rsk_listener) {
+			/* another listening sk has been selected,
+			 * migrate the req to it.
+			 */
+			struct request_sock *nreq;
+
+			/* hold a refcnt for the nreq->rsk_listener
+			 * which is assigned in reqsk_clone()
+			 */
+			sock_hold(sk);
+			nreq = reqsk_clone(req, sk);
+			if (!nreq) {
+				inet_child_forget(sk, req, child);
+				goto child_put;
+			}
+
+			refcount_set(&nreq->rsk_refcnt, 1);
+			if (inet_csk_reqsk_queue_add(sk, nreq, child)) {
+				reqsk_migrate_reset(req);
+				reqsk_put(req);
+				return child;
+			}
+
+			reqsk_migrate_reset(nreq);
+			__reqsk_free(nreq);
+		} else if (inet_csk_reqsk_queue_add(sk, req, child)) {
 			return child;
+		}
 	}
 	/* Too bad, another child took ownership of the request, undo. */
+child_put:
 	bh_unlock_sock(child);
 	sock_put(child);
 	return NULL;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 312184cead57..214495d02143 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2000,13 +2000,21 @@  int tcp_v4_rcv(struct sk_buff *skb)
 			goto csum_error;
 		}
 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
-			inet_csk_reqsk_queue_drop_and_put(sk, req);
-			goto lookup;
+			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
+			if (!nsk) {
+				inet_csk_reqsk_queue_drop_and_put(sk, req);
+				goto lookup;
+			}
+			sk = nsk;
+			/* reuseport_migrate_sock() has already held one sk_refcnt
+			 * before returning.
+			 */
+		} else {
+			/* We own a reference on the listener, increase it again
+			 * as we might lose it too soon.
+			 */
+			sock_hold(sk);
 		}
-		/* We own a reference on the listener, increase it again
-		 * as we might lose it too soon.
-		 */
-		sock_hold(sk);
 		refcounted = true;
 		nsk = NULL;
 		if (!tcp_filter(sk, skb)) {
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 5f47c0b6e3de..aea8e75d3fed 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1663,10 +1663,18 @@  INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
 			goto csum_error;
 		}
 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
-			inet_csk_reqsk_queue_drop_and_put(sk, req);
-			goto lookup;
+			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
+			if (!nsk) {
+				inet_csk_reqsk_queue_drop_and_put(sk, req);
+				goto lookup;
+			}
+			sk = nsk;
+			/* reuseport_migrate_sock() has already held one sk_refcnt
+			 * before returning.
+			 */
+		} else {
+			sock_hold(sk);
 		}
-		sock_hold(sk);
 		refcounted = true;
 		nsk = NULL;
 		if (!tcp_filter(sk, skb)) {