diff mbox series

[v7,bpf-next,05/11] tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.

Message ID 20210521182104.18273-6-kuniyu@amazon.co.jp
State Superseded
Headers show
Series Socket migration for SO_REUSEPORT. | expand

Commit Message

Kuniyuki Iwashima May 21, 2021, 6:20 p.m. UTC
When we call close() or shutdown() on a listening socket, each child socket
in its accept queue is freed in inet_csk_listen_stop(). If we can get a
new listener from reuseport_migrate_sock() and clone the request with
inet_reqsk_clone(), we try to add the clone to the new listener's accept
queue with inet_csk_reqsk_queue_add(). If that fails, we have to call
__reqsk_free() to call sock_put() on its listener and free the cloned
request.

After putting the full socket into ehash, tcp_v[46]_syn_recv_sock() sets
ireq_opt/pktopts in struct inet_request_sock to NULL, but ipv6_opt can
still be non-NULL. Because the cloned request carries a copy of the same
ipv6_opt pointer, we have to set ipv6_opt of the old request to NULL to
avoid a double free.

Note that we do not update req->rsk_listener and instead clone the req to
migrate because another path may reference the original request. If we
protected it by RCU, we would need to add rcu_read_lock() in many places.

Link: https://lore.kernel.org/netdev/20201209030903.hhow5r53l6fmozjn@kafai-mbp.dhcp.thefacebook.com/
Suggested-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Acked-by: Martin KaFai Lau <kafai@fb.com>
---
 net/ipv4/inet_connection_sock.c | 71 ++++++++++++++++++++++++++++++++-
 1 file changed, 70 insertions(+), 1 deletion(-)

Comments

Eric Dumazet June 10, 2021, 6:20 p.m. UTC | #1
On 5/21/21 8:20 PM, Kuniyuki Iwashima wrote:
> When we call close() or shutdown() for listening sockets, each child socket

> in the accept queue are freed at inet_csk_listen_stop(). If we can get a

> new listener by reuseport_migrate_sock() and clone the request by

> inet_reqsk_clone(), we try to add it into the new listener's accept queue

> by inet_csk_reqsk_queue_add(). If it fails, we have to call __reqsk_free()

> to call sock_put() for its listener and free the cloned request.

> 

> After putting the full socket into ehash, tcp_v[46]_syn_recv_sock() sets

> NULL to ireq_opt/pktopts in struct inet_request_sock, but ipv6_opt can be

> non-NULL. So, we have to set NULL to ipv6_opt of the old request to avoid

> double free.

> 

> Note that we do not update req->rsk_listener and instead clone the req to

> migrate because another path may reference the original request. If we

> protected it by RCU, we would need to add rcu_read_lock() in many places.

> 

> Link: https://lore.kernel.org/netdev/20201209030903.hhow5r53l6fmozjn@kafai-mbp.dhcp.thefacebook.com/

> Suggested-by: Martin KaFai Lau <kafai@fb.com>

> Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>

> Acked-by: Martin KaFai Lau <kafai@fb.com>

> ---

>  net/ipv4/inet_connection_sock.c | 71 ++++++++++++++++++++++++++++++++-

>  1 file changed, 70 insertions(+), 1 deletion(-)

> 

> diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c

> index fa806e9167ec..07e97b2f3635 100644

> --- a/net/ipv4/inet_connection_sock.c

> +++ b/net/ipv4/inet_connection_sock.c

> @@ -695,6 +695,53 @@ int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req)

>  }

>  EXPORT_SYMBOL(inet_rtx_syn_ack);

>  

> +static struct request_sock *inet_reqsk_clone(struct request_sock *req,

> +					     struct sock *sk)

> +{

> +	struct sock *req_sk, *nreq_sk;

> +	struct request_sock *nreq;

> +

> +	nreq = kmem_cache_alloc(req->rsk_ops->slab, GFP_ATOMIC | __GFP_NOWARN);

> +	if (!nreq) {

> +		/* paired with refcount_inc_not_zero() in reuseport_migrate_sock() */

> +		sock_put(sk);

> +		return NULL;

> +	}

> +

> +	req_sk = req_to_sk(req);

> +	nreq_sk = req_to_sk(nreq);

> +

> +	memcpy(nreq_sk, req_sk,

> +	       offsetof(struct sock, sk_dontcopy_begin));

> +	memcpy(&nreq_sk->sk_dontcopy_end, &req_sk->sk_dontcopy_end,

> +	       req->rsk_ops->obj_size - offsetof(struct sock, sk_dontcopy_end));

> +

> +	sk_node_init(&nreq_sk->sk_node);

> +	nreq_sk->sk_tx_queue_mapping = req_sk->sk_tx_queue_mapping;

> +#ifdef CONFIG_XPS

> +	nreq_sk->sk_rx_queue_mapping = req_sk->sk_rx_queue_mapping;

> +#endif

> +	nreq_sk->sk_incoming_cpu = req_sk->sk_incoming_cpu;

> +	refcount_set(&nreq_sk->sk_refcnt, 0);


Not sure why you clear sk_refcnt here (it is set to 1 later)

> +

> +	nreq->rsk_listener = sk;

> +

> +	/* We need not acquire fastopenq->lock

> +	 * because the child socket is locked in inet_csk_listen_stop().

> +	 */

> +	if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(nreq)->tfo_listener)

> +		rcu_assign_pointer(tcp_sk(nreq->sk)->fastopen_rsk, nreq);

> +

> +	return nreq;

> +}


Ouch, this is going to be hard to maintain...




> +

> +static void reqsk_migrate_reset(struct request_sock *req)

> +{

> +#if IS_ENABLED(CONFIG_IPV6)

> +	inet_rsk(req)->ipv6_opt = NULL;

> +#endif

> +}

> +

>  /* return true if req was found in the ehash table */

>  static bool reqsk_queue_unlink(struct request_sock *req)

>  {

> @@ -1036,14 +1083,36 @@ void inet_csk_listen_stop(struct sock *sk)

>  	 * of the variants now.			--ANK

>  	 */

>  	while ((req = reqsk_queue_remove(queue, sk)) != NULL) {

> -		struct sock *child = req->sk;

> +		struct sock *child = req->sk, *nsk;

> +		struct request_sock *nreq;

>  

>  		local_bh_disable();

>  		bh_lock_sock(child);

>  		WARN_ON(sock_owned_by_user(child));

>  		sock_hold(child);

>  

> +		nsk = reuseport_migrate_sock(sk, child, NULL);

> +		if (nsk) {

> +			nreq = inet_reqsk_clone(req, nsk);

> +			if (nreq) {

> +				refcount_set(&nreq->rsk_refcnt, 1);

> +

> +				if (inet_csk_reqsk_queue_add(nsk, nreq, child)) {

> +					reqsk_migrate_reset(req);

> +				} else {

> +					reqsk_migrate_reset(nreq);

> +					__reqsk_free(nreq);

> +				}

> +

> +				/* inet_csk_reqsk_queue_add() has already

> +				 * called inet_child_forget() on failure case.

> +				 */

> +				goto skip_child_forget;

> +			}

> +		}

> +

>  		inet_child_forget(sk, req, child);

> +skip_child_forget:

>  		reqsk_put(req);

>  		bh_unlock_sock(child);

>  		local_bh_enable();

>
Kuniyuki Iwashima June 10, 2021, 10:45 p.m. UTC | #2
From:   Eric Dumazet <erdnetdev@gmail.com>

Date:   Thu, 10 Jun 2021 20:20:11 +0200
> On 5/21/21 8:20 PM, Kuniyuki Iwashima wrote:

> > When we call close() or shutdown() for listening sockets, each child socket

> > in the accept queue are freed at inet_csk_listen_stop(). If we can get a

> > new listener by reuseport_migrate_sock() and clone the request by

> > inet_reqsk_clone(), we try to add it into the new listener's accept queue

> > by inet_csk_reqsk_queue_add(). If it fails, we have to call __reqsk_free()

> > to call sock_put() for its listener and free the cloned request.

> > 

> > After putting the full socket into ehash, tcp_v[46]_syn_recv_sock() sets

> > NULL to ireq_opt/pktopts in struct inet_request_sock, but ipv6_opt can be

> > non-NULL. So, we have to set NULL to ipv6_opt of the old request to avoid

> > double free.

> > 

> > Note that we do not update req->rsk_listener and instead clone the req to

> > migrate because another path may reference the original request. If we

> > protected it by RCU, we would need to add rcu_read_lock() in many places.

> > 

> > Link: https://lore.kernel.org/netdev/20201209030903.hhow5r53l6fmozjn@kafai-mbp.dhcp.thefacebook.com/

> > Suggested-by: Martin KaFai Lau <kafai@fb.com>

> > Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>

> > Acked-by: Martin KaFai Lau <kafai@fb.com>

> > ---

> >  net/ipv4/inet_connection_sock.c | 71 ++++++++++++++++++++++++++++++++-

> >  1 file changed, 70 insertions(+), 1 deletion(-)

> > 

> > diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c

> > index fa806e9167ec..07e97b2f3635 100644

> > --- a/net/ipv4/inet_connection_sock.c

> > +++ b/net/ipv4/inet_connection_sock.c

> > @@ -695,6 +695,53 @@ int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req)

> >  }

> >  EXPORT_SYMBOL(inet_rtx_syn_ack);

> >  

> > +static struct request_sock *inet_reqsk_clone(struct request_sock *req,

> > +					     struct sock *sk)

> > +{

> > +	struct sock *req_sk, *nreq_sk;

> > +	struct request_sock *nreq;

> > +

> > +	nreq = kmem_cache_alloc(req->rsk_ops->slab, GFP_ATOMIC | __GFP_NOWARN);

> > +	if (!nreq) {

> > +		/* paired with refcount_inc_not_zero() in reuseport_migrate_sock() */

> > +		sock_put(sk);

> > +		return NULL;

> > +	}

> > +

> > +	req_sk = req_to_sk(req);

> > +	nreq_sk = req_to_sk(nreq);

> > +

> > +	memcpy(nreq_sk, req_sk,

> > +	       offsetof(struct sock, sk_dontcopy_begin));

> > +	memcpy(&nreq_sk->sk_dontcopy_end, &req_sk->sk_dontcopy_end,

> > +	       req->rsk_ops->obj_size - offsetof(struct sock, sk_dontcopy_end));

> > +

> > +	sk_node_init(&nreq_sk->sk_node);

> > +	nreq_sk->sk_tx_queue_mapping = req_sk->sk_tx_queue_mapping;

> > +#ifdef CONFIG_XPS

> > +	nreq_sk->sk_rx_queue_mapping = req_sk->sk_rx_queue_mapping;

> > +#endif

> > +	nreq_sk->sk_incoming_cpu = req_sk->sk_incoming_cpu;

> > +	refcount_set(&nreq_sk->sk_refcnt, 0);

> 

> Not sure why you clear sk_refcnt here (it is set to 1 later)


I thought it was safer, but I'm fine with removing the line.


> 

> > +

> > +	nreq->rsk_listener = sk;

> > +

> > +	/* We need not acquire fastopenq->lock

> > +	 * because the child socket is locked in inet_csk_listen_stop().

> > +	 */

> > +	if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(nreq)->tfo_listener)

> > +		rcu_assign_pointer(tcp_sk(nreq->sk)->fastopen_rsk, nreq);

> > +

> > +	return nreq;

> > +}

> 

> Ouch, this is going to be hard to maintain...


How could I make it easier to maintain?


> 

> 

> 

> 

> > +

> > +static void reqsk_migrate_reset(struct request_sock *req)

> > +{

> > +#if IS_ENABLED(CONFIG_IPV6)

> > +	inet_rsk(req)->ipv6_opt = NULL;

> > +#endif

> > +}

> > +

> >  /* return true if req was found in the ehash table */

> >  static bool reqsk_queue_unlink(struct request_sock *req)

> >  {

> > @@ -1036,14 +1083,36 @@ void inet_csk_listen_stop(struct sock *sk)

> >  	 * of the variants now.			--ANK

> >  	 */

> >  	while ((req = reqsk_queue_remove(queue, sk)) != NULL) {

> > -		struct sock *child = req->sk;

> > +		struct sock *child = req->sk, *nsk;

> > +		struct request_sock *nreq;

> >  

> >  		local_bh_disable();

> >  		bh_lock_sock(child);

> >  		WARN_ON(sock_owned_by_user(child));

> >  		sock_hold(child);

> >  

> > +		nsk = reuseport_migrate_sock(sk, child, NULL);

> > +		if (nsk) {

> > +			nreq = inet_reqsk_clone(req, nsk);

> > +			if (nreq) {

> > +				refcount_set(&nreq->rsk_refcnt, 1);

> > +

> > +				if (inet_csk_reqsk_queue_add(nsk, nreq, child)) {

> > +					reqsk_migrate_reset(req);

> > +				} else {

> > +					reqsk_migrate_reset(nreq);

> > +					__reqsk_free(nreq);

> > +				}

> > +

> > +				/* inet_csk_reqsk_queue_add() has already

> > +				 * called inet_child_forget() on failure case.

> > +				 */

> > +				goto skip_child_forget;

> > +			}

> > +		}

> > +

> >  		inet_child_forget(sk, req, child);

> > +skip_child_forget:

> >  		reqsk_put(req);

> >  		bh_unlock_sock(child);

> >  		local_bh_enable();

> >
diff mbox series

Patch

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index fa806e9167ec..07e97b2f3635 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -695,6 +695,53 @@  int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req)
 }
 EXPORT_SYMBOL(inet_rtx_syn_ack);
 
+static struct request_sock *inet_reqsk_clone(struct request_sock *req,
+					     struct sock *sk)
+{
+	struct sock *req_sk, *nreq_sk;
+	struct request_sock *nreq;
+
+	nreq = kmem_cache_alloc(req->rsk_ops->slab, GFP_ATOMIC | __GFP_NOWARN);
+	if (!nreq) {
+		/* paired with refcount_inc_not_zero() in reuseport_migrate_sock() */
+		sock_put(sk);
+		return NULL;
+	}
+
+	req_sk = req_to_sk(req);
+	nreq_sk = req_to_sk(nreq);
+
+	memcpy(nreq_sk, req_sk,
+	       offsetof(struct sock, sk_dontcopy_begin));
+	memcpy(&nreq_sk->sk_dontcopy_end, &req_sk->sk_dontcopy_end,
+	       req->rsk_ops->obj_size - offsetof(struct sock, sk_dontcopy_end));
+
+	sk_node_init(&nreq_sk->sk_node);
+	nreq_sk->sk_tx_queue_mapping = req_sk->sk_tx_queue_mapping;
+#ifdef CONFIG_XPS
+	nreq_sk->sk_rx_queue_mapping = req_sk->sk_rx_queue_mapping;
+#endif
+	nreq_sk->sk_incoming_cpu = req_sk->sk_incoming_cpu;
+	refcount_set(&nreq_sk->sk_refcnt, 0);
+
+	nreq->rsk_listener = sk;
+
+	/* We need not acquire fastopenq->lock
+	 * because the child socket is locked in inet_csk_listen_stop().
+	 */
+	if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(nreq)->tfo_listener)
+		rcu_assign_pointer(tcp_sk(nreq->sk)->fastopen_rsk, nreq);
+
+	return nreq;
+}
+
+static void reqsk_migrate_reset(struct request_sock *req)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	inet_rsk(req)->ipv6_opt = NULL;
+#endif
+}
+
 /* return true if req was found in the ehash table */
 static bool reqsk_queue_unlink(struct request_sock *req)
 {
@@ -1036,14 +1083,36 @@  void inet_csk_listen_stop(struct sock *sk)
 	 * of the variants now.			--ANK
 	 */
 	while ((req = reqsk_queue_remove(queue, sk)) != NULL) {
-		struct sock *child = req->sk;
+		struct sock *child = req->sk, *nsk;
+		struct request_sock *nreq;
 
 		local_bh_disable();
 		bh_lock_sock(child);
 		WARN_ON(sock_owned_by_user(child));
 		sock_hold(child);
 
+		nsk = reuseport_migrate_sock(sk, child, NULL);
+		if (nsk) {
+			nreq = inet_reqsk_clone(req, nsk);
+			if (nreq) {
+				refcount_set(&nreq->rsk_refcnt, 1);
+
+				if (inet_csk_reqsk_queue_add(nsk, nreq, child)) {
+					reqsk_migrate_reset(req);
+				} else {
+					reqsk_migrate_reset(nreq);
+					__reqsk_free(nreq);
+				}
+
+				/* inet_csk_reqsk_queue_add() has already
+				 * called inet_child_forget() on failure case.
+				 */
+				goto skip_child_forget;
+			}
+		}
+
 		inet_child_forget(sk, req, child);
+skip_child_forget:
 		reqsk_put(req);
 		bh_unlock_sock(child);
 		local_bh_enable();