From patchwork Mon Mar 23 21:26:26 2020 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Mat Martineau X-Patchwork-Id: 222025 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-9.8 required=3.0 tests=HEADER_FROM_DIFFERENT_DOMAINS, INCLUDES_PATCH, MAILING_LIST_MULTI, SIGNED_OFF_BY, SPF_HELO_NONE, SPF_PASS, URIBL_BLOCKED, USER_AGENT_GIT autolearn=ham autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id 15067C10DCE for ; Mon, 23 Mar 2020 21:29:36 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by mail.kernel.org (Postfix) with ESMTP id D51EB2073E for ; Mon, 23 Mar 2020 21:29:35 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1727091AbgCWV3e (ORCPT ); Mon, 23 Mar 2020 17:29:34 -0400 Received: from mga02.intel.com ([134.134.136.20]:60319 "EHLO mga02.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1726955AbgCWV3e (ORCPT ); Mon, 23 Mar 2020 17:29:34 -0400 IronPort-SDR: DdMWqGxxzLRS90LcxJOOjY/cRYS2KpVyT/nRVpnTgN1qAr2DtKe8LoJVhNTOLJNBXOlaINygrM 706du+xvUIeQ== X-Amp-Result: SKIPPED(no attachment in message) X-Amp-File-Uploaded: False Received: from fmsmga005.fm.intel.com ([10.253.24.32]) by orsmga101.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 23 Mar 2020 14:29:32 -0700 IronPort-SDR: JaPe/eHaFXfIZFZDnJI0COlJBp5ONvSy9QyzDM1knjfIdhUOPCIO8Tno0HKt4k+puwkGzU97xm w4wtUdxhRYTA== X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.72,297,1580803200"; d="scan'208";a="445960392" Received: from mjmartin-nuc02.mjmartin-nuc02 (HELO mjmartin-nuc02.sea.intel.com) ([10.254.100.76]) by fmsmga005.fm.intel.com with ESMTP; 23 Mar 2020 14:29:32 -0700 From: Mat Martineau To: netdev@vger.kernel.org Cc: Peter Krystad , eric.dumazet@gmail.com, Matthieu Baerts , Paolo Abeni , Mat Martineau Subject: [PATCH net-next 01/17] mptcp: Add ADD_ADDR handling Date: Mon, 23 Mar 2020 14:26:26 -0700 Message-Id: <20200323212642.34104-2-mathew.j.martineau@linux.intel.com> X-Mailer: git-send-email 2.26.0 In-Reply-To: <20200323212642.34104-1-mathew.j.martineau@linux.intel.com> References: <20200323212642.34104-1-mathew.j.martineau@linux.intel.com> MIME-Version: 1.0 Sender: netdev-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: netdev@vger.kernel.org From: Peter Krystad Add handling for sending and receiving the ADD_ADDR, ADD_ADDR6, and RM_ADDR suboptions. Co-developed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Signed-off-by: Peter Krystad Signed-off-by: Mat Martineau --- include/linux/tcp.h | 20 ++++- include/net/mptcp.h | 9 ++ net/mptcp/crypto.c | 17 ++-- net/mptcp/options.c | 206 +++++++++++++++++++++++++++++++++++++++++-- net/mptcp/protocol.h | 28 +++++- 5 files changed, 262 insertions(+), 18 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 3dc964010fef..1225db308957 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -86,9 +86,13 @@ struct mptcp_options_received { u64 data_seq; u32 subflow_seq; u16 data_len; - u8 mp_capable : 1, + u16 mp_capable : 1, mp_join : 1, - dss : 1; + dss : 1, + add_addr : 1, + rm_addr : 1, + family : 4, + echo : 1; u8 use_map:1, dsn64:1, data_fin:1, @@ -96,6 +100,16 @@ struct mptcp_options_received { ack64:1, mpc_map:1, __unused:2; + u8 addr_id; + u8 rm_id; + union { + struct in_addr addr; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + struct in6_addr addr6; +#endif + }; + u64 ahmac; + u16 port; }; #endif @@ -131,6 +145,8 @@ static inline void tcp_clear_options(struct tcp_options_received *rx_opt) #if IS_ENABLED(CONFIG_MPTCP) rx_opt->mptcp.mp_capable = 0; rx_opt->mptcp.mp_join = 0; + rx_opt->mptcp.add_addr = 0; + rx_opt->mptcp.rm_addr = 0; rx_opt->mptcp.dss = 0; #endif } diff --git a/include/net/mptcp.h b/include/net/mptcp.h index c971d25431ea..0d5ea71dd3d0 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -33,6 +33,15 @@ struct mptcp_out_options { u16 suboptions; u64 sndr_key; u64 rcvr_key; + union { + struct in_addr addr; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + struct in6_addr addr6; +#endif + }; + u8 addr_id; + u64 ahmac; + u8 rm_id; struct mptcp_ext ext_copy; #endif }; diff --git a/net/mptcp/crypto.c b/net/mptcp/crypto.c index 40d1bb18fd60..c151628bd416 100644 --- a/net/mptcp/crypto.c +++ b/net/mptcp/crypto.c @@ -44,8 +44,7 @@ void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn) *idsn = be64_to_cpu(*((__be64 *)&mptcp_hashed_key[6])); } -void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2, - void *hmac) +void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac) { u8 input[SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE]; __be32 mptcp_hashed_key[SHA256_DIGEST_WORDS]; @@ -55,6 +54,9 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2, u8 key2be[8]; int i; + if (WARN_ON_ONCE(len > SHA256_DIGEST_SIZE)) + len = SHA256_DIGEST_SIZE; + put_unaligned_be64(key1, key1be); put_unaligned_be64(key2, key2be); @@ -65,11 +67,10 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2, for (i = 0; i < 8; i++) input[i + 8] ^= key2be[i]; - put_unaligned_be32(nonce1, &input[SHA256_BLOCK_SIZE]); - put_unaligned_be32(nonce2, &input[SHA256_BLOCK_SIZE + 4]); + memcpy(&input[SHA256_BLOCK_SIZE], msg, len); sha256_init(&state); - sha256_update(&state, input, SHA256_BLOCK_SIZE + 8); + sha256_update(&state, input, SHA256_BLOCK_SIZE + len); /* emit sha256(K1 || msg) on the second input block, so we can * reuse 'input' for the last hashing @@ -125,6 +126,7 @@ static int __init test_mptcp_crypto(void) char hmac[20], hmac_hex[41]; u32 nonce1, nonce2; u64 key1, key2; + u8 msg[8]; int i, j; for (i = 0; i < ARRAY_SIZE(tests); ++i) { @@ -134,7 +136,10 @@ static int __init test_mptcp_crypto(void) nonce1 = be32_to_cpu(*((__be32 *)&tests[i].msg[0])); nonce2 = be32_to_cpu(*((__be32 *)&tests[i].msg[4])); - mptcp_crypto_hmac_sha(key1, key2, nonce1, nonce2, hmac); + put_unaligned_be32(nonce1, &msg[0]); + put_unaligned_be32(nonce2, &msg[4]); + + mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac); for (j = 0; j < 20; ++j) sprintf(&hmac_hex[j << 1], "%02x", hmac[j] & 0xff); hmac_hex[40] = 0; diff --git a/net/mptcp/options.c b/net/mptcp/options.c index aea1a62d9999..6c6c18a09a40 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -178,6 +178,71 @@ void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr, break; + case MPTCPOPT_ADD_ADDR: + mp_opt->echo = (*ptr++) & MPTCP_ADDR_ECHO; + if (!mp_opt->echo) { + if (opsize == TCPOLEN_MPTCP_ADD_ADDR || + opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT) + mp_opt->family = MPTCP_ADDR_IPVERSION_4; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6 || + opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT) + mp_opt->family = MPTCP_ADDR_IPVERSION_6; +#endif + else + break; + } else { + if (opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE || + opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT) + mp_opt->family = MPTCP_ADDR_IPVERSION_4; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE || + opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT) + mp_opt->family = MPTCP_ADDR_IPVERSION_6; +#endif + else + break; + } + + mp_opt->add_addr = 1; + mp_opt->port = 0; + mp_opt->addr_id = *ptr++; + pr_debug("ADD_ADDR: id=%d", mp_opt->addr_id); + if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) { + memcpy((u8 *)&mp_opt->addr.s_addr, (u8 *)ptr, 4); + ptr += 4; + if (opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT || + opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT) { + mp_opt->port = get_unaligned_be16(ptr); + ptr += 2; + } + } +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else { + memcpy(mp_opt->addr6.s6_addr, (u8 *)ptr, 16); + ptr += 16; + if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT || + opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT) { + mp_opt->port = get_unaligned_be16(ptr); + ptr += 2; + } + } +#endif + if (!mp_opt->echo) { + mp_opt->ahmac = get_unaligned_be64(ptr); + ptr += 8; + } + break; + + case MPTCPOPT_RM_ADDR: + if (opsize != TCPOLEN_MPTCP_RM_ADDR_BASE) + break; + + mp_opt->rm_addr = 1; + mp_opt->rm_id = *ptr++; + pr_debug("RM_ADDR: id=%d", mp_opt->rm_id); + break; + default: break; } @@ -386,6 +451,84 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, return true; } +static u64 add_addr_generate_hmac(u64 key1, u64 key2, u8 addr_id, + struct in_addr *addr) +{ + u8 hmac[MPTCP_ADDR_HMAC_LEN]; + u8 msg[7]; + + msg[0] = addr_id; + memcpy(&msg[1], &addr->s_addr, 4); + msg[5] = 0; + msg[6] = 0; + + mptcp_crypto_hmac_sha(key1, key2, msg, 7, hmac); + + return get_unaligned_be64(hmac); +} + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +static u64 add_addr6_generate_hmac(u64 key1, u64 key2, u8 addr_id, + struct in6_addr *addr) +{ + u8 hmac[MPTCP_ADDR_HMAC_LEN]; + u8 msg[19]; + + msg[0] = addr_id; + memcpy(&msg[1], &addr->s6_addr, 16); + msg[17] = 0; + msg[18] = 0; + + mptcp_crypto_hmac_sha(key1, key2, msg, 19, hmac); + + return get_unaligned_be64(hmac); +} +#endif + +static bool mptcp_established_options_addr(struct sock *sk, + unsigned int *size, + unsigned int remaining, + struct mptcp_out_options *opts) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); + struct sockaddr_storage saddr; + u8 id; + + id = 0; + memset(&saddr, 0, sizeof(saddr)); + + if (saddr.ss_family == AF_INET) { + if (remaining < TCPOLEN_MPTCP_ADD_ADDR) + return false; + opts->suboptions |= OPTION_MPTCP_ADD_ADDR; + opts->addr_id = id; + opts->addr = ((struct sockaddr_in *)&saddr)->sin_addr; + opts->ahmac = add_addr_generate_hmac(msk->local_key, + msk->remote_key, + opts->addr_id, + &opts->addr); + *size = TCPOLEN_MPTCP_ADD_ADDR; + } +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (saddr.ss_family == AF_INET6) { + if (remaining < TCPOLEN_MPTCP_ADD_ADDR6) + return false; + opts->suboptions |= OPTION_MPTCP_ADD_ADDR6; + opts->addr_id = id; + opts->ahmac = add_addr6_generate_hmac(msk->local_key, + msk->remote_key, + opts->addr_id, + &opts->addr6); + opts->addr6 = ((struct sockaddr_in6 *)&saddr)->sin6_addr; + *size = TCPOLEN_MPTCP_ADD_ADDR6; + } +#endif + pr_debug("addr_id=%d, ahmac=%llu", opts->addr_id, opts->ahmac); + + return true; +} + bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) @@ -393,6 +536,8 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int opt_size = 0; bool ret = false; + opts->suboptions = 0; + if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts)) ret = true; else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining, @@ -407,6 +552,11 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, *size += opt_size; remaining -= opt_size; + if (mptcp_established_options_addr(sk, &opt_size, remaining, opts)) { + *size += opt_size; + remaining -= opt_size; + ret = true; + } return ret; } @@ -521,10 +671,9 @@ void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts) else len = TCPOLEN_MPTCP_MPC_ACK; - *ptr++ = htonl((TCPOPT_MPTCP << 24) | (len << 16) | - (MPTCPOPT_MP_CAPABLE << 12) | - (MPTCP_SUPPORTED_VERSION << 8) | - MPTCP_CAP_HMAC_SHA256); + *ptr++ = mptcp_option(MPTCPOPT_MP_CAPABLE, len, + MPTCP_SUPPORTED_VERSION, + MPTCP_CAP_HMAC_SHA256); if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) & opts->suboptions)) @@ -546,6 +695,50 @@ void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts) } mp_capable_done: + if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) { + if (opts->ahmac) + *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, + TCPOLEN_MPTCP_ADD_ADDR, 0, + opts->addr_id); + else + *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, + TCPOLEN_MPTCP_ADD_ADDR_BASE, + MPTCP_ADDR_ECHO, + opts->addr_id); + memcpy((u8 *)ptr, (u8 *)&opts->addr.s_addr, 4); + ptr += 1; + if (opts->ahmac) { + put_unaligned_be64(opts->ahmac, ptr); + ptr += 2; + } + } + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + if (OPTION_MPTCP_ADD_ADDR6 & opts->suboptions) { + if (opts->ahmac) + *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, + TCPOLEN_MPTCP_ADD_ADDR6, 0, + opts->addr_id); + else + *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, + TCPOLEN_MPTCP_ADD_ADDR6_BASE, + MPTCP_ADDR_ECHO, + opts->addr_id); + memcpy((u8 *)ptr, opts->addr6.s6_addr, 16); + ptr += 4; + if (opts->ahmac) { + put_unaligned_be64(opts->ahmac, ptr); + ptr += 2; + } + } +#endif + + if (OPTION_MPTCP_RM_ADDR & opts->suboptions) { + *ptr++ = mptcp_option(MPTCPOPT_RM_ADDR, + TCPOLEN_MPTCP_RM_ADDR_BASE, + 0, opts->rm_id); + } + if (opts->ext_copy.use_ack || opts->ext_copy.use_map) { struct mptcp_ext *mpext = &opts->ext_copy; u8 len = TCPOLEN_MPTCP_DSS_BASE; @@ -567,10 +760,7 @@ void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts) flags |= MPTCP_DSS_DATA_FIN; } - *ptr++ = htonl((TCPOPT_MPTCP << 24) | - (len << 16) | - (MPTCPOPT_DSS << 12) | - (flags)); + *ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags); if (mpext->use_ack) { put_unaligned_be64(mpext->data_ack, ptr); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index eb3f65264a40..471e013d1c32 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -17,6 +17,9 @@ #define OPTION_MPTCP_MPC_SYN BIT(0) #define OPTION_MPTCP_MPC_SYNACK BIT(1) #define OPTION_MPTCP_MPC_ACK BIT(2) +#define OPTION_MPTCP_ADD_ADDR BIT(6) +#define OPTION_MPTCP_ADD_ADDR6 BIT(7) +#define OPTION_MPTCP_RM_ADDR BIT(8) /* MPTCP option subtypes */ #define MPTCPOPT_MP_CAPABLE 0 @@ -39,6 +42,16 @@ #define TCPOLEN_MPTCP_DSS_MAP32 10 #define TCPOLEN_MPTCP_DSS_MAP64 14 #define TCPOLEN_MPTCP_DSS_CHECKSUM 2 +#define TCPOLEN_MPTCP_ADD_ADDR 16 +#define TCPOLEN_MPTCP_ADD_ADDR_PORT 18 +#define TCPOLEN_MPTCP_ADD_ADDR_BASE 8 +#define TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT 10 +#define TCPOLEN_MPTCP_ADD_ADDR6 28 +#define TCPOLEN_MPTCP_ADD_ADDR6_PORT 30 +#define TCPOLEN_MPTCP_ADD_ADDR6_BASE 20 +#define TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT 22 +#define TCPOLEN_MPTCP_PORT_LEN 2 +#define TCPOLEN_MPTCP_RM_ADDR_BASE 4 /* MPTCP MP_CAPABLE flags */ #define MPTCP_VERSION_MASK (0x0F) @@ -55,10 +68,22 @@ #define MPTCP_DSS_HAS_ACK BIT(0) #define MPTCP_DSS_FLAG_MASK (0x1F) +/* MPTCP ADD_ADDR flags */ +#define MPTCP_ADDR_ECHO BIT(0) +#define MPTCP_ADDR_HMAC_LEN 20 +#define MPTCP_ADDR_IPVERSION_4 4 +#define MPTCP_ADDR_IPVERSION_6 6 + /* MPTCP socket flags */ #define MPTCP_DATA_READY 0 #define MPTCP_SEND_SPACE 1 +static inline __be32 mptcp_option(u8 subopt, u8 len, u8 nib, u8 field) +{ + return htonl((TCPOPT_MPTCP << 24) | (len << 16) | (subopt << 12) | + ((nib & 0xF) << 8) | field); +} + /* MPTCP connection sock */ struct mptcp_sock { /* inet_connection_sock must be the first member */ @@ -219,8 +244,7 @@ static inline void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn) mptcp_crypto_key_sha(*key, token, idsn); } -void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2, - void *hash_out); +void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac); static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff *skb) { From patchwork Mon Mar 23 21:26:27 2020 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Mat Martineau X-Patchwork-Id: 222017 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-9.8 required=3.0 tests=HEADER_FROM_DIFFERENT_DOMAINS, INCLUDES_PATCH, MAILING_LIST_MULTI, SIGNED_OFF_BY, SPF_HELO_NONE, SPF_PASS, URIBL_BLOCKED, USER_AGENT_GIT autolearn=ham autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id EB3B5C4332B for ; Mon, 23 Mar 2020 21:30:00 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by mail.kernel.org (Postfix) with ESMTP id B973520719 for ; Mon, 23 Mar 2020 21:30:00 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1727238AbgCWVaA (ORCPT ); Mon, 23 Mar 2020 17:30:00 -0400 Received: from mga02.intel.com ([134.134.136.20]:60319 "EHLO mga02.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1726203AbgCWV3e (ORCPT ); Mon, 23 Mar 2020 17:29:34 -0400 IronPort-SDR: FUoN/LIYMx5f7fbJQuf+4j7KrDZ0B9MYbrIqvEA0EtTnXRD1Bt9S1QAj9VudLUh61tO4VYtqJc 7f+tDIqOlzSw== X-Amp-Result: SKIPPED(no attachment in message) X-Amp-File-Uploaded: False Received: from fmsmga005.fm.intel.com ([10.253.24.32]) by orsmga101.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 23 Mar 2020 14:29:33 -0700 IronPort-SDR: UrKFidHdAlGCjRswzQYgi6gXsLeb1O6VS4SpLerC7IPtWULlGvvIfvEqi8GkvUXOc+kkd5tZv/ p2IxVpgu68Gw== X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.72,297,1580803200"; d="scan'208";a="445960395" Received: from mjmartin-nuc02.mjmartin-nuc02 (HELO mjmartin-nuc02.sea.intel.com) ([10.254.100.76]) by fmsmga005.fm.intel.com with ESMTP; 23 Mar 2020 14:29:32 -0700 From: Mat Martineau To: netdev@vger.kernel.org Cc: Peter Krystad , eric.dumazet@gmail.com, Matthieu Baerts , Florian Westphal , Paolo Abeni , Mat Martineau Subject: [PATCH net-next 02/17] mptcp: Add path manager interface Date: Mon, 23 Mar 2020 14:26:27 -0700 Message-Id: <20200323212642.34104-3-mathew.j.martineau@linux.intel.com> X-Mailer: git-send-email 2.26.0 In-Reply-To: <20200323212642.34104-1-mathew.j.martineau@linux.intel.com> References: <20200323212642.34104-1-mathew.j.martineau@linux.intel.com> MIME-Version: 1.0 Sender: netdev-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: netdev@vger.kernel.org From: Peter Krystad Add enough of a path manager interface to allow sending of ADD_ADDR when an incoming MPTCP connection is created. Capable of sending only a single IPv4 ADD_ADDR option. The 'pm_data' element of the connection sock will need to be expanded to handle multiple interfaces and IPv6. Partial processing of the incoming ADD_ADDR is included so the path manager notification of that event happens at the proper time, which involves validating the incoming address information. This is a skeleton interface definition for events generated by MPTCP. Co-developed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Co-developed-by: Florian Westphal Signed-off-by: Florian Westphal Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Co-developed-by: Mat Martineau Signed-off-by: Mat Martineau Signed-off-by: Peter Krystad --- net/mptcp/Makefile | 2 +- net/mptcp/options.c | 80 ++++++++++++++++++++++++------ net/mptcp/pm.c | 113 +++++++++++++++++++++++++++++++++++++++++++ net/mptcp/protocol.c | 5 ++ net/mptcp/protocol.h | 79 ++++++++++++++++++++++++++++++ net/mptcp/subflow.c | 4 +- 6 files changed, 264 insertions(+), 19 deletions(-) create mode 100644 net/mptcp/pm.c diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile index 4e98d9edfd0a..2848d723c252 100644 --- a/net/mptcp/Makefile +++ b/net/mptcp/Makefile @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_MPTCP) += mptcp.o -mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o +mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 6c6c18a09a40..a3661318a7af 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -492,36 +492,35 @@ static bool mptcp_established_options_addr(struct sock *sk, { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); - struct sockaddr_storage saddr; - u8 id; + struct mptcp_addr_info saddr; + int len; - id = 0; - memset(&saddr, 0, sizeof(saddr)); + if (!mptcp_pm_should_signal(msk) || + !(mptcp_pm_addr_signal(msk, remaining, &saddr))) + return false; + + len = mptcp_add_addr_len(saddr.family); + if (remaining < len) + return false; - if (saddr.ss_family == AF_INET) { - if (remaining < TCPOLEN_MPTCP_ADD_ADDR) - return false; + *size = len; + opts->addr_id = saddr.id; + if (saddr.family == AF_INET) { opts->suboptions |= OPTION_MPTCP_ADD_ADDR; - opts->addr_id = id; - opts->addr = ((struct sockaddr_in *)&saddr)->sin_addr; + opts->addr = saddr.addr; opts->ahmac = add_addr_generate_hmac(msk->local_key, msk->remote_key, opts->addr_id, &opts->addr); - *size = TCPOLEN_MPTCP_ADD_ADDR; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) - else if (saddr.ss_family == AF_INET6) { - if (remaining < TCPOLEN_MPTCP_ADD_ADDR6) - return false; + else if (saddr.family == AF_INET6) { opts->suboptions |= OPTION_MPTCP_ADD_ADDR6; - opts->addr_id = id; + opts->addr6 = saddr.addr6; opts->ahmac = add_addr6_generate_hmac(msk->local_key, msk->remote_key, opts->addr_id, &opts->addr6); - opts->addr6 = ((struct sockaddr_in6 *)&saddr)->sin6_addr; - *size = TCPOLEN_MPTCP_ADD_ADDR6; } #endif pr_debug("addr_id=%d, ahmac=%llu", opts->addr_id, opts->ahmac); @@ -607,10 +606,37 @@ static bool check_fully_established(struct mptcp_subflow_context *subflow, return true; } +static bool add_addr_hmac_valid(struct mptcp_sock *msk, + struct mptcp_options_received *mp_opt) +{ + u64 hmac = 0; + + if (mp_opt->echo) + return true; + + if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) + hmac = add_addr_generate_hmac(msk->remote_key, + msk->local_key, + mp_opt->addr_id, &mp_opt->addr); +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else + hmac = add_addr6_generate_hmac(msk->remote_key, + msk->local_key, + mp_opt->addr_id, &mp_opt->addr6); +#endif + + pr_debug("msk=%p, ahmac=%llu, mp_opt->ahmac=%llu\n", + msk, (unsigned long long)hmac, + (unsigned long long)mp_opt->ahmac); + + return hmac == mp_opt->ahmac; +} + void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, struct tcp_options_received *opt_rx) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct mptcp_options_received *mp_opt; struct mptcp_ext *mpext; @@ -618,6 +644,26 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, if (!check_fully_established(subflow, skb, mp_opt)) return; + if (mp_opt->add_addr && add_addr_hmac_valid(msk, mp_opt)) { + struct mptcp_addr_info addr; + + addr.port = htons(mp_opt->port); + addr.id = mp_opt->addr_id; + if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) { + addr.family = AF_INET; + addr.addr = mp_opt->addr; + } +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (mp_opt->family == MPTCP_ADDR_IPVERSION_6) { + addr.family = AF_INET6; + addr.addr6 = mp_opt->addr6; + } +#endif + if (!mp_opt->echo) + mptcp_pm_add_addr_received(msk, &addr); + mp_opt->add_addr = 0; + } + if (!mp_opt->dss) return; @@ -654,6 +700,8 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, } mpext->data_fin = mp_opt->data_fin; + + mptcp_pm_fully_established(msk); } void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c new file mode 100644 index 000000000000..ad837da0193d --- /dev/null +++ b/net/mptcp/pm.c @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP + * + * Copyright (c) 2019, Intel Corporation. + */ +#include +#include +#include +#include "protocol.h" + +static struct workqueue_struct *pm_wq; + +/* path manager command handlers */ + +int mptcp_pm_announce_addr(struct mptcp_sock *msk, + const struct mptcp_addr_info *addr) +{ + return -ENOTSUPP; +} + +int mptcp_pm_remove_addr(struct mptcp_sock *msk, u8 local_id) +{ + return -ENOTSUPP; +} + +int mptcp_pm_remove_subflow(struct mptcp_sock *msk, u8 remote_id) +{ + return -ENOTSUPP; +} + +/* path manager event handlers */ + +void mptcp_pm_new_connection(struct mptcp_sock *msk, int server_side) +{ + struct mptcp_pm_data *pm = &msk->pm; + + pr_debug("msk=%p, token=%u side=%d", msk, msk->token, server_side); + + WRITE_ONCE(pm->server_side, server_side); +} + +bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) +{ + pr_debug("msk=%p", msk); + return false; +} + +void mptcp_pm_fully_established(struct mptcp_sock *msk) +{ + pr_debug("msk=%p", msk); +} + +void mptcp_pm_connection_closed(struct mptcp_sock *msk) +{ + pr_debug("msk=%p", msk); +} + +void mptcp_pm_subflow_established(struct mptcp_sock *msk, + struct mptcp_subflow_context *subflow) +{ + pr_debug("msk=%p", msk); +} + +void mptcp_pm_subflow_closed(struct mptcp_sock *msk, u8 id) +{ + pr_debug("msk=%p", msk); +} + +void mptcp_pm_add_addr_received(struct mptcp_sock *msk, + const struct mptcp_addr_info *addr) +{ + pr_debug("msk=%p, remote_id=%d", msk, addr->id); +} + +/* path manager helpers */ + +bool mptcp_pm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, + struct mptcp_addr_info *saddr) +{ + return false; +} + +int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) +{ + return 0; +} + +static void pm_worker(struct work_struct *work) +{ +} + +void mptcp_pm_data_init(struct mptcp_sock *msk) +{ + msk->pm.add_addr_signaled = 0; + msk->pm.add_addr_accepted = 0; + msk->pm.local_addr_used = 0; + msk->pm.subflows = 0; + WRITE_ONCE(msk->pm.work_pending, false); + WRITE_ONCE(msk->pm.addr_signal, false); + WRITE_ONCE(msk->pm.accept_addr, false); + WRITE_ONCE(msk->pm.accept_subflow, false); + msk->pm.status = 0; + + spin_lock_init(&msk->pm.lock); + INIT_WORK(&msk->pm.work, pm_worker); +} + +void mptcp_pm_init(void) +{ + pm_wq = alloc_workqueue("pm_wq", WQ_UNBOUND | WQ_MEM_RECLAIM, 8); + if (!pm_wq) + panic("Failed to allocate workqueue"); +} diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index e959104832ef..4479ef04cfa7 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -703,6 +703,8 @@ static int __mptcp_init_sock(struct sock *sk) msk->first = NULL; inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; + mptcp_pm_data_init(msk); + return 0; } @@ -1059,6 +1061,8 @@ void mptcp_finish_connect(struct sock *ssk) inet_sk_state_store(sk, TCP_ESTABLISHED); sk->sk_state_change(sk); } + + mptcp_pm_new_connection(msk, 0); } static void mptcp_sock_graft(struct sock *sk, struct socket *parent) @@ -1381,6 +1385,7 @@ void mptcp_proto_init(void) mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo; mptcp_subflow_init(); + mptcp_pm_init(); if (proto_register(&mptcp_prot, 1) != 0) panic("Failed to register MPTCP proto.\n"); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 471e013d1c32..8d4761ae3951 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -84,6 +84,50 @@ static inline __be32 mptcp_option(u8 subopt, u8 len, u8 nib, u8 field) ((nib & 0xF) << 8) | field); } +#define MPTCP_PM_MAX_ADDR 4 + +struct mptcp_addr_info { + sa_family_t family; + __be16 port; + u8 id; + union { + struct in_addr addr; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + struct in6_addr addr6; +#endif + }; +}; + +enum mptcp_pm_status { + MPTCP_PM_ADD_ADDR_RECEIVED, + MPTCP_PM_ESTABLISHED, + MPTCP_PM_SUBFLOW_ESTABLISHED, +}; + +struct mptcp_pm_data { + struct mptcp_addr_info local; + struct mptcp_addr_info remote; + + spinlock_t lock; /*protects the whole PM data */ + + bool addr_signal; + bool server_side; + bool work_pending; + bool accept_addr; + bool accept_subflow; + u8 add_addr_signaled; + u8 add_addr_accepted; + u8 local_addr_used; + u8 subflows; + u8 add_addr_signal_max; + u8 add_addr_accept_max; + u8 local_addr_max; + u8 subflows_max; + u8 status; + + struct work_struct work; +}; + /* MPTCP connection sock */ struct mptcp_sock { /* inet_connection_sock must be the first member */ @@ -100,6 +144,7 @@ struct mptcp_sock { struct skb_ext *cached_ext; /* for the next sendmsg */ struct socket *subflow; /* outgoing connect/listener/!mp_capable */ struct sock *first; + struct mptcp_pm_data pm; }; #define mptcp_for_each_subflow(__msk, __subflow) \ @@ -116,6 +161,7 @@ struct mptcp_subflow_request_sock { mp_join : 1, backup : 1, remote_key_valid : 1; + u8 local_id; u64 local_key; u64 remote_key; u64 idsn; @@ -246,6 +292,39 @@ static inline void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn) void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac); +void mptcp_pm_init(void); +void mptcp_pm_data_init(struct mptcp_sock *msk); +void mptcp_pm_new_connection(struct mptcp_sock *msk, int server_side); +void mptcp_pm_fully_established(struct mptcp_sock *msk); +bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk); +void mptcp_pm_connection_closed(struct mptcp_sock *msk); +void mptcp_pm_subflow_established(struct mptcp_sock *msk, + struct mptcp_subflow_context *subflow); +void mptcp_pm_subflow_closed(struct mptcp_sock *msk, u8 id); +void mptcp_pm_add_addr_received(struct mptcp_sock *msk, + const struct mptcp_addr_info *addr); + +int mptcp_pm_announce_addr(struct mptcp_sock *msk, + const struct mptcp_addr_info *addr); +int mptcp_pm_remove_addr(struct mptcp_sock *msk, u8 local_id); +int mptcp_pm_remove_subflow(struct mptcp_sock *msk, u8 remote_id); + +static inline bool mptcp_pm_should_signal(struct mptcp_sock *msk) +{ + return READ_ONCE(msk->pm.addr_signal); +} + +static inline unsigned int mptcp_add_addr_len(int family) +{ + if (family == AF_INET) + return TCPOLEN_MPTCP_ADD_ADDR; + return TCPOLEN_MPTCP_ADD_ADDR6; +} + +bool mptcp_pm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, + struct mptcp_addr_info *saddr); +int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); + static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff *skb) { return (struct mptcp_ext *)skb_ext_find(skb, SKB_EXT_MPTCP); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index e1faa88855bf..35345b2b0c44 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -234,8 +234,8 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, /* new mpc subflow takes ownership of the newly * created mptcp socket */ - inet_sk_state_store((struct sock *)new_msk, - TCP_ESTABLISHED); + inet_sk_state_store(new_msk, TCP_ESTABLISHED); + mptcp_pm_new_connection(mptcp_sk(new_msk), 1); ctx->conn = new_msk; new_msk = NULL; } From patchwork Mon Mar 23 21:26:30 2020 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Mat Martineau X-Patchwork-Id: 222024 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-9.8 required=3.0 tests=HEADER_FROM_DIFFERENT_DOMAINS, INCLUDES_PATCH, MAILING_LIST_MULTI, SIGNED_OFF_BY, SPF_HELO_NONE, SPF_PASS, URIBL_BLOCKED, USER_AGENT_GIT autolearn=ham autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id 93010C54FCF for ; Mon, 23 Mar 2020 21:29:38 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by mail.kernel.org (Postfix) with ESMTP id 6A9C920719 for ; Mon, 23 Mar 2020 21:29:38 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1726986AbgCWV3h (ORCPT ); Mon, 23 Mar 2020 17:29:37 -0400 Received: from mga02.intel.com ([134.134.136.20]:60326 "EHLO mga02.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1727067AbgCWV3g (ORCPT ); Mon, 23 Mar 2020 17:29:36 -0400 IronPort-SDR: h9A4EgKpHMfAN+gdjtPUKPhuxS/mggnGf4mMJnfSSZ9tft30SOpP8HgmGTgaNpb7eGWSsL8aZa UxF/4EtK9ctQ== X-Amp-Result: SKIPPED(no attachment in message) X-Amp-File-Uploaded: False Received: from fmsmga005.fm.intel.com ([10.253.24.32]) by orsmga101.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 23 Mar 2020 14:29:35 -0700 IronPort-SDR: 82ySC63npCzVaTq9gWuSH3VhHz5XiQ9EeigWIenBHZSpQvLoJ2aWDAQ8aaGR3zPTmjwcfh+Jgo raECGguHnO6g== X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.72,297,1580803200"; d="scan'208";a="445960407" Received: from mjmartin-nuc02.mjmartin-nuc02 (HELO mjmartin-nuc02.sea.intel.com) ([10.254.100.76]) by fmsmga005.fm.intel.com with ESMTP; 23 Mar 2020 14:29:34 -0700 From: Mat Martineau To: netdev@vger.kernel.org Cc: Peter Krystad , eric.dumazet@gmail.com, Florian Westphal , Paolo Abeni , Mat Martineau Subject: [PATCH net-next 05/17] mptcp: Implement path manager interface commands Date: Mon, 23 Mar 2020 14:26:30 -0700 Message-Id: <20200323212642.34104-6-mathew.j.martineau@linux.intel.com> X-Mailer: git-send-email 2.26.0 In-Reply-To: <20200323212642.34104-1-mathew.j.martineau@linux.intel.com> References: <20200323212642.34104-1-mathew.j.martineau@linux.intel.com> MIME-Version: 1.0 Sender: netdev-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: netdev@vger.kernel.org From: Peter Krystad Fill in more path manager functionality by adding a worker function and modifying the related stub functions to schedule the worker. Co-developed-by: Florian Westphal Signed-off-by: Florian Westphal Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Signed-off-by: Peter Krystad Signed-off-by: Mat Martineau --- net/mptcp/pm.c | 132 +++++++++++++++++++++++++++++++++++++++++-- net/mptcp/protocol.c | 1 + net/mptcp/protocol.h | 1 + 3 files changed, 129 insertions(+), 5 deletions(-) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index ad837da0193d..3aedad58778c 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -15,7 +15,11 @@ static struct workqueue_struct *pm_wq; int mptcp_pm_announce_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr) { - return -ENOTSUPP; + pr_debug("msk=%p, local_id=%d", msk, addr->id); + + msk->pm.local = *addr; + WRITE_ONCE(msk->pm.addr_signal, true); + return 0; } int mptcp_pm_remove_addr(struct mptcp_sock *msk, u8 local_id) @@ -41,13 +45,58 @@ void mptcp_pm_new_connection(struct mptcp_sock *msk, int server_side) bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) { - pr_debug("msk=%p", msk); - return false; + struct mptcp_pm_data *pm = &msk->pm; + int ret; + + pr_debug("msk=%p subflows=%d max=%d allow=%d", msk, pm->subflows, + pm->subflows_max, READ_ONCE(pm->accept_subflow)); + + /* try to avoid acquiring the lock below */ + if (!READ_ONCE(pm->accept_subflow)) + return false; + + spin_lock_bh(&pm->lock); + ret = pm->subflows < pm->subflows_max; + if (ret && ++pm->subflows == pm->subflows_max) + WRITE_ONCE(pm->accept_subflow, false); + spin_unlock_bh(&pm->lock); + + return ret; +} + +/* return true if the new status bit is currently cleared, that is, this event + * can be server, eventually by an already scheduled work + */ +static bool mptcp_pm_schedule_work(struct mptcp_sock *msk, + enum mptcp_pm_status new_status) +{ + pr_debug("msk=%p status=%x new=%lx", msk, msk->pm.status, + BIT(new_status)); + if (msk->pm.status & BIT(new_status)) + return false; + + msk->pm.status |= BIT(new_status); + if (queue_work(pm_wq, &msk->pm.work)) + sock_hold((struct sock *)msk); + return true; } void mptcp_pm_fully_established(struct mptcp_sock *msk) { + struct mptcp_pm_data *pm = &msk->pm; + pr_debug("msk=%p", msk); + + /* try to avoid acquiring the lock below */ + if (!READ_ONCE(pm->work_pending)) + return; + + spin_lock_bh(&pm->lock); + + if (READ_ONCE(pm->work_pending)) + mptcp_pm_schedule_work(msk, MPTCP_PM_ESTABLISHED); + + spin_unlock_bh(&pm->lock); } void mptcp_pm_connection_closed(struct mptcp_sock *msk) @@ -58,7 +107,19 @@ void mptcp_pm_connection_closed(struct mptcp_sock *msk) void mptcp_pm_subflow_established(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow) { + struct mptcp_pm_data *pm = &msk->pm; + pr_debug("msk=%p", msk); + + if (!READ_ONCE(pm->work_pending)) + return; + + spin_lock_bh(&pm->lock); + + if (READ_ONCE(pm->work_pending)) + mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED); + + spin_unlock_bh(&pm->lock); } void mptcp_pm_subflow_closed(struct mptcp_sock *msk, u8 id) @@ -69,7 +130,23 @@ void mptcp_pm_subflow_closed(struct mptcp_sock *msk, u8 id) void mptcp_pm_add_addr_received(struct mptcp_sock *msk, const struct mptcp_addr_info *addr) { - pr_debug("msk=%p, remote_id=%d", msk, addr->id); + struct mptcp_pm_data *pm = &msk->pm; + + pr_debug("msk=%p remote_id=%d accept=%d", msk, addr->id, + READ_ONCE(pm->accept_addr)); + + /* avoid acquiring the lock if there is no room for fouther addresses */ + if (!READ_ONCE(pm->accept_addr)) + return; + + spin_lock_bh(&pm->lock); + + /* be sure there is something to signal re-checking under PM lock */ + if (READ_ONCE(pm->accept_addr) && + mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) + pm->remote = *addr; + + spin_unlock_bh(&pm->lock); } /* path manager helpers */ @@ -77,7 +154,24 @@ void mptcp_pm_add_addr_received(struct mptcp_sock *msk, bool mptcp_pm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, struct mptcp_addr_info *saddr) { - return false; + int ret = false; + + spin_lock_bh(&msk->pm.lock); + + /* double check after the lock is acquired */ + if (!mptcp_pm_should_signal(msk)) + goto out_unlock; + + if (remaining < mptcp_add_addr_len(msk->pm.local.family)) + goto out_unlock; + + *saddr = msk->pm.local; + WRITE_ONCE(msk->pm.addr_signal, false); + ret = true; + +out_unlock: + spin_unlock_bh(&msk->pm.lock); + return ret; } int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) @@ -87,6 +181,28 @@ int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) static void pm_worker(struct work_struct *work) { + struct mptcp_pm_data *pm = container_of(work, struct mptcp_pm_data, + work); + struct mptcp_sock *msk = container_of(pm, struct mptcp_sock, pm); + struct sock *sk = (struct sock *)msk; + + lock_sock(sk); + spin_lock_bh(&msk->pm.lock); + + pr_debug("msk=%p status=%x", msk, pm->status); + if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) { + pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED); + } + if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) { + pm->status &= ~BIT(MPTCP_PM_ESTABLISHED); + } + if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) { + pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED); + } + + spin_unlock_bh(&msk->pm.lock); + release_sock(sk); + sock_put(sk); } void mptcp_pm_data_init(struct mptcp_sock *msk) @@ -105,6 +221,12 @@ void mptcp_pm_data_init(struct mptcp_sock *msk) INIT_WORK(&msk->pm.work, pm_worker); } +void mptcp_pm_close(struct mptcp_sock *msk) +{ + if (cancel_work_sync(&msk->pm.work)) + sock_put((struct sock *)msk); +} + void mptcp_pm_init(void) { pm_wq = alloc_workqueue("pm_wq", WQ_UNBOUND | WQ_MEM_RECLAIM, 8); diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 84c28b4326ff..2f69d83c15e7 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -833,6 +833,7 @@ static void mptcp_close(struct sock *sk, long timeout) } mptcp_cancel_work(sk); + mptcp_pm_close(msk); __skb_queue_purge(&sk->sk_receive_queue); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index df134ac91274..209bdaa43dda 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -330,6 +330,7 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac); void mptcp_pm_init(void); void mptcp_pm_data_init(struct mptcp_sock *msk); +void mptcp_pm_close(struct mptcp_sock *msk); void mptcp_pm_new_connection(struct mptcp_sock *msk, int server_side); void mptcp_pm_fully_established(struct mptcp_sock *msk); bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk); From patchwork Mon Mar 23 21:26:31 2020 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Mat Martineau X-Patchwork-Id: 222018 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-9.8 required=3.0 tests=HEADER_FROM_DIFFERENT_DOMAINS, INCLUDES_PATCH, MAILING_LIST_MULTI, SIGNED_OFF_BY, SPF_HELO_NONE, SPF_PASS, URIBL_BLOCKED, USER_AGENT_GIT autolearn=ham autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id 56FFCC43331 for ; Mon, 23 Mar 2020 21:29:57 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by mail.kernel.org (Postfix) with ESMTP id 370F220719 for ; Mon, 23 Mar 2020 21:29:57 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1727231AbgCWV34 (ORCPT ); Mon, 23 Mar 2020 17:29:56 -0400 Received: from mga02.intel.com ([134.134.136.20]:60326 "EHLO mga02.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1727107AbgCWV3h (ORCPT ); Mon, 23 Mar 2020 17:29:37 -0400 IronPort-SDR: usjHtMRWnDAxUHCQzc+ZYPxC3b0/u6bGWvgu6ALy1u4KOnm15EGd7CYrPDiZp7Co9UlBen5Vhv a1gSfThkz7QA== X-Amp-Result: SKIPPED(no attachment in message) X-Amp-File-Uploaded: False Received: from fmsmga005.fm.intel.com ([10.253.24.32]) by orsmga101.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 23 Mar 2020 14:29:36 -0700 IronPort-SDR: wiPM7QmVxNNawbcsIizVKuR/je5Ix2B92LFyxvj1LmZWBAW1FhIzX85cv0i4vxZ38G/mYeRRg1 z3B55jKrqX3A== X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.72,297,1580803200"; d="scan'208";a="445960409" Received: from mjmartin-nuc02.mjmartin-nuc02 (HELO mjmartin-nuc02.sea.intel.com) ([10.254.100.76]) by fmsmga005.fm.intel.com with ESMTP; 23 Mar 2020 14:29:35 -0700 From: Mat Martineau To: netdev@vger.kernel.org Cc: Paolo Abeni , eric.dumazet@gmail.com, Florian Westphal , Mat Martineau Subject: [PATCH net-next 06/17] mptcp: update per unacked sequence on pkt reception Date: Mon, 23 Mar 2020 14:26:31 -0700 Message-Id: <20200323212642.34104-7-mathew.j.martineau@linux.intel.com> X-Mailer: git-send-email 2.26.0 In-Reply-To: <20200323212642.34104-1-mathew.j.martineau@linux.intel.com> References: <20200323212642.34104-1-mathew.j.martineau@linux.intel.com> MIME-Version: 1.0 Sender: netdev-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: netdev@vger.kernel.org From: Paolo Abeni So that we keep per unacked sequence number consistent; since we update per msk data, use an atomic64 cmpxchg() to protect against concurrent updates from multiple subflows. Initialize the snd_una at connect()/accept() time. Co-developed-by: Florian Westphal Signed-off-by: Florian Westphal Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau --- net/mptcp/options.c | 52 +++++++++++++++++++++++++++++++++++++++----- net/mptcp/protocol.c | 2 ++ net/mptcp/protocol.h | 1 + 3 files changed, 49 insertions(+), 6 deletions(-) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 20ba00865c55..b0ff8ad702a3 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -744,6 +744,46 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk, return true; } +static u64 expand_ack(u64 old_ack, u64 cur_ack, bool use_64bit) +{ + u32 old_ack32, cur_ack32; + + if (use_64bit) + return cur_ack; + + old_ack32 = (u32)old_ack; + cur_ack32 = (u32)cur_ack; + cur_ack = (old_ack & GENMASK_ULL(63, 32)) + cur_ack32; + if (unlikely(before(cur_ack32, old_ack32))) + return cur_ack + (1LL << 32); + return cur_ack; +} + +static void update_una(struct mptcp_sock *msk, + struct mptcp_options_received *mp_opt) +{ + u64 new_snd_una, snd_una, old_snd_una = atomic64_read(&msk->snd_una); + u64 write_seq = READ_ONCE(msk->write_seq); + + /* avoid ack expansion on update conflict, to reduce the risk of + * wrongly expanding to a future ack sequence number, which is way + * more dangerous than missing an ack + */ + new_snd_una = expand_ack(old_snd_una, mp_opt->data_ack, mp_opt->ack64); + + /* ACK for data not even sent yet? Ignore. */ + if (after64(new_snd_una, write_seq)) + new_snd_una = old_snd_una; + + while (after64(new_snd_una, old_snd_una)) { + snd_una = old_snd_una; + old_snd_una = atomic64_cmpxchg(&msk->snd_una, snd_una, + new_snd_una); + if (old_snd_una == snd_una) + break; + } +} + static bool add_addr_hmac_valid(struct mptcp_sock *msk, struct mptcp_options_received *mp_opt) { @@ -805,6 +845,12 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, if (!mp_opt->dss) return; + /* we can't wait for recvmsg() to update the ack_seq, otherwise + * monodirectional flows will stuck + */ + if (mp_opt->use_ack) + update_una(msk, mp_opt); + mpext = skb_ext_add(skb, SKB_EXT_MPTCP); if (!mpext) return; @@ -831,12 +877,6 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, mpext->use_map = 1; } - if (mp_opt->use_ack) { - mpext->data_ack = mp_opt->data_ack; - mpext->use_ack = 1; - mpext->ack64 = mp_opt->ack64; - } - mpext->data_fin = mp_opt->data_fin; } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 2f69d83c15e7..297fe460be09 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -906,6 +906,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk, struct request_sock *req) } msk->write_seq = subflow_req->idsn + 1; + atomic64_set(&msk->snd_una, msk->write_seq); if (subflow_req->remote_key_valid) { msk->can_ack = true; msk->remote_key = subflow_req->remote_key; @@ -1107,6 +1108,7 @@ void mptcp_finish_connect(struct sock *ssk) WRITE_ONCE(msk->write_seq, subflow->idsn + 1); WRITE_ONCE(msk->ack_seq, ack_seq); WRITE_ONCE(msk->can_ack, 1); + atomic64_set(&msk->snd_una, msk->write_seq); if (inet_sk_state_load(sk) != TCP_ESTABLISHED) { inet_sk_state_store(sk, TCP_ESTABLISHED); sk->sk_state_change(sk); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 209bdaa43dda..29db05467cc3 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -147,6 +147,7 @@ struct mptcp_sock { u64 remote_key; u64 write_seq; u64 ack_seq; + atomic64_t snd_una; u32 token; unsigned long flags; bool can_ack; From patchwork Mon Mar 23 21:26:36 2020 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Mat Martineau X-Patchwork-Id: 222019 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-9.8 required=3.0 tests=HEADER_FROM_DIFFERENT_DOMAINS, INCLUDES_PATCH, MAILING_LIST_MULTI, SIGNED_OFF_BY, SPF_HELO_NONE, SPF_PASS, URIBL_BLOCKED, USER_AGENT_GIT autolearn=ham autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id 06F0CC4332B for ; Mon, 23 Mar 2020 21:29:54 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by mail.kernel.org (Postfix) with ESMTP id CF21F2073E for ; Mon, 23 Mar 2020 21:29:53 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1727227AbgCWV3x (ORCPT ); Mon, 23 Mar 2020 17:29:53 -0400 Received: from mga02.intel.com ([134.134.136.20]:60326 "EHLO mga02.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1727122AbgCWV3j (ORCPT ); Mon, 23 Mar 2020 17:29:39 -0400 IronPort-SDR: m7dxF8RbwVfQswhYksDnAtt18zmvYniPkmSSQla7KQXwZ1OVmngLqRfSdzC5x2Id2d4VfaVJhq y3VRIB/X05+g== X-Amp-Result: SKIPPED(no attachment in message) X-Amp-File-Uploaded: False Received: from fmsmga005.fm.intel.com ([10.253.24.32]) by orsmga101.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 23 Mar 2020 14:29:39 -0700 IronPort-SDR: NyUufOcCcCZsRIjruCGwRHp6+SysufOhAAOpxWB9hmlSmPDuW4v0XPNUAYqJo37cXN4iiH+/BN kj/8fFXQKh8w== X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.72,297,1580803200"; d="scan'208";a="445960428" Received: from mjmartin-nuc02.mjmartin-nuc02 (HELO mjmartin-nuc02.sea.intel.com) ([10.254.100.76]) by fmsmga005.fm.intel.com with ESMTP; 23 Mar 2020 14:29:38 -0700 From: Mat Martineau To: netdev@vger.kernel.org Cc: Paolo Abeni , eric.dumazet@gmail.com, Florian Westphal , Mat Martineau Subject: [PATCH net-next 11/17] mptcp: rework mptcp_sendmsg_frag to accept optional dfrag Date: Mon, 23 Mar 2020 14:26:36 -0700 Message-Id: <20200323212642.34104-12-mathew.j.martineau@linux.intel.com> X-Mailer: git-send-email 2.26.0 In-Reply-To: <20200323212642.34104-1-mathew.j.martineau@linux.intel.com> References: <20200323212642.34104-1-mathew.j.martineau@linux.intel.com> MIME-Version: 1.0 Sender: netdev-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: netdev@vger.kernel.org From: Paolo Abeni This will simplify mptcp-level retransmission implementation in the next patch. If dfrag is provided by the caller, skip kernel space memory allocation and use data and metadata provided by the dfrag itself. Because a peer could ack data at TCP level but refrain from sending mptcp-level ACKs, we could grow the mptcp socket backlog indefinitely. We should thus block mptcp_sendmsg until the peer has acked some of the sent data. In order to be able to do so, increment the mptcp socket wmem_queued counter on memory allocation and decrement it when releasing the memory on mptcp-level ack reception. Because TCP performns sndbuf auto-tuning up to tcp_wmem_max[2], make this the mptcp sk_sndbuf limit. In the future we could add experiment with autotuning as TCP does in tcp_sndbuf_expand(). Co-developed-by: Florian Westphal Signed-off-by: Florian Westphal Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau --- net/mptcp/protocol.c | 119 ++++++++++++++++++++++++++----------------- 1 file changed, 72 insertions(+), 47 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index d4b6ce984576..c7a333a6d917 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -316,7 +316,7 @@ static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk) return NULL; } -static inline bool mptcp_skb_can_collapse_to(const struct mptcp_sock *msk, +static inline bool mptcp_skb_can_collapse_to(u64 write_seq, const struct sk_buff *skb, const struct mptcp_ext *mpext) { @@ -324,7 +324,7 @@ static inline bool mptcp_skb_can_collapse_to(const struct mptcp_sock *msk, return false; /* can collapse only if MPTCP level sequence is in order */ - return mpext && mpext->data_seq + mpext->data_len == msk->write_seq; + return mpext && mpext->data_seq + mpext->data_len == write_seq; } static inline bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk, @@ -417,23 +417,28 @@ mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag, } static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, - struct msghdr *msg, long *timeo, int *pmss_now, + struct msghdr *msg, struct mptcp_data_frag *dfrag, + long *timeo, int *pmss_now, int *ps_goal) { int mss_now, avail_size, size_goal, offset, ret, frag_truesize = 0; bool dfrag_collapsed, can_collapse = false; struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_ext *mpext = NULL; - struct mptcp_data_frag *dfrag; + bool retransmission = !!dfrag; struct sk_buff *skb, *tail; struct page_frag *pfrag; + struct page *page; + u64 *write_seq; size_t psize; /* use the mptcp page cache so that we can easily move the data * from one substream to another, but do per subflow memory accounting + * Note: pfrag is used only !retransmission, but the compiler if + * fooled into a warning if we don't init here */ pfrag = sk_page_frag(sk); - while (!mptcp_page_frag_refill(ssk, pfrag) || + while ((!retransmission && !mptcp_page_frag_refill(ssk, pfrag)) || !mptcp_ext_cache_refill(msk)) { ret = sk_stream_wait_memory(ssk, timeo); if (ret) @@ -447,6 +452,13 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, if (unlikely(__mptcp_needs_tcp_fallback(msk))) return 0; } + if (!retransmission) { + write_seq = &msk->write_seq; + page = pfrag->page; + } else { + write_seq = &dfrag->data_seq; + page = dfrag->page; + } /* compute copy limit */ mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags); @@ -464,63 +476,74 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, * SSN association set here */ can_collapse = (size_goal - skb->len > 0) && - mptcp_skb_can_collapse_to(msk, skb, mpext); + mptcp_skb_can_collapse_to(*write_seq, skb, mpext); if (!can_collapse) TCP_SKB_CB(skb)->eor = 1; else avail_size = size_goal - skb->len; } - /* reuse tail pfrag, if possible, or carve a new one from the page - * allocator - */ - dfrag = mptcp_rtx_tail(sk); - offset = pfrag->offset; - dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag); - if (!dfrag_collapsed) { - dfrag = mptcp_carve_data_frag(msk, pfrag, offset); + if (!retransmission) { + /* reuse tail pfrag, if possible, or carve a new one from the + * page allocator + */ + dfrag = mptcp_rtx_tail(sk); + offset = pfrag->offset; + dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag); + if (!dfrag_collapsed) { + dfrag = mptcp_carve_data_frag(msk, pfrag, offset); + offset = dfrag->offset; + frag_truesize = dfrag->overhead; + } + psize = min_t(size_t, pfrag->size - offset, avail_size); + + /* Copy to page */ + pr_debug("left=%zu", msg_data_left(msg)); + psize = copy_page_from_iter(pfrag->page, offset, + min_t(size_t, msg_data_left(msg), + psize), + &msg->msg_iter); + pr_debug("left=%zu", msg_data_left(msg)); + if (!psize) + return -EINVAL; + + if (!sk_wmem_schedule(sk, psize + dfrag->overhead)) + return -ENOMEM; + } else { offset = dfrag->offset; - frag_truesize = dfrag->overhead; + psize = min_t(size_t, dfrag->data_len, avail_size); } - psize = min_t(size_t, pfrag->size - offset, avail_size); - - /* Copy to page */ - pr_debug("left=%zu", msg_data_left(msg)); - psize = copy_page_from_iter(pfrag->page, offset, - min_t(size_t, msg_data_left(msg), psize), - &msg->msg_iter); - pr_debug("left=%zu", msg_data_left(msg)); - if (!psize) - return -EINVAL; - - if (!sk_wmem_schedule(sk, psize + dfrag->overhead)) - return -ENOMEM; /* tell the TCP stack to delay the push so that we can safely * access the skb after the sendpages call */ - ret = do_tcp_sendpages(ssk, pfrag->page, offset, psize, + ret = do_tcp_sendpages(ssk, page, offset, psize, msg->msg_flags | MSG_SENDPAGE_NOTLAST); if (ret <= 0) return ret; frag_truesize += ret; - if (unlikely(ret < psize)) - iov_iter_revert(&msg->msg_iter, psize - ret); + if (!retransmission) { + if (unlikely(ret < psize)) + iov_iter_revert(&msg->msg_iter, psize - ret); - /* send successful, keep track of sent data for mptcp-level - * retransmission - */ - dfrag->data_len += ret; - if (!dfrag_collapsed) { - get_page(dfrag->page); - list_add_tail(&dfrag->list, &msk->rtx_queue); - } + /* send successful, keep track of sent data for mptcp-level + * retransmission + */ + dfrag->data_len += ret; + if (!dfrag_collapsed) { + get_page(dfrag->page); + list_add_tail(&dfrag->list, &msk->rtx_queue); + sk_wmem_queued_add(sk, frag_truesize); + } else { + sk_wmem_queued_add(sk, ret); + } - /* charge data on mptcp rtx queue to the master socket - * Note: we charge such data both to sk and ssk - */ - sk->sk_forward_alloc -= frag_truesize; + /* charge data on mptcp rtx queue to the master socket + * Note: we charge such data both to sk and ssk + */ + sk->sk_forward_alloc -= frag_truesize; + } /* if the tail skb extension is still the cached one, collapsing * really happened. Note: we can't check for 'same skb' as the sk_buff @@ -539,7 +562,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, msk->cached_ext = NULL; memset(mpext, 0, sizeof(*mpext)); - mpext->data_seq = msk->write_seq; + mpext->data_seq = *write_seq; mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq; mpext->data_len = ret; mpext->use_map = 1; @@ -550,8 +573,9 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, mpext->dsn64); out: - pfrag->offset += frag_truesize; - msk->write_seq += ret; + if (!retransmission) + pfrag->offset += frag_truesize; + *write_seq += ret; mptcp_subflow_ctx(ssk)->rel_write_seq += ret; return ret; @@ -663,7 +687,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) lock_sock(ssk); while (msg_data_left(msg)) { - ret = mptcp_sendmsg_frag(sk, ssk, msg, &timeo, &mss_now, + ret = mptcp_sendmsg_frag(sk, ssk, msg, NULL, &timeo, &mss_now, &size_goal); if (ret < 0) break; @@ -974,6 +998,7 @@ static int mptcp_init_sock(struct sock *sk) return ret; sk_sockets_allocated_inc(sk); + sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[2]; if (!mptcp_is_enabled(sock_net(sk))) return -ENOPROTOOPT; From patchwork Mon Mar 23 21:26:38 2020 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Mat Martineau X-Patchwork-Id: 222021 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-14.8 required=3.0 tests=HEADER_FROM_DIFFERENT_DOMAINS, INCLUDES_PATCH, MAILING_LIST_MULTI, MENTIONS_GIT_HOSTING, SIGNED_OFF_BY, SPF_HELO_NONE, SPF_PASS, URIBL_BLOCKED, USER_AGENT_GIT autolearn=ham autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id ABBA8C10DCE for ; Mon, 23 Mar 2020 21:29:47 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by mail.kernel.org (Postfix) with ESMTP id 824142073E for ; Mon, 23 Mar 2020 21:29:47 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1727177AbgCWV3q (ORCPT ); Mon, 23 Mar 2020 17:29:46 -0400 Received: from mga02.intel.com ([134.134.136.20]:60326 "EHLO mga02.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1727036AbgCWV3k (ORCPT ); Mon, 23 Mar 2020 17:29:40 -0400 IronPort-SDR: N3Qdjzw+kbzpesC0McNoKrqf1fhC54ETcTgvqEr9YrbrASlCf5DdiCDCxxAytdk/iOyuUbit2E 1kvRal9XOiNQ== X-Amp-Result: SKIPPED(no attachment in message) X-Amp-File-Uploaded: False Received: from fmsmga005.fm.intel.com ([10.253.24.32]) by orsmga101.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 23 Mar 2020 14:29:39 -0700 IronPort-SDR: 17+zermOTxCDBUtCSWRkU6coPOd5qQmIMHiTzKD2AQ4rxj7dn9LuQ0arXzb8VdE7bYSPuIJql1 nHYfb2vHemAw== X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.72,297,1580803200"; d="scan'208";a="445960432" Received: from mjmartin-nuc02.mjmartin-nuc02 (HELO mjmartin-nuc02.sea.intel.com) ([10.254.100.76]) by fmsmga005.fm.intel.com with ESMTP; 23 Mar 2020 14:29:38 -0700 From: Mat Martineau To: netdev@vger.kernel.org Cc: Davide Caratti , eric.dumazet@gmail.com, Matthieu Baerts , Paolo Abeni , Mat Martineau Subject: [PATCH net-next 13/17] mptcp: allow dumping subflow context to userspace Date: Mon, 23 Mar 2020 14:26:38 -0700 Message-Id: <20200323212642.34104-14-mathew.j.martineau@linux.intel.com> X-Mailer: git-send-email 2.26.0 In-Reply-To: <20200323212642.34104-1-mathew.j.martineau@linux.intel.com> References: <20200323212642.34104-1-mathew.j.martineau@linux.intel.com> MIME-Version: 1.0 Sender: netdev-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: netdev@vger.kernel.org From: Davide Caratti add ulp-specific diagnostic functions, so that subflow information can be dumped to userspace programs like 'ss'. Co-developed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Signed-off-by: Davide Caratti Signed-off-by: Mat Martineau --- MAINTAINERS | 1 + include/uapi/linux/inet_diag.h | 1 + include/uapi/linux/mptcp.h | 34 +++++++++++ net/mptcp/Makefile | 2 +- net/mptcp/diag.c | 104 +++++++++++++++++++++++++++++++++ net/mptcp/protocol.h | 2 + net/mptcp/subflow.c | 2 + 7 files changed, 145 insertions(+), 1 deletion(-) create mode 100644 include/uapi/linux/mptcp.h create mode 100644 net/mptcp/diag.c diff --git a/MAINTAINERS b/MAINTAINERS index 97dce264bc7c..8dfafdfd6d73 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11714,6 +11714,7 @@ W: https://github.com/multipath-tcp/mptcp_net-next/wiki B: https://github.com/multipath-tcp/mptcp_net-next/issues S: Maintained F: include/net/mptcp.h +F: include/uapi/linux/mptcp.h F: net/mptcp/ F: tools/testing/selftests/net/mptcp/ diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index 75dffd78363a..57cc429a9177 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -166,6 +166,7 @@ enum { INET_ULP_INFO_UNSPEC, INET_ULP_INFO_NAME, INET_ULP_INFO_TLS, + INET_ULP_INFO_MPTCP, __INET_ULP_INFO_MAX, }; #define INET_ULP_INFO_MAX (__INET_ULP_INFO_MAX - 1) diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h new file mode 100644 index 000000000000..c564140d20f0 --- /dev/null +++ b/include/uapi/linux/mptcp.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +#ifndef _UAPI_MPTCP_H +#define _UAPI_MPTCP_H + +#include + +#define MPTCP_SUBFLOW_FLAG_MCAP_REM BIT(0) +#define MPTCP_SUBFLOW_FLAG_MCAP_LOC BIT(1) +#define MPTCP_SUBFLOW_FLAG_JOIN_REM BIT(2) +#define MPTCP_SUBFLOW_FLAG_JOIN_LOC BIT(3) +#define MPTCP_SUBFLOW_FLAG_BKUP_REM BIT(4) +#define MPTCP_SUBFLOW_FLAG_BKUP_LOC BIT(5) +#define MPTCP_SUBFLOW_FLAG_FULLY_ESTABLISHED BIT(6) +#define MPTCP_SUBFLOW_FLAG_CONNECTED BIT(7) +#define MPTCP_SUBFLOW_FLAG_MAPVALID BIT(8) + +enum { + MPTCP_SUBFLOW_ATTR_UNSPEC, + MPTCP_SUBFLOW_ATTR_TOKEN_REM, + MPTCP_SUBFLOW_ATTR_TOKEN_LOC, + MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ, + MPTCP_SUBFLOW_ATTR_MAP_SEQ, + MPTCP_SUBFLOW_ATTR_MAP_SFSEQ, + MPTCP_SUBFLOW_ATTR_SSN_OFFSET, + MPTCP_SUBFLOW_ATTR_MAP_DATALEN, + MPTCP_SUBFLOW_ATTR_FLAGS, + MPTCP_SUBFLOW_ATTR_ID_REM, + MPTCP_SUBFLOW_ATTR_ID_LOC, + MPTCP_SUBFLOW_ATTR_PAD, + __MPTCP_SUBFLOW_ATTR_MAX +}; + +#define MPTCP_SUBFLOW_ATTR_MAX (__MPTCP_SUBFLOW_ATTR_MAX - 1) +#endif /* _UAPI_MPTCP_H */ diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile index 2848d723c252..54494cf5bec0 100644 --- a/net/mptcp/Makefile +++ b/net/mptcp/Makefile @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_MPTCP) += mptcp.o -mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o +mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o diff --git a/net/mptcp/diag.c b/net/mptcp/diag.c new file mode 100644 index 000000000000..a536586742f2 --- /dev/null +++ b/net/mptcp/diag.c @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0 +/* MPTCP socket monitoring support + * + * Copyright (c) 2019 Red Hat + * + * Author: Davide Caratti + */ + +#include +#include +#include +#include +#include +#include "protocol.h" + +static int subflow_get_info(const struct sock *sk, struct sk_buff *skb) +{ + struct mptcp_subflow_context *sf; + struct nlattr *start; + u32 flags = 0; + int err; + + start = nla_nest_start_noflag(skb, INET_ULP_INFO_MPTCP); + if (!start) + return -EMSGSIZE; + + rcu_read_lock(); + sf = rcu_dereference(inet_csk(sk)->icsk_ulp_data); + if (!sf) { + err = 0; + goto nla_failure; + } + + if (sf->mp_capable) + flags |= MPTCP_SUBFLOW_FLAG_MCAP_REM; + if (sf->request_mptcp) + flags |= MPTCP_SUBFLOW_FLAG_MCAP_LOC; + if (sf->mp_join) + flags |= MPTCP_SUBFLOW_FLAG_JOIN_REM; + if (sf->request_join) + flags |= MPTCP_SUBFLOW_FLAG_JOIN_LOC; + if (sf->backup) + flags |= MPTCP_SUBFLOW_FLAG_BKUP_REM; + if (sf->request_bkup) + flags |= MPTCP_SUBFLOW_FLAG_BKUP_LOC; + if (sf->fully_established) + flags |= MPTCP_SUBFLOW_FLAG_FULLY_ESTABLISHED; + if (sf->conn_finished) + flags |= MPTCP_SUBFLOW_FLAG_CONNECTED; + if (sf->map_valid) + flags |= MPTCP_SUBFLOW_FLAG_MAPVALID; + + if (nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_TOKEN_REM, sf->remote_token) || + nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_TOKEN_LOC, sf->token) || + nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ, + sf->rel_write_seq) || + nla_put_u64_64bit(skb, MPTCP_SUBFLOW_ATTR_MAP_SEQ, sf->map_seq, + MPTCP_SUBFLOW_ATTR_PAD) || + nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_MAP_SFSEQ, + sf->map_subflow_seq) || + nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_SSN_OFFSET, sf->ssn_offset) || + nla_put_u16(skb, MPTCP_SUBFLOW_ATTR_MAP_DATALEN, + sf->map_data_len) || + nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_FLAGS, flags) || + nla_put_u8(skb, MPTCP_SUBFLOW_ATTR_ID_REM, sf->remote_id) || + nla_put_u8(skb, MPTCP_SUBFLOW_ATTR_ID_LOC, sf->local_id)) { + err = -EMSGSIZE; + goto nla_failure; + } + + rcu_read_unlock(); + nla_nest_end(skb, start); + return 0; + +nla_failure: + rcu_read_unlock(); + nla_nest_cancel(skb, start); + return err; +} + +static size_t subflow_get_info_size(const struct sock *sk) +{ + size_t size = 0; + + size += nla_total_size(0) + /* INET_ULP_INFO_MPTCP */ + nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_TOKEN_REM */ + nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_TOKEN_LOC */ + nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ */ + nla_total_size_64bit(8) + /* MPTCP_SUBFLOW_ATTR_MAP_SEQ */ + nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_MAP_SFSEQ */ + nla_total_size(2) + /* MPTCP_SUBFLOW_ATTR_SSN_OFFSET */ + nla_total_size(2) + /* MPTCP_SUBFLOW_ATTR_MAP_DATALEN */ + nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_FLAGS */ + nla_total_size(1) + /* MPTCP_SUBFLOW_ATTR_ID_REM */ + nla_total_size(1) + /* MPTCP_SUBFLOW_ATTR_ID_LOC */ + 0; + return size; +} + +void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops) +{ + ops->get_info = subflow_get_info; + ops->get_info_size = subflow_get_info_size; +} diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index e9d4a852c7f1..0999f74df027 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -408,4 +408,6 @@ static inline bool before64(__u64 seq1, __u64 seq2) #define after64(seq2, seq1) before64(seq1, seq2) +void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops); + #endif /* __MPTCP_PROTOCOL_H */ diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index bf58b599a820..5ed524ae88ba 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1142,6 +1142,8 @@ void mptcp_subflow_init(void) subflow_v6m_specific.net_frag_header_len = 0; #endif + mptcp_diag_subflow_init(&subflow_ulp_ops); + if (tcp_register_ulp(&subflow_ulp_ops) != 0) panic("MPTCP: failed to register subflows to ULP\n"); } From patchwork Mon Mar 23 21:26:39 2020 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Mat Martineau X-Patchwork-Id: 222023 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-9.8 required=3.0 tests=HEADER_FROM_DIFFERENT_DOMAINS, INCLUDES_PATCH, MAILING_LIST_MULTI, SIGNED_OFF_BY, SPF_HELO_NONE, SPF_PASS, URIBL_BLOCKED, USER_AGENT_GIT autolearn=ham autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id 6B4D9C4332B for ; Mon, 23 Mar 2020 21:29:42 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by mail.kernel.org (Postfix) with ESMTP id 3D00920719 for ; Mon, 23 Mar 2020 21:29:42 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1727150AbgCWV3l (ORCPT ); Mon, 23 Mar 2020 17:29:41 -0400 Received: from mga01.intel.com ([192.55.52.88]:27337 "EHLO mga01.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1727117AbgCWV3k (ORCPT ); Mon, 23 Mar 2020 17:29:40 -0400 IronPort-SDR: Xe/9LCq7T7YMgsI5zye41afzVXEAzvXF9t968Zxu35vv+W8obRiruCYXwwHtrzzxnGY4X+TJTb 3CPS1sMWHOtg== X-Amp-Result: SKIPPED(no attachment in message) X-Amp-File-Uploaded: False Received: from fmsmga005.fm.intel.com ([10.253.24.32]) by fmsmga101.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 23 Mar 2020 14:29:39 -0700 IronPort-SDR: uMG5c6wGJ7yMlQuSCpHMAMlchXZPDGd/WaoYu1um4BKid1hOF8/ZShf+4YyDUDIdtUbr5xxBF7 dEqKyAc3ChQw== X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.72,297,1580803200"; d="scan'208";a="445960434" Received: from mjmartin-nuc02.mjmartin-nuc02 (HELO mjmartin-nuc02.sea.intel.com) ([10.254.100.76]) by fmsmga005.fm.intel.com with ESMTP; 23 Mar 2020 14:29:39 -0700 From: Mat Martineau To: netdev@vger.kernel.org Cc: Florian Westphal , eric.dumazet@gmail.com, Paolo Abeni , Mat Martineau Subject: [PATCH net-next 14/17] mptcp: add and use MIB counter infrastructure Date: Mon, 23 Mar 2020 14:26:39 -0700 Message-Id: <20200323212642.34104-15-mathew.j.martineau@linux.intel.com> X-Mailer: git-send-email 2.26.0 In-Reply-To: <20200323212642.34104-1-mathew.j.martineau@linux.intel.com> References: <20200323212642.34104-1-mathew.j.martineau@linux.intel.com> MIME-Version: 1.0 Sender: netdev-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: netdev@vger.kernel.org From: Florian Westphal Exported via same /proc file as the Linux TCP MIB counters, so "netstat -s" or "nstat" will show them automatically. The MPTCP MIB counters are allocated in a distinct pcpu area in order to avoid bloating/wasting TCP pcpu memory. Counters are allocated once the first MPTCP socket is created in a network namespace and free'd on exit. If no sockets have been allocated, all-zero mptcp counters are shown. The MIB counter list is taken from the multipath-tcp.org kernel, but only a few counters have been picked up so far. The counter list can be increased at any time later on. Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau --- include/net/mptcp.h | 4 +++ include/net/netns/mib.h | 3 ++ net/ipv4/af_inet.c | 4 +++ net/ipv4/proc.c | 2 ++ net/mptcp/Makefile | 2 +- net/mptcp/mib.c | 69 +++++++++++++++++++++++++++++++++++++++++ net/mptcp/mib.h | 40 ++++++++++++++++++++++++ net/mptcp/protocol.c | 30 +++++++++++++----- net/mptcp/subflow.c | 33 ++++++++++++++++---- 9 files changed, 172 insertions(+), 15 deletions(-) create mode 100644 net/mptcp/mib.c create mode 100644 net/mptcp/mib.h diff --git a/include/net/mptcp.h b/include/net/mptcp.h index b648fa20eec8..0e7c5471010b 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -12,6 +12,8 @@ #include #include +struct seq_file; + /* MPTCP sk_buff extension data */ struct mptcp_ext { u64 data_ack; @@ -123,6 +125,7 @@ static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, bool mptcp_sk_is_subflow(const struct sock *sk); +void mptcp_seq_show(struct seq_file *seq); #else static inline void mptcp_init(void) @@ -194,6 +197,7 @@ static inline bool mptcp_sk_is_subflow(const struct sock *sk) return false; } +static inline void mptcp_seq_show(struct seq_file *seq) { } #endif /* CONFIG_MPTCP */ #if IS_ENABLED(CONFIG_MPTCP_IPV6) diff --git a/include/net/netns/mib.h b/include/net/netns/mib.h index b5fdb108d602..59b2c3a3db42 100644 --- a/include/net/netns/mib.h +++ b/include/net/netns/mib.h @@ -27,6 +27,9 @@ struct netns_mib { #if IS_ENABLED(CONFIG_TLS) DEFINE_SNMP_STAT(struct linux_tls_mib, tls_statistics); #endif +#ifdef CONFIG_MPTCP + DEFINE_SNMP_STAT(struct mptcp_mib, mptcp_statistics); +#endif }; #endif diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index bd7b4e92e07f..cf58e29cf746 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1793,6 +1793,10 @@ static __net_exit void ipv4_mib_exit_net(struct net *net) free_percpu(net->mib.net_statistics); free_percpu(net->mib.ip_statistics); free_percpu(net->mib.tcp_statistics); +#ifdef CONFIG_MPTCP + /* allocated on demand, see mptcp_init_sock() */ + free_percpu(net->mib.mptcp_statistics); +#endif } static __net_initdata struct pernet_operations ipv4_mib_ops = { diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 2580303249e2..75545a829a2b 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -485,6 +486,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v) offsetof(struct ipstats_mib, syncp))); seq_putc(seq, '\n'); + mptcp_seq_show(seq); return 0; } diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile index 54494cf5bec0..faebe8ec9f73 100644 --- a/net/mptcp/Makefile +++ b/net/mptcp/Makefile @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_MPTCP) += mptcp.o -mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o +mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o mib.o diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c new file mode 100644 index 000000000000..0a6a15f3456d --- /dev/null +++ b/net/mptcp/mib.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include +#include + +#include "mib.h" + +static const struct snmp_mib mptcp_snmp_list[] = { + SNMP_MIB_ITEM("MPCapableSYNRX", MPTCP_MIB_MPCAPABLEPASSIVE), + SNMP_MIB_ITEM("MPCapableACKRX", MPTCP_MIB_MPCAPABLEPASSIVEACK), + SNMP_MIB_ITEM("MPCapableFallbackACK", MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK), + SNMP_MIB_ITEM("MPCapableFallbackSYNACK", MPTCP_MIB_MPCAPABLEACTIVEFALLBACK), + SNMP_MIB_ITEM("MPTCPRetrans", MPTCP_MIB_RETRANSSEGS), + SNMP_MIB_ITEM("MPJoinNoTokenFound", MPTCP_MIB_JOINNOTOKEN), + SNMP_MIB_ITEM("MPJoinSynRx", MPTCP_MIB_JOINSYNRX), + SNMP_MIB_ITEM("MPJoinSynAckRx", MPTCP_MIB_JOINSYNACKRX), + SNMP_MIB_ITEM("MPJoinSynAckHMacFailure", MPTCP_MIB_JOINSYNACKMAC), + SNMP_MIB_ITEM("MPJoinAckRx", MPTCP_MIB_JOINACKRX), + SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC), + SNMP_MIB_ITEM("DSSNotMatching", MPTCP_MIB_DSSNOMATCH), + SNMP_MIB_ITEM("InfiniteMapRx", MPTCP_MIB_INFINITEMAPRX), + SNMP_MIB_SENTINEL +}; + +/* mptcp_mib_alloc - allocate percpu mib counters + * + * These are allocated when the first mptcp socket is created so + * we do not waste percpu memory if mptcp isn't in use. + */ +bool mptcp_mib_alloc(struct net *net) +{ + struct mptcp_mib __percpu *mib = alloc_percpu(struct mptcp_mib); + + if (!mib) + return false; + + if (cmpxchg(&net->mib.mptcp_statistics, NULL, mib)) + free_percpu(mib); + + return true; +} + +void mptcp_seq_show(struct seq_file *seq) +{ + struct net *net = seq->private; + int i; + + seq_puts(seq, "MPTcpExt:"); + for (i = 0; mptcp_snmp_list[i].name; i++) + seq_printf(seq, " %s", mptcp_snmp_list[i].name); + + seq_puts(seq, "\nMPTcpExt:"); + + if (!net->mib.mptcp_statistics) { + for (i = 0; mptcp_snmp_list[i].name; i++) + seq_puts(seq, " 0"); + + return; + } + + for (i = 0; mptcp_snmp_list[i].name; i++) + seq_printf(seq, " %lu", + snmp_fold_field(net->mib.mptcp_statistics, + mptcp_snmp_list[i].entry)); + seq_putc(seq, '\n'); +} diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h new file mode 100644 index 000000000000..d7de340fc997 --- /dev/null +++ b/net/mptcp/mib.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +enum linux_mptcp_mib_field { + MPTCP_MIB_NUM = 0, + MPTCP_MIB_MPCAPABLEPASSIVE, /* Received SYN with MP_CAPABLE */ + MPTCP_MIB_MPCAPABLEPASSIVEACK, /* Received third ACK with MP_CAPABLE */ + MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK,/* Server-side fallback during 3-way handshake */ + MPTCP_MIB_MPCAPABLEACTIVEFALLBACK, /* Client-side fallback during 3-way handshake */ + MPTCP_MIB_RETRANSSEGS, /* Segments retransmitted at the MPTCP-level */ + MPTCP_MIB_JOINNOTOKEN, /* Received MP_JOIN but the token was not found */ + MPTCP_MIB_JOINSYNRX, /* Received a SYN + MP_JOIN */ + MPTCP_MIB_JOINSYNACKRX, /* Received a SYN/ACK + MP_JOIN */ + MPTCP_MIB_JOINSYNACKMAC, /* HMAC was wrong on SYN/ACK + MP_JOIN */ + MPTCP_MIB_JOINACKRX, /* Received an ACK + MP_JOIN */ + MPTCP_MIB_JOINACKMAC, /* HMAC was wrong on ACK + MP_JOIN */ + MPTCP_MIB_DSSNOMATCH, /* Received a new mapping that did not match the previous one */ + MPTCP_MIB_INFINITEMAPRX, /* Received an infinite mapping */ + __MPTCP_MIB_MAX +}; + +#define LINUX_MIB_MPTCP_MAX __MPTCP_MIB_MAX +struct mptcp_mib { + unsigned long mibs[LINUX_MIB_MPTCP_MAX]; +}; + +static inline void MPTCP_INC_STATS(struct net *net, + enum linux_mptcp_mib_field field) +{ + if (likely(net->mib.mptcp_statistics)) + SNMP_INC_STATS(net->mib.mptcp_statistics, field); +} + +static inline void __MPTCP_INC_STATS(struct net *net, + enum linux_mptcp_mib_field field) +{ + if (likely(net->mib.mptcp_statistics)) + __SNMP_INC_STATS(net->mib.mptcp_statistics, field); +} + +bool mptcp_mib_alloc(struct net *net); diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 75fa78a09af4..fc82b6508ae7 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -21,6 +21,7 @@ #endif #include #include "protocol.h" +#include "mib.h" #define MPTCP_SAME_STATE TCP_MAX_STATES @@ -1032,6 +1033,7 @@ static void mptcp_worker(struct work_struct *work) if (ret < 0) break; + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS); copied += ret; dfrag->data_len -= ret; dfrag->offset += ret; @@ -1081,17 +1083,22 @@ static int __mptcp_init_sock(struct sock *sk) static int mptcp_init_sock(struct sock *sk) { - int ret = __mptcp_init_sock(sk); + struct net *net = sock_net(sk); + int ret; + if (!mptcp_is_enabled(net)) + return -ENOPROTOOPT; + + if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net)) + return -ENOMEM; + + ret = __mptcp_init_sock(sk); if (ret) return ret; sk_sockets_allocated_inc(sk); sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[2]; - if (!mptcp_is_enabled(sock_net(sk))) - return -ENOPROTOOPT; - return 0; } @@ -1327,7 +1334,12 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, list_add(&subflow->node, &msk->conn_list); bh_unlock_sock(new_mptcp_sock); + + __MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK); local_bh_enable(); + } else { + MPTCP_INC_STATS(sock_net(sk), + MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK); } return newsk; @@ -1448,13 +1460,15 @@ void mptcp_finish_connect(struct sock *ssk) u64 ack_seq; subflow = mptcp_subflow_ctx(ssk); - - if (!subflow->mp_capable) - return; - sk = subflow->conn; msk = mptcp_sk(sk); + if (!subflow->mp_capable) { + MPTCP_INC_STATS(sock_net(sk), + MPTCP_MIB_MPCAPABLEACTIVEFALLBACK); + return; + } + pr_debug("msk=%p, token=%u", sk, subflow->token); mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 5ed524ae88ba..479ad7df4554 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -20,6 +20,13 @@ #endif #include #include "protocol.h" +#include "mib.h" + +static inline void SUBFLOW_REQ_INC_STATS(struct request_sock *req, + enum linux_mptcp_mib_field field) +{ + MPTCP_INC_STATS(sock_net(req_to_sk(req)), field); +} static int subflow_rebuild_header(struct sock *sk) { @@ -88,8 +95,7 @@ static bool subflow_token_join_request(struct request_sock *req, msk = mptcp_token_get_sock(subflow_req->token); if (!msk) { - pr_debug("subflow_req=%p, token=%u - not found\n", - subflow_req, subflow_req->token); + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN); return false; } @@ -137,8 +143,14 @@ static void subflow_init_req(struct request_sock *req, return; #endif - if (rx_opt.mptcp.mp_capable && rx_opt.mptcp.mp_join) - return; + if (rx_opt.mptcp.mp_capable) { + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE); + + if (rx_opt.mptcp.mp_join) + return; + } else if (rx_opt.mptcp.mp_join) { + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX); + } if (rx_opt.mptcp.mp_capable && listener->request_mptcp) { int err; @@ -231,6 +243,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow, subflow->thmac, subflow->remote_nonce); if (!subflow_thmac_valid(subflow)) { + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC); subflow->mp_join = 0; goto do_reset; } @@ -247,6 +260,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) goto do_reset; subflow->conn_finished = 1; + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX); } else { do_reset: tcp_send_active_reset(sk, GFP_ATOMIC); @@ -376,8 +390,10 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, opt_rx.mptcp.mp_join = 0; mptcp_get_options(skb, &opt_rx); if (!opt_rx.mptcp.mp_join || - !subflow_hmac_valid(req, &opt_rx)) + !subflow_hmac_valid(req, &opt_rx)) { + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC); return NULL; + } } create_child: @@ -414,6 +430,8 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, ctx->conn = (struct sock *)owner; if (!mptcp_finish_join(child)) goto close_child; + + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX); } } @@ -529,6 +547,7 @@ static enum mapping_status get_mapping_status(struct sock *ssk) data_len = mpext->data_len; if (data_len == 0) { pr_err("Infinite mapping not handled"); + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX); return MAPPING_INVALID; } @@ -572,8 +591,10 @@ static enum mapping_status get_mapping_status(struct sock *ssk) /* If this skb data are fully covered by the current mapping, * the new map would need caching, which is not supported */ - if (skb_is_fully_mapped(ssk, skb)) + if (skb_is_fully_mapped(ssk, skb)) { + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSNOMATCH); return MAPPING_INVALID; + } /* will validate the next map after consuming the current one */ return MAPPING_OK; From patchwork Mon Mar 23 21:26:40 2020 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Mat Martineau X-Patchwork-Id: 222020 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-9.8 required=3.0 tests=HEADER_FROM_DIFFERENT_DOMAINS, INCLUDES_PATCH, MAILING_LIST_MULTI, SIGNED_OFF_BY, SPF_HELO_NONE, SPF_PASS, URIBL_BLOCKED, USER_AGENT_GIT autolearn=ham autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id 8E967C43331 for ; Mon, 23 Mar 2020 21:29:50 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by mail.kernel.org (Postfix) with ESMTP id 4CE8120719 for ; Mon, 23 Mar 2020 21:29:50 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1727196AbgCWV3t (ORCPT ); Mon, 23 Mar 2020 17:29:49 -0400 Received: from mga05.intel.com ([192.55.52.43]:18837 "EHLO mga05.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1727127AbgCWV3k (ORCPT ); Mon, 23 Mar 2020 17:29:40 -0400 IronPort-SDR: sGOGnB2VxbYGeKPZuMlDGKHoAJyQOQfdl07rlVtpYgcPTx7tPPEFBb2lhyT1+fQom5LsXkwhtm UeKXfg/MY0TQ== X-Amp-Result: SKIPPED(no attachment in message) X-Amp-File-Uploaded: False Received: from fmsmga005.fm.intel.com ([10.253.24.32]) by fmsmga105.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 23 Mar 2020 14:29:39 -0700 IronPort-SDR: 8mPesS3ZBGrFSVuEZOCuglNsm27+wezgwgoJp7HjQky5adlDpO979DeR1N2Vlz7j+bSyLkJfaB AGtGz9Nz4cew== X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.72,297,1580803200"; d="scan'208";a="445960436" Received: from mjmartin-nuc02.mjmartin-nuc02 (HELO mjmartin-nuc02.sea.intel.com) ([10.254.100.76]) by fmsmga005.fm.intel.com with ESMTP; 23 Mar 2020 14:29:39 -0700 From: Mat Martineau To: netdev@vger.kernel.org Cc: Paolo Abeni , eric.dumazet@gmail.com, Matthieu Baerts , Mat Martineau Subject: [PATCH net-next 15/17] mptcp: add netlink-based PM Date: Mon, 23 Mar 2020 14:26:40 -0700 Message-Id: <20200323212642.34104-16-mathew.j.martineau@linux.intel.com> X-Mailer: git-send-email 2.26.0 In-Reply-To: <20200323212642.34104-1-mathew.j.martineau@linux.intel.com> References: <20200323212642.34104-1-mathew.j.martineau@linux.intel.com> MIME-Version: 1.0 Sender: netdev-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: netdev@vger.kernel.org From: Paolo Abeni Expose a new netlink family to userspace to control the PM, setting: - list of local addresses to be signalled. - list of local addresses used to created subflows. - maximum number of add_addr option to react When the msk is fully established, the PM netlink attempts to announce the 'signal' list via the ADD_ADDR option. Since we currently lack the ADD_ADDR echo (and related event) only the first addr is sent. After exhausting the 'announce' list, the PM tries to create subflow for each addr in 'local' list, waiting for each connection to be completed before attempting the next one. Idea is to add an additional PM hook for ADD_ADDR echo, to allow the PM netlink announcing multiple addresses, in sequence. Co-developed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau --- include/uapi/linux/mptcp.h | 54 +++ net/mptcp/Makefile | 3 +- net/mptcp/pm.c | 9 +- net/mptcp/pm_netlink.c | 856 +++++++++++++++++++++++++++++++++++++ net/mptcp/protocol.h | 7 + 5 files changed, 927 insertions(+), 2 deletions(-) create mode 100644 net/mptcp/pm_netlink.c diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index c564140d20f0..dfd6674c39b0 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -31,4 +31,58 @@ enum { }; #define MPTCP_SUBFLOW_ATTR_MAX (__MPTCP_SUBFLOW_ATTR_MAX - 1) + +/* netlink interface */ +#define MPTCP_PM_NAME "mptcp_pm" +#define MPTCP_PM_CMD_GRP_NAME "mptcp_pm_cmds" +#define MPTCP_PM_VER 0x1 + +/* + * ATTR types defined for MPTCP + */ +enum { + MPTCP_PM_ATTR_UNSPEC, + + MPTCP_PM_ATTR_ADDR, /* nested address */ + MPTCP_PM_ATTR_RCV_ADD_ADDRS, /* u32 */ + MPTCP_PM_ATTR_SUBFLOWS, /* u32 */ + + __MPTCP_PM_ATTR_MAX +}; + +#define MPTCP_PM_ATTR_MAX (__MPTCP_PM_ATTR_MAX - 1) + +enum { + MPTCP_PM_ADDR_ATTR_UNSPEC, + + MPTCP_PM_ADDR_ATTR_FAMILY, /* u16 */ + MPTCP_PM_ADDR_ATTR_ID, /* u8 */ + MPTCP_PM_ADDR_ATTR_ADDR4, /* struct in_addr */ + MPTCP_PM_ADDR_ATTR_ADDR6, /* struct in6_addr */ + MPTCP_PM_ADDR_ATTR_PORT, /* u16 */ + MPTCP_PM_ADDR_ATTR_FLAGS, /* u32 */ + MPTCP_PM_ADDR_ATTR_IF_IDX, /* s32 */ + + __MPTCP_PM_ADDR_ATTR_MAX +}; + +#define MPTCP_PM_ADDR_ATTR_MAX (__MPTCP_PM_ADDR_ATTR_MAX - 1) + +#define MPTCP_PM_ADDR_FLAG_SIGNAL (1 << 0) +#define MPTCP_PM_ADDR_FLAG_SUBFLOW (1 << 1) +#define MPTCP_PM_ADDR_FLAG_BACKUP (1 << 2) + +enum { + MPTCP_PM_CMD_UNSPEC, + + MPTCP_PM_CMD_ADD_ADDR, + MPTCP_PM_CMD_DEL_ADDR, + MPTCP_PM_CMD_GET_ADDR, + MPTCP_PM_CMD_FLUSH_ADDRS, + MPTCP_PM_CMD_SET_LIMITS, + MPTCP_PM_CMD_GET_LIMITS, + + __MPTCP_PM_CMD_AFTER_LAST +}; + #endif /* _UAPI_MPTCP_H */ diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile index faebe8ec9f73..baa0640527c7 100644 --- a/net/mptcp/Makefile +++ b/net/mptcp/Makefile @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_MPTCP) += mptcp.o -mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o mib.o +mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \ + mib.o pm_netlink.o diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 3aedad58778c..064639f72487 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -176,7 +176,7 @@ bool mptcp_pm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) { - return 0; + return mptcp_pm_nl_get_local_id(msk, skc); } static void pm_worker(struct work_struct *work) @@ -192,12 +192,15 @@ static void pm_worker(struct work_struct *work) pr_debug("msk=%p status=%x", msk, pm->status); if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) { pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED); + mptcp_pm_nl_add_addr_received(msk); } if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) { pm->status &= ~BIT(MPTCP_PM_ESTABLISHED); + mptcp_pm_nl_fully_established(msk); } if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) { pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED); + mptcp_pm_nl_subflow_established(msk); } spin_unlock_bh(&msk->pm.lock); @@ -219,6 +222,8 @@ void mptcp_pm_data_init(struct mptcp_sock *msk) spin_lock_init(&msk->pm.lock); INIT_WORK(&msk->pm.work, pm_worker); + + mptcp_pm_nl_data_init(msk); } void mptcp_pm_close(struct mptcp_sock *msk) @@ -232,4 +237,6 @@ void mptcp_pm_init(void) pm_wq = alloc_workqueue("pm_wq", WQ_UNBOUND | WQ_MEM_RECLAIM, 8); if (!pm_wq) panic("Failed to allocate workqueue"); + + mptcp_pm_nl_init(); } diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c new file mode 100644 index 000000000000..743c3c58f826 --- /dev/null +++ b/net/mptcp/pm_netlink.c @@ -0,0 +1,856 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP + * + * Copyright (c) 2020, Red Hat, Inc. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "protocol.h" + +/* forward declaration */ +static struct genl_family mptcp_genl_family; + +static int pm_nl_pernet_id; + +struct mptcp_pm_addr_entry { + struct list_head list; + unsigned int flags; + int ifindex; + struct mptcp_addr_info addr; + struct rcu_head rcu; +}; + +struct pm_nl_pernet { + /* protects pernet updates */ + spinlock_t lock; + struct list_head local_addr_list; + unsigned int addrs; + unsigned int add_addr_signal_max; + unsigned int add_addr_accept_max; + unsigned int local_addr_max; + unsigned int subflows_max; + unsigned int next_id; +}; + +#define MPTCP_PM_ADDR_MAX 8 + +static bool addresses_equal(const struct mptcp_addr_info *a, + struct mptcp_addr_info *b, bool use_port) +{ + bool addr_equals = false; + + if (a->family != b->family) + return false; + + if (a->family == AF_INET) + addr_equals = a->addr.s_addr == b->addr.s_addr; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else + addr_equals = !ipv6_addr_cmp(&a->addr6, &b->addr6); +#endif + + if (!addr_equals) + return false; + if (!use_port) + return true; + + return a->port == b->port; +} + +static void local_address(const struct sock_common *skc, + struct mptcp_addr_info *addr) +{ + addr->port = 0; + addr->family = skc->skc_family; + if (addr->family == AF_INET) + addr->addr.s_addr = skc->skc_rcv_saddr; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (addr->family == AF_INET6) + addr->addr6 = skc->skc_v6_rcv_saddr; +#endif +} + +static void remote_address(const struct sock_common *skc, + struct mptcp_addr_info *addr) +{ + addr->family = skc->skc_family; + addr->port = skc->skc_dport; + if (addr->family == AF_INET) + addr->addr.s_addr = skc->skc_daddr; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (addr->family == AF_INET6) + addr->addr6 = skc->skc_v6_daddr; +#endif +} + +static bool lookup_subflow_by_saddr(const struct list_head *list, + struct mptcp_addr_info *saddr) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_addr_info cur; + struct sock_common *skc; + + list_for_each_entry(subflow, list, node) { + skc = (struct sock_common *)mptcp_subflow_tcp_sock(subflow); + + local_address(skc, &cur); + if (addresses_equal(&cur, saddr, false)) + return true; + } + + return false; +} + +static struct mptcp_pm_addr_entry * +select_local_address(const struct pm_nl_pernet *pernet, + struct mptcp_sock *msk) +{ + struct mptcp_pm_addr_entry *entry, *ret = NULL; + + rcu_read_lock(); + spin_lock_bh(&msk->join_list_lock); + list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { + if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)) + continue; + + /* avoid any address already in use by subflows and + * pending join + */ + if (entry->addr.family == ((struct sock *)msk)->sk_family && + !lookup_subflow_by_saddr(&msk->conn_list, &entry->addr) && + !lookup_subflow_by_saddr(&msk->join_list, &entry->addr)) { + ret = entry; + break; + } + } + spin_unlock_bh(&msk->join_list_lock); + rcu_read_unlock(); + return ret; +} + +static struct mptcp_pm_addr_entry * +select_signal_address(struct pm_nl_pernet *pernet, unsigned int pos) +{ + struct mptcp_pm_addr_entry *entry, *ret = NULL; + int i = 0; + + rcu_read_lock(); + /* do not keep any additional per socket state, just signal + * the address list in order. + * Note: removal from the local address list during the msk life-cycle + * can lead to additional addresses not being announced. + */ + list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { + if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) + continue; + if (i++ == pos) { + ret = entry; + break; + } + } + rcu_read_unlock(); + return ret; +} + +static void check_work_pending(struct mptcp_sock *msk) +{ + if (msk->pm.add_addr_signaled == msk->pm.add_addr_signal_max && + (msk->pm.local_addr_used == msk->pm.local_addr_max || + msk->pm.subflows == msk->pm.subflows_max)) + WRITE_ONCE(msk->pm.work_pending, false); +} + +static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) +{ + struct sock *sk = (struct sock *)msk; + struct mptcp_pm_addr_entry *local; + struct mptcp_addr_info remote; + struct pm_nl_pernet *pernet; + + pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id); + + pr_debug("local %d:%d signal %d:%d subflows %d:%d\n", + msk->pm.local_addr_used, msk->pm.local_addr_max, + msk->pm.add_addr_signaled, msk->pm.add_addr_signal_max, + msk->pm.subflows, msk->pm.subflows_max); + + /* check first for announce */ + if (msk->pm.add_addr_signaled < msk->pm.add_addr_signal_max) { + local = select_signal_address(pernet, + msk->pm.add_addr_signaled); + + if (local) { + msk->pm.add_addr_signaled++; + mptcp_pm_announce_addr(msk, &local->addr); + } else { + /* pick failed, avoid fourther attempts later */ + msk->pm.local_addr_used = msk->pm.add_addr_signal_max; + } + + check_work_pending(msk); + } + + /* check if should create a new subflow */ + if (msk->pm.local_addr_used < msk->pm.local_addr_max && + msk->pm.subflows < msk->pm.subflows_max) { + remote_address((struct sock_common *)sk, &remote); + + local = select_local_address(pernet, msk); + if (local) { + msk->pm.local_addr_used++; + msk->pm.subflows++; + check_work_pending(msk); + spin_unlock_bh(&msk->pm.lock); + __mptcp_subflow_connect(sk, local->ifindex, + &local->addr, &remote); + spin_lock_bh(&msk->pm.lock); + return; + } + + /* lookup failed, avoid fourther attempts later */ + msk->pm.local_addr_used = msk->pm.local_addr_max; + check_work_pending(msk); + } +} + +void mptcp_pm_nl_fully_established(struct mptcp_sock *msk) +{ + mptcp_pm_create_subflow_or_signal_addr(msk); +} + +void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk) +{ + mptcp_pm_create_subflow_or_signal_addr(msk); +} + +void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) +{ + struct sock *sk = (struct sock *)msk; + struct mptcp_addr_info remote; + struct mptcp_addr_info local; + + pr_debug("accepted %d:%d remote family %d", + msk->pm.add_addr_accepted, msk->pm.add_addr_accept_max, + msk->pm.remote.family); + msk->pm.add_addr_accepted++; + msk->pm.subflows++; + if (msk->pm.add_addr_accepted >= msk->pm.add_addr_accept_max || + msk->pm.subflows >= msk->pm.subflows_max) + WRITE_ONCE(msk->pm.accept_addr, false); + + /* connect to the specified remote address, using whatever + * local address the routing configuration will pick. + */ + remote = msk->pm.remote; + if (!remote.port) + remote.port = sk->sk_dport; + memset(&local, 0, sizeof(local)); + local.family = remote.family; + + spin_unlock_bh(&msk->pm.lock); + __mptcp_subflow_connect((struct sock *)msk, 0, &local, &remote); + spin_lock_bh(&msk->pm.lock); +} + +static bool address_use_port(struct mptcp_pm_addr_entry *entry) +{ + return (entry->flags & + (MPTCP_PM_ADDR_FLAG_SIGNAL | MPTCP_PM_ADDR_FLAG_SUBFLOW)) == + MPTCP_PM_ADDR_FLAG_SIGNAL; +} + +static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, + struct mptcp_pm_addr_entry *entry) +{ + struct mptcp_pm_addr_entry *cur; + int ret = -EINVAL; + + spin_lock_bh(&pernet->lock); + /* to keep the code simple, don't do IDR-like allocation for address ID, + * just bail when we exceed limits + */ + if (pernet->next_id > 255) + goto out; + if (pernet->addrs >= MPTCP_PM_ADDR_MAX) + goto out; + + /* do not insert duplicate address, differentiate on port only + * singled addresses + */ + list_for_each_entry(cur, &pernet->local_addr_list, list) { + if (addresses_equal(&cur->addr, &entry->addr, + address_use_port(entry) && + address_use_port(cur))) + goto out; + } + + if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) + pernet->add_addr_signal_max++; + if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) + pernet->local_addr_max++; + + entry->addr.id = pernet->next_id++; + pernet->addrs++; + list_add_tail_rcu(&entry->list, &pernet->local_addr_list); + ret = entry->addr.id; + +out: + spin_unlock_bh(&pernet->lock); + return ret; +} + +int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) +{ + struct mptcp_pm_addr_entry *entry; + struct mptcp_addr_info skc_local; + struct mptcp_addr_info msk_local; + struct pm_nl_pernet *pernet; + int ret = -1; + + if (WARN_ON_ONCE(!msk)) + return -1; + + /* The 0 ID mapping is defined by the first subflow, copied into the msk + * addr + */ + local_address((struct sock_common *)msk, &msk_local); + local_address((struct sock_common *)msk, &skc_local); + if (addresses_equal(&msk_local, &skc_local, false)) + return 0; + + pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id); + + rcu_read_lock(); + list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { + if (addresses_equal(&entry->addr, &skc_local, false)) { + ret = entry->addr.id; + break; + } + } + rcu_read_unlock(); + if (ret >= 0) + return ret; + + /* address not found, add to local list */ + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return -ENOMEM; + + entry->flags = 0; + entry->addr = skc_local; + ret = mptcp_pm_nl_append_new_local_addr(pernet, entry); + if (ret < 0) + kfree(entry); + + return ret; +} + +void mptcp_pm_nl_data_init(struct mptcp_sock *msk) +{ + struct mptcp_pm_data *pm = &msk->pm; + struct pm_nl_pernet *pernet; + bool subflows; + + pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id); + + pm->add_addr_signal_max = READ_ONCE(pernet->add_addr_signal_max); + pm->add_addr_accept_max = READ_ONCE(pernet->add_addr_accept_max); + pm->local_addr_max = READ_ONCE(pernet->local_addr_max); + pm->subflows_max = READ_ONCE(pernet->subflows_max); + subflows = !!pm->subflows_max; + WRITE_ONCE(pm->work_pending, (!!pm->local_addr_max && subflows) || + !!pm->add_addr_signal_max); + WRITE_ONCE(pm->accept_addr, !!pm->add_addr_accept_max && subflows); + WRITE_ONCE(pm->accept_subflow, subflows); +} + +#define MPTCP_PM_CMD_GRP_OFFSET 0 + +static const struct genl_multicast_group mptcp_pm_mcgrps[] = { + [MPTCP_PM_CMD_GRP_OFFSET] = { .name = MPTCP_PM_CMD_GRP_NAME, }, +}; + +static const struct nla_policy +mptcp_pm_addr_policy[MPTCP_PM_ADDR_ATTR_MAX + 1] = { + [MPTCP_PM_ADDR_ATTR_FAMILY] = { .type = NLA_U16, }, + [MPTCP_PM_ADDR_ATTR_ID] = { .type = NLA_U8, }, + [MPTCP_PM_ADDR_ATTR_ADDR4] = { .type = NLA_U32, }, + [MPTCP_PM_ADDR_ATTR_ADDR6] = { .type = NLA_EXACT_LEN, + .len = sizeof(struct in6_addr), }, + [MPTCP_PM_ADDR_ATTR_PORT] = { .type = NLA_U16 }, + [MPTCP_PM_ADDR_ATTR_FLAGS] = { .type = NLA_U32 }, + [MPTCP_PM_ADDR_ATTR_IF_IDX] = { .type = NLA_S32 }, +}; + +static const struct nla_policy mptcp_pm_policy[MPTCP_PM_ATTR_MAX + 1] = { + [MPTCP_PM_ATTR_ADDR] = + NLA_POLICY_NESTED(mptcp_pm_addr_policy), + [MPTCP_PM_ATTR_RCV_ADD_ADDRS] = { .type = NLA_U32, }, + [MPTCP_PM_ATTR_SUBFLOWS] = { .type = NLA_U32, }, +}; + +static int mptcp_pm_family_to_addr(int family) +{ +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + if (family == AF_INET6) + return MPTCP_PM_ADDR_ATTR_ADDR6; +#endif + return MPTCP_PM_ADDR_ATTR_ADDR4; +} + +static int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info, + bool require_family, + struct mptcp_pm_addr_entry *entry) +{ + struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1]; + int err, addr_addr; + + if (!attr) { + GENL_SET_ERR_MSG(info, "missing address info"); + return -EINVAL; + } + + /* no validation needed - was already done via nested policy */ + err = nla_parse_nested_deprecated(tb, MPTCP_PM_ADDR_ATTR_MAX, attr, + mptcp_pm_addr_policy, info->extack); + if (err) + return err; + + memset(entry, 0, sizeof(*entry)); + if (!tb[MPTCP_PM_ADDR_ATTR_FAMILY]) { + if (!require_family) + goto skip_family; + + NL_SET_ERR_MSG_ATTR(info->extack, attr, + "missing family"); + return -EINVAL; + } + + entry->addr.family = nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_FAMILY]); + if (entry->addr.family != AF_INET +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + && entry->addr.family != AF_INET6 +#endif + ) { + NL_SET_ERR_MSG_ATTR(info->extack, attr, + "unknown address family"); + return -EINVAL; + } + addr_addr = mptcp_pm_family_to_addr(entry->addr.family); + if (!tb[addr_addr]) { + NL_SET_ERR_MSG_ATTR(info->extack, attr, + "missing address data"); + return -EINVAL; + } + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + if (entry->addr.family == AF_INET6) + entry->addr.addr6 = nla_get_in6_addr(tb[addr_addr]); + else +#endif + entry->addr.addr.s_addr = nla_get_in_addr(tb[addr_addr]); + +skip_family: + if (tb[MPTCP_PM_ADDR_ATTR_IF_IDX]) + entry->ifindex = nla_get_s32(tb[MPTCP_PM_ADDR_ATTR_IF_IDX]); + + if (tb[MPTCP_PM_ADDR_ATTR_ID]) + entry->addr.id = nla_get_u8(tb[MPTCP_PM_ADDR_ATTR_ID]); + + if (tb[MPTCP_PM_ADDR_ATTR_FLAGS]) + entry->flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]); + + return 0; +} + +static struct pm_nl_pernet *genl_info_pm_nl(struct genl_info *info) +{ + return net_generic(genl_info_net(info), pm_nl_pernet_id); +} + +static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; + struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + struct mptcp_pm_addr_entry addr, *entry; + int ret; + + ret = mptcp_pm_parse_addr(attr, info, true, &addr); + if (ret < 0) + return ret; + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) { + GENL_SET_ERR_MSG(info, "can't allocate addr"); + return -ENOMEM; + } + + *entry = addr; + ret = mptcp_pm_nl_append_new_local_addr(pernet, entry); + if (ret < 0) { + GENL_SET_ERR_MSG(info, "too many addresses or duplicate one"); + kfree(entry); + return ret; + } + + return 0; +} + +static struct mptcp_pm_addr_entry * +__lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id) +{ + struct mptcp_pm_addr_entry *entry; + + list_for_each_entry(entry, &pernet->local_addr_list, list) { + if (entry->addr.id == id) + return entry; + } + return NULL; +} + +static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; + struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + struct mptcp_pm_addr_entry addr, *entry; + int ret; + + ret = mptcp_pm_parse_addr(attr, info, false, &addr); + if (ret < 0) + return ret; + + spin_lock_bh(&pernet->lock); + entry = __lookup_addr_by_id(pernet, addr.addr.id); + if (!entry) { + GENL_SET_ERR_MSG(info, "address not found"); + ret = -EINVAL; + goto out; + } + if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) + pernet->add_addr_signal_max--; + if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) + pernet->local_addr_max--; + + pernet->addrs--; + list_del_rcu(&entry->list); + kfree_rcu(entry, rcu); +out: + spin_unlock_bh(&pernet->lock); + return ret; +} + +static void __flush_addrs(struct pm_nl_pernet *pernet) +{ + while (!list_empty(&pernet->local_addr_list)) { + struct mptcp_pm_addr_entry *cur; + + cur = list_entry(pernet->local_addr_list.next, + struct mptcp_pm_addr_entry, list); + list_del_rcu(&cur->list); + kfree_rcu(cur, rcu); + } +} + +static void __reset_counters(struct pm_nl_pernet *pernet) +{ + pernet->add_addr_signal_max = 0; + pernet->add_addr_accept_max = 0; + pernet->local_addr_max = 0; + pernet->addrs = 0; +} + +static int mptcp_nl_cmd_flush_addrs(struct sk_buff *skb, struct genl_info *info) +{ + struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + + spin_lock_bh(&pernet->lock); + __flush_addrs(pernet); + __reset_counters(pernet); + spin_unlock_bh(&pernet->lock); + return 0; +} + +static int mptcp_nl_fill_addr(struct sk_buff *skb, + struct mptcp_pm_addr_entry *entry) +{ + struct mptcp_addr_info *addr = &entry->addr; + struct nlattr *attr; + + attr = nla_nest_start(skb, MPTCP_PM_ATTR_ADDR); + if (!attr) + return -EMSGSIZE; + + if (nla_put_u16(skb, MPTCP_PM_ADDR_ATTR_FAMILY, addr->family)) + goto nla_put_failure; + if (nla_put_u8(skb, MPTCP_PM_ADDR_ATTR_ID, addr->id)) + goto nla_put_failure; + if (nla_put_u32(skb, MPTCP_PM_ADDR_ATTR_FLAGS, entry->flags)) + goto nla_put_failure; + if (entry->ifindex && + nla_put_s32(skb, MPTCP_PM_ADDR_ATTR_IF_IDX, entry->ifindex)) + goto nla_put_failure; + + if (addr->family == AF_INET) + nla_put_in_addr(skb, MPTCP_PM_ADDR_ATTR_ADDR4, + addr->addr.s_addr); +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (addr->family == AF_INET6) + nla_put_in6_addr(skb, MPTCP_PM_ADDR_ATTR_ADDR6, &addr->addr6); +#endif + nla_nest_end(skb, attr); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, attr); + return -EMSGSIZE; +} + +static int mptcp_nl_cmd_get_addr(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; + struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + struct mptcp_pm_addr_entry addr, *entry; + struct sk_buff *msg; + void *reply; + int ret; + + ret = mptcp_pm_parse_addr(attr, info, false, &addr); + if (ret < 0) + return ret; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0, + info->genlhdr->cmd); + if (!reply) { + GENL_SET_ERR_MSG(info, "not enough space in Netlink message"); + ret = -EMSGSIZE; + goto fail; + } + + spin_lock_bh(&pernet->lock); + entry = __lookup_addr_by_id(pernet, addr.addr.id); + if (!entry) { + GENL_SET_ERR_MSG(info, "address not found"); + ret = -EINVAL; + goto unlock_fail; + } + + ret = mptcp_nl_fill_addr(msg, entry); + if (ret) + goto unlock_fail; + + genlmsg_end(msg, reply); + ret = genlmsg_reply(msg, info); + spin_unlock_bh(&pernet->lock); + return ret; + +unlock_fail: + spin_unlock_bh(&pernet->lock); + +fail: + nlmsg_free(msg); + return ret; +} + +static int mptcp_nl_cmd_dump_addrs(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct net *net = sock_net(msg->sk); + struct mptcp_pm_addr_entry *entry; + struct pm_nl_pernet *pernet; + int id = cb->args[0]; + void *hdr; + + pernet = net_generic(net, pm_nl_pernet_id); + + spin_lock_bh(&pernet->lock); + list_for_each_entry(entry, &pernet->local_addr_list, list) { + if (entry->addr.id <= id) + continue; + + hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, &mptcp_genl_family, + NLM_F_MULTI, MPTCP_PM_CMD_GET_ADDR); + if (!hdr) + break; + + if (mptcp_nl_fill_addr(msg, entry) < 0) { + genlmsg_cancel(msg, hdr); + break; + } + + id = entry->addr.id; + genlmsg_end(msg, hdr); + } + spin_unlock_bh(&pernet->lock); + + cb->args[0] = id; + return msg->len; +} + +static int parse_limit(struct genl_info *info, int id, unsigned int *limit) +{ + struct nlattr *attr = info->attrs[id]; + + if (!attr) { + GENL_SET_ERR_MSG(info, "missing announce accept limit"); + return -EINVAL; + } + + *limit = nla_get_u32(attr); + if (*limit > MPTCP_PM_ADDR_MAX) { + GENL_SET_ERR_MSG(info, + "announce accept limit greater than maximum"); + return -EINVAL; + } + return 0; +} + +static int +mptcp_nl_cmd_set_limits(struct sk_buff *skb, struct genl_info *info) +{ + struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + unsigned int rcv_addrs, subflows; + int ret; + + ret = parse_limit(info, MPTCP_PM_ATTR_RCV_ADD_ADDRS, &rcv_addrs); + if (ret) + return ret; + + ret = parse_limit(info, MPTCP_PM_ATTR_SUBFLOWS, &subflows); + if (ret) + return ret; + + WRITE_ONCE(pernet->add_addr_accept_max, rcv_addrs); + WRITE_ONCE(pernet->subflows_max, subflows); + return 0; +} + +static int +mptcp_nl_cmd_get_limits(struct sk_buff *skb, struct genl_info *info) +{ + struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + struct sk_buff *msg; + void *reply; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0, + MPTCP_PM_CMD_GET_LIMITS); + if (!reply) + goto fail; + + if (nla_put_u32(msg, MPTCP_PM_ATTR_RCV_ADD_ADDRS, + READ_ONCE(pernet->add_addr_accept_max))) + goto fail; + + if (nla_put_u32(msg, MPTCP_PM_ATTR_SUBFLOWS, + READ_ONCE(pernet->subflows_max))) + goto fail; + + genlmsg_end(msg, reply); + return genlmsg_reply(msg, info); + +fail: + GENL_SET_ERR_MSG(info, "not enough space in Netlink message"); + nlmsg_free(msg); + return -EMSGSIZE; +} + +static struct genl_ops mptcp_pm_ops[] = { + { + .cmd = MPTCP_PM_CMD_ADD_ADDR, + .doit = mptcp_nl_cmd_add_addr, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = MPTCP_PM_CMD_DEL_ADDR, + .doit = mptcp_nl_cmd_del_addr, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = MPTCP_PM_CMD_FLUSH_ADDRS, + .doit = mptcp_nl_cmd_flush_addrs, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = MPTCP_PM_CMD_GET_ADDR, + .doit = mptcp_nl_cmd_get_addr, + .dumpit = mptcp_nl_cmd_dump_addrs, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = MPTCP_PM_CMD_SET_LIMITS, + .doit = mptcp_nl_cmd_set_limits, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = MPTCP_PM_CMD_GET_LIMITS, + .doit = mptcp_nl_cmd_get_limits, + .flags = GENL_ADMIN_PERM, + }, +}; + +static struct genl_family mptcp_genl_family __ro_after_init = { + .name = MPTCP_PM_NAME, + .version = MPTCP_PM_VER, + .maxattr = MPTCP_PM_ATTR_MAX, + .policy = mptcp_pm_policy, + .netnsok = true, + .module = THIS_MODULE, + .ops = mptcp_pm_ops, + .n_ops = ARRAY_SIZE(mptcp_pm_ops), + .mcgrps = mptcp_pm_mcgrps, + .n_mcgrps = ARRAY_SIZE(mptcp_pm_mcgrps), +}; + +static int __net_init pm_nl_init_net(struct net *net) +{ + struct pm_nl_pernet *pernet = net_generic(net, pm_nl_pernet_id); + + INIT_LIST_HEAD_RCU(&pernet->local_addr_list); + __reset_counters(pernet); + pernet->next_id = 1; + spin_lock_init(&pernet->lock); + return 0; +} + +static void __net_exit pm_nl_exit_net(struct list_head *net_list) +{ + struct net *net; + + list_for_each_entry(net, net_list, exit_list) { + /* net is removed from namespace list, can't race with + * other modifiers + */ + __flush_addrs(net_generic(net, pm_nl_pernet_id)); + } +} + +static struct pernet_operations mptcp_pm_pernet_ops = { + .init = pm_nl_init_net, + .exit_batch = pm_nl_exit_net, + .id = &pm_nl_pernet_id, + .size = sizeof(struct pm_nl_pernet), +}; + +void mptcp_pm_nl_init(void) +{ + if (register_pernet_subsys(&mptcp_pm_pernet_ops) < 0) + panic("Failed to register MPTCP PM pernet subsystem.\n"); + + if (genl_register_family(&mptcp_genl_family)) + panic("Failed to register MPTCP PM netlink family\n"); +} diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 0999f74df027..f733c5425552 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -396,6 +396,13 @@ bool mptcp_pm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, struct mptcp_addr_info *saddr); int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); +void mptcp_pm_nl_init(void); +void mptcp_pm_nl_data_init(struct mptcp_sock *msk); +void mptcp_pm_nl_fully_established(struct mptcp_sock *msk); +void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk); +void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk); +int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); + static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff *skb) { return (struct mptcp_ext *)skb_ext_find(skb, SKB_EXT_MPTCP); From patchwork Mon Mar 23 21:26:41 2020 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Mat Martineau X-Patchwork-Id: 222022 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-9.8 required=3.0 tests=HEADER_FROM_DIFFERENT_DOMAINS, INCLUDES_PATCH, MAILING_LIST_MULTI, SIGNED_OFF_BY, SPF_HELO_NONE, SPF_PASS, URIBL_BLOCKED, USER_AGENT_GIT autolearn=ham autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id C87EBC54FCF for ; Mon, 23 Mar 2020 21:29:45 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by mail.kernel.org (Postfix) with ESMTP id 912D820719 for ; Mon, 23 Mar 2020 21:29:45 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1727159AbgCWV3n (ORCPT ); Mon, 23 Mar 2020 17:29:43 -0400 Received: from mga05.intel.com ([192.55.52.43]:18837 "EHLO mga05.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1727145AbgCWV3l (ORCPT ); Mon, 23 Mar 2020 17:29:41 -0400 IronPort-SDR: /kE5MEceBO3Qupvk9EokYRM1V1WtHuurwoVa89hHAT0/gq1YG9z8pBaSDEG8iMUxP7dBHyotVG Z/VDDP/mdb0g== X-Amp-Result: SKIPPED(no attachment in message) X-Amp-File-Uploaded: False Received: from fmsmga005.fm.intel.com ([10.253.24.32]) by fmsmga105.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 23 Mar 2020 14:29:39 -0700 IronPort-SDR: 1p8I+hS80J+tvjBiMw9d7Q5ir2bm1G7rS+1s1Qjo06caMVTmgHqB9RR2iNhNEXXqJIC9cY69gL zXDaRRJGYamg== X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.72,297,1580803200"; d="scan'208";a="445960439" Received: from mjmartin-nuc02.mjmartin-nuc02 (HELO mjmartin-nuc02.sea.intel.com) ([10.254.100.76]) by fmsmga005.fm.intel.com with ESMTP; 23 Mar 2020 14:29:39 -0700 From: Mat Martineau To: netdev@vger.kernel.org Cc: Paolo Abeni , eric.dumazet@gmail.com, Mat Martineau Subject: [PATCH net-next 16/17] selftests: add PM netlink functional tests Date: Mon, 23 Mar 2020 14:26:41 -0700 Message-Id: <20200323212642.34104-17-mathew.j.martineau@linux.intel.com> X-Mailer: git-send-email 2.26.0 In-Reply-To: <20200323212642.34104-1-mathew.j.martineau@linux.intel.com> References: <20200323212642.34104-1-mathew.j.martineau@linux.intel.com> MIME-Version: 1.0 Sender: netdev-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: netdev@vger.kernel.org From: Paolo Abeni This introduces basic self-tests for the PM netlink, checking the basic APIs and possible exceptional values. Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau --- tools/testing/selftests/net/mptcp/Makefile | 7 +- .../testing/selftests/net/mptcp/pm_netlink.sh | 130 ++++ tools/testing/selftests/net/mptcp/pm_nl_ctl.c | 616 ++++++++++++++++++ 3 files changed, 750 insertions(+), 3 deletions(-) create mode 100755 tools/testing/selftests/net/mptcp/pm_netlink.sh create mode 100644 tools/testing/selftests/net/mptcp/pm_nl_ctl.c diff --git a/tools/testing/selftests/net/mptcp/Makefile b/tools/testing/selftests/net/mptcp/Makefile index ba450e62dc5b..70c831fcaf70 100644 --- a/tools/testing/selftests/net/mptcp/Makefile +++ b/tools/testing/selftests/net/mptcp/Makefile @@ -1,12 +1,13 @@ # SPDX-License-Identifier: GPL-2.0 top_srcdir = ../../../../.. +KSFT_KHDR_INSTALL := 1 -CFLAGS = -Wall -Wl,--no-as-needed -O2 -g +CFLAGS = -Wall -Wl,--no-as-needed -O2 -g -I$(top_srcdir)/usr/include -TEST_PROGS := mptcp_connect.sh +TEST_PROGS := mptcp_connect.sh pm_netlink.sh -TEST_GEN_FILES = mptcp_connect +TEST_GEN_FILES = mptcp_connect pm_nl_ctl TEST_FILES := settings diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh b/tools/testing/selftests/net/mptcp/pm_netlink.sh new file mode 100755 index 000000000000..cfc743c47cb2 --- /dev/null +++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh @@ -0,0 +1,130 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +ksft_skip=4 +ret=0 + +usage() { + echo "Usage: $0 [ -h ]" +} + + +while getopts "$optstring" option;do + case "$option" in + "h") + usage $0 + exit 0 + ;; + "?") + usage $0 + exit 1 + ;; + esac +done + +sec=$(date +%s) +rndh=$(printf %x $sec)-$(mktemp -u XXXXXX) +ns1="ns1-$rndh" +err=$(mktemp) +ret=0 + +cleanup() +{ + rm -f $out + ip netns del $ns1 +} + +ip -Version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +trap cleanup EXIT + +ip netns add $ns1 || exit $ksft_skip +ip -net $ns1 link set lo up +ip netns exec $ns1 sysctl -q net.mptcp.enabled=1 + +check() +{ + local cmd="$1" + local expected="$2" + local msg="$3" + local out=`$cmd 2>$err` + local cmd_ret=$? + + printf "%-50s %s" "$msg" + if [ $cmd_ret -ne 0 ]; then + echo "[FAIL] command execution '$cmd' stderr " + cat $err + ret=1 + elif [ "$out" = "$expected" ]; then + echo "[ OK ]" + else + echo -n "[FAIL] " + echo "expected '$expected' got '$out'" + ret=1 + fi +} + +check "ip netns exec $ns1 ./pm_nl_ctl dump" "" "defaults addr list" +check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 0 +subflows 0" "defaults limits" + +ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.1 +ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.2 flags subflow dev lo +ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.3 flags signal,backup +check "ip netns exec $ns1 ./pm_nl_ctl get 1" "id 1 flags 10.0.1.1 " "simple add/get addr" + +check "ip netns exec $ns1 ./pm_nl_ctl dump" \ +"id 1 flags 10.0.1.1 +id 2 flags subflow dev lo 10.0.1.2 +id 3 flags signal,backup 10.0.1.3 " "dump addrs" + +ip netns exec $ns1 ./pm_nl_ctl del 2 +check "ip netns exec $ns1 ./pm_nl_ctl get 2" "" "simple del addr" +check "ip netns exec $ns1 ./pm_nl_ctl dump" \ +"id 1 flags 10.0.1.1 +id 3 flags signal,backup 10.0.1.3 " "dump addrs after del" + +ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.3 +check "ip netns exec $ns1 ./pm_nl_ctl get 4" "" "duplicate addr" + +ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.4 id 10 flags signal +check "ip netns exec $ns1 ./pm_nl_ctl get 4" "id 4 flags signal 10.0.1.4 " "id addr increment" + +for i in `seq 5 9`; do + ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.$i flags signal >/dev/null 2>&1 +done +check "ip netns exec $ns1 ./pm_nl_ctl get 9" "id 9 flags signal 10.0.1.9 " "hard addr limit" +check "ip netns exec $ns1 ./pm_nl_ctl get 10" "" "above hard addr limit" + +for i in `seq 9 256`; do + ip netns exec $ns1 ./pm_nl_ctl del $i + ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.9 +done +check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags 10.0.1.1 +id 3 flags signal,backup 10.0.1.3 +id 4 flags signal 10.0.1.4 +id 5 flags signal 10.0.1.5 +id 6 flags signal 10.0.1.6 +id 7 flags signal 10.0.1.7 +id 8 flags signal 10.0.1.8 " "id limit" + +ip netns exec $ns1 ./pm_nl_ctl flush +check "ip netns exec $ns1 ./pm_nl_ctl dump" "" "flush addrs" + +ip netns exec $ns1 ./pm_nl_ctl limits 9 1 +check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 0 +subflows 0" "rcv addrs above hard limit" + +ip netns exec $ns1 ./pm_nl_ctl limits 1 9 +check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 0 +subflows 0" "subflows above hard limit" + +ip netns exec $ns1 ./pm_nl_ctl limits 8 8 +check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 8 +subflows 8" "set limits" + +exit $ret diff --git a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c new file mode 100644 index 000000000000..de9209305026 --- /dev/null +++ b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c @@ -0,0 +1,616 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include + +#include "linux/mptcp.h" + +#ifndef MPTCP_PM_NAME +#define MPTCP_PM_NAME "mptcp_pm" +#endif + +static void syntax(char *argv[]) +{ + fprintf(stderr, "%s add|get|del|flush|dump|accept []\n", argv[0]); + fprintf(stderr, "\tadd [flags signal|subflow|backup] [id ] [dev ] \n"); + fprintf(stderr, "\tdel \n"); + fprintf(stderr, "\tget \n"); + fprintf(stderr, "\tflush\n"); + fprintf(stderr, "\tdump\n"); + fprintf(stderr, "\tlimits [ ]\n"); + exit(0); +} + +static int init_genl_req(char *data, int family, int cmd, int version) +{ + struct nlmsghdr *nh = (void *)data; + struct genlmsghdr *gh; + int off = 0; + + nh->nlmsg_type = family; + nh->nlmsg_flags = NLM_F_REQUEST; + nh->nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN); + off += NLMSG_ALIGN(sizeof(*nh)); + + gh = (void *)(data + off); + gh->cmd = cmd; + gh->version = version; + off += NLMSG_ALIGN(sizeof(*gh)); + return off; +} + +static void nl_error(struct nlmsghdr *nh) +{ + struct nlmsgerr *err = (struct nlmsgerr *)NLMSG_DATA(nh); + int len = nh->nlmsg_len - sizeof(*nh); + uint32_t off; + + if (len < sizeof(struct nlmsgerr)) + error(1, 0, "netlink error message truncated %d min %ld", len, + sizeof(struct nlmsgerr)); + + if (!err->error) { + /* check messages from kernel */ + struct rtattr *attrs = (struct rtattr *)NLMSG_DATA(nh); + + while (RTA_OK(attrs, len)) { + if (attrs->rta_type == NLMSGERR_ATTR_MSG) + fprintf(stderr, "netlink ext ack msg: %s\n", + (char *)RTA_DATA(attrs)); + if (attrs->rta_type == NLMSGERR_ATTR_OFFS) { + memcpy(&off, RTA_DATA(attrs), 4); + fprintf(stderr, "netlink err off %d\n", + (int)off); + } + attrs = RTA_NEXT(attrs, len); + } + } else { + fprintf(stderr, "netlink error %d", err->error); + } +} + +/* do a netlink command and, if max > 0, fetch the reply */ +static int do_nl_req(int fd, struct nlmsghdr *nh, int len, int max) +{ + struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK }; + socklen_t addr_len; + void *data = nh; + int rem, ret; + int err = 0; + + nh->nlmsg_len = len; + ret = sendto(fd, data, len, 0, (void *)&nladdr, sizeof(nladdr)); + if (ret != len) + error(1, errno, "send netlink: %uB != %uB\n", ret, len); + if (max == 0) + return 0; + + addr_len = sizeof(nladdr); + rem = ret = recvfrom(fd, data, max, 0, (void *)&nladdr, &addr_len); + if (ret < 0) + error(1, errno, "recv netlink: %uB\n", ret); + + /* Beware: the NLMSG_NEXT macro updates the 'rem' argument */ + for (; NLMSG_OK(nh, rem); nh = NLMSG_NEXT(nh, rem)) { + if (nh->nlmsg_type == NLMSG_ERROR) { + nl_error(nh); + err = 1; + } + } + if (err) + error(1, 0, "bailing out due to netlink error[s]"); + return ret; +} + +static int genl_parse_getfamily(struct nlmsghdr *nlh) +{ + struct genlmsghdr *ghdr = NLMSG_DATA(nlh); + int len = nlh->nlmsg_len; + struct rtattr *attrs; + + if (nlh->nlmsg_type != GENL_ID_CTRL) + error(1, errno, "Not a controller message, len=%d type=0x%x\n", + nlh->nlmsg_len, nlh->nlmsg_type); + + len -= NLMSG_LENGTH(GENL_HDRLEN); + + if (len < 0) + error(1, errno, "wrong controller message len %d\n", len); + + if (ghdr->cmd != CTRL_CMD_NEWFAMILY) + error(1, errno, "Unknown controller command %d\n", ghdr->cmd); + + attrs = (struct rtattr *) ((char *) ghdr + GENL_HDRLEN); + while (RTA_OK(attrs, len)) { + if (attrs->rta_type == CTRL_ATTR_FAMILY_ID) + return *(__u16 *)RTA_DATA(attrs); + attrs = RTA_NEXT(attrs, len); + } + + error(1, errno, "can't find CTRL_ATTR_FAMILY_ID attr"); + return -1; +} + +static int resolve_mptcp_pm_netlink(int fd) +{ + char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) + + NLMSG_ALIGN(sizeof(struct genlmsghdr)) + + 1024]; + struct nlmsghdr *nh; + struct rtattr *rta; + int namelen; + int off = 0; + + memset(data, 0, sizeof(data)); + nh = (void *)data; + off = init_genl_req(data, GENL_ID_CTRL, CTRL_CMD_GETFAMILY, 0); + + rta = (void *)(data + off); + namelen = strlen(MPTCP_PM_NAME) + 1; + rta->rta_type = CTRL_ATTR_FAMILY_NAME; + rta->rta_len = RTA_LENGTH(namelen); + memcpy(RTA_DATA(rta), MPTCP_PM_NAME, namelen); + off += NLMSG_ALIGN(rta->rta_len); + + do_nl_req(fd, nh, off, sizeof(data)); + return genl_parse_getfamily((void *)data); +} + +int add_addr(int fd, int pm_family, int argc, char *argv[]) +{ + char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) + + NLMSG_ALIGN(sizeof(struct genlmsghdr)) + + 1024]; + struct rtattr *rta, *nest; + struct nlmsghdr *nh; + u_int16_t family; + u_int32_t flags; + int nest_start; + u_int8_t id; + int off = 0; + int arg; + + memset(data, 0, sizeof(data)); + nh = (void *)data; + off = init_genl_req(data, pm_family, MPTCP_PM_CMD_ADD_ADDR, + MPTCP_PM_VER); + + if (argc < 3) + syntax(argv); + + nest_start = off; + nest = (void *)(data + off); + nest->rta_type = NLA_F_NESTED | MPTCP_PM_ATTR_ADDR; + nest->rta_len = RTA_LENGTH(0); + off += NLMSG_ALIGN(nest->rta_len); + + /* addr data */ + rta = (void *)(data + off); + if (inet_pton(AF_INET, argv[2], RTA_DATA(rta))) { + family = AF_INET; + rta->rta_type = MPTCP_PM_ADDR_ATTR_ADDR4; + rta->rta_len = RTA_LENGTH(4); + } else if (inet_pton(AF_INET6, argv[2], RTA_DATA(rta))) { + family = AF_INET6; + rta->rta_type = MPTCP_PM_ADDR_ATTR_ADDR6; + rta->rta_len = RTA_LENGTH(16); + } else + error(1, errno, "can't parse ip %s", argv[2]); + off += NLMSG_ALIGN(rta->rta_len); + + /* family */ + rta = (void *)(data + off); + rta->rta_type = MPTCP_PM_ADDR_ATTR_FAMILY; + rta->rta_len = RTA_LENGTH(2); + memcpy(RTA_DATA(rta), &family, 2); + off += NLMSG_ALIGN(rta->rta_len); + + for (arg = 3; arg < argc; arg++) { + if (!strcmp(argv[arg], "flags")) { + char *tok, *str; + + /* flags */ + flags = 0; + if (++arg >= argc) + error(1, 0, " missing flags value"); + + /* do not support flag list yet */ + for (str = argv[arg]; (tok = strtok(str, ",")); + str = NULL) { + if (!strcmp(tok, "subflow")) + flags |= MPTCP_PM_ADDR_FLAG_SUBFLOW; + else if (!strcmp(tok, "signal")) + flags |= MPTCP_PM_ADDR_FLAG_SIGNAL; + else if (!strcmp(tok, "backup")) + flags |= MPTCP_PM_ADDR_FLAG_BACKUP; + else + error(1, errno, + "unknown flag %s", argv[arg]); + } + + rta = (void *)(data + off); + rta->rta_type = MPTCP_PM_ADDR_ATTR_FLAGS; + rta->rta_len = RTA_LENGTH(4); + memcpy(RTA_DATA(rta), &flags, 4); + off += NLMSG_ALIGN(rta->rta_len); + } else if (!strcmp(argv[arg], "id")) { + if (++arg >= argc) + error(1, 0, " missing id value"); + + id = atoi(argv[arg]); + rta = (void *)(data + off); + rta->rta_type = MPTCP_PM_ADDR_ATTR_ID; + rta->rta_len = RTA_LENGTH(1); + memcpy(RTA_DATA(rta), &id, 1); + off += NLMSG_ALIGN(rta->rta_len); + } else if (!strcmp(argv[arg], "dev")) { + int32_t ifindex; + + if (++arg >= argc) + error(1, 0, " missing dev name"); + + ifindex = if_nametoindex(argv[arg]); + if (!ifindex) + error(1, errno, "unknown device %s", argv[arg]); + + rta = (void *)(data + off); + rta->rta_type = MPTCP_PM_ADDR_ATTR_IF_IDX; + rta->rta_len = RTA_LENGTH(4); + memcpy(RTA_DATA(rta), &ifindex, 4); + off += NLMSG_ALIGN(rta->rta_len); + } else + error(1, 0, "unknown keyword %s", argv[arg]); + } + nest->rta_len = off - nest_start; + + do_nl_req(fd, nh, off, 0); + return 0; +} + +int del_addr(int fd, int pm_family, int argc, char *argv[]) +{ + char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) + + NLMSG_ALIGN(sizeof(struct genlmsghdr)) + + 1024]; + struct rtattr *rta, *nest; + struct nlmsghdr *nh; + int nest_start; + u_int8_t id; + int off = 0; + + memset(data, 0, sizeof(data)); + nh = (void *)data; + off = init_genl_req(data, pm_family, MPTCP_PM_CMD_DEL_ADDR, + MPTCP_PM_VER); + + /* the only argument is the address id */ + if (argc != 3) + syntax(argv); + + id = atoi(argv[2]); + + nest_start = off; + nest = (void *)(data + off); + nest->rta_type = NLA_F_NESTED | MPTCP_PM_ATTR_ADDR; + nest->rta_len = RTA_LENGTH(0); + off += NLMSG_ALIGN(nest->rta_len); + + /* build a dummy addr with only the ID set */ + rta = (void *)(data + off); + rta->rta_type = MPTCP_PM_ADDR_ATTR_ID; + rta->rta_len = RTA_LENGTH(1); + memcpy(RTA_DATA(rta), &id, 1); + off += NLMSG_ALIGN(rta->rta_len); + nest->rta_len = off - nest_start; + + do_nl_req(fd, nh, off, 0); + return 0; +} + +static void print_addr(struct rtattr *attrs, int len) +{ + uint16_t family = 0; + char str[1024]; + uint32_t flags; + uint8_t id; + + while (RTA_OK(attrs, len)) { + if (attrs->rta_type == MPTCP_PM_ADDR_ATTR_FAMILY) + memcpy(&family, RTA_DATA(attrs), 2); + if (attrs->rta_type == MPTCP_PM_ADDR_ATTR_ADDR4) { + if (family != AF_INET) + error(1, errno, "wrong IP (v4) for family %d", + family); + inet_ntop(AF_INET, RTA_DATA(attrs), str, sizeof(str)); + printf("%s ", str); + } + if (attrs->rta_type == MPTCP_PM_ADDR_ATTR_ADDR6) { + if (family != AF_INET6) + error(1, errno, "wrong IP (v6) for family %d", + family); + inet_ntop(AF_INET6, RTA_DATA(attrs), str, sizeof(str)); + printf("%s ", str); + } + if (attrs->rta_type == MPTCP_PM_ADDR_ATTR_ID) { + memcpy(&id, RTA_DATA(attrs), 1); + printf("id %d ", id); + } + if (attrs->rta_type == MPTCP_PM_ADDR_ATTR_FLAGS) { + memcpy(&flags, RTA_DATA(attrs), 4); + + printf("flags "); + if (flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { + printf("signal"); + flags &= ~MPTCP_PM_ADDR_FLAG_SIGNAL; + if (flags) + printf(","); + } + + if (flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { + printf("subflow"); + flags &= ~MPTCP_PM_ADDR_FLAG_SUBFLOW; + if (flags) + printf(","); + } + + if (flags & MPTCP_PM_ADDR_FLAG_BACKUP) { + printf("backup"); + flags &= ~MPTCP_PM_ADDR_FLAG_BACKUP; + if (flags) + printf(","); + } + + /* bump unknown flags, if any */ + if (flags) + printf("0x%x", flags); + printf(" "); + } + if (attrs->rta_type == MPTCP_PM_ADDR_ATTR_IF_IDX) { + char name[IF_NAMESIZE], *ret; + int32_t ifindex; + + memcpy(&ifindex, RTA_DATA(attrs), 4); + ret = if_indextoname(ifindex, name); + if (ret) + printf("dev %s ", ret); + else + printf("dev unknown/%d", ifindex); + } + + attrs = RTA_NEXT(attrs, len); + } + printf("\n"); +} + +static void print_addrs(struct nlmsghdr *nh, int pm_family, int total_len) +{ + struct rtattr *attrs; + + for (; NLMSG_OK(nh, total_len); nh = NLMSG_NEXT(nh, total_len)) { + int len = nh->nlmsg_len; + + if (nh->nlmsg_type == NLMSG_DONE) + break; + if (nh->nlmsg_type == NLMSG_ERROR) + nl_error(nh); + if (nh->nlmsg_type != pm_family) + continue; + + len -= NLMSG_LENGTH(GENL_HDRLEN); + attrs = (struct rtattr *) ((char *) NLMSG_DATA(nh) + + GENL_HDRLEN); + while (RTA_OK(attrs, len)) { + if (attrs->rta_type == + (MPTCP_PM_ATTR_ADDR | NLA_F_NESTED)) + print_addr((void *)RTA_DATA(attrs), + attrs->rta_len); + attrs = RTA_NEXT(attrs, len); + } + } +} + +int get_addr(int fd, int pm_family, int argc, char *argv[]) +{ + char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) + + NLMSG_ALIGN(sizeof(struct genlmsghdr)) + + 1024]; + struct rtattr *rta, *nest; + struct nlmsghdr *nh; + int nest_start; + u_int8_t id; + int off = 0; + + memset(data, 0, sizeof(data)); + nh = (void *)data; + off = init_genl_req(data, pm_family, MPTCP_PM_CMD_GET_ADDR, + MPTCP_PM_VER); + + /* the only argument is the address id */ + if (argc != 3) + syntax(argv); + + id = atoi(argv[2]); + + nest_start = off; + nest = (void *)(data + off); + nest->rta_type = NLA_F_NESTED | MPTCP_PM_ATTR_ADDR; + nest->rta_len = RTA_LENGTH(0); + off += NLMSG_ALIGN(nest->rta_len); + + /* build a dummy addr with only the ID set */ + rta = (void *)(data + off); + rta->rta_type = MPTCP_PM_ADDR_ATTR_ID; + rta->rta_len = RTA_LENGTH(1); + memcpy(RTA_DATA(rta), &id, 1); + off += NLMSG_ALIGN(rta->rta_len); + nest->rta_len = off - nest_start; + + print_addrs(nh, pm_family, do_nl_req(fd, nh, off, sizeof(data))); + return 0; +} + +int dump_addrs(int fd, int pm_family, int argc, char *argv[]) +{ + char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) + + NLMSG_ALIGN(sizeof(struct genlmsghdr)) + + 1024]; + pid_t pid = getpid(); + struct nlmsghdr *nh; + int off = 0; + + memset(data, 0, sizeof(data)); + nh = (void *)data; + off = init_genl_req(data, pm_family, MPTCP_PM_CMD_GET_ADDR, + MPTCP_PM_VER); + nh->nlmsg_flags |= NLM_F_DUMP; + nh->nlmsg_seq = 1; + nh->nlmsg_pid = pid; + nh->nlmsg_len = off; + + print_addrs(nh, pm_family, do_nl_req(fd, nh, off, sizeof(data))); + return 0; +} + +int flush_addrs(int fd, int pm_family, int argc, char *argv[]) +{ + char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) + + NLMSG_ALIGN(sizeof(struct genlmsghdr)) + + 1024]; + struct nlmsghdr *nh; + int off = 0; + + memset(data, 0, sizeof(data)); + nh = (void *)data; + off = init_genl_req(data, pm_family, MPTCP_PM_CMD_FLUSH_ADDRS, + MPTCP_PM_VER); + + do_nl_req(fd, nh, off, 0); + return 0; +} + +static void print_limits(struct nlmsghdr *nh, int pm_family, int total_len) +{ + struct rtattr *attrs; + uint32_t max; + + for (; NLMSG_OK(nh, total_len); nh = NLMSG_NEXT(nh, total_len)) { + int len = nh->nlmsg_len; + + if (nh->nlmsg_type == NLMSG_DONE) + break; + if (nh->nlmsg_type == NLMSG_ERROR) + nl_error(nh); + if (nh->nlmsg_type != pm_family) + continue; + + len -= NLMSG_LENGTH(GENL_HDRLEN); + attrs = (struct rtattr *) ((char *) NLMSG_DATA(nh) + + GENL_HDRLEN); + while (RTA_OK(attrs, len)) { + int type = attrs->rta_type; + + if (type != MPTCP_PM_ATTR_RCV_ADD_ADDRS && + type != MPTCP_PM_ATTR_SUBFLOWS) + goto next; + + memcpy(&max, RTA_DATA(attrs), 4); + printf("%s %u\n", type == MPTCP_PM_ATTR_SUBFLOWS ? + "subflows" : "accept", max); + +next: + attrs = RTA_NEXT(attrs, len); + } + } +} + +int get_set_limits(int fd, int pm_family, int argc, char *argv[]) +{ + char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) + + NLMSG_ALIGN(sizeof(struct genlmsghdr)) + + 1024]; + uint32_t rcv_addr = 0, subflows = 0; + int cmd, len = sizeof(data); + struct nlmsghdr *nh; + int off = 0; + + /* limit */ + if (argc == 4) { + rcv_addr = atoi(argv[2]); + subflows = atoi(argv[3]); + cmd = MPTCP_PM_CMD_SET_LIMITS; + } else { + cmd = MPTCP_PM_CMD_GET_LIMITS; + } + + memset(data, 0, sizeof(data)); + nh = (void *)data; + off = init_genl_req(data, pm_family, cmd, MPTCP_PM_VER); + + /* limit */ + if (cmd == MPTCP_PM_CMD_SET_LIMITS) { + struct rtattr *rta = (void *)(data + off); + + rta->rta_type = MPTCP_PM_ATTR_RCV_ADD_ADDRS; + rta->rta_len = RTA_LENGTH(4); + memcpy(RTA_DATA(rta), &rcv_addr, 4); + off += NLMSG_ALIGN(rta->rta_len); + + rta = (void *)(data + off); + rta->rta_type = MPTCP_PM_ATTR_SUBFLOWS; + rta->rta_len = RTA_LENGTH(4); + memcpy(RTA_DATA(rta), &subflows, 4); + off += NLMSG_ALIGN(rta->rta_len); + + /* do not expect a reply */ + len = 0; + } + + len = do_nl_req(fd, nh, off, len); + if (cmd == MPTCP_PM_CMD_GET_LIMITS) + print_limits(nh, pm_family, len); + return 0; +} + +int main(int argc, char *argv[]) +{ + int fd, pm_family; + + if (argc < 2) + syntax(argv); + + fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC); + if (fd == -1) + error(1, errno, "socket netlink"); + + pm_family = resolve_mptcp_pm_netlink(fd); + + if (!strcmp(argv[1], "add")) + return add_addr(fd, pm_family, argc, argv); + else if (!strcmp(argv[1], "del")) + return del_addr(fd, pm_family, argc, argv); + else if (!strcmp(argv[1], "flush")) + return flush_addrs(fd, pm_family, argc, argv); + else if (!strcmp(argv[1], "get")) + return get_addr(fd, pm_family, argc, argv); + else if (!strcmp(argv[1], "dump")) + return dump_addrs(fd, pm_family, argc, argv); + else if (!strcmp(argv[1], "limits")) + return get_set_limits(fd, pm_family, argc, argv); + + fprintf(stderr, "unknown sub-command: %s", argv[1]); + syntax(argv); + return 0; +}