@@ -151,6 +151,7 @@ struct tcp_request_sock {
bool tfo_listener;
bool is_mptcp;
u8 accecn_ok : 1,
+ saw_accecn_opt : 3,
syn_ect_snt: 2,
syn_ect_rcv: 2;
u32 txhash;
@@ -252,6 +253,7 @@ struct tcp_sock {
u8 compressed_ack;
u8 syn_ect_snt:2, /* AccECN ECT memory, only */
syn_ect_rcv:2, /* ... needed durign 3WHS + first seqno */
+ saw_accecn_opt:3, /* A valid AccECN option was seen */
ecn_fail:1; /* ECN reflector detected path mangling */
u32 chrono_start; /* Start time in jiffies of a TCP chrono */
u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */
@@ -226,6 +226,14 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
TCP_ACCECN_NUMCOUNTERS)
#define TCP_ACCECN_BEACON_FREQ_SHIFT 2 /* Send option at least 2^2 times per RTT */
+/* tp->saw_accecn_opt states; EMPTY_SEEN and ORDERBIT share bit 0 */
+#define TCP_ACCECN_OPT_EMPTY_SEEN 0x1 /* option without a counter was seen */
+#define TCP_ACCECN_OPT_ORDERBIT 0x1 /* bit 23 of the first counter seen */
+#define TCP_ACCECN_OPT_COUNTER_SEEN 0x2 /* option with a non-zero first counter was seen */
+#define TCP_ACCECN_OPT_SEEN (TCP_ACCECN_OPT_COUNTER_SEEN | \
+ TCP_ACCECN_OPT_EMPTY_SEEN) /* mask: some option was seen */
+#define TCP_ACCECN_OPT_FAIL 0x4 /* first counter was zero, or no option by 2^23 - 1 bytes sent */
+
/* Flags in tp->nonagle */
#define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */
#define TCP_NAGLE_CORK 2 /* Socket is corked */
@@ -443,6 +451,7 @@ static inline u32 tcp_accecn_ace_deficit(const struct tcp_sock *tp)
bool tcp_accecn_validate_syn_feedback(struct sock *sk, u8 ace, u8 sent_ect);
void tcp_accecn_third_ack(struct sock *sk, const struct sk_buff *skb,
u8 syn_ect_snt);
+u8 tcp_accecn_option_init(const struct sk_buff *skb, u8 opt_offset);
void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb,
u32 payload_len);
@@ -885,6 +894,7 @@ static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb)
*/
#define TCP_ACCECN_CEP_INIT_OFFSET 5
#define TCP_ACCECN_E1B_INIT_OFFSET 0
+#define TCP_ACCECN_E1B_FIRST_INIT_OFFSET 0x800001 /* bit 23 set; first-field offset used when the order bit is set */
#define TCP_ACCECN_E0B_INIT_OFFSET 1
#define TCP_ACCECN_CEB_INIT_OFFSET 0
@@ -2624,6 +2624,7 @@ int tcp_disconnect(struct sock *sk, int flags)
tp->window_clamp = 0;
tp->delivered = 0;
tp->delivered_ce = 0;
+ tp->saw_accecn_opt = 0;
tp->ecn_fail = 0;
tcp_accecn_init_counters(tp);
tp->prev_ecnfield = 0;
@@ -352,7 +352,8 @@ bool tcp_accecn_validate_syn_feedback(struct sock *sk, u8 ace, u8 sent_ect)
}
/* See Table 2 of the AccECN draft */
-static void tcp_ecn_rcv_synack(struct sock *sk, const struct tcphdr *th,
+static void tcp_ecn_rcv_synack(struct sock *sk, const struct sk_buff *skb,
+ const struct tcphdr *th,
u8 ip_dsfield)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -372,7 +373,12 @@ static void tcp_ecn_rcv_synack(struct sock *sk, const struct tcphdr *th,
default:
tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN);
tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK;
- tp->accecn_opt_demand = 2;
+ if (tp->rx_opt.accecn >= 0 &&
+ tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) {
+ tp->saw_accecn_opt = tcp_accecn_option_init(skb,
+ tp->rx_opt.accecn);
+ tp->accecn_opt_demand = 2;
+ }
if (tcp_accecn_validate_syn_feedback(sk, ace, tp->syn_ect_snt) &&
INET_ECN_is_ce(ip_dsfield))
tp->received_ce++;
@@ -436,7 +442,19 @@ static bool tcp_accecn_process_option(struct tcp_sock *tp,
bool first_changed = false;
bool res;
+ if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL)
+ return false;
+
if (tp->rx_opt.accecn < 0) {
+ if (!tp->saw_accecn_opt) {
+ /* Too late to enable after this point due to
+ * potential counter wraps
+ */
+ if (tp->bytes_sent >= (1 << 23) - 1)
+ tp->saw_accecn_opt = TCP_ACCECN_OPT_FAIL;
+ return false;
+ }
+
if (tp->estimate_ecnfield) {
tp->delivered_ecn_bytes[tp->estimate_ecnfield - 1] +=
delivered_bytes;
@@ -453,11 +471,20 @@ static bool tcp_accecn_process_option(struct tcp_sock *tp,
}
ptr += 2;
+ if (tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN)
+ tp->saw_accecn_opt = tcp_accecn_option_init(skb,
+ tp->rx_opt.accecn);
+
res = !!tp->estimate_ecnfield;
for (i = 0; i < 3; i++) {
if (optlen >= TCPOLEN_ACCECN_PERCOUNTER) {
- u8 ecnfield = accecn_opt_ecnfield[i];
- u32 init_offset = i ? 0 : TCP_ACCECN_E0B_INIT_OFFSET;
+ u8 orderbit = tp->saw_accecn_opt & TCP_ACCECN_OPT_ORDERBIT;
+ int idx = orderbit ? i : 2 - i;
+ u8 ecnfield = accecn_opt_ecnfield[idx];
+ u32 init_offset = i ? 0 :
+ !orderbit ?
+ TCP_ACCECN_E0B_INIT_OFFSET :
+ TCP_ACCECN_E1B_FIRST_INIT_OFFSET;
s32 delta;
delta = tcp_update_ecn_bytes(&(tp->delivered_ecn_bytes[ecnfield - 1]),
@@ -4188,6 +4215,7 @@ void tcp_parse_options(const struct net *net,
get_unaligned_be16(ptr) ==
TCPOPT_ACCECN_MAGIC)
opt_rx->accecn = (ptr - 2) - (unsigned char *)th;
+
/* Fast Open option shares code 254 using a
* 16 bits magic number.
*/
@@ -5836,7 +5864,12 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
if (th->syn) {
if (tcp_ecn_mode_accecn(tp)) {
send_accecn_reflector = true;
- tp->accecn_opt_demand = max_t(u8, 1, tp->accecn_opt_demand);
+ if (tp->rx_opt.accecn >= 0 &&
+ tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) {
+ tp->saw_accecn_opt = tcp_accecn_option_init(skb,
+ tp->rx_opt.accecn);
+ tp->accecn_opt_demand = max_t(u8, 1, tp->accecn_opt_demand);
+ }
}
syn_challenge:
if (syn_inerr)
@@ -6279,7 +6312,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
*/
if (tcp_ecn_mode_any(tp))
- tcp_ecn_rcv_synack(sk, th, TCP_SKB_CB(skb)->ip_dsfield);
+ tcp_ecn_rcv_synack(sk, skb, th, TCP_SKB_CB(skb)->ip_dsfield);
tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
tcp_try_undo_spurious_syn(sk);
@@ -6812,6 +6845,7 @@ static void tcp_openreq_init(struct request_sock *req,
tcp_rsk(req)->snt_synack = 0;
tcp_rsk(req)->last_oow_ack_time = 0;
tcp_rsk(req)->accecn_ok = 0;
+ tcp_rsk(req)->saw_accecn_opt = 0;
tcp_rsk(req)->syn_ect_rcv = 0;
tcp_rsk(req)->syn_ect_snt = 0;
req->mss = rx_opt->mss_clamp;
@@ -97,6 +97,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
bool paws_reject = false;
tmp_opt.saw_tstamp = 0;
+ tmp_opt.accecn = -1;
if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
tcp_parse_options(twsk_net(tw), skb, &tmp_opt, 0, NULL);
@@ -437,6 +438,7 @@ static void tcp_ecn_openreq_child(struct sock *sk,
tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN);
tp->syn_ect_snt = treq->syn_ect_snt;
tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt);
+ tp->saw_accecn_opt = treq->saw_accecn_opt;
tp->prev_ecnfield = treq->syn_ect_rcv;
tp->accecn_opt_demand = 1;
tcp_ecn_received_counters(sk, skb, skb->len - th->doff * 4);
@@ -491,6 +493,32 @@ static void smc_check_reset_syn_req(struct tcp_sock *oldtp,
#endif
}
+/* Derive a saw_accecn_opt state from the AccECN option at opt_offset in skb */
+u8 tcp_accecn_option_init(const struct sk_buff *skb, u8 opt_offset)
+{
+	unsigned char *opt = skb_transport_header(skb) + opt_offset;
+	/* The experimental kind carries two extra bytes after kind and length */
+	unsigned int magic_len = (opt[0] == TCPOPT_EXP) ? 2 : 0;
+	unsigned int remaining = opt[1] - magic_len;
+	u32 counter;
+
+	/* Too short to hold even one byte counter: report an empty option */
+	if (remaining < TCPOLEN_ACCECN_PERCOUNTER)
+		return TCP_ACCECN_OPT_EMPTY_SEEN;
+
+	/* Load the 24-bit first counter that follows the option header */
+	opt += 2 + magic_len;
+	counter = get_unaligned_be32(opt - 1) & 0xFFFFFFU;
+
+	/* An all-zero first counter is treated as option zeroing */
+	if (!counter)
+		return TCP_ACCECN_OPT_FAIL;
+
+	/* Bit 23 of the first counter is carried in the state as the order
+	 * bit, on top of TCP_ACCECN_OPT_COUNTER_SEEN.
+	 */
+	return TCP_ACCECN_OPT_COUNTER_SEEN + (u8)(counter >> 23);
+}
+
/* This is not only more efficient than what we used to do, it eliminates
* a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
*
@@ -793,6 +821,10 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
if (!(flg & TCP_FLAG_ACK))
return NULL;
+ if (tcp_rsk(req)->accecn_ok && tmp_opt.accecn >= 0 &&
+ tcp_rsk(req)->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN)
+ tcp_rsk(req)->saw_accecn_opt = tcp_accecn_option_init(skb, tmp_opt.accecn);
+
/* For Fast Open no more processing is needed (sk is the
* child socket).
*/
@@ -841,6 +841,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
/* Simultaneous open SYN/ACK needs AccECN option but not SYN */
if (unlikely((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) &&
tcp_ecn_mode_accecn(tp) &&
+ inet_csk(sk)->icsk_retransmits < 2 &&
!(sock_net(sk)->ipv4.sysctl_tcp_ecn & TCP_ACCECN_NO_OPT) &&
(remaining >= TCPOLEN_EXP_ACCECN_BASE))) {
opts->ecn_bytes = synack_ecn_bytes;
@@ -914,7 +915,7 @@ static unsigned int tcp_synack_options(const struct sock *sk,
smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
- if (treq->accecn_ok &&
+ if (treq->accecn_ok && req->num_timeout < 1 &&
!(sock_net(sk)->ipv4.sysctl_tcp_ecn & TCP_ACCECN_NO_OPT) &&
(remaining >= TCPOLEN_EXP_ACCECN_BASE)) {
opts->ecn_bytes = synack_ecn_bytes;
@@ -990,6 +991,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
}
if (tcp_ecn_mode_accecn(tp) &&
+ (tp->saw_accecn_opt & TCP_ACCECN_OPT_SEEN) &&
!(sock_net(sk)->ipv4.sysctl_tcp_ecn & TCP_ACCECN_NO_OPT)) {
if (tp->accecn_opt_demand ||
(tcp_stamp_us_delta(tp->tcp_mstamp, tp->accecn_opt_tstamp) >=