Message ID | 20210121004148.2340206-3-arjunroy.kdev@gmail.com |
---|---|
State | New |
Headers | show |
Series | tcp: add CMSG+rx timestamps to rx. zerocopy | expand |
On Wed, 20 Jan 2021 16:41:48 -0800 Arjun Roy wrote: > diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h > index 768e93bd5b51..b216270105af 100644 > --- a/include/uapi/linux/tcp.h > +++ b/include/uapi/linux/tcp.h > @@ -353,5 +353,9 @@ struct tcp_zerocopy_receive { > __u64 copybuf_address; /* in: copybuf address (small reads) */ > __s32 copybuf_len; /* in/out: copybuf bytes avail/used or error */ > __u32 flags; /* in: flags */ > + __u64 msg_control; /* ancillary data */ > + __u64 msg_controllen; > + __u32 msg_flags; > + /* __u32 hole; Next we must add >1 u32 otherwise length checks fail. */ Well, let's hope nobody steps on this landmine.. :) Applied, thanks!
On 1/22/21 9:07 PM, Jakub Kicinski wrote: > On Wed, 20 Jan 2021 16:41:48 -0800 Arjun Roy wrote: >> diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h >> index 768e93bd5b51..b216270105af 100644 >> --- a/include/uapi/linux/tcp.h >> +++ b/include/uapi/linux/tcp.h >> @@ -353,5 +353,9 @@ struct tcp_zerocopy_receive { >> __u64 copybuf_address; /* in: copybuf address (small reads) */ >> __s32 copybuf_len; /* in/out: copybuf bytes avail/used or error */ >> __u32 flags; /* in: flags */ >> + __u64 msg_control; /* ancillary data */ >> + __u64 msg_controllen; >> + __u32 msg_flags; >> + /* __u32 hole; Next we must add >1 u32 otherwise length checks fail. */ > > Well, let's hope nobody steps on this landmine.. :) > Past suggestions were made to use anonymous declarations - e.g., __u32 :32; - as a way of reserving the space for future use. That or declare '__u32 resvd', check that it must be 0 and makes it available for later (either directly or with a union).
On Fri, Jan 22, 2021 at 10:55:45PM -0700, David Ahern wrote: > On 1/22/21 9:07 PM, Jakub Kicinski wrote: > > On Wed, 20 Jan 2021 16:41:48 -0800 Arjun Roy wrote: > >> diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h > >> index 768e93bd5b51..b216270105af 100644 > >> --- a/include/uapi/linux/tcp.h > >> +++ b/include/uapi/linux/tcp.h > >> @@ -353,5 +353,9 @@ struct tcp_zerocopy_receive { > >> __u64 copybuf_address; /* in: copybuf address (small reads) */ > >> __s32 copybuf_len; /* in/out: copybuf bytes avail/used or error */ > >> __u32 flags; /* in: flags */ > >> + __u64 msg_control; /* ancillary data */ > >> + __u64 msg_controllen; > >> + __u32 msg_flags; > >> + /* __u32 hole; Next we must add >1 u32 otherwise length checks fail. */ > > > > Well, let's hope nobody steps on this landmine.. :) > > > > Past suggestions were made to use anonymous declarations - e.g., __u32 > :32; - as a way of reserving the space for future use. That or declare > '__u32 resvd', check that it must be 0 and makes it available for later > (either directly or with a union). This is the schema (reserved field without union) used by the RDMA UAPIs from the beginning (>20 years already) and it works like a charm. Highly recommend :). Thanks >
On 1/24/21 11:15 PM, Leon Romanovsky wrote: > On Fri, Jan 22, 2021 at 10:55:45PM -0700, David Ahern wrote: >> On 1/22/21 9:07 PM, Jakub Kicinski wrote: >>> On Wed, 20 Jan 2021 16:41:48 -0800 Arjun Roy wrote: >>>> diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h >>>> index 768e93bd5b51..b216270105af 100644 >>>> --- a/include/uapi/linux/tcp.h >>>> +++ b/include/uapi/linux/tcp.h >>>> @@ -353,5 +353,9 @@ struct tcp_zerocopy_receive { >>>> __u64 copybuf_address; /* in: copybuf address (small reads) */ >>>> __s32 copybuf_len; /* in/out: copybuf bytes avail/used or error */ >>>> __u32 flags; /* in: flags */ >>>> + __u64 msg_control; /* ancillary data */ >>>> + __u64 msg_controllen; >>>> + __u32 msg_flags; >>>> + /* __u32 hole; Next we must add >1 u32 otherwise length checks fail. */ >>> >>> Well, let's hope nobody steps on this landmine.. :) >>> >> >> Past suggestions were made to use anonymous declarations - e.g., __u32 >> :32; - as a way of reserving the space for future use. That or declare >> '__u32 resvd', check that it must be 0 and makes it available for later >> (either directly or with a union). > > This is the schema (reserved field without union) used by the RDMA UAPIs from > the beginning (>20 years already) and it works like a charm. > > Highly recommend :). > agreed. Arjun: would you mind following up with a patch to make this hole explicit and usable for the next extension? Thanks,
On Mon, Feb 1, 2021 at 6:06 PM David Ahern <dsahern@gmail.com> wrote: > > On 1/24/21 11:15 PM, Leon Romanovsky wrote: > > On Fri, Jan 22, 2021 at 10:55:45PM -0700, David Ahern wrote: > >> On 1/22/21 9:07 PM, Jakub Kicinski wrote: > >>> On Wed, 20 Jan 2021 16:41:48 -0800 Arjun Roy wrote: > >>>> diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h > >>>> index 768e93bd5b51..b216270105af 100644 > >>>> --- a/include/uapi/linux/tcp.h > >>>> +++ b/include/uapi/linux/tcp.h > >>>> @@ -353,5 +353,9 @@ struct tcp_zerocopy_receive { > >>>> __u64 copybuf_address; /* in: copybuf address (small reads) */ > >>>> __s32 copybuf_len; /* in/out: copybuf bytes avail/used or error */ > >>>> __u32 flags; /* in: flags */ > >>>> + __u64 msg_control; /* ancillary data */ > >>>> + __u64 msg_controllen; > >>>> + __u32 msg_flags; > >>>> + /* __u32 hole; Next we must add >1 u32 otherwise length checks fail. */ > >>> > >>> Well, let's hope nobody steps on this landmine.. :) > >>> > >> > >> Past suggestions were made to use anonymous declarations - e.g., __u32 > >> :32; - as a way of reserving the space for future use. That or declare > >> '__u32 resvd', check that it must be 0 and makes it available for later > >> (either directly or with a union). > > > > This is the schema (reserved field without union) used by the RDMA UAPIs from > > the beginning (>20 years already) and it works like a charm. > > > > Highly recommend :). > > > > agreed. > > Arjun: would you mind following up with a patch to make this hole > explicit and usable for the next extension? Thanks, Will do. -Arjun
On Mon, Feb 01, 2021 at 06:20:23PM -0800, Arjun Roy wrote: > On Mon, Feb 1, 2021 at 6:06 PM David Ahern <dsahern@gmail.com> wrote: > > > > On 1/24/21 11:15 PM, Leon Romanovsky wrote: > > > On Fri, Jan 22, 2021 at 10:55:45PM -0700, David Ahern wrote: > > >> On 1/22/21 9:07 PM, Jakub Kicinski wrote: > > >>> On Wed, 20 Jan 2021 16:41:48 -0800 Arjun Roy wrote: > > >>>> diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h > > >>>> index 768e93bd5b51..b216270105af 100644 > > >>>> --- a/include/uapi/linux/tcp.h > > >>>> +++ b/include/uapi/linux/tcp.h > > >>>> @@ -353,5 +353,9 @@ struct tcp_zerocopy_receive { > > >>>> __u64 copybuf_address; /* in: copybuf address (small reads) */ > > >>>> __s32 copybuf_len; /* in/out: copybuf bytes avail/used or error */ > > >>>> __u32 flags; /* in: flags */ > > >>>> + __u64 msg_control; /* ancillary data */ > > >>>> + __u64 msg_controllen; > > >>>> + __u32 msg_flags; > > >>>> + /* __u32 hole; Next we must add >1 u32 otherwise length checks fail. */ > > >>> > > >>> Well, let's hope nobody steps on this landmine.. :) > > >>> > > >> > > >> Past suggestions were made to use anonymous declarations - e.g., __u32 > > >> :32; - as a way of reserving the space for future use. That or declare > > >> '__u32 resvd', check that it must be 0 and makes it available for later > > >> (either directly or with a union). > > > > > > This is the schema (reserved field without union) used by the RDMA UAPIs from > > > the beginning (>20 years already) and it works like a charm. > > > > > > Highly recommend :). > > > > > > > agreed. > > > > Arjun: would you mind following up with a patch to make this hole > > explicit and usable for the next extension? Thanks, > > Will do. Please pay attention that all "in" and "out" fields that marked as reserved should be zeroed and kernel must check "in" field to ensure future compatibility. Thanks > > -Arjun
On Mon, Feb 1, 2021 at 10:52 PM Leon Romanovsky <leon@kernel.org> wrote: > > On Mon, Feb 01, 2021 at 06:20:23PM -0800, Arjun Roy wrote: > > On Mon, Feb 1, 2021 at 6:06 PM David Ahern <dsahern@gmail.com> wrote: > > > > > > On 1/24/21 11:15 PM, Leon Romanovsky wrote: > > > > On Fri, Jan 22, 2021 at 10:55:45PM -0700, David Ahern wrote: > > > >> On 1/22/21 9:07 PM, Jakub Kicinski wrote: > > > >>> On Wed, 20 Jan 2021 16:41:48 -0800 Arjun Roy wrote: > > > >>>> diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h > > > >>>> index 768e93bd5b51..b216270105af 100644 > > > >>>> --- a/include/uapi/linux/tcp.h > > > >>>> +++ b/include/uapi/linux/tcp.h > > > >>>> @@ -353,5 +353,9 @@ struct tcp_zerocopy_receive { > > > >>>> __u64 copybuf_address; /* in: copybuf address (small reads) */ > > > >>>> __s32 copybuf_len; /* in/out: copybuf bytes avail/used or error */ > > > >>>> __u32 flags; /* in: flags */ > > > >>>> + __u64 msg_control; /* ancillary data */ > > > >>>> + __u64 msg_controllen; > > > >>>> + __u32 msg_flags; > > > >>>> + /* __u32 hole; Next we must add >1 u32 otherwise length checks fail. */ > > > >>> > > > >>> Well, let's hope nobody steps on this landmine.. :) > > > >>> > > > >> > > > >> Past suggestions were made to use anonymous declarations - e.g., __u32 > > > >> :32; - as a way of reserving the space for future use. That or declare > > > >> '__u32 resvd', check that it must be 0 and makes it available for later > > > >> (either directly or with a union). > > > > > > > > This is the schema (reserved field without union) used by the RDMA UAPIs from > > > > the beginning (>20 years already) and it works like a charm. > > > > > > > > Highly recommend :). > > > > > > > > > > agreed. > > > > > > Arjun: would you mind following up with a patch to make this hole > > > explicit and usable for the next extension? Thanks, > > > > Will do. > > Please pay attention that all "in" and "out" fields that marked as reserved > should be zeroed and kernel must check "in" field to ensure future compatibility. > > Thanks > A question about the approach where we mandate it as a reserved field; assuming in the future it is only used as an OUT field where 0 is a meaningful no-op value, then just setting it to 0 works just fine. But, if it's an IN or IN-OUT field, it seems like mandating that the application set it to 0 could break the case where a future application sets it to some non-zero value and runs on an older kernel. And allowing it to be non-zero can maybe yield an unexpected outcome if an old application that did not zero it runs on a newer kernel. So: maybe the right move is to mark it as reserved, not care what the input value is, always set it to 0 before returning to the user, and explicitly mandate that any future use of the field must be as an OUT-only parameter? Thanks, -Arjun > > > > -Arjun
On Thu, 4 Feb 2021 15:03:40 -0800 Arjun Roy wrote: > But, if it's an IN or IN-OUT field, it seems like mandating that the > application set it to 0 could break the case where a future > application sets it to some non-zero value and runs on an older > kernel. That usually works fine in practice, 0 means "do what old kernels did / feature not requested", then if newer userspace sets the field to non-0 that means it requires a feature the kernel doesn't support. So -EINVAL / -EOPNOTSUPP is right. BPF syscall has been successfully doing this since day 1, I'm not aware of any major snags. > And allowing it to be non-zero can maybe yield an unexpected > outcome if an old application that did not zero it runs on a newer > kernel. Could you refresh our memory as to why we can't require the application to pass zero-ed memory to TCP ZC? In practice is there are max reasonable length of the argument that such legacy application may pass so that we can start checking at a certain offset? > So: maybe the right move is to mark it as reserved, not care what the > input value is, always set it to 0 before returning to the user, and > explicitly mandate that any future use of the field must be as an > OUT-only parameter?
On Thu, Feb 4, 2021 at 4:00 PM Jakub Kicinski <kuba@kernel.org> wrote: > > On Thu, 4 Feb 2021 15:03:40 -0800 Arjun Roy wrote: > > But, if it's an IN or IN-OUT field, it seems like mandating that the > > application set it to 0 could break the case where a future > > application sets it to some non-zero value and runs on an older > > kernel. > > That usually works fine in practice, 0 means "do what old kernels did / > feature not requested", then if newer userspace sets the field to non-0 > that means it requires a feature the kernel doesn't support. So -EINVAL > / -EOPNOTSUPP is right. BPF syscall has been successfully doing this > since day 1, I'm not aware of any major snags. > Alright, sounds good. > > And allowing it to be non-zero can maybe yield an unexpected > > outcome if an old application that did not zero it runs on a newer > > kernel. > > Could you refresh our memory as to why we can't require the application > to pass zero-ed memory to TCP ZC? In practice is there are max > reasonable length of the argument that such legacy application may pass > so that we can start checking at a certain offset? > Actually I think that's fine. We have hitherto been just using length checks to distinguish between feature capability for rx. zerocopy but I think we can draw the line at this point (now that there's ambiguity) and start requiring zero'd memory. I will send out a patch soon; reserved u32 field, must be set to 0 for the current kernel, can be non-zero and in/out in future kernels as discussed. Thanks, -Arjun > > So: maybe the right move is to mark it as reserved, not care what the > > input value is, always set it to 0 before returning to the user, and > > explicitly mandate that any future use of the field must be as an > > OUT-only parameter? >
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 768e93bd5b51..b216270105af 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -353,5 +353,9 @@ struct tcp_zerocopy_receive { __u64 copybuf_address; /* in: copybuf address (small reads) */ __s32 copybuf_len; /* in/out: copybuf bytes avail/used or error */ __u32 flags; /* in: flags */ + __u64 msg_control; /* ancillary data */ + __u64 msg_controllen; + __u32 msg_flags; + /* __u32 hole; Next we must add >1 u32 otherwise length checks fail. */ }; #endif /* _UAPI_LINUX_TCP_H */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 28ca6a024f63..0e6f9b8d9f43 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1745,6 +1745,20 @@ int tcp_set_rcvlowat(struct sock *sk, int val) } EXPORT_SYMBOL(tcp_set_rcvlowat); +static void tcp_update_recv_tstamps(struct sk_buff *skb, + struct scm_timestamping_internal *tss) +{ + if (skb->tstamp) + tss->ts[0] = ktime_to_timespec64(skb->tstamp); + else + tss->ts[0] = (struct timespec64) {0}; + + if (skb_hwtstamps(skb)->hwtstamp) + tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp); + else + tss->ts[2] = (struct timespec64) {0}; +} + #ifdef CONFIG_MMU static const struct vm_operations_struct tcp_vm_ops = { }; @@ -1848,13 +1862,13 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, struct scm_timestamping_internal *tss, int *cmsg_flags); static int receive_fallback_to_copy(struct sock *sk, - struct tcp_zerocopy_receive *zc, int inq) + struct tcp_zerocopy_receive *zc, int inq, + struct scm_timestamping_internal *tss) { unsigned long copy_address = (unsigned long)zc->copybuf_address; - struct scm_timestamping_internal tss_unused; - int err, cmsg_flags_unused; struct msghdr msg = {}; struct iovec iov; + int err; zc->length = 0; zc->recv_skip_hint = 0; @@ -1868,7 +1882,7 @@ static int receive_fallback_to_copy(struct sock *sk, return err; err = tcp_recvmsg_locked(sk, &msg, inq, /*nonblock=*/1, /*flags=*/0, - &tss_unused, &cmsg_flags_unused); + tss, &zc->msg_flags); if (err < 0) return err; @@ -1909,21 +1923,27 @@ static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc, return (__s32)copylen; } -static int tcp_zerocopy_handle_leftover_data(struct tcp_zerocopy_receive *zc, - struct sock *sk, - struct sk_buff *skb, - u32 *seq, - s32 copybuf_len) +static int tcp_zc_handle_leftover(struct tcp_zerocopy_receive *zc, + struct sock *sk, + struct sk_buff *skb, + u32 *seq, + s32 copybuf_len, + struct scm_timestamping_internal *tss) { u32 offset, copylen = min_t(u32, copybuf_len, zc->recv_skip_hint); if (!copylen) return 0; /* skb is null if inq < PAGE_SIZE. */ - if (skb) + if (skb) { offset = *seq - TCP_SKB_CB(skb)->seq; - else + } else { skb = tcp_recv_skb(sk, *seq, &offset); + if (TCP_SKB_CB(skb)->has_rxtstamp) { + tcp_update_recv_tstamps(skb, tss); + zc->msg_flags |= TCP_CMSG_TS; + } + } zc->copybuf_len = tcp_copy_straggler_data(zc, skb, copylen, &offset, seq); @@ -2010,9 +2030,37 @@ static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma, err); } +static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, + struct scm_timestamping_internal *tss); +static void tcp_zc_finalize_rx_tstamp(struct sock *sk, + struct tcp_zerocopy_receive *zc, + struct scm_timestamping_internal *tss) +{ + unsigned long msg_control_addr; + struct msghdr cmsg_dummy; + + msg_control_addr = (unsigned long)zc->msg_control; + cmsg_dummy.msg_control = (void *)msg_control_addr; + cmsg_dummy.msg_controllen = + (__kernel_size_t)zc->msg_controllen; + cmsg_dummy.msg_flags = in_compat_syscall() + ? MSG_CMSG_COMPAT : 0; + zc->msg_flags = 0; + if (zc->msg_control == msg_control_addr && + zc->msg_controllen == cmsg_dummy.msg_controllen) { + tcp_recv_timestamp(&cmsg_dummy, sk, tss); + zc->msg_control = (__u64) + ((uintptr_t)cmsg_dummy.msg_control); + zc->msg_controllen = + (__u64)cmsg_dummy.msg_controllen; + zc->msg_flags = (__u32)cmsg_dummy.msg_flags; + } +} + #define TCP_ZEROCOPY_PAGE_BATCH_SIZE 32 static int tcp_zerocopy_receive(struct sock *sk, - struct tcp_zerocopy_receive *zc) + struct tcp_zerocopy_receive *zc, + struct scm_timestamping_internal *tss) { u32 length = 0, offset, vma_len, avail_len, copylen = 0; unsigned long address = (unsigned long)zc->address; @@ -2029,6 +2077,7 @@ static int tcp_zerocopy_receive(struct sock *sk, int ret; zc->copybuf_len = 0; + zc->msg_flags = 0; if (address & (PAGE_SIZE - 1) || address != zc->address) return -EINVAL; @@ -2039,7 +2088,7 @@ static int tcp_zerocopy_receive(struct sock *sk, sock_rps_record_flow(sk); if (inq && inq <= copybuf_len) - return receive_fallback_to_copy(sk, zc, inq); + return receive_fallback_to_copy(sk, zc, inq, tss); if (inq < PAGE_SIZE) { zc->length = 0; @@ -2084,6 +2133,11 @@ static int tcp_zerocopy_receive(struct sock *sk, } else { skb = tcp_recv_skb(sk, seq, &offset); } + + if (TCP_SKB_CB(skb)->has_rxtstamp) { + tcp_update_recv_tstamps(skb, tss); + zc->msg_flags |= TCP_CMSG_TS; + } zc->recv_skip_hint = skb->len - offset; frags = skb_advance_to_frag(skb, offset, &offset_frag); if (!frags || offset_frag) @@ -2126,8 +2180,7 @@ static int tcp_zerocopy_receive(struct sock *sk, mmap_read_unlock(current->mm); /* Try to copy straggler data. */ if (!ret) - copylen = tcp_zerocopy_handle_leftover_data(zc, sk, skb, &seq, - copybuf_len); + copylen = tcp_zc_handle_leftover(zc, sk, skb, &seq, copybuf_len, tss); if (length + copylen) { WRITE_ONCE(tp->copied_seq, seq); @@ -2148,20 +2201,6 @@ static int tcp_zerocopy_receive(struct sock *sk, } #endif -static void tcp_update_recv_tstamps(struct sk_buff *skb, - struct scm_timestamping_internal *tss) -{ - if (skb->tstamp) - tss->ts[0] = ktime_to_timespec64(skb->tstamp); - else - tss->ts[0] = (struct timespec64) {0}; - - if (skb_hwtstamps(skb)->hwtstamp) - tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp); - else - tss->ts[2] = (struct timespec64) {0}; -} - /* Similar to __sock_recv_timestamp, but does not require an skb */ static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, struct scm_timestamping_internal *tss) @@ -4089,6 +4128,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, } #ifdef CONFIG_MMU case TCP_ZEROCOPY_RECEIVE: { + struct scm_timestamping_internal tss; struct tcp_zerocopy_receive zc = {}; int err; @@ -4104,11 +4144,18 @@ static int do_tcp_getsockopt(struct sock *sk, int level, if (copy_from_user(&zc, optval, len)) return -EFAULT; lock_sock(sk); - err = tcp_zerocopy_receive(sk, &zc); + err = tcp_zerocopy_receive(sk, &zc, &tss); release_sock(sk); - if (len >= offsetofend(struct tcp_zerocopy_receive, err)) - goto zerocopy_rcv_sk_err; + if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags)) + goto zerocopy_rcv_cmsg; switch (len) { + case offsetofend(struct tcp_zerocopy_receive, msg_flags): + goto zerocopy_rcv_cmsg; + case offsetofend(struct tcp_zerocopy_receive, msg_controllen): + case offsetofend(struct tcp_zerocopy_receive, msg_control): + case offsetofend(struct tcp_zerocopy_receive, flags): + case offsetofend(struct tcp_zerocopy_receive, copybuf_len): + case offsetofend(struct tcp_zerocopy_receive, copybuf_address): case offsetofend(struct tcp_zerocopy_receive, err): goto zerocopy_rcv_sk_err; case offsetofend(struct tcp_zerocopy_receive, inq): @@ -4117,6 +4164,11 @@ static int do_tcp_getsockopt(struct sock *sk, int level, default: goto zerocopy_rcv_out; } +zerocopy_rcv_cmsg: + if (zc.msg_flags & TCP_CMSG_TS) + tcp_zc_finalize_rx_tstamp(sk, &zc, &tss); + else + zc.msg_flags = 0; zerocopy_rcv_sk_err: if (!err) zc.err = sock_error(sk);