Message ID | 160109391820.6363.6475038352873960677.stgit@john-Precision-5820-Tower |
---|---|
Headers | show |
Series | Add skb_adjust_room() for SK_SKB | expand |
On 9/26/20 6:27 AM, John Fastabend wrote: > This implements a new helper skb_adjust_room() so users can push/pop > extra bytes from a BPF_SK_SKB_STREAM_VERDICT program. > > Some protocols may include headers and other information that we may > not want to include when doing a redirect from a BPF_SK_SKB_STREAM_VERDICT > program. One use case is to redirect TLS packets into a receive socket > that doesn't expect TLS data. In TLS case the first 13B or so contain the > protocol header. With KTLS the payload is decrypted so we should be able > to redirect this to a receiving socket, but the receiving socket may not > be expecting to receive a TLS header and discard the data. Using the > above helper we can pop the header off and put an appropriate header on > the payload. This allows for creating a proxy between protocols without > extra hops through the stack or userspace. > > So in order to fix this case add skb_adjust_room() so users can strip the > header. After this the user can strip the header and an unmodified receiver > thread will work correctly when data is redirected into the ingress path > of a sock. > > Signed-off-by: John Fastabend <john.fastabend@gmail.com> > --- > net/core/filter.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 51 insertions(+) > > diff --git a/net/core/filter.c b/net/core/filter.c > index 4d8dc7a31a78..d232358f1dcd 100644 > --- a/net/core/filter.c > +++ b/net/core/filter.c > @@ -76,6 +76,7 @@ > #include <net/bpf_sk_storage.h> > #include <net/transp_v6.h> > #include <linux/btf_ids.h> > +#include <net/tls.h> > > static const struct bpf_func_proto * > bpf_sk_base_func_proto(enum bpf_func_id func_id); > @@ -3218,6 +3219,53 @@ static u32 __bpf_skb_max_len(const struct sk_buff *skb) > SKB_MAX_ALLOC; > } > > +BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, > + u32, mode, u64, flags) > +{ > + unsigned int len_diff_abs = abs(len_diff); small nit: u32 > + bool shrink = len_diff < 0; > + int ret = 0; > + > + if (unlikely(flags)) > + return -EINVAL; Parameter 'mode' is not used here, I guess we need to reject anything non-zero? Similarly, any interaction wrt bpf_csum_level() that was needed back then for the bpf_skb_adjust_room()? > + if (unlikely(len_diff_abs > 0xfffU)) > + return -EFAULT; > + > + if (!shrink) { > + unsigned int grow = len_diff; nit: u32 or just directly len_diff? > + ret = skb_cow(skb, grow); > + if (likely(!ret)) { > + __skb_push(skb, len_diff_abs); > + memset(skb->data, 0, len_diff_abs); > + } > + } else { > + /* skb_ensure_writable() is not needed here, as we're > + * already working on an uncloned skb. > + */ > + if (unlikely(!pskb_may_pull(skb, len_diff_abs))) > + return -ENOMEM; > + __skb_pull(skb, len_diff_abs); > + } > + bpf_compute_data_end_sk_skb(skb); > + if (tls_sw_has_ctx_rx(skb->sk)) { > + struct strp_msg *rxm = strp_msg(skb); > + > + rxm->full_len += len_diff; If skb_cow() failed, we still adjust rxm->full_len? > + } > + return ret; > +} > + > +static const struct bpf_func_proto sk_skb_adjust_room_proto = { > + .func = sk_skb_adjust_room, > + .gpl_only = false, > + .ret_type = RET_INTEGER, > + .arg1_type = ARG_PTR_TO_CTX, > + .arg2_type = ARG_ANYTHING, > + .arg3_type = ARG_ANYTHING, > + .arg4_type = ARG_ANYTHING, > +}; > + > BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, > u32, mode, u64, flags) > { > @@ -6483,6 +6531,7 @@ bool bpf_helper_changes_pkt_data(void *func) > func == bpf_skb_change_tail || > func == sk_skb_change_tail || > func == bpf_skb_adjust_room || > + func == sk_skb_adjust_room || > func == bpf_skb_pull_data || > func == sk_skb_pull_data || > func == bpf_clone_redirect || > @@ -6950,6 +6999,8 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) > return &sk_skb_change_tail_proto; > case BPF_FUNC_skb_change_head: > return &sk_skb_change_head_proto; > + case BPF_FUNC_skb_adjust_room: > + return &sk_skb_adjust_room_proto; > case BPF_FUNC_get_socket_cookie: > return &bpf_get_socket_cookie_proto; > case BPF_FUNC_get_socket_uid: >
Daniel Borkmann wrote: > On 9/26/20 6:27 AM, John Fastabend wrote: > > This implements a new helper skb_adjust_room() so users can push/pop > > extra bytes from a BPF_SK_SKB_STREAM_VERDICT program. > > > > Some protocols may include headers and other information that we may > > not want to include when doing a redirect from a BPF_SK_SKB_STREAM_VERDICT > > program. One use case is to redirect TLS packets into a receive socket > > that doesn't expect TLS data. In TLS case the first 13B or so contain the > > protocol header. With KTLS the payload is decrypted so we should be able > > to redirect this to a receiving socket, but the receiving socket may not > > be expecting to receive a TLS header and discard the data. Using the > > above helper we can pop the header off and put an appropriate header on > > the payload. This allows for creating a proxy between protocols without > > extra hops through the stack or userspace. > > > > So in order to fix this case add skb_adjust_room() so users can strip the > > header. After this the user can strip the header and an unmodified receiver > > thread will work correctly when data is redirected into the ingress path > > of a sock. > > > > Signed-off-by: John Fastabend <john.fastabend@gmail.com> > > --- > > net/core/filter.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ > > 1 file changed, 51 insertions(+) > > > > diff --git a/net/core/filter.c b/net/core/filter.c > > index 4d8dc7a31a78..d232358f1dcd 100644 > > --- a/net/core/filter.c > > +++ b/net/core/filter.c > > @@ -76,6 +76,7 @@ > > #include <net/bpf_sk_storage.h> > > #include <net/transp_v6.h> > > #include <linux/btf_ids.h> > > +#include <net/tls.h> > > > > static const struct bpf_func_proto * > > bpf_sk_base_func_proto(enum bpf_func_id func_id); > > @@ -3218,6 +3219,53 @@ static u32 __bpf_skb_max_len(const struct sk_buff *skb) > > SKB_MAX_ALLOC; > > } > > > > +BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, > > + u32, mode, u64, flags) > > +{ > > + unsigned int len_diff_abs = abs(len_diff); > > small nit: u32 Sure. > > > + bool shrink = len_diff < 0; > > + int ret = 0; > > + > > + if (unlikely(flags)) > > + return -EINVAL; > > Parameter 'mode' is not used here, I guess we need to reject anything non-zero? Probably its not used. > > Similarly, any interaction wrt bpf_csum_level() that was needed back then for the > bpf_skb_adjust_room()? I don't believe so because we are above csum checks at this point. Either we will put the skb data in the receive_queue for the socket or redirect it into sendpage. > > > + if (unlikely(len_diff_abs > 0xfffU)) > > + return -EFAULT; > > + > > + if (!shrink) { > > + unsigned int grow = len_diff; > > nit: u32 or just directly len_diff? Just use len_diff missed when I cleaned this up. > > > + ret = skb_cow(skb, grow); > > + if (likely(!ret)) { > > + __skb_push(skb, len_diff_abs); > > + memset(skb->data, 0, len_diff_abs); > > + } > > + } else { > > + /* skb_ensure_writable() is not needed here, as we're > > + * already working on an uncloned skb. > > + */ > > + if (unlikely(!pskb_may_pull(skb, len_diff_abs))) > > + return -ENOMEM; > > + __skb_pull(skb, len_diff_abs); > > + } > > + bpf_compute_data_end_sk_skb(skb); > > + if (tls_sw_has_ctx_rx(skb->sk)) { > > + struct strp_msg *rxm = strp_msg(skb); > > + > > + rxm->full_len += len_diff; > > If skb_cow() failed, we still adjust rxm->full_len? Thanks. Will just return above on error like in the else branch. I'll send a v2 shortly.
On Sat, Sep 26, 2020 at 06:27 AM CEST, John Fastabend wrote: > This implements a new helper skb_adjust_room() so users can push/pop > extra bytes from a BPF_SK_SKB_STREAM_VERDICT program. > > Some protocols may include headers and other information that we may > not want to include when doing a redirect from a BPF_SK_SKB_STREAM_VERDICT > program. One use case is to redirect TLS packets into a receive socket > that doesn't expect TLS data. In TLS case the first 13B or so contain the > protocol header. With KTLS the payload is decrypted so we should be able > to redirect this to a receiving socket, but the receiving socket may not > be expecting to receive a TLS header and discard the data. Using the > above helper we can pop the header off and put an appropriate header on > the payload. This allows for creating a proxy between protocols without > extra hops through the stack or userspace. This is useful stuff. Apart from the TLS use-case, you might want to pop off proxy headers like PROXY v1/v2 (CC Marek): https://www.haproxy.org/download/1.8/doc/proxy-protocol.txt > > So in order to fix this case add skb_adjust_room() so users can strip the > header. After this the user can strip the header and an unmodified receiver > thread will work correctly when data is redirected into the ingress path > of a sock. > > Signed-off-by: John Fastabend <john.fastabend@gmail.com> > --- > net/core/filter.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 51 insertions(+) > > diff --git a/net/core/filter.c b/net/core/filter.c > index 4d8dc7a31a78..d232358f1dcd 100644 > --- a/net/core/filter.c > +++ b/net/core/filter.c > @@ -76,6 +76,7 @@ > #include <net/bpf_sk_storage.h> > #include <net/transp_v6.h> > #include <linux/btf_ids.h> > +#include <net/tls.h> > > static const struct bpf_func_proto * > bpf_sk_base_func_proto(enum bpf_func_id func_id); > @@ -3218,6 +3219,53 @@ static u32 __bpf_skb_max_len(const struct sk_buff *skb) > SKB_MAX_ALLOC; > } > > +BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, > + u32, mode, u64, flags) > +{ > + unsigned int len_diff_abs = abs(len_diff); > + bool shrink = len_diff < 0; > + int ret = 0; > + > + if (unlikely(flags)) > + return -EINVAL; > + if (unlikely(len_diff_abs > 0xfffU)) > + return -EFAULT; > + > + if (!shrink) { > + unsigned int grow = len_diff; > + > + ret = skb_cow(skb, grow); > + if (likely(!ret)) { > + __skb_push(skb, len_diff_abs); > + memset(skb->data, 0, len_diff_abs); > + } > + } else { > + /* skb_ensure_writable() is not needed here, as we're > + * already working on an uncloned skb. > + */ I'm trying to digest the above comment. What if: static int __strp_recv(…) { … while (eaten < orig_len) { /* Always clone since we will consume something */ skb = skb_clone(orig_skb, GFP_ATOMIC); … head = strp->skb_head; if (!head) { head = skb; … } else { … } … /* Give skb to upper layer */ strp->cb.rcv_msg(strp, head); // → sk_psock_init_strp … } … } That looks like a code path where we pass a cloned SKB. > + if (unlikely(!pskb_may_pull(skb, len_diff_abs))) > + return -ENOMEM; > + __skb_pull(skb, len_diff_abs); > + } > + bpf_compute_data_end_sk_skb(skb); > + if (tls_sw_has_ctx_rx(skb->sk)) { > + struct strp_msg *rxm = strp_msg(skb); > + > + rxm->full_len += len_diff; > + } > + return ret; > +} [...]
Jakub Sitnicki wrote: > On Sat, Sep 26, 2020 at 06:27 AM CEST, John Fastabend wrote: > > This implements a new helper skb_adjust_room() so users can push/pop > > extra bytes from a BPF_SK_SKB_STREAM_VERDICT program. > > > > Some protocols may include headers and other information that we may > > not want to include when doing a redirect from a BPF_SK_SKB_STREAM_VERDICT > > program. One use case is to redirect TLS packets into a receive socket > > that doesn't expect TLS data. In TLS case the first 13B or so contain the > > protocol header. With KTLS the payload is decrypted so we should be able > > to redirect this to a receiving socket, but the receiving socket may not > > be expecting to receive a TLS header and discard the data. Using the > > above helper we can pop the header off and put an appropriate header on > > the payload. This allows for creating a proxy between protocols without > > extra hops through the stack or userspace. > > This is useful stuff. Apart from the TLS use-case, you might want to pop > off proxy headers like PROXY v1/v2 (CC Marek): > > https://www.haproxy.org/download/1.8/doc/proxy-protocol.txt Great! > > > > > So in order to fix this case add skb_adjust_room() so users can strip the > > header. After this the user can strip the header and an unmodified receiver > > thread will work correctly when data is redirected into the ingress path > > of a sock. > > > > Signed-off-by: John Fastabend <john.fastabend@gmail.com> > > --- > > net/core/filter.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ > > 1 file changed, 51 insertions(+) > > > > diff --git a/net/core/filter.c b/net/core/filter.c > > index 4d8dc7a31a78..d232358f1dcd 100644 > > --- a/net/core/filter.c > > +++ b/net/core/filter.c > > @@ -76,6 +76,7 @@ > > #include <net/bpf_sk_storage.h> > > #include <net/transp_v6.h> > > #include <linux/btf_ids.h> > > +#include <net/tls.h> > > > > static const struct bpf_func_proto * > > bpf_sk_base_func_proto(enum bpf_func_id func_id); > > @@ -3218,6 +3219,53 @@ static u32 __bpf_skb_max_len(const struct sk_buff *skb) > > SKB_MAX_ALLOC; > > } > > > > +BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, > > + u32, mode, u64, flags) > > +{ > > + unsigned int len_diff_abs = abs(len_diff); > > + bool shrink = len_diff < 0; > > + int ret = 0; > > + > > + if (unlikely(flags)) > > + return -EINVAL; > > + if (unlikely(len_diff_abs > 0xfffU)) > > + return -EFAULT; > > + > > + if (!shrink) { > > + unsigned int grow = len_diff; > > + > > + ret = skb_cow(skb, grow); > > + if (likely(!ret)) { > > + __skb_push(skb, len_diff_abs); > > + memset(skb->data, 0, len_diff_abs); > > + } > > + } else { > > + /* skb_ensure_writable() is not needed here, as we're > > + * already working on an uncloned skb. > > + */ > > I'm trying to digest the above comment. What if: I'll delete the comment its not accurate. We happily write headers from verdict programs today. Do you have a specific concern or just noticing I was a bit careless and cut'n'pasted an incorrect comment around. > > static int __strp_recv(…) > { > … > while (eaten < orig_len) { > /* Always clone since we will consume something */ > skb = skb_clone(orig_skb, GFP_ATOMIC); > … > head = strp->skb_head; > if (!head) { > head = skb; > … > } else { > … > } > … > /* Give skb to upper layer */ > strp->cb.rcv_msg(strp, head); // → sk_psock_init_strp > … > } > … > } > > That looks like a code path where we pass a cloned SKB. Right but its there to cover the sk_eat_skb in tcp_read_sock() otherwise sk_eat_skb() -> __kfree_skb() -> skb_release_all() would go all the way to page_frag_free(). > > > + if (unlikely(!pskb_may_pull(skb, len_diff_abs))) > > + return -ENOMEM; > > + __skb_pull(skb, len_diff_abs); > > + } > > + bpf_compute_data_end_sk_skb(skb); > > + if (tls_sw_has_ctx_rx(skb->sk)) { > > + struct strp_msg *rxm = strp_msg(skb); > > + > > + rxm->full_len += len_diff; > > + } > > + return ret; > > +} > > [...]