Message ID | 20210624123005.1301761-4-dwmw2@infradead.org |
---|---|
State | New |
Headers | show |
Series | net: tun: fix tun_xdp_one() for IFF_TUN mode | expand |
在 2021/6/24 下午8:30, David Woodhouse 写道: > From: David Woodhouse <dwmw@amazon.co.uk> > > In tun_get_user(), skb->protocol is either taken from the tun_pi header > or inferred from the first byte of the packet in IFF_TUN mode, while > eth_type_trans() is called only in the IFF_TAP mode where the payload > is expected to be an Ethernet frame. > > The equivalent code path in tun_xdp_one() was unconditionally using > eth_type_trans(), which is the wrong thing to do in IFF_TUN mode and > corrupts packets. > > Pull the logic out to a separate tun_skb_set_protocol() function, and > call it from both tun_get_user() and tun_xdp_one(). > > XX: It is not entirely clear to me why it's OK to call eth_type_trans() > in some cases without first checking that enough of the Ethernet header > is linearly present by calling pskb_may_pull(). Looks like a bug. > Such a check was never > present in the tun_xdp_one() code path, and commit 96aa1b22bd6bb ("tun: > correct header offsets in napi frags mode") deliberately added it *only* > for the IFF_NAPI_FRAGS mode. We had already checked this in tun_get_user() before: if ((tun->flags & TUN_TYPE_MASK) == IFF_TAP) { align += NET_IP_ALIGN; if (unlikely(len < ETH_HLEN || (gso.hdr_len && tun16_to_cpu(tun, gso.hdr_len) < ETH_HLEN))) return -EINVAL; } > > I would like to see specific explanations of *why* it's ever valid and > necessary (is it so much faster?) to skip the pskb_may_pull() by setting > the 'no_pull_check' flag to tun_skb_set_protocol(), but for now I'll > settle for faithfully preserving the existing behaviour and pretending > it's someone else's problem. > > Cc: Willem de Bruijn <willemb@google.com> > Fixes: 043d222f93ab ("tuntap: accept an array of XDP buffs through sendmsg()") > Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> Thanks > --- > drivers/net/tun.c | 95 +++++++++++++++++++++++++++++++---------------- > 1 file changed, 63 insertions(+), 32 deletions(-) > > diff --git a/drivers/net/tun.c b/drivers/net/tun.c > index 1b553f79adb0..9379fa86fae9 100644 > --- a/drivers/net/tun.c > +++ b/drivers/net/tun.c > @@ -1641,6 +1641,47 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, > return NULL; > } > > +static int tun_skb_set_protocol(struct tun_struct *tun, struct sk_buff *skb, > + __be16 pi_proto, bool no_pull_check) > +{ > + switch (tun->flags & TUN_TYPE_MASK) { > + case IFF_TUN: > + if (tun->flags & IFF_NO_PI) { > + u8 ip_version = skb->len ? (skb->data[0] >> 4) : 0; > + > + switch (ip_version) { > + case 4: > + pi_proto = htons(ETH_P_IP); > + break; > + case 6: > + pi_proto = htons(ETH_P_IPV6); > + break; > + default: > + return -EINVAL; > + } > + } > + > + skb_reset_mac_header(skb); > + skb->protocol = pi_proto; > + skb->dev = tun->dev; > + break; > + case IFF_TAP: > + /* As an optimisation, no_pull_check can be set in the cases > + * where the caller *knows* that eth_type_trans() will be OK > + * because the Ethernet header is linearised at skb->data. > + * > + * XX: Or so I was reliably assured when I moved this code > + * and didn't make it unconditional. dwmw2. > + */ > + if (!no_pull_check && !pskb_may_pull(skb, ETH_HLEN)) > + return -ENOMEM; > + > + skb->protocol = eth_type_trans(skb, tun->dev); > + break; > + } > + return 0; > +} > + > /* Get packet from user space buffer */ > static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, > void *msg_control, struct iov_iter *from, > @@ -1784,37 +1825,9 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, > return -EINVAL; > } > > - switch (tun->flags & TUN_TYPE_MASK) { > - case IFF_TUN: > - if (tun->flags & IFF_NO_PI) { > - u8 ip_version = skb->len ? (skb->data[0] >> 4) : 0; > - > - switch (ip_version) { > - case 4: > - pi.proto = htons(ETH_P_IP); > - break; > - case 6: > - pi.proto = htons(ETH_P_IPV6); > - break; > - default: > - atomic_long_inc(&tun->dev->rx_dropped); > - kfree_skb(skb); > - return -EINVAL; > - } > - } > - > - skb_reset_mac_header(skb); > - skb->protocol = pi.proto; > - skb->dev = tun->dev; > - break; > - case IFF_TAP: > - if (frags && !pskb_may_pull(skb, ETH_HLEN)) { > - err = -ENOMEM; > - goto drop; > - } > - skb->protocol = eth_type_trans(skb, tun->dev); > - break; > - } > + err = tun_skb_set_protocol(tun, skb, pi.proto, !frags); > + if (err) > + goto drop; > > /* copy skb_ubuf_info for callback when skb has no error */ > if (zerocopy) { > @@ -2335,12 +2348,24 @@ static int tun_xdp_one(struct tun_struct *tun, > struct virtio_net_hdr *gso = NULL; > struct bpf_prog *xdp_prog; > struct sk_buff *skb = NULL; > + __be16 proto = 0; > u32 rxhash = 0, act; > int buflen = hdr->buflen; > int err = 0; > bool skb_xdp = false; > struct page *page; > > + if (!(tun->flags & IFF_NO_PI)) { > + struct tun_pi *pi = tun_hdr; > + tun_hdr += sizeof(*pi); > + > + if (tun_hdr > xdp->data) { > + atomic_long_inc(&tun->rx_frame_errors); > + return -EINVAL; > + } > + proto = pi->proto; > + } As replied in patch 2, I think it's better to make XDP path work only for TAP but not TUN. Then the series would be much simpler. Thanks > + > if (tun->flags & IFF_VNET_HDR) { > gso = tun_hdr; > tun_hdr += sizeof(*gso); > @@ -2413,7 +2438,13 @@ static int tun_xdp_one(struct tun_struct *tun, > goto out; > } > > - skb->protocol = eth_type_trans(skb, tun->dev); > + err = tun_skb_set_protocol(tun, skb, proto, true); > + if (err) { > + atomic_long_inc(&tun->dev->rx_dropped); > + kfree_skb(skb); > + goto out; > + } > + > skb_reset_network_header(skb); > skb_probe_transport_header(skb); > skb_record_rx_queue(skb, tfile->queue_index);
On Fri, 2021-06-25 at 15:41 +0800, Jason Wang wrote: > 在 2021/6/24 下午8:30, David Woodhouse 写道: > > From: David Woodhouse <dwmw@amazon.co.uk> > > > > In tun_get_user(), skb->protocol is either taken from the tun_pi header > > or inferred from the first byte of the packet in IFF_TUN mode, while > > eth_type_trans() is called only in the IFF_TAP mode where the payload > > is expected to be an Ethernet frame. > > > > The equivalent code path in tun_xdp_one() was unconditionally using > > eth_type_trans(), which is the wrong thing to do in IFF_TUN mode and > > corrupts packets. > > > > Pull the logic out to a separate tun_skb_set_protocol() function, and > > call it from both tun_get_user() and tun_xdp_one(). > > > > XX: It is not entirely clear to me why it's OK to call eth_type_trans() > > in some cases without first checking that enough of the Ethernet header > > is linearly present by calling pskb_may_pull(). > > > Looks like a bug. > > > > Such a check was never > > present in the tun_xdp_one() code path, and commit 96aa1b22bd6bb ("tun: > > correct header offsets in napi frags mode") deliberately added it *only* > > for the IFF_NAPI_FRAGS mode. > > > We had already checked this in tun_get_user() before: > > if ((tun->flags & TUN_TYPE_MASK) == IFF_TAP) { > align += NET_IP_ALIGN; > if (unlikely(len < ETH_HLEN || > (gso.hdr_len && tun16_to_cpu(tun, > gso.hdr_len) < ETH_HLEN))) > return -EINVAL; > } We'd checked skb->len, but that doesn't mean we had a full Ethernet header *linearly* at skb->data, does it? For the basic tun_get_user() case I suppose we copy_from_user() into a single linear skb anyway, even if userspace had fragment it and used writev(). So we *are* probably safe there? I'm sure we *can* contrive a proof that it's safe for that case, if we must. But I think we should *need* that proof, if we're going to bypass the check. And I wasn't comfortable touching that code without it. We should also have a fairly good reason... it isn't clear to me *why* we're bothering to avoid the check. Is it so slow, even in the case where there's nothing to be done? For a linear skb, the inline pskb_may_pull() is going to immediately return true because ETH_HLEN < skb_headlen(skb), isn't it? Why optimise *that* away? Willem, was there a reason you made that conditional in the first place? If we're going to continue to *not* check on the XDP path, we similarly need a proof that it can't be fragmented. And also a reason to bother with the "optimisation", of course.
On Thu, Jun 24, 2021 at 8:30 AM David Woodhouse <dwmw2@infradead.org> wrote: > > From: David Woodhouse <dwmw@amazon.co.uk> > > In tun_get_user(), skb->protocol is either taken from the tun_pi header > or inferred from the first byte of the packet in IFF_TUN mode, while > eth_type_trans() is called only in the IFF_TAP mode where the payload > is expected to be an Ethernet frame. > > The equivalent code path in tun_xdp_one() was unconditionally using > eth_type_trans(), which is the wrong thing to do in IFF_TUN mode and > corrupts packets. > > Pull the logic out to a separate tun_skb_set_protocol() function, and > call it from both tun_get_user() and tun_xdp_one(). I think this should be two patches. The support for parsing pi is an independent fix. > XX: It is not entirely clear to me why it's OK to call eth_type_trans() > in some cases without first checking that enough of the Ethernet header > is linearly present by calling pskb_may_pull(). Such a check was never > present in the tun_xdp_one() code path, and commit 96aa1b22bd6bb ("tun: > correct header offsets in napi frags mode") deliberately added it *only* > for the IFF_NAPI_FRAGS mode. IFF_NAPI_FRAGS exercises napi_gro_frags, which uses the frag0 optimization where all data is in frags. The other receive paths do not. For the other cases, linear is guaranteed to include the link layer header. __tun_build_skb, for instance, just allocates one big skb->data. It is admittedly not trivial to prove this point exhaustively for all paths. commit 96aa1b22bd6bb restricted the new test to the frags case, to limit the potential blast radius of a bug fix to only the code path affected by the bug. > I would like to see specific explanations of *why* it's ever valid and > necessary (is it so much faster?) to skip the pskb_may_pull() It was just not needed and that did not complicate anything until this patch. It's fine to unconditionally check if that simplifies this change.
On Fri, 2021-06-25 at 14:43 -0400, Willem de Bruijn wrote: > On Thu, Jun 24, 2021 at 8:30 AM David Woodhouse <dwmw2@infradead.org> wrote: > > > > From: David Woodhouse <dwmw@amazon.co.uk> > > > > In tun_get_user(), skb->protocol is either taken from the tun_pi header > > or inferred from the first byte of the packet in IFF_TUN mode, while > > eth_type_trans() is called only in the IFF_TAP mode where the payload > > is expected to be an Ethernet frame. > > > > The equivalent code path in tun_xdp_one() was unconditionally using > > eth_type_trans(), which is the wrong thing to do in IFF_TUN mode and > > corrupts packets. > > > > Pull the logic out to a separate tun_skb_set_protocol() function, and > > call it from both tun_get_user() and tun_xdp_one(). > > I think this should be two patches. The support for parsing pi is an > independent fix. As things stand, this patch *doesn't* change the pskb_may_pull() behaviour. I think we should make that unconditional, and can indeed do so in a separate subsequent patch.
在 2021/6/25 下午4:51, David Woodhouse 写道: > On Fri, 2021-06-25 at 15:41 +0800, Jason Wang wrote: >> 在 2021/6/24 下午8:30, David Woodhouse 写道: >>> From: David Woodhouse <dwmw@amazon.co.uk> >>> >>> In tun_get_user(), skb->protocol is either taken from the tun_pi header >>> or inferred from the first byte of the packet in IFF_TUN mode, while >>> eth_type_trans() is called only in the IFF_TAP mode where the payload >>> is expected to be an Ethernet frame. >>> >>> The equivalent code path in tun_xdp_one() was unconditionally using >>> eth_type_trans(), which is the wrong thing to do in IFF_TUN mode and >>> corrupts packets. >>> >>> Pull the logic out to a separate tun_skb_set_protocol() function, and >>> call it from both tun_get_user() and tun_xdp_one(). >>> >>> XX: It is not entirely clear to me why it's OK to call eth_type_trans() >>> in some cases without first checking that enough of the Ethernet header >>> is linearly present by calling pskb_may_pull(). >> >> Looks like a bug. >> >> >>> Such a check was never >>> present in the tun_xdp_one() code path, and commit 96aa1b22bd6bb ("tun: >>> correct header offsets in napi frags mode") deliberately added it *only* >>> for the IFF_NAPI_FRAGS mode. >> >> We had already checked this in tun_get_user() before: >> >> if ((tun->flags & TUN_TYPE_MASK) == IFF_TAP) { >> align += NET_IP_ALIGN; >> if (unlikely(len < ETH_HLEN || >> (gso.hdr_len && tun16_to_cpu(tun, >> gso.hdr_len) < ETH_HLEN))) >> return -EINVAL; >> } > We'd checked skb->len, but that doesn't mean we had a full Ethernet > header *linearly* at skb->data, does it? The linear room is guaranteed through either: 1) tun_build_skb() or 2) tun_alloc_skb() > > For the basic tun_get_user() case I suppose we copy_from_user() into a > single linear skb anyway, even if userspace had fragment it and used > writev(). So we *are* probably safe there? > > I'm sure we *can* contrive a proof that it's safe for that case, if we > must. But I think we should *need* that proof, if we're going to bypass > the check. And I wasn't comfortable touching that code without it. > > We should also have a fairly good reason... it isn't clear to me *why* > we're bothering to avoid the check. Is it so slow, even in the case > where there's nothing to be done? > > For a linear skb, the inline pskb_may_pull() is going to immediately > return true because ETH_HLEN < skb_headlen(skb), isn't it? Why optimise > *that* away? > > Willem, was there a reason you made that conditional in the first > place? > > If we're going to continue to *not* check on the XDP path, we similarly > need a proof that it can't be fragmented. And also a reason to bother > with the "optimisation", of course. For XDP path, we simply need to add a length check since the packet is always a linear memory. Thanks > >
On Mon, 2021-06-28 at 12:27 +0800, Jason Wang wrote: > > If we're going to continue to *not* check on the XDP path, we similarly > > need a proof that it can't be fragmented. And also a reason to bother > > with the "optimisation", of course. > > > For XDP path, we simply need to add a length check since the packet is > always a linear memory. Sure, but in that case skb_headlen is going to be enough to cover ETH_HLEN, and that's the very first thing that the standard inline version of pskb_may_pull() checks, without ever even having to make an out-of-line call. So there's just no reason ever for us *not* to keep it really simple, and use pskb_may_pull() in all cases.
diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 1b553f79adb0..9379fa86fae9 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1641,6 +1641,47 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, return NULL; } +static int tun_skb_set_protocol(struct tun_struct *tun, struct sk_buff *skb, + __be16 pi_proto, bool no_pull_check) +{ + switch (tun->flags & TUN_TYPE_MASK) { + case IFF_TUN: + if (tun->flags & IFF_NO_PI) { + u8 ip_version = skb->len ? (skb->data[0] >> 4) : 0; + + switch (ip_version) { + case 4: + pi_proto = htons(ETH_P_IP); + break; + case 6: + pi_proto = htons(ETH_P_IPV6); + break; + default: + return -EINVAL; + } + } + + skb_reset_mac_header(skb); + skb->protocol = pi_proto; + skb->dev = tun->dev; + break; + case IFF_TAP: + /* As an optimisation, no_pull_check can be set in the cases + * where the caller *knows* that eth_type_trans() will be OK + * because the Ethernet header is linearised at skb->data. + * + * XX: Or so I was reliably assured when I moved this code + * and didn't make it unconditional. dwmw2. + */ + if (!no_pull_check && !pskb_may_pull(skb, ETH_HLEN)) + return -ENOMEM; + + skb->protocol = eth_type_trans(skb, tun->dev); + break; + } + return 0; +} + /* Get packet from user space buffer */ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, void *msg_control, struct iov_iter *from, @@ -1784,37 +1825,9 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, return -EINVAL; } - switch (tun->flags & TUN_TYPE_MASK) { - case IFF_TUN: - if (tun->flags & IFF_NO_PI) { - u8 ip_version = skb->len ? (skb->data[0] >> 4) : 0; - - switch (ip_version) { - case 4: - pi.proto = htons(ETH_P_IP); - break; - case 6: - pi.proto = htons(ETH_P_IPV6); - break; - default: - atomic_long_inc(&tun->dev->rx_dropped); - kfree_skb(skb); - return -EINVAL; - } - } - - skb_reset_mac_header(skb); - skb->protocol = pi.proto; - skb->dev = tun->dev; - break; - case IFF_TAP: - if (frags && !pskb_may_pull(skb, ETH_HLEN)) { - err = -ENOMEM; - goto drop; - } - skb->protocol = eth_type_trans(skb, tun->dev); - break; - } + err = tun_skb_set_protocol(tun, skb, pi.proto, !frags); + if (err) + goto drop; /* copy skb_ubuf_info for callback when skb has no error */ if (zerocopy) { @@ -2335,12 +2348,24 @@ static int tun_xdp_one(struct tun_struct *tun, struct virtio_net_hdr *gso = NULL; struct bpf_prog *xdp_prog; struct sk_buff *skb = NULL; + __be16 proto = 0; u32 rxhash = 0, act; int buflen = hdr->buflen; int err = 0; bool skb_xdp = false; struct page *page; + if (!(tun->flags & IFF_NO_PI)) { + struct tun_pi *pi = tun_hdr; + tun_hdr += sizeof(*pi); + + if (tun_hdr > xdp->data) { + atomic_long_inc(&tun->rx_frame_errors); + return -EINVAL; + } + proto = pi->proto; + } + if (tun->flags & IFF_VNET_HDR) { gso = tun_hdr; tun_hdr += sizeof(*gso); @@ -2413,7 +2438,13 @@ static int tun_xdp_one(struct tun_struct *tun, goto out; } - skb->protocol = eth_type_trans(skb, tun->dev); + err = tun_skb_set_protocol(tun, skb, proto, true); + if (err) { + atomic_long_inc(&tun->dev->rx_dropped); + kfree_skb(skb); + goto out; + } + skb_reset_network_header(skb); skb_probe_transport_header(skb); skb_record_rx_queue(skb, tfile->queue_index);