Message ID | 20240915-rss-v3-6-c630015db082@daynix.com |
---|---|
State | New |
Headers | show |
Series | tun: Introduce virtio-net hashing feature | expand |
Akihiko Odaki wrote: > Allow the guest to reuse the hash value to make receive steering > consistent between the host and guest, and to save hash computation. > > Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com> > --- > Documentation/networking/tuntap.rst | 7 ++ > drivers/net/Kconfig | 1 + > drivers/net/tun.c | 146 +++++++++++++++++++++++++++++++----- > include/uapi/linux/if_tun.h | 44 +++++++++++ > 4 files changed, 180 insertions(+), 18 deletions(-) > > diff --git a/Documentation/networking/tuntap.rst b/Documentation/networking/tuntap.rst > index 4d7087f727be..86b4ae8caa8a 100644 > --- a/Documentation/networking/tuntap.rst > +++ b/Documentation/networking/tuntap.rst > @@ -206,6 +206,13 @@ enable is true we enable it, otherwise we disable it:: > return ioctl(fd, TUNSETQUEUE, (void *)&ifr); > } > > +3.4 Reference > +------------- > + > +``linux/if_tun.h`` defines the interface described below: > + > +.. kernel-doc:: include/uapi/linux/if_tun.h > + > Universal TUN/TAP device driver Frequently Asked Question > ========================================================= > > diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig > index 9920b3a68ed1..e2a7bd703550 100644 > --- a/drivers/net/Kconfig > +++ b/drivers/net/Kconfig > @@ -395,6 +395,7 @@ config TUN > tristate "Universal TUN/TAP device driver support" > depends on INET > select CRC32 > + select SKB_EXTENSIONS > help > TUN/TAP provides packet reception and transmission for user space > programs. It can be viewed as a simple Point-to-Point or Ethernet > diff --git a/drivers/net/tun.c b/drivers/net/tun.c > index 9d93ab9ee58f..b8fcd71becac 100644 > --- a/drivers/net/tun.c > +++ b/drivers/net/tun.c > @@ -173,6 +173,10 @@ struct tun_prog { > struct bpf_prog *prog; > }; > > +struct tun_vnet_hash_container { > + struct tun_vnet_hash common; > +}; > + > /* Since the socket were moved to tun_file, to preserve the behavior of persist > * device, socket filter, sndbuf and vnet header size were restore when the > * file were attached to a persist device. > @@ -210,6 +214,7 @@ struct tun_struct { > struct bpf_prog __rcu *xdp_prog; > struct tun_prog __rcu *steering_prog; > struct tun_prog __rcu *filter_prog; > + struct tun_vnet_hash_container __rcu *vnet_hash; This is just +struct tun_vnet_hash { + u32 value; + u16 report; +}; Can just be fields in the struct directly. Also, only one bit really used for report, so probably can be condensed further. > struct ethtool_link_ksettings link_ksettings; > /* init args */ > struct file *file; > @@ -221,6 +226,11 @@ struct veth { > __be16 h_vlan_TCI; > }; > > +static const struct tun_vnet_hash tun_vnet_hash_cap = { > + .flags = TUN_VNET_HASH_REPORT, > + .types = VIRTIO_NET_SUPPORTED_HASH_TYPES > +}; > + > static void tun_flow_init(struct tun_struct *tun); > static void tun_flow_uninit(struct tun_struct *tun); > > @@ -322,10 +332,17 @@ static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp) > if (get_user(be, argp)) > return -EFAULT; > > - if (be) > + if (be) { > + struct tun_vnet_hash_container *vnet_hash = rtnl_dereference(tun->vnet_hash); > + > + if (!(tun->flags & TUN_VNET_LE) && > + vnet_hash && (vnet_hash->flags & TUN_VNET_HASH_REPORT)) > + return -EBUSY; > + Doesn't be here imply !tun->flags & TUN_VNET_LE? Same again below. > tun->flags |= TUN_VNET_BE; > - else > + } else { > tun->flags &= ~TUN_VNET_BE; > + } > > return 0; > } > @@ -522,14 +539,20 @@ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash) > * the userspace application move between processors, we may get a > * different rxq no. here. > */ > -static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) > +static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb, > + const struct tun_vnet_hash_container *vnet_hash) > { > + struct tun_vnet_hash_ext *ext; > + struct flow_keys keys; > struct tun_flow_entry *e; > u32 txq, numqueues; > > numqueues = READ_ONCE(tun->numqueues); > > - txq = __skb_get_hash_symmetric(skb); > + memset(&keys, 0, sizeof(keys)); > + skb_flow_dissect(skb, &flow_keys_dissector_symmetric, &keys, 0); > + > + txq = flow_hash_from_keys(&keys); > e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq); > if (e) { > tun_flow_save_rps_rxhash(e, txq); > @@ -538,6 +561,16 @@ static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) > txq = reciprocal_scale(txq, numqueues); > } > > + if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_REPORT)) { > + ext = skb_ext_add(skb, SKB_EXT_TUN_VNET_HASH); > + if (ext) { > + u32 types = vnet_hash->common.types; > + > + ext->report = virtio_net_hash_report(types, keys.basic); > + ext->value = skb->l4_hash ? skb->hash : txq; > + } > + } > + > return txq; > } > > @@ -565,10 +598,13 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, > u16 ret; > > rcu_read_lock(); > - if (rcu_dereference(tun->steering_prog)) > + if (rcu_dereference(tun->steering_prog)) { > ret = tun_ebpf_select_queue(tun, skb); > - else > - ret = tun_automq_select_queue(tun, skb); > + } else { > + struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tun->vnet_hash); > + > + ret = tun_automq_select_queue(tun, skb, vnet_hash); Already passing tun, no need to pass tun->vnet_hash separately. > + } > rcu_read_unlock(); > > return ret; > @@ -2120,33 +2156,63 @@ static ssize_t tun_put_user(struct tun_struct *tun, > } > > if (vnet_hdr_sz) { > - struct virtio_net_hdr gso; > + struct tun_vnet_hash_ext *ext; > + size_t vnet_hdr_content_sz = sizeof(struct virtio_net_hdr); > + union { > + struct virtio_net_hdr hdr; > + struct virtio_net_hdr_v1_hash hdr_v1_hash; > + } vnet_hdr; > + int ret; > > if (iov_iter_count(iter) < vnet_hdr_sz) > return -EINVAL; > > - if (virtio_net_hdr_from_skb(skb, &gso, > - tun_is_little_endian(tun), true, > - vlan_hlen)) { > + ext = vnet_hdr_sz < sizeof(vnet_hdr.hdr_v1_hash) ? > + NULL : skb_ext_find(skb, SKB_EXT_TUN_VNET_HASH); > + > + if (ext) { > + struct virtio_net_hash hash = { > + .value = ext->value, > + .report = ext->report, > + }; > + > + vnet_hdr_content_sz = sizeof(vnet_hdr.hdr_v1_hash); > + ret = virtio_net_hdr_v1_hash_from_skb(skb, > + &vnet_hdr.hdr_v1_hash, > + true, > + vlan_hlen, > + &hash); > + } else { > + vnet_hdr_content_sz = sizeof(struct virtio_net_hdr); > + ret = virtio_net_hdr_from_skb(skb, > + &vnet_hdr.hdr, > + tun_is_little_endian(tun), > + true, > + vlan_hlen); > + } > + This is why just setting the fields directly rather than adding virtio_net_hdr_v1_hash_from_skb is actually simpler. > + if (ret) { > struct skb_shared_info *sinfo = skb_shinfo(skb); > > if (net_ratelimit()) { > netdev_err(tun->dev, "unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n", > - sinfo->gso_type, tun16_to_cpu(tun, gso.gso_size), > - tun16_to_cpu(tun, gso.hdr_len)); > + sinfo->gso_type, > + tun16_to_cpu(tun, vnet_hdr.hdr.gso_size), > + tun16_to_cpu(tun, vnet_hdr.hdr.hdr_len)); > print_hex_dump(KERN_ERR, "tun: ", > DUMP_PREFIX_NONE, > 16, 1, skb->head, > - min((int)tun16_to_cpu(tun, gso.hdr_len), 64), true); > + min(tun16_to_cpu(tun, vnet_hdr.hdr.hdr_len), 64), > + true); > } > WARN_ON_ONCE(1); > return -EINVAL; > } > > - if (copy_to_iter(&gso, sizeof(gso), iter) != sizeof(gso)) > + if (copy_to_iter(&vnet_hdr, vnet_hdr_content_sz, iter) != vnet_hdr_content_sz) > return -EFAULT; > > - iov_iter_zero(vnet_hdr_sz - sizeof(gso), iter); > + iov_iter_zero(vnet_hdr_sz - vnet_hdr_content_sz, iter); > } > > if (vlan_hlen) { > @@ -3094,6 +3160,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > int le; > int ret; > bool do_notify = false; > + struct tun_vnet_hash vnet_hash_common; > + struct tun_vnet_hash_container *vnet_hash; > > if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || > (_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) { > @@ -3115,6 +3183,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) > return -EPERM; > return open_related_ns(&net->ns, get_net_ns); > + } else if (cmd == TUNGETVNETHASHCAP) { > + return copy_to_user(argp, &tun_vnet_hash_cap, sizeof(tun_vnet_hash_cap)) ? > + -EFAULT : 0; > } > > rtnl_lock(); > @@ -3314,6 +3385,13 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > break; > } > > + vnet_hash = rtnl_dereference(tun->vnet_hash); > + if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_REPORT) && > + vnet_hdr_sz < (int)sizeof(struct virtio_net_hdr_v1_hash)) { > + ret = -EBUSY; > + break; > + } > + > tun->vnet_hdr_sz = vnet_hdr_sz; > break; > > @@ -3328,10 +3406,18 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > ret = -EFAULT; > break; > } > - if (le) > + if (le) { > tun->flags |= TUN_VNET_LE; > - else > + } else { > + vnet_hash = rtnl_dereference(tun->vnet_hash); > + if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_REPORT) && > + !tun_legacy_is_little_endian(tun)) { > + ret = -EBUSY; > + break; > + } > + > tun->flags &= ~TUN_VNET_LE; > + } > break; > > case TUNGETVNETBE: > @@ -3396,6 +3482,30 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > ret = open_related_ns(&net->ns, get_net_ns); > break; > > + case TUNSETVNETHASH: > + if (copy_from_user(&vnet_hash_common, argp, sizeof(vnet_hash_common))) { > + ret = -EFAULT; > + break; > + } > + argp = (struct tun_vnet_hash __user *)argp + 1; > + > + if ((vnet_hash_common.flags & TUN_VNET_HASH_REPORT) && > + (tun->vnet_hdr_sz < sizeof(struct virtio_net_hdr_v1_hash) || > + !tun_is_little_endian(tun))) { > + ret = -EBUSY; > + break; > + } > + > + vnet_hash = kmalloc(sizeof(vnet_hash->common), GFP_KERNEL); > + if (!vnet_hash) { > + ret = -ENOMEM; > + break; > + } > + > + vnet_hash->common = vnet_hash_common; > + kfree_rcu_mightsleep(rcu_replace_pointer_rtnl(tun->vnet_hash, vnet_hash)); > + break; > + > default: > ret = -EINVAL; > break; > diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h > index 287cdc81c939..1561e8ce0a0a 100644 > --- a/include/uapi/linux/if_tun.h > +++ b/include/uapi/linux/if_tun.h > @@ -62,6 +62,30 @@ > #define TUNSETCARRIER _IOW('T', 226, int) > #define TUNGETDEVNETNS _IO('T', 227) > > +/** > + * define TUNGETVNETHASHCAP - ioctl to get virtio_net hashing capability. > + * > + * The argument is a pointer to &struct tun_vnet_hash which will store the > + * maximal virtio_net hashing configuration. > + */ > +#define TUNGETVNETHASHCAP _IOR('T', 228, struct tun_vnet_hash) > + > +/** > + * define TUNSETVNETHASH - ioctl to configure virtio_net hashing > + * > + * The argument is a pointer to &struct tun_vnet_hash. > + * > + * %TUNSETVNETHDRSZ ioctl must be called with a number greater than or equal to > + * the size of &struct virtio_net_hdr_v1_hash before calling this ioctl with > + * %TUN_VNET_HASH_REPORT. > + * > + * The virtio_net header must be configured as little-endian before calling this > + * ioctl with %TUN_VNET_HASH_REPORT. > + * > + * This ioctl currently has no effect on XDP packets. > + */ > +#define TUNSETVNETHASH _IOW('T', 229, struct tun_vnet_hash) > + > /* TUNSETIFF ifr flags */ > #define IFF_TUN 0x0001 > #define IFF_TAP 0x0002 > @@ -115,4 +139,24 @@ struct tun_filter { > __u8 addr[][ETH_ALEN]; > }; > > +/** > + * define TUN_VNET_HASH_REPORT - Request virtio_net hash reporting for vhost > + */ > +#define TUN_VNET_HASH_REPORT 0x0001 > + > +/** > + * struct tun_vnet_hash - virtio_net hashing configuration > + * @flags: > + * Bitmask consists of %TUN_VNET_HASH_REPORT and %TUN_VNET_HASH_RSS > + * @pad: > + * Should be filled with zero before passing to %TUNSETVNETHASH > + * @types: > + * Bitmask of allowed hash types > + */ > +struct tun_vnet_hash { > + __u16 flags; > + __u8 pad[2]; > + __u32 types; > +}; > + The values for flags and types should probably be defined here. > #endif /* _UAPI__IF_TUN_H */ > > -- > 2.46.0 >
On 2024/09/18 15:17, Willem de Bruijn wrote: > Akihiko Odaki wrote: >> Allow the guest to reuse the hash value to make receive steering >> consistent between the host and guest, and to save hash computation. >> >> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com> >> --- >> Documentation/networking/tuntap.rst | 7 ++ >> drivers/net/Kconfig | 1 + >> drivers/net/tun.c | 146 +++++++++++++++++++++++++++++++----- >> include/uapi/linux/if_tun.h | 44 +++++++++++ >> 4 files changed, 180 insertions(+), 18 deletions(-) >> >> diff --git a/Documentation/networking/tuntap.rst b/Documentation/networking/tuntap.rst >> index 4d7087f727be..86b4ae8caa8a 100644 >> --- a/Documentation/networking/tuntap.rst >> +++ b/Documentation/networking/tuntap.rst >> @@ -206,6 +206,13 @@ enable is true we enable it, otherwise we disable it:: >> return ioctl(fd, TUNSETQUEUE, (void *)&ifr); >> } >> >> +3.4 Reference >> +------------- >> + >> +``linux/if_tun.h`` defines the interface described below: >> + >> +.. kernel-doc:: include/uapi/linux/if_tun.h >> + >> Universal TUN/TAP device driver Frequently Asked Question >> ========================================================= >> >> diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig >> index 9920b3a68ed1..e2a7bd703550 100644 >> --- a/drivers/net/Kconfig >> +++ b/drivers/net/Kconfig >> @@ -395,6 +395,7 @@ config TUN >> tristate "Universal TUN/TAP device driver support" >> depends on INET >> select CRC32 >> + select SKB_EXTENSIONS >> help >> TUN/TAP provides packet reception and transmission for user space >> programs. It can be viewed as a simple Point-to-Point or Ethernet >> diff --git a/drivers/net/tun.c b/drivers/net/tun.c >> index 9d93ab9ee58f..b8fcd71becac 100644 >> --- a/drivers/net/tun.c >> +++ b/drivers/net/tun.c >> @@ -173,6 +173,10 @@ struct tun_prog { >> struct bpf_prog *prog; >> }; >> >> +struct tun_vnet_hash_container { >> + struct tun_vnet_hash common; >> +}; >> + >> /* Since the socket were moved to tun_file, to preserve the behavior of persist >> * device, socket filter, sndbuf and vnet header size were restore when the >> * file were attached to a persist device. >> @@ -210,6 +214,7 @@ struct tun_struct { >> struct bpf_prog __rcu *xdp_prog; >> struct tun_prog __rcu *steering_prog; >> struct tun_prog __rcu *filter_prog; >> + struct tun_vnet_hash_container __rcu *vnet_hash; > > This is just > > +struct tun_vnet_hash { > + u32 value; > + u16 report; > +}; > > Can just be fields in the struct directly. I will change to store struct tun_vnet_hash directly. > > Also, only one bit really used for report, so probably can be > condensed further. It is more than one bit; the report types are defined as follows: #define VIRTIO_NET_HASH_REPORT_NONE 0 #define VIRTIO_NET_HASH_REPORT_IPv4 1 #define VIRTIO_NET_HASH_REPORT_TCPv4 2 #define VIRTIO_NET_HASH_REPORT_UDPv4 3 #define VIRTIO_NET_HASH_REPORT_IPv6 4 #define VIRTIO_NET_HASH_REPORT_TCPv6 5 #define VIRTIO_NET_HASH_REPORT_UDPv6 6 #define VIRTIO_NET_HASH_REPORT_IPv6_EX 7 #define VIRTIO_NET_HASH_REPORT_TCPv6_EX 8 #define VIRTIO_NET_HASH_REPORT_UDPv6_EX 9 > >> struct ethtool_link_ksettings link_ksettings; >> /* init args */ >> struct file *file; >> @@ -221,6 +226,11 @@ struct veth { >> __be16 h_vlan_TCI; >> }; >> >> +static const struct tun_vnet_hash tun_vnet_hash_cap = { >> + .flags = TUN_VNET_HASH_REPORT, >> + .types = VIRTIO_NET_SUPPORTED_HASH_TYPES >> +}; >> + >> static void tun_flow_init(struct tun_struct *tun); >> static void tun_flow_uninit(struct tun_struct *tun); >> >> @@ -322,10 +332,17 @@ static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp) >> if (get_user(be, argp)) >> return -EFAULT; >> >> - if (be) >> + if (be) { >> + struct tun_vnet_hash_container *vnet_hash = rtnl_dereference(tun->vnet_hash); >> + >> + if (!(tun->flags & TUN_VNET_LE) && >> + vnet_hash && (vnet_hash->flags & TUN_VNET_HASH_REPORT)) >> + return -EBUSY; >> + > > Doesn't be here imply !tun->flags & TUN_VNET_LE? Same again below. Unfortunately no. TUN_VNET_LE and TUN_VNET_BE can be set at the same time, and TUN_VNET_LE is enforced in such a case. > >> tun->flags |= TUN_VNET_BE; >> - else >> + } else { >> tun->flags &= ~TUN_VNET_BE; >> + } >> >> return 0; >> } >> @@ -522,14 +539,20 @@ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash) >> * the userspace application move between processors, we may get a >> * different rxq no. here. >> */ >> -static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) >> +static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb, >> + const struct tun_vnet_hash_container *vnet_hash) >> { >> + struct tun_vnet_hash_ext *ext; >> + struct flow_keys keys; >> struct tun_flow_entry *e; >> u32 txq, numqueues; >> >> numqueues = READ_ONCE(tun->numqueues); >> >> - txq = __skb_get_hash_symmetric(skb); >> + memset(&keys, 0, sizeof(keys)); >> + skb_flow_dissect(skb, &flow_keys_dissector_symmetric, &keys, 0); >> + >> + txq = flow_hash_from_keys(&keys); >> e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq); >> if (e) { >> tun_flow_save_rps_rxhash(e, txq); >> @@ -538,6 +561,16 @@ static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) >> txq = reciprocal_scale(txq, numqueues); >> } >> >> + if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_REPORT)) { >> + ext = skb_ext_add(skb, SKB_EXT_TUN_VNET_HASH); >> + if (ext) { >> + u32 types = vnet_hash->common.types; >> + >> + ext->report = virtio_net_hash_report(types, keys.basic); >> + ext->value = skb->l4_hash ? skb->hash : txq; >> + } >> + } >> + >> return txq; >> } >> >> @@ -565,10 +598,13 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, >> u16 ret; >> >> rcu_read_lock(); >> - if (rcu_dereference(tun->steering_prog)) >> + if (rcu_dereference(tun->steering_prog)) { >> ret = tun_ebpf_select_queue(tun, skb); >> - else >> - ret = tun_automq_select_queue(tun, skb); >> + } else { >> + struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tun->vnet_hash); >> + >> + ret = tun_automq_select_queue(tun, skb, vnet_hash); > > Already passing tun, no need to pass tun->vnet_hash separately. I will remove the parameter with v4. >> + } >> rcu_read_unlock(); >> >> return ret; >> @@ -2120,33 +2156,63 @@ static ssize_t tun_put_user(struct tun_struct *tun, >> } >> >> if (vnet_hdr_sz) { >> - struct virtio_net_hdr gso; >> + struct tun_vnet_hash_ext *ext; >> + size_t vnet_hdr_content_sz = sizeof(struct virtio_net_hdr); >> + union { >> + struct virtio_net_hdr hdr; >> + struct virtio_net_hdr_v1_hash hdr_v1_hash; >> + } vnet_hdr; >> + int ret; >> >> if (iov_iter_count(iter) < vnet_hdr_sz) >> return -EINVAL; >> >> - if (virtio_net_hdr_from_skb(skb, &gso, >> - tun_is_little_endian(tun), true, >> - vlan_hlen)) { >> + ext = vnet_hdr_sz < sizeof(vnet_hdr.hdr_v1_hash) ? >> + NULL : skb_ext_find(skb, SKB_EXT_TUN_VNET_HASH); >> + >> + if (ext) { >> + struct virtio_net_hash hash = { >> + .value = ext->value, >> + .report = ext->report, >> + }; >> + >> + vnet_hdr_content_sz = sizeof(vnet_hdr.hdr_v1_hash); >> + ret = virtio_net_hdr_v1_hash_from_skb(skb, >> + &vnet_hdr.hdr_v1_hash, >> + true, >> + vlan_hlen, >> + &hash); >> + } else { >> + vnet_hdr_content_sz = sizeof(struct virtio_net_hdr); >> + ret = virtio_net_hdr_from_skb(skb, >> + &vnet_hdr.hdr, >> + tun_is_little_endian(tun), >> + true, >> + vlan_hlen); >> + } >> + > > This is why just setting the fields directly rather than adding > virtio_net_hdr_v1_hash_from_skb is actually simpler. I'll make a change accordingly in v4. > >> + if (ret) { >> struct skb_shared_info *sinfo = skb_shinfo(skb); >> >> if (net_ratelimit()) { >> netdev_err(tun->dev, "unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n", >> - sinfo->gso_type, tun16_to_cpu(tun, gso.gso_size), >> - tun16_to_cpu(tun, gso.hdr_len)); >> + sinfo->gso_type, >> + tun16_to_cpu(tun, vnet_hdr.hdr.gso_size), >> + tun16_to_cpu(tun, vnet_hdr.hdr.hdr_len)); >> print_hex_dump(KERN_ERR, "tun: ", >> DUMP_PREFIX_NONE, >> 16, 1, skb->head, >> - min((int)tun16_to_cpu(tun, gso.hdr_len), 64), true); >> + min(tun16_to_cpu(tun, vnet_hdr.hdr.hdr_len), 64), >> + true); >> } >> WARN_ON_ONCE(1); >> return -EINVAL; >> } >> >> - if (copy_to_iter(&gso, sizeof(gso), iter) != sizeof(gso)) >> + if (copy_to_iter(&vnet_hdr, vnet_hdr_content_sz, iter) != vnet_hdr_content_sz) >> return -EFAULT; >> >> - iov_iter_zero(vnet_hdr_sz - sizeof(gso), iter); >> + iov_iter_zero(vnet_hdr_sz - vnet_hdr_content_sz, iter); >> } >> >> if (vlan_hlen) { >> @@ -3094,6 +3160,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >> int le; >> int ret; >> bool do_notify = false; >> + struct tun_vnet_hash vnet_hash_common; >> + struct tun_vnet_hash_container *vnet_hash; >> >> if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || >> (_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) { >> @@ -3115,6 +3183,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >> if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) >> return -EPERM; >> return open_related_ns(&net->ns, get_net_ns); >> + } else if (cmd == TUNGETVNETHASHCAP) { >> + return copy_to_user(argp, &tun_vnet_hash_cap, sizeof(tun_vnet_hash_cap)) ? >> + -EFAULT : 0; >> } >> >> rtnl_lock(); >> @@ -3314,6 +3385,13 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >> break; >> } >> >> + vnet_hash = rtnl_dereference(tun->vnet_hash); >> + if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_REPORT) && >> + vnet_hdr_sz < (int)sizeof(struct virtio_net_hdr_v1_hash)) { >> + ret = -EBUSY; >> + break; >> + } >> + >> tun->vnet_hdr_sz = vnet_hdr_sz; >> break; >> >> @@ -3328,10 +3406,18 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >> ret = -EFAULT; >> break; >> } >> - if (le) >> + if (le) { >> tun->flags |= TUN_VNET_LE; >> - else >> + } else { >> + vnet_hash = rtnl_dereference(tun->vnet_hash); >> + if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_REPORT) && >> + !tun_legacy_is_little_endian(tun)) { >> + ret = -EBUSY; >> + break; >> + } >> + >> tun->flags &= ~TUN_VNET_LE; >> + } >> break; >> >> case TUNGETVNETBE: >> @@ -3396,6 +3482,30 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >> ret = open_related_ns(&net->ns, get_net_ns); >> break; >> >> + case TUNSETVNETHASH: >> + if (copy_from_user(&vnet_hash_common, argp, sizeof(vnet_hash_common))) { >> + ret = -EFAULT; >> + break; >> + } >> + argp = (struct tun_vnet_hash __user *)argp + 1; >> + >> + if ((vnet_hash_common.flags & TUN_VNET_HASH_REPORT) && >> + (tun->vnet_hdr_sz < sizeof(struct virtio_net_hdr_v1_hash) || >> + !tun_is_little_endian(tun))) { >> + ret = -EBUSY; >> + break; >> + } >> + >> + vnet_hash = kmalloc(sizeof(vnet_hash->common), GFP_KERNEL); >> + if (!vnet_hash) { >> + ret = -ENOMEM; >> + break; >> + } >> + >> + vnet_hash->common = vnet_hash_common; >> + kfree_rcu_mightsleep(rcu_replace_pointer_rtnl(tun->vnet_hash, vnet_hash)); >> + break; >> + >> default: >> ret = -EINVAL; >> break; >> diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h >> index 287cdc81c939..1561e8ce0a0a 100644 >> --- a/include/uapi/linux/if_tun.h >> +++ b/include/uapi/linux/if_tun.h >> @@ -62,6 +62,30 @@ >> #define TUNSETCARRIER _IOW('T', 226, int) >> #define TUNGETDEVNETNS _IO('T', 227) >> >> +/** >> + * define TUNGETVNETHASHCAP - ioctl to get virtio_net hashing capability. >> + * >> + * The argument is a pointer to &struct tun_vnet_hash which will store the >> + * maximal virtio_net hashing configuration. >> + */ >> +#define TUNGETVNETHASHCAP _IOR('T', 228, struct tun_vnet_hash) >> + >> +/** >> + * define TUNSETVNETHASH - ioctl to configure virtio_net hashing >> + * >> + * The argument is a pointer to &struct tun_vnet_hash. >> + * >> + * %TUNSETVNETHDRSZ ioctl must be called with a number greater than or equal to >> + * the size of &struct virtio_net_hdr_v1_hash before calling this ioctl with >> + * %TUN_VNET_HASH_REPORT. >> + * >> + * The virtio_net header must be configured as little-endian before calling this >> + * ioctl with %TUN_VNET_HASH_REPORT. >> + * >> + * This ioctl currently has no effect on XDP packets. >> + */ >> +#define TUNSETVNETHASH _IOW('T', 229, struct tun_vnet_hash) >> + >> /* TUNSETIFF ifr flags */ >> #define IFF_TUN 0x0001 >> #define IFF_TAP 0x0002 >> @@ -115,4 +139,24 @@ struct tun_filter { >> __u8 addr[][ETH_ALEN]; >> }; >> >> +/** >> + * define TUN_VNET_HASH_REPORT - Request virtio_net hash reporting for vhost >> + */ >> +#define TUN_VNET_HASH_REPORT 0x0001 >> + >> +/** >> + * struct tun_vnet_hash - virtio_net hashing configuration >> + * @flags: >> + * Bitmask consists of %TUN_VNET_HASH_REPORT and %TUN_VNET_HASH_RSS >> + * @pad: >> + * Should be filled with zero before passing to %TUNSETVNETHASH >> + * @types: >> + * Bitmask of allowed hash types >> + */ >> +struct tun_vnet_hash { >> + __u16 flags; >> + __u8 pad[2]; >> + __u32 types; >> +}; >> + > > The values for flags and types should probably be defined here. I put TUN_VNET_HASH_REPORT before struct tun_vnet_hash following the examples of TUN_PKT_STRIP/struct tun_pi and TUN_FLT_ALLMULTI/struct tun_filter. The types are defined in: include/uapi/linux/virtio_net.h Regards, Akihiko Odaki > >> #endif /* _UAPI__IF_TUN_H */ >> >> -- >> 2.46.0 >> > >
diff --git a/Documentation/networking/tuntap.rst b/Documentation/networking/tuntap.rst index 4d7087f727be..86b4ae8caa8a 100644 --- a/Documentation/networking/tuntap.rst +++ b/Documentation/networking/tuntap.rst @@ -206,6 +206,13 @@ enable is true we enable it, otherwise we disable it:: return ioctl(fd, TUNSETQUEUE, (void *)&ifr); } +3.4 Reference +------------- + +``linux/if_tun.h`` defines the interface described below: + +.. kernel-doc:: include/uapi/linux/if_tun.h + Universal TUN/TAP device driver Frequently Asked Question ========================================================= diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 9920b3a68ed1..e2a7bd703550 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -395,6 +395,7 @@ config TUN tristate "Universal TUN/TAP device driver support" depends on INET select CRC32 + select SKB_EXTENSIONS help TUN/TAP provides packet reception and transmission for user space programs. It can be viewed as a simple Point-to-Point or Ethernet diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 9d93ab9ee58f..b8fcd71becac 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -173,6 +173,10 @@ struct tun_prog { struct bpf_prog *prog; }; +struct tun_vnet_hash_container { + struct tun_vnet_hash common; +}; + /* Since the socket were moved to tun_file, to preserve the behavior of persist * device, socket filter, sndbuf and vnet header size were restore when the * file were attached to a persist device. @@ -210,6 +214,7 @@ struct tun_struct { struct bpf_prog __rcu *xdp_prog; struct tun_prog __rcu *steering_prog; struct tun_prog __rcu *filter_prog; + struct tun_vnet_hash_container __rcu *vnet_hash; struct ethtool_link_ksettings link_ksettings; /* init args */ struct file *file; @@ -221,6 +226,11 @@ struct veth { __be16 h_vlan_TCI; }; +static const struct tun_vnet_hash tun_vnet_hash_cap = { + .flags = TUN_VNET_HASH_REPORT, + .types = VIRTIO_NET_SUPPORTED_HASH_TYPES +}; + static void tun_flow_init(struct tun_struct *tun); static void tun_flow_uninit(struct tun_struct *tun); @@ -322,10 +332,17 @@ static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp) if (get_user(be, argp)) return -EFAULT; - if (be) + if (be) { + struct tun_vnet_hash_container *vnet_hash = rtnl_dereference(tun->vnet_hash); + + if (!(tun->flags & TUN_VNET_LE) && + vnet_hash && (vnet_hash->flags & TUN_VNET_HASH_REPORT)) + return -EBUSY; + tun->flags |= TUN_VNET_BE; - else + } else { tun->flags &= ~TUN_VNET_BE; + } return 0; } @@ -522,14 +539,20 @@ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash) * the userspace application move between processors, we may get a * different rxq no. here. */ -static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) +static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb, + const struct tun_vnet_hash_container *vnet_hash) { + struct tun_vnet_hash_ext *ext; + struct flow_keys keys; struct tun_flow_entry *e; u32 txq, numqueues; numqueues = READ_ONCE(tun->numqueues); - txq = __skb_get_hash_symmetric(skb); + memset(&keys, 0, sizeof(keys)); + skb_flow_dissect(skb, &flow_keys_dissector_symmetric, &keys, 0); + + txq = flow_hash_from_keys(&keys); e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq); if (e) { tun_flow_save_rps_rxhash(e, txq); @@ -538,6 +561,16 @@ static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) txq = reciprocal_scale(txq, numqueues); } + if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_REPORT)) { + ext = skb_ext_add(skb, SKB_EXT_TUN_VNET_HASH); + if (ext) { + u32 types = vnet_hash->common.types; + + ext->report = virtio_net_hash_report(types, keys.basic); + ext->value = skb->l4_hash ? skb->hash : txq; + } + } + return txq; } @@ -565,10 +598,13 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, u16 ret; rcu_read_lock(); - if (rcu_dereference(tun->steering_prog)) + if (rcu_dereference(tun->steering_prog)) { ret = tun_ebpf_select_queue(tun, skb); - else - ret = tun_automq_select_queue(tun, skb); + } else { + struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tun->vnet_hash); + + ret = tun_automq_select_queue(tun, skb, vnet_hash); + } rcu_read_unlock(); return ret; @@ -2120,33 +2156,63 @@ static ssize_t tun_put_user(struct tun_struct *tun, } if (vnet_hdr_sz) { - struct virtio_net_hdr gso; + struct tun_vnet_hash_ext *ext; + size_t vnet_hdr_content_sz = sizeof(struct virtio_net_hdr); + union { + struct virtio_net_hdr hdr; + struct virtio_net_hdr_v1_hash hdr_v1_hash; + } vnet_hdr; + int ret; if (iov_iter_count(iter) < vnet_hdr_sz) return -EINVAL; - if (virtio_net_hdr_from_skb(skb, &gso, - tun_is_little_endian(tun), true, - vlan_hlen)) { + ext = vnet_hdr_sz < sizeof(vnet_hdr.hdr_v1_hash) ? + NULL : skb_ext_find(skb, SKB_EXT_TUN_VNET_HASH); + + if (ext) { + struct virtio_net_hash hash = { + .value = ext->value, + .report = ext->report, + }; + + vnet_hdr_content_sz = sizeof(vnet_hdr.hdr_v1_hash); + ret = virtio_net_hdr_v1_hash_from_skb(skb, + &vnet_hdr.hdr_v1_hash, + true, + vlan_hlen, + &hash); + } else { + vnet_hdr_content_sz = sizeof(struct virtio_net_hdr); + ret = virtio_net_hdr_from_skb(skb, + &vnet_hdr.hdr, + tun_is_little_endian(tun), + true, + vlan_hlen); + } + + if (ret) { struct skb_shared_info *sinfo = skb_shinfo(skb); if (net_ratelimit()) { netdev_err(tun->dev, "unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n", - sinfo->gso_type, tun16_to_cpu(tun, gso.gso_size), - tun16_to_cpu(tun, gso.hdr_len)); + sinfo->gso_type, + tun16_to_cpu(tun, vnet_hdr.hdr.gso_size), + tun16_to_cpu(tun, vnet_hdr.hdr.hdr_len)); print_hex_dump(KERN_ERR, "tun: ", DUMP_PREFIX_NONE, 16, 1, skb->head, - min((int)tun16_to_cpu(tun, gso.hdr_len), 64), true); + min(tun16_to_cpu(tun, vnet_hdr.hdr.hdr_len), 64), + true); } WARN_ON_ONCE(1); return -EINVAL; } - if (copy_to_iter(&gso, sizeof(gso), iter) != sizeof(gso)) + if (copy_to_iter(&vnet_hdr, vnet_hdr_content_sz, iter) != vnet_hdr_content_sz) return -EFAULT; - iov_iter_zero(vnet_hdr_sz - sizeof(gso), iter); + iov_iter_zero(vnet_hdr_sz - vnet_hdr_content_sz, iter); } if (vlan_hlen) { @@ -3094,6 +3160,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, int le; int ret; bool do_notify = false; + struct tun_vnet_hash vnet_hash_common; + struct tun_vnet_hash_container *vnet_hash; if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || (_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) { @@ -3115,6 +3183,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; return open_related_ns(&net->ns, get_net_ns); + } else if (cmd == TUNGETVNETHASHCAP) { + return copy_to_user(argp, &tun_vnet_hash_cap, sizeof(tun_vnet_hash_cap)) ? + -EFAULT : 0; } rtnl_lock(); @@ -3314,6 +3385,13 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, break; } + vnet_hash = rtnl_dereference(tun->vnet_hash); + if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_REPORT) && + vnet_hdr_sz < (int)sizeof(struct virtio_net_hdr_v1_hash)) { + ret = -EBUSY; + break; + } + tun->vnet_hdr_sz = vnet_hdr_sz; break; @@ -3328,10 +3406,18 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, ret = -EFAULT; break; } - if (le) + if (le) { tun->flags |= TUN_VNET_LE; - else + } else { + vnet_hash = rtnl_dereference(tun->vnet_hash); + if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_REPORT) && + !tun_legacy_is_little_endian(tun)) { + ret = -EBUSY; + break; + } + tun->flags &= ~TUN_VNET_LE; + } break; case TUNGETVNETBE: @@ -3396,6 +3482,30 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, ret = open_related_ns(&net->ns, get_net_ns); break; + case TUNSETVNETHASH: + if (copy_from_user(&vnet_hash_common, argp, sizeof(vnet_hash_common))) { + ret = -EFAULT; + break; + } + argp = (struct tun_vnet_hash __user *)argp + 1; + + if ((vnet_hash_common.flags & TUN_VNET_HASH_REPORT) && + (tun->vnet_hdr_sz < sizeof(struct virtio_net_hdr_v1_hash) || + !tun_is_little_endian(tun))) { + ret = -EBUSY; + break; + } + + vnet_hash = kmalloc(sizeof(vnet_hash->common), GFP_KERNEL); + if (!vnet_hash) { + ret = -ENOMEM; + break; + } + + vnet_hash->common = vnet_hash_common; + kfree_rcu_mightsleep(rcu_replace_pointer_rtnl(tun->vnet_hash, vnet_hash)); + break; + default: ret = -EINVAL; break; diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index 287cdc81c939..1561e8ce0a0a 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -62,6 +62,30 @@ #define TUNSETCARRIER _IOW('T', 226, int) #define TUNGETDEVNETNS _IO('T', 227) +/** + * define TUNGETVNETHASHCAP - ioctl to get virtio_net hashing capability. + * + * The argument is a pointer to &struct tun_vnet_hash which will store the + * maximal virtio_net hashing configuration. + */ +#define TUNGETVNETHASHCAP _IOR('T', 228, struct tun_vnet_hash) + +/** + * define TUNSETVNETHASH - ioctl to configure virtio_net hashing + * + * The argument is a pointer to &struct tun_vnet_hash. + * + * %TUNSETVNETHDRSZ ioctl must be called with a number greater than or equal to + * the size of &struct virtio_net_hdr_v1_hash before calling this ioctl with + * %TUN_VNET_HASH_REPORT. + * + * The virtio_net header must be configured as little-endian before calling this + * ioctl with %TUN_VNET_HASH_REPORT. + * + * This ioctl currently has no effect on XDP packets. + */ +#define TUNSETVNETHASH _IOW('T', 229, struct tun_vnet_hash) + /* TUNSETIFF ifr flags */ #define IFF_TUN 0x0001 #define IFF_TAP 0x0002 @@ -115,4 +139,24 @@ struct tun_filter { __u8 addr[][ETH_ALEN]; }; +/** + * define TUN_VNET_HASH_REPORT - Request virtio_net hash reporting for vhost + */ +#define TUN_VNET_HASH_REPORT 0x0001 + +/** + * struct tun_vnet_hash - virtio_net hashing configuration + * @flags: + * Bitmask consists of %TUN_VNET_HASH_REPORT and %TUN_VNET_HASH_RSS + * @pad: + * Should be filled with zero before passing to %TUNSETVNETHASH + * @types: + * Bitmask of allowed hash types + */ +struct tun_vnet_hash { + __u16 flags; + __u8 pad[2]; + __u32 types; +}; + #endif /* _UAPI__IF_TUN_H */
Allow the guest to reuse the hash value to make receive steering consistent between the host and guest, and to save hash computation. Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com> --- Documentation/networking/tuntap.rst | 7 ++ drivers/net/Kconfig | 1 + drivers/net/tun.c | 146 +++++++++++++++++++++++++++++++----- include/uapi/linux/if_tun.h | 44 +++++++++++ 4 files changed, 180 insertions(+), 18 deletions(-)