On Fri, May 30, 2025 at 12:50 PM Akihiko Odaki <akihiko.odaki@xxxxxxxxxx> wrote: > > Add common code required for the features being added to TUN and TAP. > They will be enabled for each of them in following patches. > > Added Features > ============== > > Hash reporting > -------------- > > Allow the guest to reuse the hash value to make receive steering > consistent between the host and guest, and to save hash computation. > > Receive Side Scaling (RSS) > -------------------------- > > RSS is a receive steering algorithm that can be negotiated to use with > virtio_net. Conventionally the hash calculation was done by the VMM. > However, computing the hash after the queue was chosen defeats the > purpose of RSS. > > Another approach is to use eBPF steering program. This approach has > another downside: it cannot report the calculated hash due to the > restrictive nature of eBPF steering program. > > Introduce the code to perform RSS to the kernel in order to overcome > thse challenges. An alternative solution is to extend the eBPF steering > program so that it will be able to report to the userspace, but I didn't > opt for it because extending the current mechanism of eBPF steering > program as is because it relies on legacy context rewriting, and > introducing kfunc-based eBPF will result in non-UAPI dependency while > the other relevant virtualization APIs such as KVM and vhost_net are > UAPIs. > > Added ioctls > ============ > > They are designed to make extensibility and VM migration compatible. > This change only adds the implementation and does not expose them to > the userspace. > > TUNGETVNETHASHTYPES > ------------------- > > This ioctl tells supported hash types. It is useful to check if a VM can > be migrated to the current host. > > TUNSETVNETREPORTINGAUTOMQ, TUNSETVNETREPORTINGRSS, and TUNSETVNETRSS > -------------------------------------------------------------------- > > These ioctls configures a steering algorithm and, if needed, hash > reporting. > > Signed-off-by: Akihiko Odaki <akihiko.odaki@xxxxxxxxxx> > Tested-by: Lei Yang <leiyang@xxxxxxxxxx> > --- > drivers/net/tap.c | 10 ++- > drivers/net/tun.c | 12 +++- > drivers/net/tun_vnet.h | 165 +++++++++++++++++++++++++++++++++++++++++--- > include/uapi/linux/if_tun.h | 71 +++++++++++++++++++ > 4 files changed, 244 insertions(+), 14 deletions(-) > > diff --git a/drivers/net/tap.c b/drivers/net/tap.c > index d4ece538f1b2..25c60ff2d3f2 100644 > --- a/drivers/net/tap.c > +++ b/drivers/net/tap.c > @@ -179,6 +179,11 @@ static void tap_put_queue(struct tap_queue *q) > sock_put(&q->sk); > } > > +static const struct virtio_net_hash *tap_find_hash(const struct sk_buff *skb) > +{ > + return NULL; > +} > + > /* > * Select a queue based on the rxq of the device on which this packet > * arrived. If the incoming device is not mq, calculate a flow hash > @@ -711,11 +716,12 @@ static ssize_t tap_put_user(struct tap_queue *q, > int total; > > if (q->flags & IFF_VNET_HDR) { > - struct virtio_net_hdr vnet_hdr; > + struct virtio_net_hdr_v1_hash vnet_hdr; > > vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz); > > - ret = tun_vnet_hdr_from_skb(q->flags, NULL, skb, &vnet_hdr); > + ret = tun_vnet_hdr_from_skb(vnet_hdr_len, q->flags, NULL, skb, > + tap_find_hash, &vnet_hdr); > if (ret) > return ret; > > diff --git a/drivers/net/tun.c b/drivers/net/tun.c > index 9133ab9ed3f5..03d47799e9bd 100644 > --- a/drivers/net/tun.c > +++ b/drivers/net/tun.c > @@ -451,6 +451,11 @@ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash) > e->rps_rxhash = hash; > } > > +static const struct virtio_net_hash *tun_find_hash(const struct sk_buff *skb) > +{ > + return NULL; > +} > + > /* We try to identify a flow through its rxhash. The reason that > * we do not check rxq no. is because some cards(e.g 82599), chooses > * the rxq based on the txq where the last packet of the flow comes. As > @@ -1993,7 +1998,7 @@ static ssize_t tun_put_user_xdp(struct tun_struct *tun, > ssize_t ret; > > if (tun->flags & IFF_VNET_HDR) { > - struct virtio_net_hdr gso = { 0 }; > + struct virtio_net_hdr_v1_hash gso = { 0 }; > > vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); > ret = tun_vnet_hdr_put(vnet_hdr_sz, iter, &gso); > @@ -2046,9 +2051,10 @@ static ssize_t tun_put_user(struct tun_struct *tun, > } > > if (vnet_hdr_sz) { > - struct virtio_net_hdr gso; > + struct virtio_net_hdr_v1_hash gso; > > - ret = tun_vnet_hdr_from_skb(tun->flags, tun->dev, skb, &gso); > + ret = tun_vnet_hdr_from_skb(vnet_hdr_sz, tun->flags, tun->dev, > + skb, tun_find_hash, &gso); > if (ret) > return ret; > > diff --git a/drivers/net/tun_vnet.h b/drivers/net/tun_vnet.h > index 58b9ac7a5fc4..45d0533efc8d 100644 > --- a/drivers/net/tun_vnet.h > +++ b/drivers/net/tun_vnet.h > @@ -6,6 +6,17 @@ > #define TUN_VNET_LE 0x80000000 > #define TUN_VNET_BE 0x40000000 > > +typedef struct virtio_net_hash *(*tun_vnet_hash_add)(struct sk_buff *); > +typedef const struct virtio_net_hash *(*tun_vnet_hash_find)(const struct sk_buff *); > + > +struct tun_vnet_hash { > + bool report; > + bool rss; > + struct tun_vnet_rss common; > + u32 rss_key[VIRTIO_NET_RSS_MAX_KEY_SIZE]; > + u16 rss_indirection_table[]; > +}; > + > static inline bool tun_vnet_legacy_is_little_endian(unsigned int flags) > { > bool be = IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE) && > @@ -107,6 +118,128 @@ static inline long tun_vnet_ioctl(int *vnet_hdr_sz, unsigned int *flags, > } > } > > +static inline long tun_vnet_ioctl_gethashtypes(u32 __user *argp) > +{ > + return put_user(VIRTIO_NET_SUPPORTED_HASH_TYPES, argp) ? -EFAULT : 0; > +} > + > +static inline long tun_vnet_ioctl_sethash(struct tun_vnet_hash __rcu **hashp, > + unsigned int cmd, > + void __user *argp) > +{ > + struct tun_vnet_rss common; > + struct tun_vnet_hash *hash; > + size_t indirection_table_size; > + size_t key_size; > + size_t size; > + > + switch (cmd) { > + case TUNSETVNETREPORTINGAUTOMQ: > + if (get_user(common.hash_types, (u32 __user *)argp)) > + return -EFAULT; > + > + if (common.hash_types) { > + hash = kzalloc(sizeof(*hash), GFP_KERNEL); > + if (!hash) > + return -ENOMEM; > + > + hash->report = true; > + hash->common.hash_types = common.hash_types; > + } else { > + hash = NULL; > + } > + break; > + > + case TUNSETVNETREPORTINGRSS: > + case TUNSETVNETRSS: So the above three shows unnecessary design redundancy as well as a burden for the future extension. Why not simply have 1) TUNSETVNET_RSS 2) TUNSETVNET_HASH_REPORT ? Which maps to #define VIRTIO_NET_CTRL_MQ_RSS_CONFIG 1 (for configurable receive steering) #define VIRTIO_NET_CTRL_MQ_HASH_CONFIG 2 (for configurable hash calculation) It would be always easier to start with what spec had or at least we need to explain why we choose a different design here or in the changelog to ease our life. One day we would have tun_select_queue_algorithm_x() we don't have to duplicate the ioctls once again here like TUNSETVNETREPORTINGXYZ > + if (copy_from_user(&common, argp, sizeof(common))) > + return -EFAULT; > + argp = (struct tun_vnet_rss __user *)argp + 1; > + > + indirection_table_size = ((size_t)common.indirection_table_mask + 1) * 2; > + key_size = virtio_net_hash_key_length(common.hash_types); > + size = struct_size(hash, rss_indirection_table, > + (size_t)common.indirection_table_mask + 1); > + > + hash = kmalloc(size, GFP_KERNEL); > + if (!hash) > + return -ENOMEM; > + > + if (copy_from_user(hash->rss_indirection_table, > + argp, indirection_table_size)) { > + kfree(hash); > + return -EFAULT; > + } > + argp = (u16 __user *)argp + common.indirection_table_mask + 1; > + > + if (copy_from_user(hash->rss_key, argp, key_size)) { > + kfree(hash); > + return -EFAULT; > + } > + > + virtio_net_toeplitz_convert_key(hash->rss_key, key_size); > + hash->report = cmd == TUNSETVNETREPORTINGRSS; At least, if this is the only difference why not simply code this into the ioctl itself other than having a very similar command? > + hash->rss = true; > + hash->common = common; > + break; > + > + default: > + return -EINVAL; > + } > + > + kfree_rcu_mightsleep(rcu_replace_pointer_rtnl(*hashp, hash)); > + return 0; > +} > + > +static inline void tun_vnet_hash_report(const struct tun_vnet_hash *hash, > + struct sk_buff *skb, > + const struct flow_keys_basic *keys, > + u32 value, > + tun_vnet_hash_add vnet_hash_add) > +{ > + struct virtio_net_hash *report; > + > + if (!hash || !hash->report) > + return; > + > + report = vnet_hash_add(skb); > + if (!report) > + return; > + > + *report = (struct virtio_net_hash) { > + .report = virtio_net_hash_report(hash->common.hash_types, keys), > + .value = value > + }; > +} > + > +static inline u16 tun_vnet_rss_select_queue(u32 numqueues, > + const struct tun_vnet_hash *hash, > + struct sk_buff *skb, > + tun_vnet_hash_add vnet_hash_add) > +{ > + struct virtio_net_hash *report; > + struct virtio_net_hash ret; > + u16 index; > + > + if (!numqueues) > + return 0; > + > + virtio_net_hash_rss(skb, hash->common.hash_types, hash->rss_key, &ret); > + > + if (!ret.report) > + return hash->common.unclassified_queue % numqueues; > + > + if (hash->report) { > + report = vnet_hash_add(skb); > + if (report) > + *report = ret; > + } > + > + index = ret.value & hash->common.indirection_table_mask; > + > + return hash->rss_indirection_table[index] % numqueues; > +} > + > static inline int tun_vnet_hdr_get(int sz, unsigned int flags, > struct iov_iter *from, > struct virtio_net_hdr *hdr) > @@ -135,15 +268,17 @@ static inline int tun_vnet_hdr_get(int sz, unsigned int flags, > } > > static inline int tun_vnet_hdr_put(int sz, struct iov_iter *iter, > - const struct virtio_net_hdr *hdr) > + const struct virtio_net_hdr_v1_hash *hdr) > { > + int content_sz = MIN(sizeof(*hdr), sz); > + > if (unlikely(iov_iter_count(iter) < sz)) > return -EINVAL; > > - if (unlikely(copy_to_iter(hdr, sizeof(*hdr), iter) != sizeof(*hdr))) > + if (unlikely(copy_to_iter(hdr, content_sz, iter) != content_sz)) > return -EFAULT; > > - if (iov_iter_zero(sz - sizeof(*hdr), iter) != sz - sizeof(*hdr)) > + if (iov_iter_zero(sz - content_sz, iter) != sz - content_sz) > return -EFAULT; > > return 0; > @@ -155,26 +290,38 @@ static inline int tun_vnet_hdr_to_skb(unsigned int flags, struct sk_buff *skb, > return virtio_net_hdr_to_skb(skb, hdr, tun_vnet_is_little_endian(flags)); > } > > -static inline int tun_vnet_hdr_from_skb(unsigned int flags, > +static inline int tun_vnet_hdr_from_skb(int sz, unsigned int flags, > const struct net_device *dev, > const struct sk_buff *skb, > - struct virtio_net_hdr *hdr) > + tun_vnet_hash_find vnet_hash_find, > + struct virtio_net_hdr_v1_hash *hdr) > { > int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0; > + const struct virtio_net_hash *report = sz < sizeof(struct virtio_net_hdr_v1_hash) ? > + NULL : vnet_hash_find(skb); > + > + *hdr = (struct virtio_net_hdr_v1_hash) { > + .hash_report = VIRTIO_NET_HASH_REPORT_NONE > + }; > + > + if (report) { > + hdr->hash_value = cpu_to_le32(report->value); > + hdr->hash_report = cpu_to_le16(report->report); > + } > > - if (virtio_net_hdr_from_skb(skb, hdr, > + if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)hdr, > tun_vnet_is_little_endian(flags), true, > vlan_hlen)) { > struct skb_shared_info *sinfo = skb_shinfo(skb); > > if (net_ratelimit()) { > netdev_err(dev, "unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n", > - sinfo->gso_type, tun_vnet16_to_cpu(flags, hdr->gso_size), > - tun_vnet16_to_cpu(flags, hdr->hdr_len)); > + sinfo->gso_type, tun_vnet16_to_cpu(flags, hdr->hdr.gso_size), > + tun_vnet16_to_cpu(flags, hdr->hdr.hdr_len)); > print_hex_dump(KERN_ERR, "tun: ", > DUMP_PREFIX_NONE, > 16, 1, skb->head, > - min(tun_vnet16_to_cpu(flags, hdr->hdr_len), 64), true); > + min(tun_vnet16_to_cpu(flags, hdr->hdr.hdr_len), 64), true); > } > WARN_ON_ONCE(1); > return -EINVAL; > diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h > index 980de74724fc..fe4b984d3bbb 100644 > --- a/include/uapi/linux/if_tun.h > +++ b/include/uapi/linux/if_tun.h > @@ -62,6 +62,62 @@ > #define TUNSETCARRIER _IOW('T', 226, int) > #define TUNGETDEVNETNS _IO('T', 227) > > +/** > + * define TUNGETVNETHASHTYPES - ioctl to get supported virtio_net hashing types > + * > + * The argument is a pointer to __u32 which will store the supported virtio_net > + * hashing types. > + */ > +#define TUNGETVNETHASHTYPES _IOR('T', 228, __u32) > + > +/** > + * define TUNSETVNETREPORTINGAUTOMQ - ioctl to enable automq with hash reporting > + * > + * Disable RSS and enable automatic receive steering with hash reporting. > + * > + * The argument is a pointer to __u32 that contains a bitmask of hash types > + * allowed to be reported. > + * > + * This ioctl results in %EBADFD if the underlying device is deleted. It affects > + * all queues attached to the same device. > + * > + * This ioctl currently has no effect on XDP packets and packets with > + * queue_mapping set by TC. > + */ > +#define TUNSETVNETREPORTINGAUTOMQ _IOR('T', 229, __u32) > + > +/** > + * define TUNSETVNETREPORTINGRSS - ioctl to enable RSS with hash reporting > + * > + * Disable automatic receive steering and enable RSS with hash reporting. This is unnecessary, e.g one day will have select_queue_xyz(), we don't want to say "Disable automatic receive steering and xyz ..." > + * > + * This ioctl results in %EBADFD if the underlying device is deleted. It affects > + * all queues attached to the same device. > + * > + * This ioctl currently has no effect on XDP packets and packets with > + * queue_mapping set by TC. > + */ > +#define TUNSETVNETREPORTINGRSS _IOR('T', 230, struct tun_vnet_rss) > + > +/** > + * define TUNSETVNETRSS - ioctl to enable RSS without hash reporting > + * > + * Disable automatic receive steering and enable RSS without hash reporting. Same here. > + * > + * The argument is a pointer to the compound of the following in order: > + * > + * 1. &struct tun_vnet_rss > + * 3. Indirection table > + * 4. Key > + * > + * This ioctl results in %EBADFD if the underlying device is deleted. It affects > + * all queues attached to the same device. > + * > + * This ioctl currently has no effect on XDP packets and packets with > + * queue_mapping set by TC. > + */ > +#define TUNSETVNETRSS _IOR('T', 231, struct tun_vnet_rss) > + > /* TUNSETIFF ifr flags */ > #define IFF_TUN 0x0001 > #define IFF_TAP 0x0002 > @@ -124,4 +180,19 @@ struct tun_filter { > */ > #define TUN_STEERINGEBPF_FALLBACK -1 > > +/** > + * struct tun_vnet_rss - virtio_net RSS configuration > + * @hash_types: > + * Bitmask of allowed hash types > + * @indirection_table_mask: > + * Bitmask to be applied to the indirection table index > + * @unclassified_queue: > + * The index of the queue to place unclassified packets in > + */ > +struct tun_vnet_rss { > + __u32 hash_types; > + __u16 indirection_table_mask; > + __u16 unclassified_queue; > +}; > + > #endif /* _UAPI__IF_TUN_H */ > > -- > 2.49.0 > Thanks