On Fri, Jun 6, 2025 at 5:10 PM Akihiko Odaki <akihiko.odaki@xxxxxxxxxx> wrote: > > On 2025/06/06 9:48, Jason Wang wrote: > > On Thu, Jun 5, 2025 at 3:58 PM Akihiko Odaki <akihiko.odaki@xxxxxxxxxx> wrote: > >> > >> On 2025/06/05 10:53, Jason Wang wrote: > >>> On Wed, Jun 4, 2025 at 3:20 PM Akihiko Odaki <akihiko.odaki@xxxxxxxxxx> wrote: > >>>> > >>>> On 2025/06/04 10:18, Jason Wang wrote: > >>>>> On Tue, Jun 3, 2025 at 1:31 PM Akihiko Odaki <akihiko.odaki@xxxxxxxxxx> wrote: > >>>>>> > >>>>>> On 2025/06/03 12:19, Jason Wang wrote: > >>>>>>> On Fri, May 30, 2025 at 12:50 PM Akihiko Odaki <akihiko.odaki@xxxxxxxxxx> wrote: > >>>>>>>> > >>>>>>>> They are useful to implement VIRTIO_NET_F_RSS and > >>>>>>>> VIRTIO_NET_F_HASH_REPORT. > >>>>>>>> > >>>>>>>> Signed-off-by: Akihiko Odaki <akihiko.odaki@xxxxxxxxxx> > >>>>>>>> Tested-by: Lei Yang <leiyang@xxxxxxxxxx> > >>>>>>>> --- > >>>>>>>> include/linux/virtio_net.h | 188 +++++++++++++++++++++++++++++++++++++++++++++ > >>>>>>>> 1 file changed, 188 insertions(+) > >>>>>>>> > >>>>>>>> diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h > >>>>>>>> index 02a9f4dc594d..426f33b4b824 100644 > >>>>>>>> --- a/include/linux/virtio_net.h > >>>>>>>> +++ b/include/linux/virtio_net.h > >>>>>>>> @@ -9,6 +9,194 @@ > >>>>>>>> #include <uapi/linux/tcp.h> > >>>>>>>> #include <uapi/linux/virtio_net.h> > >>>>>>>> > >>>>>>>> +struct virtio_net_hash { > >>>>>>>> + u32 value; > >>>>>>>> + u16 report; > >>>>>>>> +}; > >>>>>>>> + > >>>>>>>> +struct virtio_net_toeplitz_state { > >>>>>>>> + u32 hash; > >>>>>>>> + const u32 *key; > >>>>>>>> +}; > >>>>>>>> + > >>>>>>>> +#define VIRTIO_NET_SUPPORTED_HASH_TYPES (VIRTIO_NET_RSS_HASH_TYPE_IPv4 | \ > >>>>>>>> + VIRTIO_NET_RSS_HASH_TYPE_TCPv4 | \ > >>>>>>>> + VIRTIO_NET_RSS_HASH_TYPE_UDPv4 | \ > >>>>>>>> + VIRTIO_NET_RSS_HASH_TYPE_IPv6 | \ > >>>>>>>> + VIRTIO_NET_RSS_HASH_TYPE_TCPv6 | \ > >>>>>>>> + VIRTIO_NET_RSS_HASH_TYPE_UDPv6) > >>>>>>>> + > >>>>>>>> +#define VIRTIO_NET_RSS_MAX_KEY_SIZE 40 > >>>>>>>> + > >>>>>>>> +static inline void virtio_net_toeplitz_convert_key(u32 *input, size_t len) > >>>>>>>> +{ > >>>>>>>> + while (len >= sizeof(*input)) { > >>>>>>>> + *input = be32_to_cpu((__force __be32)*input); > >>>>>>>> + input++; > >>>>>>>> + len -= sizeof(*input); > >>>>>>>> + } > >>>>>>>> +} > >>>>>>>> + > >>>>>>>> +static inline void virtio_net_toeplitz_calc(struct virtio_net_toeplitz_state *state, > >>>>>>>> + const __be32 *input, size_t len) > >>>>>>>> +{ > >>>>>>>> + while (len >= sizeof(*input)) { > >>>>>>>> + for (u32 map = be32_to_cpu(*input); map; map &= (map - 1)) { > >>>>>>>> + u32 i = ffs(map); > >>>>>>>> + > >>>>>>>> + state->hash ^= state->key[0] << (32 - i) | > >>>>>>>> + (u32)((u64)state->key[1] >> i); > >>>>>>>> + } > >>>>>>>> + > >>>>>>>> + state->key++; > >>>>>>>> + input++; > >>>>>>>> + len -= sizeof(*input); > >>>>>>>> + } > >>>>>>>> +} > >>>>>>>> + > >>>>>>>> +static inline u8 virtio_net_hash_key_length(u32 types) > >>>>>>>> +{ > >>>>>>>> + size_t len = 0; > >>>>>>>> + > >>>>>>>> + if (types & VIRTIO_NET_HASH_REPORT_IPv4) > >>>>>>>> + len = max(len, > >>>>>>>> + sizeof(struct flow_dissector_key_ipv4_addrs)); > >>>>>>>> + > >>>>>>>> + if (types & > >>>>>>>> + (VIRTIO_NET_HASH_REPORT_TCPv4 | VIRTIO_NET_HASH_REPORT_UDPv4)) > >>>>>>>> + len = max(len, > >>>>>>>> + sizeof(struct flow_dissector_key_ipv4_addrs) + > >>>>>>>> + sizeof(struct flow_dissector_key_ports)); > >>>>>>>> + > >>>>>>>> + if (types & VIRTIO_NET_HASH_REPORT_IPv6) > >>>>>>>> + len = max(len, > >>>>>>>> + sizeof(struct flow_dissector_key_ipv6_addrs)); > >>>>>>>> + > >>>>>>>> + if (types & > >>>>>>>> + (VIRTIO_NET_HASH_REPORT_TCPv6 | VIRTIO_NET_HASH_REPORT_UDPv6)) > >>>>>>>> + len = max(len, > >>>>>>>> + sizeof(struct flow_dissector_key_ipv6_addrs) + > >>>>>>>> + sizeof(struct flow_dissector_key_ports)); > >>>>>>>> + > >>>>>>>> + return len + sizeof(u32); > >>>>>>>> +} > >>>>>>>> + > >>>>>>>> +static inline u32 virtio_net_hash_report(u32 types, > >>>>>>>> + const struct flow_keys_basic *keys) > >>>>>>>> +{ > >>>>>>>> + switch (keys->basic.n_proto) { > >>>>>>>> + case cpu_to_be16(ETH_P_IP): > >>>>>>>> + if (!(keys->control.flags & FLOW_DIS_IS_FRAGMENT)) { > >>>>>>>> + if (keys->basic.ip_proto == IPPROTO_TCP && > >>>>>>>> + (types & VIRTIO_NET_RSS_HASH_TYPE_TCPv4)) > >>>>>>>> + return VIRTIO_NET_HASH_REPORT_TCPv4; > >>>>>>>> + > >>>>>>>> + if (keys->basic.ip_proto == IPPROTO_UDP && > >>>>>>>> + (types & VIRTIO_NET_RSS_HASH_TYPE_UDPv4)) > >>>>>>>> + return VIRTIO_NET_HASH_REPORT_UDPv4; > >>>>>>>> + } > >>>>>>>> + > >>>>>>>> + if (types & VIRTIO_NET_RSS_HASH_TYPE_IPv4) > >>>>>>>> + return VIRTIO_NET_HASH_REPORT_IPv4; > >>>>>>>> + > >>>>>>>> + return VIRTIO_NET_HASH_REPORT_NONE; > >>>>>>>> + > >>>>>>>> + case cpu_to_be16(ETH_P_IPV6): > >>>>>>>> + if (!(keys->control.flags & FLOW_DIS_IS_FRAGMENT)) { > >>>>>>>> + if (keys->basic.ip_proto == IPPROTO_TCP && > >>>>>>>> + (types & VIRTIO_NET_RSS_HASH_TYPE_TCPv6)) > >>>>>>>> + return VIRTIO_NET_HASH_REPORT_TCPv6; > >>>>>>>> + > >>>>>>>> + if (keys->basic.ip_proto == IPPROTO_UDP && > >>>>>>>> + (types & VIRTIO_NET_RSS_HASH_TYPE_UDPv6)) > >>>>>>>> + return VIRTIO_NET_HASH_REPORT_UDPv6; > >>>>>>>> + } > >>>>>>>> + > >>>>>>>> + if (types & VIRTIO_NET_RSS_HASH_TYPE_IPv6) > >>>>>>>> + return VIRTIO_NET_HASH_REPORT_IPv6; > >>>>>>>> + > >>>>>>>> + return VIRTIO_NET_HASH_REPORT_NONE; > >>>>>>>> + > >>>>>>>> + default: > >>>>>>>> + return VIRTIO_NET_HASH_REPORT_NONE; > >>>>>>>> + } > >>>>>>>> +} > >>>>>>>> + > >>>>>>>> +static inline void virtio_net_hash_rss(const struct sk_buff *skb, > >>>>>>>> + u32 types, const u32 *key, > >>>>>>>> + struct virtio_net_hash *hash) > >>>>>>>> +{ > >>>>>>>> + struct virtio_net_toeplitz_state toeplitz_state = { .key = key }; > >>>>>>>> + struct flow_keys flow; > >>>>>>>> + struct flow_keys_basic flow_basic; > >>>>>>>> + u16 report; > >>>>>>>> + > >>>>>>>> + if (!skb_flow_dissect_flow_keys(skb, &flow, 0)) { > >>>>>>>> + hash->report = VIRTIO_NET_HASH_REPORT_NONE; > >>>>>>>> + return; > >>>>>>>> + } > >>>>>>>> + > >>>>>>>> + flow_basic = (struct flow_keys_basic) { > >>>>>>>> + .control = flow.control, > >>>>>>>> + .basic = flow.basic > >>>>>>>> + }; > >>>>>>>> + > >>>>>>>> + report = virtio_net_hash_report(types, &flow_basic); > >>>>>>>> + > >>>>>>>> + switch (report) { > >>>>>>>> + case VIRTIO_NET_HASH_REPORT_IPv4: > >>>>>>>> + virtio_net_toeplitz_calc(&toeplitz_state, > >>>>>>>> + (__be32 *)&flow.addrs.v4addrs, > >>>>>>>> + sizeof(flow.addrs.v4addrs)); > >>>>>>>> + break; > >>>>>>>> + > >>>>>>>> + case VIRTIO_NET_HASH_REPORT_TCPv4: > >>>>>>>> + virtio_net_toeplitz_calc(&toeplitz_state, > >>>>>>>> + (__be32 *)&flow.addrs.v4addrs, > >>>>>>>> + sizeof(flow.addrs.v4addrs)); > >>>>>>>> + virtio_net_toeplitz_calc(&toeplitz_state, &flow.ports.ports, > >>>>>>>> + sizeof(flow.ports.ports)); > >>>>>>>> + break; > >>>>>>>> + > >>>>>>>> + case VIRTIO_NET_HASH_REPORT_UDPv4: > >>>>>>>> + virtio_net_toeplitz_calc(&toeplitz_state, > >>>>>>>> + (__be32 *)&flow.addrs.v4addrs, > >>>>>>>> + sizeof(flow.addrs.v4addrs)); > >>>>>>>> + virtio_net_toeplitz_calc(&toeplitz_state, &flow.ports.ports, > >>>>>>>> + sizeof(flow.ports.ports)); > >>>>>>>> + break; > >>>>>>>> + > >>>>>>>> + case VIRTIO_NET_HASH_REPORT_IPv6: > >>>>>>>> + virtio_net_toeplitz_calc(&toeplitz_state, > >>>>>>>> + (__be32 *)&flow.addrs.v6addrs, > >>>>>>>> + sizeof(flow.addrs.v6addrs)); > >>>>>>>> + break; > >>>>>>>> + > >>>>>>>> + case VIRTIO_NET_HASH_REPORT_TCPv6: > >>>>>>>> + virtio_net_toeplitz_calc(&toeplitz_state, > >>>>>>>> + (__be32 *)&flow.addrs.v6addrs, > >>>>>>>> + sizeof(flow.addrs.v6addrs)); > >>>>>>>> + virtio_net_toeplitz_calc(&toeplitz_state, &flow.ports.ports, > >>>>>>>> + sizeof(flow.ports.ports)); > >>>>>>>> + break; > >>>>>>>> + > >>>>>>>> + case VIRTIO_NET_HASH_REPORT_UDPv6: > >>>>>>>> + virtio_net_toeplitz_calc(&toeplitz_state, > >>>>>>>> + (__be32 *)&flow.addrs.v6addrs, > >>>>>>>> + sizeof(flow.addrs.v6addrs)); > >>>>>>>> + virtio_net_toeplitz_calc(&toeplitz_state, &flow.ports.ports, > >>>>>>>> + sizeof(flow.ports.ports)); > >>>>>>>> + break; > >>>>>>>> + > >>>>>>>> + default: > >>>>>>>> + hash->report = VIRTIO_NET_HASH_REPORT_NONE; > >>>>>>>> + return; > >>>>>>> > >>>>>>> So I still think we need a comment here to explain why this is not an > >>>>>>> issue if the device can report HASH_XXX_EX. Or we need to add the > >>>>>>> support, since this is the code from the driver side, I don't think we > >>>>>>> need to worry about the device implementation issues. > >>>>>> > >>>>>> This is on the device side, and don't report HASH_TYPE_XXX_EX. > >>>>>> > >>>>>>> > >>>>>>> For the issue of the number of options, does the spec forbid fallback > >>>>>>> to VIRTIO_NET_HASH_REPORT_NONE? If not, we can do that. > >>>>>> > >>>>>> 5.1.6.4.3.4 "IPv6 packets with extension header" says: > >>>>>> > If VIRTIO_NET_HASH_TYPE_TCP_EX is set and the packet has a TCPv6 > >>>>>> > header, the hash is calculated over the following fields: > >>>>>> > - Home address from the home address option in the IPv6 destination > >>>>>> > options header. If the extension header is not present, use the > >>>>>> > Source IPv6 address. > >>>>>> > - IPv6 address that is contained in the Routing-Header-Type-2 from the > >>>>>> > associated extension header. If the extension header is not present, > >>>>>> > use the Destination IPv6 address. > >>>>>> > - Source TCP port > >>>>>> > - Destination TCP port > >>>>>> > >>>>>> Therefore, if VIRTIO_NET_HASH_TYPE_TCP_EX is set, the packet has a TCPv6 > >>>>>> and an home address option in the IPv6 destination options header is > >>>>>> present, the hash is calculated over the home address. If the hash is > >>>>>> not calculated over the home address in such a case, the device is > >>>>>> contradicting with this section and violating the spec. The same goes > >>>>>> for the other HASH_TYPE_XXX_EX types and Routing-Header-Type-2. > >>>>> > >>>>> Just to make sure we are one the same page. I meant: > >>>>> > >>>>> 1) If the hash is not calculated over the home address (in the case of > >>>>> IPv6 destination destination), it can still report > >>>>> VIRTIO_NET_RSS_HASH_TYPE_IPv6. This is what you implemented in your > >>>>> series. So the device can simply fallback to e.g TCPv6 if it can't > >>>>> understand all or part of the IPv6 options. > >>>> > >>>> The spec says it can fallback if "the extension header is not present", > >>>> not if the device can't understand the extension header. > >>> > >>> I don't think so, > >>> > >>> 1) spec had a condition beforehand: > >>> > >>> """ > >>> If VIRTIO_NET_HASH_TYPE_TCP_EX is set and the packet has a TCPv6 > >>> header, the hash is calculated over the following fields: > >>> ... > >>> If the extension header is not present ... > >>> """ > >>> > >>> So the device can choose not to set VIRTIO_NET_HASH_TYPE_TCP_EX as > >>> spec doesn't say device MUST set VIRTIO_NET_HASH_TYPE_TCP_EX if ... > >>> > >>> 2) implementation wise, since device has limited resources, we can't > >>> expect the device can parse arbitrary number of ipv6 options > >>> > >>> 3) if 1) and 2) not the case, we need fix the spec otherwise implement > >>> a spec compliant device is impractical > >> > >> The statement is preceded by the following: > >> > The device calculates the hash on IPv4 packets according to > >> > ’Enabled hash types’ bitmask as follows: > >> > >> The 'Enabled hash types' bitmask is specified by the device. > >> > >> I think the spec needs amendment. > > > > Michael, can you help to clarify here? > > > >> > >> I wonder if there are any people interested in the feature though. > >> Looking at virtnet_set_hashflow() in drivers/net/virtio_net.c, the > >> driver of Linux does not let users configure HASH_TYPE_XXX_EX. I suppose > >> Windows supports HASH_TYPE_XXX_EX, but those who care network > >> performance most would use Linux so HASH_TYPE_XXX_EX support without > >> Linux driver's support may not be useful. > > > > It might be still interesting for example for the hardware virtio > > vendors to support windows etc. > > I don't know if Windows needs them for e.g., device/driver certification > so surveying Windows makes sense. Yuri, can you help to clarify this? > > > > >> > >>> > >>>> > >>>>> 2) the VIRTIO_NET_SUPPORTED_HASH_TYPES is not checked against the > >>>>> tun_vnet_ioctl_sethash(), so userspace may set > >>>>> VIRTIO_NET_HASH_TYPE_TCP_EX regardless of what has been returned by > >>>>> tun_vnet_ioctl_gethashtypes(). In this case they won't get > >>>>> VIRTIO_NET_HASH_TYPE_TCP_EX. > >>>> > >>>> That's right. It's the responsibility of the userspace to set only the > >>>> supported hash types. > >>> > >>> Well, the kernel should filter out the unsupported one to have a > >>> robust uAPI. Otherwise, we give green light to the buggy userspace > >>> which will have unexpected results. > >> > >> My reasoning was that it may be fine for some use cases other than VM > >> (e.g., DPDK); in such a use case, it is fine as long as the UAPI works > >> in the best-effort basis. > > > > Best-effort might increase the chance for user visisable changes after > > migration. > > It is a trade-off between catching a migration bug for VMM and making > life a bit easier for userspace programs other than VMM. My understanding is to avoid breaking the migration compatibility as much as possible as fixing that would be complicated or even impossible. > > > > >> > >> For example, suppose a userspace program that processes TCP packets; the > >> program can enable: HASH_TYPE_IPv4, HASH_TYPE_TCPv4, HASH_TYPE_IPv6, and > >> HASH_TYPE_TCPv6. Ideally, the kernel should support all the hash types, > >> but, even if e.g., HASH_TYPE_TCPv6 is not available, > > > > For "available" did you mean it is not supported by the device? > > > >> it will fall back > >> to HASH_TYPE_IPv6, which still does something good and may be acceptable. > > > > This fallback is exactly the same as I said above, let > > VIRTIO_NET_HASH_TYPE_TCP_EX to fallback. > > > > My point is that, the implementation should either: > > > > 1) allow fallback so it can claim to support all hash types > > > > or > > > > 2) don't allow fallback so it can only support a part of the hash types > > > > If we're doing something in the middle, for example, allow part of the > > type to fallback. > > 1) or the middle will make it unsuitable for VM because it violates the > virtio spec. 2) makes sense though the trade-off I mentioned should be > taken into consideration. > > > > >> > >> That said, for a use case that involves VM and implements virtio-net > >> (e.g., QEMU), setting an unsupported hash type here is definitely a bug. > >> Catching the bug may outweigh the extra trouble for other use cases. > >> > >>> > >>>> > >>>>> 3) implementing part of the hash types might complicate the migration > >>>>> or at least we need to describe the expectations of libvirt or other > >>>>> management in this case. For example, do we plan to have a dedicated > >>>>> Qemu command line like: > >>>>> > >>>>> -device virtio-net-pci,hash_report=on,supported_hash_types=X,Y,Z? > >>>> > >>>> I posted a patch series to implement such a command line for vDPA[1]. > >>>> The patch series that wires this tuntap feature up[2] reuses the > >>>> infrastructure so it doesn't bring additional complexity. > >>>> > >>>> [1] > >>>> https://lore.kernel.org/qemu-devel/20250530-vdpa-v1-0-5af4109b1c19@xxxxxxxxxx/ > >>>> [2] > >>>> https://lore.kernel.org/qemu-devel/20250530-hash-v5-0-343d7d7a8200@xxxxxxxxxx/ > >>> > >>> I meant, if we implement a full hash report feature, it means a single > >>> hash cmdline option is more than sufficient and so compatibility code > >>> can just turn it off when dealing with machine types. This is much > >>> more simpler than > >>> > >>> 1) having both hash as well as supported_hash_features > >>> 2) dealing both hash as well as supported_hash_features in compatibility codes > >>> 3) libvirt will be happy > >>> > >>> For [1], it seems it introduces a per has type option, this seems to > >>> be a burden to the management layer as it need to learn new option > >>> everytime a new hash type is supported > >> > >> Even with the command line you proposed (supported_hash_types=X,Y,Z), it > >> is still necessary to know the values the supported_hash_types property > >> accepts (X.Y,Z), so I don't think it makes difference. > > > > It could be a uint32_t. > > The management layer will need to know what bits are accepted even with > uint32_t. Ease the management, basically it would be used by debugging or machine type only. > > > > >> > >> The burden to the management layer is already present for features, so > >> it is an existing problem (or its mere extension). > > > > Yes, but since this feature is new it's better to try our best to avoid that. > > > >> > >> This problem was discussed in the following thread in the past, but no > >> solution is implemented yet, and probably solving it will be difficult. > >> https://lore.kernel.org/qemu-devel/20230731223148.1002258-5-yuri.benditovich@xxxxxxxxxx/ > > > > It's a similar issue but not the same, it looks more like a discussion > > on whether the fallback from vhost-net to qemu works for missing > > features etc. > > Perhaps we may be able to do better since this feature is new as you say > and we don't have to worry much about breaking change. I don't have an > idea for that yet. Right. > > Regards, > Akihiko Odaki > Thanks