Re: [PATCH net-next v12 01/10] virtio_net: Add functions for hashing

Jason Wang <jasowang@xxxxxxxxxx> · Tue, 17 Jun 2025 11:28:26 +0800

On Fri, Jun 6, 2025 at 5:10 PM Akihiko Odaki <akihiko.odaki@xxxxxxxxxx> wrote:
>
> On 2025/06/06 9:48, Jason Wang wrote:
> > On Thu, Jun 5, 2025 at 3:58 PM Akihiko Odaki <akihiko.odaki@xxxxxxxxxx> wrote:
> >>
> >> On 2025/06/05 10:53, Jason Wang wrote:
> >>> On Wed, Jun 4, 2025 at 3:20 PM Akihiko Odaki <akihiko.odaki@xxxxxxxxxx> wrote:
> >>>>
> >>>> On 2025/06/04 10:18, Jason Wang wrote:
> >>>>> On Tue, Jun 3, 2025 at 1:31 PM Akihiko Odaki <akihiko.odaki@xxxxxxxxxx> wrote:
> >>>>>>
> >>>>>> On 2025/06/03 12:19, Jason Wang wrote:
> >>>>>>> On Fri, May 30, 2025 at 12:50 PM Akihiko Odaki <akihiko.odaki@xxxxxxxxxx> wrote:
> >>>>>>>>
> >>>>>>>> They are useful to implement VIRTIO_NET_F_RSS and
> >>>>>>>> VIRTIO_NET_F_HASH_REPORT.
> >>>>>>>>
> >>>>>>>> Signed-off-by: Akihiko Odaki <akihiko.odaki@xxxxxxxxxx>
> >>>>>>>> Tested-by: Lei Yang <leiyang@xxxxxxxxxx>
> >>>>>>>> ---
> >>>>>>>>      include/linux/virtio_net.h | 188 +++++++++++++++++++++++++++++++++++++++++++++
> >>>>>>>>      1 file changed, 188 insertions(+)
> >>>>>>>>
> >>>>>>>> diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
> >>>>>>>> index 02a9f4dc594d..426f33b4b824 100644
> >>>>>>>> --- a/include/linux/virtio_net.h
> >>>>>>>> +++ b/include/linux/virtio_net.h
> >>>>>>>> @@ -9,6 +9,194 @@
> >>>>>>>>      #include <uapi/linux/tcp.h>
> >>>>>>>>      #include <uapi/linux/virtio_net.h>
> >>>>>>>>
> >>>>>>>> +struct virtio_net_hash {
> >>>>>>>> +       u32 value;
> >>>>>>>> +       u16 report;
> >>>>>>>> +};
> >>>>>>>> +
> >>>>>>>> +struct virtio_net_toeplitz_state {
> >>>>>>>> +       u32 hash;
> >>>>>>>> +       const u32 *key;
> >>>>>>>> +};
> >>>>>>>> +
> >>>>>>>> +#define VIRTIO_NET_SUPPORTED_HASH_TYPES (VIRTIO_NET_RSS_HASH_TYPE_IPv4 | \
> >>>>>>>> +                                        VIRTIO_NET_RSS_HASH_TYPE_TCPv4 | \
> >>>>>>>> +                                        VIRTIO_NET_RSS_HASH_TYPE_UDPv4 | \
> >>>>>>>> +                                        VIRTIO_NET_RSS_HASH_TYPE_IPv6 | \
> >>>>>>>> +                                        VIRTIO_NET_RSS_HASH_TYPE_TCPv6 | \
> >>>>>>>> +                                        VIRTIO_NET_RSS_HASH_TYPE_UDPv6)
> >>>>>>>> +
> >>>>>>>> +#define VIRTIO_NET_RSS_MAX_KEY_SIZE 40
> >>>>>>>> +
> >>>>>>>> +static inline void virtio_net_toeplitz_convert_key(u32 *input, size_t len)
> >>>>>>>> +{
> >>>>>>>> +       while (len >= sizeof(*input)) {
> >>>>>>>> +               *input = be32_to_cpu((__force __be32)*input);
> >>>>>>>> +               input++;
> >>>>>>>> +               len -= sizeof(*input);
> >>>>>>>> +       }
> >>>>>>>> +}
> >>>>>>>> +
> >>>>>>>> +static inline void virtio_net_toeplitz_calc(struct virtio_net_toeplitz_state *state,
> >>>>>>>> +                                           const __be32 *input, size_t len)
> >>>>>>>> +{
> >>>>>>>> +       while (len >= sizeof(*input)) {
> >>>>>>>> +               for (u32 map = be32_to_cpu(*input); map; map &= (map - 1)) {
> >>>>>>>> +                       u32 i = ffs(map);
> >>>>>>>> +
> >>>>>>>> +                       state->hash ^= state->key[0] << (32 - i) |
> >>>>>>>> +                                      (u32)((u64)state->key[1] >> i);
> >>>>>>>> +               }
> >>>>>>>> +
> >>>>>>>> +               state->key++;
> >>>>>>>> +               input++;
> >>>>>>>> +               len -= sizeof(*input);
> >>>>>>>> +       }
> >>>>>>>> +}
> >>>>>>>> +
> >>>>>>>> +static inline u8 virtio_net_hash_key_length(u32 types)
> >>>>>>>> +{
> >>>>>>>> +       size_t len = 0;
> >>>>>>>> +
> >>>>>>>> +       if (types & VIRTIO_NET_HASH_REPORT_IPv4)
> >>>>>>>> +               len = max(len,
> >>>>>>>> +                         sizeof(struct flow_dissector_key_ipv4_addrs));
> >>>>>>>> +
> >>>>>>>> +       if (types &
> >>>>>>>> +           (VIRTIO_NET_HASH_REPORT_TCPv4 | VIRTIO_NET_HASH_REPORT_UDPv4))
> >>>>>>>> +               len = max(len,
> >>>>>>>> +                         sizeof(struct flow_dissector_key_ipv4_addrs) +
> >>>>>>>> +                         sizeof(struct flow_dissector_key_ports));
> >>>>>>>> +
> >>>>>>>> +       if (types & VIRTIO_NET_HASH_REPORT_IPv6)
> >>>>>>>> +               len = max(len,
> >>>>>>>> +                         sizeof(struct flow_dissector_key_ipv6_addrs));
> >>>>>>>> +
> >>>>>>>> +       if (types &
> >>>>>>>> +           (VIRTIO_NET_HASH_REPORT_TCPv6 | VIRTIO_NET_HASH_REPORT_UDPv6))
> >>>>>>>> +               len = max(len,
> >>>>>>>> +                         sizeof(struct flow_dissector_key_ipv6_addrs) +
> >>>>>>>> +                         sizeof(struct flow_dissector_key_ports));
> >>>>>>>> +
> >>>>>>>> +       return len + sizeof(u32);
> >>>>>>>> +}
> >>>>>>>> +
> >>>>>>>> +static inline u32 virtio_net_hash_report(u32 types,
> >>>>>>>> +                                        const struct flow_keys_basic *keys)
> >>>>>>>> +{
> >>>>>>>> +       switch (keys->basic.n_proto) {
> >>>>>>>> +       case cpu_to_be16(ETH_P_IP):
> >>>>>>>> +               if (!(keys->control.flags & FLOW_DIS_IS_FRAGMENT)) {
> >>>>>>>> +                       if (keys->basic.ip_proto == IPPROTO_TCP &&
> >>>>>>>> +                           (types & VIRTIO_NET_RSS_HASH_TYPE_TCPv4))
> >>>>>>>> +                               return VIRTIO_NET_HASH_REPORT_TCPv4;
> >>>>>>>> +
> >>>>>>>> +                       if (keys->basic.ip_proto == IPPROTO_UDP &&
> >>>>>>>> +                           (types & VIRTIO_NET_RSS_HASH_TYPE_UDPv4))
> >>>>>>>> +                               return VIRTIO_NET_HASH_REPORT_UDPv4;
> >>>>>>>> +               }
> >>>>>>>> +
> >>>>>>>> +               if (types & VIRTIO_NET_RSS_HASH_TYPE_IPv4)
> >>>>>>>> +                       return VIRTIO_NET_HASH_REPORT_IPv4;
> >>>>>>>> +
> >>>>>>>> +               return VIRTIO_NET_HASH_REPORT_NONE;
> >>>>>>>> +
> >>>>>>>> +       case cpu_to_be16(ETH_P_IPV6):
> >>>>>>>> +               if (!(keys->control.flags & FLOW_DIS_IS_FRAGMENT)) {
> >>>>>>>> +                       if (keys->basic.ip_proto == IPPROTO_TCP &&
> >>>>>>>> +                           (types & VIRTIO_NET_RSS_HASH_TYPE_TCPv6))
> >>>>>>>> +                               return VIRTIO_NET_HASH_REPORT_TCPv6;
> >>>>>>>> +
> >>>>>>>> +                       if (keys->basic.ip_proto == IPPROTO_UDP &&
> >>>>>>>> +                           (types & VIRTIO_NET_RSS_HASH_TYPE_UDPv6))
> >>>>>>>> +                               return VIRTIO_NET_HASH_REPORT_UDPv6;
> >>>>>>>> +               }
> >>>>>>>> +
> >>>>>>>> +               if (types & VIRTIO_NET_RSS_HASH_TYPE_IPv6)
> >>>>>>>> +                       return VIRTIO_NET_HASH_REPORT_IPv6;
> >>>>>>>> +
> >>>>>>>> +               return VIRTIO_NET_HASH_REPORT_NONE;
> >>>>>>>> +
> >>>>>>>> +       default:
> >>>>>>>> +               return VIRTIO_NET_HASH_REPORT_NONE;
> >>>>>>>> +       }
> >>>>>>>> +}
> >>>>>>>> +
> >>>>>>>> +static inline void virtio_net_hash_rss(const struct sk_buff *skb,
> >>>>>>>> +                                      u32 types, const u32 *key,
> >>>>>>>> +                                      struct virtio_net_hash *hash)
> >>>>>>>> +{
> >>>>>>>> +       struct virtio_net_toeplitz_state toeplitz_state = { .key = key };
> >>>>>>>> +       struct flow_keys flow;
> >>>>>>>> +       struct flow_keys_basic flow_basic;
> >>>>>>>> +       u16 report;
> >>>>>>>> +
> >>>>>>>> +       if (!skb_flow_dissect_flow_keys(skb, &flow, 0)) {
> >>>>>>>> +               hash->report = VIRTIO_NET_HASH_REPORT_NONE;
> >>>>>>>> +               return;
> >>>>>>>> +       }
> >>>>>>>> +
> >>>>>>>> +       flow_basic = (struct flow_keys_basic) {
> >>>>>>>> +               .control = flow.control,
> >>>>>>>> +               .basic = flow.basic
> >>>>>>>> +       };
> >>>>>>>> +
> >>>>>>>> +       report = virtio_net_hash_report(types, &flow_basic);
> >>>>>>>> +
> >>>>>>>> +       switch (report) {
> >>>>>>>> +       case VIRTIO_NET_HASH_REPORT_IPv4:
> >>>>>>>> +               virtio_net_toeplitz_calc(&toeplitz_state,
> >>>>>>>> +                                        (__be32 *)&flow.addrs.v4addrs,
> >>>>>>>> +                                        sizeof(flow.addrs.v4addrs));
> >>>>>>>> +               break;
> >>>>>>>> +
> >>>>>>>> +       case VIRTIO_NET_HASH_REPORT_TCPv4:
> >>>>>>>> +               virtio_net_toeplitz_calc(&toeplitz_state,
> >>>>>>>> +                                        (__be32 *)&flow.addrs.v4addrs,
> >>>>>>>> +                                        sizeof(flow.addrs.v4addrs));
> >>>>>>>> +               virtio_net_toeplitz_calc(&toeplitz_state, &flow.ports.ports,
> >>>>>>>> +                                        sizeof(flow.ports.ports));
> >>>>>>>> +               break;
> >>>>>>>> +
> >>>>>>>> +       case VIRTIO_NET_HASH_REPORT_UDPv4:
> >>>>>>>> +               virtio_net_toeplitz_calc(&toeplitz_state,
> >>>>>>>> +                                        (__be32 *)&flow.addrs.v4addrs,
> >>>>>>>> +                                        sizeof(flow.addrs.v4addrs));
> >>>>>>>> +               virtio_net_toeplitz_calc(&toeplitz_state, &flow.ports.ports,
> >>>>>>>> +                                        sizeof(flow.ports.ports));
> >>>>>>>> +               break;
> >>>>>>>> +
> >>>>>>>> +       case VIRTIO_NET_HASH_REPORT_IPv6:
> >>>>>>>> +               virtio_net_toeplitz_calc(&toeplitz_state,
> >>>>>>>> +                                        (__be32 *)&flow.addrs.v6addrs,
> >>>>>>>> +                                        sizeof(flow.addrs.v6addrs));
> >>>>>>>> +               break;
> >>>>>>>> +
> >>>>>>>> +       case VIRTIO_NET_HASH_REPORT_TCPv6:
> >>>>>>>> +               virtio_net_toeplitz_calc(&toeplitz_state,
> >>>>>>>> +                                        (__be32 *)&flow.addrs.v6addrs,
> >>>>>>>> +                                        sizeof(flow.addrs.v6addrs));
> >>>>>>>> +               virtio_net_toeplitz_calc(&toeplitz_state, &flow.ports.ports,
> >>>>>>>> +                                        sizeof(flow.ports.ports));
> >>>>>>>> +               break;
> >>>>>>>> +
> >>>>>>>> +       case VIRTIO_NET_HASH_REPORT_UDPv6:
> >>>>>>>> +               virtio_net_toeplitz_calc(&toeplitz_state,
> >>>>>>>> +                                        (__be32 *)&flow.addrs.v6addrs,
> >>>>>>>> +                                        sizeof(flow.addrs.v6addrs));
> >>>>>>>> +               virtio_net_toeplitz_calc(&toeplitz_state, &flow.ports.ports,
> >>>>>>>> +                                        sizeof(flow.ports.ports));
> >>>>>>>> +               break;
> >>>>>>>> +
> >>>>>>>> +       default:
> >>>>>>>> +               hash->report = VIRTIO_NET_HASH_REPORT_NONE;
> >>>>>>>> +               return;
> >>>>>>>
> >>>>>>> So I still think we need a comment here to explain why this is not an
> >>>>>>> issue if the device can report HASH_XXX_EX. Or we need to add the
> >>>>>>> support, since this is the code from the driver side, I don't think we
> >>>>>>> need to worry about the device implementation issues.
> >>>>>>
> >>>>>> This is on the device side, and doesn't report HASH_TYPE_XXX_EX.
> >>>>>>
> >>>>>>>
> >>>>>>> For the issue of the number of options, does the spec forbid fallback
> >>>>>>> to VIRTIO_NET_HASH_REPORT_NONE? If not, we can do that.
> >>>>>>
> >>>>>> 5.1.6.4.3.4 "IPv6 packets with extension header" says:
> >>>>>>     > If VIRTIO_NET_HASH_TYPE_TCP_EX is set and the packet has a TCPv6
> >>>>>>     > header, the hash is calculated over the following fields:
> >>>>>>     > - Home address from the home address option in the IPv6 destination
> >>>>>>     >   options header. If the extension header is not present, use the
> >>>>>>     >   Source IPv6 address.
> >>>>>>     > - IPv6 address that is contained in the Routing-Header-Type-2 from the
> >>>>>>     >   associated extension header. If the extension header is not present,
> >>>>>>     >   use the Destination IPv6 address.
> >>>>>>     > - Source TCP port
> >>>>>>     > - Destination TCP port
> >>>>>>
> >>>>>> Therefore, if VIRTIO_NET_HASH_TYPE_TCP_EX is set, the packet has a TCPv6
> >>>>>> header, and a home address option in the IPv6 destination options header
> >>>>>> is present, the hash is calculated over the home address. If the hash is
> >>>>>> not calculated over the home address in such a case, the device is
> >>>>>> contradicting this section and violating the spec. The same goes
> >>>>>> for the other HASH_TYPE_XXX_EX types and Routing-Header-Type-2.
> >>>>>
> >>>>> Just to make sure we are on the same page. I meant:
> >>>>>
> >>>>> 1) If the hash is not calculated over the home address (in the case of
> >>>>> IPv6 destination options), it can still report
> >>>>> VIRTIO_NET_RSS_HASH_TYPE_IPv6. This is what you implemented in your
> >>>>> series. So the device can simply fallback to e.g TCPv6 if it can't
> >>>>> understand all or part of the IPv6 options.
> >>>>
> >>>> The spec says it can fallback if "the extension header is not present",
> >>>> not if the device can't understand the extension header.
> >>>
> >>> I don't think so,
> >>>
> >>> 1) spec had a condition beforehand:
> >>>
> >>> """
> >>> If VIRTIO_NET_HASH_TYPE_TCP_EX is set and the packet has a TCPv6
> >>> header, the hash is calculated over the following fields:
> >>> ...
> >>> If the extension header is not present ...
> >>> """
> >>>
> >>> So the device can choose not to set VIRTIO_NET_HASH_TYPE_TCP_EX as
> >>> spec doesn't say device MUST set VIRTIO_NET_HASH_TYPE_TCP_EX if ...
> >>>
> >>> 2) implementation wise, since device has limited resources, we can't
> >>> expect the device can parse arbitrary number of ipv6 options
> >>>
> >>> 3) if 1) and 2) are not the case, we need to fix the spec; otherwise
> >>> implementing a spec-compliant device is impractical
> >>
> >> The statement is preceded by the following:
> >>   >  The device calculates the hash on IPv4 packets according to
> >>   > ’Enabled hash types’ bitmask as follows:
> >>
> >> The 'Enabled hash types' bitmask is specified by the device.
> >>
> >> I think the spec needs amendment.
> >
> > Michael, can you help to clarify here?
> >
> >>
> >> I wonder if there are any people interested in the feature though.
> >> Looking at virtnet_set_hashflow() in drivers/net/virtio_net.c, the
> >> driver of Linux does not let users configure HASH_TYPE_XXX_EX. I suppose
> >> Windows supports HASH_TYPE_XXX_EX, but those who care about network
> >> performance most would use Linux, so HASH_TYPE_XXX_EX support without
> >> Linux driver's support may not be useful.
> >
> > It might be still interesting for example for the hardware virtio
> > vendors to support windows etc.
>
> I don't know if Windows needs them for e.g., device/driver certification
> so surveying Windows makes sense.

Yuri, can you help to clarify this?

>
> >
> >>
> >>>
> >>>>
> >>>>> 2) the VIRTIO_NET_SUPPORTED_HASH_TYPES is not checked against the
> >>>>> tun_vnet_ioctl_sethash(), so userspace may set
> >>>>> VIRTIO_NET_HASH_TYPE_TCP_EX regardless of what has been returned by
> >>>>> tun_vnet_ioctl_gethashtypes(). In this case they won't get
> >>>>> VIRTIO_NET_HASH_TYPE_TCP_EX.
> >>>>
> >>>> That's right. It's the responsibility of the userspace to set only the
> >>>> supported hash types.
> >>>
> >>> Well, the kernel should filter out the unsupported one to have a
> >>> robust uAPI. Otherwise, we give green light to the buggy userspace
> >>> which will have unexpected results.
> >>
> >> My reasoning was that it may be fine for some use cases other than VM
> >> (e.g., DPDK); in such a use case, it is fine as long as the UAPI works
> >> on a best-effort basis.
> >
> > Best-effort might increase the chance of user-visible changes after
> > migration.
>
> It is a trade-off between catching a migration bug for VMM and making
> life a bit easier for userspace programs other than VMM.

My understanding is to avoid breaking the migration compatibility as
much as possible as fixing that would be complicated or even
impossible.

>
> >
> >>
> >> For example, suppose a userspace program that processes TCP packets; the
> >> program can enable: HASH_TYPE_IPv4, HASH_TYPE_TCPv4, HASH_TYPE_IPv6, and
> >> HASH_TYPE_TCPv6. Ideally, the kernel should support all the hash types,
> >> but, even if e.g., HASH_TYPE_TCPv6 is not available,
> >
> > For "available" did you mean it is not supported by the device?
> >
> >> it will fall back
> >> to HASH_TYPE_IPv6, which still does something good and may be acceptable.
> >
> > This fallback is exactly the same as I said above, let
> > VIRTIO_NET_HASH_TYPE_TCP_EX to fallback.
> >
> > My point is that, the implementation should either:
> >
> > 1) allow fallback so it can claim to support all hash types
> >
> > or
> >
> > 2) don't allow fallback so it can only support a part of the hash types
> >
> > If we're doing something in the middle, for example, allow part of the
> > type to fallback.
>
> 1) or the middle will make it unsuitable for VM because it violates the
> virtio spec. 2) makes sense though the trade-off I mentioned should be
> taken into consideration.
>
> >
> >>
> >> That said, for a use case that involves VM and implements virtio-net
> >> (e.g., QEMU), setting an unsupported hash type here is definitely a bug.
> >> Catching the bug may outweigh the extra trouble for other use cases.
> >>
> >>>
> >>>>
> >>>>> 3) implementing part of the hash types might complicate the migration
> >>>>> or at least we need to describe the expectations of libvirt or other
> >>>>> management in this case. For example, do we plan to have a dedicated
> >>>>> Qemu command line like:
> >>>>>
> >>>>> -device virtio-net-pci,hash_report=on,supported_hash_types=X,Y,Z?
> >>>>
> >>>> I posted a patch series to implement such a command line for vDPA[1].
> >>>> The patch series that wires this tuntap feature up[2] reuses the
> >>>> infrastructure so it doesn't bring additional complexity.
> >>>>
> >>>> [1]
> >>>> https://lore.kernel.org/qemu-devel/20250530-vdpa-v1-0-5af4109b1c19@xxxxxxxxxx/
> >>>> [2]
> >>>> https://lore.kernel.org/qemu-devel/20250530-hash-v5-0-343d7d7a8200@xxxxxxxxxx/
> >>>
> >>> I meant, if we implement a full hash report feature, it means a single
> >>> hash cmdline option is more than sufficient and so compatibility code
> >>> can just turn it off when dealing with machine types. This is much
> >>> simpler than
> >>>
> >>> 1) having both hash as well as supported_hash_features
> >>> 2) dealing both hash as well as supported_hash_features in compatibility codes
> >>> 3) libvirt will be happy
> >>>
> >>> For [1], it seems it introduces a per-hash-type option; this seems to
> >>> be a burden to the management layer, as it needs to learn a new option
> >>> every time a new hash type is supported
> >>
> >> Even with the command line you proposed (supported_hash_types=X,Y,Z), it
> >> is still necessary to know the values the supported_hash_types property
> >> accepts (X,Y,Z), so I don't think it makes a difference.
> >
> > It could be a uint32_t.
>
> The management layer will need to know what bits are accepted even with
> uint32_t.

That eases the management; basically, it would be used for debugging or
machine types only.

>
> >
> >>
> >> The burden to the management layer is already present for features, so
> >> it is an existing problem (or its mere extension).
> >
> > Yes, but since this feature is new it's better to try our best to avoid that.
> >
> >>
> >> This problem was discussed in the following thread in the past, but no
> >> solution is implemented yet, and probably solving it will be difficult.
> >> https://lore.kernel.org/qemu-devel/20230731223148.1002258-5-yuri.benditovich@xxxxxxxxxx/
> >
> > It's a similar issue but not the same, it looks more like a discussion
> > on whether the fallback from vhost-net to qemu works for missing
> > features etc.
>
> Perhaps we may be able to do better since this feature is new as you say
> and we don't have to worry much about breaking change. I don't have an
> idea for that yet.

Right.

>
> Regards,
> Akihiko Odaki
>

Thanks