On Fri, 15 Aug 2025 16:36:57 +0200
Florian Westphal <fw@xxxxxxxxx> wrote:

> Split the main avx2 lookup function into a helper.
>
> This is a preparation patch: followup change will use the new helper
> from the insertion path if possible. This greatly improves insertion
> performance when avx2 is supported.

Ah, nice. This was on my to-do list at some point, but then I thought
insertion time didn't matter as much; the numbers from 2/2 now clearly
disprove that.

One actual concern and a few nits:

> Signed-off-by: Florian Westphal <fw@xxxxxxxxx>
> ---
>  net/netfilter/nft_set_pipapo_avx2.c | 127 +++++++++++++++++-----------
>  1 file changed, 76 insertions(+), 51 deletions(-)
>
> diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c
> index 2f090e253caf..d512877cc211 100644
> --- a/net/netfilter/nft_set_pipapo_avx2.c
> +++ b/net/netfilter/nft_set_pipapo_avx2.c
> @@ -1133,58 +1133,35 @@ static inline void pipapo_resmap_init_avx2(const struct nft_pipapo_match *m, uns
>  }
>
>  /**
> - * nft_pipapo_avx2_lookup() - Lookup function for AVX2 implementation
> - * @net:	Network namespace
> - * @set:	nftables API set representation
> - * @key:	nftables API element representation containing key data
> + * pipapo_get_avx2() - Lookup function for AVX2 implementation
> + * @m:		storage containing the set elements
> + * @data:	Key data to be matched against existing elements
> + * @genmask:	If set, check that element is active in given genmask
> + * @tstamp:	timestamp to check for expired elements

Nits: Storage, Timestamp (or all lowercase, for consistency with the
other ones).

>   *
>   * For more details, see DOC: Theory of Operation in nft_set_pipapo.c.
>   *
>   * This implementation exploits the repetitive characteristic of the algorithm
>   * to provide a fast, vectorised version using the AVX2 SIMD instruction set.
>   *
> - * Return: true on match, false otherwise.
> + * The caller MUST check that the fpu is usable.

FPU

> + * This function must be called with BH disabled.
> + *
> + * Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise.
>   */
> -const struct nft_set_ext *
> -nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
> -		       const u32 *key)
> +static struct nft_pipapo_elem *pipapo_get_avx2(const struct nft_pipapo_match *m,
> +					       const u8 *data, u8 genmask,
> +					       u64 tstamp)
>  {
> -	struct nft_pipapo *priv = nft_set_priv(set);
> -	const struct nft_set_ext *ext = NULL;
>  	struct nft_pipapo_scratch *scratch;
> -	u8 genmask = nft_genmask_cur(net);
> -	const struct nft_pipapo_match *m;
>  	const struct nft_pipapo_field *f;
> -	const u8 *rp = (const u8 *)key;
>  	unsigned long *res, *fill;
>  	bool map_index;
>  	int i;
>
> -	local_bh_disable();
> -
> -	if (unlikely(!irq_fpu_usable())) {
> -		ext = nft_pipapo_lookup(net, set, key);
> -
> -		local_bh_enable();
> -		return ext;
> -	}
> -
> -	m = rcu_dereference(priv->match);
> -
> -	/* This also protects access to all data related to scratch maps.
> -	 *
> -	 * Note that we don't need a valid MXCSR state for any of the
> -	 * operations we use here, so pass 0 as mask and spare a LDMXCSR
> -	 * instruction.
> -	 */
> -	kernel_fpu_begin_mask(0);
> -
>  	scratch = *raw_cpu_ptr(m->scratch);
> -	if (unlikely(!scratch)) {
> -		kernel_fpu_end();
> -		local_bh_enable();
> +	if (unlikely(!scratch))
>  		return NULL;
> -	}
>
>  	map_index = scratch->map_index;
>
> @@ -1202,7 +1179,7 @@ nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
>
>  #define NFT_SET_PIPAPO_AVX2_LOOKUP(b, n)				\
>  		(ret = nft_pipapo_avx2_lookup_##b##b_##n(res, fill, f,	\
> -							 ret, rp,	\
> +							 ret, data,	\
>  							 first, last))
>
>  		if (likely(f->bb == 8)) {
> @@ -1218,7 +1195,7 @@ nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
>  			NFT_SET_PIPAPO_AVX2_LOOKUP(8, 16);
>  		} else {
>  			ret = nft_pipapo_avx2_lookup_slow(m, res, fill, f,
> -							  ret, rp,
> +							  ret, data,
>  							  first, last);
>  		}
>  	} else {
> @@ -1234,7 +1211,7 @@ nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
>  			NFT_SET_PIPAPO_AVX2_LOOKUP(4, 32);
>  		} else {
>  			ret = nft_pipapo_avx2_lookup_slow(m, res, fill, f,
> -							  ret, rp,
> +							  ret, data,
>  							  first, last);
>  		}
>  	}
> @@ -1242,29 +1219,77 @@ nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
>
>  #undef NFT_SET_PIPAPO_AVX2_LOOKUP
>
> -		if (ret < 0)
> -			goto out;
> +		if (ret < 0) {
> +			scratch->map_index = map_index;
> +			return NULL;
> +		}
>
>  		if (last) {
> -			const struct nft_set_ext *e = &f->mt[ret].e->ext;
> +			struct nft_pipapo_elem *e;
>
> -			if (unlikely(nft_set_elem_expired(e) ||
> -				     !nft_set_elem_active(e, genmask)))
> +			e = f->mt[ret].e;
> +			if (unlikely(__nft_set_elem_expired(&e->ext, tstamp) ||

Here's the actual concern, even if I haven't tested this: I guess you
now pass the timestamp to this function, instead of getting it with
each nft_set_elem_expired() call, either for correctness (it should be
done at the beginning of the insertion?) or as an optimisation (if
BITS_PER_LONG < 64, the overhead isn't necessarily trivial).

As far as machines supporting AVX2 are concerned, we can assume in
general that BITS_PER_LONG >= 64 (even though I'm not sure about x32).
But with 2/2, you need to call get_jiffies_64() as a result, from
non-AVX2 code, even for sets without a timeout (without the
NFT_SET_EXT_TIMEOUT extension). Does that risk causing a regression on
non-AVX2 machines?

If it's for correctness, I think we shouldn't care, but if it's done as
an optimisation, perhaps it's not a universal one.
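If it's the latter, this is the kind of thing I would have expected at
the new call site instead (untested sketch, variable names borrowed
from the lookup frontend below, whatever the insertion path in 2/2
actually looks like): skip the clock read whenever the set can't
contain expiring elements at all, given that __nft_set_elem_expired()
checks for the timeout extension before ever comparing the timestamp:

	/* Untested: only sets created with the NFT_SET_TIMEOUT flag can
	 * hold elements carrying the NFT_SET_EXT_TIMEOUT extension, so
	 * for any other set the timestamp is never compared and 0 works
	 * as a placeholder.
	 */
	u64 tstamp = 0;

	if (set->flags & NFT_SET_TIMEOUT)
		tstamp = get_jiffies_64();

	e = pipapo_get_avx2(m, rp, genmask, tstamp);

That would also spare the seqcount read loop that get_jiffies_64()
implies when BITS_PER_LONG < 64. Then again, maybe it's not worth the
extra branch, I haven't measured anything.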
> +				     !nft_set_elem_active(&e->ext, genmask)))
>  				goto next_match;
>
> -			ext = e;
> -			goto out;
> +			scratch->map_index = map_index;
> +			return e;
>  		}
>
> +		map_index = !map_index;
>  		swap(res, fill);
> -		rp += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
> +		data += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
> +	}
> +
> +	return NULL;
> +}
> +
> +/**
> + * nft_pipapo_avx2_lookup() - Dataplane frontend for AVX2 implementation
> + * @net:	Network namespace
> + * @set:	nftables API set representation
> + * @key:	nftables API element representation containing key data
> + *
> + * This function is called from the data path. It will search for
> + * an element matching the given key in the current active copy using
> + * the avx2 routines if the fpu is usable or fall back to the generic

AVX2, FPU

> + * implementation of the algorithm otherwise.
> + *
> + * Return: ntables API extension pointer or NULL if no match.

nftables

> + */
> +const struct nft_set_ext *
> +nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
> +		       const u32 *key)
> +{
> +	struct nft_pipapo *priv = nft_set_priv(set);
> +	u8 genmask = nft_genmask_cur(net);
> +	const struct nft_pipapo_match *m;
> +	const u8 *rp = (const u8 *)key;
> +	const struct nft_pipapo_elem *e;
> +
> +	local_bh_disable();
> +
> +	if (unlikely(!irq_fpu_usable())) {
> +		const struct nft_set_ext *ext;
> +
> +		ext = nft_pipapo_lookup(net, set, key);
> +
> +		local_bh_enable();
> +		return ext;
>  	}
>
> -out:
> -	if (i % 2)
> -		scratch->map_index = !map_index;

I originally did it like this to avoid an assignment in the fast path,
but the amount of time I just spent figuring out why I did it shows
that perhaps setting map_index = !map_index on every iteration, as you
do now, is worth it.
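For the record, and mostly to spare the next reader the same
archaeology, the invariant the old exit path relied on, if I
reconstruct my own reasoning correctly: res and fill get swapped once
per processed field, so on exit the scratch map actually in use is
map_index for an even number of processed fields, !map_index for an odd
one. Hence the single conditional store, which a comment would have
made obvious:

out:
	/* res and fill were swapped i times by now: the roles of the
	 * two scratch maps are inverted if and only if i is odd, so
	 * only then does the stored index need to change.
	 */
	if (i % 2)
		scratch->map_index = !map_index;

One flip of a local bool per iteration, as in your version, is a single
register operation, so it's probably the better trade-off anyway.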
> +	m = rcu_dereference(priv->match);
> +
> +	/* This also protects access to all data related to scratch maps.
> +	 *
> +	 * Note that we don't need a valid MXCSR state for any of the
> +	 * operations we use here, so pass 0 as mask and spare a LDMXCSR
> +	 * instruction.
> +	 */
> +	kernel_fpu_begin_mask(0);

I would leave an empty line here so that it's clear what "This" in the
comment refers to (it always sounds a bit confusing to me that
kernel_fpu_begin_mask() actually does that, hence the comment).

> +	e = pipapo_get_avx2(m, rp, genmask, get_jiffies_64());
>  	kernel_fpu_end();
>  	local_bh_enable();
>
> -	return ext;
> +	return e ? &e->ext : NULL;
>  }

-- 
Stefano