It took me a bit to decipher some parts, but it looks functionally good to me. A few comments below. I wonder, by the way, if 1/3 and 2/3 shouldn't be applied in the meantime (perhaps that was the reason for moving this to the end...?). On Mon, 7 Apr 2025 19:40:20 +0200 Florian Westphal <fw@xxxxxxxxx> wrote: > Add ymm register tracking for avx2 helpers. > > A register can have following states: > - mem (contains packet data) > - and (was consumed, value folded into other register) > - tmp (holds result of folding operation) > > mem and tmp are mutually exclusive. > > Warn if > a) register store happens while register has 'mem' bit set > but 'and' unset. > This detects clobbering of a register with new > packet data before the previous load has been processed. > b) register is read but wasn't written to before > This detects operations happening on undefined register > content, such as AND or GOTOs. > c) register is saved to memory, but it doesn't hold result > of an AND operation. > This detects erroneous stores to the memory scratch map. > d) register is used for goto, but it doesn't contain result > of earlier AND operation. > e) There is an unprocessed register left when the function > returns (only mem bit is set). I think d) and e) are pretty valuable, indeed. > Also print a notice when we have a never-used-register, > it would hint at some way to optimize code. > For those helpers that don't process enough data fields > to fill all, this error is suppressed -- they pass the > highest inuse register. > > There is one exception for c) in the code (where we only > have one byte to process. The helper > nft_pipapo_avx2_force_tmp() is used for this to forcibly > convert the state from 'mem' to 'tmp'. > > This is disabled for NET_DEBUG=n builds. 
> > v3: add additional 'done' test to check all registers > were handled > fix macro space/tab indent in a few places (Stefano) > fix yym typos (Stefano) > make this change last in the series > > v2: Improve kdoc (Stefano Brivio) > Use u16, not long (Stefano Brivio) > Reduce macro usage in favor of inline helpers > warn if we store register to memory but its not holding > result of AND operation > > Signed-off-by: Florian Westphal <fw@xxxxxxxxx> > --- > net/netfilter/nft_set_pipapo_avx2.c | 300 +++++++++++++++++++++++++--- > 1 file changed, 269 insertions(+), 31 deletions(-) > > diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c > index c15db28c5ebc..d2e321cc870f 100644 > --- a/net/netfilter/nft_set_pipapo_avx2.c > +++ b/net/netfilter/nft_set_pipapo_avx2.c > @@ -26,7 +26,209 @@ > > #define NFT_PIPAPO_LONGS_PER_M256 (XSAVE_YMM_SIZE / BITS_PER_LONG) > > -/* Load from memory into YMM register with non-temporal hint ("stream load"), > +/** > + * struct nft_pipapo_debug_regmap - Bitmaps representing sets of YMM registers > + * > + * @mem: n-th bit set if YMM<n> contains packet data loaded from memory > + * @and: n-th bit set if YMM<n> was folded (AND operation done) > + * @tmp: n-th bit set if YMM<n> contains folded data (result of AND operation) > + */ > +struct nft_pipapo_debug_regmap { > +#ifdef CONFIG_DEBUG_NET > + u16 mem; > + u16 and; > + u16 tmp; > +#endif > +}; > + > +#ifdef CONFIG_DEBUG_NET > +/* ymm15 is used as an always-0-register, see nft_pipapo_avx2_prepare */ > +#define NFT_PIPAPO_AVX2_DEBUG_MAP \ > + struct nft_pipapo_debug_regmap __pipapo_debug_regmap = { \ > + .tmp = BIT(15), \ > + } > + > +#define NFT_PIPAPO_WARN(cond, reg, rmap, line, message) ({ \ > + const struct nft_pipapo_debug_regmap *rm__ = (rmap); \ > + DEBUG_NET_WARN_ONCE((cond), "reg %d line %u %s, mem %04x, and %04x tmp %04x",\ > + (reg), (line), (message), rm__->mem, rm__->and, rm__->tmp); \ > +}) > +#else /* !CONFIG_DEBUG_NET */ > +#define 
NFT_PIPAPO_AVX2_DEBUG_MAP \ > + struct nft_pipapo_debug_regmap __pipapo_debug_regmap > +#endif > + > +/** > + * nft_pipapo_avx2_load_packet() - Check and record packet data store > + * > + * @reg: Index of register being written to Nit: kerneldoc style usually aligns argument descriptions, say: * @reg: ... * @r: ... > + * @r: Current bitmap of registers for debugging purposes > + * @line: __LINE__ number filled via AVX2 macro > + * > + * Mark reg as holding packet data. > + * Check reg is unused or had an AND operation performed on it. ...and those should be @reg. > + */ > +static inline void nft_pipapo_avx2_load_packet(unsigned int reg, > + struct nft_pipapo_debug_regmap *r, > + unsigned int line) > +{ > +#ifdef CONFIG_DEBUG_NET > + bool used = BIT(reg) & (r->mem | r->tmp); > + bool anded = BIT(reg) & r->and; > + > + r->and &= ~BIT(reg); > + r->tmp &= ~BIT(reg); > + r->mem |= BIT(reg); > + > + if (used) > + NFT_PIPAPO_WARN(!anded, reg, r, line, "busy"); > +#endif > +} > + > +/** > + * nft_pipapo_avx2_force_tmp() - Mark @reg as holding result of AND operation > + * @reg: Index of register > + * @r: Current bitmap of registers for debugging purposes > + * > + * Mark reg as holding temporary data, no checks are performed. > + */ > +static inline void nft_pipapo_avx2_force_tmp(unsigned int reg, > + struct nft_pipapo_debug_regmap *r) > +{ > +#ifdef CONFIG_DEBUG_NET > + r->tmp |= BIT(reg); > + r->mem &= ~BIT(reg); > +#endif > +} > + > +/** > + * nft_pipapo_avx2_load_tmp() - Check and record scratchmap restore I'm not sure if we should call this "scratchmap restore", it's more like a generic load to a register (from a bucket, perhaps no need to specify this). > + * > + * @reg: Index of register being written to > + * @r: Current bitmap of registers for debugging purposes > + * @line: __LINE__ number filled via AVX2 macro > + * > + * Mark reg as holding temporary data. > + * Check reg is unused or had an AND operation performed on it. Same as above (@reg). 
> + */ > +static inline void nft_pipapo_avx2_load_tmp(unsigned int reg, > + struct nft_pipapo_debug_regmap *r, > + unsigned int line) > +{ > +#ifdef CONFIG_DEBUG_NET > + bool used = BIT(reg) & (r->mem | r->tmp); > + bool anded = BIT(reg) & r->and; > + > + r->and &= ~BIT(reg); > + > + nft_pipapo_avx2_force_tmp(reg, r); I wonder if it wouldn't be more obvious to open code the bitmap setting/clearing here, because nft_pipapo_avx2_force_tmp() has a different purpose in general. > + > + if (used) > + NFT_PIPAPO_WARN(!anded, reg, r, line, "busy"); > +#endif > +} > + > +/** > + * nft_pipapo_avx2_debug_and() - Mark registers as being ANDed > + * > + * @a: Index of register being written to > + * @b: Index of first register being ANDed > + * @c: Index of second register being ANDed > + * @r: Current bitmap of registers for debugging purposes > + * @line: __LINE__ number filled via AVX2 macro > + * > + * Tags @reg2 and @reg3 as ANDed register registers > + * Tags @reg1 as containing AND result Those are @a, @b, @c now (sorry, I missed this during review of v2). By the way, just as a reminder, NFT_PIPAPO_AVX2_AND() uses @dst, @a, @b, but maybe you have a good reason to deviate from that. 
> + */ > +static inline void nft_pipapo_avx2_debug_and(unsigned int a, unsigned int b, > + unsigned int c, > + struct nft_pipapo_debug_regmap *r, > + unsigned int line) > +{ > +#ifdef CONFIG_DEBUG_NET > + bool b_and = BIT(b) & r->and; > + bool c_and = BIT(c) & r->and; > + bool b_tmp = BIT(b) & r->tmp; > + bool c_tmp = BIT(c) & r->tmp; > + bool b_mem = BIT(b) & r->mem; > + bool c_mem = BIT(c) & r->mem; > + > + r->and |= BIT(b); > + r->and |= BIT(c); > + > + nft_pipapo_avx2_force_tmp(a, r); > + > + NFT_PIPAPO_WARN((!b_mem && !b_and && !b_tmp), b, r, line, "unused"); > + NFT_PIPAPO_WARN((!c_mem && !c_and && !c_tmp), c, r, line, "unused"); > +#endif > +} > + > +/** > + * nft_pipapo_avx2_reg_tmp() - Check that @reg holds result of AND operation > + * @reg: Index of register > + * @r: Current bitmap of registers for debugging purposes > + * @line: __LINE__ number filled via AVX2 macro > + */ > +static inline void nft_pipapo_avx2_reg_tmp(unsigned int reg, > + const struct nft_pipapo_debug_regmap *r, > + unsigned int line) > +{ > +#ifdef CONFIG_DEBUG_NET > + bool holds_and_result = BIT(reg) & r->tmp; > + > + NFT_PIPAPO_WARN(!holds_and_result, reg, r, line, "unused"); > +#endif > +} > + > +/** > + * nft_pipapo_avx2_debug_map_done() - Check all registers were used > + * @ret: Return value > + * @r: Current bitmap of registers for debugging purposes > + * @reg_hi: The highest ymm register used (0: all are used) > + * @line: __LINE__ number filled via AVX2 macro > + * > + * Raises a warning if a register hasn't been processed (AND'ed). > + * Prints a notice if it finds an unused register, this hints at > + * possible optimization. 
> + */ > +static inline int nft_pipapo_avx2_debug_map_done(int ret, struct nft_pipapo_debug_regmap *r, > + unsigned int reg_hi, > + unsigned int line) > +{ > +#ifdef CONFIG_DEBUG_NET > + static const unsigned int ymm_regs = 16; > + u16 reg_bad, reg_ok; > + unsigned int i; > + > + reg_ok = r->and | r->tmp; > + reg_bad = r->mem; > + > + reg_hi = reg_hi > 0 ? reg_hi + 1 : ymm_regs; > + > + for (i = 0; i < reg_hi; i++) { > + if (BIT(i) & reg_ok) > + continue; > + > + if (BIT(i) & reg_bad) > + NFT_PIPAPO_WARN(1, i, r, line, "unprocessed"); > + else > + pr_info_once("%s: at %u: reg %u unused\n", __func__, line, i); > + } > + > + r->mem = 0; > + r->and = 0; > + r->tmp = 0; > +#endif > + > + return ret; > +} > + > +#define NFT_PIPAPO_AVX2_DEBUG_MAP_DONE(ret) \ > + nft_pipapo_avx2_debug_map_done((ret), &__pipapo_debug_regmap, 0, __LINE__) > +#define NFT_PIPAPO_AVX2_DEBUG_MAP_DONE2(ret, hr) \ > + nft_pipapo_avx2_debug_map_done((ret), &__pipapo_debug_regmap, (hr), __LINE__) Should 'hr' be 'reg_hi' as it is for nft_pipapo_avx2_debug_map_done(), or 'rmax' or 'max_reg' or something like that? Otherwise it's a bit difficult (for me at least) to understand how this macro should be used (without following the whole path). Alternatively, a comment could also fix that I guess. > + > +/* Load from memory into ymm register with non-temporal hint ("stream load"), > * that is, don't fetch lines from memory into the cache. 
This avoids pushing > * precious packet data out of the cache hierarchy, and is appropriate when: > * > @@ -36,38 +238,59 @@ > * - loading the result bitmap from the previous field, as it's never used > * again > */ > -#define NFT_PIPAPO_AVX2_LOAD(reg, loc) \ > +#define __NFT_PIPAPO_AVX2_LOAD(reg, loc) \ > asm volatile("vmovntdqa %0, %%ymm" #reg : : "m" (loc)) > > -/* Stream a single lookup table bucket into YMM register given lookup table, > +#define NFT_PIPAPO_AVX2_LOAD(reg, loc) do { \ > + nft_pipapo_avx2_load_tmp(reg, \ > + &__pipapo_debug_regmap, __LINE__); \ > + __NFT_PIPAPO_AVX2_LOAD(reg, loc); \ > +} while (0) > + > +/* Stream a single lookup table bucket into ymm register given lookup table, > * group index, value of packet bits, bucket size. > */ > -#define NFT_PIPAPO_AVX2_BUCKET_LOAD4(reg, lt, group, v, bsize) \ > - NFT_PIPAPO_AVX2_LOAD(reg, \ > - lt[((group) * NFT_PIPAPO_BUCKETS(4) + \ > - (v)) * (bsize)]) > -#define NFT_PIPAPO_AVX2_BUCKET_LOAD8(reg, lt, group, v, bsize) \ > - NFT_PIPAPO_AVX2_LOAD(reg, \ > - lt[((group) * NFT_PIPAPO_BUCKETS(8) + \ > - (v)) * (bsize)]) > +#define NFT_PIPAPO_AVX2_BUCKET_LOAD4(reg, lt, group, v, bsize) do { \ > + nft_pipapo_avx2_load_packet(reg, \ > + &__pipapo_debug_regmap, __LINE__); \ > + __NFT_PIPAPO_AVX2_LOAD(reg, \ > + lt[((group) * NFT_PIPAPO_BUCKETS(4) + \ > + (v)) * (bsize)]); \ > +} while (0) > + > +#define NFT_PIPAPO_AVX2_BUCKET_LOAD8(reg, lt, group, v, bsize) do { \ > + nft_pipapo_avx2_load_packet(reg, \ > + &__pipapo_debug_regmap, __LINE__); \ > + __NFT_PIPAPO_AVX2_LOAD(reg, \ > + lt[((group) * NFT_PIPAPO_BUCKETS(8) + \ > + (v)) * (bsize)]); \ > +} while (0) > > /* Bitwise AND: the staple operation of this algorithm */ > -#define NFT_PIPAPO_AVX2_AND(dst, a, b) \ > - asm volatile("vpand %ymm" #a ", %ymm" #b ", %ymm" #dst) > +#define NFT_PIPAPO_AVX2_AND(dst, a, b) do { \ > + BUILD_BUG_ON((a) == (b)); \ > + asm volatile("vpand %ymm" #a ", %ymm" #b ", %ymm" #dst); \ > + nft_pipapo_avx2_debug_and(dst, a, b, \ > + 
&__pipapo_debug_regmap, __LINE__); \ > +} while (0) > > /* Jump to label if @reg is zero */ > -#define NFT_PIPAPO_AVX2_NOMATCH_GOTO(reg, label) \ > - asm goto("vptest %%ymm" #reg ", %%ymm" #reg ";" \ > - "je %l[" #label "]" : : : : label) > +#define NFT_PIPAPO_AVX2_NOMATCH_GOTO(reg, label) do { \ > + nft_pipapo_avx2_reg_tmp(reg, &__pipapo_debug_regmap, __LINE__); \ > + asm goto("vptest %%ymm" #reg ", %%ymm" #reg ";" \ > + "je %l[" #label "]" : : : : label); \ > +} while (0) > > -/* Store 256 bits from YMM register into memory. Contrary to bucket load > +/* Store 256 bits from ymm register into memory. Contrary to bucket load > * operation, we don't bypass the cache here, as stored matching results > * are always used shortly after. > */ > -#define NFT_PIPAPO_AVX2_STORE(loc, reg) \ > - asm volatile("vmovdqa %%ymm" #reg ", %0" : "=m" (loc)) > +#define NFT_PIPAPO_AVX2_STORE(loc, reg) do { \ > + nft_pipapo_avx2_reg_tmp(reg, &__pipapo_debug_regmap, __LINE__); \ > + asm volatile("vmovdqa %%ymm" #reg ", %0" : "=m" (loc)); \ > +} while (0) > > -/* Zero out a complete YMM register, @reg */ > +/* Zero out a complete ymm register, @reg */ > #define NFT_PIPAPO_AVX2_ZERO(reg) \ > asm volatile("vpxor %ymm" #reg ", %ymm" #reg ", %ymm" #reg) > > @@ -219,6 +442,7 @@ static int nft_pipapo_avx2_lookup_4b_2(unsigned long *map, unsigned long *fill, > int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; > u8 pg[2] = { pkt[0] >> 4, pkt[0] & 0xf }; > unsigned long *lt = f->lt, bsize = f->bsize; > + NFT_PIPAPO_AVX2_DEBUG_MAP; > > lt += offset * NFT_PIPAPO_LONGS_PER_M256; > for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { > @@ -242,7 +466,7 @@ static int nft_pipapo_avx2_lookup_4b_2(unsigned long *map, unsigned long *fill, > > b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); > if (last) > - return b; > + return NFT_PIPAPO_AVX2_DEBUG_MAP_DONE2(b, 4); > > if (unlikely(ret == -1)) > ret = b / XSAVE_YMM_SIZE; > @@ -254,7 +478,7 @@ static 
int nft_pipapo_avx2_lookup_4b_2(unsigned long *map, unsigned long *fill, > ; > } > > - return ret; > + return NFT_PIPAPO_AVX2_DEBUG_MAP_DONE2(ret, 4); > } > > /** > @@ -282,6 +506,7 @@ static int nft_pipapo_avx2_lookup_4b_4(unsigned long *map, unsigned long *fill, > int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; > u8 pg[4] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf }; > unsigned long *lt = f->lt, bsize = f->bsize; > + NFT_PIPAPO_AVX2_DEBUG_MAP; > > lt += offset * NFT_PIPAPO_LONGS_PER_M256; > for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { > @@ -319,7 +544,7 @@ static int nft_pipapo_avx2_lookup_4b_4(unsigned long *map, unsigned long *fill, > > b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); > if (last) > - return b; > + return NFT_PIPAPO_AVX2_DEBUG_MAP_DONE2(b, 7); > > if (unlikely(ret == -1)) > ret = b / XSAVE_YMM_SIZE; > @@ -331,6 +556,7 @@ static int nft_pipapo_avx2_lookup_4b_4(unsigned long *map, unsigned long *fill, > ; > } > > + NFT_PIPAPO_AVX2_DEBUG_MAP_DONE2(ret, 7); > return ret; > } > > @@ -361,6 +587,7 @@ static int nft_pipapo_avx2_lookup_4b_8(unsigned long *map, unsigned long *fill, > }; > int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; > unsigned long *lt = f->lt, bsize = f->bsize; > + NFT_PIPAPO_AVX2_DEBUG_MAP; > > lt += offset * NFT_PIPAPO_LONGS_PER_M256; > for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { > @@ -414,7 +641,7 @@ static int nft_pipapo_avx2_lookup_4b_8(unsigned long *map, unsigned long *fill, > > b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); > if (last) > - return b; > + return NFT_PIPAPO_AVX2_DEBUG_MAP_DONE(b); > > if (unlikely(ret == -1)) > ret = b / XSAVE_YMM_SIZE; > @@ -427,7 +654,7 @@ static int nft_pipapo_avx2_lookup_4b_8(unsigned long *map, unsigned long *fill, > ; > } > > - return ret; > + return NFT_PIPAPO_AVX2_DEBUG_MAP_DONE(ret); > } > > /** > @@ -458,6 +685,7 @@ static int 
nft_pipapo_avx2_lookup_4b_12(unsigned long *map, unsigned long *fill, > }; > int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; > unsigned long *lt = f->lt, bsize = f->bsize; > + NFT_PIPAPO_AVX2_DEBUG_MAP; > > lt += offset * NFT_PIPAPO_LONGS_PER_M256; > for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { > @@ -505,7 +733,7 @@ static int nft_pipapo_avx2_lookup_4b_12(unsigned long *map, unsigned long *fill, > > b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); > if (last) > - return b; > + return NFT_PIPAPO_AVX2_DEBUG_MAP_DONE(b); > > if (unlikely(ret == -1)) > ret = b / XSAVE_YMM_SIZE; > @@ -517,7 +745,7 @@ static int nft_pipapo_avx2_lookup_4b_12(unsigned long *map, unsigned long *fill, > ; > } > > - return ret; > + return NFT_PIPAPO_AVX2_DEBUG_MAP_DONE(ret); > } > > /** > @@ -553,6 +781,7 @@ static int nft_pipapo_avx2_lookup_4b_32(unsigned long *map, unsigned long *fill, > }; > int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; > unsigned long *lt = f->lt, bsize = f->bsize; > + NFT_PIPAPO_AVX2_DEBUG_MAP; > > lt += offset * NFT_PIPAPO_LONGS_PER_M256; > for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { > @@ -641,7 +870,7 @@ static int nft_pipapo_avx2_lookup_4b_32(unsigned long *map, unsigned long *fill, > > b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); > if (last) > - return b; > + return NFT_PIPAPO_AVX2_DEBUG_MAP_DONE(b); > > if (unlikely(ret == -1)) > ret = b / XSAVE_YMM_SIZE; > @@ -653,7 +882,7 @@ static int nft_pipapo_avx2_lookup_4b_32(unsigned long *map, unsigned long *fill, > ; > } > > - return ret; > + return NFT_PIPAPO_AVX2_DEBUG_MAP_DONE(ret); > } > > /** > @@ -680,6 +909,7 @@ static int nft_pipapo_avx2_lookup_8b_1(unsigned long *map, unsigned long *fill, > { > int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; > unsigned long *lt = f->lt, bsize = f->bsize; > + NFT_PIPAPO_AVX2_DEBUG_MAP; > > lt += offset * 
NFT_PIPAPO_LONGS_PER_M256; > for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { > @@ -687,6 +917,7 @@ static int nft_pipapo_avx2_lookup_8b_1(unsigned long *map, unsigned long *fill, > > if (first) { > NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 0, pkt[0], bsize); > + nft_pipapo_avx2_force_tmp(2, &__pipapo_debug_regmap); > } else { > NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize); > NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]); > @@ -699,7 +930,7 @@ static int nft_pipapo_avx2_lookup_8b_1(unsigned long *map, unsigned long *fill, > > b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); > if (last) > - return b; > + return NFT_PIPAPO_AVX2_DEBUG_MAP_DONE2(b, 2); > > if (unlikely(ret == -1)) > ret = b / XSAVE_YMM_SIZE; > @@ -711,7 +942,10 @@ static int nft_pipapo_avx2_lookup_8b_1(unsigned long *map, unsigned long *fill, > ; > } > > - return ret; > + if (first) > + return ret; > + > + return NFT_PIPAPO_AVX2_DEBUG_MAP_DONE2(ret, 2); > } > > /** > @@ -738,6 +972,7 @@ static int nft_pipapo_avx2_lookup_8b_2(unsigned long *map, unsigned long *fill, > { > int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; > unsigned long *lt = f->lt, bsize = f->bsize; > + NFT_PIPAPO_AVX2_DEBUG_MAP; > > lt += offset * NFT_PIPAPO_LONGS_PER_M256; > for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { > @@ -803,6 +1038,7 @@ static int nft_pipapo_avx2_lookup_8b_4(unsigned long *map, unsigned long *fill, > { > int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; > unsigned long *lt = f->lt, bsize = f->bsize; > + NFT_PIPAPO_AVX2_DEBUG_MAP; > > lt += offset * NFT_PIPAPO_LONGS_PER_M256; > for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { > @@ -879,6 +1115,7 @@ static int nft_pipapo_avx2_lookup_8b_6(unsigned long *map, unsigned long *fill, > { > int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; > unsigned long *lt = f->lt, bsize = f->bsize; > + NFT_PIPAPO_AVX2_DEBUG_MAP; > > lt += offset 
* NFT_PIPAPO_LONGS_PER_M256; > for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { > @@ -965,6 +1202,7 @@ static int nft_pipapo_avx2_lookup_8b_16(unsigned long *map, unsigned long *fill, > { > int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; > unsigned long *lt = f->lt, bsize = f->bsize; > + NFT_PIPAPO_AVX2_DEBUG_MAP; > > lt += offset * NFT_PIPAPO_LONGS_PER_M256; > for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { Everything else looks good to me, thanks for all the improvements! -- Stefano