On Tue, 1 Jul 2025 at 17:25, Alexei Starovoitov <alexei.starovoitov@xxxxxxxxx> wrote: > > What is the height of 100m tree ? Because this trie implementation is essentially a binary tree the height is given by log2(100m) = 26. > What kind of "recursive algo" you have in mind? Something like this: --- kernel/bpf/lpm_trie.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 010e91ac978e..f4b07920a321 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -33,7 +33,13 @@ struct lpm_trie { struct bpf_map map; struct lpm_trie_node __rcu *root; size_t n_entries; + /* Maximum prefix length permitted */ size_t max_prefixlen; + /* Largest prefix length of any node ever inserted. Used for an + * optimisation in trie_free() and is not updated on node + * deletion. + */ + size_t largest_prefixlen; size_t data_size; spinlock_t lock; }; @@ -450,6 +456,10 @@ static long trie_update_elem(struct bpf_map *map, out: if (ret) kfree(new_node); + else + trie->largest_prefixlen = max(trie->largest_prefixlen, + key->prefixlen); + spin_unlock_irqrestore(&trie->lock, irq_flags); kfree_rcu(free_node, rcu); @@ -599,12 +609,40 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) return &trie->map; } +static void __trie_free(struct lpm_trie_node __rcu **slot) +{ + struct lpm_trie_node *node; + + node = rcu_dereference_protected(*slot, 1); + if (!node) + return; + + if (rcu_access_pointer(node->child[0])) + __trie_free(&node->child[0]); + + if (rcu_access_pointer(node->child[1])) + __trie_free(&node->child[1]); + + kfree(node); + RCU_INIT_POINTER(*slot, NULL); +} + static void trie_free(struct bpf_map *map) { struct lpm_trie *trie = container_of(map, struct lpm_trie, map); struct lpm_trie_node __rcu **slot; struct lpm_trie_node *node; + /* When we know the largest prefixlen used by any node is <= 32 + * we're guaranteed that the height of the trie is at most 32. + * And in that case, we can use a faster recursive freeing + * algorithm without worrying about blowing the stack. + */ + if (trie->largest_prefixlen <= 32) { + __trie_free(&trie->root); + goto out; + } + /* Always start at the root and walk down to a node that has no * children. Then free that node, nullify its reference in the parent * and start over. > Could you try to keep a stack of nodes visited and once leaf is > freed pop a node and continue walking. > Then total height won't be a factor. > The stack would need to be kmalloc-ed, of course, > but still should be faster than walking from the root. Sure, I can give this a shot. Plus we won't need to guard against blowing up the stack.