Re: [RFC dwarves] btf_encoder: Remove duplicates from functions entries

Alan Maguire <alan.maguire@xxxxxxxxxx> · Tue, 22 Jul 2025 11:45:58 +0100

On 22/07/2025 00:27, Ihor Solodrai wrote:
> On 7/21/25 7:27 AM, Jiri Olsa wrote:
>> On Mon, Jul 21, 2025 at 12:41:00PM +0100, Alan Maguire wrote:
>>> On 17/07/2025 16:25, Jiri Olsa wrote:
>>>> Menglong reported issue where we can have function in BTF which has
>>>> multiple addresses in kallsysm [1].
>>>>
>>>> Rather than filtering this in runtime, let's teach pahole to remove
>>>> such functions.
>>>>
>>>> Removing duplicate records from functions entries that have more
>>>> at least one different address. This way btf_encoder__find_function
>>>> won't find such functions and they won't be added in BTF.
>>>>
>>>> In my setup it removed 428 functions out of 77141.
>>>>
>>>
>>> Is such removal necessary? If the presence of an mcount annotation is
>>> the requirement, couldn't we just utilize
>>>
>>> /sys/kernel/tracing/available_filter_functions_addrs
>>>
>>> to map name to address safely? It identifies mcount-containing functions
>>> and some of these appear to be duplicates, for example there I see
>>>
>>> ffffffff8376e8b4 acpi_attr_is_visible
>>> ffffffff8379b7d4 acpi_attr_is_visible
>>
>> for that we'd need new interface for loading fentry/fexit.. programs, right?
>>
>> the current interface to get fentry/fexit.. attach address is:
>>    - user specifies function name, that translates to btf_id
>>    - in kernel that btf id translates back to function name
>>    - kernel uses kallsyms_lookup_name or find_kallsyms_symbol_value
>>      to get the address
>>
>> so we don't really know which address user wanted in the first place
> 
> Hi Jiri, Alan.
> 
> I stumbled on a bug today which seems to be relevant to this
> discussion.
> 
> I tried building the kernel with KASAN and UBSAN, and that resulted in
> some kfuncs disappearing from vmlinux.h, triggering selftests/bpf
> compilation errors, for example:
> 
>       CLNG-BPF [test_progs-no_alu32] cgroup_read_xattr.bpf.o
>     progs/cgroup_read_xattr.c:127:13: error: call to undeclared function 'bpf_cgroup_ancestor'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
>       127 |         ancestor = bpf_cgroup_ancestor(cgrp, 1);
>           |                    ^
> 
> Here is a piece of vmlinux.h diff between CONFIG_UBSAN=y/n:
> 
>     --- ./tools/testing/selftests/bpf/tools/include/vmlinux.h	2025-07-21 17:35:14.415733105 +0000
>     +++ ubsan_vmlinux.h	2025-07-21 17:33:10.455312623 +0000
>     @@ -117203,7 +117292,6 @@
>      extern int bpf_arena_reserve_pages(void *p__map, void __attribute__((address_space(1))) *ptr__ign, u32 page_cnt) __weak __ksym;
>      extern __bpf_fastcall void *bpf_cast_to_kern_ctx(void *obj) __weak __ksym;
>      extern struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp) __weak __ksym;
>     -extern struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level) __weak __ksym;
>      extern struct cgroup *bpf_cgroup_from_id(u64 cgid) __weak __ksym;
>      extern int bpf_cgroup_read_xattr(struct cgroup *cgroup, const char *name__str, struct bpf_dynptr *value_p) __weak __ksym;
>      extern void bpf_cgroup_release(struct cgroup *cgrp) __weak __ksym;
>     @@ -117260,7 +117348,6 @@
>      extern bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *p) __weak __ksym;
>      extern int bpf_dynptr_memset(struct bpf_dynptr *p, u32 offset, u32 size, u8 val) __weak __ksym;
>      extern __u32 bpf_dynptr_size(const struct bpf_dynptr *p) __weak __ksym;
>     -extern void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset, void *buffer__opt, u32 buffer__szk) __weak __ksym;
>      extern void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u32 offset, void *buffer__opt, u32 buffer__szk) __weak __ksym;
>      extern int bpf_fentry_test1(int a) __weak __ksym;
>      extern int bpf_get_dentry_xattr(struct dentry *dentry, const char *name__str, struct bpf_dynptr *value_p) __weak __ksym;
>     @@ -117287,7 +117374,6 @@
>      extern int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end) __weak __ksym;
>      extern int *bpf_iter_num_next(struct bpf_iter_num *it) __weak __ksym;
>      extern void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __weak __ksym;
>     -extern int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, u64 flags) __weak __ksym;
>      extern struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) __weak __ksym;
>      extern void bpf_iter_task_destroy(struct bpf_iter_task *it) __weak __ksym;
>      extern int bpf_iter_task_new(struct bpf_iter_task *it, struct task_struct *task__nullable, unsigned int flags) __weak __ksym;
>     @@ -117373,11 +117459,8 @@
>      extern int bpf_strspn(const char *s__ign, const char *accept__ign) __weak __ksym;
>      extern int bpf_strstr(const char *s1__ign, const char *s2__ign) __weak __ksym;
>      extern struct task_struct *bpf_task_acquire(struct task_struct *p) __weak __ksym;
>     -extern struct task_struct *bpf_task_from_pid(s32 pid) __weak __ksym;
>     -extern struct task_struct *bpf_task_from_vpid(s32 vpid) __weak __ksym;
>      extern struct cgroup *bpf_task_get_cgroup1(struct task_struct *task, int hierarchy_id) __weak __ksym;
>      extern void bpf_task_release(struct task_struct *p) __weak __ksym;
>     -extern long int bpf_task_under_cgroup(struct task_struct *task, struct cgroup *ancestor) __weak __ksym;
>      extern void bpf_throw(u64 cookie) __weak __ksym;
>      extern int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, struct bpf_dynptr *sig_p, struct bpf_key *trusted_keyring) __weak __ksym;
>      extern int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) __weak __ksym;
>     @@ -117412,15 +117495,10 @@
>      extern u32 scx_bpf_cpuperf_cur(s32 cpu) __weak __ksym;
>      extern void scx_bpf_cpuperf_set(s32 cpu, u32 perf) __weak __ksym;
>      extern s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __weak __ksym;
>     -extern void scx_bpf_destroy_dsq(u64 dsq_id) __weak __ksym;
>     -extern void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __weak __ksym;
>      extern void scx_bpf_dispatch_cancel(void) __weak __ksym;
>      extern bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __weak __ksym;
>     -extern void scx_bpf_dispatch_from_dsq_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __weak __ksym;
>     -extern void scx_bpf_dispatch_from_dsq_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __weak __ksym;
>      extern u32 scx_bpf_dispatch_nr_slots(void) __weak __ksym;
>      extern void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __weak __ksym;
>     -extern bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __weak __ksym;
>      extern void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __weak __ksym;
>      extern void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __weak __ksym;
>      extern bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __weak __ksym;
>     @@ -117428,10 +117506,8 @@
>      extern void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __weak __ksym;
>      extern bool scx_bpf_dsq_move_to_local(u64 dsq_id) __weak __ksym;
>      extern bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __weak __ksym;
>     -extern s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __weak __ksym;
>      extern void scx_bpf_dump_bstr(char *fmt, long long unsigned int *data, u32 data__sz) __weak __ksym;
>      extern void scx_bpf_error_bstr(char *fmt, long long unsigned int *data, u32 data__sz) __weak __ksym;
>     -extern void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __weak __ksym;
>      extern void scx_bpf_exit_bstr(s64 exit_code, char *fmt, long long unsigned int *data, u32 data__sz) __weak __ksym;
>      extern const struct cpumask *scx_bpf_get_idle_cpumask(void) __weak __ksym;
>      extern const struct cpumask *scx_bpf_get_idle_cpumask_node(int node) __weak __ksym;
> 
> Then I checked the difference between BTFs, and found that there is no
> DECL_TAG 'bpf_kfunc' produced for the affected functions:
> 
>     $ diff -u vmlinux.btf.out vmlinux_ubsan.btf.out | grep -C5 cgroup_ancestor
>     +[52749] FUNC 'bpf_cgroup_acquire' type_id=52748 linkage=static
>     +[52750] DECL_TAG 'bpf_kfunc' type_id=52749 component_idx=-1
>     +[52751] FUNC_PROTO '(anon)' ret_type_id=426 vlen=2
>             'cgrp' type_id=426
>             'level' type_id=21
>     -[52681] FUNC 'bpf_cgroup_ancestor' type_id=52680 linkage=static
>     -[52682] DECL_TAG 'bpf_kfunc' type_id=52681 component_idx=-1
>     -[52683] FUNC_PROTO '(anon)' ret_type_id=3987 vlen=2
>     +[52752] FUNC 'bpf_cgroup_ancestor' type_id=52751 linkage=static
>     +[52753] FUNC_PROTO '(anon)' ret_type_id=3987 vlen=2
>             'attach_type' type_id=1767
>             'attach_btf_id' type_id=34
>     -[52684] FUNC 'bpf_cgroup_atype_find' type_id=52683 linkage=static
>     -[52685] FUNC_PROTO '(anon)' ret_type_id=0 vlen=2
> 
> Which is clearly wrong and suggests a bug.
> 
> After some debugging, I found that the problem is in
> btf_encoder__find_function(), and more specifically in
> the comparator used for bsearch and qsort.
> 
>    static int functions_cmp(const void *_a, const void *_b)
>    {
>    	const struct elf_function *a = _a;
>    	const struct elf_function *b = _b;
> 
>    	/* if search key allows prefix match, verify target has matching
>    	 * prefix len and prefix matches.
>    	 */
>    	if (a->prefixlen && a->prefixlen == b->prefixlen)
>    		return strncmp(a->name, b->name, b->prefixlen);
>    	return strcmp(a->name, b->name);
>    }
> 
> For this particular vmlinux that I compiled,
> btf_encoder__find_function("bpf_cgroup_ancestor", prefixlen=0) returns
> NULL, even though there is an elf_function struct for
> bpf_cgroup_ancestor in the table.
> 
> The reason for it is that bsearch() happens to hit
> "bpf_cgroup_ancestor.cold" in the table first.
> strcmp("bpf_cgroup_ancestor", "bpf_cgroup_ancestor.cold)") gives a
> negative value, but bpf_cgroup_ancestor entry is stored in the other
> direction in the table.
> 
> This is surprising at the first glance, because we use the same
> functions_cmp both for search and for qsort.
> 
> But as it turns we are actually using two different comparators within
> functions_cmp(): we set key.prefixlen=0 for exact match and when it's
> non-zero we search for prefix match. When sorting the table, there are
> no entries with prefixlen=0, so the order of elements is not exactly
> right for the bsearch().
> 
> That's a nasty bug, but as far as I understand, all this complexity is
> unnecessary in case of '.cold' suffix, because they simply are not
> supposed to be in the elf_functions table: it's usually just a piece
> of a target function.
> 
> There are a couple of ways this particular bug could be fixed
> (filtering out .cold symbols, for example). But I think this bug and
> the problem Jiri is trying to solve stems from the fact that one
> function name, which is an identifier the users care about the most,
> may be associated with many ELF symbols and/or addresses.
> 
> What is clear to me in the context of pahole's BTF encoding is that we
> want elf_functions table to only have a single entry per name (where
> name is an actual name that might be referred to by users, not an ELF
> sym name), and have a deterministic mechanism for selecting one (or
> none) func from many at the time of processing ELF data.
> 
> The current implementation is just buggy in this regard.
> 

There are a few separable issues here I think.

First as you say, certain suffixes should not be eligible as matches at
all - .cold is one, and .part is another (partial inlining). As such
they should be filtered and removed as potential matches.

Second we need to fix the function sort/search logic.

Third we need to decide how to deal with cases where the function does
not correspond to an mcount boundary. It'd be interesting to see if the
filtering helps here, but part of the problem is also that we don't
currently have a mechanism to help guide the match between function name
and function site that is done when the fentry attach is carried out.
Yonghong and I talked about it in [1].

Addresses seem like the natural means to help guide that, so a
DATASEC-like set of addresses would help this matching. I had a WIP
version of this but it wasn't working fully yet. I'll revive it and see
if I can get it out as an RFC. Needs to take into account the work being
done on inlines too [1].

In terms of the tracer's actual intent, multi-site functions are often
"static inline" functions in a .h file that don't actually get inlined;
the user intent would be often to trace all instances, but it seems to
me we need to provide a means to support both this or to trace a
specific instance. How the latter is best represented from the tracer
side I'm not sure; raw addresses would be one way I suppose. Absent an
explicit request from the tracer I'm not sure what heuristics make most
sense; currently we just pick the first instance I suspect, and might
need to continue to do so for backwards compatibility.

> I am not aware of long term plans for addressing this, though it looks
> like this was discussed before. I'd appreciate if you share any
> relevant threads.
> 

Yonghong and I discussed this a bit in [1], and the inline thread in [2]
has some more details.

[1]
https://lpc.events/event/18/contributions/1945/attachments/1508/3179/Kernel%20func%20tracing%20in%20the%20face%20of%20compiler%20optimization.pdf
[2]
https://lore.kernel.org/bpf/20250416-btf_inline-v1-0-e4bd2f8adae5@xxxxxxxx/

> Thanks.
> 
> 
>>
>> I think we discussed this issue some time ago, but I'm not sure what
>> the proposal was at the end (function address stored in BTF?)
>>
>> this change is to make sure there are no duplicate functions in BTF
>> that could cause this confusion.. rather than bad result, let's deny
>> the attach for such function
>>
>> jirka
>>
>>
>>>
>>> ?
>>>
>>>> [1] https://lore.kernel.org/bpf/20250710070835.260831-1-dongml2@xxxxxxxxxxxxxxx/
>>>> Reported-by: Menglong Dong <menglong8.dong@xxxxxxxxx>
>>>> Signed-off-by: Jiri Olsa <jolsa@xxxxxxxxxx>
>>>> ---
>>>>
>>>> Alan,
>>>> I'd like to test this in the pahole CI, is there a way to manualy trigger it?
>>>>
>>>
>>> Easiest way is to base from pahole's next branch and push to a github
>>> repo; the tests will run as actions there. I've just merged the function
>>> comparison work so that will be available if you base/sync a branch on
>>> next from git.kernel.org/pub/scm/devel/pahole/pahole.git/ . Thanks!
>>>
>>> Alan
>>>
>>>
>>>> thanks,
>>>> jirka
>>>>
>>>>
>>>> ---
>>>>   btf_encoder.c | 37 +++++++++++++++++++++++++++++++++++++
>>>>   1 file changed, 37 insertions(+)
>>>>
>>>> diff --git a/btf_encoder.c b/btf_encoder.c
>>>> index 16739066caae..a25fe2f8bfb1 100644
>>>> --- a/btf_encoder.c
>>>> +++ b/btf_encoder.c
>>>> @@ -99,6 +99,7 @@ struct elf_function {
>>>>   	size_t		prefixlen;
>>>>   	bool		kfunc;
>>>>   	uint32_t	kfunc_flags;
>>>> +	unsigned long	addr;
>>>>   };
>>>>   
>>>>   struct elf_secinfo {
>>>> @@ -1469,6 +1470,7 @@ static void elf_functions__collect_function(struct elf_functions *functions, GEl
>>>>   
>>>>   	func = &functions->entries[functions->cnt];
>>>>   	func->name = name;
>>>> +	func->addr = sym->st_value;
>>>>   	if (strchr(name, '.')) {
>>>>   		const char *suffix = strchr(name, '.');
>>>>   
>>>> @@ -2143,6 +2145,40 @@ int btf_encoder__encode(struct btf_encoder *encoder, struct conf_load *conf)
>>>>   	return err;
>>>>   }
>>>>   
>>>> +/*
>>>> + * Remove name duplicates from functions->entries that have
>>>> + * at least 2 different addresses.
>>>> + */
>>>> +static void functions_remove_dups(struct elf_functions *functions)
>>>> +{
>>>> +	struct elf_function *n = &functions->entries[0];
>>>> +	bool matched = false, diff = false;
>>>> +	int i, j;
>>>> +
>>>> +	for (i = 0, j = 1; i < functions->cnt && j < functions->cnt; i++, j++) {
>>>> +		struct elf_function *a = &functions->entries[i];
>>>> +		struct elf_function *b = &functions->entries[j];
>>>> +
>>>> +		if (!strcmp(a->name, b->name)) {
>>>> +			matched = true;
>>>> +			diff |= a->addr != b->addr;
>>>> +			continue;
>>>> +		}
>>>> +
>>>> +		/*
>>>> +		 * Keep only not-matched entries and last one of the matched/duplicates
>>>> +		 * ones if all of the matched entries had the same address.
>>>> +		 **/
>>>> +		if (!matched || !diff)
>>>> +			*n++ = *a;
>>>> +		matched = diff = false;
>>>> +	}
>>>> +
>>>> +	if (!matched || !diff)
>>>> +		*n++ = functions->entries[functions->cnt - 1];
>>>> +	functions->cnt = n - &functions->entries[0];
>>>> +}
>>>> +
>>>>   static int elf_functions__collect(struct elf_functions *functions)
>>>>   {
>>>>   	uint32_t nr_symbols = elf_symtab__nr_symbols(functions->symtab);
>>>> @@ -2168,6 +2204,7 @@ static int elf_functions__collect(struct elf_functions *functions)
>>>>   
>>>>   	if (functions->cnt) {
>>>>   		qsort(functions->entries, functions->cnt, sizeof(*functions->entries), functions_cmp);
>>>> +		functions_remove_dups(functions);
>>>>   	} else {
>>>>   		err = 0;
>>>>   		goto out_free;
>>>