On Thu, 5 Jun 2025 09:03:03 +0000 "Flot, Julien" <julien.flot@xxxxxxxxxxx> wrote: > Hi, > > Sorry to create this duplicate message, the previous one was not > formatted as plain text ... > > With recent versions of the kernel, ftrace seems to experience some > latencies when writing events filter files on a real time kernel. > Latencies experienced add an extra minute of runtime with trace-cmd > compared to a regular execution and sometimes `trace-cmd` just hangs for > multiple minutes. > > The bug has been tested and reproduced with these versions of the kernel > (all with real time enabled) : 6.12.19, 6.14-rc7, 6.15 and current > upstream. > The bug has also been reproduced on two machines, both running Ubuntu > 24.04 with an upstream kernel, one running with an INTEL(R) XEON(R) > GOLD 5512U, the other an Intel(R) Xeon(R) E-2278GE. The bug also appears > in virtualised environments (KVM/QEMU). > I have run `git-bisect` between v6.12 and v6.15, the faulty commit > seems to be this one : a363d27cdbc2bc2d1899b5a1520b64e3590fcd9a > > Faulty commit message : > ``` > commit a363d27cdbc2bc2d1899b5a1520b64e3590fcd9a > Author: Mathieu Desnoyers > <[mathieu.desnoyers@xxxxxxxxxxxx](mailto:mathieu.desnoyers@xxxxxxxxxxxx > )> > Date: Tue Oct 8 21:07:15 2024 -0400 > fix > tracing: Allow system call tracepoints to handle page faults > > Use Tasks Trace RCU to protect iteration of system call enter/exit > tracepoint probes to allow those probes to handle page faults. > > In preparation for this change, all tracers registering to system call > enter/exit tracepoints should expect those to be called with preemption > enabled. > > This allows tracers to fault-in userspace system call arguments such as > path strings within their probe callbacks. > I believe I know the issue. 
The filter logic calls: tracepoint_synchronize_unregister(); And that patch added: @@ -107,6 +108,7 @@ void for_each_tracepoint_in_module(struct module *mod, #ifdef CONFIG_TRACEPOINTS static inline void tracepoint_synchronize_unregister(void) { + synchronize_rcu_tasks_trace(); synchronize_rcu(); } Where synchronize_rcu_tasks_trace() is much more heavy weight call than synchronize_rcu(). Can you try this patch to see if it helps? -- Steve diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 10f714c80e4d..a3857c36684b 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -2158,6 +2158,29 @@ struct filter_list { struct event_filter *filter; }; +struct filter_head { + struct list_head list; + struct rcu_head rcu; +}; + +static void free_filter_list(struct rcu_head *rhp) +{ + struct filter_head *filter_list = container_of(rhp, struct filter_head, rcu); + struct filter_list *filter_item, *tmp; + + list_for_each_entry_safe(filter_item, tmp, &filter_list->list, list) { + __free_filter(filter_item->filter); + list_del(&filter_item->list); + kfree(filter_item); + } + kfree(filter_list); +} + +static void free_filter_list_tasks(struct rcu_head *rhp) +{ + call_rcu(rhp, free_filter_list); +} + static int process_system_preds(struct trace_subsystem_dir *dir, struct trace_array *tr, struct filter_parse_error *pe, @@ -2167,10 +2190,16 @@ static int process_system_preds(struct trace_subsystem_dir *dir, struct filter_list *filter_item; struct event_filter *filter = NULL; struct filter_list *tmp; - LIST_HEAD(filter_list); + struct filter_head *filter_list; bool fail = true; int err; + filter_list = kmalloc(sizeof(*filter_list), GFP_KERNEL); + if (!filter_list) + return -ENOMEM; + + INIT_LIST_HEAD(&filter_list->list); + list_for_each_entry(file, &tr->events, list) { if (file->system != dir) @@ -2197,7 +2226,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir, if (!filter_item) goto fail_mem; - 
list_add_tail(&filter_item->list, &filter_list); + list_add_tail(&filter_item->list, &filter_list->list); /* * Regardless of if this returned an error, we still * replace the filter for the call. @@ -2217,31 +2246,22 @@ static int process_system_preds(struct trace_subsystem_dir *dir, * Do a synchronize_rcu() and to ensure all calls are * done with them before we free them. */ - tracepoint_synchronize_unregister(); - list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { - __free_filter(filter_item->filter); - list_del(&filter_item->list); - kfree(filter_item); - } + call_rcu_tasks(&filter_list->rcu, free_filter_list_tasks); return 0; fail: /* No call succeeded */ - list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { - list_del(&filter_item->list); - kfree(filter_item); - } + free_filter_list(&filter_list->rcu); parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0); return -EINVAL; fail_mem: __free_filter(filter); + /* If any call succeeded, we still need to sync */ if (!fail) - tracepoint_synchronize_unregister(); - list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { - __free_filter(filter_item->filter); - list_del(&filter_item->list); - kfree(filter_item); - } + call_rcu_tasks(&filter_list->rcu, free_filter_list_tasks); + else + free_filter_list(&filter_list->rcu); + return -ENOMEM; }