On Thu, 2025-06-05 at 15:41 -0400, Steven Rostedt wrote: > On Thu, 5 Jun 2025 10:50:50 -0400 > Steven Rostedt <rostedt@xxxxxxxxxxx> wrote: > > > On Thu, 5 Jun 2025 14:41:43 +0000 > > "Flot, Julien" <julien.flot@xxxxxxxxxxx> wrote: > > > > > > > I have tried the patch, I am afraid it does not help. I have > > > roughly > > > the same results as in previous versions. > > > > Bah, there's three more places that call > > tracepoint_synchronize_unregister() :-p > > > > Let me try to get rid of those too. > > So I decided to enable PREEMPT_RT and yes, I can reproduce this too. > > I ran the following: > > # trace-cmd sqlhist -e -n sched_stack select start.prev_state as > state, end.next_comm as comm, TIMESTAMP_DELTA_USECS as delta, > start.STACKTRACE as stack from sched_switch as start join > sched_switch as end on start.prev_pid = end.next_pid > > The above creates a synthetic event that creates a stack trace when a > task > schedules out and records it with the time it scheduled back in. > Basically > the time a task is off the CPU. It also records the state of the task > when > it left the CPU (running, blocked, sleeping, etc). It also saves the > comm > of the task as "comm" (needed for the next command). > > # echo > 'hist:keys=state,stack.stacktrace:vals=delta:sort=state,delta if comm > == "trace-cmd" && state & 3' > > /sys/kernel/tracing/events/synthetic/sched_stack/trigger > > The above creates a histogram with buckets per state, per stack, and > the > value of the total time it was off the CPU for that stack trace. It > filters > on tasks with "comm == trace-cmd" and only the sleeping and blocked > states > (1 - sleeping, 2 - blocked). 
> > # trace-cmd record -p function sleep 1 > > # cat /sys/kernel/tracing/events/synthetic/sched_stack/hist | tail > -50 > path_openat+0x1a36/0x2990 > do_filp_open+0x1c5/0x420 > do_sys_openat2+0xed/0x180 > __x64_sys_openat+0x108/0x1d0 > do_syscall_64+0x7b/0x420 > } hitcount: 2 delta: 74781 > { state: 2, stack.stacktrace > __schedule+0x1545/0x3700 > schedule+0xe2/0x390 > schedule_timeout+0x175/0x200 > wait_for_completion_state+0x294/0x440 > __wait_rcu_gp+0x247/0x4f0 > synchronize_rcu_tasks_generic+0x151/0x230 > apply_subsystem_event_filter+0x77c/0x1300 > subsystem_filter_write+0x67/0xc0 > vfs_write+0x1e2/0xeb0 > ksys_write+0xff/0x1d0 > do_syscall_64+0x7b/0x420 > entry_SYSCALL_64_after_hwframe+0x76/0x7e > } hitcount: 1 delta: 210957 > { state: 2, stack.stacktrace > __schedule+0x1545/0x3700 > schedule+0xe2/0x390 > schedule_timeout+0x175/0x200 > wait_for_completion_state+0x294/0x440 > __wait_rcu_gp+0x247/0x4f0 > synchronize_rcu_normal+0x3db/0x5c0 > apply_subsystem_event_filter+0xa30/0x1300 > subsystem_filter_write+0x67/0xc0 > vfs_write+0x1e2/0xeb0 > ksys_write+0xff/0x1d0 > do_syscall_64+0x7b/0x420 > entry_SYSCALL_64_after_hwframe+0x76/0x7e > } hitcount: 237 delta: 6316500 > { state: 2, stack.stacktrace > __schedule+0x1545/0x3700 > schedule+0xe2/0x390 > schedule_timeout+0x175/0x200 > wait_for_completion_state+0x294/0x440 > __wait_rcu_gp+0x247/0x4f0 > synchronize_rcu_tasks_generic+0x151/0x230 > apply_subsystem_event_filter+0xa2b/0x1300 > subsystem_filter_write+0x67/0xc0 > vfs_write+0x1e2/0xeb0 > ksys_write+0xff/0x1d0 > do_syscall_64+0x7b/0x420 > entry_SYSCALL_64_after_hwframe+0x76/0x7e > } hitcount: 237 delta: 99756288 <<--------------- Delta is > 99 seconds! > > Totals: > Hits: 525 > Entries: 21 > Dropped: 0 > > > This shows that it is definitely waiting on the > synchronize_rcu_tasks() but > I see that it's called by apply_subsystem_event_filter() which I > didn't > cover. 
But I looked at all the tracepoint_synchronize_unregister() in > the > trace_events_filter.c code and replaced them with a double > call_rcu_task/rcu(). > > This appears to fix the issue. With that, trace-cmd record is much > faster > and the histogram now looks like this: > > ~# cat /sys/kernel/tracing/events/synthetic/sched_stack/hist | tail - > 50 > schedule_timeout+0x175/0x200 > wait_for_completion_state+0x294/0x440 > __wait_rcu_gp+0x247/0x4f0 > synchronize_rcu_normal+0x3db/0x5c0 > ring_buffer_reset_online_cpus+0x142/0x350 > tracing_reset_online_cpus+0x14b/0x1e0 > tracing_open+0x335/0x440 > do_dentry_open+0x4c6/0x17a0 > vfs_open+0x82/0x360 > path_openat+0x1a36/0x2990 > do_filp_open+0x1c5/0x420 > do_sys_openat2+0xed/0x180 > __x64_sys_openat+0x108/0x1d0 > } hitcount: 2 delta: 61260 > { state: 2, stack.stacktrace > __schedule+0x1545/0x3700 > schedule+0xe2/0x390 > schedule_timeout+0x175/0x200 > wait_for_completion_state+0x294/0x440 > __wait_rcu_gp+0x247/0x4f0 > synchronize_rcu_normal+0x3db/0x5c0 > ring_buffer_reset_online_cpus+0x142/0x350 > tracing_reset_online_cpus+0x14b/0x1e0 > tracing_set_tracer+0x4d5/0xa80 > tracing_set_trace_write+0x138/0x1e0 > vfs_write+0x1e2/0xeb0 > ksys_write+0xff/0x1d0 > do_syscall_64+0x7b/0x420 > entry_SYSCALL_64_after_hwframe+0x76/0x7e > } hitcount: 2 delta: 61699 > { state: 2, stack.stacktrace > __schedule+0x1545/0x3700 > schedule+0xe2/0x390 > schedule_timeout+0x175/0x200 > wait_for_completion_state+0x294/0x440 > __wait_rcu_gp+0x247/0x4f0 > synchronize_rcu_normal+0x3db/0x5c0 > tracing_reset_online_cpus+0x8f/0x1e0 > tracing_open+0x335/0x440 > do_dentry_open+0x4c6/0x17a0 > vfs_open+0x82/0x360 > path_openat+0x1a36/0x2990 > do_filp_open+0x1c5/0x420 > do_sys_openat2+0xed/0x180 > __x64_sys_openat+0x108/0x1d0 > do_syscall_64+0x7b/0x420 > } hitcount: 2 delta: 77044 > > Totals: > Hits: 55 > Entries: 28 > Dropped: 0 > > > With the max delta wait
time of just 77 milliseconds total. > > -- Steve > > diff --git a/kernel/trace/trace_events_filter.c > b/kernel/trace/trace_events_filter.c > index 2048560264bb..3ff782d6b522 100644 > --- a/kernel/trace/trace_events_filter.c > +++ b/kernel/trace/trace_events_filter.c > @@ -1335,6 +1335,74 @@ static void filter_free_subsystem_preds(struct > trace_subsystem_dir *dir, > } > } > > +struct filter_list { > + struct list_head list; > + struct event_filter *filter; > +}; > + > +struct filter_head { > + struct list_head list; > + struct rcu_head rcu; > +}; > + > + > +static void free_filter_list(struct rcu_head *rhp) > +{ > + struct filter_head *filter_list = container_of(rhp, struct > filter_head, rcu); > + struct filter_list *filter_item, *tmp; > + > + list_for_each_entry_safe(filter_item, tmp, &filter_list- > >list, list) { > + __free_filter(filter_item->filter); > + list_del(&filter_item->list); > + kfree(filter_item); > + } > + kfree(filter_list); > +} > + > +static void free_filter_list_tasks(struct rcu_head *rhp) > +{ > + call_rcu(rhp, free_filter_list); > +} > + > +/* > + * The tracepoint_synchronize_unregister() is a double rcu call. > + * It calls synchronize_rcu_tasks_trace() followed by > synchronize_rcu(). > + * Instead of waiting for it, simply call these via the call_rcu*() > + * variants. 
> + */ > +static void delay_free_filter(struct filter_head *head) > +{ > + call_rcu_tasks_trace(&head->rcu, free_filter_list_tasks); > +} > + > +static void try_delay_free_filter(struct event_filter *filter) > +{ > + struct filter_head *head; > + struct filter_list *item; > + > + head = kmalloc(sizeof(*head), GFP_KERNEL); > + if (!head) > + goto free_now; > + > + INIT_LIST_HEAD(&head->list); > + > + item = kmalloc(sizeof(*item), GFP_KERNEL); > + if (!item) { > + kfree(head); > + goto free_now; > + } > + > + item->filter = filter; > + list_add_tail(&item->list, &head->list); > + delay_free_filter(head); > + return; > + > + free_now: > + /* Make sure the filter is not being used */ > + tracepoint_synchronize_unregister(); > + __free_filter(filter); > +} > + > static inline void __free_subsystem_filter(struct trace_event_file > *file) > { > __free_filter(file->filter); > @@ -1342,15 +1410,53 @@ static inline void > __free_subsystem_filter(struct trace_event_file *file) > } > > static void filter_free_subsystem_filters(struct trace_subsystem_dir > *dir, > - struct trace_array *tr) > + struct trace_array *tr, > + struct event_filter > *filter) > { > struct trace_event_file *file; > + struct filter_head *head; > + struct filter_list *item; > + > + head = kmalloc(sizeof(*head), GFP_KERNEL); > + if (!head) > + goto free_now; > + > + INIT_LIST_HEAD(&head->list); > + > + item = kmalloc(sizeof(*item), GFP_KERNEL); > + if (!item) { > + kfree(head); > + goto free_now; > + } > + > + item->filter = filter; > + list_add_tail(&item->list, &head->list); > > list_for_each_entry(file, &tr->events, list) { > if (file->system != dir) > continue; > - __free_subsystem_filter(file); > + item = kmalloc(sizeof(*item), GFP_KERNEL); > + if (!item) > + goto free_now; > + item->filter = file->filter; > + list_add_tail(&item->list, &head->list); > + file->filter = NULL; > + } > + > + delay_free_filter(head); > + return; > + free_now: > + tracepoint_synchronize_unregister(); > + > + if (head) > + 
free_filter_list(&head->rcu); > + > + list_for_each_entry(file, &tr->events, list) { > + if (file->system != dir || !file->filter) > + continue; > + __free_filter(file->filter); > } > + __free_filter(filter); > } > > int filter_assign_type(const char *type) > @@ -2131,11 +2237,6 @@ static inline void event_clear_filter(struct > trace_event_file *file) > RCU_INIT_POINTER(file->filter, NULL); > } > > -struct filter_list { > - struct list_head list; > - struct event_filter *filter; > -}; > - > static int process_system_preds(struct trace_subsystem_dir *dir, > struct trace_array *tr, > struct filter_parse_error *pe, > @@ -2144,11 +2245,16 @@ static int process_system_preds(struct > trace_subsystem_dir *dir, > struct trace_event_file *file; > struct filter_list *filter_item; > struct event_filter *filter = NULL; > - struct filter_list *tmp; > - LIST_HEAD(filter_list); > + struct filter_head *filter_list; > bool fail = true; > int err; > > + filter_list = kmalloc(sizeof(*filter_list), GFP_KERNEL); > + if (!filter_list) > + return -ENOMEM; > + > + INIT_LIST_HEAD(&filter_list->list); > + > list_for_each_entry(file, &tr->events, list) { > > if (file->system != dir) > @@ -2175,7 +2281,7 @@ static int process_system_preds(struct > trace_subsystem_dir *dir, > if (!filter_item) > goto fail_mem; > > - list_add_tail(&filter_item->list, &filter_list); > + list_add_tail(&filter_item->list, &filter_list- > >list); > /* > * Regardless of if this returned an error, we still > * replace the filter for the call. > @@ -2195,31 +2301,22 @@ static int process_system_preds(struct > trace_subsystem_dir *dir, > * Do a synchronize_rcu() and to ensure all calls are > * done with them before we free them. 
> */ > - tracepoint_synchronize_unregister(); > - list_for_each_entry_safe(filter_item, tmp, &filter_list, > list) { > - __free_filter(filter_item->filter); > - list_del(&filter_item->list); > - kfree(filter_item); > - } > + delay_free_filter(filter_list); > return 0; > fail: > /* No call succeeded */ > - list_for_each_entry_safe(filter_item, tmp, &filter_list, > list) { > - list_del(&filter_item->list); > - kfree(filter_item); > - } > + free_filter_list(&filter_list->rcu); > parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0); > return -EINVAL; > fail_mem: > __free_filter(filter); > + > /* If any call succeeded, we still need to sync */ > if (!fail) > - tracepoint_synchronize_unregister(); > - list_for_each_entry_safe(filter_item, tmp, &filter_list, > list) { > - __free_filter(filter_item->filter); > - list_del(&filter_item->list); > - kfree(filter_item); > - } > + delay_free_filter(filter_list); > + else > + free_filter_list(&filter_list->rcu); > + > return -ENOMEM; > } > > @@ -2361,9 +2458,7 @@ int apply_event_filter(struct trace_event_file > *file, char *filter_string) > > event_clear_filter(file); > > - /* Make sure the filter is not being used */ > - tracepoint_synchronize_unregister(); > - __free_filter(filter); > + try_delay_free_filter(filter); > > return 0; > } > @@ -2387,11 +2482,8 @@ int apply_event_filter(struct trace_event_file > *file, char *filter_string) > > event_set_filter(file, filter); > > - if (tmp) { > - /* Make sure the call is done with the > filter */ > - tracepoint_synchronize_unregister(); > - __free_filter(tmp); > - } > + if (tmp) > + try_delay_free_filter(tmp); > } > > return err; > @@ -2417,9 +2509,7 @@ int apply_subsystem_event_filter(struct > trace_subsystem_dir *dir, > filter = system->filter; > system->filter = NULL; > /* Ensure all filters are no longer used */ > - tracepoint_synchronize_unregister(); > - filter_free_subsystem_filters(dir, tr); > - __free_filter(filter); > + filter_free_subsystem_filters(dir, tr, filter); > return 0; 
> } > Thank you very much, it also works for me! Tested-by: Julien Flot <julien.flot@xxxxxxxxxxx> -- Julien Flot Siemens AG www.siemens.com