On 2025/8/22 9:37, Tiffany Yang wrote: > There isn't yet a clear way to identify a set of "lost" time that > everyone (or at least a wider group of users) cares about. However, > users can perform some delay accounting by iterating over components of > interest. This patch allows cgroup v2 freezing time to be one of those > components. > > Track the cumulative time that each v2 cgroup spends freezing and expose > it to userland via a new local stat file in cgroupfs. Thank you to > Michal, who provided the ASCII art in the updated documentation. > > To access this value: > $ mkdir /sys/fs/cgroup/test > $ cat /sys/fs/cgroup/test/cgroup.stat.local > frozen_usec 0 > > Ensure consistent freeze time reads with freeze_seq, a per-cgroup > sequence counter. Writes are serialized using the css_set_lock. > > Signed-off-by: Tiffany Yang <ynaffit@xxxxxxxxxx> > Cc: Tejun Heo <tj@xxxxxxxxxx> > Cc: Michal Koutný <mkoutny@xxxxxxxx> > --- > v3 -> v4: > * Replace "freeze_time_total" with "frozen" and expose stats via > cgroup.stat.local, as recommended by Tejun. > * Use the same timestamp when freezing/unfreezing a cgroup as its > descendants, as suggested by Michal. > > v2 -> v3: > * Use seqcount along with css_set_lock to guard freeze time accesses, as > suggested by Michal. > > v1 -> v2: > * Track per-cgroup freezing time instead of per-task frozen time, as > suggested by Tejun. > --- > Documentation/admin-guide/cgroup-v2.rst | 18 ++++++++++++++++ > include/linux/cgroup-defs.h | 17 +++++++++++++++ > kernel/cgroup/cgroup.c | 28 +++++++++++++++++++++++++ > kernel/cgroup/freezer.c | 16 ++++++++++---- > 4 files changed, 75 insertions(+), 4 deletions(-) > > diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst > index 51c0bc4c2dc5..a1e3d431974c 100644 > --- a/Documentation/admin-guide/cgroup-v2.rst > +++ b/Documentation/admin-guide/cgroup-v2.rst > @@ -1001,6 +1001,24 @@ All cgroup core files are prefixed with "cgroup." 
> Total number of dying cgroup subsystems (e.g. memory > cgroup) at and beneath the current cgroup. > > + cgroup.stat.local > + A read-only flat-keyed file which exists in non-root cgroups. > + The following entry is defined: > + > + frozen_usec > + Cumulative time that this cgroup has spent between freezing and > + thawing, regardless of whether by self or ancestor groups. > + NB: (not) reaching "frozen" state is not accounted here. > + > + Using the following ASCII representation of a cgroup's freezer > + state, :: > + > + 1 _____ > + frozen 0 __/ \__ > + ab cd > + > + the duration being measured is the span between a and c. > + > cgroup.freeze > A read-write single value file which exists on non-root cgroups. > Allowed values are "0" and "1". The default is "0". > diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h > index 6b93a64115fe..539c64eeef38 100644 > --- a/include/linux/cgroup-defs.h > +++ b/include/linux/cgroup-defs.h > @@ -433,6 +433,23 @@ struct cgroup_freezer_state { > * frozen, SIGSTOPped, and PTRACEd. > */ > int nr_frozen_tasks; > + > + /* Freeze time data consistency protection */ > + seqcount_t freeze_seq; > + > + /* > + * Most recent time the cgroup was requested to freeze. > + * Accesses guarded by freeze_seq counter. Writes serialized > + * by css_set_lock. > + */ > + u64 freeze_start_nsec; > + > + /* > + * Total duration the cgroup has spent freezing. > + * Accesses guarded by freeze_seq counter. Writes serialized > + * by css_set_lock. 
> + */ > + u64 frozen_nsec; > }; > > struct cgroup { > diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c > index 312c6a8b55bb..ab096b884bbc 100644 > --- a/kernel/cgroup/cgroup.c > +++ b/kernel/cgroup/cgroup.c > @@ -3763,6 +3763,27 @@ static int cgroup_stat_show(struct seq_file *seq, void *v) > return 0; > } > > +static int cgroup_core_local_stat_show(struct seq_file *seq, void *v) > +{ > + struct cgroup *cgrp = seq_css(seq)->cgroup; > + unsigned int sequence; > + u64 freeze_time; > + > + do { > + sequence = read_seqcount_begin(&cgrp->freezer.freeze_seq); > + freeze_time = cgrp->freezer.frozen_nsec; > + /* Add in current freezer interval if the cgroup is freezing. */ > + if (test_bit(CGRP_FREEZE, &cgrp->flags)) > + freeze_time += (ktime_get_ns() - > + cgrp->freezer.freeze_start_nsec); > + } while (read_seqcount_retry(&cgrp->freezer.freeze_seq, sequence)); > + > + seq_printf(seq, "frozen_usec %llu\n", > + (unsigned long long) freeze_time / NSEC_PER_USEC); > + > + return 0; > +} > + > #ifdef CONFIG_CGROUP_SCHED > /** > * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem > @@ -5354,6 +5375,11 @@ static struct cftype cgroup_base_files[] = { > .name = "cgroup.stat", > .seq_show = cgroup_stat_show, > }, > + { > + .name = "cgroup.stat.local", > + .flags = CFTYPE_NOT_ON_ROOT, > + .seq_show = cgroup_core_local_stat_show, > + }, > { > .name = "cgroup.freeze", > .flags = CFTYPE_NOT_ON_ROOT, > @@ -5763,6 +5789,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, > * if the parent has to be frozen, the child has too. > */ > cgrp->freezer.e_freeze = parent->freezer.e_freeze; > + seqcount_init(&cgrp->freezer.freeze_seq); > if (cgrp->freezer.e_freeze) { > /* > * Set the CGRP_FREEZE flag, so when a process will be > @@ -5771,6 +5798,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, > * consider it frozen immediately. 
*/ > set_bit(CGRP_FREEZE, &cgrp->flags); > + cgrp->freezer.freeze_start_nsec = ktime_get_ns(); > set_bit(CGRP_FROZEN, &cgrp->flags); > } > > diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c > index bf1690a167dd..6c18854bff34 100644 > --- a/kernel/cgroup/freezer.c > +++ b/kernel/cgroup/freezer.c > @@ -171,7 +171,7 @@ static void cgroup_freeze_task(struct task_struct *task, bool freeze) > /* > * Freeze or unfreeze all tasks in the given cgroup. > */ > -static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze) > +static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze, u64 ts_nsec) > { > struct css_task_iter it; > struct task_struct *task; > @@ -179,10 +179,16 @@ static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze) > lockdep_assert_held(&cgroup_mutex); > > spin_lock_irq(&css_set_lock); > - if (freeze) > + write_seqcount_begin(&cgrp->freezer.freeze_seq); > + if (freeze) { > set_bit(CGRP_FREEZE, &cgrp->flags); > - else > + cgrp->freezer.freeze_start_nsec = ts_nsec; > + } else { > clear_bit(CGRP_FREEZE, &cgrp->flags); > + cgrp->freezer.frozen_nsec += (ts_nsec - > + cgrp->freezer.freeze_start_nsec); > + } > + write_seqcount_end(&cgrp->freezer.freeze_seq); > spin_unlock_irq(&css_set_lock); > Hello Tiffany, I wanted to check whether there are any specific considerations behind how the ts_nsec value is supplied. Would it be possible to obtain the timestamp directly within the cgroup_do_freeze function rather than passing it as a parameter? This approach might simplify the implementation and could potentially improve timing accuracy when a cgroup has lots of descendants. 
-- Best regards, Ridong > if (freeze) > @@ -260,6 +266,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze) > struct cgroup *parent; > struct cgroup *dsct; > bool applied = false; > + u64 ts_nsec; > bool old_e; > > lockdep_assert_held(&cgroup_mutex); > @@ -271,6 +278,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze) > return; > > cgrp->freezer.freeze = freeze; > + ts_nsec = ktime_get_ns(); > > /* > * Propagate changes downwards the cgroup tree. > @@ -298,7 +306,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze) > /* > * Do change actual state: freeze or unfreeze. > */ > - cgroup_do_freeze(dsct, freeze); > + cgroup_do_freeze(dsct, freeze, ts_nsec); > applied = true; > } >