On 8/22/25 3:01 AM, Tejun Heo wrote:
Hi,
Hello,
On Fri, Aug 22, 2025 at 02:00:10AM +0800, Julian Sun wrote:
...
Do you mean logic like this?
for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
wb_wait_for_completion(&memcg->cgwb_frn[i].done);
kfree(memcg);
But there still exist task hang issues as long as
wb_wait_for_completion() exists.
Ah, right. I was just thinking about the workqueue being stalled. The
problem is that the wait itself is too long.
I think the scope of impact of the current changes should be
manageable. I have checked all the other places where wb_queue_work()
is called, and their free_done values are all 0, and I also tested
this patch with the reproducer in [1] with kasan and kmemleak enabled.
The test result looks fine, so this should not have a significant
impact.
What do you think?
My source of reluctance is that it's a peculiar situation where flushing of
a cgroup takes that long due to hard throttling and the self-freeing
mechanism isn't the prettiest thing. Do you think you can do the same thing
through a custom waitq wakeup function?
Yeah, this method looks more general if I understand correctly.
If the idea of the following code makes sense to you, I'd like to split
and convert it into formal patches.
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index a07b8cf73ae2..10fede792178 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -172,13 +172,8 @@ static void finish_writeback_work(struct wb_writeback_work *work)
if (work->auto_free)
kfree(work);
- if (done) {
- wait_queue_head_t *waitq = done->waitq;
-
- /* @done can't be accessed after the following dec */
- if (atomic_dec_and_test(&done->cnt))
- wake_up_all(waitq);
- }
+ if (done)
+ done->wb_waitq->wb_wakeup_func(done->wb_waitq, done);
}
static void wb_queue_work(struct bdi_writeback *wb,
@@ -213,7 +208,7 @@ static void wb_queue_work(struct bdi_writeback *wb,
void wb_wait_for_completion(struct wb_completion *done)
{
atomic_dec(&done->cnt); /* put down the initial count */
- wait_event(*done->waitq, !atomic_read(&done->cnt));
+ wait_event(done->wb_waitq->waitq, !atomic_read(&done->cnt));
}
#ifdef CONFIG_CGROUP_WRITEBACK
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 2ad261082bba..04699458ac50 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -60,13 +60,56 @@ enum wb_reason {
WB_REASON_MAX,
};
+struct wb_completion;
+typedef struct wb_wait_queue_head wb_wait_queue_head_t;
+typedef void (*wb_wait_wakeup_func_t)(wb_wait_queue_head_t *wq_waitq,
+ struct wb_completion *done);
+struct wb_wait_queue_head {
+ wait_queue_head_t waitq;
+ wb_wait_wakeup_func_t wb_wakeup_func;
+};
+
struct wb_completion {
atomic_t cnt;
- wait_queue_head_t *waitq;
+ wb_wait_queue_head_t *wb_waitq;
};
+static inline void wb_default_wakeup_func(wb_wait_queue_head_t *wq_waitq,
+ struct wb_completion *done)
+{
+ /* @done can't be accessed after the following dec */
+ if (atomic_dec_and_test(&done->cnt))
+ wake_up_all(&wq_waitq->waitq);
+}
+
+/* used for cgwb_frn, be careful here, @done can't be accessed */
+static inline void wb_empty_wakeup_func(wb_wait_queue_head_t *wq_waitq,
+ struct wb_completion *done)
+{
+}
+
+#define __init_wb_waitqueue_head(wb_waitq, func) \
+ do { \
+ init_waitqueue_head(&wb_waitq.waitq); \
+ wb_waitq.wb_wakeup_func = func; \
+ } while (0)
+
+#define init_wb_waitqueue_head(wb_waitq) \
+ __init_wb_waitqueue_head(wb_waitq, wb_default_wakeup_func)
+
+#define __WB_WAIT_QUEUE_HEAD_INITIALIZER(name, func) { \
+ .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(name.waitq), \
+ .wb_wakeup_func = func, \
+}
+
+#define __DECLARE_WB_WAIT_QUEUE_HEAD(name, func) \
+ struct wb_wait_queue_head name = __WB_WAIT_QUEUE_HEAD_INITIALIZER(name, func)
+
+#define DECLARE_WB_WAIT_QUEUE_HEAD(name) \
+ __DECLARE_WB_WAIT_QUEUE_HEAD(name, wb_default_wakeup_func)
+
#define __WB_COMPLETION_INIT(_waitq) \
- (struct wb_completion){ .cnt = ATOMIC_INIT(1), .waitq = (_waitq) }
+ (struct wb_completion){ .cnt = ATOMIC_INIT(1), .wb_waitq = (_waitq) }
/*
* If one wants to wait for one or more wb_writeback_works, each work's
@@ -190,7 +233,7 @@ struct backing_dev_info {
struct mutex cgwb_release_mutex; /* protect shutdown of wb structs */
struct rw_semaphore wb_switch_rwsem; /* no cgwb switch while syncing */
#endif
- wait_queue_head_t wb_waitq;
+ wb_wait_queue_head_t wb_waitq;
struct device *dev;
char dev_name[64];
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 783904d8c5ef..c4fec9e22978 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1008,7 +1008,7 @@ int bdi_init(struct backing_dev_info *bdi)
bdi->max_prop_frac = FPROP_FRAC_BASE;
INIT_LIST_HEAD(&bdi->bdi_list);
INIT_LIST_HEAD(&bdi->wb_list);
- init_waitqueue_head(&bdi->wb_waitq);
+ init_wb_waitqueue_head(bdi->wb_waitq);
bdi->last_bdp_sleep = jiffies;
return cgwb_bdi_init(bdi);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8dd7fbed5a94..999624535470 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -99,7 +99,7 @@ static struct kmem_cache *memcg_cachep;
static struct kmem_cache *memcg_pn_cachep;
#ifdef CONFIG_CGROUP_WRITEBACK
-static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
+static __DECLARE_WB_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq, wb_empty_wakeup_func);
#endif
static inline bool task_is_dying(void)
@@ -3909,12 +3909,7 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- int __maybe_unused i;
-#ifdef CONFIG_CGROUP_WRITEBACK
- for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
- wb_wait_for_completion(&memcg->cgwb_frn[i].done);
-#endif
if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
static_branch_dec(&memcg_sockets_enabled_key);
Thanks.
Thanks,
--
Julian Sun <sunjunchao@xxxxxxxxxxxxx>