Re: [External] Re: [PATCH] memcg: Don't wait writeback completion when release memcg.

Julian Sun <sunjunchao@xxxxxxxxxxxxx> · Fri, 22 Aug 2025 16:22:09 +0800

On 8/22/25 3:01 AM, Tejun Heo wrote:

Hi,

Hello,

On Fri, Aug 22, 2025 at 02:00:10AM +0800, Julian Sun wrote:
...
Do you mean logic like this?

     for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
         wb_wait_for_completion(&memcg->cgwb_frn[i].done);
     kfree(memcg);

But task hang issues will still exist as long as
wb_wait_for_completion() exists.

Ah, right. I was just thinking about the workqueue being stalled. The
problem is that the wait itself is too long.

I think the scope of impact of the current changes should be
manageable. I have checked all the other places where wb_queue_work()
is called, and their free_done values are all 0, and I also tested
this patch with the reproducer in [1] with kasan and kmemleak enabled.
The test result looks fine, so this should not have a significant
impact.
What do you think?

My source of reluctance is that it's a peculiar situation where flushing of
a cgroup takes that long due to hard throttling and the self-freeing
mechanism isn't the prettiest thing. Do you think you can do the same thing
through a custom waitq wakeup function?

Yeah, this method looks more general if I understand correctly.

If the idea of the following code makes sense to you, I'd like to split
and convert it into formal patches.

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index a07b8cf73ae2..10fede792178 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -172,13 +172,8 @@ static void finish_writeback_work(struct wb_writeback_work *work)

 	if (work->auto_free)
 		kfree(work);
-	if (done) {
-		wait_queue_head_t *waitq = done->waitq;
-
-		/* @done can't be accessed after the following dec */
-		if (atomic_dec_and_test(&done->cnt))
-			wake_up_all(waitq);
-	}
+	if (done)
+		done->wb_waitq->wb_wakeup_func(done->wb_waitq, done);
 }

 static void wb_queue_work(struct bdi_writeback *wb,
@@ -213,7 +208,7 @@ static void wb_queue_work(struct bdi_writeback *wb,
 void wb_wait_for_completion(struct wb_completion *done)
 {
 	atomic_dec(&done->cnt);		/* put down the initial count */
-	wait_event(*done->waitq, !atomic_read(&done->cnt));
+	wait_event(done->wb_waitq->waitq, !atomic_read(&done->cnt));
 }

 #ifdef CONFIG_CGROUP_WRITEBACK
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 2ad261082bba..04699458ac50 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -60,13 +60,56 @@ enum wb_reason {
 	WB_REASON_MAX,
 };

+struct wb_completion;
+typedef struct wb_wait_queue_head wb_wait_queue_head_t;
+typedef void (*wb_wait_wakeup_func_t)(wb_wait_queue_head_t *wq_waitq,
+									  struct wb_completion *done);
+struct wb_wait_queue_head {
+	wait_queue_head_t waitq;
+	wb_wait_wakeup_func_t wb_wakeup_func;
+};
+
 struct wb_completion {
 	atomic_t		cnt;
-	wait_queue_head_t	*waitq;
+	wb_wait_queue_head_t	*wb_waitq;
 };

+static inline void wb_default_wakeup_func(wb_wait_queue_head_t *wq_waitq,
+										  struct wb_completion *done)
+{
+	/* @done can't be accessed after the following dec */
+	if (atomic_dec_and_test(&done->cnt))
+		wake_up_all(&wq_waitq->waitq);
+}
+
+/* used for cgwb_frn, be careful here, @done can't be accessed */
+static inline void wb_empty_wakeup_func(wb_wait_queue_head_t *wq_waitq,
+										struct wb_completion *done)
+{
+}
+
+#define __init_wb_waitqueue_head(wb_waitq, func) 	\
+	do {											\
+		init_waitqueue_head(&wb_waitq.waitq);		\
+		wb_waitq.wb_wakeup_func = func; 			\
+	} while (0)
+
+#define init_wb_waitqueue_head(wb_waitq) 	\
+	__init_wb_waitqueue_head(wb_waitq, wb_default_wakeup_func)
+
+#define __WB_WAIT_QUEUE_HEAD_INITIALIZER(name, func) {	\
+	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(name.waitq),	\
+	.wb_wakeup_func = func, 							\
+}
+
+#define __DECLARE_WB_WAIT_QUEUE_HEAD(name, func) \
+	struct wb_wait_queue_head name = __WB_WAIT_QUEUE_HEAD_INITIALIZER(name, func)
+
+#define DECLARE_WB_WAIT_QUEUE_HEAD(name) \
+	__DECLARE_WB_WAIT_QUEUE_HEAD(name, wb_default_wakeup_func)
+
 #define __WB_COMPLETION_INIT(_waitq)	\
-	(struct wb_completion){ .cnt = ATOMIC_INIT(1), .waitq = (_waitq) }
+	(struct wb_completion){ .cnt = ATOMIC_INIT(1), .wb_waitq = (_waitq) }

 /*
  * If one wants to wait for one or more wb_writeback_works, each work's
@@ -190,7 +233,7 @@ struct backing_dev_info {
 	struct mutex cgwb_release_mutex;  /* protect shutdown of wb structs */
 	struct rw_semaphore wb_switch_rwsem; /* no cgwb switch while syncing */
 #endif
-	wait_queue_head_t wb_waitq;
+	wb_wait_queue_head_t wb_waitq;

 	struct device *dev;
 	char dev_name[64];
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 783904d8c5ef..c4fec9e22978 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1008,7 +1008,7 @@ int bdi_init(struct backing_dev_info *bdi)
 	bdi->max_prop_frac = FPROP_FRAC_BASE;
 	INIT_LIST_HEAD(&bdi->bdi_list);
 	INIT_LIST_HEAD(&bdi->wb_list);
-	init_waitqueue_head(&bdi->wb_waitq);
+	init_wb_waitqueue_head(bdi->wb_waitq);
 	bdi->last_bdp_sleep = jiffies;

 	return cgwb_bdi_init(bdi);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8dd7fbed5a94..999624535470 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -99,7 +99,7 @@ static struct kmem_cache *memcg_cachep;
 static struct kmem_cache *memcg_pn_cachep;

 #ifdef CONFIG_CGROUP_WRITEBACK
-static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
+static __DECLARE_WB_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq, wb_empty_wakeup_func);
 #endif

 static inline bool task_is_dying(void)
@@ -3909,12 +3909,7 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
-	int __maybe_unused i;

-#ifdef CONFIG_CGROUP_WRITEBACK
-	for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
-		wb_wait_for_completion(&memcg->cgwb_frn[i].done);
-#endif
 	if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
 		static_branch_dec(&memcg_sockets_enabled_key);




Thanks.


Thanks,
--
Julian Sun <sunjunchao@xxxxxxxxxxxxx>