[PATCH 1/4] writeback: Avoid contention on wb->list_lock when switching inodes

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



There can be multiple inode switch works that are trying to switch
inodes to / from the same wb. This can happen in particular if some
cgroup exits which owns many (thousands) inodes and we need to switch
them all. In this case several inode_switch_wbs_work_fn() instances will
be just spinning on the same wb->list_lock while only one of them makes
forward progress. This wastes CPU cycles and quickly leads to softlockup
reports and unusable system.

Instead of running several inode_switch_wbs_work_fn() instances in
parallel switching to the same wb and contending on wb->list_lock, run
just one instance and let the other isw items switching to this wb queue
behind the one being processed.

Signed-off-by: Jan Kara <jack@xxxxxxx>
---
 fs/fs-writeback.c                | 42 +++++++++++++++++++++++++++++---
 include/linux/backing-dev-defs.h |  5 +++-
 mm/backing-dev.c                 |  2 ++
 3 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index a07b8cf73ae2..3f3e6efd5d78 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -369,6 +369,8 @@ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
 
 struct inode_switch_wbs_context {
 	struct rcu_work		work;
+	/* List of queued switching contexts for new_wb */
+	struct list_head	list;
 
 	/*
 	 * Multiple inodes can be switched at once.  The switching procedure
@@ -486,10 +488,8 @@ static bool inode_do_switch_wbs(struct inode *inode,
 	return switched;
 }
 
-static void inode_switch_wbs_work_fn(struct work_struct *work)
+static void process_inode_switch_wbs_work(struct inode_switch_wbs_context *isw)
 {
-	struct inode_switch_wbs_context *isw =
-		container_of(to_rcu_work(work), struct inode_switch_wbs_context, work);
 	struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]);
 	struct bdi_writeback *old_wb = isw->inodes[0]->i_wb;
 	struct bdi_writeback *new_wb = isw->new_wb;
@@ -539,10 +539,44 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
 	for (inodep = isw->inodes; *inodep; inodep++)
 		iput(*inodep);
 	wb_put(new_wb);
-	kfree(isw);
 	atomic_dec(&isw_nr_in_flight);
 }
 
+static void inode_switch_wbs_work_fn(struct work_struct *work)
+{
+	struct inode_switch_wbs_context *isw =
+		container_of(to_rcu_work(work), struct inode_switch_wbs_context, work);
+	struct bdi_writeback *new_wb = isw->new_wb;
+	bool switch_running;
+
+	spin_lock_irq(&new_wb->work_lock);
+	switch_running = !list_empty(&new_wb->switch_wbs_ctxs);
+	list_add_tail(&isw->list, &new_wb->switch_wbs_ctxs);
+	spin_unlock_irq(&new_wb->work_lock);
+
+	/*
+	 * Let's leave the real work for the running worker since we'd just
+	 * contend with it on wb->list_lock anyway.
+	 */
+	if (switch_running)
+		return;
+
+	/* OK, we will be doing the switching work */
+	wb_get(new_wb);
+	spin_lock_irq(&new_wb->work_lock);
+	while (!list_empty(&new_wb->switch_wbs_ctxs)) {
+		isw = list_first_entry(&new_wb->switch_wbs_ctxs,
+				       struct inode_switch_wbs_context, list);
+		spin_unlock_irq(&new_wb->work_lock);
+		process_inode_switch_wbs_work(isw);
+		spin_lock_irq(&new_wb->work_lock);
+		list_del(&isw->list);
+		kfree(isw);
+	}
+	spin_unlock_irq(&new_wb->work_lock);
+	wb_put(new_wb);
+}
+
 static bool inode_prepare_wbs_switch(struct inode *inode,
 				     struct bdi_writeback *new_wb)
 {
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 2ad261082bba..f94fd458c248 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -136,7 +136,8 @@ struct bdi_writeback {
 	int dirty_exceeded;
 	enum wb_reason start_all_reason;
 
-	spinlock_t work_lock;		/* protects work_list & dwork scheduling */
+	spinlock_t work_lock;		/* protects work_list,
+					   dwork scheduling, and switch_wbs_ctxs */
 	struct list_head work_list;
 	struct delayed_work dwork;	/* work item used for writeback */
 	struct delayed_work bw_dwork;	/* work item used for bandwidth estimate */
@@ -152,6 +153,8 @@ struct bdi_writeback {
 	struct list_head blkcg_node;	/* anchored at blkcg->cgwb_list */
 	struct list_head b_attached;	/* attached inodes, protected by list_lock */
 	struct list_head offline_node;	/* anchored at offline_cgwbs */
+	struct list_head switch_wbs_ctxs;	/* queued contexts for
+						 * writeback switching */
 
 	union {
 		struct work_struct release_work;
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 783904d8c5ef..ce0aa7d03cc5 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -709,6 +709,7 @@ static int cgwb_create(struct backing_dev_info *bdi,
 	wb->memcg_css = memcg_css;
 	wb->blkcg_css = blkcg_css;
 	INIT_LIST_HEAD(&wb->b_attached);
+	INIT_LIST_HEAD(&wb->switch_wbs_ctxs);
 	INIT_WORK(&wb->release_work, cgwb_release_workfn);
 	set_bit(WB_registered, &wb->state);
 	bdi_get(bdi);
@@ -839,6 +840,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
 	if (!ret) {
 		bdi->wb.memcg_css = &root_mem_cgroup->css;
 		bdi->wb.blkcg_css = blkcg_root_css;
+		INIT_LIST_HEAD(&bdi->wb.switch_wbs_ctxs);
 	}
 	return ret;
 }
-- 
2.51.0





[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [NTFS 3]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [NTFS 3]     [Samba]     [Device Mapper]     [CEPH Development]

  Powered by Linux