There can be multiple inode switch works that are trying to switch inodes to / from the same wb. This can happen in particular if some cgroup which owns many (thousands of) inodes exits and we need to switch them all. In this case several inode_switch_wbs_work_fn() instances will be just spinning on the same wb->list_lock while only one of them makes forward progress. This wastes CPU cycles and quickly leads to softlockup reports and an unusable system. Instead of running several inode_switch_wbs_work_fn() instances in parallel switching to the same wb and contending on wb->list_lock, run just one instance and let the other isw items switching to this wb queue behind the one being processed. Signed-off-by: Jan Kara <jack@xxxxxxx> --- fs/fs-writeback.c | 42 +++++++++++++++++++++++++++++--- include/linux/backing-dev-defs.h | 5 +++- mm/backing-dev.c | 2 ++ 3 files changed, 44 insertions(+), 5 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index a07b8cf73ae2..3f3e6efd5d78 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -369,6 +369,8 @@ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode) struct inode_switch_wbs_context { struct rcu_work work; + /* List of queued switching contexts for new_wb */ + struct list_head list; /* * Multiple inodes can be switched at once. 
The switching procedure @@ -486,10 +488,8 @@ static bool inode_do_switch_wbs(struct inode *inode, return switched; } -static void inode_switch_wbs_work_fn(struct work_struct *work) +static void process_inode_switch_wbs_work(struct inode_switch_wbs_context *isw) { - struct inode_switch_wbs_context *isw = - container_of(to_rcu_work(work), struct inode_switch_wbs_context, work); struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]); struct bdi_writeback *old_wb = isw->inodes[0]->i_wb; struct bdi_writeback *new_wb = isw->new_wb; @@ -539,10 +539,44 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) for (inodep = isw->inodes; *inodep; inodep++) iput(*inodep); wb_put(new_wb); - kfree(isw); atomic_dec(&isw_nr_in_flight); } +static void inode_switch_wbs_work_fn(struct work_struct *work) +{ + struct inode_switch_wbs_context *isw = + container_of(to_rcu_work(work), struct inode_switch_wbs_context, work); + struct bdi_writeback *new_wb = isw->new_wb; + bool switch_running; + + spin_lock_irq(&new_wb->work_lock); + switch_running = !list_empty(&new_wb->switch_wbs_ctxs); + list_add_tail(&isw->list, &new_wb->switch_wbs_ctxs); + spin_unlock_irq(&new_wb->work_lock); + + /* + * Let's leave the real work for the running worker since we'd just + * contend with it on wb->list_lock anyway. 
+ */ + if (switch_running) + return; + + /* OK, we will be doing the switching work */ + wb_get(new_wb); + spin_lock_irq(&new_wb->work_lock); + while (!list_empty(&new_wb->switch_wbs_ctxs)) { + isw = list_first_entry(&new_wb->switch_wbs_ctxs, + struct inode_switch_wbs_context, list); + spin_unlock_irq(&new_wb->work_lock); + process_inode_switch_wbs_work(isw); + spin_lock_irq(&new_wb->work_lock); + list_del(&isw->list); + kfree(isw); + } + spin_unlock_irq(&new_wb->work_lock); + wb_put(new_wb); +} + static bool inode_prepare_wbs_switch(struct inode *inode, struct bdi_writeback *new_wb) { diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 2ad261082bba..f94fd458c248 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -136,7 +136,8 @@ struct bdi_writeback { int dirty_exceeded; enum wb_reason start_all_reason; - spinlock_t work_lock; /* protects work_list & dwork scheduling */ + spinlock_t work_lock; /* protects work_list, + dwork scheduling, and switch_wbs_ctxs */ struct list_head work_list; struct delayed_work dwork; /* work item used for writeback */ struct delayed_work bw_dwork; /* work item used for bandwidth estimate */ @@ -152,6 +153,8 @@ struct bdi_writeback { struct list_head blkcg_node; /* anchored at blkcg->cgwb_list */ struct list_head b_attached; /* attached inodes, protected by list_lock */ struct list_head offline_node; /* anchored at offline_cgwbs */ + struct list_head switch_wbs_ctxs; /* queued contexts for + * writeback switching */ union { struct work_struct release_work; diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 783904d8c5ef..ce0aa7d03cc5 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -709,6 +709,7 @@ static int cgwb_create(struct backing_dev_info *bdi, wb->memcg_css = memcg_css; wb->blkcg_css = blkcg_css; INIT_LIST_HEAD(&wb->b_attached); + INIT_LIST_HEAD(&wb->switch_wbs_ctxs); INIT_WORK(&wb->release_work, cgwb_release_workfn); set_bit(WB_registered, 
&wb->state); bdi_get(bdi); @@ -839,6 +840,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) if (!ret) { bdi->wb.memcg_css = &root_mem_cgroup->css; bdi->wb.blkcg_css = blkcg_root_css; + INIT_LIST_HEAD(&bdi->wb.switch_wbs_ctxs); } return ret; } -- 2.51.0