Improve writeback throughput by introducing parallelism within a
bdi_writeback. Instead of a single delayed_work, create NUM_WB
(currently 4) work items, each backed by a dedicated 'struct
delayed_work' and an associated backpointer to the parent
'bdi_writeback'. All writeback wakeups, regular or delayed, are now
scheduled in a round-robin fashion across these work items.

The underlying inode lists (b_dirty, b_io, b_more_io, b_dirty_time)
remain shared and protected by the existing locking, so no changes are
needed there. This approach allows multiple threads to process dirty
inodes in parallel while retaining the simplicity of a single worklist.

Performance gains:
- On PMEM: single thread: 155 MiB/s, 4 threads: 342 MiB/s (+120%)
- On NVMe: single thread: 111 MiB/s, 4 threads: 139 MiB/s (+25%)

Signed-off-by: Kundan Kumar <kundan.kumar@xxxxxxxxxxx>
Signed-off-by: Anuj Gupta <anuj20.g@xxxxxxxxxxx>
---
 fs/fs-writeback.c                | 49 ++++++++++++++++++++++++++------
 include/linux/backing-dev-defs.h | 10 ++++++-
 mm/backing-dev.c                 | 16 ++++++++---
 3 files changed, 62 insertions(+), 13 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index cc57367fb641..761423b5cc1e 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -133,11 +133,24 @@ static bool inode_io_list_move_locked(struct inode *inode,
 	return false;
 }
 
-static void wb_wakeup(struct bdi_writeback *wb)
+static void wb_wakeup_work(struct bdi_writeback *wb,
+			   struct delayed_work *dwork)
 {
 	spin_lock_irq(&wb->work_lock);
 	if (test_bit(WB_registered, &wb->state))
-		mod_delayed_work(bdi_wq, &wb->dwork, 0);
+		mod_delayed_work(bdi_wq, dwork, 0);
+	spin_unlock_irq(&wb->work_lock);
+}
+
+static void wb_wakeup(struct bdi_writeback *wb)
+{
+	spin_lock_irq(&wb->work_lock);
+	if (test_bit(WB_registered, &wb->state)) {
+		mod_delayed_work(bdi_wq,
+				 &wb->wb_dwork[wb->wb_idx].dwork,
+				 0);
+		wb->wb_idx = (wb->wb_idx + 1) % NUM_WB;
+	}
 	spin_unlock_irq(&wb->work_lock);
 }
 
@@ -159,10 +172,26 @@ static void wb_wakeup_delayed(struct bdi_writeback *wb)
 {
 	unsigned long timeout;
 
+	timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
+	spin_lock_irq(&wb->work_lock);
+	if (test_bit(WB_registered, &wb->state)) {
+		queue_delayed_work(bdi_wq,
+				   &wb->wb_dwork[wb->wb_idx].dwork,
+				   timeout);
+		wb->wb_idx = (wb->wb_idx + 1) % NUM_WB;
+	}
+	spin_unlock_irq(&wb->work_lock);
+}
+
+static void wb_wakeup_delayed_work(struct bdi_writeback *wb,
+				   struct delayed_work *dwork)
+{
+	unsigned long timeout;
+
 	timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
 	spin_lock_irq(&wb->work_lock);
 	if (test_bit(WB_registered, &wb->state))
-		queue_delayed_work(bdi_wq, &wb->dwork, timeout);
+		queue_delayed_work(bdi_wq, dwork, timeout);
 	spin_unlock_irq(&wb->work_lock);
 }
 
@@ -193,7 +222,10 @@ static void wb_queue_work(struct bdi_writeback *wb,
 
 	if (test_bit(WB_registered, &wb->state)) {
 		list_add_tail(&work->list, &wb->work_list);
-		mod_delayed_work(bdi_wq, &wb->dwork, 0);
+		mod_delayed_work(bdi_wq,
+				 &wb->wb_dwork[wb->wb_idx].dwork,
+				 0);
+		wb->wb_idx = (wb->wb_idx + 1) % NUM_WB;
 	} else
 		finish_writeback_work(work);
 
@@ -2325,8 +2357,9 @@ static long wb_do_writeback(struct bdi_writeback *wb)
  */
 void wb_workfn(struct work_struct *work)
 {
-	struct bdi_writeback *wb = container_of(to_delayed_work(work),
-						struct bdi_writeback, dwork);
+	struct delayed_work *p_dwork = to_delayed_work(work);
+	struct mul_dwork *md = container_of(p_dwork, struct mul_dwork, dwork);
+	struct bdi_writeback *wb = md->p_wb;
 	long pages_written;
 
 	set_worker_desc("flush-%s", bdi_dev_name(wb->bdi));
@@ -2355,9 +2388,9 @@ void wb_workfn(struct work_struct *work)
 	}
 
 	if (!list_empty(&wb->work_list))
-		wb_wakeup(wb);
+		wb_wakeup_work(wb, p_dwork);
 	else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
-		wb_wakeup_delayed(wb);
+		wb_wakeup_delayed_work(wb, p_dwork);
 }
 
 /*
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 2ad261082bba..d099b8846c43 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -18,6 +18,8 @@ struct page;
 struct device;
 struct dentry;
 
+#define NUM_WB 4
+
 /*
  * Bits in bdi_writeback.state
  */
@@ -80,6 +82,11 @@ struct wb_completion {
 #define DEFINE_WB_COMPLETION(cmpl, bdi)	\
 	struct wb_completion cmpl = WB_COMPLETION_INIT(bdi)
 
+struct mul_dwork {
+	struct delayed_work dwork;
+	struct bdi_writeback *p_wb;
+};
+
 /*
  * Each wb (bdi_writeback) can perform writeback operations, is measured
  * and throttled, independently.  Without cgroup writeback, each bdi
@@ -138,7 +145,8 @@ struct bdi_writeback {
 
 	spinlock_t work_lock;		/* protects work_list & dwork scheduling */
 	struct list_head work_list;
-	struct delayed_work dwork;	/* work item used for writeback */
+	struct mul_dwork wb_dwork[NUM_WB];	/* multiple dworks */
+	int wb_idx;
 	struct delayed_work bw_dwork;	/* work item used for bandwidth estimate */
 
 	struct list_head bdi_node;	/* anchored at bdi->wb_list */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index e61bbb1bd622..e1b5e5ddd4eb 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -536,7 +536,11 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
 
 	spin_lock_init(&wb->work_lock);
 	INIT_LIST_HEAD(&wb->work_list);
-	INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
+	wb->wb_idx = 0;
+	for (int i = 0; i < NUM_WB; i++) {
+		INIT_DELAYED_WORK(&wb->wb_dwork[i].dwork, wb_workfn);
+		wb->wb_dwork[i].p_wb = wb;
+	}
 	INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);
 
 	err = fprop_local_init_percpu(&wb->completions, gfp);
@@ -571,15 +575,19 @@ static void wb_shutdown(struct bdi_writeback *wb)
 	 * tells wb_workfn() that @wb is dying and its work_list needs to
	 * be drained no matter what.
	 */
-	mod_delayed_work(bdi_wq, &wb->dwork, 0);
-	flush_delayed_work(&wb->dwork);
+	for (int i = 0; i < NUM_WB; i++) {
+		mod_delayed_work(bdi_wq, &wb->wb_dwork[i].dwork, 0);
+		flush_delayed_work(&wb->wb_dwork[i].dwork);
+	}
 	WARN_ON(!list_empty(&wb->work_list));
 	flush_delayed_work(&wb->bw_dwork);
 }
 
 static void wb_exit(struct bdi_writeback *wb)
 {
-	WARN_ON(delayed_work_pending(&wb->dwork));
+	for (int i = 0; i < NUM_WB; i++)
+		WARN_ON(delayed_work_pending(&wb->wb_dwork[i].dwork));
+
 	percpu_counter_destroy_many(wb->stat, NR_WB_STAT_ITEMS);
 	fprop_local_destroy_percpu(&wb->completions);
 }
-- 
2.25.1
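
[Editor's note, not part of the patch] For readers unfamiliar with the two
mechanisms the commit message describes, here is a minimal, self-contained
user-space sketch: a per-wb array of 'struct mul_dwork' items dispatched
round-robin, and the container_of()-based recovery of the parent
'bdi_writeback' that wb_workfn() now performs. Names mirror the patch
(NUM_WB, mul_dwork, wb_idx), but 'struct delayed_work' and 'struct
bdi_writeback' below are reduced stand-ins, not the kernel definitions.

	#include <stddef.h>
	#include <stdio.h>

	#define NUM_WB 4

	/* Same computation the kernel's container_of() performs. */
	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct delayed_work {			/* stand-in for the kernel type */
		int pending;
	};

	struct bdi_writeback;

	struct mul_dwork {
		struct delayed_work dwork;	/* embedded work item */
		struct bdi_writeback *p_wb;	/* backpointer to the parent wb */
	};

	struct bdi_writeback {			/* stand-in for the kernel type */
		struct mul_dwork wb_dwork[NUM_WB];
		int wb_idx;			/* next work item, round-robin */
	};

	/* Dispatch side: pick the next work item and advance the index. */
	static struct delayed_work *wb_pick_dwork(struct bdi_writeback *wb)
	{
		struct delayed_work *dwork = &wb->wb_dwork[wb->wb_idx].dwork;

		wb->wb_idx = (wb->wb_idx + 1) % NUM_WB;
		return dwork;
	}

	/* Worker side: recover the parent wb from the embedded delayed_work. */
	static struct bdi_writeback *wb_from_dwork(struct delayed_work *dwork)
	{
		struct mul_dwork *md = container_of(dwork, struct mul_dwork, dwork);

		return md->p_wb;
	}

	int main(void)
	{
		struct bdi_writeback wb = { .wb_idx = 0 };

		for (int i = 0; i < NUM_WB; i++)
			wb.wb_dwork[i].p_wb = &wb;

		for (int i = 0; i < 6; i++) {
			struct delayed_work *dwork = wb_pick_dwork(&wb);
			struct mul_dwork *md = container_of(dwork, struct mul_dwork, dwork);

			/* Both recovery paths lead back to the same parent. */
			printf("dispatch %d -> slot %d, parent %p\n",
			       i, (int)(md - wb.wb_dwork), (void *)wb_from_dwork(dwork));
		}
		return 0;
	}

Because the wakeup paths advance wb_idx under wb->work_lock in the patch,
consecutive wakeups land on different work items and can run on different
kworkers, while every worker still drains the same shared work_list and
inode lists.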