[RFC 1/1] writeback: enable parallel writeback using multiple work items

This patch improves writeback throughput by introducing parallelism
within a bdi_writeback. Instead of a single delayed_work, we create
NUM_WB (currently 4) work items, each consisting of a dedicated
'struct delayed_work' plus a backpointer to the parent
'bdi_writeback'.
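
The resulting layout, condensed from the hunks below:

	struct mul_dwork {
		struct delayed_work dwork;	/* one of the NUM_WB work items */
		struct bdi_writeback *p_wb;	/* back to the owning writeback */
	};

	struct bdi_writeback {
		...
		struct mul_dwork wb_dwork[NUM_WB];	/* replaces the single ->dwork */
		int wb_idx;				/* round-robin cursor */
		...
	};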

All writeback wakeups, regular or delayed, are now scheduled in
round-robin fashion across these work items. The underlying inode
lists (b_dirty, b_io, b_more_io, b_dirty_time) remain shared and
protected by the existing locking, so no changes are needed there.
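
As a standalone illustration of the dispatch pattern (a userspace
analogue with made-up names, not kernel code), the producer cycles a
cursor over per-worker wakeup slots while every worker drains the
same shared list:

/* build: cc -pthread rr_demo.c -o rr_demo */
#include <pthread.h>
#include <stdio.h>

#define NUM_WB 4

static pthread_mutex_t work_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv[NUM_WB] = {
	PTHREAD_COND_INITIALIZER, PTHREAD_COND_INITIALIZER,
	PTHREAD_COND_INITIALIZER, PTHREAD_COND_INITIALIZER
};
static int kick[NUM_WB];	/* per-worker "work is pending" flag */
static int wb_idx;		/* round-robin cursor, as in the patch */
static int shared_items = 16;	/* stand-in for the shared dirty list */

/* analogue of wb_wakeup(): kick the next worker, advance the cursor */
static void wakeup_one(void)
{
	pthread_mutex_lock(&work_lock);
	kick[wb_idx] = 1;
	pthread_cond_signal(&cv[wb_idx]);
	wb_idx = (wb_idx + 1) % NUM_WB;
	pthread_mutex_unlock(&work_lock);
}

/* analogue of wb_workfn(): any worker drains the one shared list */
static void *worker(void *arg)
{
	int id = (int)(long)arg;

	pthread_mutex_lock(&work_lock);
	while (!kick[id])
		pthread_cond_wait(&cv[id], &work_lock);
	while (shared_items > 0) {
		shared_items--;
		pthread_mutex_unlock(&work_lock);
		printf("worker %d wrote back one item\n", id);
		pthread_mutex_lock(&work_lock);
	}
	pthread_mutex_unlock(&work_lock);
	return NULL;
}

int main(void)
{
	pthread_t t[NUM_WB];

	for (long i = 0; i < NUM_WB; i++)
		pthread_create(&t[i], NULL, worker, (void *)i);
	for (int i = 0; i < NUM_WB; i++)
		wakeup_one();
	for (int i = 0; i < NUM_WB; i++)
		pthread_join(t[i], NULL);
	return 0;
}

As in the patch, only the wakeup target rotates; the worklist and the
lock guarding it stay singular.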

This approach allows multiple threads to process dirty inodes in
parallel while retaining the simplicity of a single worklist.

Performance gains:

  - On PMEM:
      Single thread: 155 MiB/s
      4 threads:     342 MiB/s  (+120%)

  - On NVMe:
      Single thread: 111 MiB/s
      4 threads:     139 MiB/s  (+25%)

Signed-off-by: Kundan Kumar <kundan.kumar@xxxxxxxxxxx>
Signed-off-by: Anuj Gupta <anuj20.g@xxxxxxxxxxx>
---
 fs/fs-writeback.c                | 49 ++++++++++++++++++++++++++------
 include/linux/backing-dev-defs.h | 10 ++++++-
 mm/backing-dev.c                 | 16 ++++++++---
 3 files changed, 62 insertions(+), 13 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index cc57367fb641..761423b5cc1e 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -133,11 +133,24 @@ static bool inode_io_list_move_locked(struct inode *inode,
 	return false;
 }
 
-static void wb_wakeup(struct bdi_writeback *wb)
+static void wb_wakeup_work(struct bdi_writeback *wb,
+		struct delayed_work *dwork)
 {
 	spin_lock_irq(&wb->work_lock);
 	if (test_bit(WB_registered, &wb->state))
-		mod_delayed_work(bdi_wq, &wb->dwork, 0);
+		mod_delayed_work(bdi_wq, dwork, 0);
+	spin_unlock_irq(&wb->work_lock);
+}
+
+static void wb_wakeup(struct bdi_writeback *wb)
+{
+	spin_lock_irq(&wb->work_lock);
+	if (test_bit(WB_registered, &wb->state)) {
+		mod_delayed_work(bdi_wq,
+				 &wb->wb_dwork[wb->wb_idx].dwork,
+				 0);
+		wb->wb_idx = (wb->wb_idx + 1) % NUM_WB;
+	}
 	spin_unlock_irq(&wb->work_lock);
 }
 
@@ -159,10 +172,26 @@ static void wb_wakeup_delayed(struct bdi_writeback *wb)
 {
 	unsigned long timeout;
 
+	timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
+	spin_lock_irq(&wb->work_lock);
+	if (test_bit(WB_registered, &wb->state)) {
+		queue_delayed_work(bdi_wq,
+				   &wb->wb_dwork[wb->wb_idx].dwork,
+				   timeout);
+		wb->wb_idx = (wb->wb_idx + 1) % NUM_WB;
+	}
+	spin_unlock_irq(&wb->work_lock);
+}
+
+static void wb_wakeup_delayed_work(struct bdi_writeback *wb,
+				   struct delayed_work *dwork)
+{
+	unsigned long timeout;
+
 	timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
 	spin_lock_irq(&wb->work_lock);
 	if (test_bit(WB_registered, &wb->state))
-		queue_delayed_work(bdi_wq, &wb->dwork, timeout);
+		queue_delayed_work(bdi_wq, dwork, timeout);
 	spin_unlock_irq(&wb->work_lock);
 }
 
@@ -193,7 +222,10 @@ static void wb_queue_work(struct bdi_writeback *wb,
 
 	if (test_bit(WB_registered, &wb->state)) {
 		list_add_tail(&work->list, &wb->work_list);
-		mod_delayed_work(bdi_wq, &wb->dwork, 0);
+		mod_delayed_work(bdi_wq,
+				 &wb->wb_dwork[wb->wb_idx].dwork,
+				 0);
+		wb->wb_idx = (wb->wb_idx + 1) % NUM_WB;
 	} else
 		finish_writeback_work(work);
 
@@ -2325,8 +2357,9 @@ static long wb_do_writeback(struct bdi_writeback *wb)
  */
 void wb_workfn(struct work_struct *work)
 {
-	struct bdi_writeback *wb = container_of(to_delayed_work(work),
-						struct bdi_writeback, dwork);
+	struct delayed_work *p_dwork = to_delayed_work(work);
+	struct mul_dwork *md = container_of(p_dwork, struct mul_dwork, dwork);
+	struct bdi_writeback *wb = md->p_wb;
 	long pages_written;
 
 	set_worker_desc("flush-%s", bdi_dev_name(wb->bdi));
@@ -2355,9 +2388,9 @@ void wb_workfn(struct work_struct *work)
 	}
 
 	if (!list_empty(&wb->work_list))
-		wb_wakeup(wb);
+		wb_wakeup_work(wb, p_dwork);
 	else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
-		wb_wakeup_delayed(wb);
+		wb_wakeup_delayed_work(wb, p_dwork);
 }
 
 /*
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 2ad261082bba..d099b8846c43 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -18,6 +18,8 @@ struct page;
 struct device;
 struct dentry;
 
+#define NUM_WB 4
+
 /*
  * Bits in bdi_writeback.state
  */
@@ -80,6 +82,11 @@ struct wb_completion {
 #define DEFINE_WB_COMPLETION(cmpl, bdi)	\
 	struct wb_completion cmpl = WB_COMPLETION_INIT(bdi)
 
+struct mul_dwork {
+	struct delayed_work dwork;
+	struct bdi_writeback *p_wb;
+};
+
 /*
  * Each wb (bdi_writeback) can perform writeback operations, is measured
  * and throttled, independently.  Without cgroup writeback, each bdi
@@ -138,7 +145,8 @@ struct bdi_writeback {
 
 	spinlock_t work_lock;		/* protects work_list & dwork scheduling */
 	struct list_head work_list;
-	struct delayed_work dwork;	/* work item used for writeback */
+	struct mul_dwork wb_dwork[NUM_WB];   /* multiple dworks */
+	int wb_idx;
 	struct delayed_work bw_dwork;	/* work item used for bandwidth estimate */
 
 	struct list_head bdi_node;	/* anchored at bdi->wb_list */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index e61bbb1bd622..e1b5e5ddd4eb 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -536,7 +536,11 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
 
 	spin_lock_init(&wb->work_lock);
 	INIT_LIST_HEAD(&wb->work_list);
-	INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
+	wb->wb_idx = 0;
+	for (int i = 0; i < NUM_WB; i++) {
+		INIT_DELAYED_WORK(&wb->wb_dwork[i].dwork, wb_workfn);
+		wb->wb_dwork[i].p_wb = wb;
+	}
 	INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);
 
 	err = fprop_local_init_percpu(&wb->completions, gfp);
@@ -571,15 +575,19 @@ static void wb_shutdown(struct bdi_writeback *wb)
 	 * tells wb_workfn() that @wb is dying and its work_list needs to
 	 * be drained no matter what.
 	 */
-	mod_delayed_work(bdi_wq, &wb->dwork, 0);
-	flush_delayed_work(&wb->dwork);
+	for (int i = 0; i < NUM_WB; i++) {
+		mod_delayed_work(bdi_wq, &wb->wb_dwork[i].dwork, 0);
+		flush_delayed_work(&wb->wb_dwork[i].dwork);
+	}
 	WARN_ON(!list_empty(&wb->work_list));
 	flush_delayed_work(&wb->bw_dwork);
 }
 
 static void wb_exit(struct bdi_writeback *wb)
 {
-	WARN_ON(delayed_work_pending(&wb->dwork));
+	for (int i = 0; i < NUM_WB; i++)
+		WARN_ON(delayed_work_pending(&wb->wb_dwork[i].dwork));
+
 	percpu_counter_destroy_many(wb->stat, NR_WB_STAT_ITEMS);
 	fprop_local_destroy_percpu(&wb->completions);
 }
-- 
2.25.1
