On Tue, 25 Jun 2024, Mike Snitzer wrote:
> From: Trond Myklebust <trond.myklebust@xxxxxxxxxxxxxxx>
>
> For localio access, don't call filesystem read() and write() routines
> directly.
>
> Some filesystem writeback routines can end up taking up a lot of stack
> space (particularly xfs). Instead of risking running over due to the
> extra overhead from the NFS stack, we should just call these routines
> from a workqueue job.
>
> Use of dedicated workqueues improves performance over using the
> system_unbound_wq. Localio is motivated by the promise of improved
> performance, it makes little sense to yield it back.
>
> But further analysis of the latest stack depth requirements would be
> useful. It'd be nice to root cause and fix the latest stack hogs,
> because using workqueues at all can cause a loss in performance due to
> context switches.
>
I would rather this patch were left out of the final submission, and
only added if/when we have documented evidence that it helps.
Thanks,
NeilBrown
> Signed-off-by: Trond Myklebust <trond.myklebust@xxxxxxxxxxxxxxx>
> Signed-off-by: Mike Snitzer <snitzer@xxxxxxxxxx>
> ---
> fs/nfs/inode.c | 57 +++++++++++++++++---------
> fs/nfs/internal.h | 1 +
> fs/nfs/localio.c | 102 +++++++++++++++++++++++++++++++++++-----------
> 3 files changed, 118 insertions(+), 42 deletions(-)
>
> diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
> index f9923cbf6058..aac8c5302503 100644
> --- a/fs/nfs/inode.c
> +++ b/fs/nfs/inode.c
> @@ -2394,35 +2394,54 @@ static void nfs_destroy_inodecache(void)
> kmem_cache_destroy(nfs_inode_cachep);
> }
>
> +struct workqueue_struct *nfslocaliod_workqueue;
> struct workqueue_struct *nfsiod_workqueue;
> EXPORT_SYMBOL_GPL(nfsiod_workqueue);
>
> /*
> - * start up the nfsiod workqueue
> - */
> -static int nfsiod_start(void)
> -{
> - struct workqueue_struct *wq;
> - dprintk("RPC: creating workqueue nfsiod\n");
> - wq = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
> - if (wq == NULL)
> - return -ENOMEM;
> - nfsiod_workqueue = wq;
> - return 0;
> -}
> -
> -/*
> - * Destroy the nfsiod workqueue
> + * Destroy the nfsiod workqueues
> */
> static void nfsiod_stop(void)
> {
> struct workqueue_struct *wq;
>
> wq = nfsiod_workqueue;
> - if (wq == NULL)
> - return;
> - nfsiod_workqueue = NULL;
> - destroy_workqueue(wq);
> + if (wq != NULL) {
> + nfsiod_workqueue = NULL;
> + destroy_workqueue(wq);
> + }
> +#if IS_ENABLED(CONFIG_NFS_LOCALIO)
> + wq = nfslocaliod_workqueue;
> + if (wq != NULL) {
> + nfslocaliod_workqueue = NULL;
> + destroy_workqueue(wq);
> + }
> +#endif /* CONFIG_NFS_LOCALIO */
> +}
> +
> +/*
> + * Start the nfsiod workqueues
> + */
> +static int nfsiod_start(void)
> +{
> + dprintk("RPC: creating workqueue nfsiod\n");
> + nfsiod_workqueue = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
> + if (nfsiod_workqueue == NULL)
> + return -ENOMEM;
> +#if IS_ENABLED(CONFIG_NFS_LOCALIO)
> + /*
> + * localio writes need to use a normal (non-memreclaim) workqueue.
> + * When we start getting low on space, XFS goes and calls flush_work() on
> + * a non-memreclaim work queue, which causes a priority inversion problem.
> + */
> + dprintk("RPC: creating workqueue nfslocaliod\n");
> + nfslocaliod_workqueue = alloc_workqueue("nfslocaliod", WQ_UNBOUND, 0);
> + if (unlikely(nfslocaliod_workqueue == NULL)) {
> + nfsiod_stop();
> + return -ENOMEM;
> + }
> +#endif /* CONFIG_NFS_LOCALIO */
> + return 0;
> }
>
> unsigned int nfs_net_id;
> diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
> index d352040e3232..9251a357d097 100644
> --- a/fs/nfs/internal.h
> +++ b/fs/nfs/internal.h
> @@ -440,6 +440,7 @@ int nfs_check_flags(int);
>
> /* inode.c */
> extern struct workqueue_struct *nfsiod_workqueue;
> +extern struct workqueue_struct *nfslocaliod_workqueue;
> extern struct inode *nfs_alloc_inode(struct super_block *sb);
> extern void nfs_free_inode(struct inode *);
> extern int nfs_write_inode(struct inode *, struct writeback_control *);
> diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c
> index 724e81716b16..418b8d76692b 100644
> --- a/fs/nfs/localio.c
> +++ b/fs/nfs/localio.c
> @@ -44,6 +44,12 @@ struct nfs_local_fsync_ctx {
> };
> static void nfs_local_fsync_work(struct work_struct *work);
>
> +struct nfs_local_io_args {
> + struct nfs_local_kiocb *iocb;
> + struct work_struct work;
> + struct completion *done;
> +};
> +
> /*
> * We need to translate between nfs status return values and
> * the local errno values which may not be the same.
> @@ -307,30 +313,54 @@ nfs_local_read_done(struct nfs_local_kiocb *iocb, long status)
> status > 0 ? status : 0, hdr->res.eof);
> }
>
> -static int
> -nfs_do_local_read(struct nfs_pgio_header *hdr, struct file *filp,
> - const struct rpc_call_ops *call_ops)
> +static void nfs_local_call_read(struct work_struct *work)
> {
> - struct nfs_local_kiocb *iocb;
> + struct nfs_local_io_args *args =
> + container_of(work, struct nfs_local_io_args, work);
> + struct nfs_local_kiocb *iocb = args->iocb;
> + struct file *filp = iocb->kiocb.ki_filp;
> struct iov_iter iter;
> ssize_t status;
>
> + nfs_local_iter_init(&iter, iocb, READ);
> +
> + status = filp->f_op->read_iter(&iocb->kiocb, &iter);
> + if (status != -EIOCBQUEUED) {
> + nfs_local_read_done(iocb, status);
> + nfs_local_pgio_release(iocb);
> + }
> + complete(args->done);
> +}
> +
> +static int nfs_do_local_read(struct nfs_pgio_header *hdr, struct file *filp,
> + const struct rpc_call_ops *call_ops)
> +{
> + struct nfs_local_io_args args;
> + DECLARE_COMPLETION_ONSTACK(done);
> + struct nfs_local_kiocb *iocb;
> +
> dprintk("%s: vfs_read count=%u pos=%llu\n",
> __func__, hdr->args.count, hdr->args.offset);
>
> iocb = nfs_local_iocb_alloc(hdr, filp, GFP_KERNEL);
> if (iocb == NULL)
> return -ENOMEM;
> - nfs_local_iter_init(&iter, iocb, READ);
>
> nfs_local_pgio_init(hdr, call_ops);
> hdr->res.eof = false;
>
> - status = filp->f_op->read_iter(&iocb->kiocb, &iter);
> - if (status != -EIOCBQUEUED) {
> - nfs_local_read_done(iocb, status);
> - nfs_local_pgio_release(iocb);
> - }
> + /*
> + * Don't call filesystem read() routines directly.
> + * In order to avoid issues with stack overflow,
> + * call the read routines from a workqueue job.
> + */
> + args.iocb = iocb;
> + args.done = &done;
> + INIT_WORK_ONSTACK(&args.work, nfs_local_call_read);
> + queue_work(nfslocaliod_workqueue, &args.work);
> + wait_for_completion(&done);
> + destroy_work_on_stack(&args.work);
> +
> return 0;
> }
>
> @@ -420,14 +450,35 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status)
> nfs_local_pgio_done(hdr, status);
> }
>
> -static int
> -nfs_do_local_write(struct nfs_pgio_header *hdr, struct file *filp,
> - const struct rpc_call_ops *call_ops)
> +static void nfs_local_call_write(struct work_struct *work)
> {
> - struct nfs_local_kiocb *iocb;
> + struct nfs_local_io_args *args =
> + container_of(work, struct nfs_local_io_args, work);
> + struct nfs_local_kiocb *iocb = args->iocb;
> + struct file *filp = iocb->kiocb.ki_filp;
> struct iov_iter iter;
> ssize_t status;
>
> + nfs_local_iter_init(&iter, iocb, WRITE);
> +
> + file_start_write(filp);
> + status = filp->f_op->write_iter(&iocb->kiocb, &iter);
> + file_end_write(filp);
> + if (status != -EIOCBQUEUED) {
> + nfs_local_write_done(iocb, status);
> + nfs_get_vfs_attr(filp, iocb->hdr->res.fattr);
> + nfs_local_pgio_release(iocb);
> + }
> + complete(args->done);
> +}
> +
> +static int nfs_do_local_write(struct nfs_pgio_header *hdr, struct file *filp,
> + const struct rpc_call_ops *call_ops)
> +{
> + struct nfs_local_io_args args;
> + DECLARE_COMPLETION_ONSTACK(done);
> + struct nfs_local_kiocb *iocb;
> +
> dprintk("%s: vfs_write count=%u pos=%llu %s\n",
> __func__, hdr->args.count, hdr->args.offset,
> (hdr->args.stable == NFS_UNSTABLE) ? "unstable" : "stable");
> @@ -435,7 +486,6 @@ nfs_do_local_write(struct nfs_pgio_header *hdr, struct file *filp,
> iocb = nfs_local_iocb_alloc(hdr, filp, GFP_NOIO);
> if (iocb == NULL)
> return -ENOMEM;
> - nfs_local_iter_init(&iter, iocb, WRITE);
>
> switch (hdr->args.stable) {
> default:
> @@ -450,14 +500,20 @@ nfs_do_local_write(struct nfs_pgio_header *hdr, struct file *filp,
>
> nfs_set_local_verifier(hdr->inode, hdr->res.verf, hdr->args.stable);
>
> - file_start_write(filp);
> - status = filp->f_op->write_iter(&iocb->kiocb, &iter);
> - file_end_write(filp);
> - if (status != -EIOCBQUEUED) {
> - nfs_local_write_done(iocb, status);
> - nfs_get_vfs_attr(filp, hdr->res.fattr);
> - nfs_local_pgio_release(iocb);
> - }
> + /*
> + * Don't call filesystem write() routines directly.
> + * Some filesystem writeback routines can end up taking up a lot of
> + * stack (particularly xfs). Instead of risking running over due to
> + * the extra overhead from the NFS stack, call these write routines
> + * from a workqueue job.
> + */
> + args.iocb = iocb;
> + args.done = &done;
> + INIT_WORK_ONSTACK(&args.work, nfs_local_call_write);
> + queue_work(nfslocaliod_workqueue, &args.work);
> + wait_for_completion(&done);
> + destroy_work_on_stack(&args.work);
> +
> return 0;
> }
>
> --
> 2.44.0
>
>