Re: [PATCH RFC] fs/ceph : fix build warning in ceph_writepages_start()

Alex Markuze <amarkuze@xxxxxxxxxx> · Mon, 7 Jul 2025 22:57:11 +0300

Well, kmem_cache is a bit faster AFAIK. TBH, I would use a magazine
allocator here which would bridge the two, but that's a different
discussion.
We already have an allocation of a big struct in the read path as well
where we just kmalloc and it should change as well to
kmemcache/mempool.

On Mon, Jul 7, 2025 at 9:24 PM Viacheslav Dubeyko <Slava.Dubeyko@xxxxxxx> wrote:
>
> On Mon, 2025-07-07 at 13:11 +0300, Alex Markuze wrote:
> > Hi Abinash,
> >
> > Thanks for your patch, you’ve correctly identified a real concern
> > around stack usage in ceph_writepages_start().
> >
> > However, dynamically allocating ceph_writeback_ctl inside .writepages
> > isn't ideal. This function may be called in memory reclaim paths or
> > under writeback pressure, where any GFP allocation (even GFP_NOFS)
> > could deadlock or fail unexpectedly.
> >
> > Instead of allocating the struct on each call, I’d suggest one of the following:
> >
> > Add a dedicated kmem_cache for ceph_writeback_ctl, initialized during
> > Ceph FS client init.
> > This allows reuse across calls without hitting the slab allocator each time.
> >
>
> I had discussion with Ilya several days ago related to this issue. :) I
> considered namely to add the dedicated kmem_cache for ceph_writeback_ctl.
> However, Ilya mentioned that CONFIG_FRAME_WARN defaulting to 2048 on x86_64. So,
> you even could not see such warning on x86_64 systems. But, yes, this rework
> makes sense from my point of view.
>
> By the way, do we need to consider mempool here instead of kmem_cache? The
> writeback is pretty intensive and critical code path.
>
> Thanks,
> Slava.
>
> > Embed a ceph_writeback_ctl struct inside ceph_inode_info, if it's
> > feasible to manage lifetimes and synchronization. Note that
> > .writepages can run concurrently on the same inode, so this approach
> > would require proper locking or reuse guarantees.
> >
> > On Sat, Jul 5, 2025 at 6:54 PM Abinash Singh <abinashlalotra@xxxxxxxxx> wrote:
> > >
> > > The function `ceph_writepages_start()` triggers
> > > a large stack frame warning:
> > > ld.lld: warning:
> > >         fs/ceph/addr.c:1632:0: stack frame size (1072) exceeds limit (1024) in function 'ceph_writepages_start.llvm.2555023190050417194'
> > >
> > > fix it by dynamically allocating `ceph_writeback_ctl` struct.
> > >
> > > Signed-off-by: Abinash Singh <abinashsinghlalotra@xxxxxxxxx>
> > > ---
> > > The high stack usage of ceph_writepages_start() was further
> > > confirmed by using -fstack-usage flag :
> > > ...
> > > fs/ceph/addr.c:1837:ceph_netfs_check_write_begin        104     static
> > > fs/ceph/addr.c:1630:ceph_writepages_start       648     static
> > > ...
> > > After optimzations it may go upto 1072 as seen in warning.
> > > After allocating `ceph_writeback_ctl` struct it is reduced to:
> > > ...
> > > fs/ceph/addr.c:1630:ceph_writepages_start       288     static
> > > fs/ceph/addr.c:81:ceph_dirty_folio      72      static
> > > ...
> > > Is this fun used very frequently ?? or dynamic allocation is
> > > a fine fix for reducing the stack usage ?
> > >
> > > Thank You
> > > ---
> > >  fs/ceph/addr.c | 82 ++++++++++++++++++++++++++------------------------
> > >  1 file changed, 43 insertions(+), 39 deletions(-)
> > >
> > > diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
> > > index 60a621b00c65..503a86c1dda6 100644
> > > --- a/fs/ceph/addr.c
> > > +++ b/fs/ceph/addr.c
> > > @@ -1633,9 +1633,13 @@ static int ceph_writepages_start(struct address_space *mapping,
> > >         struct inode *inode = mapping->host;
> > >         struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
> > >         struct ceph_client *cl = fsc->client;
> > > -       struct ceph_writeback_ctl ceph_wbc;
> > > +       struct ceph_writeback_ctl *ceph_wbc __free(kfree) = NULL;
> > >         int rc = 0;
> > >
> > > +       ceph_wbc = kmalloc(sizeof(*ceph_wbc), GFP_NOFS);
> > > +       if (!ceph_wbc)
> > > +               return -ENOMEM;
> > > +
> > >         if (wbc->sync_mode == WB_SYNC_NONE && fsc->write_congested)
> > >                 return 0;
> > >
> > > @@ -1648,7 +1652,7 @@ static int ceph_writepages_start(struct address_space *mapping,
> > >                 return -EIO;
> > >         }
> > >
> > > -       ceph_init_writeback_ctl(mapping, wbc, &ceph_wbc);
> > > +       ceph_init_writeback_ctl(mapping, wbc, ceph_wbc);
> > >
> > >         if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) {
> > >                 rc = -EIO;
> > > @@ -1656,7 +1660,7 @@ static int ceph_writepages_start(struct address_space *mapping,
> > >         }
> > >
> > >  retry:
> > > -       rc = ceph_define_writeback_range(mapping, wbc, &ceph_wbc);
> > > +       rc = ceph_define_writeback_range(mapping, wbc, ceph_wbc);
> > >         if (rc == -ENODATA) {
> > >                 /* hmm, why does writepages get called when there
> > >                    is no dirty data? */
> > > @@ -1665,55 +1669,55 @@ static int ceph_writepages_start(struct address_space *mapping,
> > >         }
> > >
> > >         if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
> > > -               tag_pages_for_writeback(mapping, ceph_wbc.index, ceph_wbc.end);
> > > +               tag_pages_for_writeback(mapping, ceph_wbc->index, ceph_wbc->end);
> > >
> > > -       while (!has_writeback_done(&ceph_wbc)) {
> > > -               ceph_wbc.locked_pages = 0;
> > > -               ceph_wbc.max_pages = ceph_wbc.wsize >> PAGE_SHIFT;
> > > +       while (!has_writeback_done(ceph_wbc)) {
> > > +               ceph_wbc->locked_pages = 0;
> > > +               ceph_wbc->max_pages = ceph_wbc->wsize >> PAGE_SHIFT;
> > >
> > >  get_more_pages:
> > > -               ceph_folio_batch_reinit(&ceph_wbc);
> > > +               ceph_folio_batch_reinit(ceph_wbc);
> > >
> > > -               ceph_wbc.nr_folios = filemap_get_folios_tag(mapping,
> > > -                                                           &ceph_wbc.index,
> > > -                                                           ceph_wbc.end,
> > > -                                                           ceph_wbc.tag,
> > > -                                                           &ceph_wbc.fbatch);
> > > +               ceph_wbc->nr_folios = filemap_get_folios_tag(mapping,
> > > +                                                           &ceph_wbc->index,
> > > +                                                           ceph_wbc->end,
> > > +                                                           ceph_wbc->tag,
> > > +                                                           &ceph_wbc->fbatch);
> > >                 doutc(cl, "pagevec_lookup_range_tag for tag %#x got %d\n",
> > > -                       ceph_wbc.tag, ceph_wbc.nr_folios);
> > > +                       ceph_wbc->tag, ceph_wbc->nr_folios);
> > >
> > > -               if (!ceph_wbc.nr_folios && !ceph_wbc.locked_pages)
> > > +               if (!ceph_wbc->nr_folios && !ceph_wbc->locked_pages)
> > >                         break;
> > >
> > >  process_folio_batch:
> > > -               rc = ceph_process_folio_batch(mapping, wbc, &ceph_wbc);
> > > +               rc = ceph_process_folio_batch(mapping, wbc, ceph_wbc);
> > >                 if (rc)
> > >                         goto release_folios;
> > >
> > >                 /* did we get anything? */
> > > -               if (!ceph_wbc.locked_pages)
> > > +               if (!ceph_wbc->locked_pages)
> > >                         goto release_folios;
> > >
> > > -               if (ceph_wbc.processed_in_fbatch) {
> > > -                       ceph_shift_unused_folios_left(&ceph_wbc.fbatch);
> > > +               if (ceph_wbc->processed_in_fbatch) {
> > > +                       ceph_shift_unused_folios_left(&ceph_wbc->fbatch);
> > >
> > > -                       if (folio_batch_count(&ceph_wbc.fbatch) == 0 &&
> > > -                           ceph_wbc.locked_pages < ceph_wbc.max_pages) {
> > > +                       if (folio_batch_count(&ceph_wbc->fbatch) == 0 &&
> > > +                           ceph_wbc->locked_pages < ceph_wbc->max_pages) {
> > >                                 doutc(cl, "reached end fbatch, trying for more\n");
> > >                                 goto get_more_pages;
> > >                         }
> > >                 }
> > >
> > > -               rc = ceph_submit_write(mapping, wbc, &ceph_wbc);
> > > +               rc = ceph_submit_write(mapping, wbc, ceph_wbc);
> > >                 if (rc)
> > >                         goto release_folios;
> > >
> > > -               ceph_wbc.locked_pages = 0;
> > > -               ceph_wbc.strip_unit_end = 0;
> > > +               ceph_wbc->locked_pages = 0;
> > > +               ceph_wbc->strip_unit_end = 0;
> > >
> > > -               if (folio_batch_count(&ceph_wbc.fbatch) > 0) {
> > > -                       ceph_wbc.nr_folios =
> > > -                               folio_batch_count(&ceph_wbc.fbatch);
> > > +               if (folio_batch_count(&ceph_wbc->fbatch) > 0) {
> > > +                       ceph_wbc->nr_folios =
> > > +                               folio_batch_count(&ceph_wbc->fbatch);
> > >                         goto process_folio_batch;
> > >                 }
> > >
> > > @@ -1724,38 +1728,38 @@ static int ceph_writepages_start(struct address_space *mapping,
> > >                  * we tagged for writeback prior to entering this loop.
> > >                  */
> > >                 if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE)
> > > -                       ceph_wbc.done = true;
> > > +                       ceph_wbc->done = true;
> > >
> > >  release_folios:
> > >                 doutc(cl, "folio_batch release on %d folios (%p)\n",
> > > -                     (int)ceph_wbc.fbatch.nr,
> > > -                     ceph_wbc.fbatch.nr ? ceph_wbc.fbatch.folios[0] : NULL);
> > > -               folio_batch_release(&ceph_wbc.fbatch);
> > > +                     (int)ceph_wbc->fbatch.nr,
> > > +                     ceph_wbc->fbatch.nr ? ceph_wbc->fbatch.folios[0] : NULL);
> > > +               folio_batch_release(&ceph_wbc->fbatch);
> > >         }
> > >
> > > -       if (ceph_wbc.should_loop && !ceph_wbc.done) {
> > > +       if (ceph_wbc->should_loop && !ceph_wbc->done) {
> > >                 /* more to do; loop back to beginning of file */
> > >                 doutc(cl, "looping back to beginning of file\n");
> > >                 /* OK even when start_index == 0 */
> > > -               ceph_wbc.end = ceph_wbc.start_index - 1;
> > > +               ceph_wbc->end = ceph_wbc->start_index - 1;
> > >
> > >                 /* to write dirty pages associated with next snapc,
> > >                  * we need to wait until current writes complete */
> > > -               ceph_wait_until_current_writes_complete(mapping, wbc, &ceph_wbc);
> > > +               ceph_wait_until_current_writes_complete(mapping, wbc, ceph_wbc);
> > >
> > > -               ceph_wbc.start_index = 0;
> > > -               ceph_wbc.index = 0;
> > > +               ceph_wbc->start_index = 0;
> > > +               ceph_wbc->index = 0;
> > >                 goto retry;
> > >         }
> > >
> > > -       if (wbc->range_cyclic || (ceph_wbc.range_whole && wbc->nr_to_write > 0))
> > > -               mapping->writeback_index = ceph_wbc.index;
> > > +       if (wbc->range_cyclic || (ceph_wbc->range_whole && wbc->nr_to_write > 0))
> > > +               mapping->writeback_index = ceph_wbc->index;
> > >
> > >  dec_osd_stopping_blocker:
> > >         ceph_dec_osd_stopping_blocker(fsc->mdsc);
> > >
> > >  out:
> > > -       ceph_put_snap_context(ceph_wbc.last_snapc);
> > > +       ceph_put_snap_context(ceph_wbc->last_snapc);
> > >         doutc(cl, "%llx.%llx dend - startone, rc = %d\n", ceph_vinop(inode),
> > >               rc);
> > >
> > > --
> > > 2.43.0
> > >
> > >
> >