On Fri, 2025-08-08 at 15:08 +0800, Zhao Sun wrote: > A deadlock can occur when ceph_get_inode is called outside of locks: > > 1) handle_reply calls ceph_get_inode, gets a new inode with I_NEW, > and blocks on mdsc->snap_rwsem for write. > Frankly speaking, it's hard to follow your logic. Which particular mdsc->snap_rwsem lock do you mean in handle_reply()? > 2) At the same time, ceph_readdir_prepopulate calls ceph_get_inode > for the same inode while holding mdsc->snap_rwsem for read, > and blocks on I_NEW. > The same here. Which particular mdsc->snap_rwsem lock do you mean in ceph_readdir_prepopulate()? > This causes an ABBA deadlock between mdsc->snap_rwsem and the I_NEW bit. > > The issue was introduced by commit bca9fc14c70f > ("ceph: when filling trace, call ceph_get_inode outside of mutexes") > which attempted to avoid a deadlock involving ceph_check_caps. > > That concern is now obsolete since commit 6a92b08fdad2 > ("ceph: don't take s_mutex or snap_rwsem in ceph_check_caps") > which made ceph_check_caps fully lock-free. > > This patch primarily reverts bca9fc14c70f to resolve the new deadlock, > with a few minor adjustments to fit the current codebase. > I assume that you hit the issue. I believe it would be good to have an explanation of which use-case/workload triggers the issue and which symptoms you see (system log's content, for example). Thanks, Slava. 
> Link: https://tracker.ceph.com/issues/72307 > Signed-off-by: Zhao Sun <sunzhao03@xxxxxxxxxxxx> > --- > fs/ceph/inode.c | 26 ++++++++++++++++++++++---- > fs/ceph/mds_client.c | 29 ----------------------------- > 2 files changed, 22 insertions(+), 33 deletions(-) > > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c > index 06cd2963e41e..d0f0035ee117 100644 > --- a/fs/ceph/inode.c > +++ b/fs/ceph/inode.c > @@ -1623,10 +1623,28 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) > } > > if (rinfo->head->is_target) { > - /* Should be filled in by handle_reply */ > - BUG_ON(!req->r_target_inode); > + in = xchg(&req->r_new_inode, NULL); > + tvino.ino = le64_to_cpu(rinfo->targeti.in->ino); > + tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid); > + > + /* > + * If we ended up opening an existing inode, discard > + * r_new_inode > + */ > + if (req->r_op == CEPH_MDS_OP_CREATE && > + !req->r_reply_info.has_create_ino) { > + /* This should never happen on an async create */ > + WARN_ON_ONCE(req->r_deleg_ino); > + iput(in); > + in = NULL; > + } > + > + in = ceph_get_inode(mdsc->fsc->sb, tvino, in); > + if (IS_ERR(in)) { > + err = PTR_ERR(in); > + goto done; > + } > > - in = req->r_target_inode; > err = ceph_fill_inode(in, req->r_locked_page, &rinfo->targeti, > NULL, session, > (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) && > @@ -1636,13 +1654,13 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) > if (err < 0) { > pr_err_client(cl, "badness %p %llx.%llx\n", in, > ceph_vinop(in)); > - req->r_target_inode = NULL; > if (in->i_state & I_NEW) > discard_new_inode(in); > else > iput(in); > goto done; > } > + req->r_target_inode = in; > if (in->i_state & I_NEW) > unlock_new_inode(in); > } > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c > index 230e0c3f341f..8b70f2b96f46 100644 > --- a/fs/ceph/mds_client.c > +++ b/fs/ceph/mds_client.c > @@ -3874,36 +3874,7 @@ static void handle_reply(struct ceph_mds_session *session, 
struct ceph_msg *msg) > session->s_con.peer_features); > mutex_unlock(&mdsc->mutex); > > - /* Must find target inode outside of mutexes to avoid deadlocks */ > rinfo = &req->r_reply_info; > - if ((err >= 0) && rinfo->head->is_target) { > - struct inode *in = xchg(&req->r_new_inode, NULL); > - struct ceph_vino tvino = { > - .ino = le64_to_cpu(rinfo->targeti.in->ino), > - .snap = le64_to_cpu(rinfo->targeti.in->snapid) > - }; > - > - /* > - * If we ended up opening an existing inode, discard > - * r_new_inode > - */ > - if (req->r_op == CEPH_MDS_OP_CREATE && > - !req->r_reply_info.has_create_ino) { > - /* This should never happen on an async create */ > - WARN_ON_ONCE(req->r_deleg_ino); > - iput(in); > - in = NULL; > - } > - > - in = ceph_get_inode(mdsc->fsc->sb, tvino, in); > - if (IS_ERR(in)) { > - err = PTR_ERR(in); > - mutex_lock(&session->s_mutex); > - goto out_err; > - } > - req->r_target_inode = in; > - } > - > mutex_lock(&session->s_mutex); > if (err < 0) { > pr_err_client(cl, "got corrupt reply mds%d(tid:%lld)\n",