A deadlock can occur because handle_reply() calls ceph_get_inode() before it takes mdsc->snap_rwsem: 1) handle_reply() calls ceph_get_inode(), gets a new inode with I_NEW set, and then blocks waiting to take mdsc->snap_rwsem for write. 2) At the same time, ceph_readdir_prepopulate() calls ceph_get_inode() for the same inode while holding mdsc->snap_rwsem for read, and blocks waiting for I_NEW to be cleared. Neither thread can make progress: the first owns I_NEW and waits for the rwsem, while the second holds the rwsem and waits for I_NEW. This is an ABBA deadlock between mdsc->snap_rwsem and the I_NEW bit. The issue was introduced by commit bca9fc14c70f ("ceph: when filling trace, call ceph_get_inode outside of mutexes"), which attempted to avoid a deadlock involving ceph_check_caps(). That concern is obsolete since commit 6a92b08fdad2 ("ceph: don't take s_mutex or snap_rwsem in ceph_check_caps"), after which ceph_check_caps() no longer takes s_mutex or snap_rwsem. Revert most of bca9fc14c70f to resolve the new deadlock, with a few minor adjustments to fit the current codebase. Link: https://tracker.ceph.com/issues/72307 Signed-off-by: Zhao Sun <sunzhao03@xxxxxxxxxxxx> --- fs/ceph/inode.c | 26 ++++++++++++++++++++++---- fs/ceph/mds_client.c | 29 ----------------------------- 2 files changed, 22 insertions(+), 33 deletions(-) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 06cd2963e41e..d0f0035ee117 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1623,10 +1623,28 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) } if (rinfo->head->is_target) { - /* Should be filled in by handle_reply */ - BUG_ON(!req->r_target_inode); + in = xchg(&req->r_new_inode, NULL); + tvino.ino = le64_to_cpu(rinfo->targeti.in->ino); + tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid); + + /* + * If we ended up opening an existing inode, discard + * r_new_inode + */ + if (req->r_op == CEPH_MDS_OP_CREATE && + !req->r_reply_info.has_create_ino) { + /* This should never happen on an async create */ + WARN_ON_ONCE(req->r_deleg_ino); + iput(in); + in = NULL; + } + + in = ceph_get_inode(mdsc->fsc->sb, tvino, in); + if (IS_ERR(in)) { + err = PTR_ERR(in); + goto done; + } - in = 
req->r_target_inode; err = ceph_fill_inode(in, req->r_locked_page, &rinfo->targeti, NULL, session, (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) && @@ -1636,13 +1654,13 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) if (err < 0) { pr_err_client(cl, "badness %p %llx.%llx\n", in, ceph_vinop(in)); - req->r_target_inode = NULL; if (in->i_state & I_NEW) discard_new_inode(in); else iput(in); goto done; } + req->r_target_inode = in; if (in->i_state & I_NEW) unlock_new_inode(in); } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 230e0c3f341f..8b70f2b96f46 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3874,36 +3874,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) session->s_con.peer_features); mutex_unlock(&mdsc->mutex); - /* Must find target inode outside of mutexes to avoid deadlocks */ rinfo = &req->r_reply_info; - if ((err >= 0) && rinfo->head->is_target) { - struct inode *in = xchg(&req->r_new_inode, NULL); - struct ceph_vino tvino = { - .ino = le64_to_cpu(rinfo->targeti.in->ino), - .snap = le64_to_cpu(rinfo->targeti.in->snapid) - }; - - /* - * If we ended up opening an existing inode, discard - * r_new_inode - */ - if (req->r_op == CEPH_MDS_OP_CREATE && - !req->r_reply_info.has_create_ino) { - /* This should never happen on an async create */ - WARN_ON_ONCE(req->r_deleg_ino); - iput(in); - in = NULL; - } - - in = ceph_get_inode(mdsc->fsc->sb, tvino, in); - if (IS_ERR(in)) { - err = PTR_ERR(in); - mutex_lock(&session->s_mutex); - goto out_err; - } - req->r_target_inode = in; - } - mutex_lock(&session->s_mutex); if (err < 0) { pr_err_client(cl, "got corrupt reply mds%d(tid:%lld)\n", -- 2.39.2 (Apple Git-143)