Re: [PATCH 1/2] ceph: fix client race condition validating r_parent before applying state

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Wed, 2025-07-30 at 15:18 +0000, Alex Markuze wrote:
> Add validation to ensure the cached parent directory inode matches the
> directory info in MDS replies. This prevents client-side race conditions
> where concurrent operations (e.g. rename) cause r_parent to become stale
> between request initiation and reply processing, which could lead to
> applying state changes to incorrect directory inodes.
> ---
>  fs/ceph/mds_client.c | 67 +++++++++++++++++++++++++++++++-------------
>  1 file changed, 47 insertions(+), 20 deletions(-)
> 
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 8d9fc5e18b17..a164783fc1e1 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -2853,7 +2853,7 @@ char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, struct dentry *dentry,
>  
>  static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry,
>  			     struct inode *dir, const char **ppath, int *ppathlen,
> -			     u64 *pino, bool *pfreepath, bool parent_locked)
> +			     struct ceph_vino *pvino, bool *pfreepath, bool parent_locked)

We have too much arguments here. It looks like we need to introduce some
structure.

>  {
>  	char *path;
>  
> @@ -2862,23 +2862,29 @@ static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry
>  		dir = d_inode_rcu(dentry->d_parent);
>  	if (dir && parent_locked && ceph_snap(dir) == CEPH_NOSNAP &&
>  	    !IS_ENCRYPTED(dir)) {
> -		*pino = ceph_ino(dir);
> +		pvino->ino = ceph_ino(dir);
> +		pvino->snap = ceph_snap(dir);
>  		rcu_read_unlock();
>  		*ppath = dentry->d_name.name;
>  		*ppathlen = dentry->d_name.len;
>  		return 0;
>  	}
>  	rcu_read_unlock();
> -	path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1);
> +	path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, &pvino->ino, 1);
>  	if (IS_ERR(path))
>  		return PTR_ERR(path);
>  	*ppath = path;
>  	*pfreepath = true;
> +	/* For paths built by ceph_mdsc_build_path, we need to get snap from dentry */

I believe the multi-line comment could be good here.

> +	if (dentry && d_inode(dentry))

Could dentry be really NULL here?

> +		pvino->snap = ceph_snap(d_inode(dentry));
> +	else
> +		pvino->snap = CEPH_NOSNAP;
>  	return 0;
>  }
>  
>  static int build_inode_path(struct inode *inode,
> -			    const char **ppath, int *ppathlen, u64 *pino,
> +			    const char **ppath, int *ppathlen, struct ceph_vino *pvino,
>  			    bool *pfreepath)

We have too much arguments here too.

>  {
>  	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
> @@ -2886,17 +2892,19 @@ static int build_inode_path(struct inode *inode,
>  	char *path;
>  
>  	if (ceph_snap(inode) == CEPH_NOSNAP) {
> -		*pino = ceph_ino(inode);
> +		pvino->ino = ceph_ino(inode);
> +		pvino->snap = ceph_snap(inode);
>  		*ppathlen = 0;
>  		return 0;
>  	}
>  	dentry = d_find_alias(inode);
> -	path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1);
> +	path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, &pvino->ino, 1);
>  	dput(dentry);
>  	if (IS_ERR(path))
>  		return PTR_ERR(path);
>  	*ppath = path;
>  	*pfreepath = true;
> +	pvino->snap = ceph_snap(inode);
>  	return 0;
>  }
>  
> @@ -2907,7 +2915,7 @@ static int build_inode_path(struct inode *inode,
>  static int set_request_path_attr(struct ceph_mds_client *mdsc, struct inode *rinode,
>  				 struct dentry *rdentry, struct inode *rdiri,
>  				 const char *rpath, u64 rino, const char **ppath,
> -				 int *pathlen, u64 *ino, bool *freepath,
> +				 int *pathlen, struct ceph_vino *p_vino, bool *freepath,
>  				 bool parent_locked)

Ditto. :)

Thanks,
Slava.

>  {
>  	struct ceph_client *cl = mdsc->fsc->client;
> @@ -2915,16 +2923,17 @@ static int set_request_path_attr(struct ceph_mds_client *mdsc, struct inode *rin
>  	int r = 0;
>  
>  	if (rinode) {
> -		r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
> +		r = build_inode_path(rinode, ppath, pathlen, p_vino, freepath);
>  		boutc(cl, " inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
> -		      ceph_snap(rinode));
> +		ceph_snap(rinode));
>  	} else if (rdentry) {
> -		r = build_dentry_path(mdsc, rdentry, rdiri, ppath, pathlen, ino,
> -					freepath, parent_locked);
> +		r = build_dentry_path(mdsc, rdentry, rdiri, ppath, pathlen, p_vino,
> +				       freepath, parent_locked);
>  		CEPH_SAN_STRNCPY(result_str, sizeof(result_str), *ppath, *pathlen);
> -		boutc(cl, " dentry %p %llx/%s\n", rdentry, *ino, result_str);
> +		boutc(cl, " dentry %p %llx/%s\n", rdentry, p_vino->ino, result_str);
>  	} else if (rpath || rino) {
> -		*ino = rino;
> +		p_vino->ino = rino;
> +		p_vino->snap = CEPH_NOSNAP;
>  		*ppath = rpath;
>  		*pathlen = rpath ? strlen(rpath) : 0;
>  		CEPH_SAN_STRNCPY(result_str, sizeof(result_str), rpath, *pathlen);
> @@ -3007,7 +3016,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
>  	struct ceph_mds_request_head_legacy *lhead;
>  	const char *path1 = NULL;
>  	const char *path2 = NULL;
> -	u64 ino1 = 0, ino2 = 0;
> +	struct ceph_vino vino1 = {0}, vino2 = {0};
>  	int pathlen1 = 0, pathlen2 = 0;
>  	bool freepath1 = false, freepath2 = false;
>  	struct dentry *old_dentry = NULL;
> @@ -3019,17 +3028,35 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
>  	u16 request_head_version = mds_supported_head_version(session);
>  	kuid_t caller_fsuid = req->r_cred->fsuid;
>  	kgid_t caller_fsgid = req->r_cred->fsgid;
> +	bool parent_locked = test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
>  
>  	ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry,
>  			      req->r_parent, req->r_path1, req->r_ino1.ino,
> -			      &path1, &pathlen1, &ino1, &freepath1,
> -			      test_bit(CEPH_MDS_R_PARENT_LOCKED,
> -					&req->r_req_flags));
> +			      &path1, &pathlen1, &vino1, &freepath1,
> +			      parent_locked);
>  	if (ret < 0) {
>  		msg = ERR_PTR(ret);
>  		goto out;
>  	}
>  
> +	/*
> +	 * When the parent directory's i_rwsem is *not* locked, req->r_parent may
> +	 * have become stale (e.g. after a concurrent rename) between the time the
> +	 * dentry was looked up and now.  If we detect that the stored r_parent
> +	 * does not match the inode number we just encoded for the request, switch
> +	 * to the correct inode so that the MDS receives a valid parent reference.
> +	 */
> +	if (!parent_locked &&
> +	    req->r_parent && vino1.ino && ceph_ino(req->r_parent) != vino1.ino) {
> +		struct inode *correct_dir = ceph_get_inode(mdsc->fsc->sb, vino1, NULL);
> +		if (!IS_ERR(correct_dir)) {
> +			WARN(1, "ceph: r_parent mismatch (had %llx wanted %llx) - updating\n",
> +			     ceph_ino(req->r_parent), vino1.ino);
> +			iput(req->r_parent);
> +			req->r_parent = correct_dir;
> +		}
> +	}
> +
>  	/* If r_old_dentry is set, then assume that its parent is locked */
>  	if (req->r_old_dentry &&
>  	    !(req->r_old_dentry->d_flags & DCACHE_DISCONNECTED))
> @@ -3037,7 +3064,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
>  	ret = set_request_path_attr(mdsc, NULL, old_dentry,
>  			      req->r_old_dentry_dir,
>  			      req->r_path2, req->r_ino2.ino,
> -			      &path2, &pathlen2, &ino2, &freepath2, true);
> +			      &path2, &pathlen2, &vino2, &freepath2, true);
>  	if (ret < 0) {
>  		msg = ERR_PTR(ret);
>  		goto out_free1;
> @@ -3191,8 +3218,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
>  	lhead->ino = cpu_to_le64(req->r_deleg_ino);
>  	lhead->args = req->r_args;
>  
> -	ceph_encode_filepath(&p, end, ino1, path1);
> -	ceph_encode_filepath(&p, end, ino2, path2);
> +	ceph_encode_filepath(&p, end, vino1.ino, path1);
> +	ceph_encode_filepath(&p, end, vino2.ino, path2);
>  
>  	/* make note of release offset, in case we need to replay */
>  	req->r_request_release_offset = p - msg->front.iov_base;




[Index of Archives]     [CEPH Users]     [Ceph Large]     [Ceph Dev]     [Information on CEPH]     [Linux BTRFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux