On Tue, Mar 28, 2017 at 06:13:16PM +0200, Jan Kara wrote:
> Currently we free fsnotify_mark_connector structure only when inode /
> vfsmount is getting freed. This can however impose noticeable memory
> overhead when marks get attached to inodes only temporarily. So free the
> connector structure once the last mark is detached from the object.
> Since notification infrastructure can be working with the connector
> under the protection of fsnotify_mark_srcu, we have to be careful and
> free the fsnotify_mark_connector only after SRCU period passes.
Hmm, I'm thinking about the corner case when connector gets created and removed
repeatedly. Is this a potential DoS problem? Should perhaps creating new
connectors be throttled when the destroy_list becomes too large?
Probably not a big issue, but perhaps needs to be addressed in some way at the
end of the series...
>
> Reviewed-by: Amir Goldstein <amir73il@xxxxxxxxx>
> Signed-off-by: Jan Kara <jack@xxxxxxx>
> ---
> fs/inode.c | 3 -
> fs/mount.h | 2 +-
> fs/namespace.c | 3 -
> fs/notify/fsnotify.c | 9 ++-
> fs/notify/fsnotify.h | 10 +--
> fs/notify/inode_mark.c | 2 +-
> fs/notify/mark.c | 137 ++++++++++++++++++++++++++++-----------
> fs/notify/vfsmount_mark.c | 2 +-
> include/linux/fs.h | 2 +-
> include/linux/fsnotify_backend.h | 11 ++--
> kernel/auditsc.c | 6 +-
> 11 files changed, 123 insertions(+), 64 deletions(-)
>
> diff --git a/fs/inode.c b/fs/inode.c
> index 750e952d2918..131b2bcebc48 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -234,9 +234,6 @@ void __destroy_inode(struct inode *inode)
> inode_detach_wb(inode);
> security_inode_free(inode);
> fsnotify_inode_delete(inode);
> -#ifdef CONFIG_FSNOTIFY
> - fsnotify_connector_free(&inode->i_fsnotify_marks);
> -#endif
> locks_free_lock_context(inode);
> if (!inode->i_nlink) {
> WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0);
> diff --git a/fs/mount.h b/fs/mount.h
> index bc409360a03b..bf1fda6eed8f 100644
> --- a/fs/mount.h
> +++ b/fs/mount.h
> @@ -59,7 +59,7 @@ struct mount {
> struct mountpoint *mnt_mp; /* where is it mounted */
> struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */
> #ifdef CONFIG_FSNOTIFY
> - struct fsnotify_mark_connector *mnt_fsnotify_marks;
> + struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
> __u32 mnt_fsnotify_mask;
> #endif
> int mnt_id; /* mount identifier */
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 2625e1d97a3a..b3b115bd4e1e 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -1108,9 +1108,6 @@ static void cleanup_mnt(struct mount *mnt)
> if (unlikely(mnt->mnt_pins.first))
> mnt_pin_kill(mnt);
> fsnotify_vfsmount_delete(&mnt->mnt);
> -#ifdef CONFIG_FSNOTIFY
> - fsnotify_connector_free(&mnt->mnt_fsnotify_marks);
> -#endif
> dput(mnt->mnt.mnt_root);
> deactivate_super(mnt->mnt.mnt_sb);
> mnt_free_id(mnt);
> diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
> index eae621a18ac9..d512ef9f75fc 100644
> --- a/fs/notify/fsnotify.c
> +++ b/fs/notify/fsnotify.c
> @@ -228,7 +228,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
>
> if ((mask & FS_MODIFY) ||
> (test_mask & to_tell->i_fsnotify_mask)) {
> - inode_conn = lockless_dereference(to_tell->i_fsnotify_marks);
> + inode_conn = srcu_dereference(to_tell->i_fsnotify_marks,
> + &fsnotify_mark_srcu);
> if (inode_conn)
> inode_node = srcu_dereference(inode_conn->list.first,
> &fsnotify_mark_srcu);
> @@ -236,11 +237,13 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
>
> if (mnt && ((mask & FS_MODIFY) ||
> (test_mask & mnt->mnt_fsnotify_mask))) {
> - inode_conn = lockless_dereference(to_tell->i_fsnotify_marks);
> + inode_conn = srcu_dereference(to_tell->i_fsnotify_marks,
> + &fsnotify_mark_srcu);
> if (inode_conn)
> inode_node = srcu_dereference(inode_conn->list.first,
> &fsnotify_mark_srcu);
> - vfsmount_conn = lockless_dereference(mnt->mnt_fsnotify_marks);
> + vfsmount_conn = srcu_dereference(mnt->mnt_fsnotify_marks,
> + &fsnotify_mark_srcu);
> if (vfsmount_conn)
> vfsmount_node = srcu_dereference(
> vfsmount_conn->list.first,
> diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
> index 510f027bdf0f..72050b75ca8c 100644
> --- a/fs/notify/fsnotify.h
> +++ b/fs/notify/fsnotify.h
> @@ -20,19 +20,19 @@ extern int fsnotify_compare_groups(struct fsnotify_group *a,
>
> /* Find mark belonging to given group in the list of marks */
> extern struct fsnotify_mark *fsnotify_find_mark(
> - struct fsnotify_mark_connector *conn,
> - struct fsnotify_group *group);
> + struct fsnotify_mark_connector __rcu **connp,
> + struct fsnotify_group *group);
> /* Destroy all marks connected via given connector */
> -extern void fsnotify_destroy_marks(struct fsnotify_mark_connector *conn);
> +extern void fsnotify_destroy_marks(struct fsnotify_mark_connector __rcu **connp);
> /* run the list of all marks associated with inode and destroy them */
> static inline void fsnotify_clear_marks_by_inode(struct inode *inode)
> {
> - fsnotify_destroy_marks(inode->i_fsnotify_marks);
> + fsnotify_destroy_marks(&inode->i_fsnotify_marks);
> }
> /* run the list of all marks associated with vfsmount and destroy them */
> static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
> {
> - fsnotify_destroy_marks(real_mount(mnt)->mnt_fsnotify_marks);
> + fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks);
> }
> /* prepare for freeing all marks associated with given group */
> extern void fsnotify_detach_group_marks(struct fsnotify_group *group);
> diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
> index 080b6d8b9973..b9370316727e 100644
> --- a/fs/notify/inode_mark.c
> +++ b/fs/notify/inode_mark.c
> @@ -50,7 +50,7 @@ void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group)
> struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group,
> struct inode *inode)
> {
> - return fsnotify_find_mark(inode->i_fsnotify_marks, group);
> + return fsnotify_find_mark(&inode->i_fsnotify_marks, group);
> }
>
> /**
> diff --git a/fs/notify/mark.c b/fs/notify/mark.c
> index 72afa0aed313..3bb012d68eb2 100644
> --- a/fs/notify/mark.c
> +++ b/fs/notify/mark.c
> @@ -89,10 +89,14 @@ struct kmem_cache *fsnotify_mark_connector_cachep;
>
> static DEFINE_SPINLOCK(destroy_lock);
> static LIST_HEAD(destroy_list);
> +static struct fsnotify_mark_connector *connector_destroy_list;
>
> static void fsnotify_mark_destroy_workfn(struct work_struct *work);
> static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy_workfn);
>
> +static void fsnotify_connector_destroy_workfn(struct work_struct *work);
> +static DECLARE_WORK(connector_reaper_work, fsnotify_connector_destroy_workfn);
> +
> void fsnotify_get_mark(struct fsnotify_mark *mark)
> {
> atomic_inc(&mark->refcnt);
> @@ -139,23 +143,63 @@ void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
> __fsnotify_update_child_dentry_flags(conn->inode);
> }
>
> +/* Free all connectors queued for freeing once SRCU period ends */
> +static void fsnotify_connector_destroy_workfn(struct work_struct *work)
> +{
> + struct fsnotify_mark_connector *conn, *free;
> +
> + spin_lock(&destroy_lock);
> + conn = connector_destroy_list;
> + connector_destroy_list = NULL;
> + spin_unlock(&destroy_lock);
> +
> + synchronize_srcu(&fsnotify_mark_srcu);
> + while (conn) {
> + free = conn;
> + conn = conn->destroy_next;
> + kmem_cache_free(fsnotify_mark_connector_cachep, free);
> + }
> +}
> +
> static struct inode *fsnotify_detach_from_object(struct fsnotify_mark *mark)
> {
> struct fsnotify_mark_connector *conn;
> struct inode *inode = NULL;
> + bool free_conn = false;
>
> conn = mark->connector;
> spin_lock(&conn->lock);
> hlist_del_init_rcu(&mark->obj_list);
> if (hlist_empty(&conn->list)) {
> - if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE)
> + if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) {
> inode = conn->inode;
> + rcu_assign_pointer(inode->i_fsnotify_marks, NULL);
> + inode->i_fsnotify_mask = 0;
> + conn->inode = NULL;
> + conn->flags &= ~FSNOTIFY_OBJ_TYPE_INODE;
> + } else if (conn->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) {
> + rcu_assign_pointer(
> + real_mount(conn->mnt)->mnt_fsnotify_marks,
> + NULL);
> + real_mount(conn->mnt)->mnt_fsnotify_mask = 0;
> + conn->mnt = NULL;
> + conn->flags &= ~FSNOTIFY_OBJ_TYPE_VFSMOUNT;
> + }
> + free_conn = true;
> } else {
> __fsnotify_recalc_mask(conn);
> }
> mark->connector = NULL;
> spin_unlock(&conn->lock);
>
> + if (free_conn) {
> + spin_lock(&destroy_lock);
> + conn->destroy_next = connector_destroy_list;
> + connector_destroy_list = conn;
> + spin_unlock(&destroy_lock);
> + queue_work(system_unbound_wq, &connector_reaper_work);
> + }
> +
> return inode;
> }
>
> @@ -260,14 +304,6 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark,
> fsnotify_free_mark(mark);
> }
>
> -void fsnotify_connector_free(struct fsnotify_mark_connector **connp)
> -{
> - if (*connp) {
> - kmem_cache_free(fsnotify_mark_connector_cachep, *connp);
> - *connp = NULL;
> - }
> -}
> -
> void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask)
> {
> assert_spin_locked(&mark->lock);
> @@ -319,9 +355,9 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b)
> }
>
> static int fsnotify_attach_connector_to_object(
> - struct fsnotify_mark_connector **connp,
> - struct inode *inode,
> - struct vfsmount *mnt)
> + struct fsnotify_mark_connector __rcu **connp,
> + struct inode *inode,
> + struct vfsmount *mnt)
> {
> struct fsnotify_mark_connector *conn;
>
> @@ -332,7 +368,7 @@ static int fsnotify_attach_connector_to_object(
> INIT_HLIST_HEAD(&conn->list);
> if (inode) {
> conn->flags = FSNOTIFY_OBJ_TYPE_INODE;
> - conn->inode = inode;
> + conn->inode = igrab(inode);
> } else {
> conn->flags = FSNOTIFY_OBJ_TYPE_VFSMOUNT;
> conn->mnt = mnt;
> @@ -343,6 +379,8 @@ static int fsnotify_attach_connector_to_object(
> */
> if (cmpxchg(connp, NULL, conn)) {
> /* Someone else created list structure for us */
> + if (inode)
> + iput(inode);
> kmem_cache_free(fsnotify_mark_connector_cachep, conn);
> }
>
> @@ -350,6 +388,34 @@ static int fsnotify_attach_connector_to_object(
> }
>
> /*
> + * Get mark connector, make sure it is alive and return with its lock held.
> + * This is for users that get connector pointer from inode or mount. Users that
> + * hold reference to a mark on the list may directly lock connector->lock as
> + * they are sure list cannot go away under them.
> + */
> +static struct fsnotify_mark_connector *fsnotify_grab_connector(
> + struct fsnotify_mark_connector __rcu **connp)
> +{
> + struct fsnotify_mark_connector *conn;
> + int idx;
> +
> + idx = srcu_read_lock(&fsnotify_mark_srcu);
> + conn = srcu_dereference(*connp, &fsnotify_mark_srcu);
> + if (!conn)
> + goto out;
> + spin_lock(&conn->lock);
> + if (!(conn->flags & (FSNOTIFY_OBJ_TYPE_INODE |
> + FSNOTIFY_OBJ_TYPE_VFSMOUNT))) {
> + spin_unlock(&conn->lock);
> + srcu_read_unlock(&fsnotify_mark_srcu, idx);
> + return NULL;
> + }
> +out:
> + srcu_read_unlock(&fsnotify_mark_srcu, idx);
> + return conn;
> +}
> +
> +/*
> * Add mark into proper place in given list of marks. These marks may be used
> * for the fsnotify backend to determine which event types should be delivered
> * to which group and for which inodes. These marks are ordered according to
> @@ -361,7 +427,7 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark,
> {
> struct fsnotify_mark *lmark, *last = NULL;
> struct fsnotify_mark_connector *conn;
> - struct fsnotify_mark_connector **connp;
> + struct fsnotify_mark_connector __rcu **connp;
> int cmp;
> int err = 0;
>
> @@ -371,21 +437,20 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark,
> connp = &inode->i_fsnotify_marks;
> else
> connp = &real_mount(mnt)->mnt_fsnotify_marks;
> -
> - if (!*connp) {
> +restart:
> + spin_lock(&mark->lock);
> + conn = fsnotify_grab_connector(connp);
> + if (!conn) {
> + spin_unlock(&mark->lock);
> err = fsnotify_attach_connector_to_object(connp, inode, mnt);
> if (err)
> return err;
> + goto restart;
> }
> - spin_lock(&mark->lock);
> - conn = *connp;
> - spin_lock(&conn->lock);
>
> /* is mark the first mark? */
> if (hlist_empty(&conn->list)) {
> hlist_add_head_rcu(&mark->obj_list, &conn->list);
> - if (inode)
> - igrab(inode);
> goto added;
> }
>
> @@ -487,15 +552,17 @@ int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group,
> * Given a list of marks, find the mark associated with given group. If found
> * take a reference to that mark and return it, else return NULL.
> */
> -struct fsnotify_mark *fsnotify_find_mark(struct fsnotify_mark_connector *conn,
> - struct fsnotify_group *group)
> +struct fsnotify_mark *fsnotify_find_mark(
> + struct fsnotify_mark_connector __rcu **connp,
> + struct fsnotify_group *group)
> {
> + struct fsnotify_mark_connector *conn;
> struct fsnotify_mark *mark;
>
> + conn = fsnotify_grab_connector(connp);
> if (!conn)
> return NULL;
>
> - spin_lock(&conn->lock);
> hlist_for_each_entry(mark, &conn->list, obj_list) {
> if (mark->group == group) {
> fsnotify_get_mark(mark);
> @@ -573,26 +640,20 @@ void fsnotify_detach_group_marks(struct fsnotify_group *group)
> }
> }
>
> -void fsnotify_destroy_marks(struct fsnotify_mark_connector *conn)
> +/* Destroy all marks attached to inode / vfsmount */
> +void fsnotify_destroy_marks(struct fsnotify_mark_connector __rcu **connp)
> {
> + struct fsnotify_mark_connector *conn;
> struct fsnotify_mark *mark;
>
> - if (!conn)
> - return;
> -
> - while (1) {
> + while ((conn = fsnotify_grab_connector(connp))) {
> /*
> * We have to be careful since we can race with e.g.
> - * fsnotify_clear_marks_by_group() and once we drop 'lock',
> - * mark can get removed from the obj_list and destroyed. But
> - * we are holding mark reference so mark cannot be freed and
> - * calling fsnotify_destroy_mark() more than once is fine.
> + * fsnotify_clear_marks_by_group() and once we drop the list
> + * lock, mark can get removed from the obj_list and destroyed.
> + * But we are holding mark reference so mark cannot be freed
> + * and calling fsnotify_destroy_mark() more than once is fine.
> */
> - spin_lock(&conn->lock);
> - if (hlist_empty(&conn->list)) {
> - spin_unlock(&conn->lock);
> - break;
> - }
> mark = hlist_entry(conn->list.first, struct fsnotify_mark,
> obj_list);
> fsnotify_get_mark(mark);
> diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
> index 26da5c209944..dd5f3fcbccfb 100644
> --- a/fs/notify/vfsmount_mark.c
> +++ b/fs/notify/vfsmount_mark.c
> @@ -48,5 +48,5 @@ struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group,
> {
> struct mount *m = real_mount(mnt);
>
> - return fsnotify_find_mark(m->mnt_fsnotify_marks, group);
> + return fsnotify_find_mark(&m->mnt_fsnotify_marks, group);
> }
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 66e52342be2d..c0b6150c5fcc 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -647,7 +647,7 @@ struct inode {
>
> #ifdef CONFIG_FSNOTIFY
> __u32 i_fsnotify_mask; /* all events this inode cares about */
> - struct fsnotify_mark_connector *i_fsnotify_marks;
> + struct fsnotify_mark_connector __rcu *i_fsnotify_marks;
> #endif
>
> #if IS_ENABLED(CONFIG_FS_ENCRYPTION)
> diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
> index 02c6fac652a4..84d71b6f75f6 100644
> --- a/include/linux/fsnotify_backend.h
> +++ b/include/linux/fsnotify_backend.h
> @@ -197,8 +197,8 @@ struct fsnotify_group {
> /*
> * Inode / vfsmount point to this structure which tracks all marks attached to
> * the inode / vfsmount. The reference to inode / vfsmount is held by this
> - * structure whenever the list is non-empty. The structure is freed only when
> - * inode / vfsmount gets freed.
> + * structure. We destroy this structure when there are no more marks attached
> + * to it. The structure is protected by fsnotify_mark_srcu.
> */
> struct fsnotify_mark_connector {
> spinlock_t lock;
> @@ -209,7 +209,11 @@ struct fsnotify_mark_connector {
> struct inode *inode;
> struct vfsmount *mnt;
> };
> - struct hlist_head list;
> + union {
> + struct hlist_head list;
> + /* Used listing heads to free after srcu period expires */
> + struct fsnotify_mark_connector *destroy_next;
> + };
> };
>
> /*
> @@ -361,7 +365,6 @@ extern void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
> extern void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group);
> /* run all the marks in a group, and clear all of the marks attached to given object type */
> extern void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, unsigned int flags);
> -extern void fsnotify_connector_free(struct fsnotify_mark_connector **connp);
> extern void fsnotify_get_mark(struct fsnotify_mark *mark);
> extern void fsnotify_put_mark(struct fsnotify_mark *mark);
> extern void fsnotify_unmount_inodes(struct super_block *sb);
> diff --git a/kernel/auditsc.c b/kernel/auditsc.c
> index bf7b7ca295d0..d383c33540af 100644
> --- a/kernel/auditsc.c
> +++ b/kernel/auditsc.c
> @@ -1597,8 +1597,7 @@ static inline void handle_one(const struct inode *inode)
> struct audit_tree_refs *p;
> struct audit_chunk *chunk;
> int count;
> - if (likely(!inode->i_fsnotify_marks ||
> - hlist_empty(&inode->i_fsnotify_marks->list)))
> + if (likely(!inode->i_fsnotify_marks))
> return;
> context = current->audit_context;
> p = context->trees;
> @@ -1641,8 +1640,7 @@ static void handle_path(const struct dentry *dentry)
> seq = read_seqbegin(&rename_lock);
> for(;;) {
> struct inode *inode = d_backing_inode(d);
> - if (inode && unlikely(inode->i_fsnotify_marks &&
> - !hlist_empty(&inode->i_fsnotify_marks->list))) {
> + if (inode && unlikely(inode->i_fsnotify_marks)) {
> struct audit_chunk *chunk;
> chunk = audit_tree_lookup(inode);
> if (chunk) {
> --
> 2.10.2
>