On May 29, 2025, at 8:42 AM, David Wang <00107082@xxxxxxx> wrote: > > When searching files, ext4_readdir would kzalloc() a fname > object for each entry. It would be faster if a dedicated > kmem_cache is used for fname. > > But fnames are of variable length. > > This patch suggests using kmem_cache for fname with short > length, and resorting to kzalloc when fname needs larger buffer. > Assuming long file names are not very common. It may be reasonable to have a cache, but I suspect that 127 bytes is too large for most common workloads. Statistics that I've seen show 99-percentile filename length is <= 48 bytes on most filesystems, so allocating 128 bytes is probably sub-optimal (most is wasted). That said, kmalloc() is mostly a wrapper for the kmalloc-* slabs, so creating a 128-byte slab for this doesn't seem super useful? > Profiling when searching files in kernel code base, with following > command: > # perf record -g -e cpu-clock --freq=max bash -c \ > "for i in {1..100}; do find ./linux -name notfoundatall > /dev/null; done" > And using sample counts as indicator of performance improvement. > (The faster, the less samples collected. And the tests are carried out > when system is under no memory pressure.) > > Before without the change: > 1232868--ext4_readdir > | > |--839085--ext4_htree_fill_tree > | | > | --829223--htree_dirblock_to_tree > | | > | |--365869--ext4_htree_store_dirent > | | | > | | |--43169--0xffffffffa7f8d094 > | | | > | | --21947--0xffffffffa7f8d0f7 > | | > | |--213124--ext4fs_dirhash > | | | > | | --86339--str2hashbuf_signed > | | > | |--145839--__ext4_read_dirblock > > and with the change, ~3% less samples: > 1202922--ext4_readdir > | > |--805105--ext4_htree_fill_tree > | | > | --795055--htree_dirblock_to_tree > | | > | |--328876--ext4_htree_store_dirent > | | | > | | |--123207--kmem_cache_alloc_noprof > | | | | > | | | |--26453--__alloc_tagging_slab_alloc_hook > | | | | > | | | --20413--__slab_alloc.isra.0 > | | | > | | --31566--rb_insert_color > | | > | |--212915--ext4fs_dirhash > | | | > | | --86004--str2hashbuf_signed > | | > | |--149146--__ext4_read_dirblock > > readdir() would have sigfinicant improvement, but the overall > improvements for searching files is only ~0.5%, might be more > sigfinicant if the system is under some memory pressures. > > The slab stats after the test: > ext4_dir_fname 1242 1242 176 23 1 : tunables 0 0 0 : slabdata 54 54 0 > > Signed-off-by: David Wang <00107082@xxxxxxx> > --- > fs/ext4/dir.c | 47 ++++++++++++++++++++++++++++++++++++++++++++--- > fs/ext4/ext4.h | 4 ++++ > fs/ext4/super.c | 3 +++ > 3 files changed, 51 insertions(+), 3 deletions(-) > > diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c > index d4164c507a90..3adfa0d038cd 100644 > --- a/fs/ext4/dir.c > +++ b/fs/ext4/dir.c > @@ -424,6 +424,48 @@ struct fname { > char name[] __counted_by(name_len); > }; > > +#define EXT4_DIR_FNAME_SHORT_LENGTH 127 > +static struct kmem_cache *ext4_dir_fname_cachep; > + > +void __init ext4_init_dir(void) > +{ > + ext4_dir_fname_cachep = kmem_cache_create_usercopy("ext4_dir_fname", > + struct_size_t(struct fname, name, EXT4_DIR_FNAME_SHORT_LENGTH + 1), > + 0, SLAB_RECLAIM_ACCOUNT, > + offsetof(struct fname, name), EXT4_DIR_FNAME_SHORT_LENGTH + 1, > + NULL); > +} > + > +void ext4_exit_dir(void) > +{ > + if (ext4_dir_fname_cachep) > + kmem_cache_destroy(ext4_dir_fname_cachep); > +} > + > +static struct fname *rb_node_fname_zalloc(__u8 name_len) > +{ > + struct fname *p; > + if (ext4_dir_fname_cachep && name_len <= EXT4_DIR_FNAME_SHORT_LENGTH) > + p = kmem_cache_alloc(ext4_dir_fname_cachep, GFP_KERNEL); > + else > + p = kmalloc(struct_size(p, name, name_len + 1), GFP_KERNEL); > + if (p) { > + /* no need to fill name with zeroes*/ > + memset(p, 0, offsetof(struct fname, name)); > + p->name[name_len] = 0; > + } > + return p; > +} > + > +static void rb_node_fname_free(struct fname *p) { > + if (!p) > + return; > + if (ext4_dir_fname_cachep && p->name_len <= EXT4_DIR_FNAME_SHORT_LENGTH) > + kmem_cache_free(ext4_dir_fname_cachep, p); > + else > + kfree(p); > +} > + > /* > * This function implements a non-recursive way of freeing all of the > * nodes in the red-black tree. > @@ -436,7 +478,7 @@ static void free_rb_tree_fname(struct rb_root *root) > while (fname) { > struct fname *old = fname; > fname = fname->next; > - kfree(old); > + rb_node_fname_free(old); > } > > *root = RB_ROOT; > @@ -479,8 +521,7 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, > p = &info->root.rb_node; > > /* Create and allocate the fname structure */ > - new_fn = kzalloc(struct_size(new_fn, name, ent_name->len + 1), > - GFP_KERNEL); > + new_fn = rb_node_fname_zalloc(ent_name->len); > if (!new_fn) > return -ENOMEM; > new_fn->hash = hash; > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h > index 5a20e9cd7184..33ab97143000 100644 > --- a/fs/ext4/ext4.h > +++ b/fs/ext4/ext4.h > @@ -3795,6 +3795,10 @@ extern void ext4_orphan_file_block_trigger( > struct buffer_head *bh, > void *data, size_t size); > > +/* dir.c */ > +extern void __init ext4_init_dir(void); > +extern void ext4_exit_dir(void); > + > /* > * Add new method to test whether block and inode bitmaps are properly > * initialized. With uninit_bg reading the block from disk is not enough > diff --git a/fs/ext4/super.c b/fs/ext4/super.c > index 181934499624..21ce3d78912a 100644 > --- a/fs/ext4/super.c > +++ b/fs/ext4/super.c > @@ -7457,6 +7457,8 @@ static int __init ext4_init_fs(void) > if (err) > goto out; > > + ext4_init_dir(); > + > return 0; > out: > unregister_as_ext2(); > @@ -7497,6 +7499,7 @@ static void __exit ext4_exit_fs(void) > ext4_exit_post_read_processing(); > ext4_exit_es(); > ext4_exit_pending(); > + ext4_exit_dir(); > } > > MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); > -- > 2.39.2 > Cheers, Andreas
Attachment:
signature.asc
Description: Message signed with OpenPGP