On Tue, Aug 19, 2025 at 03:34:47PM +0200, Christoph Hellwig wrote: > On Tue, Aug 19, 2025 at 12:14:26PM +0200, Christian Brauner wrote: > > On Tue, Aug 19, 2025 at 11:22:19AM +0200, Christoph Hellwig wrote: > > > On Tue, Aug 19, 2025 at 11:14:41AM +0200, Christian Brauner wrote: > > > > It kind of feels like that f_iocb_flags should be changed so that > > > > subsystems like block can just raise some internal flags directly > > > > instead of grabbing a f_mode flag everytime they need to make some > > > > IOCB_* flag conditional on the file. That would mean changing the > > > > unconditional assigment to file->f_iocb_flags to a |= to not mask flags > > > > raised by the kernel itself. > > > > > > This isn't about block. I will be setting this for a file system > > > operation as well and use the same io_uring code for that. That's > > > how I ran into the issue. > > > > Yes, I get that. That's not what this is about. If IOCB_* flags keep > > getting added that then need an additional opt-out via an FMODE_* flag > > it's very annoying because you keep taking FMODE_* bits. > > Agreed. > > > The thing is > > that it should be possible to keep that information completely contained > > to f_iocb_flags without polluting f_mode. > > I don't really understand how that would work. The basic problem is that > we add optional features/flags to read and write, and we need a way to > check that they are supported and reject them without each time having > to update all instances. For that VFS-level code needs some way to do > a per-instance check of available features. I meant something like this which should effectively be the same thing just that we move the burden of having to use two bits completely into file->f_iocb_flags instead of wasting a file->f_mode bit: diff --git a/block/fops.c b/block/fops.c index ddbc69c0922b..a90f1127d035 100644 --- a/block/fops.c +++ b/block/fops.c @@ -689,7 +689,7 @@ static int blkdev_open(struct inode *inode, struct file *filp) if (bdev_can_atomic_write(bdev)) filp->f_mode |= FMODE_CAN_ATOMIC_WRITE; if (blk_get_integrity(bdev->bd_disk)) - filp->f_mode |= FMODE_HAS_METADATA; + filp->f_iocb_flags |= IOCB_MAY_USE_METADATA; ret = bdev_open(bdev, mode, filp->private_data, NULL, filp); if (ret) diff --git a/include/linux/fs.h b/include/linux/fs.h index 601d036a6c78..a40a1bf7bad5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -149,9 +149,6 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* Expect random access pattern */ #define FMODE_RANDOM ((__force fmode_t)(1 << 12)) -/* Supports IOCB_HAS_METADATA */ -#define FMODE_HAS_METADATA ((__force fmode_t)(1 << 13)) - /* File is opened with O_PATH; almost nothing can be done with it */ #define FMODE_PATH ((__force fmode_t)(1 << 14)) @@ -384,25 +381,27 @@ struct readahead_control; /* kiocb is a read or write operation submitted by fs/aio.c. */ #define IOCB_AIO_RW (1 << 23) #define IOCB_HAS_METADATA (1 << 24) +#define IOCB_MAY_USE_METADATA (1 << 25) /* for use in trace events */ #define TRACE_IOCB_STRINGS \ - { IOCB_HIPRI, "HIPRI" }, \ - { IOCB_DSYNC, "DSYNC" }, \ - { IOCB_SYNC, "SYNC" }, \ - { IOCB_NOWAIT, "NOWAIT" }, \ - { IOCB_APPEND, "APPEND" }, \ - { IOCB_ATOMIC, "ATOMIC" }, \ - { IOCB_DONTCACHE, "DONTCACHE" }, \ - { IOCB_EVENTFD, "EVENTFD"}, \ - { IOCB_DIRECT, "DIRECT" }, \ - { IOCB_WRITE, "WRITE" }, \ - { IOCB_WAITQ, "WAITQ" }, \ - { IOCB_NOIO, "NOIO" }, \ - { IOCB_ALLOC_CACHE, "ALLOC_CACHE" }, \ - { IOCB_DIO_CALLER_COMP, "CALLER_COMP" }, \ - { IOCB_AIO_RW, "AIO_RW" }, \ - { IOCB_HAS_METADATA, "AIO_HAS_METADATA" } + { IOCB_HIPRI, "HIPRI" }, \ + { IOCB_DSYNC, "DSYNC" }, \ + { IOCB_SYNC, "SYNC" }, \ + { IOCB_NOWAIT, "NOWAIT" }, \ + { IOCB_APPEND, "APPEND" }, \ + { IOCB_ATOMIC, "ATOMIC" }, \ + { IOCB_DONTCACHE, "DONTCACHE" }, \ + { IOCB_EVENTFD, "EVENTFD"}, \ + { IOCB_DIRECT, "DIRECT" }, \ + { IOCB_WRITE, "WRITE" }, \ + { IOCB_WAITQ, "WAITQ" }, \ + { IOCB_NOIO, "NOIO" }, \ + { IOCB_ALLOC_CACHE, "ALLOC_CACHE" }, \ + { IOCB_DIO_CALLER_COMP, "CALLER_COMP" }, \ + { IOCB_AIO_RW, "AIO_RW" }, \ + { IOCB_HAS_METADATA, "AIO_HAS_METADATA" }, \ + { IOCB_MAY_USE_METADATA, "AIO_MAY_USE_METADATA" } struct kiocb { struct file *ki_filp; @@ -3786,6 +3785,10 @@ static inline bool vma_is_fsdax(struct vm_area_struct *vma) static inline int iocb_flags(struct file *file) { int res = 0; + + /* Retain flags that the kernel raises internally. */ + res |= (file->f_iocb_flags & (IOCB_HAS_METADATA | IOCB_MAY_USE_METADATA)); + if (file->f_flags & O_APPEND) res |= IOCB_APPEND; if (file->f_flags & O_DIRECT) diff --git a/io_uring/rw.c b/io_uring/rw.c index af5a54b5db12..23e9103c62d4 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -886,7 +886,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) if (req->flags & REQ_F_HAS_METADATA) { struct io_async_rw *io = req->async_data; - if (!(file->f_mode & FMODE_HAS_METADATA)) + if (!(file->f_iocb_flags & IOCB_MAY_USE_METADATA)) return -EINVAL; /*