[PATCH 1/4] fuse: cache iomaps

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



From: Darrick J. Wong <djwong@xxxxxxxxxx>

Cache iomaps to a file so that we don't have to upcall the server.

Signed-off-by: "Darrick J. Wong" <djwong@xxxxxxxxxx>
---
 fs/fuse/fuse_i.h          |   87 ++
 fs/fuse/fuse_trace.h      |  436 ++++++++++++
 fs/fuse/iomap_cache.h     |  119 +++
 include/uapi/linux/fuse.h |    4 
 fs/fuse/Makefile          |    2 
 fs/fuse/dev.c             |    1 
 fs/fuse/file_iomap.c      |   32 +
 fs/fuse/iomap_cache.c     | 1651 +++++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 2322 insertions(+), 10 deletions(-)
 create mode 100644 fs/fuse/iomap_cache.h
 create mode 100644 fs/fuse/iomap_cache.c


diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 5fba84c75f4a64..196d2b57e80bb1 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -106,6 +106,24 @@ struct fuse_backing {
 	struct rcu_head rcu;
 };
 
+#if IS_ENABLED(CONFIG_FUSE_IOMAP)
+/*
+ * File incore extent information, present for each of data & attr forks.
+ */
+struct fuse_ifork {
+	int64_t			if_bytes;	/* bytes in if_data */
+	void			*if_data;	/* extent tree root */
+	int			if_height;	/* height of the extent tree */
+};
+
+struct fuse_iomap_cache {
+	struct fuse_ifork	im_read;
+	struct fuse_ifork	*im_write;
+	uint64_t		im_seq;		/* validity counter */
+	struct rw_semaphore	im_lock;	/* mapping lock */
+};
+#endif
+
 /** FUSE inode */
 struct fuse_inode {
 	/** Inode data */
@@ -167,6 +185,7 @@ struct fuse_inode {
 			spinlock_t ioend_lock;
 			struct work_struct ioend_work;
 			struct list_head ioend_list;
+			struct fuse_iomap_cache cache;
 #endif
 		};
 
@@ -237,6 +256,11 @@ enum {
 	FUSE_I_IOMAP_DIRECTIO,
 	/* Use iomap for buffered read and writes */
 	FUSE_I_IOMAP_FILEIO,
+	/*
+	 * Cache iomaps in the kernel.  This is required for any filesystem
+	 * that needs to synchronize pagecache write and writeback.
+	 */
+	FUSE_I_IOMAP_CACHE,
 };
 
 struct fuse_conn;
@@ -1716,6 +1740,65 @@ int fuse_dev_ioctl_iomap_support(struct file *file,
 				 struct fuse_iomap_support __user *argp);
 
 int fuse_iomap_fadvise(struct file *file, loff_t start, loff_t end, int advice);
+
+enum fuse_iomap_fork {
+	FUSE_IOMAP_READ_FORK,
+	FUSE_IOMAP_WRITE_FORK,
+};
+
+struct fuse_iomap {
+	uint64_t		addr;	/* disk offset of mapping, bytes */
+	loff_t			offset;	/* file offset of mapping, bytes */
+	uint64_t		length;	/* length of mapping, bytes */
+	uint16_t		type;	/* FUSE_IOMAP_TYPE_* */
+	uint16_t		flags;	/* FUSE_IOMAP_F_* */
+	uint32_t		dev;	/* device cookie */
+	uint64_t		validity_cookie; /* used with .iomap_valid() */
+};
+
+static inline bool fuse_has_iomap_cache(const struct inode *inode)
+{
+	const struct fuse_inode *fi = get_fuse_inode_c(inode);
+
+	return test_bit(FUSE_I_IOMAP_CACHE, &fi->state);
+}
+
+int fuse_iomap_cache_remove(struct inode *inode,
+			    enum fuse_iomap_fork whichfork,
+			    loff_t off, uint64_t len);
+
+int fuse_iomap_cache_add(struct inode *inode,
+			 enum fuse_iomap_fork whichfork,
+			 const struct fuse_iomap *map);
+
+static inline int fuse_iomap_cache_upsert(struct inode *inode,
+					  enum fuse_iomap_fork whichfork,
+					  const struct fuse_iomap *map)
+{
+	int err = fuse_iomap_cache_remove(inode, whichfork, map->offset,
+					  map->length);
+	if (err)
+		return err;
+
+	return fuse_iomap_cache_add(inode, whichfork, map);
+}
+
+static inline uint64_t fuse_iext_read_seq(struct fuse_iomap_cache *ip)
+{
+	return (uint64_t)READ_ONCE(ip->im_seq);
+}
+
+enum fuse_iomap_lookup_result {
+	LOOKUP_HIT,
+	LOOKUP_MISS,
+	LOOKUP_NOFORK,
+};
+
+enum fuse_iomap_lookup_result
+fuse_iomap_cache_lookup(struct inode *inode,
+			enum fuse_iomap_fork whichfork,
+			loff_t off, uint64_t len,
+			struct fuse_iomap *mval);
 #else
 # define fuse_iomap_enabled(...)		(false)
 # define fuse_has_iomap(...)			(false)
@@ -1745,6 +1828,10 @@ int fuse_iomap_fadvise(struct file *file, loff_t start, loff_t end, int advice);
 # define fuse_iomap_flush_unmap_range(...)	(-ENOSYS)
 # define fuse_dev_ioctl_iomap_support(...)	(-EOPNOTSUPP)
 # define fuse_iomap_fadvise			NULL
+# define fuse_has_iomap_cache(...)		(false)
+# define fuse_iomap_cache_remove(...)		(-ENOSYS)
+# define fuse_iomap_cache_add(...)		(-ENOSYS)
+# define fuse_iomap_cache_upsert(...)		(-ENOSYS)
 #endif
 
 #endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/fuse_trace.h b/fs/fuse/fuse_trace.h
index 20257aed0cd89f..598c0e603a32b1 100644
--- a/fs/fuse/fuse_trace.h
+++ b/fs/fuse/fuse_trace.h
@@ -129,6 +129,7 @@ TRACE_EVENT(fuse_request_end,
 );
 
 #if IS_ENABLED(CONFIG_FUSE_IOMAP)
+struct fuse_iext_cursor;
 
 #define FUSE_IOMAP_F_STRINGS \
 	{ FUSE_IOMAP_F_NEW,			"new" }, \
@@ -182,6 +183,7 @@ TRACE_DEFINE_ENUM(FUSE_I_BTIME);
 TRACE_DEFINE_ENUM(FUSE_I_CACHE_IO_MODE);
 TRACE_DEFINE_ENUM(FUSE_I_IOMAP_DIRECTIO);
 TRACE_DEFINE_ENUM(FUSE_I_IOMAP_FILEIO);
+TRACE_DEFINE_ENUM(FUSE_I_IOMAP_CACHE);
 
 #define FUSE_IFLAG_STRINGS \
 	{ 1 << FUSE_I_ADVISE_RDPLUS,		"advise_rdplus" }, \
@@ -191,7 +193,8 @@ TRACE_DEFINE_ENUM(FUSE_I_IOMAP_FILEIO);
 	{ 1 << FUSE_I_BTIME,			"btime" }, \
 	{ 1 << FUSE_I_CACHE_IO_MODE,		"cacheio" }, \
 	{ 1 << FUSE_I_IOMAP_DIRECTIO,		"iomap_dio" }, \
-	{ 1 << FUSE_I_IOMAP_FILEIO,		"iomap_fileio" }
+	{ 1 << FUSE_I_IOMAP_FILEIO,		"iomap_fileio" }, \
+	{ 1 << FUSE_I_IOMAP_CACHE,		"iomap_cache" }
 
 #define IOMAP_IOEND_STRINGS \
 	{ IOMAP_IOEND_SHARED,			"shared" }, \
@@ -207,6 +210,26 @@ TRACE_DEFINE_ENUM(FUSE_I_IOMAP_FILEIO);
 	{ FUSE_IOMAP_CONFIG_TIME,		"time" }, \
 	{ FUSE_IOMAP_CONFIG_MAXBYTES,		"maxbytes" }
 
+TRACE_DEFINE_ENUM(FUSE_IOMAP_READ_FORK);
+TRACE_DEFINE_ENUM(FUSE_IOMAP_WRITE_FORK);
+
+#define FUSE_IOMAP_FORK_STRINGS \
+	{ FUSE_IOMAP_READ_FORK,			"read" }, \
+	{ FUSE_IOMAP_WRITE_FORK,		"write" }
+
+#define FUSE_IOMAP_CACHE_LOCK_STRINGS \
+	{ FUSE_IOMAP_LOCK_SHARED,		"shared" }, \
+	{ FUSE_IOMAP_LOCK_EXCL,			"exclusive" }
+
+#define FUSE_IEXT_STATE_STRINGS \
+	{ FUSE_IEXT_LEFT_CONTIG,		"l_cont" }, \
+	{ FUSE_IEXT_RIGHT_CONTIG,		"r_cont" }, \
+	{ FUSE_IEXT_LEFT_FILLING,		"l_fill" }, \
+	{ FUSE_IEXT_RIGHT_FILLING,		"r_fill" }, \
+	{ FUSE_IEXT_LEFT_VALID,			"l_valid" }, \
+	{ FUSE_IEXT_RIGHT_VALID,		"r_valid" }, \
+	{ FUSE_IEXT_WRITEFORK,			"writefork" }
+
 TRACE_EVENT(fuse_iomap_begin,
 	TP_PROTO(const struct inode *inode, loff_t pos, loff_t count,
 		 unsigned opflags),
@@ -1289,6 +1312,417 @@ DEFINE_FUSE_IOMAP_INLINE_EVENT(fuse_iomap_inline_read);
 DEFINE_FUSE_IOMAP_INLINE_EVENT(fuse_iomap_inline_write);
 DEFINE_FUSE_IOMAP_INLINE_EVENT(fuse_iomap_set_inline_iomap);
 DEFINE_FUSE_IOMAP_INLINE_EVENT(fuse_iomap_set_inline_srcmap);
+
+DECLARE_EVENT_CLASS(fuse_iomap_cache_lock_class,
+	TP_PROTO(const struct inode *inode, unsigned int lock_flags,
+		 unsigned long caller_ip),
+	TP_ARGS(inode, lock_flags, caller_ip),
+	TP_STRUCT__entry(
+		__field(dev_t, connection)
+		__field(uint64_t, ino)
+		__field(uint64_t,	nodeid)
+		__field(loff_t,		isize)
+		__field(unsigned int, lock_flags)
+		__field(unsigned long, caller_ip)
+	),
+	TP_fast_assign(
+		const struct fuse_inode *fi = get_fuse_inode_c(inode);
+		const struct fuse_mount *fm = get_fuse_mount_c(inode);
+
+		__entry->connection	=	fm->fc->dev;
+		__entry->ino		=	fi->orig_ino;
+		__entry->nodeid		=	fi->nodeid;
+		__entry->isize		=	i_size_read(inode);
+		__entry->lock_flags	=	lock_flags;
+		__entry->caller_ip	=	caller_ip;
+	),
+	TP_printk("connection %u ino %llu nodeid %llu isize 0x%llx lock (%s) caller %pS",
+		  __entry->connection, __entry->ino, __entry->nodeid,
+		  __entry->isize,
+		  __print_flags(__entry->lock_flags, "|", FUSE_IOMAP_CACHE_LOCK_STRINGS),
+		  (void *)__entry->caller_ip)
+)
+#define DEFINE_FUSE_IOMAP_CACHE_LOCK_EVENT(name)	\
+DEFINE_EVENT(fuse_iomap_cache_lock_class, name,		\
+	TP_PROTO(const struct inode *inode, unsigned int lock_flags, \
+		 unsigned long caller_ip), \
+	TP_ARGS(inode, lock_flags, caller_ip))
+DEFINE_FUSE_IOMAP_CACHE_LOCK_EVENT(fuse_iomap_cache_lock);
+DEFINE_FUSE_IOMAP_CACHE_LOCK_EVENT(fuse_iomap_cache_unlock);
+
+DECLARE_EVENT_CLASS(fuse_iext_class,
+	TP_PROTO(const struct inode *inode, const struct fuse_iext_cursor *cur,
+		 int state, unsigned long caller_ip),
+
+	TP_ARGS(inode, cur, state, caller_ip),
+
+	TP_STRUCT__entry(
+		__field(dev_t, connection)
+		__field(uint64_t, ino)
+		__field(void *, leaf)
+		__field(int, pos)
+		__field(loff_t, offset)
+		__field(uint64_t, addr)
+		__field(uint64_t, length)
+		__field(uint16_t, type)
+		__field(uint16_t, mapflags)
+		__field(uint32_t, dev)
+		__field(int, iext_state)
+		__field(unsigned long, caller_ip)
+	),
+	TP_fast_assign(
+		const struct fuse_inode *fi = get_fuse_inode_c(inode);
+		const struct fuse_mount *fm = get_fuse_mount_c(inode);
+		const struct fuse_ifork *ifp;
+		struct fuse_iomap	r = { };
+
+		if (state & FUSE_IEXT_WRITEFORK)
+			ifp = fi->cache.im_write;
+		else
+			ifp = &fi->cache.im_read;
+		if (ifp)
+			fuse_iext_get_extent(ifp, cur, &r);
+
+		__entry->connection	=	fm->fc->dev;
+		__entry->ino		=	fi->orig_ino;
+		__entry->leaf		=	cur->leaf;
+		__entry->pos		=	cur->pos;
+		__entry->offset		=	r.offset;
+		__entry->addr		=	r.addr;
+		__entry->length		=	r.length;
+		__entry->dev		=	r.dev;
+		__entry->type		=	r.type;
+		__entry->mapflags	=	r.flags;
+		__entry->iext_state	=	state;
+		__entry->caller_ip	=	caller_ip;
+	),
+	TP_printk("connection %u ino %llu state (%s) cur %p/%d "
+		  "offset 0x%llx addr 0x%llx length 0x%llx type %s mapflags (%s) dev %u caller %pS",
+		  __entry->connection, __entry->ino,
+		  __print_flags(__entry->iext_state, "|", FUSE_IEXT_STATE_STRINGS),
+		  __entry->leaf,
+		  __entry->pos,
+		  __entry->offset,
+		  __entry->addr,
+		  __entry->length,
+		  __print_symbolic(__entry->type, FUSE_IOMAP_TYPE_STRINGS),
+		  __print_flags(__entry->mapflags, "|", FUSE_IOMAP_F_STRINGS),
+		  __entry->dev,
+		  (void *)__entry->caller_ip)
+)
+
+#define DEFINE_IEXT_EVENT(name) \
+DEFINE_EVENT(fuse_iext_class, name, \
+	TP_PROTO(const struct inode *inode, const struct fuse_iext_cursor *cur, \
+		 int state, unsigned long caller_ip), \
+	TP_ARGS(inode, cur, state, caller_ip))
+DEFINE_IEXT_EVENT(fuse_iext_insert);
+DEFINE_IEXT_EVENT(fuse_iext_remove);
+DEFINE_IEXT_EVENT(fuse_iext_pre_update);
+DEFINE_IEXT_EVENT(fuse_iext_post_update);
+
+TRACE_EVENT(fuse_iext_update_class,
+	TP_PROTO(const struct inode *inode, uint32_t iext_state,
+		 const struct fuse_iomap *map),
+	TP_ARGS(inode, iext_state, map),
+
+	TP_STRUCT__entry(
+		__field(dev_t,			connection)
+		__field(uint64_t,		ino)
+		__field(uint64_t,		nodeid)
+		__field(loff_t,			isize)
+
+		__field(loff_t,			map_offset)
+		__field(loff_t,			map_length)
+		__field(uint16_t,		map_type)
+		__field(uint16_t,		map_flags)
+		__field(uint32_t,		map_dev)
+		__field(uint64_t,		map_addr)
+
+		__field(uint32_t,		iext_state)
+	),
+
+	TP_fast_assign(
+		const struct fuse_inode *fi = get_fuse_inode_c(inode);
+		const struct fuse_mount *fm = get_fuse_mount_c(inode);
+
+		__entry->connection	=	fm->fc->dev;
+		__entry->ino		=	fi->orig_ino;
+		__entry->nodeid		=	fi->nodeid;
+		__entry->isize		=	i_size_read(inode);
+
+		__entry->map_offset	=	map->offset;
+		__entry->map_length	=	map->length;
+		__entry->map_type	=	map->type;
+		__entry->map_flags	=	map->flags;
+		__entry->map_dev	=	map->dev;
+		__entry->map_addr	=	map->addr;
+
+		__entry->iext_state	=	iext_state;
+	),
+
+	TP_printk("connection %u ino %llu nodeid %llu isize 0x%llx state (%s) offset 0x%llx length 0x%llx type %s mapflags (%s) dev %u addr 0x%llx",
+		  __entry->connection, __entry->ino, __entry->nodeid,
+		  __entry->isize,
+		  __print_flags(__entry->iext_state, "|", FUSE_IEXT_STATE_STRINGS),
+		  __entry->map_offset, __entry->map_length,
+		  __print_symbolic(__entry->map_type, FUSE_IOMAP_TYPE_STRINGS),
+		  __print_flags(__entry->map_flags, "|", FUSE_IOMAP_F_STRINGS),
+		  __entry->map_dev, __entry->map_addr)
+);
+#define DEFINE_IEXT_UPDATE_EVENT(name) \
+DEFINE_EVENT(fuse_iext_update_class, name, \
+	TP_PROTO(const struct inode *inode, uint32_t iext_state, \
+		 const struct fuse_iomap *map), \
+	TP_ARGS(inode, iext_state, map))
+DEFINE_IEXT_UPDATE_EVENT(fuse_iext_del_mapping);
+DEFINE_IEXT_UPDATE_EVENT(fuse_iext_add_mapping);
+
+TRACE_EVENT(fuse_iext_alt_update_class,
+	TP_PROTO(const struct inode *inode, const struct fuse_iomap *map),
+	TP_ARGS(inode, map),
+
+	TP_STRUCT__entry(
+		__field(dev_t,			connection)
+		__field(uint64_t,		ino)
+		__field(uint64_t,		nodeid)
+
+		__field(loff_t,			map_offset)
+		__field(loff_t,			map_length)
+		__field(uint16_t,		map_type)
+		__field(uint16_t,		map_flags)
+		__field(uint32_t,		map_dev)
+		__field(uint64_t,		map_addr)
+	),
+
+	TP_fast_assign(
+		const struct fuse_inode *fi = get_fuse_inode_c(inode);
+		const struct fuse_mount *fm = get_fuse_mount_c(inode);
+
+		__entry->connection	=	fm->fc->dev;
+		__entry->ino		=	fi->orig_ino;
+		__entry->nodeid		=	fi->nodeid;
+
+		__entry->map_offset	=	map->offset;
+		__entry->map_length	=	map->length;
+		__entry->map_type	=	map->type;
+		__entry->map_flags	=	map->flags;
+		__entry->map_dev	=	map->dev;
+		__entry->map_addr	=	map->addr;
+	),
+
+	TP_printk("connection %u ino %llu nodeid %llu offset 0x%llx length 0x%llx type %s mapflags (%s) dev %u addr 0x%llx",
+		  __entry->connection, __entry->ino, __entry->nodeid,
+		  __entry->map_offset, __entry->map_length,
+		  __print_symbolic(__entry->map_type, FUSE_IOMAP_TYPE_STRINGS),
+		  __print_flags(__entry->map_flags, "|", FUSE_IOMAP_F_STRINGS),
+		  __entry->map_dev, __entry->map_addr)
+);
+#define DEFINE_IEXT_ALT_UPDATE_EVENT(name) \
+DEFINE_EVENT(fuse_iext_alt_update_class, name, \
+	TP_PROTO(const struct inode *inode, const struct fuse_iomap *map), \
+	TP_ARGS(inode, map))
+DEFINE_IEXT_ALT_UPDATE_EVENT(fuse_iext_del_mapping_got);
+DEFINE_IEXT_ALT_UPDATE_EVENT(fuse_iext_add_mapping_left);
+DEFINE_IEXT_ALT_UPDATE_EVENT(fuse_iext_add_mapping_right);
+
+TRACE_EVENT(fuse_iomap_cache_remove,
+	TP_PROTO(const struct inode *inode, enum fuse_iomap_fork whichfork,
+		 loff_t offset, uint64_t length, unsigned long caller_ip),
+	TP_ARGS(inode, whichfork, offset, length, caller_ip),
+
+	TP_STRUCT__entry(
+		__field(dev_t,			connection)
+		__field(uint64_t,		ino)
+		__field(uint64_t,		nodeid)
+		__field(loff_t,			isize)
+		__field(enum fuse_iomap_fork,	whichfork)
+		__field(loff_t,			offset)
+		__field(uint64_t,		length)
+		__field(unsigned long,		caller_ip)
+	),
+
+	TP_fast_assign(
+		const struct fuse_inode *fi = get_fuse_inode_c(inode);
+		const struct fuse_mount *fm = get_fuse_mount_c(inode);
+
+		__entry->connection	=	fm->fc->dev;
+		__entry->ino		=	fi->orig_ino;
+		__entry->nodeid		=	fi->nodeid;
+		__entry->isize		=	i_size_read(inode);
+		__entry->whichfork	=	whichfork;
+		__entry->offset		=	offset;
+		__entry->length		=	length;
+		__entry->caller_ip	=	caller_ip;
+	),
+
+	TP_printk("connection %u ino %llu nodeid %llu isize 0x%llx whichfork %s offset 0x%llx length 0x%llx caller %pS",
+		  __entry->connection, __entry->ino, __entry->nodeid,
+		  __entry->isize,
+		  __print_symbolic(__entry->whichfork, FUSE_IOMAP_FORK_STRINGS),
+		  __entry->offset, __entry->length, (void *)__entry->caller_ip)
+);
+
+TRACE_EVENT(fuse_iomap_mapping_class,
+	TP_PROTO(const struct inode *inode, enum fuse_iomap_fork whichfork,
+		 const struct fuse_iomap *map, unsigned long caller_ip),
+	TP_ARGS(inode, whichfork, map, caller_ip),
+
+	TP_STRUCT__entry(
+		__field(dev_t,			connection)
+		__field(uint64_t,		ino)
+		__field(uint64_t,		nodeid)
+		__field(loff_t,			isize)
+		__field(enum fuse_iomap_fork,	whichfork)
+		__field(loff_t,			offset)
+		__field(loff_t,			length)
+		__field(uint16_t,		maptype)
+		__field(uint16_t,		mapflags)
+		__field(uint32_t,		dev)
+		__field(uint64_t,		addr)
+		__field(unsigned long,		caller_ip)
+	),
+
+	TP_fast_assign(
+		const struct fuse_inode *fi = get_fuse_inode_c(inode);
+		const struct fuse_mount *fm = get_fuse_mount_c(inode);
+
+		__entry->connection	=	fm->fc->dev;
+		__entry->ino		=	fi->orig_ino;
+		__entry->nodeid		=	fi->nodeid;
+		__entry->isize		=	i_size_read(inode);
+		__entry->whichfork	=	whichfork;
+		__entry->offset		=	map->offset;
+		__entry->length		=	map->length;
+		__entry->maptype	=	map->type;
+		__entry->mapflags	=	map->flags;
+		__entry->dev		=	map->dev;
+		__entry->addr		=	map->addr;
+		__entry->caller_ip	=	caller_ip;
+	),
+
+	TP_printk("connection %u ino %llu nodeid %llu isize 0x%llx whichfork %s offset 0x%llx length 0x%llx type %s mapflags (%s) dev %u addr 0x%llx caller %pS",
+		  __entry->connection, __entry->ino, __entry->nodeid,
+		  __entry->isize,
+		  __print_symbolic(__entry->whichfork, FUSE_IOMAP_FORK_STRINGS),
+		  __entry->offset, __entry->length,
+		  __print_symbolic(__entry->maptype, FUSE_IOMAP_TYPE_STRINGS),
+		  __print_flags(__entry->mapflags, "|", FUSE_IOMAP_F_STRINGS),
+		  __entry->dev, __entry->addr, (void *)__entry->caller_ip)
+);
+#define DEFINE_FUSE_IOMAP_MAPPING_EVENT(name) \
+DEFINE_EVENT(fuse_iomap_mapping_class, name, \
+	TP_PROTO(const struct inode *inode, enum fuse_iomap_fork whichfork, \
+		 const struct fuse_iomap *map, unsigned long caller_ip), \
+	TP_ARGS(inode, whichfork, map, caller_ip))
+DEFINE_FUSE_IOMAP_MAPPING_EVENT(fuse_iomap_cache_add);
+DEFINE_FUSE_IOMAP_MAPPING_EVENT(fuse_iext_check_mapping);
+
+TRACE_EVENT(fuse_iomap_cache_lookup,
+	TP_PROTO(const struct inode *inode, enum fuse_iomap_fork whichfork,
+		 loff_t pos, uint64_t count, unsigned long caller_ip),
+	TP_ARGS(inode, whichfork, pos, count, caller_ip),
+
+	TP_STRUCT__entry(
+		__field(dev_t,			connection)
+		__field(uint64_t,		ino)
+		__field(uint64_t,		nodeid)
+		__field(loff_t,			isize)
+		__field(enum fuse_iomap_fork,	whichfork)
+		__field(loff_t,			pos)
+		__field(uint64_t,		count)
+		__field(unsigned long,		caller_ip)
+	),
+
+	TP_fast_assign(
+		const struct fuse_inode *fi = get_fuse_inode_c(inode);
+		const struct fuse_mount *fm = get_fuse_mount_c(inode);
+
+		__entry->connection	=	fm->fc->dev;
+		__entry->ino		=	fi->orig_ino;
+		__entry->nodeid		=	fi->nodeid;
+		__entry->isize		=	i_size_read(inode);
+		__entry->whichfork	=	whichfork;
+		__entry->pos		=	pos;
+		__entry->count		=	count;
+		__entry->caller_ip	=	caller_ip;
+	),
+
+	TP_printk("connection %u ino %llu nodeid %llu isize 0x%llx whichfork %s pos 0x%llx count 0x%llx caller %pS",
+		  __entry->connection, __entry->ino, __entry->nodeid,
+		  __entry->isize,
+		  __print_symbolic(__entry->whichfork, FUSE_IOMAP_FORK_STRINGS),
+		  __entry->pos, __entry->count,
+		  (void *)__entry->caller_ip)
+);
+
+TRACE_EVENT(fuse_iomap_cache_lookup_result,
+	TP_PROTO(const struct inode *inode, enum fuse_iomap_fork whichfork,
+		 loff_t pos, uint64_t count, const struct fuse_iomap *got,
+		 const struct fuse_iomap *map),
+	TP_ARGS(inode, whichfork, pos, count, got, map),
+
+	TP_STRUCT__entry(
+		__field(dev_t,			connection)
+		__field(uint64_t,		ino)
+		__field(uint64_t,		nodeid)
+		__field(loff_t,			isize)
+		__field(enum fuse_iomap_fork,	whichfork)
+		__field(loff_t,			pos)
+		__field(uint64_t,		count)
+
+		__field(loff_t,			got_offset)
+		__field(uint64_t,		got_length)
+		__field(uint64_t,		got_addr)
+
+		__field(loff_t,			map_offset)
+		__field(uint64_t,		map_length)
+		__field(uint16_t,		map_type)
+		__field(uint16_t,		map_flags)
+		__field(uint32_t,		map_dev)
+		__field(uint64_t,		map_addr)
+
+		__field(uint64_t,		validity_cookie)
+	),
+
+	TP_fast_assign(
+		const struct fuse_inode *fi = get_fuse_inode_c(inode);
+		const struct fuse_mount *fm = get_fuse_mount_c(inode);
+
+		__entry->connection	=	fm->fc->dev;
+		__entry->ino		=	fi->orig_ino;
+		__entry->nodeid		=	fi->nodeid;
+		__entry->isize		=	i_size_read(inode);
+		__entry->whichfork	=	whichfork;
+		__entry->pos		=	pos;
+		__entry->count		=	count;
+
+		__entry->got_offset	=	got->offset;
+		__entry->got_length	=	got->length;
+		__entry->got_addr	=	got->addr;
+
+		__entry->map_offset	=	map->offset;
+		__entry->map_length	=	map->length;
+		__entry->map_type	=	map->type;
+		__entry->map_flags	=	map->flags;
+		__entry->map_dev	=	map->dev;
+		__entry->map_addr	=	map->addr;
+
+		__entry->validity_cookie=	map->validity_cookie;
+	),
+
+	TP_printk("connection %u ino %llu nodeid %llu isize 0x%llx whichfork %s pos 0x%llx count 0x%llx map offset 0x%llx length 0x%llx type %s mapflags (%s) dev %u addr 0x%llx got offset 0x%llx length 0x%llx addr 0x%llx cookie 0x%llx",
+		  __entry->connection, __entry->ino, __entry->nodeid,
+		  __entry->isize,
+		  __print_symbolic(__entry->whichfork, FUSE_IOMAP_FORK_STRINGS),
+		  __entry->pos, __entry->count,
+		  __entry->map_offset, __entry->map_length,
+		  __print_symbolic(__entry->map_type, FUSE_IOMAP_TYPE_STRINGS),
+		  __print_flags(__entry->map_flags, "|", FUSE_IOMAP_F_STRINGS),
+		  __entry->map_dev, __entry->map_addr, __entry->got_offset,
+		  __entry->got_length, __entry->got_addr,
+		  __entry->validity_cookie)
+);
 #endif /* CONFIG_FUSE_IOMAP */
 
 #endif /* _TRACE_FUSE_H */
diff --git a/fs/fuse/iomap_cache.h b/fs/fuse/iomap_cache.h
new file mode 100644
index 00000000000000..7efa23be18d155
--- /dev/null
+++ b/fs/fuse/iomap_cache.h
@@ -0,0 +1,119 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2017 Christoph Hellwig.
+ */
+
+#ifndef _FS_FUSE_IOMAP_CACHE_H
+#define _FS_FUSE_IOMAP_CACHE_H
+
+#if IS_ENABLED(CONFIG_FUSE_IOMAP_DEBUG)
+# define ASSERT(a)		do { WARN(!(a), "Assertion failed: %s, func: %s, line: %d", #a, __func__, __LINE__); } while (0)
+# define BAD_DATA(condition)	(WARN(condition, "Bad mapping: %s, func: %s, line: %d", #condition, __func__, __LINE__))
+#else
+# define ASSERT(a)
+# define BAD_DATA(condition)	(condition)
+#endif
+
+#define FUSE_IOMAP_LOCK_SHARED	(1U << 0)
+#define FUSE_IOMAP_LOCK_EXCL	(1U << 1)
+
+void fuse_iomap_cache_lock(struct inode *inode, unsigned int lock_flags);
+void fuse_iomap_cache_unlock(struct inode *inode, unsigned int lock_flags);
+
+#define FUSE_IOMAP_MAX_LEN	((loff_t)(1ULL << 63))
+
+struct fuse_iext_leaf;
+
+struct fuse_iext_cursor {
+	struct fuse_iext_leaf	*leaf;
+	int			pos;
+};
+
+#define FUSE_IEXT_LEFT_CONTIG	(1u << 0)
+#define FUSE_IEXT_RIGHT_CONTIG	(1u << 1)
+#define FUSE_IEXT_LEFT_FILLING	(1u << 2)
+#define FUSE_IEXT_RIGHT_FILLING	(1u << 3)
+#define FUSE_IEXT_LEFT_VALID	(1u << 4)
+#define FUSE_IEXT_RIGHT_VALID	(1u << 5)
+#define FUSE_IEXT_WRITEFORK	(1u << 6)
+
+struct fuse_ifork *fuse_iext_state_to_fork(struct fuse_iomap_cache *ip,
+		unsigned int state);
+
+uint64_t	fuse_iext_count(const struct fuse_ifork *ifp);
+void		fuse_iext_insert_raw(struct fuse_iomap_cache *ip,
+			struct fuse_ifork *ifp,
+			struct fuse_iext_cursor *cur,
+			const struct fuse_iomap *irec);
+void		fuse_iext_insert(struct fuse_iomap_cache *,
+			struct fuse_iext_cursor *cur,
+			const struct fuse_iomap *, int);
+void		fuse_iext_remove(struct fuse_iomap_cache *,
+			struct fuse_iext_cursor *,
+			int);
+void		fuse_iext_destroy(struct fuse_ifork *);
+
+bool		fuse_iext_lookup_extent(struct fuse_iomap_cache *ip,
+			struct fuse_ifork *ifp, loff_t bno,
+			struct fuse_iext_cursor *cur,
+			struct fuse_iomap *gotp);
+bool		fuse_iext_lookup_extent_before(struct fuse_iomap_cache *ip,
+			struct fuse_ifork *ifp, loff_t *end,
+			struct fuse_iext_cursor *cur,
+			struct fuse_iomap *gotp);
+bool		fuse_iext_get_extent(const struct fuse_ifork *ifp,
+			const struct fuse_iext_cursor *cur,
+			struct fuse_iomap *gotp);
+void		fuse_iext_update_extent(struct fuse_iomap_cache *ip, int state,
+			struct fuse_iext_cursor *cur,
+			struct fuse_iomap *gotp);
+
+void		fuse_iext_first(struct fuse_ifork *, struct fuse_iext_cursor *);
+void		fuse_iext_last(struct fuse_ifork *, struct fuse_iext_cursor *);
+void		fuse_iext_next(struct fuse_ifork *, struct fuse_iext_cursor *);
+void		fuse_iext_prev(struct fuse_ifork *, struct fuse_iext_cursor *);
+
+static inline bool fuse_iext_next_extent(struct fuse_ifork *ifp,
+		struct fuse_iext_cursor *cur, struct fuse_iomap *gotp)
+{
+	fuse_iext_next(ifp, cur);
+	return fuse_iext_get_extent(ifp, cur, gotp);
+}
+
+static inline bool fuse_iext_prev_extent(struct fuse_ifork *ifp,
+		struct fuse_iext_cursor *cur, struct fuse_iomap *gotp)
+{
+	fuse_iext_prev(ifp, cur);
+	return fuse_iext_get_extent(ifp, cur, gotp);
+}
+
+/*
+ * Return the extent after cur in gotp without updating the cursor.
+ */
+static inline bool fuse_iext_peek_next_extent(struct fuse_ifork *ifp,
+		struct fuse_iext_cursor *cur, struct fuse_iomap *gotp)
+{
+	struct fuse_iext_cursor ncur = *cur;
+
+	fuse_iext_next(ifp, &ncur);
+	return fuse_iext_get_extent(ifp, &ncur, gotp);
+}
+
+/*
+ * Return the extent before cur in gotp without updating the cursor.
+ */
+static inline bool fuse_iext_peek_prev_extent(struct fuse_ifork *ifp,
+		struct fuse_iext_cursor *cur, struct fuse_iomap *gotp)
+{
+	struct fuse_iext_cursor ncur = *cur;
+
+	fuse_iext_prev(ifp, &ncur);
+	return fuse_iext_get_extent(ifp, &ncur, gotp);
+}
+
+#define for_each_fuse_iext(ifp, ext, got)		\
+	for (fuse_iext_first((ifp), (ext));		\
+	     fuse_iext_get_extent((ifp), (ext), (got));	\
+	     fuse_iext_next((ifp), (ext)))
+
+#endif /* _FS_FUSE_IOMAP_CACHE_H */
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 2aac5a0c4cef0a..a9b2d68b4b79c3 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -1330,6 +1330,7 @@ struct fuse_uring_cmd_req {
 };
 
 #define FUSE_IOMAP_TYPE_PURE_OVERWRITE	(0xFFFF) /* use read mapping data */
+#define FUSE_IOMAP_TYPE_NULL		(0xFFFE) /* no record here */
 #define FUSE_IOMAP_TYPE_HOLE		0	/* no blocks allocated, need allocation */
 #define FUSE_IOMAP_TYPE_DELALLOC	1	/* delayed allocation blocks */
 #define FUSE_IOMAP_TYPE_MAPPED		2	/* blocks allocated at @addr */
@@ -1462,4 +1463,7 @@ struct fuse_iomap_config_out {
 	int64_t s_maxbytes;	/* max file size */
 };
 
+/* invalidate all cached iomap mappings up to EOF */
+#define FUSE_IOMAP_INVAL_TO_EOF		(~0ULL)
+
 #endif /* _LINUX_FUSE_H */
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index 63a41ef9336aaa..cf5c242be09f84 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -16,6 +16,6 @@ fuse-$(CONFIG_FUSE_DAX) += dax.o
 fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o
 fuse-$(CONFIG_SYSCTL) += sysctl.o
 fuse-$(CONFIG_FUSE_IO_URING) += dev_uring.o
-fuse-$(CONFIG_FUSE_IOMAP) += file_iomap.o
+fuse-$(CONFIG_FUSE_IOMAP) += file_iomap.o iomap_cache.o
 
 virtiofs-y := virtio_fs.o
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 4ad90d212379ff..3dd04c2fdae7ba 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -9,6 +9,7 @@
 #include "dev_uring_i.h"
 #include "fuse_i.h"
 #include "fuse_dev_i.h"
+#include "iomap_cache.h"
 
 #include <linux/init.h>
 #include <linux/module.h>
diff --git a/fs/fuse/file_iomap.c b/fs/fuse/file_iomap.c
index 5ef9fa67db807e..66e1be93592023 100644
--- a/fs/fuse/file_iomap.c
+++ b/fs/fuse/file_iomap.c
@@ -4,6 +4,7 @@
  * Author: Darrick J. Wong <djwong@xxxxxxxxxx.
  */
 #include "fuse_i.h"
+#include "iomap_cache.h"
 #include "fuse_trace.h"
 #include <linux/iomap.h>
 #include <linux/pagemap.h>
@@ -19,14 +20,6 @@ static bool __read_mostly enable_iomap =
 module_param(enable_iomap, bool, 0644);
 MODULE_PARM_DESC(enable_iomap, "Enable file I/O through iomap");
 
-#if IS_ENABLED(CONFIG_FUSE_IOMAP_DEBUG)
-# define ASSERT(a)		do { WARN(!(a), "Assertion failed: %s, func: %s, line: %d", #a, __func__, __LINE__); } while (0)
-# define BAD_DATA(condition)	(WARN(condition, "Bad mapping: %s, func: %s, line: %d", #condition, __func__, __LINE__))
-#else
-# define ASSERT(a)
-# define BAD_DATA(condition)	(condition)
-#endif
-
 bool fuse_iomap_enabled(void)
 {
 	/*
@@ -1102,6 +1095,21 @@ static inline void fuse_iomap_clear_fileio(struct inode *inode)
 	clear_bit(FUSE_I_IOMAP_FILEIO, &fi->state);
 }
 
+static inline void fuse_iomap_clear_cache(struct inode *inode)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+
+	ASSERT(fuse_has_iomap(inode));
+
+	clear_bit(FUSE_I_IOMAP_CACHE, &fi->state);
+
+	fuse_iext_destroy(&fi->cache.im_read);
+	if (fi->cache.im_write) {
+		fuse_iext_destroy(fi->cache.im_write);
+		kfree(fi->cache.im_write);
+	}
+}
+
 void fuse_iomap_init_inode(struct inode *inode, unsigned attr_flags)
 {
 	struct fuse_conn *conn = get_fuse_conn(inode);
@@ -1122,6 +1130,8 @@ void fuse_iomap_evict_inode(struct inode *inode)
 		fuse_iomap_clear_directio(inode);
 	if (fuse_has_iomap_fileio(inode))
 		fuse_iomap_clear_fileio(inode);
+	if (fuse_has_iomap_cache(inode))
+		fuse_iomap_clear_cache(inode);
 }
 
 ssize_t fuse_iomap_direct_read(struct kiocb *iocb, struct iov_iter *to)
@@ -1641,6 +1651,12 @@ static inline void fuse_iomap_set_fileio(struct inode *inode)
 		min_order = inode->i_blkbits - PAGE_SHIFT;
 
 	mapping_set_folio_min_order(inode->i_mapping, min_order);
+
+	memset(&fi->cache.im_read, 0, sizeof(fi->cache.im_read));
+	fi->cache.im_seq = 0;
+	fi->cache.im_write = NULL;
+
+	init_rwsem(&fi->cache.im_lock);
 	set_bit(FUSE_I_IOMAP_FILEIO, &fi->state);
 }
 
diff --git a/fs/fuse/iomap_cache.c b/fs/fuse/iomap_cache.c
new file mode 100644
index 00000000000000..6244352f543f03
--- /dev/null
+++ b/fs/fuse/iomap_cache.c
@@ -0,0 +1,1651 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fuse_iext* code adapted from xfs_iext_tree.c:
+ * Copyright (c) 2017 Christoph Hellwig.
+ *
+ * fuse_iomap_cache*lock* code adapted from xfs_inode.c:
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * Copyright (C) 2025 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@xxxxxxxxxx.
+ */
+#include "fuse_i.h"
+#include "iomap_cache.h"
+#include "fuse_trace.h"
+#include <linux/iomap.h>
+
+static inline void fuse_iomap_cache_lock_flags_assert(unsigned int lock_flags)
+{
+	ASSERT((lock_flags & (FUSE_IOMAP_LOCK_SHARED | FUSE_IOMAP_LOCK_EXCL)) !=
+		(FUSE_IOMAP_LOCK_SHARED | FUSE_IOMAP_LOCK_EXCL));
+	ASSERT(lock_flags != 0);
+}
+
+void fuse_iomap_cache_lock(struct inode *inode, unsigned int lock_flags)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_iomap_cache *ip = &fi->cache;
+
+	fuse_iomap_cache_lock_flags_assert(lock_flags);
+
+	if (lock_flags & FUSE_IOMAP_LOCK_EXCL)
+		down_write(&ip->im_lock);
+	else if (lock_flags & FUSE_IOMAP_LOCK_SHARED)
+		down_read(&ip->im_lock);
+
+	trace_fuse_iomap_cache_lock(inode, lock_flags, _RET_IP_);
+}
+
+void fuse_iomap_cache_unlock(struct inode *inode, unsigned int lock_flags)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_iomap_cache *ip = &fi->cache;
+
+	fuse_iomap_cache_lock_flags_assert(lock_flags);
+
+	trace_fuse_iomap_cache_unlock(inode, lock_flags, _RET_IP_);
+
+	if (lock_flags & FUSE_IOMAP_LOCK_EXCL)
+		up_write(&ip->im_lock);
+	else if (lock_flags & FUSE_IOMAP_LOCK_SHARED)
+		up_read(&ip->im_lock);
+}
+
+static inline void fuse_iomap_assert_locked(struct fuse_iomap_cache *ip,
+					    unsigned int lock_flags)
+{
+	if (lock_flags & FUSE_IOMAP_LOCK_SHARED)
+		rwsem_assert_held(&ip->im_lock);
+	else if (lock_flags & FUSE_IOMAP_LOCK_EXCL)
+		rwsem_assert_held_write_nolockdep(&ip->im_lock);
+}
+
+struct fuse_iext_rec {
+	uint64_t		addr;	/* disk offset of mapping, bytes */
+	loff_t			offset;	/* file offset of mapping, bytes */
+	uint64_t		length;	/* length of mapping, bytes */
+	uint16_t		type;	/* FUSE_IOMAP_TYPE_* */
+	uint16_t		flags;	/* FUSE_IOMAP_F_* */
+	uint32_t		dev;	/* device cookie */
+};
+
+static inline struct fuse_inode *FUSE_I(struct fuse_iomap_cache *ip)
+{
+	return container_of(ip, struct fuse_inode, cache);
+}
+
+static inline struct inode *VFS_I(struct fuse_iomap_cache *ip)
+{
+	struct fuse_inode *fi = FUSE_I(ip);
+
+	return &fi->inode;
+}
+
+static inline uint32_t
+fuse_iomap_fork_to_state(const struct fuse_iomap_cache *ip,
+			 const struct fuse_ifork *ifp)
+{
+	ASSERT(ifp == ip->im_write || ifp == &ip->im_read);
+
+	if (ifp == ip->im_write)
+		return FUSE_IEXT_WRITEFORK;
+	return 0;
+}
+
+/* Convert bmap state flags to an inode fork. */
+struct fuse_ifork *
+fuse_iext_state_to_fork(
+	struct fuse_iomap_cache	*ip,
+	unsigned int		state)
+{
+	if (state & FUSE_IEXT_WRITEFORK)
+		return ip->im_write;
+	return &ip->im_read;
+}
+
+static bool fuse_iext_rec_is_empty(const struct fuse_iext_rec *rec)
+{
+	return rec->length == 0;
+}
+
+static inline void fuse_iext_rec_clear(struct fuse_iext_rec *rec)
+{
+	memset(rec, 0, sizeof(*rec));
+}
+
+static void
+fuse_iext_set(
+	struct fuse_iext_rec	*rec,
+	const struct fuse_iomap	*irec)
+{
+	ASSERT(irec->length > 0);
+
+	rec->addr = irec->addr;
+	rec->offset = irec->offset;
+	rec->length = irec->length;
+	rec->type = irec->type;
+	rec->flags = irec->flags;
+	rec->dev = irec->dev;
+}
+
+static void
+fuse_iext_get(
+	struct fuse_iomap		*irec,
+	const struct fuse_iext_rec	*rec)
+{
+	irec->addr = rec->addr;
+	irec->offset = rec->offset;
+	irec->length = rec->length;
+	irec->type = rec->type;
+	irec->flags = rec->flags;
+	irec->dev = rec->dev;
+	/* validity cookie is set at the end of lookup */
+}
+
+enum {
+	NODE_SIZE	= 256,
+	KEYS_PER_NODE	= NODE_SIZE / (sizeof(uint64_t) + sizeof(void *)),
+	RECS_PER_LEAF	= (NODE_SIZE - (2 * sizeof(struct fuse_iext_leaf *))) /
+				sizeof(struct fuse_iext_rec),
+};
+
+/*
+ * In-core extent btree block layout:
+ *
+ * There are two types of blocks in the btree: leaf and inner (non-leaf) blocks.
+ *
+ * The leaf blocks are made up by %KEYS_PER_NODE extent records, which each
+ * contain the startoffset, blockcount, startblock and unwritten extent flag.
+ * See above for the exact format, followed by pointers to the previous and next
+ * leaf blocks (if there are any).
+ *
+ * The inner (non-leaf) blocks first contain KEYS_PER_NODE lookup keys, followed
+ * by an equal number of pointers to the btree blocks at the next lower level.
+ *
+ *		+-------+-------+-------+-------+-------+----------+----------+
+ * Leaf:	| rec 1 | rec 2 | rec 3 | rec 4 | rec N | prev-ptr | next-ptr |
+ *		+-------+-------+-------+-------+-------+----------+----------+
+ *
+ *		+-------+-------+-------+-------+-------+-------+------+-------+
+ * Inner:	| key 1 | key 2 | key 3 | key N | ptr 1 | ptr 2 | ptr3 | ptr N |
+ *		+-------+-------+-------+-------+-------+-------+------+-------+
+ */
+struct fuse_iext_node {
+	uint64_t		keys[KEYS_PER_NODE];
+#define FUSE_IEXT_KEY_INVALID	(1ULL << 63)
+	void			*ptrs[KEYS_PER_NODE];
+};
+
+struct fuse_iext_leaf {
+	struct fuse_iext_rec	recs[RECS_PER_LEAF];
+	struct fuse_iext_leaf	*prev;
+	struct fuse_iext_leaf	*next;
+};
+
+inline uint64_t fuse_iext_count(const struct fuse_ifork *ifp)
+{
+	return ifp->if_bytes / sizeof(struct fuse_iext_rec);
+}
+
+static inline int fuse_iext_max_recs(const struct fuse_ifork *ifp)
+{
+	if (ifp->if_height == 1)
+		return fuse_iext_count(ifp);
+	return RECS_PER_LEAF;
+}
+
+static inline struct fuse_iext_rec *cur_rec(const struct fuse_iext_cursor *cur)
+{
+	return &cur->leaf->recs[cur->pos];
+}
+
+static inline bool fuse_iext_valid(const struct fuse_ifork *ifp,
+				   const struct fuse_iext_cursor *cur)
+{
+	if (!cur->leaf)
+		return false;
+	if (cur->pos < 0 || cur->pos >= fuse_iext_max_recs(ifp))
+		return false;
+	if (fuse_iext_rec_is_empty(cur_rec(cur)))
+		return false;
+	return true;
+}
+
+static void *
+fuse_iext_find_first_leaf(
+	struct fuse_ifork	*ifp)
+{
+	struct fuse_iext_node	*node = ifp->if_data;
+	int			height;
+
+	if (!ifp->if_height)
+		return NULL;
+
+	for (height = ifp->if_height; height > 1; height--) {
+		node = node->ptrs[0];
+		ASSERT(node);
+	}
+
+	return node;
+}
+
+static void *
+fuse_iext_find_last_leaf(
+	struct fuse_ifork	*ifp)
+{
+	struct fuse_iext_node	*node = ifp->if_data;
+	int			height, i;
+
+	if (!ifp->if_height)
+		return NULL;
+
+	for (height = ifp->if_height; height > 1; height--) {
+		for (i = 1; i < KEYS_PER_NODE; i++)
+			if (!node->ptrs[i])
+				break;
+		node = node->ptrs[i - 1];
+		ASSERT(node);
+	}
+
+	return node;
+}
+
+void
+fuse_iext_first(
+	struct fuse_ifork	*ifp,
+	struct fuse_iext_cursor	*cur)
+{
+	cur->pos = 0;
+	cur->leaf = fuse_iext_find_first_leaf(ifp);
+}
+
+void
+fuse_iext_last(
+	struct fuse_ifork	*ifp,
+	struct fuse_iext_cursor	*cur)
+{
+	int			i;
+
+	cur->leaf = fuse_iext_find_last_leaf(ifp);
+	if (!cur->leaf) {
+		cur->pos = 0;
+		return;
+	}
+
+	for (i = 1; i < fuse_iext_max_recs(ifp); i++) {
+		if (fuse_iext_rec_is_empty(&cur->leaf->recs[i]))
+			break;
+	}
+	cur->pos = i - 1;
+}
+
+void
+fuse_iext_next(
+	struct fuse_ifork	*ifp,
+	struct fuse_iext_cursor	*cur)
+{
+	if (!cur->leaf) {
+		ASSERT(cur->pos <= 0 || cur->pos >= RECS_PER_LEAF);
+		fuse_iext_first(ifp, cur);
+		return;
+	}
+
+	ASSERT(cur->pos >= 0);
+	ASSERT(cur->pos < fuse_iext_max_recs(ifp));
+
+	cur->pos++;
+	if (ifp->if_height > 1 && !fuse_iext_valid(ifp, cur) &&
+	    cur->leaf->next) {
+		cur->leaf = cur->leaf->next;
+		cur->pos = 0;
+	}
+}
+
+void
+fuse_iext_prev(
+	struct fuse_ifork	*ifp,
+	struct fuse_iext_cursor	*cur)
+{
+	if (!cur->leaf) {
+		ASSERT(cur->pos <= 0 || cur->pos >= RECS_PER_LEAF);
+		fuse_iext_last(ifp, cur);
+		return;
+	}
+
+	ASSERT(cur->pos >= 0);
+	ASSERT(cur->pos <= RECS_PER_LEAF);
+
+recurse:
+	do {
+		cur->pos--;
+		if (fuse_iext_valid(ifp, cur))
+			return;
+	} while (cur->pos > 0);
+
+	if (ifp->if_height > 1 && cur->leaf->prev) {
+		cur->leaf = cur->leaf->prev;
+		cur->pos = RECS_PER_LEAF;
+		goto recurse;
+	}
+}
+
+static inline int
+fuse_iext_key_cmp(
+	struct fuse_iext_node	*node,
+	int			n,
+	loff_t			offset)
+{
+	if (node->keys[n] > offset)
+		return 1;
+	if (node->keys[n] < offset)
+		return -1;
+	return 0;
+}
+
+static inline int
+fuse_iext_rec_cmp(
+	struct fuse_iext_rec	*rec,
+	loff_t			offset)
+{
+	if (rec->offset > offset)
+		return 1;
+	if (rec->offset + rec->length <= offset)
+		return -1;
+	return 0;
+}
+
+static void *
+fuse_iext_find_level(
+	struct fuse_ifork	*ifp,
+	loff_t			offset,
+	int			level)
+{
+	struct fuse_iext_node	*node = ifp->if_data;
+	int			height, i;
+
+	if (!ifp->if_height)
+		return NULL;
+
+	for (height = ifp->if_height; height > level; height--) {
+		for (i = 1; i < KEYS_PER_NODE; i++)
+			if (fuse_iext_key_cmp(node, i, offset) > 0)
+				break;
+
+		node = node->ptrs[i - 1];
+		if (!node)
+			break;
+	}
+
+	return node;
+}
+
+static int
+fuse_iext_node_pos(
+	struct fuse_iext_node	*node,
+	loff_t			offset)
+{
+	int			i;
+
+	for (i = 1; i < KEYS_PER_NODE; i++) {
+		if (fuse_iext_key_cmp(node, i, offset) > 0)
+			break;
+	}
+
+	return i - 1;
+}
+
+static int
+fuse_iext_node_insert_pos(
+	struct fuse_iext_node	*node,
+	loff_t			offset)
+{
+	int			i;
+
+	for (i = 0; i < KEYS_PER_NODE; i++) {
+		if (fuse_iext_key_cmp(node, i, offset) > 0)
+			return i;
+	}
+
+	return KEYS_PER_NODE;
+}
+
+static int
+fuse_iext_node_nr_entries(
+	struct fuse_iext_node	*node,
+	int			start)
+{
+	int			i;
+
+	for (i = start; i < KEYS_PER_NODE; i++) {
+		if (node->keys[i] == FUSE_IEXT_KEY_INVALID)
+			break;
+	}
+
+	return i;
+}
+
+static int
+fuse_iext_leaf_nr_entries(
+	struct fuse_ifork	*ifp,
+	struct fuse_iext_leaf	*leaf,
+	int			start)
+{
+	int			i;
+
+	for (i = start; i < fuse_iext_max_recs(ifp); i++) {
+		if (fuse_iext_rec_is_empty(&leaf->recs[i]))
+			break;
+	}
+
+	return i;
+}
+
+static inline uint64_t
+fuse_iext_leaf_key(
+	struct fuse_iext_leaf	*leaf,
+	int			n)
+{
+	return leaf->recs[n].offset;
+}
+
+static inline void *
+fuse_iext_alloc_node(
+	int	size)
+{
+	return kzalloc(size, GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
+}
+
+static void
+fuse_iext_grow(
+	struct fuse_ifork	*ifp)
+{
+	struct fuse_iext_node	*node = fuse_iext_alloc_node(NODE_SIZE);
+	int			i;
+
+	if (ifp->if_height == 1) {
+		struct fuse_iext_leaf *prev = ifp->if_data;
+
+		node->keys[0] = fuse_iext_leaf_key(prev, 0);
+		node->ptrs[0] = prev;
+	} else  {
+		struct fuse_iext_node *prev = ifp->if_data;
+
+		ASSERT(ifp->if_height > 1);
+
+		node->keys[0] = prev->keys[0];
+		node->ptrs[0] = prev;
+	}
+
+	for (i = 1; i < KEYS_PER_NODE; i++)
+		node->keys[i] = FUSE_IEXT_KEY_INVALID;
+
+	ifp->if_data = node;
+	ifp->if_height++;
+}
+
+static void
+fuse_iext_update_node(
+	struct fuse_ifork	*ifp,
+	loff_t			old_offset,
+	loff_t			new_offset,
+	int			level,
+	void			*ptr)
+{
+	struct fuse_iext_node	*node = ifp->if_data;
+	int			height, i;
+
+	for (height = ifp->if_height; height > level; height--) {
+		for (i = 0; i < KEYS_PER_NODE; i++) {
+			if (i > 0 && fuse_iext_key_cmp(node, i, old_offset) > 0)
+				break;
+			if (node->keys[i] == old_offset)
+				node->keys[i] = new_offset;
+		}
+		node = node->ptrs[i - 1];
+		ASSERT(node);
+	}
+
+	ASSERT(node == ptr);
+}
+
+static struct fuse_iext_node *
+fuse_iext_split_node(
+	struct fuse_iext_node	**nodep,
+	int			*pos,
+	int			*nr_entries)
+{
+	struct fuse_iext_node	*node = *nodep;
+	struct fuse_iext_node	*new = fuse_iext_alloc_node(NODE_SIZE);
+	const int		nr_move = KEYS_PER_NODE / 2;
+	int			nr_keep = nr_move + (KEYS_PER_NODE & 1);
+	int			i = 0;
+
+	/* for sequential append operations just spill over into the new node */
+	if (*pos == KEYS_PER_NODE) {
+		*nodep = new;
+		*pos = 0;
+		*nr_entries = 0;
+		goto done;
+	}
+
+
+	for (i = 0; i < nr_move; i++) {
+		new->keys[i] = node->keys[nr_keep + i];
+		new->ptrs[i] = node->ptrs[nr_keep + i];
+
+		node->keys[nr_keep + i] = FUSE_IEXT_KEY_INVALID;
+		node->ptrs[nr_keep + i] = NULL;
+	}
+
+	if (*pos >= nr_keep) {
+		*nodep = new;
+		*pos -= nr_keep;
+		*nr_entries = nr_move;
+	} else {
+		*nr_entries = nr_keep;
+	}
+done:
+	for (; i < KEYS_PER_NODE; i++)
+		new->keys[i] = FUSE_IEXT_KEY_INVALID;
+	return new;
+}
+
+static void
+fuse_iext_insert_node(
+	struct fuse_ifork	*ifp,
+	uint64_t		offset,
+	void			*ptr,
+	int			level)
+{
+	struct fuse_iext_node	*node, *new;
+	int			i, pos, nr_entries;
+
+again:
+	if (ifp->if_height < level)
+		fuse_iext_grow(ifp);
+
+	new = NULL;
+	node = fuse_iext_find_level(ifp, offset, level);
+	pos = fuse_iext_node_insert_pos(node, offset);
+	nr_entries = fuse_iext_node_nr_entries(node, pos);
+
+	ASSERT(pos >= nr_entries || fuse_iext_key_cmp(node, pos, offset) != 0);
+	ASSERT(nr_entries <= KEYS_PER_NODE);
+
+	if (nr_entries == KEYS_PER_NODE)
+		new = fuse_iext_split_node(&node, &pos, &nr_entries);
+
+	/*
+	 * Update the pointers in higher levels if the first entry changes
+	 * in an existing node.
+	 */
+	if (node != new && pos == 0 && nr_entries > 0)
+		fuse_iext_update_node(ifp, node->keys[0], offset, level, node);
+
+	for (i = nr_entries; i > pos; i--) {
+		node->keys[i] = node->keys[i - 1];
+		node->ptrs[i] = node->ptrs[i - 1];
+	}
+	node->keys[pos] = offset;
+	node->ptrs[pos] = ptr;
+
+	if (new) {
+		offset = new->keys[0];
+		ptr = new;
+		level++;
+		goto again;
+	}
+}
+
+static struct fuse_iext_leaf *
+fuse_iext_split_leaf(
+	struct fuse_iext_cursor	*cur,
+	int			*nr_entries)
+{
+	struct fuse_iext_leaf	*leaf = cur->leaf;
+	struct fuse_iext_leaf	*new = fuse_iext_alloc_node(NODE_SIZE);
+	const int		nr_move = RECS_PER_LEAF / 2;
+	int			nr_keep = nr_move + (RECS_PER_LEAF & 1);
+	int			i;
+
+	/* for sequential append operations just spill over into the new node */
+	if (cur->pos == RECS_PER_LEAF) {
+		cur->leaf = new;
+		cur->pos = 0;
+		*nr_entries = 0;
+		goto done;
+	}
+
+	for (i = 0; i < nr_move; i++) {
+		new->recs[i] = leaf->recs[nr_keep + i];
+		fuse_iext_rec_clear(&leaf->recs[nr_keep + i]);
+	}
+
+	if (cur->pos >= nr_keep) {
+		cur->leaf = new;
+		cur->pos -= nr_keep;
+		*nr_entries = nr_move;
+	} else {
+		*nr_entries = nr_keep;
+	}
+done:
+	if (leaf->next)
+		leaf->next->prev = new;
+	new->next = leaf->next;
+	new->prev = leaf;
+	leaf->next = new;
+	return new;
+}
+
+static void
+fuse_iext_alloc_root(
+	struct fuse_ifork	*ifp,
+	struct fuse_iext_cursor	*cur)
+{
+	ASSERT(ifp->if_bytes == 0);
+
+	ifp->if_data = fuse_iext_alloc_node(sizeof(struct fuse_iext_rec));
+	ifp->if_height = 1;
+
+	/* now that we have a node step into it */
+	cur->leaf = ifp->if_data;
+	cur->pos = 0;
+}
+
+static void
+fuse_iext_realloc_root(
+	struct fuse_ifork	*ifp,
+	struct fuse_iext_cursor	*cur)
+{
+	int64_t new_size = ifp->if_bytes + sizeof(struct fuse_iext_rec);
+	void *new;
+
+	/* account for the prev/next pointers */
+	if (new_size / sizeof(struct fuse_iext_rec) == RECS_PER_LEAF)
+		new_size = NODE_SIZE;
+
+	new = krealloc(ifp->if_data, new_size,
+			GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
+	memset(new + ifp->if_bytes, 0, new_size - ifp->if_bytes);
+	ifp->if_data = new;
+	cur->leaf = new;
+}
+
+/*
+ * Increment the sequence counter on extent tree changes. We use WRITE_ONCE
+ * here to ensure the update to the sequence counter is seen before the
+ * modifications to the extent tree itself take effect.
+ */
+static inline void fuse_iext_inc_seq(struct fuse_iomap_cache *ip)
+{
+	WRITE_ONCE(ip->im_seq, READ_ONCE(ip->im_seq) + 1);
+}
+
+void
+fuse_iext_insert_raw(
+	struct fuse_iomap_cache	*ip,
+	struct fuse_ifork	*ifp,
+	struct fuse_iext_cursor	*cur,
+	const struct fuse_iomap	*irec)
+{
+	loff_t			offset = irec->offset;
+	struct fuse_iext_leaf	*new = NULL;
+	int			nr_entries, i;
+
+	fuse_iext_inc_seq(ip);
+
+	if (ifp->if_height == 0)
+		fuse_iext_alloc_root(ifp, cur);
+	else if (ifp->if_height == 1)
+		fuse_iext_realloc_root(ifp, cur);
+
+	nr_entries = fuse_iext_leaf_nr_entries(ifp, cur->leaf, cur->pos);
+	ASSERT(nr_entries <= RECS_PER_LEAF);
+	ASSERT(cur->pos >= nr_entries ||
+	       fuse_iext_rec_cmp(cur_rec(cur), irec->offset) != 0);
+
+	if (nr_entries == RECS_PER_LEAF)
+		new = fuse_iext_split_leaf(cur, &nr_entries);
+
+	/*
+	 * Update the pointers in higher levels if the first entry changes
+	 * in an existing node.
+	 */
+	if (cur->leaf != new && cur->pos == 0 && nr_entries > 0) {
+		fuse_iext_update_node(ifp, fuse_iext_leaf_key(cur->leaf, 0),
+				offset, 1, cur->leaf);
+	}
+
+	for (i = nr_entries; i > cur->pos; i--)
+		cur->leaf->recs[i] = cur->leaf->recs[i - 1];
+	fuse_iext_set(cur_rec(cur), irec);
+	ifp->if_bytes += sizeof(struct fuse_iext_rec);
+
+	if (new)
+		fuse_iext_insert_node(ifp, fuse_iext_leaf_key(new, 0), new, 2);
+}
+
+void
+fuse_iext_insert(
+	struct fuse_iomap_cache	*ip,
+	struct fuse_iext_cursor	*cur,
+	const struct fuse_iomap	*irec,
+	int			state)
+{
+	struct fuse_ifork	*ifp = fuse_iext_state_to_fork(ip, state);
+
+	fuse_iext_insert_raw(ip, ifp, cur, irec);
+	trace_fuse_iext_insert(VFS_I(ip), cur, state, _RET_IP_);
+}
+
+static struct fuse_iext_node *
+fuse_iext_rebalance_node(
+	struct fuse_iext_node	*parent,
+	int			*pos,
+	struct fuse_iext_node	*node,
+	int			nr_entries)
+{
+	/*
+	 * If the neighbouring nodes are completely full, or have different
+	 * parents, we might never be able to merge our node, and will only
+	 * delete it once the number of entries hits zero.
+	 */
+	if (nr_entries == 0)
+		return node;
+
+	if (*pos > 0) {
+		struct fuse_iext_node *prev = parent->ptrs[*pos - 1];
+		int nr_prev = fuse_iext_node_nr_entries(prev, 0), i;
+
+		if (nr_prev + nr_entries <= KEYS_PER_NODE) {
+			for (i = 0; i < nr_entries; i++) {
+				prev->keys[nr_prev + i] = node->keys[i];
+				prev->ptrs[nr_prev + i] = node->ptrs[i];
+			}
+			return node;
+		}
+	}
+
+	if (*pos + 1 < fuse_iext_node_nr_entries(parent, *pos)) {
+		struct fuse_iext_node *next = parent->ptrs[*pos + 1];
+		int nr_next = fuse_iext_node_nr_entries(next, 0), i;
+
+		if (nr_entries + nr_next <= KEYS_PER_NODE) {
+			/*
+			 * Merge the next node into this node so that we don't
+			 * have to do an additional update of the keys in the
+			 * higher levels.
+			 */
+			for (i = 0; i < nr_next; i++) {
+				node->keys[nr_entries + i] = next->keys[i];
+				node->ptrs[nr_entries + i] = next->ptrs[i];
+			}
+
+			++*pos;
+			return next;
+		}
+	}
+
+	return NULL;
+}
+
+static void
+fuse_iext_remove_node(
+	struct fuse_ifork	*ifp,
+	loff_t			offset,
+	void			*victim)
+{
+	struct fuse_iext_node	*node, *parent;
+	int			level = 2, pos, nr_entries, i;
+
+	ASSERT(level <= ifp->if_height);
+	node = fuse_iext_find_level(ifp, offset, level);
+	pos = fuse_iext_node_pos(node, offset);
+again:
+	ASSERT(node->ptrs[pos]);
+	ASSERT(node->ptrs[pos] == victim);
+	kfree(victim);
+
+	nr_entries = fuse_iext_node_nr_entries(node, pos) - 1;
+	offset = node->keys[0];
+	for (i = pos; i < nr_entries; i++) {
+		node->keys[i] = node->keys[i + 1];
+		node->ptrs[i] = node->ptrs[i + 1];
+	}
+	node->keys[nr_entries] = FUSE_IEXT_KEY_INVALID;
+	node->ptrs[nr_entries] = NULL;
+
+	if (pos == 0 && nr_entries > 0) {
+		fuse_iext_update_node(ifp, offset, node->keys[0], level, node);
+		offset = node->keys[0];
+	}
+
+	if (nr_entries >= KEYS_PER_NODE / 2)
+		return;
+
+	if (level < ifp->if_height) {
+		/*
+		 * If we aren't at the root yet try to find a neighbour node to
+		 * merge with (or delete the node if it is empty), and then
+		 * recurse up to the next level.
+		 */
+		level++;
+		parent = fuse_iext_find_level(ifp, offset, level);
+		pos = fuse_iext_node_pos(parent, offset);
+
+		ASSERT(pos != KEYS_PER_NODE);
+		ASSERT(parent->ptrs[pos] == node);
+
+		node = fuse_iext_rebalance_node(parent, &pos, node, nr_entries);
+		if (node) {
+			victim = node;
+			node = parent;
+			goto again;
+		}
+	} else if (nr_entries == 1) {
+		/*
+		 * If we are at the root and only one entry is left we can just
+		 * free this node and update the root pointer.
+		 */
+		ASSERT(node == ifp->if_data);
+		ifp->if_data = node->ptrs[0];
+		ifp->if_height--;
+		kfree(node);
+	}
+}
+
+static void
+fuse_iext_rebalance_leaf(
+	struct fuse_ifork	*ifp,
+	struct fuse_iext_cursor	*cur,
+	struct fuse_iext_leaf	*leaf,
+	loff_t			offset,
+	int			nr_entries)
+{
+	/*
+	 * If the neighbouring nodes are completely full we might never be able
+	 * to merge our node, and will only delete it once the number of
+	 * entries hits zero.
+	 */
+	if (nr_entries == 0)
+		goto remove_node;
+
+	if (leaf->prev) {
+		int nr_prev = fuse_iext_leaf_nr_entries(ifp, leaf->prev, 0), i;
+
+		if (nr_prev + nr_entries <= RECS_PER_LEAF) {
+			for (i = 0; i < nr_entries; i++)
+				leaf->prev->recs[nr_prev + i] = leaf->recs[i];
+
+			if (cur->leaf == leaf) {
+				cur->leaf = leaf->prev;
+				cur->pos += nr_prev;
+			}
+			goto remove_node;
+		}
+	}
+
+	if (leaf->next) {
+		int nr_next = fuse_iext_leaf_nr_entries(ifp, leaf->next, 0), i;
+
+		if (nr_entries + nr_next <= RECS_PER_LEAF) {
+			/*
+			 * Merge the next node into this node so that we don't
+			 * have to do an additional update of the keys in the
+			 * higher levels.
+			 */
+			for (i = 0; i < nr_next; i++) {
+				leaf->recs[nr_entries + i] =
+					leaf->next->recs[i];
+			}
+
+			if (cur->leaf == leaf->next) {
+				cur->leaf = leaf;
+				cur->pos += nr_entries;
+			}
+
+			offset = fuse_iext_leaf_key(leaf->next, 0);
+			leaf = leaf->next;
+			goto remove_node;
+		}
+	}
+
+	return;
+remove_node:
+	if (leaf->prev)
+		leaf->prev->next = leaf->next;
+	if (leaf->next)
+		leaf->next->prev = leaf->prev;
+	fuse_iext_remove_node(ifp, offset, leaf);
+}
+
+static void
+fuse_iext_free_last_leaf(
+	struct fuse_ifork	*ifp)
+{
+	ifp->if_height--;
+	kfree(ifp->if_data);
+	ifp->if_data = NULL;
+}
+
+void
+fuse_iext_remove(
+	struct fuse_iomap_cache	*ip,
+	struct fuse_iext_cursor	*cur,
+	int			state)
+{
+	struct fuse_ifork	*ifp = fuse_iext_state_to_fork(ip, state);
+	struct fuse_iext_leaf	*leaf = cur->leaf;
+	loff_t			offset = fuse_iext_leaf_key(leaf, 0);
+	int			i, nr_entries;
+
+	trace_fuse_iext_remove(VFS_I(ip), cur, state, _RET_IP_);
+
+	ASSERT(ifp->if_height > 0);
+	ASSERT(ifp->if_data != NULL);
+	ASSERT(fuse_iext_valid(ifp, cur));
+
+	fuse_iext_inc_seq(ip);
+
+	nr_entries = fuse_iext_leaf_nr_entries(ifp, leaf, cur->pos) - 1;
+	for (i = cur->pos; i < nr_entries; i++)
+		leaf->recs[i] = leaf->recs[i + 1];
+	fuse_iext_rec_clear(&leaf->recs[nr_entries]);
+	ifp->if_bytes -= sizeof(struct fuse_iext_rec);
+
+	if (cur->pos == 0 && nr_entries > 0) {
+		fuse_iext_update_node(ifp, offset, fuse_iext_leaf_key(leaf, 0), 1,
+				leaf);
+		offset = fuse_iext_leaf_key(leaf, 0);
+	} else if (cur->pos == nr_entries) {
+		if (ifp->if_height > 1 && leaf->next)
+			cur->leaf = leaf->next;
+		else
+			cur->leaf = NULL;
+		cur->pos = 0;
+	}
+
+	if (nr_entries >= RECS_PER_LEAF / 2)
+		return;
+
+	if (ifp->if_height > 1)
+		fuse_iext_rebalance_leaf(ifp, cur, leaf, offset, nr_entries);
+	else if (nr_entries == 0)
+		fuse_iext_free_last_leaf(ifp);
+}
+
+/*
+ * Lookup the extent covering offset.
+ *
+ * If there is an extent covering offset return the extent index, and store the
+ * expanded extent structure in *gotp, and the extent cursor in *cur.
+ * If there is no extent covering offset, but there is an extent after it (e.g.
+ * it lies in a hole) return that extent in *gotp and its cursor in *cur
+ * instead.
+ * If offset is beyond the last extent return false, and return an invalid
+ * cursor value.
+ */
+bool
+fuse_iext_lookup_extent(
+	struct fuse_iomap_cache	*ip,
+	struct fuse_ifork	*ifp,
+	loff_t			offset,
+	struct fuse_iext_cursor	*cur,
+	struct fuse_iomap	*gotp)
+{
+	cur->leaf = fuse_iext_find_level(ifp, offset, 1);
+	if (!cur->leaf) {
+		cur->pos = 0;
+		return false;
+	}
+
+	for (cur->pos = 0; cur->pos < fuse_iext_max_recs(ifp); cur->pos++) {
+		struct fuse_iext_rec *rec = cur_rec(cur);
+
+		if (fuse_iext_rec_is_empty(rec))
+			break;
+		if (fuse_iext_rec_cmp(rec, offset) >= 0)
+			goto found;
+	}
+
+	/* Try looking in the next node for an entry > offset */
+	if (ifp->if_height == 1 || !cur->leaf->next)
+		return false;
+	cur->leaf = cur->leaf->next;
+	cur->pos = 0;
+	if (!fuse_iext_valid(ifp, cur))
+		return false;
+found:
+	fuse_iext_get(gotp, cur_rec(cur));
+	return true;
+}
+
+/*
+ * Returns the last extent before end, and if this extent doesn't cover
+ * end, update end to the end of the extent.
+ */
+bool
+fuse_iext_lookup_extent_before(
+	struct fuse_iomap_cache	*ip,
+	struct fuse_ifork	*ifp,
+	loff_t			*end,
+	struct fuse_iext_cursor	*cur,
+	struct fuse_iomap	*gotp)
+{
+	/* could be optimized to not even look up the next on a match.. */
+	if (fuse_iext_lookup_extent(ip, ifp, *end - 1, cur, gotp) &&
+	    gotp->offset <= *end - 1)
+		return true;
+	if (!fuse_iext_prev_extent(ifp, cur, gotp))
+		return false;
+	*end = gotp->offset + gotp->length;
+	return true;
+}
+
+void
+fuse_iext_update_extent(
+	struct fuse_iomap_cache	*ip,
+	int			state,
+	struct fuse_iext_cursor	*cur,
+	struct fuse_iomap	*new)
+{
+	struct fuse_ifork	*ifp = fuse_iext_state_to_fork(ip, state);
+
+	fuse_iext_inc_seq(ip);
+
+	if (cur->pos == 0) {
+		struct fuse_iomap	old;
+
+		fuse_iext_get(&old, cur_rec(cur));
+		if (new->offset != old.offset) {
+			fuse_iext_update_node(ifp, old.offset,
+					new->offset, 1, cur->leaf);
+		}
+	}
+
+	trace_fuse_iext_pre_update(VFS_I(ip), cur, state, _RET_IP_);
+	fuse_iext_set(cur_rec(cur), new);
+	trace_fuse_iext_post_update(VFS_I(ip), cur, state, _RET_IP_);
+}
+
+/*
+ * Return true if the cursor points at an extent and return the extent structure
+ * in gotp.  Else return false.
+ */
+bool
+fuse_iext_get_extent(
+	const struct fuse_ifork		*ifp,
+	const struct fuse_iext_cursor	*cur,
+	struct fuse_iomap		*gotp)
+{
+	if (!fuse_iext_valid(ifp, cur))
+		return false;
+	fuse_iext_get(gotp, cur_rec(cur));
+	return true;
+}
+
+/*
+ * This is a recursive function, because of that we need to be extremely
+ * careful with stack usage.
+ */
+static void
+fuse_iext_destroy_node(
+	struct fuse_iext_node	*node,
+	int			level)
+{
+	int			i;
+
+	if (level > 1) {
+		for (i = 0; i < KEYS_PER_NODE; i++) {
+			if (node->keys[i] == FUSE_IEXT_KEY_INVALID)
+				break;
+			fuse_iext_destroy_node(node->ptrs[i], level - 1);
+		}
+	}
+
+	kfree(node);
+}
+
+void
+fuse_iext_destroy(
+	struct fuse_ifork	*ifp)
+{
+	fuse_iext_destroy_node(ifp->if_data, ifp->if_height);
+
+	ifp->if_bytes = 0;
+	ifp->if_height = 0;
+	ifp->if_data = NULL;
+}
+
+static inline struct fuse_ifork *
+fuse_iomap_fork_ptr(
+	struct fuse_iomap_cache	*ip,
+	enum fuse_iomap_fork	whichfork)
+{
+	switch (whichfork) {
+	case FUSE_IOMAP_READ_FORK:
+		return &ip->im_read;
+	case FUSE_IOMAP_WRITE_FORK:
+		return ip->im_write;
+	default:
+		ASSERT(0);
+		return NULL;
+	}
+}
+
+static inline bool fuse_iomap_addrs_adjacent(const struct fuse_iomap *left,
+					     const struct fuse_iomap *right)
+{
+	switch (left->type) {
+	case FUSE_IOMAP_TYPE_MAPPED:
+	case FUSE_IOMAP_TYPE_UNWRITTEN:
+		return left->addr + left->length == right->addr;
+	default:
+		return left->addr  == FUSE_IOMAP_NULL_ADDR &&
+		       right->addr == FUSE_IOMAP_NULL_ADDR;
+	}
+}
+
+static inline bool fuse_iomap_can_merge(const struct fuse_iomap *left,
+					const struct fuse_iomap *right)
+{
+	return (left->dev == right->dev &&
+		left->offset + left->length == right->offset &&
+		left->type  == right->type &&
+		fuse_iomap_addrs_adjacent(left, right) &&
+		left->flags == right->flags &&
+		left->length + right->length <= FUSE_IOMAP_MAX_LEN);
+}
+
+static inline bool fuse_iomap_can_merge3(const struct fuse_iomap *left,
+					 const struct fuse_iomap *new,
+					 const struct fuse_iomap *right)
+{
+	return left->length + new->length + right->length <= FUSE_IOMAP_MAX_LEN;
+}
+
+#if IS_ENABLED(CONFIG_FUSE_IOMAP_DEBUG)
+static void fuse_iext_check_mappings(struct inode *inode,
+				      struct fuse_iomap_cache *ip,
+				      struct fuse_ifork *ifp)
+{
+	struct fuse_inode	*fi = FUSE_I(ip);
+	struct fuse_iext_cursor	icur;
+	struct fuse_iomap	prev, got;
+	unsigned long long	nr = 0;
+	enum fuse_iomap_fork	whichfork;
+
+	if (!ifp)
+		return;
+
+	if (ifp == ip->im_write)
+		whichfork = FUSE_IOMAP_WRITE_FORK;
+	else
+		whichfork = FUSE_IOMAP_READ_FORK;
+
+	fuse_iext_first(ifp, &icur);
+	if (!fuse_iext_get_extent(ifp, &icur, &prev))
+		return;
+	trace_fuse_iext_check_mapping(inode, whichfork, &prev, _RET_IP_);
+	nr++;
+
+	fuse_iext_next(ifp, &icur);
+	while (fuse_iext_get_extent(ifp, &icur, &got)) {
+		trace_fuse_iext_check_mapping(inode, whichfork, &got, _RET_IP_);
+		if (got.length == 0 ||
+		    got.offset < prev.offset + prev.length ||
+		    fuse_iomap_can_merge(&prev, &got)) {
+			printk(KERN_ERR "FUSE IOMAP CORRUPTION ino=%llu nr=%llu",
+			       fi->orig_ino, nr);
+			printk(KERN_ERR "prev: offset=%llu length=%llu type=%u flags=0x%x cookie=%llu dev=%u addr=%llu\n",
+			       prev.offset, prev.length, prev.type, prev.flags,
+			       prev.validity_cookie, prev.dev, prev.addr);
+			printk(KERN_ERR "curr: offset=%llu length=%llu type=%u flags=0x%x cookie=%llu dev=%u addr=%llu\n",
+			       got.offset, got.length, got.type, got.flags,
+			       got.validity_cookie, got.dev, got.addr);
+		}
+
+		prev = got;
+		nr++;
+		fuse_iext_next(ifp, &icur);
+	}
+}
+#else
+# define fuse_iext_check_mappings(...)	((void)0)
+#endif
+
+static void
+fuse_iext_del_mapping(
+	struct fuse_iomap_cache	*ip,
+	struct fuse_ifork	*ifp,
+	struct fuse_iext_cursor	*icur,
+	struct fuse_iomap	*got,	/* current extent entry */
+	struct fuse_iomap	*del)	/* data to remove from extents */
+{
+	struct fuse_iomap	new;	/* new record to be inserted */
+	/* first addr (fsblock aligned) past del */
+	uint64_t		del_endaddr;
+	/* first offset (fsblock aligned) past del */
+	uint64_t		del_endoff = del->offset + del->length;
+	/* first offset (fsblock aligned) past got */
+	uint64_t		got_endoff = got->offset + got->length;
+	uint32_t		state = fuse_iomap_fork_to_state(ip, ifp);
+
+	ASSERT(del->length > 0);
+	ASSERT(got->offset <= del->offset);
+	ASSERT(got_endoff >= del_endoff);
+
+	switch (del->type) {
+	case FUSE_IOMAP_TYPE_MAPPED:
+	case FUSE_IOMAP_TYPE_UNWRITTEN:
+		del_endaddr = del->addr + del->length;
+		break;
+	default:
+		del_endaddr = FUSE_IOMAP_NULL_ADDR;
+		break;
+	}
+
+	if (got->offset == del->offset)
+		state |= FUSE_IEXT_LEFT_FILLING;
+	if (got_endoff == del_endoff)
+		state |= FUSE_IEXT_RIGHT_FILLING;
+
+	trace_fuse_iext_del_mapping(VFS_I(ip), state, del);
+	trace_fuse_iext_del_mapping_got(VFS_I(ip), got);
+
+	switch (state & (FUSE_IEXT_LEFT_FILLING | FUSE_IEXT_RIGHT_FILLING)) {
+	case FUSE_IEXT_LEFT_FILLING | FUSE_IEXT_RIGHT_FILLING:
+		/*
+		 * Matches the whole extent.  Delete the entry.
+		 */
+		fuse_iext_remove(ip, icur, state);
+		fuse_iext_prev(ifp, icur);
+		break;
+	case FUSE_IEXT_LEFT_FILLING:
+		/*
+		 * Deleting the first part of the extent.
+		 */
+		got->offset = del_endoff;
+		got->addr = del_endaddr;
+		got->length -= del->length;
+		fuse_iext_update_extent(ip, state, icur, got);
+		break;
+	case FUSE_IEXT_RIGHT_FILLING:
+		/*
+		 * Deleting the last part of the extent.
+		 */
+		got->length -= del->length;
+		fuse_iext_update_extent(ip, state, icur, got);
+		break;
+	case 0:
+		/*
+		 * Deleting the middle of the extent.
+		 */
+		got->length = del->offset - got->offset;
+		fuse_iext_update_extent(ip, state, icur, got);
+
+		new.offset = del_endoff;
+		new.length = got_endoff - del_endoff;
+		new.type = got->type;
+		new.flags = got->flags;
+		new.addr = del_endaddr;
+		new.dev = got->dev;
+
+		fuse_iext_next(ifp, icur);
+		fuse_iext_insert(ip, icur, &new, state);
+		break;
+	}
+}
+
+int
+fuse_iomap_cache_remove(
+	struct inode		*inode,
+	enum fuse_iomap_fork	whichfork,
+	loff_t			start,		/* first file offset deleted */
+	uint64_t		len)		/* length to unmap */
+{
+	struct fuse_iext_cursor	icur;
+	struct fuse_iomap	got;		/* current extent record */
+	struct fuse_iomap	del;		/* extent being deleted */
+	loff_t			end;
+	struct fuse_inode	*fi = get_fuse_inode(inode);
+	struct fuse_iomap_cache	*ip = &fi->cache;
+	struct fuse_ifork	*ifp = fuse_iomap_fork_ptr(ip, whichfork);
+	bool			wasreal;
+	bool			done = false;
+	int			ret = 0;
+
+	fuse_iomap_assert_locked(ip, FUSE_IOMAP_LOCK_EXCL);
+
+	trace_fuse_iomap_cache_remove(inode, whichfork, start, len, _RET_IP_);
+
+	if (!ifp || fuse_iext_count(ifp) == 0)
+		return 0;
+
+	/* Fast shortcut if the caller wants to erase everything */
+	if (start == 0 && len >= inode->i_sb->s_maxbytes) {
+		fuse_iext_destroy(ifp);
+		return 0;
+	}
+
+	if (!len)
+		goto out;
+
+	/*
+	 * If the caller wants us to remove everything to EOF, we set the end
+	 * of the removal range to the maximum file offset.  We don't support
+	 * unsigned file offsets.
+	 */
+	if (len == FUSE_IOMAP_INVAL_TO_EOF) {
+		const unsigned int blocksize = i_blocksize(inode);
+
+		len = round_up(inode->i_sb->s_maxbytes, blocksize) - start;
+	}
+
+	/*
+	 * Now that we've settled len, look up the extent before the end of the
+	 * range.
+	 */
+	end = start + len;
+	if (!fuse_iext_lookup_extent_before(ip, ifp, &end, &icur, &got))
+		goto out;
+	end--;
+
+	while (end != -1 && end >= start) {
+		/*
+		 * Is the found extent after a hole in which end lives?
+		 * Just back up to the previous extent, if so.
+		 */
+		if (got.offset > end &&
+		    !fuse_iext_prev_extent(ifp, &icur, &got)) {
+			done = true;
+			break;
+		}
+		/*
+		 * Is the last block of this extent before the range
+		 * we're supposed to delete?  If so, we're done.
+		 */
+		end = min_t(loff_t, end, got.offset + got.length - 1);
+		if (end < start)
+			break;
+		/*
+		 * Then deal with the (possibly delayed) allocated space
+		 * we found.
+		 */
+		del = got;
+		switch (del.type) {
+		case FUSE_IOMAP_TYPE_DELALLOC:
+		case FUSE_IOMAP_TYPE_HOLE:
+		case FUSE_IOMAP_TYPE_INLINE:
+		case FUSE_IOMAP_TYPE_PURE_OVERWRITE:
+			wasreal = false;
+			break;
+		case FUSE_IOMAP_TYPE_MAPPED:
+		case FUSE_IOMAP_TYPE_UNWRITTEN:
+			wasreal = true;
+			break;
+		default:
+			ASSERT(0);
+			ret = -EUCLEAN;
+			goto out;
+		}
+
+		if (got.offset < start) {
+			del.offset = start;
+			del.length -= start - got.offset;
+			if (wasreal)
+				del.addr += start - got.offset;
+		}
+		if (del.offset + del.length > end + 1)
+			del.length = end + 1 - del.offset;
+
+		fuse_iext_del_mapping(ip, ifp, &icur, &got, &del);
+		end = del.offset - 1;
+
+		/*
+		 * If not done go on to the next (previous) record.
+		 */
+		if (end != -1 && end >= start) {
+			if (!fuse_iext_get_extent(ifp, &icur, &got) ||
+			    (got.offset > end &&
+			     !fuse_iext_prev_extent(ifp, &icur, &got))) {
+				done = true;
+				break;
+			}
+		}
+	}
+
+	/* Should have removed everything */
+	if (len == 0 || done || end == (loff_t)-1 || end < start)
+		ret = 0;
+	else
+		ret = -EUCLEAN;
+
+out:
+	fuse_iext_check_mappings(inode, ip, ifp);
+	return ret;
+}
+
+static void
+fuse_iext_add_mapping(
+	struct fuse_iomap_cache	*ip,
+	struct fuse_ifork	*ifp,
+	struct fuse_iext_cursor	*icur,
+	const struct fuse_iomap	*new)	/* new extent entry */
+{
+	struct fuse_iomap	left;	/* left neighbor extent entry */
+	struct fuse_iomap	right;	/* right neighbor extent entry */
+	uint32_t		state = fuse_iomap_fork_to_state(ip, ifp);
+
+	/*
+	 * Check and set flags if this segment has a left neighbor.
+	 */
+	if (fuse_iext_peek_prev_extent(ifp, icur, &left))
+		state |= FUSE_IEXT_LEFT_VALID;
+
+	/*
+	 * Check and set flags if this segment has a current value.
+	 * Not true if we're inserting into the "hole" at eof.
+	 */
+	if (fuse_iext_get_extent(ifp, icur, &right))
+		state |= FUSE_IEXT_RIGHT_VALID;
+
+	/*
+	 * We're inserting a real allocation between "left" and "right".
+	 * Set the contiguity flags.  Don't let extents get too large.
+	 */
+	if ((state & FUSE_IEXT_LEFT_VALID) && fuse_iomap_can_merge(&left, new))
+		state |= FUSE_IEXT_LEFT_CONTIG;
+
+	if ((state & FUSE_IEXT_RIGHT_VALID) &&
+	    fuse_iomap_can_merge(new, &right) &&
+	    (!(state & FUSE_IEXT_LEFT_CONTIG) ||
+	     fuse_iomap_can_merge3(&left, new, &right)))
+		state |= FUSE_IEXT_RIGHT_CONTIG;
+
+	trace_fuse_iext_add_mapping(VFS_I(ip), state, new);
+	if (state & FUSE_IEXT_LEFT_VALID)
+		trace_fuse_iext_add_mapping_left(VFS_I(ip), &left);
+	if (state & FUSE_IEXT_RIGHT_VALID)
+		trace_fuse_iext_add_mapping_right(VFS_I(ip), &right);
+
+	/*
+	 * Select which case we're in here, and implement it.
+	 */
+	switch (state & (FUSE_IEXT_LEFT_CONTIG | FUSE_IEXT_RIGHT_CONTIG)) {
+	case FUSE_IEXT_LEFT_CONTIG | FUSE_IEXT_RIGHT_CONTIG:
+		/*
+		 * New allocation is contiguous with real allocations on the
+		 * left and on the right.
+		 * Merge all three into a single extent record.
+		 */
+		left.length += new->length + right.length;
+
+		fuse_iext_remove(ip, icur, state);
+		fuse_iext_prev(ifp, icur);
+		fuse_iext_update_extent(ip, state, icur, &left);
+		break;
+
+	case FUSE_IEXT_LEFT_CONTIG:
+		/*
+		 * New allocation is contiguous with a real allocation
+		 * on the left.
+		 * Merge the new allocation with the left neighbor.
+		 */
+		left.length += new->length;
+
+		fuse_iext_prev(ifp, icur);
+		fuse_iext_update_extent(ip, state, icur, &left);
+		break;
+
+	case FUSE_IEXT_RIGHT_CONTIG:
+		/*
+		 * New allocation is contiguous with a real allocation
+		 * on the right.
+		 * Merge the new allocation with the right neighbor.
+		 */
+		right.offset = new->offset;
+		right.addr = new->addr;
+		right.length += new->length;
+		fuse_iext_update_extent(ip, state, icur, &right);
+		break;
+
+	case 0:
+		/*
+		 * New allocation is not contiguous with another
+		 * real allocation.
+		 * Insert a new entry.
+		 */
+		fuse_iext_insert(ip, icur, new, state);
+		break;
+	}
+}
+
+int
+fuse_iomap_cache_add(
+	struct inode		*inode,
+	enum fuse_iomap_fork	whichfork,
+	const struct fuse_iomap	*new)
+{
+	struct fuse_iext_cursor	icur;
+	struct fuse_iomap	got;
+	struct fuse_inode	*fi = get_fuse_inode(inode);
+	struct fuse_iomap_cache	*ip = &fi->cache;
+	struct fuse_ifork	*ifp = fuse_iomap_fork_ptr(ip, whichfork);
+
+	fuse_iomap_assert_locked(ip, FUSE_IOMAP_LOCK_EXCL);
+	ASSERT(new->length > 0);
+	ASSERT(new->offset < inode->i_sb->s_maxbytes);
+
+	trace_fuse_iomap_cache_add(inode, whichfork, new, _RET_IP_);
+
+	if (!ifp) {
+		ifp = kzalloc(sizeof(struct fuse_ifork),
+			      GFP_KERNEL | __GFP_NOFAIL);
+		if (!ifp)
+			return -ENOMEM;
+
+		ip->im_write = ifp;
+	}
+
+	if (fuse_iext_lookup_extent(ip, ifp, new->offset, &icur, &got)) {
+		/* make sure we only add into a hole. */
+		ASSERT(got.offset > new->offset);
+		ASSERT(got.offset - new->offset >= new->length);
+
+		if (got.offset <= new->offset ||
+		    got.offset - new->offset < new->length)
+			return -EUCLEAN;
+	}
+
+	fuse_iext_add_mapping(ip, ifp, &icur, new);
+	fuse_iext_check_mappings(inode, ip, ifp);
+	return 0;
+}
+
+/*
+ * Trim the returned map to the required bounds
+ */
+static void
+fuse_iomap_trim(
+	struct inode		*inode,
+	struct fuse_iomap	*mval,
+	const struct fuse_iomap	*got,
+	loff_t			off,
+	loff_t			len)
+{
+	const unsigned int blocksize = i_blocksize(inode);
+	const loff_t aligned_off = round_down(off, blocksize);
+	const loff_t aligned_end = round_up(off + len, blocksize);
+	const loff_t aligned_len = aligned_end - aligned_off;
+
+	ASSERT(aligned_off >= got->offset);
+
+	switch (got->type) {
+	case FUSE_IOMAP_TYPE_MAPPED:
+	case FUSE_IOMAP_TYPE_UNWRITTEN:
+		mval->addr = got->addr + (aligned_off - got->offset);
+		break;
+	default:
+		mval->addr = FUSE_IOMAP_NULL_ADDR;
+		break;
+	}
+	mval->offset = aligned_off;
+	mval->length = min_t(loff_t, aligned_len,
+			     got->length - (aligned_off - got->offset));
+	mval->type = got->type;
+	mval->flags = got->flags;
+	mval->dev = got->dev;
+}
+
+enum fuse_iomap_lookup_result
+fuse_iomap_cache_lookup(
+	struct inode		*inode,
+	enum fuse_iomap_fork	whichfork,
+	loff_t			off,
+	uint64_t		len,
+	struct fuse_iomap	*mval)
+{
+	struct fuse_iomap	got;
+	struct fuse_iext_cursor	icur;
+	struct fuse_inode	*fi = get_fuse_inode(inode);
+	struct fuse_iomap_cache	*ip = &fi->cache;
+	struct fuse_ifork	*ifp = fuse_iomap_fork_ptr(ip, whichfork);
+
+	fuse_iomap_assert_locked(ip, FUSE_IOMAP_LOCK_SHARED |
+				     FUSE_IOMAP_LOCK_EXCL);
+
+	trace_fuse_iomap_cache_lookup(inode, whichfork, off, len, _RET_IP_);
+
+	if (!ifp) {
+		/*
+		 * No write fork at all means this filesystem doesn't do out of
+		 * place writes.
+		 */
+		return LOOKUP_NOFORK;
+	}
+
+	if (!fuse_iext_lookup_extent(ip, ifp, off, &icur, &got)) {
+		/*
+		 * Write fork does not contain a mapping at or beyond off,
+		 * which is a cache miss.
+		 */
+		return LOOKUP_MISS;
+	}
+
+	if (got.offset > off) {
+		/*
+		 * Found a mapping, but it doesn't cover the start of the
+		 * range, which is effectively a miss.
+		 */
+		return LOOKUP_MISS;
+	}
+
+	/* Found a mapping in the cache, return it */
+	fuse_iomap_trim(inode, mval, &got, off, len);
+	mval->validity_cookie = fuse_iext_read_seq(ip);
+	trace_fuse_iomap_cache_lookup_result(inode, whichfork, off, len, &got,
+					     mval);
+	return LOOKUP_HIT;
+}





[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [NTFS 3]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [NTFS 3]     [Samba]     [Device Mapper]     [CEPH Development]

  Powered by Linux