[Virtio-fs] [PATCH 11/19] fuse, dax: Implement dax read/write operations

Liu Bo bo.liu at linux.alibaba.com
Wed Aug 21 19:49:34 UTC 2019


On Wed, Aug 21, 2019 at 01:57:12PM -0400, Vivek Goyal wrote:
> This patch implements basic DAX support. mmap() is not implemented
> yet and will come in later patches. This patch looks into implemeting
> read/write.
> 
> We make use of interval tree to keep track of per inode dax mappings.
> 
> Do not use dax for file extending writes, instead just send WRITE message
> to daemon (like we do for direct I/O path). This will keep write and
> i_size change atomic w.r.t crash.
> 
> Signed-off-by: Stefan Hajnoczi <stefanha at redhat.com>
> Signed-off-by: Dr. David Alan Gilbert <dgilbert at redhat.com>
> Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
> Signed-off-by: Miklos Szeredi <mszeredi at redhat.com>
> Signed-off-by: Liu Bo <bo.liu at linux.alibaba.com>
> Signed-off-by: Peng Tao <tao.peng at linux.alibaba.com>
> ---
>  fs/fuse/file.c            | 603 +++++++++++++++++++++++++++++++++++++-
>  fs/fuse/fuse_i.h          |  23 ++
>  fs/fuse/inode.c           |   6 +
>  include/uapi/linux/fuse.h |   1 +
>  4 files changed, 627 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> index c45ffe6f1ecb..f323b7b04414 100644
> --- a/fs/fuse/file.c
> +++ b/fs/fuse/file.c
> @@ -18,6 +18,12 @@
>  #include <linux/swap.h>
>  #include <linux/falloc.h>
>  #include <linux/uio.h>
> +#include <linux/dax.h>
> +#include <linux/iomap.h>
> +#include <linux/interval_tree_generic.h>
> +
> +INTERVAL_TREE_DEFINE(struct fuse_dax_mapping, rb, __u64, __subtree_last,
> +                     START, LAST, static inline, fuse_dax_interval_tree);
>  
>  static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
>  			  int opcode, struct fuse_open_out *outargp)
> @@ -171,6 +177,248 @@ static void fuse_link_write_file(struct file *file)
>  	spin_unlock(&fi->lock);
>  }
>  
> +static struct fuse_dax_mapping *alloc_dax_mapping(struct fuse_conn *fc)
> +{
> +	struct fuse_dax_mapping *dmap = NULL;
> +
> +	spin_lock(&fc->lock);
> +
> +	/* TODO: Add logic to try to free up memory if wait is allowed */
> +	if (fc->nr_free_ranges <= 0) {
> +		spin_unlock(&fc->lock);
> +		return NULL;
> +	}
> +
> +	WARN_ON(list_empty(&fc->free_ranges));
> +
> +	/* Take a free range */
> +	dmap = list_first_entry(&fc->free_ranges, struct fuse_dax_mapping,
> +					list);
> +	list_del_init(&dmap->list);
> +	fc->nr_free_ranges--;
> +	spin_unlock(&fc->lock);
> +	return dmap;
> +}
> +
> +/* This assumes fc->lock is held */
> +static void __dmap_add_to_free_pool(struct fuse_conn *fc,
> +				struct fuse_dax_mapping *dmap)
> +{
> +	list_add_tail(&dmap->list, &fc->free_ranges);
> +	fc->nr_free_ranges++;
> +}
> +
> +static void dmap_add_to_free_pool(struct fuse_conn *fc,
> +				struct fuse_dax_mapping *dmap)
> +{
> +	/* Return fuse_dax_mapping to free list */
> +	spin_lock(&fc->lock);
> +	__dmap_add_to_free_pool(fc, dmap);
> +	spin_unlock(&fc->lock);
> +}
> +
> +/* offset passed in should be aligned to FUSE_DAX_MEM_RANGE_SZ */
> +static int fuse_setup_one_mapping(struct inode *inode, loff_t offset,
> +				  struct fuse_dax_mapping *dmap, bool writable,
> +				  bool upgrade)
> +{
> +	struct fuse_conn *fc = get_fuse_conn(inode);
> +	struct fuse_inode *fi = get_fuse_inode(inode);
> +	struct fuse_setupmapping_in inarg;
> +	FUSE_ARGS(args);
> +	ssize_t err;
> +
> +	WARN_ON(offset % FUSE_DAX_MEM_RANGE_SZ);
> +	WARN_ON(fc->nr_free_ranges < 0);
> +
> +	/* Ask fuse daemon to setup mapping */
> +	memset(&inarg, 0, sizeof(inarg));
> +	inarg.foffset = offset;
> +	inarg.fh = -1;
> +	inarg.moffset = dmap->window_offset;
> +	inarg.len = FUSE_DAX_MEM_RANGE_SZ;
> +	inarg.flags |= FUSE_SETUPMAPPING_FLAG_READ;
> +	if (writable)
> +		inarg.flags |= FUSE_SETUPMAPPING_FLAG_WRITE;
> +	args.in.h.opcode = FUSE_SETUPMAPPING;
> +	args.in.h.nodeid = fi->nodeid;
> +	args.in.numargs = 1;
> +	args.in.args[0].size = sizeof(inarg);
> +	args.in.args[0].value = &inarg;
> +	err = fuse_simple_request(fc, &args);
> +	if (err < 0) {
> +		printk(KERN_ERR "%s request failed at mem_offset=0x%llx %zd\n",
> +				 __func__, dmap->window_offset, err);
> +		return err;
> +	}
> +
> +	pr_debug("fuse_setup_one_mapping() succeeded. offset=0x%llx writable=%d"
> +		 " err=%zd\n", offset, writable, err);
> +
> +	dmap->writable = writable;
> +	if (!upgrade) {
> +		/* TODO: What locking is required here. For now,
> +		 * using fc->lock
> +		 */
> +		dmap->start = offset;
> +		dmap->end = offset + FUSE_DAX_MEM_RANGE_SZ - 1;
> +		/* Protected by fi->i_dmap_sem */
> +		fuse_dax_interval_tree_insert(dmap, &fi->dmap_tree);
> +		fi->nr_dmaps++;
> +	}
> +	return 0;
> +}
> +
> +static int
> +fuse_send_removemapping(struct inode *inode,
> +			struct fuse_removemapping_in *inargp,
> +			struct fuse_removemapping_one *remove_one)
> +{
> +	struct fuse_inode *fi = get_fuse_inode(inode);
> +	struct fuse_conn *fc = get_fuse_conn(inode);
> +	FUSE_ARGS(args);
> +
> +	args.in.h.opcode = FUSE_REMOVEMAPPING;
> +	args.in.h.nodeid = fi->nodeid;
> +	args.in.numargs = 2;
> +	args.in.args[0].size = sizeof(*inargp);
> +	args.in.args[0].value = inargp;
> +	args.in.args[1].size = inargp->count * sizeof(*remove_one);
> +	args.in.args[1].value = remove_one;
> +	return fuse_simple_request(fc, &args);
> +}
> +
> +static int dmap_removemapping_list(struct inode *inode, unsigned num,
> +				   struct list_head *to_remove)
> +{
> +	struct fuse_removemapping_one *remove_one, *ptr;
> +	struct fuse_removemapping_in inarg;
> +	struct fuse_dax_mapping *dmap;
> +	int ret, i = 0, nr_alloc;
> +
> +	nr_alloc = min_t(unsigned int, num, FUSE_REMOVEMAPPING_MAX_ENTRY);
> +	remove_one = kmalloc_array(nr_alloc, sizeof(*remove_one), GFP_NOFS);
> +	if (!remove_one)
> +		return -ENOMEM;
> +
> +	ptr = remove_one;
> +	list_for_each_entry(dmap, to_remove, list) {
> +		ptr->moffset = dmap->window_offset;
> +		ptr->len = dmap->length;
> +		ptr++;
> +		i++;
> +		num--;
> +		if (i >= nr_alloc || num == 0) {
> +			memset(&inarg, 0, sizeof(inarg));
> +			inarg.count = i;
> +			ret = fuse_send_removemapping(inode, &inarg,
> +						      remove_one);
> +			if (ret)
> +				goto out;
> +			ptr = remove_one;
> +			i = 0;
> +		}
> +	}
> +out:
> +	kfree(remove_one);
> +	return ret;
> +}
> +
> +/*
> + * Cleanup dmap entry and add back to free list. This should be called with
> + * fc->lock held.
> + */
> +static void dmap_reinit_add_to_free_pool(struct fuse_conn *fc,
> +					    struct fuse_dax_mapping *dmap)
> +{
> +	pr_debug("fuse: freeing memory range start=0x%llx end=0x%llx "
> +		 "window_offset=0x%llx length=0x%llx\n", dmap->start,
> +		 dmap->end, dmap->window_offset, dmap->length);
> +	dmap->start = dmap->end = 0;
> +	__dmap_add_to_free_pool(fc, dmap);
> +}
> +
> +/*
> + * Free inode dmap entries whose range falls entirely inside [start, end].
> + * Does not take any locks. Caller must take care of any lock requirements.
> + * Lock ordering follows fuse_dax_free_one_mapping().
> + * inode->i_rwsem, fuse_inode->i_mmap_sem and fuse_inode->i_dmap_sem must be
> + * held exclusively, unless it is called from evict_inode() where no one else
> + * is accessing the inode.
> + */
> +static void inode_reclaim_dmap_range(struct fuse_conn *fc, struct inode *inode,
> +				      loff_t start, loff_t end)
> +{
> +	struct fuse_inode *fi = get_fuse_inode(inode);
> +	struct fuse_dax_mapping *dmap, *n;
> +	int err, num = 0;
> +	LIST_HEAD(to_remove);
> +
> +	pr_debug("fuse: %s: start=0x%llx, end=0x%llx\n", __func__, start, end);
> +
> +	/*
> +	 * Interval tree search matches intersecting entries. Adjust the range
> +	 * to avoid dropping partial valid entries.
> +	 */
> +	start = ALIGN(start, FUSE_DAX_MEM_RANGE_SZ);
> +	end = ALIGN_DOWN(end, FUSE_DAX_MEM_RANGE_SZ);
> +
> +	while (1) {
> +		dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, start,
> +							 end);
> +		if (!dmap)
> +			break;
> +		fuse_dax_interval_tree_remove(dmap, &fi->dmap_tree);
> +		num++;
> +		list_add(&dmap->list, &to_remove);
> +	}
> +
> +	/* Nothing to remove */
> +	if (list_empty(&to_remove))
> +		return;
> +
> +	WARN_ON(fi->nr_dmaps < num);
> +	fi->nr_dmaps -= num;
> +	/*
> +	 * During umount/shutdown, fuse connection is dropped first
> +	 * and evict_inode() is called later. That means any
> +	 * removemapping messages are going to fail. Send messages
> +	 * only if connection is up. Otherwise fuse daemon is
> +	 * responsible for cleaning up any leftover references and
> +	 * mappings.
> +	 */
> +	if (fc->connected) {
> +		err = dmap_removemapping_list(inode, num, &to_remove);
> +		if (err) {
> +			pr_warn("Failed to removemappings. start=0x%llx"
> +				" end=0x%llx\n", start, end);
> +		}
> +	}
> +	spin_lock(&fc->lock);
> +	list_for_each_entry_safe(dmap, n, &to_remove, list) {
> +		list_del_init(&dmap->list);
> +		dmap_reinit_add_to_free_pool(fc, dmap);
> +	}
> +	spin_unlock(&fc->lock);
> +}
> +
> +/*
> + * It is called from evict_inode() and by that time inode is going away. So
> + * this function does not take any locks like fi->i_dmap_sem for traversing
> + * that fuse inode interval tree. If that lock is taken then lock validator
> + * complains of deadlock situation w.r.t fs_reclaim lock.
> + */
> +void fuse_cleanup_inode_mappings(struct inode *inode)
> +{
> +	struct fuse_conn *fc = get_fuse_conn(inode);
> +	/*
> +	 * fuse_evict_inode() has alredy called truncate_inode_pages_final()
> +	 * before we arrive here. So we should not have to worry about
> +	 * any pages/exception entries still associated with inode.
> +	 */
> +	inode_reclaim_dmap_range(fc, inode, 0, -1);
> +}
> +
>  void fuse_finish_open(struct inode *inode, struct file *file)
>  {
>  	struct fuse_file *ff = file->private_data;
> @@ -1481,32 +1729,364 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  	return res;
>  }
>  
> +static ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to);
>  static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
>  {
>  	struct file *file = iocb->ki_filp;
>  	struct fuse_file *ff = file->private_data;
> +	struct inode *inode = file->f_mapping->host;
>  
>  	if (is_bad_inode(file_inode(file)))
>  		return -EIO;
>  
> -	if (!(ff->open_flags & FOPEN_DIRECT_IO))
> -		return fuse_cache_read_iter(iocb, to);
> -	else
> +	if (IS_DAX(inode))
> +		return fuse_dax_read_iter(iocb, to);
> +
> +	if (ff->open_flags & FOPEN_DIRECT_IO)
>  		return fuse_direct_read_iter(iocb, to);
> +
> +	return fuse_cache_read_iter(iocb, to);
>  }
>  
> +static ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from);
>  static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  {
>  	struct file *file = iocb->ki_filp;
>  	struct fuse_file *ff = file->private_data;
> +	struct inode *inode = file->f_mapping->host;
>  
>  	if (is_bad_inode(file_inode(file)))
>  		return -EIO;
>  
> -	if (!(ff->open_flags & FOPEN_DIRECT_IO))
> -		return fuse_cache_write_iter(iocb, from);
> -	else
> +	if (IS_DAX(inode))
> +		return fuse_dax_write_iter(iocb, from);
> +
> +	if (ff->open_flags & FOPEN_DIRECT_IO)
>  		return fuse_direct_write_iter(iocb, from);
> +
> +	return fuse_cache_write_iter(iocb, from);
> +}
> +
> +static void fuse_fill_iomap_hole(struct iomap *iomap, loff_t length)
> +{
> +	iomap->addr = IOMAP_NULL_ADDR;
> +	iomap->length = length;
> +	iomap->type = IOMAP_HOLE;
> +}
> +
> +static void fuse_fill_iomap(struct inode *inode, loff_t pos, loff_t length,
> +			struct iomap *iomap, struct fuse_dax_mapping *dmap,
> +			unsigned flags)
> +{
> +	loff_t offset, len;
> +	loff_t i_size = i_size_read(inode);
> +
> +	offset = pos - dmap->start;
> +	len = min(length, dmap->length - offset);
> +
> +	/* If length is beyond end of file, truncate further */
> +	if (pos + len > i_size)
> +		len = i_size - pos;
> +
> +	if (len > 0) {
> +		iomap->addr = dmap->window_offset + offset;
> +		iomap->length = len;
> +		if (flags & IOMAP_FAULT)
> +			iomap->length = ALIGN(len, PAGE_SIZE);
> +		iomap->type = IOMAP_MAPPED;
> +		pr_debug("%s: returns iomap: addr 0x%llx offset 0x%llx"
> +				" length 0x%llx\n", __func__, iomap->addr,
> +				iomap->offset, iomap->length);
> +	} else {
> +		/* Mapping beyond end of file is hole */
> +		fuse_fill_iomap_hole(iomap, length);
> +		pr_debug("%s: returns iomap: addr 0x%llx offset 0x%llx"
> +				"length 0x%llx\n", __func__, iomap->addr,
> +				iomap->offset, iomap->length);
> +	}
> +}
> +
> +static int iomap_begin_setup_new_mapping(struct inode *inode, loff_t pos,
> +					 loff_t length, unsigned flags,
> +					 struct iomap *iomap)
> +{
> +	struct fuse_inode *fi = get_fuse_inode(inode);
> +	struct fuse_conn *fc = get_fuse_conn(inode);
> +	struct fuse_dax_mapping *dmap, *alloc_dmap = NULL;
> +	int ret;
> +	bool writable = flags & IOMAP_WRITE;
> +
> +	alloc_dmap = alloc_dax_mapping(fc);
> +	if (!alloc_dmap)
> +		return -EBUSY;
> +
> +	/*
> +	 * Take write lock so that only one caller can try to setup mapping
> +	 * and other waits.
> +	 */
> +	down_write(&fi->i_dmap_sem);
> +	/*
> +	 * We dropped lock. Check again if somebody else setup
> +	 * mapping already.
> +	 */
> +	dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, pos,
> +						pos);
> +	if (dmap) {
> +		fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
> +		dmap_add_to_free_pool(fc, alloc_dmap);
> +		up_write(&fi->i_dmap_sem);
> +		return 0;
> +	}
> +
> +	/* Setup one mapping */
> +	ret = fuse_setup_one_mapping(inode,
> +				     ALIGN_DOWN(pos, FUSE_DAX_MEM_RANGE_SZ),
> +				     alloc_dmap, writable, false);
> +	if (ret < 0) {
> +		printk("fuse_setup_one_mapping() failed. err=%d"
> +			" pos=0x%llx, writable=%d\n", ret, pos, writable);
> +		dmap_add_to_free_pool(fc, alloc_dmap);
> +		up_write(&fi->i_dmap_sem);
> +		return ret;
> +	}
> +	fuse_fill_iomap(inode, pos, length, iomap, alloc_dmap, flags);
> +	up_write(&fi->i_dmap_sem);
> +	return 0;
> +}
> +
> +static int iomap_begin_upgrade_mapping(struct inode *inode, loff_t pos,
> +					 loff_t length, unsigned flags,
> +					 struct iomap *iomap)
> +{
> +	struct fuse_inode *fi = get_fuse_inode(inode);
> +	struct fuse_dax_mapping *dmap;
> +	int ret;
> +
> +	/*
> +	 * Take exclusive lock so that only one caller can try to setup
> +	 * mapping and others wait.
> +	 */
> +	down_write(&fi->i_dmap_sem);
> +	dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, pos, pos);
> +
> +	/* We are holding either inode lock or i_mmap_sem, and that should
> +	 * ensure that dmap can't reclaimed or truncated and it should still
> +	 * be there in tree despite the fact we dropped and re-acquired the
> +	 * lock.
> +	 */
> +	ret = -EIO;
> +	if (WARN_ON(!dmap))
> +		goto out_err;
> +
> +	/* Maybe another thread already upgraded mapping while we were not
> +	 * holding lock.
> +	 */
> +	if (dmap->writable)
> +		goto out_fill_iomap;

@ret needs to be reset here.

thanks,
-liubo

> +
> +	ret = fuse_setup_one_mapping(inode,
> +				     ALIGN_DOWN(pos, FUSE_DAX_MEM_RANGE_SZ),
> +				     dmap, true, true);
> +	if (ret < 0) {
> +		printk("fuse_setup_one_mapping() failed. err=%d pos=0x%llx\n",
> +		       ret, pos);
> +		goto out_err;
> +	}
> +
> +out_fill_iomap:
> +	fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
> +out_err:
> +	up_write(&fi->i_dmap_sem);
> +	return ret;
> +}
> +
> +/* This is just for DAX and the mapping is ephemeral, do not use it for other
> + * purposes since there is no block device with a permanent mapping.
> + */
> +static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
> +			    unsigned flags, struct iomap *iomap)
> +{
> +	struct fuse_inode *fi = get_fuse_inode(inode);
> +	struct fuse_conn *fc = get_fuse_conn(inode);
> +	struct fuse_dax_mapping *dmap;
> +	bool writable = flags & IOMAP_WRITE;
> +
> +	/* We don't support FIEMAP */
> +	BUG_ON(flags & IOMAP_REPORT);
> +
> +	pr_debug("fuse_iomap_begin() called. pos=0x%llx length=0x%llx\n",
> +			pos, length);
> +
> +	/*
> +	 * Writes beyond end of file are not handled using dax path. Instead
> +	 * a fuse write message is sent to daemon
> +	 */
> +	if (flags & IOMAP_WRITE && pos >= i_size_read(inode))
> +		return -EIO;
> +
> +	iomap->offset = pos;
> +	iomap->flags = 0;
> +	iomap->bdev = NULL;
> +	iomap->dax_dev = fc->dax_dev;
> +
> +	/*
> +	 * Both read/write and mmap path can race here. So we need something
> +	 * to make sure if we are setting up mapping, then other path waits
> +	 *
> +	 * For now, use a semaphore for this. It probably needs to be
> +	 * optimized later.
> +	 */
> +	down_read(&fi->i_dmap_sem);
> +	dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, pos, pos);
> +
> +	if (dmap) {
> +		if (writable && !dmap->writable) {
> +			/* Upgrade read-only mapping to read-write. This will
> +			 * require exclusive i_dmap_sem lock as we don't want
> +			 * two threads to be trying to this simultaneously
> +			 * for same dmap. So drop shared lock and acquire
> +			 * exclusive lock.
> +			 */
> +			up_read(&fi->i_dmap_sem);
> +			pr_debug("%s: Upgrading mapping at offset 0x%llx"
> +				 " length 0x%llx\n", __func__, pos, length);
> +			return iomap_begin_upgrade_mapping(inode, pos, length,
> +							   flags, iomap);
> +		} else {
> +			fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
> +			up_read(&fi->i_dmap_sem);
> +			return 0;
> +		}
> +	} else {
> +		up_read(&fi->i_dmap_sem);
> +		pr_debug("%s: no mapping at offset 0x%llx length 0x%llx\n",
> +				__func__, pos, length);
> +		if (pos >= i_size_read(inode))
> +			goto iomap_hole;
> +
> +		return iomap_begin_setup_new_mapping(inode, pos, length, flags,
> +						     iomap);
> +	}
> +
> +	/*
> +	 * If read beyond end of file happnes, fs code seems to return
> +	 * it as hole
> +	 */
> +iomap_hole:
> +	fuse_fill_iomap_hole(iomap, length);
> +	pr_debug("fuse_iomap_begin() returning hole mapping. pos=0x%llx length_asked=0x%llx length_returned=0x%llx\n", pos, length, iomap->length);
> +	return 0;
> +}
> +
> +static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t length,
> +			  ssize_t written, unsigned flags,
> +			  struct iomap *iomap)
> +{
> +	/* DAX writes beyond end-of-file aren't handled using iomap, so the
> +	 * file size is unchanged and there is nothing to do here.
> +	 */
> +	return 0;
> +}
> +
> +static const struct iomap_ops fuse_iomap_ops = {
> +	.iomap_begin = fuse_iomap_begin,
> +	.iomap_end = fuse_iomap_end,
> +};
> +
> +static ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
> +{
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +	ssize_t ret;
> +
> +	if (iocb->ki_flags & IOCB_NOWAIT) {
> +		if (!inode_trylock_shared(inode))
> +			return -EAGAIN;
> +	} else {
> +		inode_lock_shared(inode);
> +	}
> +
> +	ret = dax_iomap_rw(iocb, to, &fuse_iomap_ops);
> +	inode_unlock_shared(inode);
> +
> +	/* TODO file_accessed(iocb->f_filp) */
> +
> +	return ret;
> +}
> +
> +static bool file_extending_write(struct kiocb *iocb, struct iov_iter *from)
> +{
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +
> +	return (iov_iter_rw(from) == WRITE &&
> +		((iocb->ki_pos) >= i_size_read(inode)));
> +}
> +
> +static ssize_t fuse_dax_direct_write(struct kiocb *iocb, struct iov_iter *from)
> +{
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +	struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
> +	ssize_t ret;
> +
> +	ret = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE);
> +	if (ret < 0)
> +		return ret;
> +
> +	fuse_invalidate_attr(inode);
> +	fuse_write_update_size(inode, iocb->ki_pos);
> +	return ret;
> +}
> +
> +static ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
> +{
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +	ssize_t ret, count;
> +
> +	if (iocb->ki_flags & IOCB_NOWAIT) {
> +		if (!inode_trylock(inode))
> +			return -EAGAIN;
> +	} else {
> +		inode_lock(inode);
> +	}
> +
> +	ret = generic_write_checks(iocb, from);
> +	if (ret <= 0)
> +		goto out;
> +
> +	ret = file_remove_privs(iocb->ki_filp);
> +	if (ret)
> +		goto out;
> +	/* TODO file_update_time() but we don't want metadata I/O */
> +
> +	/* Do not use dax for file extending writes as its an mmap and
> +	 * trying to write beyong end of existing page will generate
> +	 * SIGBUS.
> +	 */
> +	if (file_extending_write(iocb, from)) {
> +		ret = fuse_dax_direct_write(iocb, from);
> +		goto out;
> +	}
> +
> +	ret = dax_iomap_rw(iocb, from, &fuse_iomap_ops);
> +	if (ret < 0)
> +		goto out;
> +
> +	/*
> +	 * If part of the write was file extending, fuse dax path will not
> +	 * take care of that. Do direct write instead.
> +	 */
> +	if (iov_iter_count(from) && file_extending_write(iocb, from)) {
> +		count = fuse_dax_direct_write(iocb, from);
> +		if (count < 0)
> +			goto out;
> +		ret += count;
> +	}
> +
> +out:
> +	inode_unlock(inode);
> +
> +	if (ret > 0)
> +		ret = generic_write_sync(iocb, ret);
> +	return ret;
>  }
>  
>  static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
> @@ -2185,6 +2765,11 @@ static ssize_t fuse_file_splice_read(struct file *in, loff_t *ppos,
>  
>  }
>  
> +static int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma)
> +{
> +	return -EINVAL; /* TODO */
> +}
> +
>  static int convert_fuse_file_lock(struct fuse_conn *fc,
>  				  const struct fuse_file_lock *ffl,
>  				  struct file_lock *fl)
> @@ -3266,6 +3851,7 @@ static const struct address_space_operations fuse_file_aops  = {
>  void fuse_init_file_inode(struct inode *inode)
>  {
>  	struct fuse_inode *fi = get_fuse_inode(inode);
> +	struct fuse_conn *fc = get_fuse_conn(inode);
>  
>  	inode->i_fop = &fuse_file_operations;
>  	inode->i_data.a_ops = &fuse_file_aops;
> @@ -3275,4 +3861,9 @@ void fuse_init_file_inode(struct inode *inode)
>  	fi->writectr = 0;
>  	init_waitqueue_head(&fi->page_waitq);
>  	INIT_LIST_HEAD(&fi->writepages);
> +	fi->dmap_tree = RB_ROOT_CACHED;
> +
> +	if (fc->dax_dev) {
> +		inode->i_flags |= S_DAX;
> +	}
>  }
> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> index b020a4071f80..37b31c5435ff 100644
> --- a/fs/fuse/fuse_i.h
> +++ b/fs/fuse/fuse_i.h
> @@ -104,16 +104,29 @@ struct fuse_forget_link {
>  	struct fuse_forget_link *next;
>  };
>  
> +#define START(node) ((node)->start)
> +#define LAST(node) ((node)->end)
> +
>  /** Translation information for file offsets to DAX window offsets */
>  struct fuse_dax_mapping {
>  	/* Will connect in fc->free_ranges to keep track of free memory */
>  	struct list_head list;
>  
> +	/* For interval tree in file/inode */
> +	struct rb_node rb;
> +	/** Start Position in file */
> +	__u64 start;
> +	/** End Position in file */
> +	__u64 end;
> +	__u64 __subtree_last;
>  	/** Position in DAX window */
>  	u64 window_offset;
>  
>  	/** Length of mapping, in bytes */
>  	loff_t length;
> +
> +	/* Is this mapping read-only or read-write */
> +	bool writable;
>  };
>  
>  /** FUSE inode */
> @@ -201,6 +214,15 @@ struct fuse_inode {
>  
>  	/** Lock to protect write related fields */
>  	spinlock_t lock;
> +
> +	/*
> +	 * Semaphore to protect modifications to dmap_tree
> +	 */
> +	struct rw_semaphore i_dmap_sem;
> +
> +	/** Sorted rb tree of struct fuse_dax_mapping elements */
> +	struct rb_root_cached dmap_tree;
> +	unsigned long nr_dmaps;
>  };
>  
>  /** FUSE inode state bits */
> @@ -1242,5 +1264,6 @@ unsigned fuse_len_args(unsigned numargs, struct fuse_arg *args);
>   */
>  u64 fuse_get_unique(struct fuse_iqueue *fiq);
>  void fuse_free_conn(struct fuse_conn *fc);
> +void fuse_cleanup_inode_mappings(struct inode *inode);
>  
>  #endif /* _FS_FUSE_I_H */
> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> index d5d134a01117..7e0ed5f3f7e6 100644
> --- a/fs/fuse/inode.c
> +++ b/fs/fuse/inode.c
> @@ -81,7 +81,9 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
>  	fi->attr_version = 0;
>  	fi->orig_ino = 0;
>  	fi->state = 0;
> +	fi->nr_dmaps = 0;
>  	mutex_init(&fi->mutex);
> +	init_rwsem(&fi->i_dmap_sem);
>  	spin_lock_init(&fi->lock);
>  	fi->forget = fuse_alloc_forget();
>  	if (!fi->forget) {
> @@ -109,6 +111,10 @@ static void fuse_evict_inode(struct inode *inode)
>  	clear_inode(inode);
>  	if (inode->i_sb->s_flags & SB_ACTIVE) {
>  		struct fuse_conn *fc = get_fuse_conn(inode);
> +		if (IS_DAX(inode)) {
> +			fuse_cleanup_inode_mappings(inode);
> +			WARN_ON(fi->nr_dmaps);
> +		}
>  		fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup);
>  		fi->forget = NULL;
>  	}
> diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
> index 7c2ad3d418df..ac23f57d8fd6 100644
> --- a/include/uapi/linux/fuse.h
> +++ b/include/uapi/linux/fuse.h
> @@ -854,6 +854,7 @@ struct fuse_copy_file_range_in {
>  
>  #define FUSE_SETUPMAPPING_ENTRIES 8
>  #define FUSE_SETUPMAPPING_FLAG_WRITE (1ull << 0)
> +#define FUSE_SETUPMAPPING_FLAG_READ (1ull << 1)
>  struct fuse_setupmapping_in {
>  	/* An already open handle */
>  	uint64_t	fh;
> -- 
> 2.20.1




More information about the Virtio-fs mailing list