[Virtio-fs] [PATCH RFC] virtiofs: use fine-grained lock for dmap reclaim
Vivek Goyal
vgoyal at redhat.com
Mon Jul 15 20:37:39 UTC 2019
On Thu, Jul 04, 2019 at 03:25:31PM +0800, Liu Bo wrote:
> With free fuse dax mapping reducing, read performance is impacted
> significantly because reads need to wait for a free fuse dax mapping.
>
> Although reads will trigger reclaim work to try to reclaim fuse dax
> mapping, reclaim code can barely make any progress if most of fuse dax
> mappings are used by the file we're reading since inode lock is required
> by reclaim code.
>
> However, we don't have to take inode lock for reclaiming if dax mapping
> has its own reference count, reference counting is to tell reclaim code to
> skip those in use dax mappings, such that we can avoid the risk of
> accidentally reclaiming a dax mapping that other readers are using.
>
> On the other hand, holding ->i_dmap_sem during reclaim can be used to
> prevent the following reads to get a dax mapping under reclaim.
>
> Another reason is that reads/writes only use fuse dax mapping within
> dax_iomap_rw(), so we can do such a trick, while mmap/faulting is a
> different story and we have to take ->i_mmap_sem prior to reclaiming a dax
> mapping in order to avoid the race.
>
> This adds reference count for fuse dax mapping and removes the acquisition
> of inode lock during reclaim.
I am not sure that this reference count implementation is safe. For
example, what prevents atomic_dec() from being reordered so that it
executes before the copy to user space has actually finished?
Say cpu1 is reading a dax page and cpu2 is freeing memory.
cpu1 read cpu2 free dmap
--------- -------------
atomic_inc() atomic_read()
copy data to user space
atomic_dec
Say atomic_dec() gets reordered w.r.t copy_data_to_user_space.
cpu1 read cpu2 free dmap
--------- -------------
atomic_inc() atomic_read()
atomic_dec
copy data to user space
Now cpu2 will free up the dax range while it is still being read?
Thanks
Vivek
>
>
> RESULTS:
>
> virtiofsd -cache_size=2G
>
> vanilla kernel: IOPS=378
> patched kernel: IOPS=4508
>
>
> *********************************
> $ cat fio-rand-read.job
> ; fio-rand-read.job for fiotest
>
> [global]
> name=fio-rand-read
> filename=fio_file
> rw=randread
> bs=4K
> direct=1
> numjobs=1
> time_based=1
> runtime=120
> directory=/mnt/test/
> fsync=1
> group_reporting=1
>
> [file1]
> size=5G
> # use sync/libaio
> ioengine=sync
> iodepth=1
>
>
> Signed-off-by: Liu Bo <bo.liu at linux.alibaba.com>
> ---
> fs/fuse/file.c | 55 +++++++++++++++++++++++++++++++++++++------------------
> fs/fuse/fuse_i.h | 3 +++
> fs/fuse/inode.c | 1 +
> 3 files changed, 41 insertions(+), 18 deletions(-)
>
> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> index 4ed45a7..e3ec646 100644
> --- a/fs/fuse/file.c
> +++ b/fs/fuse/file.c
> @@ -1870,6 +1870,17 @@ static void fuse_fill_iomap(struct inode *inode, loff_t pos, loff_t length,
> if (flags & IOMAP_FAULT)
> iomap->length = ALIGN(len, PAGE_SIZE);
> iomap->type = IOMAP_MAPPED;
> +
> + /*
> + * increace refcnt so that reclaim code knows this dmap is in
> + * use.
> + */
> + atomic_inc(&dmap->refcnt);
> +
> + /* iomap->private should be NULL */
> + WARN_ON_ONCE(iomap->private);
> + iomap->private = dmap;
> +
> pr_debug("%s: returns iomap: addr 0x%llx offset 0x%llx"
> " length 0x%llx\n", __func__, iomap->addr,
> iomap->offset, iomap->length);
> @@ -2024,6 +2035,11 @@ static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t length,
> ssize_t written, unsigned flags,
> struct iomap *iomap)
> {
> + struct fuse_dax_mapping *dmap = iomap->private;
> +
> + if (dmap)
> + atomic_dec(&dmap->refcnt);
> +
> /* DAX writes beyond end-of-file aren't handled using iomap, so the
> * file size is unchanged and there is nothing to do here.
> */
> @@ -3959,6 +3975,10 @@ static int reclaim_one_dmap_locked(struct fuse_conn *fc, struct inode *inode,
> int ret;
> struct fuse_inode *fi = get_fuse_inode(inode);
>
> + /*
> + * igrab() was done to make sure inode won't go under us, and this
> + * further avoids the race with evict().
> + */
> ret = dmap_writeback_invalidate(inode, dmap);
>
> /* TODO: What to do if above fails? For now,
> @@ -4053,23 +4073,25 @@ static struct fuse_dax_mapping *alloc_dax_mapping_reclaim(struct fuse_conn *fc,
> }
> }
>
> -int lookup_and_reclaim_dmap_locked(struct fuse_conn *fc, struct inode *inode,
> - u64 dmap_start)
> +static int lookup_and_reclaim_dmap_locked(struct fuse_conn *fc,
> + struct inode *inode, u64 dmap_start)
> {
> int ret;
> struct fuse_inode *fi = get_fuse_inode(inode);
> struct fuse_dax_mapping *dmap;
>
> - WARN_ON(!inode_is_locked(inode));
> -
> /* Find fuse dax mapping at file offset inode. */
> dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, dmap_start,
> - dmap_start);
> + dmap_start);
>
> /* Range already got cleaned up by somebody else */
> if (!dmap)
> return 0;
>
> + /* still in use. */
> + if (atomic_read(&dmap->refcnt))
> + return 0;
> +
> ret = reclaim_one_dmap_locked(fc, inode, dmap);
> if (ret < 0)
> return ret;
> @@ -4084,29 +4106,21 @@ int lookup_and_reclaim_dmap_locked(struct fuse_conn *fc, struct inode *inode,
> /*
> * Free a range of memory.
> * Locking.
> - * 1. Take inode->i_rwsem to prever further read/write.
> - * 2. Take fuse_inode->i_mmap_sem to block dax faults.
> - * 3. Take fuse_inode->i_dmap_sem to protect interval tree. It might not
> + * 1. Take fuse_inode->i_mmap_sem to block dax faults.
> + * 2. Take fuse_inode->i_dmap_sem to protect interval tree. It might not
> * be strictly necessary as lock 1 and 2 seem sufficient.
> */
> -int lookup_and_reclaim_dmap(struct fuse_conn *fc, struct inode *inode,
> - u64 dmap_start)
> +static int lookup_and_reclaim_dmap(struct fuse_conn *fc, struct inode *inode,
> + u64 dmap_start)
> {
> int ret;
> struct fuse_inode *fi = get_fuse_inode(inode);
>
> - /*
> - * If process is blocked waiting for memory while holding inode
> - * lock, we will deadlock. So continue to free next range.
> - */
> - if (!inode_trylock(inode))
> - return -EAGAIN;
> down_write(&fi->i_mmap_sem);
> down_write(&fi->i_dmap_sem);
> ret = lookup_and_reclaim_dmap_locked(fc, inode, dmap_start);
> up_write(&fi->i_dmap_sem);
> up_write(&fi->i_mmap_sem);
> - inode_unlock(inode);
> return ret;
> }
>
> @@ -4134,6 +4148,12 @@ static int try_to_free_dmap_chunks(struct fuse_conn *fc,
>
> list_for_each_entry_safe(pos, temp, &fc->busy_ranges,
> busy_list) {
> + dmap = pos;
> +
> + /* skip this range if it's in use. */
> + if (atomic_read(&dmap->refcnt))
> + continue;
> +
> inode = igrab(pos->inode);
> /*
> * This inode is going away. That will free
> @@ -4147,7 +4167,6 @@ static int try_to_free_dmap_chunks(struct fuse_conn *fc,
> * inode lock can't be obtained, this will help with
> * selecting new element
> */
> - dmap = pos;
> list_move_tail(&dmap->busy_list, &fc->busy_ranges);
> dmap_start = dmap->start;
> window_offset = dmap->window_offset;
> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> index 94cfde0..31ecdac 100644
> --- a/fs/fuse/fuse_i.h
> +++ b/fs/fuse/fuse_i.h
> @@ -143,6 +143,9 @@ struct fuse_dax_mapping {
>
> /* indicate if the mapping is set up for write purpose */
> unsigned flags;
> +
> + /* reference count when the mapping is used by dax iomap. */
> + atomic_t refcnt;
> };
>
> /** FUSE inode */
> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> index 288daee..27c6055 100644
> --- a/fs/fuse/inode.c
> +++ b/fs/fuse/inode.c
> @@ -671,6 +671,7 @@ static int fuse_dax_mem_range_init(struct fuse_conn *fc,
> range->length = FUSE_DAX_MEM_RANGE_SZ;
> list_add_tail(&range->list, &mem_ranges);
> INIT_LIST_HEAD(&range->busy_list);
> + atomic_set(&range->refcnt, 0);
> allocated_ranges++;
> }
>
> --
> 1.8.3.1
>
> _______________________________________________
> Virtio-fs mailing list
> Virtio-fs at redhat.com
> https://www.redhat.com/mailman/listinfo/virtio-fs
More information about the Virtio-fs
mailing list