[Virtio-fs] [PATCH RFC] virtiofs: use fine-grained lock for dmap reclaim

Liu Bo bo.liu at linux.alibaba.com
Mon Jul 15 21:38:06 UTC 2019


On Mon, Jul 15, 2019 at 04:37:39PM -0400, Vivek Goyal wrote:
> On Thu, Jul 04, 2019 at 03:25:31PM +0800, Liu Bo wrote:
> > As the number of free fuse dax mappings shrinks, read performance drops
> > significantly because reads must wait for a free fuse dax mapping.
> > 
> > Although reads trigger the reclaim worker to try to reclaim fuse dax
> > mappings, the reclaim code can barely make any progress when most of the
> > fuse dax mappings are used by the file we're reading, since the reclaim
> > code requires the inode lock.
> > 
> > However, we don't have to take the inode lock for reclaim if each dax
> > mapping has its own reference count. The reference count tells the
> > reclaim code to skip dax mappings that are in use, so we avoid the risk
> > of accidentally reclaiming a dax mapping that other readers are using.
> > 
> > On the other hand, holding ->i_dmap_sem during reclaim prevents
> > subsequent reads from getting a dax mapping that is under reclaim.
> > 
> > Another reason this works is that reads/writes only use a fuse dax
> > mapping within dax_iomap_rw(), so we can play such a trick; mmap/faulting
> > is a different story, and there we still have to take ->i_mmap_sem prior
> > to reclaiming a dax mapping in order to avoid the race.
> > 
> > This patch adds a reference count to the fuse dax mapping and removes
> > the acquisition of the inode lock during reclaim.
> 
> I am not sure that this reference count implementation is safe. For
> example, what prevents atomic_dec() from being reordered so that it
> executes before the copy to user space has actually finished?
> 
> Say cpu1 is reading a dax page and cpu2 is freeing memory.
> 
> cpu1 read				cpu2 free dmap
> ---------				-------------
> atomic_inc()				atomic_read()			
> copy data to user space
> atomic_dec
> 
> Say atomic_dec() gets reordered w.r.t. copying data to user space.
> 
> cpu1 read				cpu2 free dmap
> ---------				-------------
> atomic_inc()				atomic_read()			
> atomic_dec
> copy data to user space
> 
> Now cpu2 will free up the dax range while it is still being read?

Yep, I think this is possible.

For this specific reordering, barriers like smp_mb__{before,after}_atomic()
could fix it.
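
A rough, untested sketch of where a barrier could sit; the pairing on
the reclaim side is my assumption of how it would look, not code from
the patch:

	/* read path, e.g. in fuse_iomap_end() */
	if (dmap) {
		/*
		 * atomic_dec() implies no memory ordering; force the
		 * copy to user space in dax_iomap_rw() to complete
		 * before the refcnt drop becomes visible to reclaim.
		 */
		smp_mb__before_atomic();
		atomic_dec(&dmap->refcnt);
	}

	/* reclaim path, e.g. in lookup_and_reclaim_dmap_locked() */
	if (atomic_read(&dmap->refcnt))
		return 0;	/* still in use, skip it */
	/*
	 * Pairs with the smp_mb__before_atomic() above, so that a
	 * zero refcnt observed here means the reader's copy is done.
	 */
	smp_mb();

Note this only closes the reordering window around the refcnt; a reader
taking a new reference right after the check is a separate race that the
->i_dmap_sem locking still has to cover.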

thanks,
-liubo
> 
> Thanks
> Vivek
> 
> > 
> > 
> > RESULTS:
> > 
> > virtiofsd -cache_size=2G
> > 
> > vanilla kernel: IOPS=378
> > patched kernel: IOPS=4508
> > 
> > 
> > *********************************
> > $ cat fio-rand-read.job
> > ; fio-rand-read.job for fiotest
> > 
> > [global]
> > name=fio-rand-read
> > filename=fio_file
> > rw=randread
> > bs=4K
> > direct=1
> > numjobs=1
> > time_based=1
> > runtime=120
> > directory=/mnt/test/
> > fsync=1
> > group_reporting=1
> > 
> > [file1]
> > size=5G
> > # use sync/libaio
> > ioengine=sync
> > iodepth=1
> > 
> > 
> > Signed-off-by: Liu Bo <bo.liu at linux.alibaba.com>
> > ---
> >  fs/fuse/file.c   | 55 +++++++++++++++++++++++++++++++++++++------------------
> >  fs/fuse/fuse_i.h |  3 +++
> >  fs/fuse/inode.c  |  1 +
> >  3 files changed, 41 insertions(+), 18 deletions(-)
> > 
> > diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> > index 4ed45a7..e3ec646 100644
> > --- a/fs/fuse/file.c
> > +++ b/fs/fuse/file.c
> > @@ -1870,6 +1870,17 @@ static void fuse_fill_iomap(struct inode *inode, loff_t pos, loff_t length,
> >  		if (flags & IOMAP_FAULT)
> >  			iomap->length = ALIGN(len, PAGE_SIZE);
> >  		iomap->type = IOMAP_MAPPED;
> > +
> > +		/*
> > +		 * increase refcnt so that reclaim code knows this dmap is in
> > +		 * use.
> > +		 */
> > +		atomic_inc(&dmap->refcnt);
> > +
> > +		/* iomap->private should be NULL */
> > +		WARN_ON_ONCE(iomap->private);
> > +		iomap->private = dmap;
> > +
> >  		pr_debug("%s: returns iomap: addr 0x%llx offset 0x%llx"
> >  				" length 0x%llx\n", __func__, iomap->addr,
> >  				iomap->offset, iomap->length);
> > @@ -2024,6 +2035,11 @@ static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t length,
> >  			  ssize_t written, unsigned flags,
> >  			  struct iomap *iomap)
> >  {
> > +	struct fuse_dax_mapping *dmap = iomap->private;
> > +
> > +	if (dmap)
> > +		atomic_dec(&dmap->refcnt);
> > +
> >  	/* DAX writes beyond end-of-file aren't handled using iomap, so the
> >  	 * file size is unchanged and there is nothing to do here.
> >  	 */
> > @@ -3959,6 +3975,10 @@ static int reclaim_one_dmap_locked(struct fuse_conn *fc, struct inode *inode,
> >  	int ret;
> >  	struct fuse_inode *fi = get_fuse_inode(inode);
> >  
> > +	/*
> > +	 * igrab() was done to make sure the inode won't go away under us, and this
> > +	 * further avoids the race with evict().
> > +	 */
> >  	ret = dmap_writeback_invalidate(inode, dmap);
> >  
> >  	/* TODO: What to do if above fails? For now,
> > @@ -4053,23 +4073,25 @@ static struct fuse_dax_mapping *alloc_dax_mapping_reclaim(struct fuse_conn *fc,
> >  	}
> >  }
> >  
> > -int lookup_and_reclaim_dmap_locked(struct fuse_conn *fc, struct inode *inode,
> > -				   u64 dmap_start)
> > +static int lookup_and_reclaim_dmap_locked(struct fuse_conn *fc,
> > +					  struct inode *inode, u64 dmap_start)
> >  {
> >  	int ret;
> >  	struct fuse_inode *fi = get_fuse_inode(inode);
> >  	struct fuse_dax_mapping *dmap;
> >  
> > -	WARN_ON(!inode_is_locked(inode));
> > -
> >  	/* Find fuse dax mapping at file offset inode. */
> >  	dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, dmap_start,
> > -							dmap_start);
> > +						 dmap_start);
> >  
> >  	/* Range already got cleaned up by somebody else */
> >  	if (!dmap)
> >  		return 0;
> >  
> > +	/* still in use. */
> > +	if (atomic_read(&dmap->refcnt))
> > +		return 0;
> > +
> >  	ret = reclaim_one_dmap_locked(fc, inode, dmap);
> >  	if (ret < 0)
> >  		return ret;
> > @@ -4084,29 +4106,21 @@ int lookup_and_reclaim_dmap_locked(struct fuse_conn *fc, struct inode *inode,
> >  /*
> >   * Free a range of memory.
> >   * Locking.
> > - * 1. Take inode->i_rwsem to prever further read/write.
> > - * 2. Take fuse_inode->i_mmap_sem to block dax faults.
> > - * 3. Take fuse_inode->i_dmap_sem to protect interval tree. It might not
> > + * 1. Take fuse_inode->i_mmap_sem to block dax faults.
> > + * 2. Take fuse_inode->i_dmap_sem to protect interval tree. It might not
> >   *    be strictly necessary as lock 1 and 2 seem sufficient.
> >   */
> > -int lookup_and_reclaim_dmap(struct fuse_conn *fc, struct inode *inode,
> > -			    u64 dmap_start)
> > +static int lookup_and_reclaim_dmap(struct fuse_conn *fc, struct inode *inode,
> > +				   u64 dmap_start)
> >  {
> >  	int ret;
> >  	struct fuse_inode *fi = get_fuse_inode(inode);
> >  
> > -	/*
> > -	 * If process is blocked waiting for memory while holding inode
> > -	 * lock, we will deadlock. So continue to free next range.
> > -	 */
> > -	if (!inode_trylock(inode))
> > -		return -EAGAIN;
> >  	down_write(&fi->i_mmap_sem);
> >  	down_write(&fi->i_dmap_sem);
> >  	ret = lookup_and_reclaim_dmap_locked(fc, inode, dmap_start);
> >  	up_write(&fi->i_dmap_sem);
> >  	up_write(&fi->i_mmap_sem);
> > -	inode_unlock(inode);
> >  	return ret;
> >  }
> >  
> > @@ -4134,6 +4148,12 @@ static int try_to_free_dmap_chunks(struct fuse_conn *fc,
> >  
> >  		list_for_each_entry_safe(pos, temp, &fc->busy_ranges,
> >  						busy_list) {
> > +			dmap = pos;
> > +
> > +			/* skip this range if it's in use. */
> > +			if (atomic_read(&dmap->refcnt))
> > +				continue;
> > +
> >  			inode = igrab(pos->inode);
> >  			/*
> >  			 * This inode is going away. That will free
> > @@ -4147,7 +4167,6 @@ static int try_to_free_dmap_chunks(struct fuse_conn *fc,
> >  			 * inode lock can't be obtained, this will help with
> >  			 * selecting new element
> >  			 */
> > -			dmap = pos;
> >  			list_move_tail(&dmap->busy_list, &fc->busy_ranges);
> >  			dmap_start = dmap->start;
> >  			window_offset = dmap->window_offset;
> > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> > index 94cfde0..31ecdac 100644
> > --- a/fs/fuse/fuse_i.h
> > +++ b/fs/fuse/fuse_i.h
> > @@ -143,6 +143,9 @@ struct fuse_dax_mapping {
> >  
> >  	/* indicate if the mapping is set up for write purpose */
> >  	unsigned flags;
> > +
> > +	/* reference count when the mapping is used by dax iomap. */
> > +	atomic_t refcnt;
> >  };
> >  
> >  /** FUSE inode */
> > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> > index 288daee..27c6055 100644
> > --- a/fs/fuse/inode.c
> > +++ b/fs/fuse/inode.c
> > @@ -671,6 +671,7 @@ static int fuse_dax_mem_range_init(struct fuse_conn *fc,
> >  		range->length = FUSE_DAX_MEM_RANGE_SZ;
> >  		list_add_tail(&range->list, &mem_ranges);
> >  		INIT_LIST_HEAD(&range->busy_list);
> > +		atomic_set(&range->refcnt, 0);
> >  		allocated_ranges++;
> >  	}
> >  
> > -- 
> > 1.8.3.1
> > 
> > _______________________________________________
> > Virtio-fs mailing list
> > Virtio-fs at redhat.com
> > https://www.redhat.com/mailman/listinfo/virtio-fs
