[Virtio-fs] [PATCH 3/3] fuse: Add logic to upgrade a read-only mapping to read-write

Vivek Goyal vgoyal at redhat.com
Thu Jul 25 20:30:13 UTC 2019


On Thu, Jul 25, 2019 at 12:32:50PM -0700, Liu Bo wrote:
> On Thu, Jul 25, 2019 at 11:35:21AM -0400, Vivek Goyal wrote:
> > On Wed, Jul 24, 2019 at 05:08:58PM -0700, Liu Bo wrote:
> > > On Wed, Jul 24, 2019 at 05:07:21PM -0400, Vivek Goyal wrote:
> > > > Do not always map a dax mapping read-write. There are use cases like
> > > > executing a file where we want to map file read-only. virtio-fs dir on
> > > > host might be backed by overlayfs. We don't want to open file read-write
> > > > on overlayfs otherwise it will unnecessariliy be copied-up nullyifying
> > > > the advantages of sharing page cache between vms for unmodified files.
> > > > 
> > > > Hence, create a read-only mapping if user did not ask for writable mapping.
> > > > Later upgrade it to read-write mapping when users requests it.
> > > > 
> > > > Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
> > > > ---
> > > >  fs/fuse/file.c   | 114 ++++++++++++++++++++++++++++++++++++-----------
> > > >  fs/fuse/fuse_i.h |   3 ++
> > > >  2 files changed, 91 insertions(+), 26 deletions(-)
> > > > 
> > > > diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> > > > index a2c19e4a28b5..bcd8ddcc0fdf 100644
> > > > --- a/fs/fuse/file.c
> > > > +++ b/fs/fuse/file.c
> > > > @@ -265,7 +265,8 @@ static void dmap_add_to_free_pool(struct fuse_conn *fc,
> > > >  
> > > >  /* offset passed in should be aligned to FUSE_DAX_MEM_RANGE_SZ */
> > > >  static int fuse_setup_one_mapping(struct inode *inode, loff_t offset,
> > > > -				  struct fuse_dax_mapping *dmap)
> > > > +				  struct fuse_dax_mapping *dmap, bool writable,
> > > > +				  bool upgrade)
> > > >  {
> > > >  	struct fuse_conn *fc = get_fuse_conn(inode);
> > > >  	struct fuse_inode *fi = get_fuse_inode(inode);
> > > > @@ -283,7 +284,8 @@ static int fuse_setup_one_mapping(struct inode *inode, loff_t offset,
> > > >  	inarg.moffset = dmap->window_offset;
> > > >  	inarg.len = FUSE_DAX_MEM_RANGE_SZ;
> > > >  	inarg.flags |= FUSE_SETUPMAPPING_FLAG_READ;
> > > > -	inarg.flags |= FUSE_SETUPMAPPING_FLAG_WRITE;
> > > > +	if (writable)
> > > > +		inarg.flags |= FUSE_SETUPMAPPING_FLAG_WRITE;
> > > >  	args.in.h.opcode = FUSE_SETUPMAPPING;
> > > >  	args.in.h.nodeid = fi->nodeid;
> > > >  	args.in.numargs = 1;
> > > > @@ -296,26 +298,30 @@ static int fuse_setup_one_mapping(struct inode *inode, loff_t offset,
> > > >  		return err;
> > > >  	}
> > > >  
> > > > -	pr_debug("fuse_setup_one_mapping() succeeded. offset=0x%llx err=%zd\n", offset, err);
> > > > +	pr_debug("fuse_setup_one_mapping() succeeded. offset=0x%llx writable=%d"
> > > > +		 " err=%zd\n", offset, writable, err);
> > > >  
> > > > -	/*
> > > > -	 * We don't take a refernce on inode. inode is valid right now and
> > > > -	 * when inode is going away, cleanup logic should first cleanup
> > > > -	 * dmap entries.
> > > > -	 *
> > > > -	 * TODO: Do we need to ensure that we are holding inode lock
> > > > -	 * as well.
> > > > -	 */
> > > > -	dmap->inode = inode;
> > > > -	dmap->start = offset;
> > > > -	dmap->end = offset + FUSE_DAX_MEM_RANGE_SZ - 1;
> > > > -	/* Protected by fi->i_dmap_sem */
> > > > -	fuse_dax_interval_tree_insert(dmap, &fi->dmap_tree);
> > > > -	fi->nr_dmaps++;
> > > > -	spin_lock(&fc->lock);
> > > > -	list_add_tail(&dmap->busy_list, &fc->busy_ranges);
> > > > -	fc->nr_busy_ranges++;
> > > > -	spin_unlock(&fc->lock);
> > > > +	dmap->writable = writable;
> > > > +	if (!upgrade) {
> > > > +		/*
> > > > +		 * We don't take a refernce on inode. inode is valid right now
> > > > +		 * and when inode is going away, cleanup logic should first
> > > > +		 * cleanup dmap entries.
> > > > +		 *
> > > > +		 * TODO: Do we need to ensure that we are holding inode lock
> > > > +		 * as well.
> > > > +		 */
> > > > +		dmap->inode = inode;
> > > > +		dmap->start = offset;
> > > > +		dmap->end = offset + FUSE_DAX_MEM_RANGE_SZ - 1;
> > > > +		/* Protected by fi->i_dmap_sem */
> > > > +		fuse_dax_interval_tree_insert(dmap, &fi->dmap_tree);
> > > > +		fi->nr_dmaps++;
> > > > +		spin_lock(&fc->lock);
> > > > +		list_add_tail(&dmap->busy_list, &fc->busy_ranges);
> > > > +		fc->nr_busy_ranges++;
> > > > +		spin_unlock(&fc->lock);
> > > > +	}
> > > >  	return 0;
> > > >  }
> > > >  
> > > > @@ -1890,6 +1896,7 @@ static int iomap_begin_setup_new_mapping(struct inode *inode, loff_t pos,
> > > >  	struct fuse_conn *fc = get_fuse_conn(inode);
> > > >  	struct fuse_dax_mapping *dmap, *alloc_dmap = NULL;
> > > >  	int ret;
> > > > +	bool writable = flags & IOMAP_WRITE;
> > > >  
> > > >  	/* Can't do reclaim in fault path yet due to lock ordering.
> > > >  	 * Read path takes shared inode lock and that's not sufficient
> > > > @@ -1930,10 +1937,11 @@ static int iomap_begin_setup_new_mapping(struct inode *inode, loff_t pos,
> > > >  
> > > >  	/* Setup one mapping */
> > > >  	ret = fuse_setup_one_mapping(inode,
> > > > -		ALIGN_DOWN(pos, FUSE_DAX_MEM_RANGE_SZ), alloc_dmap);
> > > > +				     ALIGN_DOWN(pos, FUSE_DAX_MEM_RANGE_SZ),
> > > > +				     alloc_dmap, writable, false);
> > > >  	if (ret < 0) {
> > > >  		printk("fuse_setup_one_mapping() failed. err=%d"
> > > > -			" pos=0x%llx\n", ret, pos);
> > > > +			" pos=0x%llx, writable=%d\n", ret, pos, writable);
> > > >  		dmap_add_to_free_pool(fc, alloc_dmap);
> > > >  		up_write(&fi->i_dmap_sem);
> > > >  		return ret;
> > > > @@ -1943,6 +1951,45 @@ static int iomap_begin_setup_new_mapping(struct inode *inode, loff_t pos,
> > > >  	return 0;
> > > >  }
> > > >  
> > > > +static int iomap_begin_upgrade_mapping(struct inode *inode, loff_t pos,
> > > > +					 loff_t length, unsigned flags,
> > > > +					 struct iomap *iomap)
> > > > +{
> > > > +	struct fuse_inode *fi = get_fuse_inode(inode);
> > > > +	struct fuse_dax_mapping *dmap;
> > > > +	int ret;
> > > > +
> > > > +	/*
> > > > +	 * Take exclusive lock so that only one caller can try to setup
> > > > +	 * mapping and others wait.
> > > > +	 */
> > > > +	down_write(&fi->i_dmap_sem);
> > > > +	dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, pos, pos);
> > > > +
> > > > +	/* We are holding either inode lock or i_mmap_sem, and that should
> > > > +	 * ensure that dmap can't reclaimed or truncated and it should still
> > > > +	 * be there in tree despite the fact we dropped and re-acquired the
> > > > +	 * lock.
> > > > +	 */
> > > > +	if (WARN_ON(!dmap))
> > > > +		return -EIO;
> > > 
> > > up_write() is missed here.
> > 
> > Good catch. Will fix it.
> > 
> > > 

[..]
> > > And I posted a similar patch[1] but with inode_lock being dropped in
> > > mind, anything wrong with that patch?
> > 
> > It was an old posting so did not pay much attention to it. Looking at
> > it now and few things I notice.
> > 
> > - That patch will allocate and possibly wait for memory range and free
> >   it up when upgrading from read-only to read-write. That's not required.
> >
> 
> What I was trying to say is that the above assumption about "dmap is still in
> tree despite dropping-acquiring lock" is doubtful if inode lock is not held by
> reclaim code.

In that case we will take reference on dmap before dropping the lock to
make sure dmap is not reclaimed in the time we dropped the lock and
re-acquired it? And that means there is no need to allocate dmap for
upgrade case.

> 
> There is a window between dropping and reacquiring lock, if it gets reclaimed,
> we're supposed to continue on the allocation path.

Thanks
Vivek




More information about the Virtio-fs mailing list