[Virtio-fs] [fuse-devel] 'FORGET' ordering semantics (vs unlink & NFS)

Vivek Goyal vgoyal at redhat.com
Tue Jan 5 15:42:41 UTC 2021


On Tue, Jan 05, 2021 at 12:24:48PM +0100, Miklos Szeredi wrote:
> On Mon, Jan 4, 2021 at 7:57 PM Dr. David Alan Gilbert
> <dgilbert at redhat.com> wrote:
> >
> > * Vivek Goyal (vgoyal at redhat.com) wrote:
> > > On Mon, Jan 04, 2021 at 04:00:13PM +0000, Dr. David Alan Gilbert wrote:
> > > > Hi,
> > > >   On virtio-fs we're hitting a problem with NFS, where
> > > > unlinking a file in a directory and then rmdir'ing that
> > > > directory fails complaining about the directory not being empty.
> > > >
> > > > The problem here is that if a file has an open fd, NFS doesn't
> > > > actually delete the file on unlink, it just renames it to
> > > > a hidden file (e.g. .nfs*******).  That open file is there because
> > > > the 'FORGET' hasn't completed yet by the time the rmdir is issued.
> > > >
> > > > Question:
> > > >   a) In the FUSE protocol, are requests assumed to complete in order;
> > > > i.e.  unlink, forget, rmdir   is it required that 'forget' completes
> > > > before the rmdir is processed?
> > > >      (In virtiofs we've been processing requests, in parallel, and
> > > > have sent forgets down a separate queue to keep them out of the way).
> > > >
> > > >   b) 'forget' doesn't send a reply - so the kernel can't wait for the
> > > > client to have finished it;  do we need a synchronous forget here?
> > >
> > > Even if we introduce a synchronous forget, will that really fix the
> > > issue. For example, this could also happen if file has been unlinked
> > > but it is still open and directory is being removed.
> > >
> > > fd = open(foo/bar.txt)
> > > unlink foo/bar.txt
> > > rmdir foo
> > > close(fd).
> > >
> > > In this case, final forget should go after fd has been closed. Its
> > > not a forget race.
> > >
> > > I wrote a test case for this and it works on regular file systems.
> > >
> > > https://github.com/rhvgoyal/misc/blob/master/virtiofs-tests/rmdir.c
> > >
> > > I suspect it will fail on nfs because I am assuming that temporary
> > > file will be there till final close(fd) happens. If that's the
> > > case this is a NFS specific issue because its behavior is different
> > > from other file systems.
> >
> > That's true; but that's NFS just being NFS; in our case we're keeping
> > an fd open even though the guest has been smart enough not to; so we're
> > causing the NFS oddity when it wouldn't normally happen.
> 
> Can't think of anything better than a synchronous forget.   Compile
> only tested patch attached.
> 
> Thanks,
> Miklos

> diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
> index 78f9f209078c..daa4e669441d 100644
> --- a/fs/fuse/dir.c
> +++ b/fs/fuse/dir.c
> @@ -373,6 +373,26 @@ static struct vfsmount *fuse_dentry_automount(struct path *path)
>  	return ERR_PTR(err);
>  }
>  
> +static void fuse_dentry_iput(struct dentry *dentry, struct inode *inode)
> +{
> +	if (!__lockref_is_dead(&dentry->d_lockref)) {
> +		/*
> +		 * This is an unlink/rmdir and removing the last ref to the
> +		 * dentry.  Use synchronous FORGET in case filesystem requests
> +		 * it.
> +		 *
> +		 * FIXME: This is racy!  Two or more instances of
> +		 * fuse_dentry_iput() could be running concurrently (unlink of
> +		 * several aliases in different directories).
> +		 */
> +		set_bit(FUSE_I_SYNC_FORGET, &get_fuse_inode(inode)->state);
> +		iput(inode);
> +		clear_bit(FUSE_I_SYNC_FORGET, &get_fuse_inode(inode)->state);

Hi Miklos,

Can above iput() be final reference on inode and free the inode? If yes,
then it is not safe to call clear_bit() after iput()?

Vivek

> +	} else {
> +		iput(inode);
> +	}
> +}
> +
>  const struct dentry_operations fuse_dentry_operations = {
>  	.d_revalidate	= fuse_dentry_revalidate,
>  	.d_delete	= fuse_dentry_delete,
> @@ -381,6 +401,7 @@ const struct dentry_operations fuse_dentry_operations = {
>  	.d_release	= fuse_dentry_release,
>  #endif
>  	.d_automount	= fuse_dentry_automount,
> +	.d_iput		= fuse_dentry_iput,
>  };
>  
>  const struct dentry_operations fuse_root_dentry_operations = {
> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> index 7c4b8cb93f9f..0820b7a63ca7 100644
> --- a/fs/fuse/fuse_i.h
> +++ b/fs/fuse/fuse_i.h
> @@ -174,6 +174,8 @@ enum {
>  	FUSE_I_SIZE_UNSTABLE,
>  	/* Bad inode */
>  	FUSE_I_BAD,
> +	/* Synchronous forget requested */
> +	FUSE_I_SYNC_FORGET,
>  };
>  
>  struct fuse_conn;
> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> index b0e18b470e91..a49ff30d1ecc 100644
> --- a/fs/fuse/inode.c
> +++ b/fs/fuse/inode.c
> @@ -115,6 +115,26 @@ static void fuse_free_inode(struct inode *inode)
>  	kmem_cache_free(fuse_inode_cachep, fi);
>  }
>  
> +static void fuse_sync_forget(struct inode *inode)
> +{
> +	struct fuse_mount *fm = get_fuse_mount(inode);
> +	struct fuse_inode *fi = get_fuse_inode(inode);
> +	struct fuse_forget_in inarg;
> +	FUSE_ARGS(args);
> +
> +	memset(&inarg, 0, sizeof(inarg));
> +	inarg.nlookup = fi->nlookup;
> +	args.opcode = FUSE_SYNC_FORGET;
> +	args.nodeid = fi->nodeid;
> +	args.in_numargs = 1;
> +	args.in_args[0].size = sizeof(inarg);
> +	args.in_args[0].value = &inarg;
> +	args.force = true;
> +
> +	fuse_simple_request(fm, &args);
> +	/* ignore errors */
> +}
> +
>  static void fuse_evict_inode(struct inode *inode)
>  {
>  	struct fuse_inode *fi = get_fuse_inode(inode);
> @@ -127,9 +147,13 @@ static void fuse_evict_inode(struct inode *inode)
>  		if (FUSE_IS_DAX(inode))
>  			fuse_dax_inode_cleanup(inode);
>  		if (fi->nlookup) {
> -			fuse_queue_forget(fc, fi->forget, fi->nodeid,
> -					  fi->nlookup);
> -			fi->forget = NULL;
> +			if (test_bit(FUSE_I_SYNC_FORGET, &fi->state)) {
> +				fuse_sync_forget(inode);
> +			} else {
> +				fuse_queue_forget(fc, fi->forget, fi->nodeid,
> +						  fi->nlookup);
> +				fi->forget = NULL;
> +			}
>  		}
>  	}
>  	if (S_ISREG(inode->i_mode) && !fuse_is_bad(inode)) {
> diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
> index 98ca64d1beb6..cfcf95cfde76 100644
> --- a/include/uapi/linux/fuse.h
> +++ b/include/uapi/linux/fuse.h
> @@ -499,6 +499,7 @@ enum fuse_opcode {
>  	FUSE_COPY_FILE_RANGE	= 47,
>  	FUSE_SETUPMAPPING	= 48,
>  	FUSE_REMOVEMAPPING	= 49,
> +	FUSE_SYNC_FORGET	= 50,
>  
>  	/* CUSE specific operations */
>  	CUSE_INIT		= 4096,




More information about the Virtio-fs mailing list