[Cluster-devel] cluster/gfs-kernel/src/gfs inode.c inode.h ops ...

Wed Feb 14 23:15:44 UTC 2007

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	wcheng at sourceware.org	2007-02-14 23:15:44

Modified files:
	gfs-kernel/src/gfs: inode.c inode.h ops_inode.c 

Log message:
	Final patch for bugzilla 1904745 - there is a deadlock documented in the
	bugzilla entry comment #26 that is messy to get around under current
	GFS lock order rule. To compromise the issue, we will fail the rename
	with the original error return code (-ENOENT), if the new ino has any
	possibility to cause deadlock. This implies, if GFS ino number is
	allocated via true random manner, we only reduce this error return
	code by 25%.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/gfs-kernel/src/gfs/inode.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.20.2.3&r2=1.20.2.4
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/gfs-kernel/src/gfs/inode.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.3.2.1&r2=1.3.2.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/gfs-kernel/src/gfs/ops_inode.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.6.2.5&r2=1.6.2.6

--- cluster/gfs-kernel/src/gfs/inode.c	2006/10/10 18:35:27	1.20.2.3
+++ cluster/gfs-kernel/src/gfs/inode.c	2007/02/14 23:15:44	1.20.2.4
@@ -335,7 +335,7 @@
 
 	/* Read dinode from disk */
 	error = gfs_copyin_dinode(ip);
-	if (error)
+	if (error) 
 		goto fail_iopen;
 
 	gfs_glock_hold(i_gl);
@@ -1566,7 +1566,8 @@
  */
 
 int
-gfs_unlink_ok(struct gfs_inode *dip, struct qstr *name, struct gfs_inode *ip)
+gfs_unlink_ok(struct gfs_inode *dip, struct qstr *name, 
+	struct gfs_inode *ip, struct gfs_inum *inump)
 {
 	struct gfs_inum inum;
 	unsigned int type;
@@ -1589,11 +1590,22 @@
 		return error;
 
 	error = gfs_dir_search(dip, name, &inum, &type);
-	if (error)
+	if (error) 
 		return error;
 
-	if (inum.no_formal_ino != ip->i_num.no_formal_ino)
+	if (inum.no_formal_ino != ip->i_num.no_formal_ino) {
+		/*
+		 * Add for rename - bugzilla 190475
+		 *   return new inum such that gfs_rename can
+		 *   do its silly rename.
+		 */
+		if (unlikely(inump)) { 
+			inump->no_formal_ino = inum.no_formal_ino;
+			inump->no_addr = inum.no_addr;
+		}
+			
 		return -ENOENT;
+	}
 
 	if (ip->i_di.di_type != type) {
 		gfs_consist_inode(dip);
--- cluster/gfs-kernel/src/gfs/inode.h	2007/01/16 20:39:03	1.3.2.1
+++ cluster/gfs-kernel/src/gfs/inode.h	2007/02/14 23:15:44	1.3.2.2
@@ -37,7 +37,7 @@
 int gfs_unlinki(struct gfs_inode *dip, struct qstr *name, struct gfs_inode *ip);
 int gfs_rmdiri(struct gfs_inode *dip, struct qstr *name, struct gfs_inode *ip);
 int gfs_unlink_ok(struct gfs_inode *dip, struct qstr *name,
-		  struct gfs_inode *ip);
+		  struct gfs_inode *ip, struct gfs_inum *inum);
 int gfs_ok_to_move(struct gfs_inode *this, struct gfs_inode *to);
 int gfs_readlinki(struct gfs_inode *ip, char **buf, unsigned int *len);
 
--- cluster/gfs-kernel/src/gfs/ops_inode.c	2007/01/18 20:40:12	1.6.2.5
+++ cluster/gfs-kernel/src/gfs/ops_inode.c	2007/02/14 23:15:44	1.6.2.6
@@ -537,7 +537,7 @@
 	if (error)
 		goto fail;
 
-	error = gfs_unlink_ok(dip, &dentry->d_name, ip);
+	error = gfs_unlink_ok(dip, &dentry->d_name, ip, NULL);
 	if (error)
 		goto fail_gunlock;
 
@@ -772,7 +772,7 @@
 	if (error)
 		goto fail;
 
-	error = gfs_unlink_ok(dip, &dentry->d_name, ip);
+	error = gfs_unlink_ok(dip, &dentry->d_name, ip, NULL);
 	if (error)
 		goto fail_gunlock;
 
@@ -914,6 +914,75 @@
 	return 0;
 }
 
+/*
+ * Obtain a clone dentry with correct inode:
+ *
+ * This is to work around the window between lock_rename() 
+ * and gfs_rename() where another node may have renamed the 
+ * target file (bugzilla 190475). The bug most likely occurs
+ * in a mail server environment.
+ *
+ * Return true if the dentry has been successfully updated.
+ */
+static int
+gfs_refresh_dentry(struct gfs_sbd *sdp, struct dentry **f_dentry, 
+	struct gfs_inum *inump) 
+{
+	struct inode *inode;
+	struct dentry *dentry, *sdentry=*f_dentry;
+
+	dentry = d_alloc(sdentry->d_parent, &sdentry->d_name);
+	if (!dentry) 
+		return 0;
+
+	/* Get the fully updated inode */
+	inode = NULL;
+	if (inump->no_formal_ino) {
+		inode = gfs_refresh_iobj(sdp, inump, NULL);
+		if (IS_ERR(inode)) { 
+			dput(dentry);
+			return 0;
+		}
+	} 
+
+	/* 
+	 * Remove the old dentry from cache to avoid 
+	 * another lookup.
+	 */
+	if (!d_unhashed(sdentry)) {
+		d_drop(sdentry);
+	}
+
+	/* Instantiate the new dentry */
+	dentry->d_op = &gfs_dops;
+
+	d_splice_alias(inode, dentry);
+
+	/* Return with success */
+	*f_dentry = dentry;
+	return 1;
+}
+
+/* 
+ * Check lock order for the new ino: 
+ * return 1: if it could lead to deadlock
+ * return 0: if ordering is ok.
+ */ 
+
+static int
+gfs_check_lock_order(struct gfs_holder *ghs, int n_gh, uint64_t ino)
+{
+	int x;
+
+	/* no fancy algorithm here since max n_gh is 4 */
+	for (x = 0; x < n_gh; x++) {
+		if (ino < ghs[x].gh_gl->gl_name.ln_number) 
+			return 1;
+	}
+
+	return 0;
+}
+
 /**
  * gfs_rename - Rename a file
  * @odir: Parent directory of old file name
@@ -935,17 +1004,21 @@
 	struct gfs_sbd *sdp = odip->i_sbd;
 	struct qstr name;
 	struct gfs_alloc *al;
-	struct gfs_holder ghs[4], r_gh;
+	struct gfs_holder ghs[5], r_gh;
 	unsigned int num_gh;
 	int dir_rename = FALSE;
 	int alloc_required;
 	unsigned int x;
 	int error;
+	int once_cnt=0, has_ndentry=0;
+	struct gfs_inum new_inum;
 
 	atomic_inc(&sdp->sd_ops_inode);
 
 	gfs_unlinked_limit(sdp);
 
+	odir->i_sb->s_type->fs_flags &= ~FS_ODD_RENAME;
+
 	if (ndentry->d_inode) {
 		nip = vn2ip(ndentry->d_inode);
 		if (ip == nip)
@@ -968,36 +1041,78 @@
 			goto fail;
 	}
 
-	num_gh = 1;
+	num_gh  = 1;
 	gfs_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
 	if (odip != ndip) {
 		gfs_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs+num_gh);
 		num_gh++;
 	}
+
 	gfs_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs+num_gh);
 	num_gh++;
-
 	if (nip) {
 		gfs_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
 		num_gh++;
 	}
 
 	error = gfs_glock_nq_m(num_gh, ghs);
-	if (error)
-		goto fail_uninit;
+        if (error)
+                goto fail_uninit;
 
 	/*  Check out the old directory  */
 
-	error = gfs_unlink_ok(odip, &odentry->d_name, ip);
+	error = gfs_unlink_ok(odip, &odentry->d_name, ip, NULL);
 	if (error)
 		goto fail_gunlock;
 
+gfs_rename_retry:
+
 	/*  Check out the new directory  */
 
 	if (nip) {
-		error = gfs_unlink_ok(ndip, &ndentry->d_name, nip);
-		if (error)
+		/* Zero out this field so we can know whether gfs_unlink_ok 
+		 * could fill in the correct inode numbers.
+		 */
+		new_inum.no_formal_ino = 0;
+		error = gfs_unlink_ok(ndip, &ndentry->d_name, nip, &new_inum);
+
+		if (unlikely(error)) {
+			/* 
+			 * Handle a rare race condition that the new file
+			 * could have been renamed by another node - the
+			 * correct inode number is passed into new_inum. 
+			 * We only allow this to happen once.
+			 */
+			if ((error == -ENOENT) && (!once_cnt) 
+					&& (new_inum.no_formal_ino)) 
+			{
+				if (gfs_check_lock_order(ghs, num_gh, 
+						new_inum.no_formal_ino)) 
+					/* fail rename with -ENOENT */
+					goto fail_gunlock;
+
+				if (gfs_refresh_dentry(sdp,&ndentry,&new_inum))
+				{
+					odir->i_sb->s_type->fs_flags 
+							|= FS_ODD_RENAME;
+					has_ndentry = 1;
+					nip = vn2ip(ndentry->d_inode);
+					if (nip) {
+					   once_cnt++;
+					   error = gfs_glock_nq_init(nip->i_gl,
+							LM_ST_EXCLUSIVE, 0,
+							&ghs[num_gh++]);
+					   if (error) { 
+						num_gh--;
+						error = -ENOENT;
+					   }
+					   else
+					   	goto gfs_rename_retry;
+					}
+				}
+			}
 			goto fail_gunlock;
+		}
 
 		if (nip->i_di.di_type == GFS_FILE_DIR) {
 			if (nip->i_di.di_entries < 2) {
@@ -1150,6 +1265,14 @@
 	if (dir_rename)
 		gfs_glock_dq_uninit(&r_gh);
 
+
+	/* dput new dentry as vfs layer still uses old entry */
+	if (unlikely(has_ndentry)) {
+		d_move(odentry, ndentry);
+		dput(ndentry);
+		dput(ndentry);
+	}
+
 	return 0;
 
  fail_end_trans:
@@ -1178,6 +1301,12 @@
 	if (dir_rename)
 		gfs_glock_dq_uninit(&r_gh);
 
+	if (unlikely(has_ndentry)) {
+		d_move(odentry, ndentry);
+		dput(ndentry);
+		dput(ndentry);
+	}
+
 	return error;
 }