[Cluster-devel] cluster/gfs-kernel/src/gfs inode.c inode.h ops ...
wcheng at sourceware.org
wcheng at sourceware.org
Wed Feb 14 23:15:44 UTC 2007
CVSROOT: /cvs/cluster
Module name: cluster
Branch: RHEL4
Changes by: wcheng at sourceware.org 2007-02-14 23:15:44
Modified files:
gfs-kernel/src/gfs: inode.c inode.h ops_inode.c
Log message:
Final patch for bugzilla 1904745 - there is a deadlock documented in the
bugzilla entry comment #26 that is messy to get around under current
GFS lock order rule. To compromise the issue, we will fail the rename
with the original error return code (-ENOENT), if the new ino has any
possibility to cause deadlock. This implies, if GFS ino number is
allocated via true random manner, we only reduce this error return
code by 25%.
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/gfs-kernel/src/gfs/inode.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.20.2.3&r2=1.20.2.4
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/gfs-kernel/src/gfs/inode.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.3.2.1&r2=1.3.2.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/gfs-kernel/src/gfs/ops_inode.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.6.2.5&r2=1.6.2.6
--- cluster/gfs-kernel/src/gfs/inode.c 2006/10/10 18:35:27 1.20.2.3
+++ cluster/gfs-kernel/src/gfs/inode.c 2007/02/14 23:15:44 1.20.2.4
@@ -335,7 +335,7 @@
/* Read dinode from disk */
error = gfs_copyin_dinode(ip);
- if (error)
+ if (error)
goto fail_iopen;
gfs_glock_hold(i_gl);
@@ -1566,7 +1566,8 @@
*/
int
-gfs_unlink_ok(struct gfs_inode *dip, struct qstr *name, struct gfs_inode *ip)
+gfs_unlink_ok(struct gfs_inode *dip, struct qstr *name,
+ struct gfs_inode *ip, struct gfs_inum *inump)
{
struct gfs_inum inum;
unsigned int type;
@@ -1589,11 +1590,22 @@
return error;
error = gfs_dir_search(dip, name, &inum, &type);
- if (error)
+ if (error)
return error;
- if (inum.no_formal_ino != ip->i_num.no_formal_ino)
+ if (inum.no_formal_ino != ip->i_num.no_formal_ino) {
+ /*
+ * Add for rename - bugzilla 190475
+ * return new inum such that gfs_rename can
+ * do its silly rename.
+ */
+ if (unlikely(inump)) {
+ inump->no_formal_ino = inum.no_formal_ino;
+ inump->no_addr = inum.no_addr;
+ }
+
return -ENOENT;
+ }
if (ip->i_di.di_type != type) {
gfs_consist_inode(dip);
--- cluster/gfs-kernel/src/gfs/inode.h 2007/01/16 20:39:03 1.3.2.1
+++ cluster/gfs-kernel/src/gfs/inode.h 2007/02/14 23:15:44 1.3.2.2
@@ -37,7 +37,7 @@
int gfs_unlinki(struct gfs_inode *dip, struct qstr *name, struct gfs_inode *ip);
int gfs_rmdiri(struct gfs_inode *dip, struct qstr *name, struct gfs_inode *ip);
int gfs_unlink_ok(struct gfs_inode *dip, struct qstr *name,
- struct gfs_inode *ip);
+ struct gfs_inode *ip, struct gfs_inum *inum);
int gfs_ok_to_move(struct gfs_inode *this, struct gfs_inode *to);
int gfs_readlinki(struct gfs_inode *ip, char **buf, unsigned int *len);
--- cluster/gfs-kernel/src/gfs/ops_inode.c 2007/01/18 20:40:12 1.6.2.5
+++ cluster/gfs-kernel/src/gfs/ops_inode.c 2007/02/14 23:15:44 1.6.2.6
@@ -537,7 +537,7 @@
if (error)
goto fail;
- error = gfs_unlink_ok(dip, &dentry->d_name, ip);
+ error = gfs_unlink_ok(dip, &dentry->d_name, ip, NULL);
if (error)
goto fail_gunlock;
@@ -772,7 +772,7 @@
if (error)
goto fail;
- error = gfs_unlink_ok(dip, &dentry->d_name, ip);
+ error = gfs_unlink_ok(dip, &dentry->d_name, ip, NULL);
if (error)
goto fail_gunlock;
@@ -914,6 +914,75 @@
return 0;
}
+/*
+ * Obtain a clone dentry with correct inode:
+ *
+ * This is to work around the window between lock_rename()
+ * and gfs_rename() where another node may have renamed the
+ * target file (bugzilla 190475). The bug most likely occurs
+ * in a mail server environment.
+ *
+ * Return true if the dentry has been successfully updated.
+ */
+static int
+gfs_refresh_dentry(struct gfs_sbd *sdp, struct dentry **f_dentry,
+ struct gfs_inum *inump)
+{
+ struct inode *inode;
+ struct dentry *dentry, *sdentry=*f_dentry;
+
+ dentry = d_alloc(sdentry->d_parent, &sdentry->d_name);
+ if (!dentry)
+ return 0;
+
+ /* Get the fully updated inode */
+ inode = NULL;
+ if (inump->no_formal_ino) {
+ inode = gfs_refresh_iobj(sdp, inump, NULL);
+ if (IS_ERR(inode)) {
+ dput(dentry);
+ return 0;
+ }
+ }
+
+ /*
+ * Remove the old dentry from cache to avoid
+ * another lookup.
+ */
+ if (!d_unhashed(sdentry)) {
+ d_drop(sdentry);
+ }
+
+ /* Instantiate the new dentry */
+ dentry->d_op = &gfs_dops;
+
+ d_splice_alias(inode, dentry);
+
+ /* Return with success */
+ *f_dentry = dentry;
+ return 1;
+}
+
+/*
+ * Check lock order for the new ino:
+ * return 1: if it could lead to deadlock
+ * return 0: if ordering is ok.
+ */
+
+static int
+gfs_check_lock_order(struct gfs_holder *ghs, int n_gh, uint64_t ino)
+{
+ int x;
+
+ /* no fancy algorithm here since max n_gh is 4 */
+ for (x = 0; x < n_gh; x++) {
+ if (ino < ghs[x].gh_gl->gl_name.ln_number)
+ return 1;
+ }
+
+ return 0;
+}
+
/**
* gfs_rename - Rename a file
* @odir: Parent directory of old file name
@@ -935,17 +1004,21 @@
struct gfs_sbd *sdp = odip->i_sbd;
struct qstr name;
struct gfs_alloc *al;
- struct gfs_holder ghs[4], r_gh;
+ struct gfs_holder ghs[5], r_gh;
unsigned int num_gh;
int dir_rename = FALSE;
int alloc_required;
unsigned int x;
int error;
+ int once_cnt=0, has_ndentry=0;
+ struct gfs_inum new_inum;
atomic_inc(&sdp->sd_ops_inode);
gfs_unlinked_limit(sdp);
+ odir->i_sb->s_type->fs_flags &= ~FS_ODD_RENAME;
+
if (ndentry->d_inode) {
nip = vn2ip(ndentry->d_inode);
if (ip == nip)
@@ -968,36 +1041,78 @@
goto fail;
}
- num_gh = 1;
+ num_gh = 1;
gfs_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
if (odip != ndip) {
gfs_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs+num_gh);
num_gh++;
}
+
gfs_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs+num_gh);
num_gh++;
-
if (nip) {
gfs_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
num_gh++;
}
error = gfs_glock_nq_m(num_gh, ghs);
- if (error)
- goto fail_uninit;
+ if (error)
+ goto fail_uninit;
/* Check out the old directory */
- error = gfs_unlink_ok(odip, &odentry->d_name, ip);
+ error = gfs_unlink_ok(odip, &odentry->d_name, ip, NULL);
if (error)
goto fail_gunlock;
+gfs_rename_retry:
+
/* Check out the new directory */
if (nip) {
- error = gfs_unlink_ok(ndip, &ndentry->d_name, nip);
- if (error)
+ /* Zero out this field so we can know whether gfs_unlink_ok
+ * could fill in the correct inode numbers.
+ */
+ new_inum.no_formal_ino = 0;
+ error = gfs_unlink_ok(ndip, &ndentry->d_name, nip, &new_inum);
+
+ if (unlikely(error)) {
+ /*
+ * Handle a rare race condition that the new file
+ * could have been renamed by another node - the
+ * correct inode number is passed into new_inum.
+ * We only allow this to happen once.
+ */
+ if ((error == -ENOENT) && (!once_cnt)
+ && (new_inum.no_formal_ino))
+ {
+ if (gfs_check_lock_order(ghs, num_gh,
+ new_inum.no_formal_ino))
+ /* fail rename with -ENOENT */
+ goto fail_gunlock;
+
+ if (gfs_refresh_dentry(sdp,&ndentry,&new_inum))
+ {
+ odir->i_sb->s_type->fs_flags
+ |= FS_ODD_RENAME;
+ has_ndentry = 1;
+ nip = vn2ip(ndentry->d_inode);
+ if (nip) {
+ once_cnt++;
+ error = gfs_glock_nq_init(nip->i_gl,
+ LM_ST_EXCLUSIVE, 0,
+ &ghs[num_gh++]);
+ if (error) {
+ num_gh--;
+ error = -ENOENT;
+ }
+ else
+ goto gfs_rename_retry;
+ }
+ }
+ }
goto fail_gunlock;
+ }
if (nip->i_di.di_type == GFS_FILE_DIR) {
if (nip->i_di.di_entries < 2) {
@@ -1150,6 +1265,14 @@
if (dir_rename)
gfs_glock_dq_uninit(&r_gh);
+
+ /* dput new dentry as vfs layer still uses old entry */
+ if (unlikely(has_ndentry)) {
+ d_move(odentry, ndentry);
+ dput(ndentry);
+ dput(ndentry);
+ }
+
return 0;
fail_end_trans:
@@ -1178,6 +1301,12 @@
if (dir_rename)
gfs_glock_dq_uninit(&r_gh);
+ if (unlikely(has_ndentry)) {
+ d_move(odentry, ndentry);
+ dput(ndentry);
+ dput(ndentry);
+ }
+
return error;
}
More information about the Cluster-devel
mailing list