diff -Naur cvs_112004am/cluster/gfs-kernel/src/gfs/gfs_ondisk.h build_112004am/cluster/gfs-kernel/src/gfs/gfs_ondisk.h --- cvs_112004am/cluster/gfs-kernel/src/gfs/gfs_ondisk.h 2004-11-18 18:36:33.000000000 -0500 +++ build_112004am/cluster/gfs-kernel/src/gfs/gfs_ondisk.h 2004-11-22 15:40:32.100588888 -0500 @@ -381,7 +381,8 @@ #define GFS_DIF_DIRECTIO (0x00000010) #define GFS_DIF_IMMUTABLE (0x00000020) /* Can't change file */ #define GFS_DIF_APPENDONLY (0x00000040) /* Can only add to end of file */ -#define GFS_DIF_NOATIME (0x00000080) /* Don't update access time */ +#define GFS_DIF_NOATIME (0x00000080) /* Don't update access time + * (currently unused/ignored) */ #define GFS_DIF_SYNC (0x00000100) /* Flush to disk, don't cache */ #define GFS_DIF_INHERIT_DIRECTIO (0x40000000) /* new files get DIRECTIO flag */ #define GFS_DIF_INHERIT_JDATA (0x80000000) /* new files get JDATA flag */ diff -Naur cvs_112004am/cluster/gfs-kernel/src/gfs/inode.c build_112004am/cluster/gfs-kernel/src/gfs/inode.c --- cvs_112004am/cluster/gfs-kernel/src/gfs/inode.c 2004-11-18 18:36:33.000000000 -0500 +++ build_112004am/cluster/gfs-kernel/src/gfs/inode.c 2004-11-19 15:50:16.000000000 -0500 @@ -1626,12 +1626,16 @@ } /** - * gfs_glock_nq_atime - Acquire the glock and conditionally update the atime on an inode + * gfs_glock_nq_atime - Acquire a hold on an inode's glock, and + * conditionally update the inode's atime * @gh: the holder to acquire * - * Tests atime for gfs_read, gfs_readdir and gfs_test_mmap - * Update if the difference between the current time and the current atime - * is greater than an interval specified at mount (or default). + * Tests atime (access time) for gfs_read, gfs_readdir and gfs_test_mmap + * Update if the difference between the current time and the inode's current + * atime is greater than an interval specified at mount (or default). + * + * Will not update if GFS mounted NOATIME (this is *the* place where NOATIME + * has an effect) or Read-Only. 
* * Returns: 0 on success, -EXXX on error */ @@ -1654,6 +1658,7 @@ ip = gl2ip(gl); GFS_ASSERT_GLOCK(ip, gl,); + /* Save original request state of lock holder */ state = gh->gh_state; flags = gh->gh_flags; @@ -1667,8 +1672,10 @@ curtime = get_seconds(); if (curtime - ip->i_di.di_atime >= quantum) { + /* Check state of glock after lock holder request granted */ int was_exclusive = (gl->gl_state == LM_ST_EXCLUSIVE); + /* Get EX hold (force EX glock via !ANY) to write the dinode */ gfs_glock_dq(gh); gfs_holder_reinit(LM_ST_EXCLUSIVE, gh->gh_flags & ~LM_FLAG_ANY, @@ -1677,7 +1684,7 @@ if (error) return error; - /* Verify this hasn't been updated while we were + /* Verify that atime hasn't been updated while we were trying to get exclusive lock. */ curtime = get_seconds(); @@ -1703,6 +1710,8 @@ gfs_trans_end(sdp); } + /* If glock satisfied the caller's hold request at less than EX, + force glock demotion to EXACTly hold request state */ if (!was_exclusive) { gfs_glock_dq(gh); flags &= ~LM_FLAG_ANY; @@ -1725,10 +1734,17 @@ } /** - * glock_compare_atime - Compare two struct gfs_glock structures for sorting + * glock_compare_atime - Compare two struct gfs_glock structures for gfs_sort() * @arg_a: the first structure * @arg_b: the second structure * + * Sort order determined by (in order of priority): + * -- lock number + * -- lock state (SHARED > EXCLUSIVE or GL_ATIME, which can demand EXCLUSIVE) + * + * Returns: 1 if A > B + * -1 if A < B + * 0 if A = B */ static int @@ -1757,11 +1773,15 @@ } /** - * gfs_glock_nq_m_atime - acquire multiple glocks where one may need an atime update + * gfs_glock_nq_m_atime - acquire multiple glocks where one may need an + * atime update * @num_gh: the number of structures * @ghs: an array of struct gfs_holder structures * - * Returns: 0 on success (all glocks acquired), -EXXX on failure (no glocks acquired) + * This blocks until all hold requests are satisfied. 
+ * + * Returns: 0 on success (all glocks acquired), + * -EXXX on failure (no glocks acquired) */ int diff -Naur cvs_112004am/cluster/gfs-kernel/src/gfs/ops_file.c build_112004am/cluster/gfs-kernel/src/gfs/ops_file.c --- cvs_112004am/cluster/gfs-kernel/src/gfs/ops_file.c 2004-11-18 18:36:33.000000000 -0500 +++ build_112004am/cluster/gfs-kernel/src/gfs/ops_file.c 2004-11-19 15:09:02.000000000 -0500 @@ -183,7 +183,7 @@ } /** - * walk_vma - Walk the vmas associated with a buffer for read or write. + * walk_vm - Walk the vmas associated with a buffer for read or write. * If any of them are gfs, pass the gfs inode down to the read/write * worker function so that locks can be acquired in the correct order. * @file: The file to read/write from/to diff -Naur cvs_112004am/cluster/gfs-kernel/src/gfs/ops_fstype.c build_112004am/cluster/gfs-kernel/src/gfs/ops_fstype.c --- cvs_112004am/cluster/gfs-kernel/src/gfs/ops_fstype.c 2004-10-27 16:39:38.000000000 -0400 +++ build_112004am/cluster/gfs-kernel/src/gfs/ops_fstype.c 2004-11-22 15:45:38.715976264 -0500 @@ -41,9 +41,28 @@ * gfs_read_super - Read in superblock * @sb: The VFS superblock * @data: Mount options - * @silent: Don't complain if its not a GFS filesystem + * @silent: Don't complain if it's not a GFS filesystem * * Returns: errno + * + * After cross-linking Linux VFS incore superblock and our GFS incore superblock + * (filesystem instance structures) to one another, we: + * -- Init some of our GFS incore superblock, including some temporary + * block-size values (enough to read on-disk superblock). + * -- Set up some things in Linux VFS superblock. + * -- Mount a lock module, init glock system (incl. glock reclaim daemons), + * and init some important inter-node locks (MOUNT, LIVE, SuperBlock). + * -- Read-in the GFS on-disk superblock (1st time, to get enough info + * to do filesystem upgrade and journal replay, incl. journal index). + * -- Upgrade on-disk filesystem format (rarely needed). 
+ * -- Replay journal(s) (always; replay *all* journals if we're first-to-mount). + * -- Read-in on-disk superblock and journal index special file again (2nd time, + * assumed 100% valid now after journal replay). + * -- Read-in info on other special (hidden) files (root inode, resource index, + * quota inode, license inode). + * -- Start other daemons (journal/log recovery, log tail, quota updates, inode + * reclaim) for periodic maintenance. + * */ static int @@ -126,26 +145,30 @@ goto fail_vfree; } - /* Copy out mount flags */ + /* Copy VFS mount flags */ if (sb->s_flags & (MS_NOATIME | MS_NODIRATIME)) set_bit(SDF_NOATIME, &sdp->sd_flags); if (sb->s_flags & MS_RDONLY) set_bit(SDF_ROFS, &sdp->sd_flags); - /* Setup up Virtual Super Block */ + /* Set up Linux Virtual (VFS) Super Block */ sb->s_magic = GFS_MAGIC; sb->s_op = &gfs_super_ops; sb->s_export_op = &gfs_export_ops; + + /* Don't let the VFS update atimes. GFS handles this itself. */ sb->s_flags |= MS_NOATIME | MS_NODIRATIME; sb->s_maxbytes = MAX_LFS_FILESIZE; + /* If we were mounted with -o acl (to support POSIX access control + lists), tell VFS */ if (sdp->sd_args.ar_posix_acls) sb->s_flags |= MS_POSIXACL; - /* Set up the buffer cache and fill in some fake values - to allow us to read in the superblock. */ + /* Set up the buffer cache and fill in some fake block size values + to allow us to read-in the on-disk superblock. 
*/ sdp->sd_sb.sb_bsize = sb_min_blocksize(sb, GFS_BASIC_BLOCK); sdp->sd_sb.sb_bsize_shift = sb->s_blocksize_bits; @@ -154,6 +177,8 @@ GFS_ASSERT_SBD(sizeof(struct gfs_sb) <= sdp->sd_sb.sb_bsize, sdp,); + /* Mount an inter-node lock module, check for local optimizations */ + error = gfs_mount_lockproto(sdp, silent); if (error) goto fail_vfree; @@ -191,6 +216,7 @@ wait_for_completion(&sdp->sd_thread_completion); } + /* Only one node may mount at a time */ error = gfs_glock_nq_num(sdp, GFS_MOUNT_LOCK, &gfs_nondisk_glops, LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE, @@ -201,6 +227,7 @@ goto fail_glockd; } + /* Show that cluster is alive */ error = gfs_glock_nq_num(sdp, GFS_LIVE_LOCK, &gfs_nondisk_glops, LM_ST_SHARED, LM_FLAG_NOEXP | GL_EXACT, @@ -213,6 +240,9 @@ sdp->sd_live_gh.gh_owner = NULL; + /* Read the SuperBlock from disk, get enough info to enable us + to read-in the journal index and replay all journals. */ + error = gfs_glock_nq_num(sdp, GFS_SB_LOCK, &gfs_meta_glops, (sdp->sd_args.ar_upgrade) ? LM_ST_EXCLUSIVE : LM_ST_SHARED, @@ -230,7 +260,8 @@ goto fail_gunlock_sb; } - /* Set up the buffer cache and SB for real */ + /* Set up the buffer cache and SB for real, now that we know block + sizes, version #s, locations of important on-disk inodes, etc. */ error = -EINVAL; if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) { @@ -251,7 +282,7 @@ sb_set_blocksize(sb, sdp->sd_sb.sb_bsize); - /* Read in journal index inode */ + /* Read-in journal index inode (but not the file contents, yet) */ error = gfs_get_jiinode(sdp); if (error) { @@ -262,7 +293,8 @@ init_MUTEX(&sdp->sd_jindex_lock); - /* Get a handle on the transaction glock */ + /* Get a handle on the transaction glock; we need this for disk format + upgrade and journal replays, as well as normal operation. 
*/ error = gfs_glock_get(sdp, GFS_TRANS_LOCK, &gfs_trans_glops, CREATE, &sdp->sd_trans_gl); @@ -270,7 +302,7 @@ goto fail_ji_free; set_bit(GLF_STICKY, &sdp->sd_trans_gl->gl_flags); - /* Upgrade version numbers if we need to */ + /* Upgrade GFS on-disk format version numbers if we need to */ if (sdp->sd_args.ar_upgrade) { error = gfs_do_upgrade(sdp, sb_gh.gh_gl); @@ -278,7 +310,7 @@ goto fail_trans_gl; } - /* Load in the journal index */ + /* Load in the journal index special file */ error = gfs_jindex_hold(sdp, &ji_gh); if (error) { @@ -287,6 +319,8 @@ goto fail_trans_gl; } + /* Discover this node's journal number (lock module tells us + which one to use), and lock it */ error = -EINVAL; if (sdp->sd_lockstruct.ls_jid >= sdp->sd_journals) { printk("GFS: fsid=%s: can't mount journal #%u\n", @@ -309,6 +343,9 @@ } if (sdp->sd_lockstruct.ls_first) { + /* We're first node within cluster to mount this filesystem, + replay ALL of the journals, then let lock module know + that we're done. */ for (x = 0; x < sdp->sd_journals; x++) { error = gfs_recover_journal(sdp, x, sdp->sd_jindex + x, @@ -323,8 +360,10 @@ sdp->sd_lockstruct.ls_ops->lm_others_may_mount(sdp->sd_lockstruct.ls_lockspace); sdp->sd_lockstruct.ls_first = FALSE; } else { + /* We're not the first; replay only our own journal. */ error = gfs_recover_journal(sdp, - sdp->sd_lockstruct.ls_jid, &sdp->sd_jdesc, + sdp->sd_lockstruct.ls_jid, + &sdp->sd_jdesc, TRUE); if (error) { printk("GFS: fsid=%s: error recovering my journal: %d\n", @@ -340,7 +379,10 @@ sdp->sd_journal_gh.gh_owner = NULL; - /* Drop our cache and reread all the things we read before the replay. */ + /* Drop our buffer cache and reread all the things we read before + the journal replay, on the unlikely chance that the replay might + have affected (corrected/updated) the superblock contents + or journal index. 
*/ error = gfs_read_sb(sdp, sb_gh.gh_gl, FALSE); if (error) { @@ -370,7 +412,7 @@ } } - /* Start up the recover thread */ + /* Start up the journal recovery thread */ error = kernel_thread(gfs_recoverd, sdp, 0); if (error < 0) { @@ -421,7 +463,7 @@ gfs_glock_dq_uninit(&sb_gh); super = FALSE; - /* Get the inode/dentry */ + /* Get the root inode/dentry */ inode = gfs_iget(sdp->sd_rooti, CREATE); if (!inode) { diff -Naur cvs_112004am/cluster/gfs-kernel/src/gfs/ops_super.c build_112004am/cluster/gfs-kernel/src/gfs/ops_super.c --- cvs_112004am/cluster/gfs-kernel/src/gfs/ops_super.c 2004-10-27 16:39:38.000000000 -0400 +++ build_112004am/cluster/gfs-kernel/src/gfs/ops_super.c 2004-11-19 09:50:15.000000000 -0500 @@ -334,7 +334,7 @@ error = gfs_make_fs_rw(sdp); } - /* Don't let the VFS update atimes. */ + /* Don't let the VFS update atimes. GFS handles this itself. */ *flags |= MS_NOATIME | MS_NODIRATIME; return error; diff -Naur cvs_112004am/cluster/gfs-kernel/src/gfs/super.c build_112004am/cluster/gfs-kernel/src/gfs/super.c --- cvs_112004am/cluster/gfs-kernel/src/gfs/super.c 2004-11-18 18:36:33.000000000 -0500 +++ build_112004am/cluster/gfs-kernel/src/gfs/super.c 2004-11-22 15:59:35.930700280 -0500 @@ -471,7 +471,8 @@ * @ji_gh: the holder for the jindex glock * * This makes sure that we're using the latest copy of the journal index - * special file, which might have been updated if someone added journals + * special file (this describes all of the journals for this filesystem), + * which might have been updated if someone added journals * (via gfs_jadd utility). 
* * This is very similar to the gfs_rindex_hold() function, except that @@ -507,10 +508,13 @@ } /** - * gfs_get_jiinode - Read in the jindex inode for the superblock + * gfs_get_jiinode - Read in the special (hidden) journal index inode * @sdp: The GFS superblock * * Returns: 0 on success, error code otherwise + * + * This reads in just the dinode, not the special file contents that describe + * the journals themselves (see gfs_jindex_hold()). */ int @@ -540,10 +544,13 @@ } /** - * gfs_get_riinode - Read in the rindex inode for the superblock + * gfs_get_riinode - Read in the special (hidden) resource group index inode * @sdp: The GFS superblock * * Returns: 0 on success, error code otherwise + * + * This reads in just the dinode, not the special file contents that describe + * the resource groups themselves (see gfs_rindex_hold()). */ int @@ -573,7 +580,7 @@ } /** - * gfs_get_rootinode - Read in the root inode + * gfs_get_rootinode - Read in the filesystem's root inode * @sdp: The GFS superblock * * Returns: 0 on success, error code otherwise