[Linux-cluster] FW: [PATCH] More comments for GFS files
Cahill, Ben M
ben.m.cahill at intel.com
Mon Sep 27 21:42:36 UTC 2004
-----Original Message-----
From: Cahill, Ben M
Sent: Thursday, September 23, 2004 4:12 PM
To: RedHat Cluster (linux-cluster at redhat.com)
Subject: [PATCH] More comments for GFS files
Hi all,
Below please find a patch for more comments in some files in
gfs-kernel/src/gfs:
dio.c
file.c
gfs_ioctl.c
incore.h
log.c
lops.c
lvb.h
rgrp.c
The focus was on incore.h.
These were diffed against Thursday's CVS, and I've built and run GFS
after applying the patches, so things should hopefully apply cleanly.
-- Ben --
Opinions are mine, not Intel's
diff -ru cvs/cluster/gfs-kernel/src/gfs/dio.c
build_092304/cluster/gfs-kernel/src/gfs/dio.c
--- cvs/cluster/gfs-kernel/src/gfs/dio.c 2004-06-24
04:53:27.000000000 -0400
+++ build_092304/cluster/gfs-kernel/src/gfs/dio.c 2004-09-23
14:18:00.229937128 -0400
@@ -1078,6 +1078,9 @@
* gfs_sync_meta - sync all the buffers in a filesystem
* @sdp: the filesystem
*
+ * Flush metadata blocks to on-disk journal, then
+ * Flush metadata blocks (now in AIL) to on-disk in-place locations
+ * Periodically keep checking until done (AIL empty)
*/
void
diff -ru cvs/cluster/gfs-kernel/src/gfs/file.c
build_092304/cluster/gfs-kernel/src/gfs/file.c
--- cvs/cluster/gfs-kernel/src/gfs/file.c 2004-06-24
04:53:27.000000000 -0400
+++ build_092304/cluster/gfs-kernel/src/gfs/file.c 2004-09-23
14:18:09.964457256 -0400
@@ -199,15 +199,18 @@
char **p = (char **)buf;
int error = 0;
+ /* the dinode block always gets journaled */
if (bh->b_blocknr == ip->i_num.no_addr) {
GFS_ASSERT_INODE(!new, ip,);
gfs_trans_add_bh(ip->i_gl, bh);
memcpy(bh->b_data + offset, *p, size);
+ /* data blocks get journaled only for special files */
} else if (gfs_is_jdata(ip)) {
gfs_trans_add_bh(ip->i_gl, bh);
memcpy(bh->b_data + offset, *p, size);
if (new)
gfs_buffer_clear_ends(bh, offset, size, TRUE);
+ /* non-journaled data blocks get written to in-place disk blocks
*/
} else {
memcpy(bh->b_data + offset, *p, size);
if (new)
@@ -240,11 +243,13 @@
char **p = (char **)buf;
int error = 0;
+ /* the dinode block always gets journaled */
if (bh->b_blocknr == ip->i_num.no_addr) {
GFS_ASSERT_INODE(!new, ip,);
gfs_trans_add_bh(ip->i_gl, bh);
if (copy_from_user(bh->b_data + offset, *p, size))
error = -EFAULT;
+ /* data blocks get journaled only for special files */
} else if (gfs_is_jdata(ip)) {
gfs_trans_add_bh(ip->i_gl, bh);
if (copy_from_user(bh->b_data + offset, *p, size))
@@ -254,6 +259,7 @@
if (error)
memset(bh->b_data + offset, 0, size);
}
+ /* non-journaled data blocks get written to in-place disk blocks
*/
} else {
if (copy_from_user(bh->b_data + offset, *p, size))
error = -EFAULT;
diff -ru cvs/cluster/gfs-kernel/src/gfs/gfs_ioctl.h
build_092304/cluster/gfs-kernel/src/gfs/gfs_ioctl.h
--- cvs/cluster/gfs-kernel/src/gfs/gfs_ioctl.h 2004-09-13
18:48:45.000000000 -0400
+++ build_092304/cluster/gfs-kernel/src/gfs/gfs_ioctl.h 2004-09-23
13:32:21.518284584 -0400
@@ -131,18 +131,21 @@
unsigned int gt_demote_secs;
unsigned int gt_incore_log_blocks;
unsigned int gt_jindex_refresh_secs;
+
+ /* how often various daemons run (seconds) */
unsigned int gt_depend_secs;
- unsigned int gt_scand_secs;
- unsigned int gt_recoverd_secs;
- unsigned int gt_logd_secs;
- unsigned int gt_quotad_secs;
- unsigned int gt_inoded_secs;
- unsigned int gt_quota_simul_sync;
- unsigned int gt_quota_warn_period;
+ unsigned int gt_scand_secs; /* find unused glocks and
inodes */
+ unsigned int gt_recoverd_secs; /* recover journal of crashed
node */
+ unsigned int gt_logd_secs; /* update log tail as AIL
flushes */
+ unsigned int gt_quotad_secs; /* sync changes to quota file,
clean */
+ unsigned int gt_inoded_secs; /* toss unused inodes */
+
+ unsigned int gt_quota_simul_sync; /* max # quotavals to sync at
once */
+ unsigned int gt_quota_warn_period; /* secs between quota warn
msgs */
unsigned int gt_atime_quantum;
- unsigned int gt_quota_quantum;
- unsigned int gt_quota_scale_num;
- unsigned int gt_quota_scale_den;
+ unsigned int gt_quota_quantum; /* secs between syncs to quota
file */
+ unsigned int gt_quota_scale_num; /* numerator */
+ unsigned int gt_quota_scale_den; /* denominator */
unsigned int gt_quota_enforce;
unsigned int gt_quota_account;
unsigned int gt_new_files_jdata;
diff -ru cvs/cluster/gfs-kernel/src/gfs/incore.h
build_092304/cluster/gfs-kernel/src/gfs/incore.h
--- cvs/cluster/gfs-kernel/src/gfs/incore.h 2004-09-13
18:48:45.000000000 -0400
+++ build_092304/cluster/gfs-kernel/src/gfs/incore.h 2004-09-23
14:58:06.330154296 -0400
@@ -11,20 +11,28 @@
************************************************************************
*******
************************************************************************
******/
+/*
+ * In-core (memory/RAM) structures.
+ * These do not appear on-disk. See gfs_ondisk.h for on-disk
structures.
+ */
+
#ifndef __INCORE_DOT_H__
#define __INCORE_DOT_H__
+/* flags used in function call parameters */
+
#define DIO_NEW (0x00000001)
-#define DIO_FORCE (0x00000002)
-#define DIO_CLEAN (0x00000004)
-#define DIO_DIRTY (0x00000008)
-#define DIO_START (0x00000010)
-#define DIO_WAIT (0x00000020)
-#define DIO_METADATA (0x00000040)
-#define DIO_DATA (0x00000080)
+#define DIO_FORCE (0x00000002) /* force read of block from
disk */
+#define DIO_CLEAN (0x00000004) /* don't write to disk */
+#define DIO_DIRTY (0x00000008) /* data changed, must write to
disk */
+#define DIO_START (0x00000010) /* start disk read or write */
+#define DIO_WAIT (0x00000020) /* wait for disk r/w to
complete */
+
+#define DIO_METADATA (0x00000040) /* process glock's protected
metadata */
+#define DIO_DATA (0x00000080) /* process glock's protected
filedata */
#define DIO_INVISIBLE (0x00000100)
-#define DIO_CHECK (0x00000200)
-#define DIO_ALL (0x00000400)
+#define DIO_CHECK (0x00000200) /* make sure glock's AIL is
empty */
+#define DIO_ALL (0x00000400) /* flush all AIL transactions
to disk */
/* Structure prototypes */
@@ -98,6 +106,7 @@
void (*lo_after_scan) (struct gfs_sbd * sdp, unsigned int jid,
unsigned int pass);
+ /* type of element (glock/buf/unlinked/quota) */
char *lo_name;
};
@@ -107,227 +116,351 @@
*/
struct gfs_log_element {
- struct gfs_log_operations *le_ops;
+ struct gfs_log_operations *le_ops; /* vector of functions */
- struct gfs_trans *le_trans;
- struct list_head le_list;
+ struct gfs_trans *le_trans; /* we're part of this
transaction */
+ struct list_head le_list; /* link to transaction's element
list */
};
+/*
+ * Meta-header cache structure.
+ * One for each metadata block that we've read from disk and are still
using.
+ * In-core superblock structure hosts the actual cache.
+ * Also, each resource group keeps a list of cached blocks within its
scope.
+ */
struct gfs_meta_header_cache {
- struct list_head mc_list_hash;
- struct list_head mc_list_single;
- struct list_head mc_list_rgd;
+ /* Links to various lists */
+ struct list_head mc_list_hash; /* superblock's hashed list */
+ struct list_head mc_list_single; /* superblock's single list */
+ struct list_head mc_list_rgd; /* resource group's list */
- uint64_t mc_block;
- struct gfs_meta_header mc_mh;
+ uint64_t mc_block; /* block # (in-place address)
*/
+ struct gfs_meta_header mc_mh; /* payload: the block's
meta-header */
};
+/*
+ * Dependency cache structure.
+ * In-core superblock structure hosts the actual cache.
+ * Also, each resource group keeps a list of dependency blocks within
its scope.
+ */
struct gfs_depend {
- struct list_head gd_list_hash;
- struct list_head gd_list_rgd;
+ /* Links to various lists */
+ struct list_head gd_list_hash; /* superblock's hashed list */
+ struct list_head gd_list_rgd; /* resource group's list */
- struct gfs_rgrpd *gd_rgd;
- uint64_t gd_formal_ino;
- unsigned long gd_time;
+ struct gfs_rgrpd *gd_rgd; /* resource group descriptor */
+ uint64_t gd_formal_ino; /* inode ID */
+ unsigned long gd_time; /* time (jiffies) when put on
list */
};
/*
- * Structure containing information about the allocation bitmaps.
- * There are one of these for each fs block that the bitmap for
- * the resource group header covers.
+ * Block allocation bitmap descriptor structure.
+ * One of these for each fs block that contains bitmap data
+ * (i.e. the resource group header blocks and their following bitmap
blocks).
+ * Each allocatable fs data block is represented by 2 bits (4 alloc
states).
*/
struct gfs_bitmap {
- uint32_t bi_offset; /* The offset in the buffer of the first
byte */
- uint32_t bi_start; /* The position of the first byte in
this block */
- uint32_t bi_len; /* The number of bytes in this block */
+ uint32_t bi_offset; /* Byte offset of bitmap within this bit
block
+ (non-zero only for an rgrp header block)
*/
+ uint32_t bi_start; /* Data block (rgrp scope, 32-bit)
represented
+ by the first bit-pair in this bit block
*/
+ uint32_t bi_len; /* The number of bitmap bytes in this bit
block */
};
/*
- * Structure containing information Resource Groups
+ * Resource Group (Rgrp) descriptor structure.
+ * There is one of these for each resource (block) group in the fs.
+ * The filesystem is divided into a number of resource groups to allow
+ * simultaneous block alloc operations by a number of nodes.
*/
struct gfs_rgrpd {
- struct list_head rd_list; /* Link with superblock */
- struct list_head rd_list_mru;
- struct list_head rd_recent; /* Recently used rgrps */
+ /* Links to superblock lists */
+ struct list_head rd_list; /* on-disk-order list of all
rgrps */
+ struct list_head rd_list_mru; /* Most Recently Used list of
all rgs */
+ struct list_head rd_recent; /* recently used rgrps */
- struct gfs_glock *rd_gl; /* Glock for rgrp */
+ struct gfs_glock *rd_gl; /* Glock for this rgrp */
- unsigned long rd_flags;
+ unsigned long rd_flags; /* ?? */
- struct gfs_rindex rd_ri; /* Resource Index structure */
- struct gfs_rgrp rd_rg; /* Resource Group structure */
- uint64_t rd_rg_vn;
+ struct gfs_rindex rd_ri; /* Resource Index (on-disk)
structure */
+ struct gfs_rgrp rd_rg; /* Resource Group (on-disk)
structure */
+ uint64_t rd_rg_vn; /* version #: if != glock's
gl_vn,
+ we need to read rgrp fm disk
*/
- struct gfs_bitmap *rd_bits;
- struct buffer_head **rd_bh;
+ /* Block alloc bitmap cache */
+ struct gfs_bitmap *rd_bits; /* Array of block bitmap
descriptors */
+ struct buffer_head **rd_bh; /* Array of ptrs to block bitmap
bh's */
- uint32_t rd_last_alloc_data;
- uint32_t rd_last_alloc_meta;
+ /* Block allocation strategy, rgrp scope. Start at these blocks
when
+ * searching for next data/meta block to alloc */
+ uint32_t rd_last_alloc_data; /* most recent data block
allocated */
+ uint32_t rd_last_alloc_meta; /* most recent meta block
allocated */
- struct list_head rd_mhc;
- struct list_head rd_depend;
+ struct list_head rd_mhc; /* cached meta-headers for this
rgrp */
+ struct list_head rd_depend; /* dependency elements */
- struct gfs_sbd *rd_sbd;
+ struct gfs_sbd *rd_sbd; /* fs incore superblock (fs
instance) */
};
/*
* Per-buffer data
+ * One of these is attached as GFS private data to each fs block's
buffer_head.
+ * These also link into the Active Items Lists (AIL) (buffers flushed
to
+ * on-disk log, but not yet flushed to on-disk in-place locations)
attached
+ * to transactions and glocks.
*/
struct gfs_bufdata {
- struct buffer_head *bd_bh; /* struct buffer_head which this
struct belongs to */
- struct gfs_glock *bd_gl; /* Pointer to Glock struct for
this bh */
+ struct buffer_head *bd_bh; /* we belong to this Linux
buffer_head */
+ struct gfs_glock *bd_gl; /* this glock protects buffer's
payload */
struct gfs_log_element bd_new_le;
struct gfs_log_element bd_incore_le;
- char *bd_frozen;
- struct semaphore bd_lock;
+ char *bd_frozen; /* "frozen" copy of buffer's data */
+ struct semaphore bd_lock; /* protects access to this structure
*/
- unsigned int bd_pinned; /* Pin count */
- struct list_head bd_ail_tr_list; /* List of buffers
hanging off tr_ail_bufs */
- struct list_head bd_ail_gl_list; /* List of buffers
hanging off gl_ail_bufs */
+ /* "pin" means keep buffer in RAM, don't write to disk (yet) */
+ unsigned int bd_pinned; /* recursive pin count */
+ struct list_head bd_ail_tr_list; /* link to transaction's AIL
list */
+ struct list_head bd_ail_gl_list; /* link to glock's AIL list */
};
/*
* Glock operations
+ * One set of operations for each glock, the set selected by type of
glock.
+ * These functions get called at various points in a glock's lifetime.
+ * "xmote" = promote (lock) a glock at inter-node level.
+ * "th" = top half, "bh" = bottom half
*/
struct gfs_glock_operations {
+
+ /* before acquiring a lock at inter-node level */
void (*go_xmote_th) (struct gfs_glock * gl, unsigned int state,
int flags);
+
+ /* after acquiring a lock at inter-node level */
void (*go_xmote_bh) (struct gfs_glock * gl);
+
+ /* before releasing a lock at inter-node level, calls go_sync
*/
void (*go_drop_th) (struct gfs_glock * gl);
+
+ /* after releasing a lock at inter-node level, calls go_inval
*/
void (*go_drop_bh) (struct gfs_glock * gl);
+
+ /* sync dirty data to disk before releasing an inter-node lock
+ * (another node needs to read the updated data from disk) */
void (*go_sync) (struct gfs_glock * gl, int flags);
+
+ /* invalidate local data just after releasing an inter-node lock
+ * (another node may change the on-disk data, so it's no good to
us) */
void (*go_inval) (struct gfs_glock * gl, int flags);
+
+ /* lock-type-specific check to see if it's okay to unlock a
glock */
int (*go_demote_ok) (struct gfs_glock * gl);
+
+ /* after locking at local process level */
int (*go_lock) (struct gfs_glock * gl, int flags);
+
+ /* before unlocking at local process level */
void (*go_unlock) (struct gfs_glock * gl, int flags);
+
+ /* after receiving a callback: another node needs the lock */
void (*go_callback) (struct gfs_glock * gl, unsigned int state);
+
void (*go_greedy) (struct gfs_glock * gl);
- int go_type;
+
+ /* lock type: locks with same lock # (usually an fs block #),
+ * but different types, are different locks */
+ int go_type; /* glock type */
};
-/* Actions */
-#define HIF_MUTEX (0)
-#define HIF_PROMOTE (1)
-#define HIF_DEMOTE (2)
-#define HIF_GREEDY (3)
+/*
+ * Glock holder structure
+ * These coordinate the use, within this node, of an acquired
inter-node lock.
+ * One for each holder of a glock. A glock may be shared within a
node by
+ * several processes, or even by several recursive requests from the
same
+ * process. Each is a separate "holder". To be shared locally, the
glock
+ * must be in "SHARED" or "DEFERRED" state at inter-node level,
which means
+ * that processes on other nodes might also read the protected
entity.
+ * When a process needs to manipulate a lock, it requests it via one
of
+ * these holder structures. If the request cannot be satisfied
immediately,
+ * the holder structure gets queued on one of these glock lists:
+ * 1) waiters1, for gaining exclusive access to the glock structure.
+ * 2) waiters2, for locking (promoting) or unlocking (demoting) a
lock.
+ * This may require changing lock state at inter-node level.
+ * When holding a lock, gfs_holder struct stays on glock's holder
list.
+ * See gfs-kernel/src/harness/lm_interface.h for gh_state (LM_ST_...)
+ * and gh_flags (LM_FLAG...) fields.
+ * Also see glock.h for gh_flags field (GL_...) flags.
+ */
+/* Action requests */
+#define HIF_MUTEX (0) /* exclusive access to glock struct */
+#define HIF_PROMOTE (1) /* change lock to more restrictive state
*/
+#define HIF_DEMOTE (2) /* change lock to less restrictive state
*/
+#define HIF_GREEDY (3)
/* States */
-#define HIF_ALLOCED (4)
-#define HIF_DEALLOC (5)
-#define HIF_HOLDER (6)
-#define HIF_FIRST (7)
-#define HIF_WAKEUP (8)
-#define HIF_RECURSE (9)
+#define HIF_ALLOCED (4) /* holder structure is or was in use */
+#define HIF_DEALLOC (5) /* holder structure no longer in use */
+#define HIF_HOLDER (6) /* we have been granted a hold on the lock
*/
+#define HIF_FIRST (7) /* we are first on glock's holder list */
+#define HIF_WAKEUP (8) /* wake us up when request is satisfied */
+#define HIF_RECURSE (9) /* recursive locks on same glock by same
process */
struct gfs_holder {
- struct list_head gh_list;
+ struct list_head gh_list; /* link to one of glock's holder
lists */
- struct gfs_glock *gh_gl;
- struct task_struct *gh_owner;
- unsigned int gh_state;
- int gh_flags;
-
- int gh_error;
- unsigned long gh_iflags;
- struct completion gh_wait;
+ struct gfs_glock *gh_gl; /* glock that we're holding */
+ struct task_struct *gh_owner; /* Linux process that is the
holder */
+
+ /* request to change lock state */
+ unsigned int gh_state; /* LM_ST_... requested lock state
*/
+ int gh_flags; /* GL_... or LM_FLAG_... req
modifiers */
+
+ int gh_error; /* GLR_... CANCELLED or TRYFAILED
*/
+ unsigned long gh_iflags; /* HIF_... see above */
+ struct completion gh_wait; /* wait for completion of ... */
};
/*
* Glock Structure
- */
-
-#define GLF_PLUG (0)
-#define GLF_LOCK (1)
-#define GLF_STICKY (2)
+ * One for each inter-node lock held by this node.
+ * A glock is a local representation/abstraction of an inter-node
lock.
+ * Inter-node locks are managed by a "lock module" which plugs in to
the
+ * lock harness / glock interface (see gfs-kernel/harness).
Different
+ * lock modules support different lock protocols (e.g. GULM, GDLM,
no_lock).
+ * A glock may have one or more holders within a node. See gfs_holder
above.
+ * Glocks are managed within a hash table hosted by the in-core
superblock.
+ * After all holders have released a glock, it will stay in the hash
table
+ * cache for a certain time (gt_prefetch_secs), during which the
inter-node
+ * lock will not be released unless another node needs the lock.
This
+ * provides better performance in case this node needs the glock
again soon.
+ * Each glock has an associated vector of lock-type-specific "glops"
functions
+ * which are called at important times during the life of a glock,
and
+ * which define the type of lock (e.g. dinode, rgrp, non-disk, etc).
+ * See gfs_glock_operations above.
+ * A glock, at inter-node scope, is identified by the following
dimensions:
+ * 1) lock number (usually a block # for on-disk protected
entities,
+ * or a fixed assigned number for non-disk locks, e.g.
MOUNT).
+ * 2) lock type (actually, the type of entity protected by the
lock).
+ * 3) lock namespace, to support multiple GFS filesystems
simultaneously.
+ * Namespace (usually cluster:filesystem) is specified when
mounting.
+ * See man page for gfs_mount.
+ * Glocks require support of Lock Value Blocks (LVBs) by the
inter-node lock
+ * manager. LVBs are small (32-byte) chunks of data associated with
a given
+ * lock, that can be quickly shared between cluster nodes. Used for
certain
+ * purposes such as sharing an rgroup's block usage statistics
without
+ * requiring the overhead of:
+ * -- sync-to-disk by one node, then a
+ * -- read from disk by another node.
+ *
+ */
+
+#define GLF_PLUG (0) /* dummy */
+#define GLF_LOCK (1) /* exclusive access to glock
structure */
+#define GLF_STICKY (2) /* permanent lock, used sparingly
*/
#define GLF_PREFETCH (3)
#define GLF_SYNC (4)
#define GLF_DIRTY (5)
-#define GLF_LVB_INVALID (6)
+#define GLF_LVB_INVALID (6) /* LVB does not contain valid data
*/
#define GLF_SKIP_WAITERS2 (7)
#define GLF_GREEDY (8)
struct gfs_glock {
- struct list_head gl_list;
- unsigned long gl_flags;
- struct lm_lockname gl_name;
- atomic_t gl_count;
-
- spinlock_t gl_spin;
-
- unsigned int gl_state;
- struct list_head gl_holders;
- struct list_head gl_waiters1; /* HIF_MUTEX */
- struct list_head gl_waiters2; /* HIF_DEMOTE, HIF_GREEDY */
- struct list_head gl_waiters3; /* HIF_PROMOTE */
+ struct list_head gl_list; /* link to superblock's hash table
*/
+ unsigned long gl_flags; /* GLF_... see above */
+ struct lm_lockname gl_name; /* lock number and lock type */
+ atomic_t gl_count; /* recursive access/usage count */
+
+ spinlock_t gl_spin; /* protects some members of this
struct */
+
+ /* lock state reflects inter-node manager's lock state */
+ unsigned int gl_state; /* LM_ST_... see
harness/lm_interface.h */
+
+ /* lists of gfs_holders */
+ struct list_head gl_holders; /* all current holders of the
glock */
+ struct list_head gl_waiters1; /* wait for excl. access to glock
struct*/
+ struct list_head gl_waiters2; /* HIF_DEMOTE, HIF_GREEDY */
+ struct list_head gl_waiters3; /* HIF_PROMOTE */
- struct gfs_glock_operations *gl_ops;
+ struct gfs_glock_operations *gl_ops; /* function vector, defines
type */
struct gfs_holder *gl_req_gh;
gfs_glop_bh_t gl_req_bh;
- lm_lock_t *gl_lock;
- char *gl_lvb;
- atomic_t gl_lvb_count;
-
- uint64_t gl_vn;
- unsigned long gl_stamp;
- void *gl_object;
+ lm_lock_t *gl_lock; /* lock module's private lock data */
+ char *gl_lvb; /* Lock Value Block */
+ atomic_t gl_lvb_count; /* LVB recursive usage (hold/unhold)
count */
+
+ uint64_t gl_vn; /* incremented when protected data
changes */
+ unsigned long gl_stamp; /* glock cache retention timer */
+ void *gl_object; /* the protected entity (e.g. a
dinode) */
struct gfs_log_element gl_new_le;
struct gfs_log_element gl_incore_le;
- struct gfs_gl_hash_bucket *gl_bucket;
- struct list_head gl_reclaim;
+ struct gfs_gl_hash_bucket *gl_bucket; /* our bucket in hash
table */
+ struct list_head gl_reclaim; /* link to "reclaim" list
*/
- struct gfs_sbd *gl_sbd;
+ struct gfs_sbd *gl_sbd; /* superblock (fs
instance) */
- struct inode *gl_aspace;
- struct list_head gl_dirty_buffers;
- struct list_head gl_ail_bufs;
+ struct inode *gl_aspace; /* Linux VFS inode */
+ struct list_head gl_dirty_buffers; /* ?? */
+ struct list_head gl_ail_bufs; /* AIL buffers protected
by us */
};
/*
* In-Place Reservation structure
+ * Coordinates allocation of "in-place" (as opposed to journal) fs
blocks,
+ * which contain persistent inode/file/directory data and metadata.
+ * These blocks are the allocatable blocks within resource groups
(i.e.
+ * not including rgrp header and block alloc bitmap blocks).
+ * gfs_inplace_reserve() calculates a fulfillment plan for allocating
blocks,
+ * based on block statistics in the resource group headers.
+ * Then, gfs_blkalloc() or gfs_metaalloc() walks the block alloc
bitmaps
+ * to do the actual allocation.
*/
struct gfs_alloc {
- /* Quota stuff */
-
- unsigned int al_qd_num;
- struct gfs_quota_data *al_qd[4];
- struct gfs_holder al_qd_ghs[4];
-
- /* Filled in by the caller to gfs_inplace_reserve() */
-
- uint32_t al_requested_di;
- uint32_t al_requested_meta;
- uint32_t al_requested_data;
-
- /* Filled in by gfs_inplace_reserve() */
-
- char *al_file;
- unsigned int al_line;
- struct gfs_holder al_ri_gh;
- struct gfs_holder al_rgd_gh;
- struct gfs_rgrpd *al_rgd;
- uint32_t al_reserved_meta;
- uint32_t al_reserved_data;
-
- /* Filled in by gfs_blkalloc() */
-
- uint32_t al_alloced_di;
- uint32_t al_alloced_meta;
- uint32_t al_alloced_data;
+ /*
+ * Up to 4 quotas (including an inode's user and group quotas)
+ * can track changes in block allocation
+ */
+
+ unsigned int al_qd_num; /* # of quotas tracking changes
*/
+ struct gfs_quota_data *al_qd[4]; /* ptrs to quota structures */
+ struct gfs_holder al_qd_ghs[4]; /* holders for quota glocks */
+
+ /* Request, filled in by the caller to gfs_inplace_reserve() */
+
+ uint32_t al_requested_di; /* number of dinodes to reserve */
+ uint32_t al_requested_meta; /* number of metadata blocks to
reserve */
+ uint32_t al_requested_data; /* number of data blocks to
reserve */
+
+ /* Fulfillment plan, filled in by gfs_inplace_reserve() */
+
+ char *al_file; /* debug info, .c file making
request */
+ unsigned int al_line; /* debug info, line of code making
req */
+ struct gfs_holder al_ri_gh; /* glock holder for resource grp
index */
+ struct gfs_holder al_rgd_gh; /* glock holder for al_rgd rgrp */
+ struct gfs_rgrpd *al_rgd; /* resource group from which to
alloc */
+ uint32_t al_reserved_meta; /* alloc this # meta blocks from
al_rgd */
+ uint32_t al_reserved_data; /* alloc this # data blocks from
al_rgd */
+
+ /* Actual alloc, filled in by gfs_blkalloc()/gfs_metaalloc(),
etc. */
+
+ uint32_t al_alloced_di; /* # dinode blocks allocated */
+ uint32_t al_alloced_meta; /* # meta blocks allocated */
+ uint32_t al_alloced_data; /* # data blocks allocated */
/* Dinode allocation crap */
- struct gfs_unlinked *al_ul;
+ struct gfs_unlinked *al_ul; /* unlinked dinode log entry */
};
/*
@@ -339,27 +472,32 @@
#define GIF_SW_PAGED (2)
struct gfs_inode {
- struct gfs_inum i_num;
+ struct gfs_inum i_num; /* formal inode # and block address */
- atomic_t i_count;
- unsigned long i_flags;
+ atomic_t i_count; /* recursive usage (get/put) count */
+ unsigned long i_flags; /* GIF_... see above */
- uint64_t i_vn;
- struct gfs_dinode i_di;
+ uint64_t i_vn; /* version #: if different from glock's
vn,
+ we need to read inode from disk */
+ struct gfs_dinode i_di; /* dinode (on-disk) structure */
- struct gfs_glock *i_gl;
- struct gfs_sbd *i_sbd;
- struct inode *i_vnode;
+ struct gfs_glock *i_gl; /* this glock protects this inode */
+ struct gfs_sbd *i_sbd; /* superblock (fs instance structure)
*/
+ struct inode *i_vnode; /* Linux VFS inode structure */
- struct gfs_holder i_iopen_gh;
+ struct gfs_holder i_iopen_gh; /* glock holder for # inode opens
lock */
- struct gfs_alloc *i_alloc;
- uint64_t i_last_rg_alloc;
+ /* block allocation strategy, inode scope */
+ struct gfs_alloc *i_alloc; /* in-place block reservation
structure */
+ uint64_t i_last_rg_alloc; /* most recent block alloc was fm this
rgrp */
- struct task_struct *i_creat_task;
- pid_t i_creat_pid;
+ /* Linux process that originally created this inode */
+ struct task_struct *i_creat_task; /* Linux "current" task struct
*/
+ pid_t i_creat_pid; /* Linux process ID
current->pid */
- spinlock_t i_lock;
+ spinlock_t i_lock; /* protects this structure */
+
+ /* cache of most-recently used buffers in indirect addressing
chain */
struct buffer_head *i_cache[GFS_MAX_META_HEIGHT];
unsigned int i_greedy;
@@ -378,8 +516,8 @@
struct semaphore f_fl_lock;
struct gfs_holder f_fl_gh;
- struct gfs_inode *f_inode;
- struct file *f_vfile;
+ struct gfs_inode *f_inode; /* incore GFS inode */
+ struct file *f_vfile; /* Linux file struct */
};
/*
@@ -393,112 +531,143 @@
#define ULF_LOCK (4)
struct gfs_unlinked {
- struct list_head ul_list;
- unsigned int ul_count;
+ struct list_head ul_list; /* link to superblock's
sd_unlinked_list */
+ unsigned int ul_count; /* usage count */
- struct gfs_inum ul_inum;
- unsigned long ul_flags;
+ struct gfs_inum ul_inum; /* formal inode #, block addr */
+ unsigned long ul_flags; /* ULF_... */
- struct gfs_log_element ul_new_le;
- struct gfs_log_element ul_incore_le;
- struct gfs_log_element ul_ondisk_le;
+ struct gfs_log_element ul_new_le; /* new, not yet committed
*/
+ struct gfs_log_element ul_incore_le; /* committed to incore log
*/
+ struct gfs_log_element ul_ondisk_le; /* committed to ondisk log
*/
};
/*
* Quota log element
+ * One for each logged change in a block alloc value affecting a given
quota.
+ * Only one of these for a given quota within a given transaction;
+ * multiple changes, within one transaction, for a given quota will
be
+ * combined into one log element.
*/
struct gfs_quota_le {
- struct gfs_log_element ql_le;
+ /* Log element maps us to a particular set of log operations
functions,
+ * and to a particular transaction */
+ struct gfs_log_element ql_le; /* generic log element
structure */
- struct gfs_quota_data *ql_data;
- struct list_head ql_data_list;
+ struct gfs_quota_data *ql_data; /* the quota we're changing */
+ struct list_head ql_data_list; /* link to quota's log element
list */
- int64_t ql_change;
+ int64_t ql_change; /* # of blocks alloc'd (+) or freed
(-) */
};
-#define QDF_USER (0)
-#define QDF_OD_LIST (1)
-#define QDF_LOCK (2)
+/*
+ * Quota structure
+ * One for each user or group quota.
+ * Summarizes all block allocation activity for a given quota, and
supports
+ * recording updates of current block alloc values in GFS' special
quota
+ * file, including the journaling of these updates, encompassing
+ * multiple transactions and log dumps.
+ */
+
+#define QDF_USER (0) /* user (1) vs. group (0) quota
*/
+#define QDF_OD_LIST (1) /* waiting for sync to quota file
*/
+#define QDF_LOCK (2) /* protects access to this
structure */
struct gfs_quota_data {
- struct list_head qd_list;
- unsigned int qd_count;
+ struct list_head qd_list; /* Link to superblock's
sd_quota_list */
+ unsigned int qd_count; /* usage/reference count */
- uint32_t qd_id;
- unsigned long qd_flags;
+ uint32_t qd_id; /* user or group ID number */
+ unsigned long qd_flags; /* QDF_... */
- struct list_head qd_le_list;
+ /* this list is for non-log-dump transactions */
+ struct list_head qd_le_list; /* List of gfs_quota_le log
elements */
- int64_t qd_change_new;
- int64_t qd_change_ic;
- int64_t qd_change_od;
- int64_t qd_change_sync;
+ /* summary of block alloc changes affecting this quota, in
various
+ * stages of logging & syncing changes to the special quota file
*/
+ int64_t qd_change_new; /* new, not yet committed to in-core
log*/
+ int64_t qd_change_ic; /* committed to in-core log */
+ int64_t qd_change_od; /* committed to on-disk log */
+ int64_t qd_change_sync; /* being synced to the in-place quota
file */
- struct gfs_quota_le qd_ondisk_ql;
- uint64_t qd_sync_gen;
+ struct gfs_quota_le qd_ondisk_ql; /* log element for log dump */
+ uint64_t qd_sync_gen; /* sync-to-quota-file generation #
*/
- struct gfs_glock *qd_gl;
- struct gfs_quota_lvb qd_qb;
+ /* glock provides protection for quota, *and* provides
+ * lock value block (LVB) communication, between nodes, of
current
+ * quota values. Shared lock -> LVB read. EX lock -> LVB
write. */
+ struct gfs_glock *qd_gl; /* glock for this quota */
+ struct gfs_quota_lvb qd_qb; /* LVB (limit/warn/value) */
- unsigned long qd_last_warn;
+ unsigned long qd_last_warn; /* jiffies of last warning to user
*/
};
+/*
+ * Log Buffer descriptor structure
+ * One for each fs block buffer recorded in the log
+ */
struct gfs_log_buf {
- struct list_head lb_list;
+ /* link to one of the transaction structure's lists */
+ struct list_head lb_list; /* link to tr_free_bufs or
tr_list */
struct buffer_head lb_bh;
struct buffer_head *lb_unlock;
};
/*
- * Transaction structures
+ * Transaction structure
+ * One for each transaction
+ * This coordinates the logging and flushing of written metadata.
*/
#define TRF_LOG_DUMP (0x00000001)
struct gfs_trans {
- struct list_head tr_list;
+
+ /* link to various lists */
+ struct list_head tr_list; /* superblk's incore trans or AIL
list*/
/* Initial creation stuff */
- char *tr_file;
- unsigned int tr_line;
+ char *tr_file; /* debug info: .c file creating
trans */
+ unsigned int tr_line; /* debug info: codeline creating
trans */
- unsigned int tr_mblks_asked; /* Number of log blocks asked to
be reserved */
- unsigned int tr_eblks_asked;
- unsigned int tr_seg_reserved; /* Number of segments reserved
*/
+ /* reservations for on-disk space in journal */
+ unsigned int tr_mblks_asked; /* # of meta log blocks requested
*/
+ unsigned int tr_eblks_asked; /* # of extra log blocks
requested */
+ unsigned int tr_seg_reserved; /* # of segments actually
reserved */
- struct gfs_holder *tr_t_gh;
+ struct gfs_holder *tr_t_gh; /* glock holder for this
transaction */
/* Stuff filled in during creation */
- unsigned int tr_flags;
- struct list_head tr_elements;
+ unsigned int tr_flags; /* TRF_... */
+ struct list_head tr_elements; /* List of this trans' log
elements */
/* Stuff modified during the commit */
- unsigned int tr_num_free_bufs;
+ unsigned int tr_num_free_bufs; /* List of free gfs_log_buf
structs */
struct list_head tr_free_bufs;
- unsigned int tr_num_free_bmem;
+ unsigned int tr_num_free_bmem; /* List of free fs-block-size
buffers */
struct list_head tr_free_bmem;
- uint64_t tr_log_head; /* The current log head */
- uint64_t tr_first_head; /* First header block */
+ uint64_t tr_log_head; /* The current log head */
+ uint64_t tr_first_head; /* First header block */
- struct list_head tr_bufs; /* List of buffers going to the
log */
+ struct list_head tr_bufs; /* List of buffers going to the
log */
- /* Stuff that's part of the AIL */
+ /* Stuff that's part of the Active Items List (AIL) */
- struct list_head tr_ail_bufs;
+ struct list_head tr_ail_bufs; /* List of buffers on AIL list */
- /* Private data for different log element types */
+ /* # log elements of various types on tr_elements list */
- unsigned int tr_num_gl;
- unsigned int tr_num_buf;
- unsigned int tr_num_iul;
- unsigned int tr_num_ida;
- unsigned int tr_num_q;
+ unsigned int tr_num_gl; /* glocks */
+ unsigned int tr_num_buf; /* buffers */
+ unsigned int tr_num_iul; /* unlinked inodes */
+ unsigned int tr_num_ida; /* de-allocated inodes */
+ unsigned int tr_num_q; /* quotas */
};
/*
@@ -511,153 +680,201 @@
} __attribute__ ((__aligned__(SMP_CACHE_BYTES)));
/*
- * Super Block Data Structure (One per filesystem)
- */
+ * "Super Block" Data Structure
+ * One per mounted filesystem.
+ * This is the big instance structure that ties everything together
for
+ * a given mounted filesystem. Each GFS mount has its own,
supporting
+ * mounts of multiple GFS filesystems on each node.
+ * Pointer to this is usually seen as "sdp" throughout code.
+ * This is a very large structure, as structures go, in part because
it
+ * contains arrays of hash buckets for various in-core caches.
+ */
+
+/* sd_flags */
+
+#define SDF_JOURNAL_LIVE (0) /* journaling is active (fs is
writeable)*/
+
+/* daemon run (1) / stop (0) flags */
+#define SDF_SCAND_RUN (1) /* put unused glocks on reclaim
queue */
+#define SDF_GLOCKD_RUN (2) /* reclaim (dealloc) unused glocks
*/
+#define SDF_RECOVERD_RUN (3) /* recover journal of a crashed
node */
+#define SDF_LOGD_RUN (4) /* update log tail after AIL
flushed */
+#define SDF_QUOTAD_RUN (5) /* sync quota changes to file,
cleanup */
+#define SDF_INODED_RUN (6) /* deallocate unlinked inodes */
+
+/* (re)mount options from Linux VFS */
+#define SDF_NOATIME (7) /* don't change access time */
+#define SDF_ROFS (8) /* read-only mode (no journal) */
-#define SDF_JOURNAL_LIVE (0)
-#define SDF_SCAND_RUN (1)
-#define SDF_GLOCKD_RUN (2)
-#define SDF_RECOVERD_RUN (3)
-#define SDF_LOGD_RUN (4)
-#define SDF_QUOTAD_RUN (5)
-#define SDF_INODED_RUN (6)
-#define SDF_NOATIME (7)
-#define SDF_ROFS (8)
+/* journal log dump support */
#define SDF_NEED_LOG_DUMP (9)
#define SDF_FOUND_UL_DUMP (10)
#define SDF_FOUND_Q_DUMP (11)
-#define SDF_IN_LOG_DUMP (12)
+#define SDF_IN_LOG_DUMP (12) /* serializes log dumps */
+
-#define GFS_GL_HASH_SHIFT (13)
+/* constants for various in-core caches */
+
+/* glock cache */
+#define GFS_GL_HASH_SHIFT (13) /* # hash buckets = 8K */
#define GFS_GL_HASH_SIZE (1 << GFS_GL_HASH_SHIFT)
#define GFS_GL_HASH_MASK (GFS_GL_HASH_SIZE - 1)
-#define GFS_MHC_HASH_SHIFT (10)
+/* meta header cache */
+#define GFS_MHC_HASH_SHIFT (10) /* # hash buckets = 1K */
#define GFS_MHC_HASH_SIZE (1 << GFS_MHC_HASH_SHIFT)
#define GFS_MHC_HASH_MASK (GFS_MHC_HASH_SIZE - 1)
-#define GFS_DEPEND_HASH_SHIFT (10)
+/* dependency cache */
+#define GFS_DEPEND_HASH_SHIFT (10) /* # hash buckets = 1K */
#define GFS_DEPEND_HASH_SIZE (1 << GFS_DEPEND_HASH_SHIFT)
#define GFS_DEPEND_HASH_MASK (GFS_DEPEND_HASH_SIZE - 1)
struct gfs_sbd {
- struct gfs_sb sd_sb; /* Super Block */
+ struct gfs_sb sd_sb; /* GFS on-disk Super Block image
*/
- struct super_block *sd_vfs; /* FS's device independent sb */
+ struct super_block *sd_vfs; /* Linux VFS device independent
sb */
- struct gfs_args sd_args;
- unsigned long sd_flags;
+ struct gfs_args sd_args; /* Mount arguments */
+ unsigned long sd_flags; /* SDF_... see above */
- struct gfs_tune sd_tune; /* FS tuning structure */
+ struct gfs_tune sd_tune; /* Filesystem tuning structure
*/
/* Resource group stuff */
- struct gfs_inode *sd_riinode; /* rindex inode */
- uint64_t sd_riinode_vn; /* Version number of the resource index
inode */
-
- struct list_head sd_rglist; /* List of resource groups */
- struct semaphore sd_rindex_lock;
-
- struct list_head sd_rg_mru_list; /* List of resource
groups in MRU order */
- spinlock_t sd_rg_mru_lock; /* Lock for MRU list */
- struct list_head sd_rg_recent; /* Recently used rgrps */
- spinlock_t sd_rg_recent_lock;
- struct gfs_rgrpd *sd_rg_forward; /* Next new rgrp to try
for allocation */
- spinlock_t sd_rg_forward_lock;
+ struct gfs_inode *sd_riinode; /* Resource Index (rindex) inode
*/
+ uint64_t sd_riinode_vn; /* Resource Index version #
(detects
+ whether new rgrps have been
added) */
+
+ struct list_head sd_rglist; /* List of all resource groups,
*/
+ struct semaphore sd_rindex_lock;/* on-disk order */
+ struct list_head sd_rg_mru_list;/* List of resource groups, */
+ spinlock_t sd_rg_mru_lock; /* most-recently-used (MRU)
order */
+ struct list_head sd_rg_recent; /* List of rgrps from which
blocks */
+ spinlock_t sd_rg_recent_lock; /* were recently allocated
*/
+ struct gfs_rgrpd *sd_rg_forward;/* Next rgrp from which to
attempt */
+ spinlock_t sd_rg_forward_lock; /* a block alloc */
- unsigned int sd_rgcount; /* Count of resource groups */
+ unsigned int sd_rgcount; /* Total # of resource groups */
/* Constants computed on mount */
- uint32_t sd_fsb2bb;
- uint32_t sd_fsb2bb_shift; /* Shift FS Block numbers to the
left by
- this to get buffer cache
blocks */
- uint32_t sd_diptrs; /* Number of pointers in a dinode */
- uint32_t sd_inptrs; /* Number of pointers in a indirect
block */
- uint32_t sd_jbsize; /* Size of a journaled data block */
- uint32_t sd_hash_bsize; /* sizeof(exhash block) */
+ /* "bb" == "basic block" == 512Byte sector */
+ uint32_t sd_fsb2bb; /* # 512B basic blocks in a FS
block */
+ uint32_t sd_fsb2bb_shift; /* Shift sector # to the right
by
+ this to get FileSystem block
addr */
+ uint32_t sd_diptrs; /* Max # of block pointers in a dinode
*/
+ uint32_t sd_inptrs; /* Max # of block pointers in an
indirect blk */
+ uint32_t sd_jbsize; /* Payload size (bytes) of a journaled
metadata
+ block (GFS journals all meta
blocks) */
+ uint32_t sd_hash_bsize; /* sizeof(exhash block) */
uint32_t sd_hash_bsize_shift;
- uint32_t sd_hash_ptrs; /* Number of points in a hash block */
- uint32_t sd_max_dirres; /* Maximum space needed to add a
directory entry */
- uint32_t sd_max_height; /* Maximum height of a file's metadata
tree */
+ uint32_t sd_hash_ptrs; /* Number of pointers in a hash block */
+ uint32_t sd_max_dirres; /* Max blocks needed to add a directory
entry */
+ uint32_t sd_max_height; /* Max height of a file's indir addr
tree */
uint64_t sd_heightsize[GFS_MAX_META_HEIGHT];
- uint32_t sd_max_jheight; /* Maximum height of a journaled
file's metadata tree */
+ uint32_t sd_max_jheight; /* Max hgt, journaled file's indir addr
tree */
uint64_t sd_jheightsize[GFS_MAX_META_HEIGHT];
/* Lock Stuff */
+ /* glock cache (all glocks currently held by this node for this
fs) */
struct gfs_gl_hash_bucket sd_gl_hash[GFS_GL_HASH_SIZE];
- struct list_head sd_reclaim_list;
+ /* glock reclaim support for scand and glockd */
+ struct list_head sd_reclaim_list; /* list of glocks to reclaim
*/
spinlock_t sd_reclaim_lock;
wait_queue_head_t sd_reclaim_wchan;
- atomic_t sd_reclaim_count;
+ atomic_t sd_reclaim_count; /* # glocks on reclaim list
*/
- struct lm_lockstruct sd_lockstruct;
+ /* lock module tells us if we're first-to-mount,
+ * which journal to use, etc. */
+ struct lm_lockstruct sd_lockstruct; /* info provided by lock
module */
- struct list_head sd_mhc[GFS_MHC_HASH_SIZE];
- struct list_head sd_mhc_single;
+ /* Other caches */
+
+ /* meta-header cache (incore copies of on-disk meta headers)*/
+ struct list_head sd_mhc[GFS_MHC_HASH_SIZE]; /* hash buckets */
+ struct list_head sd_mhc_single; /* non-hashed list of all
MHCs */
spinlock_t sd_mhc_lock;
- atomic_t sd_mhc_count;
+ atomic_t sd_mhc_count; /* # MHCs in cache */
- struct list_head sd_depend[GFS_DEPEND_HASH_SIZE];
+ /* dependency cache */
+ struct list_head sd_depend[GFS_DEPEND_HASH_SIZE]; /* hash
buckets */
spinlock_t sd_depend_lock;
- atomic_t sd_depend_count;
+ atomic_t sd_depend_count; /* # dependencies in cache
*/
- struct gfs_holder sd_live_gh;
+ /* LIVE inter-node lock indicates that fs is mounted on at least
+ * one node */
+ struct gfs_holder sd_live_gh; /* glock holder for LIVE
lock */
+ /* for quiescing the filesystem */
struct gfs_holder sd_freeze_gh;
struct semaphore sd_freeze_lock;
unsigned int sd_freeze_count;
/* Inode Stuff */
- struct gfs_inode *sd_rooti; /* FS's root inode */
+ struct gfs_inode *sd_rooti; /* FS's root inode */
- struct gfs_glock *sd_rename_gl; /* rename glock */
+ /* only 1 node at a time may rename (e.g. mv) a file or dir */
+ struct gfs_glock *sd_rename_gl; /* rename glock */
/* Daemon stuff */
- struct task_struct *sd_scand_process;
- unsigned int sd_glockd_num;
+ /* scan for glocks and inodes to toss from memory */
+ struct task_struct *sd_scand_process; /* scand places on reclaim
list*/
+ unsigned int sd_glockd_num; /* # of glockd procs to do
reclaiming*/
+
+ /* recover journal of a crashed node */
struct task_struct *sd_recoverd_process;
+
+ /* update log tail as AIL gets flushed to in-place on-disk
blocks */
struct task_struct *sd_logd_process;
+
+ /* sync quota updates to disk, and clean up unused quota structs
*/
struct task_struct *sd_quotad_process;
+
+ /* clean up unused inode structures */
struct task_struct *sd_inoded_process;
+ /* support for starting/stopping daemons */
struct semaphore sd_thread_lock;
struct completion sd_thread_completion;
/* Log stuff */
- struct gfs_glock *sd_trans_gl; /* transaction glock */
+ /* transaction lock protects journal replay (recovery) */
+ struct gfs_glock *sd_trans_gl; /* transaction glock structure
*/
- struct gfs_inode *sd_jiinode; /* jindex inode */
- uint64_t sd_jiinode_vn; /* Version number of the journal index
inode */
+ struct gfs_inode *sd_jiinode; /* journal index inode */
+ uint64_t sd_jiinode_vn; /* journal index version #
(detects
+ if new journals have been
added) */
unsigned int sd_journals; /* Number of journals in the FS
*/
- struct gfs_jindex *sd_jindex; /* Array of Jindex structures
describing this FS's journals */
+ struct gfs_jindex *sd_jindex; /* Array of journal descriptors
*/
struct semaphore sd_jindex_lock;
- unsigned long sd_jindex_refresh_time;
+ unsigned long sd_jindex_refresh_time; /* poll for new journals
(secs) */
- struct gfs_jindex sd_jdesc; /* Jindex structure describing
this machine's journal */
- struct gfs_holder sd_journal_gh; /* the glock for this
machine's journal */
+ struct gfs_jindex sd_jdesc; /* this machine's journal
descriptor */
+ struct gfs_holder sd_journal_gh; /* this machine's journal glock
*/
uint64_t sd_sequence; /* Assigned to xactions in order they
commit */
uint64_t sd_log_head; /* Block number of next journal write */
uint64_t sd_log_wrap;
spinlock_t sd_log_seg_lock;
- unsigned int sd_log_seg_free; /* Free segments in the log */
+ unsigned int sd_log_seg_free; /* # of free segments in the log
*/
struct list_head sd_log_seg_list;
wait_queue_head_t sd_log_seg_wait;
- struct list_head sd_log_ail; /* struct gfs_trans structures
that form the Active Items List
- "next" is the head, "prev" is
the tail */
-
- struct list_head sd_log_incore; /* transactions that have been
commited incore (but not ondisk)
- "next" is the newest, "prev"
is the oldest */
- unsigned int sd_log_buffers; /* Number of buffers in the
incore log */
+ /* "Active Items List" of transactions that have been flushed to
+ * on-disk log, and are waiting for flush to in-place on-disk
blocks */
+ struct list_head sd_log_ail; /* "next" is head, "prev" is
tail */
+
+ /* Transactions committed incore, but not yet flushed to on-disk
log */
+ struct list_head sd_log_incore; /* "next" is newest, "prev" is
oldest */
+ unsigned int sd_log_buffers; /* # of buffers in the incore
log */
struct semaphore sd_log_lock; /* Lock for access to log values
*/
@@ -674,16 +891,17 @@
/* quota crap */
- struct list_head sd_quota_list;
+ struct list_head sd_quota_list; /* list of all gfs_quota_data
structs */
spinlock_t sd_quota_lock;
- atomic_t sd_quota_count;
- atomic_t sd_quota_od_count;
+ atomic_t sd_quota_count; /* # quotas on sd_quota_list */
+ atomic_t sd_quota_od_count; /* # quotas waiting for sync to
+ special on-disk quota file */
- struct gfs_inode *sd_qinode;
+ struct gfs_inode *sd_qinode; /* special on-disk quota file */
- uint64_t sd_quota_sync_gen;
- unsigned long sd_quota_sync_time;
+ uint64_t sd_quota_sync_gen; /* generation, incr when sync to
file */
+ unsigned long sd_quota_sync_time; /* jiffies, last sync to quota
file */
/* license crap */
diff -ru cvs/cluster/gfs-kernel/src/gfs/log.c
build_092304/cluster/gfs-kernel/src/gfs/log.c
--- cvs/cluster/gfs-kernel/src/gfs/log.c 2004-07-12
15:22:44.000000000 -0400
+++ build_092304/cluster/gfs-kernel/src/gfs/log.c 2004-09-23
14:18:29.406501616 -0400
@@ -134,7 +134,8 @@
/**
* gfs_ail_start - Start I/O on the AIL
* @sdp: the filesystem
- * @flags:
+ * @flags: DIO_ALL -- flush *all* AIL transactions to disk
+ * default -- flush first-on-list AIL transaction to disk
*
*/
@@ -1207,7 +1208,7 @@
LO_CLEAN_DUMP(sdp, le);
}
- /* If there isn't anything the AIL, we won't get back the log
+ /* If there isn't anything in the AIL, we won't get back the log
space we reserved unless we do it ourselves. */
if (list_empty(&sdp->sd_log_ail)) {
diff -ru cvs/cluster/gfs-kernel/src/gfs/lops.c
build_092304/cluster/gfs-kernel/src/gfs/lops.c
--- cvs/cluster/gfs-kernel/src/gfs/lops.c 2004-06-24
04:53:28.000000000 -0400
+++ build_092304/cluster/gfs-kernel/src/gfs/lops.c 2004-09-23
14:18:41.725628824 -0400
@@ -442,6 +442,13 @@
* @blkno: the location of the log's copy of the block
*
* Returns: 0 on success, -EXXX on failure
+ *
+ * Read in-place block from disk
+ * Read log (journal) block from disk
+ * Compare generation numbers
+ * Copy log block to in-place block on-disk if:
+ * log generation # > in-place generation #
+ * OR generation #s are ==, but data contained in block is different
(corrupt)
*/
static int
diff -ru cvs/cluster/gfs-kernel/src/gfs/lvb.h
build_092304/cluster/gfs-kernel/src/gfs/lvb.h
--- cvs/cluster/gfs-kernel/src/gfs/lvb.h 2004-06-24
04:53:28.000000000 -0400
+++ build_092304/cluster/gfs-kernel/src/gfs/lvb.h 2004-09-23
14:19:09.962336192 -0400
@@ -11,26 +11,44 @@
************************************************************************
*******
************************************************************************
******/
+/*
+ * Formats of Lock Value Blocks (LVBs) for various types of locks.
+ * These 32-byte data chunks can be shared quickly between nodes
+ * via the inter-node lock manager (via LAN instead of on-disk).
+ */
+
#ifndef __LVB_DOT_H__
#define __LVB_DOT_H__
#define GFS_MIN_LVB_SIZE (32)
+/*
+ * Resource Group block allocation statistics
+ * Each resource group lock contains one of these in its LVB.
+ * Used for sharing approximate current statistics for statfs.
+ * Not used for actual block allocation.
+ */
struct gfs_rgrp_lvb {
- uint32_t rb_magic;
- uint32_t rb_free;
- uint32_t rb_useddi;
- uint32_t rb_freedi;
- uint32_t rb_usedmeta;
- uint32_t rb_freemeta;
+ uint32_t rb_magic; /* GFS_MAGIC sanity check value */
+ uint32_t rb_free; /* # free data blocks */
+ uint32_t rb_useddi; /* # used dinode blocks */
+ uint32_t rb_freedi; /* # free dinode blocks */
+ uint32_t rb_usedmeta; /* # used metadata blocks */
+ uint32_t rb_freemeta; /* # free metadata blocks */
};
+/*
+ * Quota
+ * Each quota lock contains one of these in its LVB.
+ * Keeps track of block allocation limits and current block allocation
+ * for either a cluster-wide user or a cluster-wide group.
+ */
struct gfs_quota_lvb {
- uint32_t qb_magic;
+ uint32_t qb_magic; /* GFS_MAGIC sanity check value */
uint32_t qb_pad;
- uint64_t qb_limit;
- uint64_t qb_warn;
- int64_t qb_value;
+ uint64_t qb_limit; /* hard limit of # blocks to alloc */
+ uint64_t qb_warn; /* warn user when alloc is above this #
*/
+ int64_t qb_value; /* current # blocks allocated */
};
/* Translation functions */
diff -ru cvs/cluster/gfs-kernel/src/gfs/rgrp.c
build_092304/cluster/gfs-kernel/src/gfs/rgrp.c
--- cvs/cluster/gfs-kernel/src/gfs/rgrp.c 2004-06-24
04:53:28.000000000 -0400
+++ build_092304/cluster/gfs-kernel/src/gfs/rgrp.c 2004-09-23
14:18:56.703351864 -0400
@@ -372,6 +372,7 @@
memset(count, 0, 4 * sizeof(uint32_t));
+ /* count # blocks in each of 4 possible allocation states */
for (buf = 0; buf < length; buf++) {
bits = &rgd->rd_bits[buf];
for (x = 0; x < 4; x++)
@@ -531,6 +532,7 @@
* gfs_compute_bitstructs - Compute the bitmap sizes
* @rgd: The resource group descriptor
*
+ * Calculates bitmap descriptors, one for each block that contains
bitmap data
*/
static void
@@ -538,7 +540,7 @@
{
struct gfs_sbd *sdp = rgd->rd_sbd;
struct gfs_bitmap *bits;
- uint32_t length = rgd->rd_ri.ri_length;
+ uint32_t length = rgd->rd_ri.ri_length; /* # blocks in hdr &
bitmap */
uint32_t bytes_left, bytes;
int x;
@@ -550,21 +552,25 @@
for (x = 0; x < length; x++) {
bits = &rgd->rd_bits[x];
+ /* small rgrp; bitmap stored completely in header block
*/
if (length == 1) {
bytes = bytes_left;
bits->bi_offset = sizeof(struct gfs_rgrp);
bits->bi_start = 0;
bits->bi_len = bytes;
+ /* header block */
} else if (x == 0) {
bytes = sdp->sd_sb.sb_bsize - sizeof(struct
gfs_rgrp);
bits->bi_offset = sizeof(struct gfs_rgrp);
bits->bi_start = 0;
bits->bi_len = bytes;
+ /* last block */
} else if (x + 1 == length) {
bytes = bytes_left;
bits->bi_offset = sizeof(struct
gfs_meta_header);
bits->bi_start = rgd->rd_ri.ri_bitbytes -
bytes_left;
bits->bi_len = bytes;
+ /* other blocks */
} else {
bytes = sdp->sd_sb.sb_bsize - sizeof(struct
gfs_meta_header);
bits->bi_offset = sizeof(struct
gfs_meta_header);
@@ -855,10 +861,12 @@
* @rgd: the RG data
* @al: the struct gfs_alloc structure describing the reservation
*
- * Sets the $ir_datares field in @res.
- * Sets the $ir_metares field in @res.
+ * If there's room for the requested blocks to be allocated from the
RG:
+ * Sets the $al_reserved_data field in @al.
+ * Sets the $al_reserved_meta field in @al.
+ * Sets the $al_rgd field in @al.
*
- * Returns: 1 on success, 0 on failure
+ * Returns: 1 on success (it fits), 0 on failure (it doesn't fit)
*/
static int
@@ -900,7 +908,7 @@
}
/**
- * recent_rgrp_first - get first RG from recent list
+ * recent_rgrp_first - get first RG from "recent" list
* @sdp: The GFS superblock
* @rglast: address of the rgrp used last
*
@@ -939,7 +947,7 @@
}
/**
- * recent_rgrp_next - get next RG from recent list
+ * recent_rgrp_next - get next RG from "recent" list
* @cur_rgd: current rgrp
*
* Returns: The next rgrp in the recent list
@@ -978,7 +986,7 @@
}
/**
- * recent_rgrp_remove - remove an RG from recent list
+ * recent_rgrp_remove - remove an RG from "recent" list
* @rgd: The rgrp to remove
*
*/
@@ -992,9 +1000,14 @@
}
/**
- * recent_rgrp_add - add an RG to recent list
+ * recent_rgrp_add - add an RG to tail of "recent" list
* @new_rgd: The rgrp to add
*
+ * Before adding, make sure that:
+ * 1) it's not already on the list
+ * 2) there's still room for more entries
+ * The capacity limit imposed on the "recent" list is basically a
node's "share"
+ * of rgrps within a cluster, i.e. (total # rgrps) / (# nodes
(journals))
*/
static void
More information about the Linux-cluster
mailing list