diff -Naur cvs_102504/cluster/gfs-kernel/src/gfs/dio.c build_102504/cluster/gfs-kernel/src/gfs/dio.c --- cvs_102504/cluster/gfs-kernel/src/gfs/dio.c 2004-10-06 00:44:25.000000000 -0400 +++ build_102504/cluster/gfs-kernel/src/gfs/dio.c 2004-10-25 22:32:24.165775024 -0400 @@ -646,6 +646,7 @@ lock_page(bh->b_page); + /* if there's one attached already, we're done */ if (bh2bd(bh)) { unlock_page(bh->b_page); return; @@ -721,8 +722,8 @@ if (!bd->bd_pinned++) { wait_on_buffer(bh); - /* If this buffer is in the AIL and it has already been written, - remove it from the AIL. */ + /* If this buffer is in the AIL and it has already been written + to in-place disk block, remove it from the AIL. */ spin_lock(&sdp->sd_ail_lock); if (!list_empty(&bd->bd_ail_tr_list) && !buffer_busy(bh)) { @@ -760,7 +761,25 @@ * @sdp: the filesystem the buffer belongs to * @bh: The buffer to unpin * @tr: The transaction in the AIL that contains this buffer - * + * If NULL, don't attach buffer to any AIL list + * (i.e. when moving buffer from an older to a newer transaction + * in the incore log??) + * + * Called for (meta) buffers, after they've been logged to on-disk journal. + * Make a (meta) buffer writeable to in-place location on-disk, if recursive + * pin count is 1 (i.e. no other, later transaction is modifying this buffer). + * Add buffer to AIL lists of 1) the latest transaction that's modified and + * logged (on-disk) the buffer, and of 2) the glock that protects the buffer. + * A single buffer might have been modified by more than one transaction + * since the buffer's previous write to disk (in-place location). We keep + * the buffer on only one transaction's AIL list, i.e. that of the latest + * transaction that's completed logging this buffer (no need to write it to + * in-place block multiple times for multiple transactions, only once with + * the most up-to-date data). + * A single buffer will be protected by one and only one glock. 
If buffer is + * already on a (previous) transaction's AIL, we know that we're already + * on buffer's glock's AIL. + * */ void @@ -777,6 +796,7 @@ GFS_ASSERT_GLOCK(bd->bd_pinned, bd->bd_gl,); + /* no other (later) transaction is modifying buffer; ready to write */ if (bd->bd_pinned == 1) mark_buffer_dirty(bh); @@ -784,14 +804,16 @@ gfs_unlock_buffer(bh); - /* Add the buffer to the AIL - and get rid of an old reference if there is one */ - if (tr) { spin_lock(&sdp->sd_ail_lock); + /* Buffer not attached to any earlier transaction. + Add it to glock's AIL, and this transaction's AIL (below). */ if (list_empty(&bd->bd_ail_tr_list)) list_add(&bd->bd_ail_gl_list, &bd->bd_gl->gl_ail_bufs); + + /* Was part of earlier transaction. Move from that trans' AIL + to this newer one's AIL. Buf is already on glock's AIL. */ else { list_del_init(&bd->bd_ail_tr_list); brelse(bh); diff -Naur cvs_102504/cluster/gfs-kernel/src/gfs/incore.h build_102504/cluster/gfs-kernel/src/gfs/incore.h --- cvs_102504/cluster/gfs-kernel/src/gfs/incore.h 2004-10-13 17:33:38.000000000 -0400 +++ build_102504/cluster/gfs-kernel/src/gfs/incore.h 2004-10-25 22:44:56.629383168 -0400 @@ -204,7 +204,19 @@ * One of these is attached as GFS private data to each FS block's buffer_head. * These also link into the Active Items Lists (AIL) (buffers flushed to * on-disk log, but not yet flushed to on-disk in-place locations) attached - * to transactions and glocks. + * to 1) the latest transaction to modify and log (on-disk) the buffer, + * and 2) the glock that protects the buffer's contents. + * Note that multiple transactions can modify the buffer since its most + * recent write to disk (in-place location). Each transaction must log + * the modified buffer to the on-disk journal (e.g. 3 transactions + * will cause 3 different copies of the buffer to be logged on-disk). 
+ * However, only the most up-to-date buffer content needs to be written + * to the in-place block on-disk, which is why the buffer is attached to + * only the most recent transaction's AIL list. + * If a transaction follows another transaction before the first transaction's + * log completes, the first transaction's results are copied to a "frozen" + * image of the buffer, so it can be logged properly, while the second + * transaction is modifying the "real" buffer. */ struct gfs_bufdata { struct buffer_head *bd_bh; /* We belong to this Linux buffer_head */ @@ -218,8 +230,10 @@ /* "Pin" means keep buffer in RAM, don't write to disk (yet) */ unsigned int bd_pinned; /* Recursive pin count */ - struct list_head bd_ail_tr_list; /* Link to transaction's AIL list */ - struct list_head bd_ail_gl_list; /* Link to glock's AIL list */ + + /* Links to Active Items Lists */ + struct list_head bd_ail_tr_list; /* This buf's most recent trans' AIL */ + struct list_head bd_ail_gl_list; /* This buf's glock's AIL */ }; /* @@ -596,6 +610,8 @@ /* * Log Buffer descriptor structure * One for each FS block buffer recorded in the log + * lb_bh is a "fake" buffer head that directs Linux block I/O to write the buf + * to the on-disk log location, rather than the on-disk in-place location. 
*/ struct gfs_log_buf { /* Link to one of the transaction structure's lists */ @@ -833,7 +849,9 @@ /* Log stuff */ - /* Transaction lock protects journal replay (recovery) */ + /* Transaction lock protects the following from one another: + * normal write transaction, journal replay (recovery), fs upgrade, + * fs read-only => read/write and read/write => read-only conversions */ struct gfs_glock *sd_trans_gl; /* Transaction glock structure */ struct gfs_inode *sd_jiinode; /* Journal index inode */ diff -Naur cvs_102504/cluster/gfs-kernel/src/gfs/log.c build_102504/cluster/gfs-kernel/src/gfs/log.c --- cvs_102504/cluster/gfs-kernel/src/gfs/log.c 2004-10-06 00:44:25.000000000 -0400 +++ build_102504/cluster/gfs-kernel/src/gfs/log.c 2004-10-25 22:32:24.197770160 -0400 @@ -770,13 +770,15 @@ /** * log_refund - Refund log segments to the free pool * @sdp: The GFS superblock - * @tr: The tranaction to examine + * @tr: The transaction to examine * * Look at the number of segments reserved for this transaction and the * number of segments actually needed for it. If they aren't the * same, refund the difference to the free segment pool. * - * Called with the log lock held + * De-alloc any unneeded log buffers and log buffer descriptors. + * + * Called with the log lock held. 
*/ static void @@ -793,6 +795,7 @@ num_bufs += segments + 1; num_bmem += segments + 1; + /* unreserve unneeded log segments */ if (tr->tr_seg_reserved > segments) { spin_lock(&sdp->sd_log_seg_lock); sdp->sd_log_seg_free += tr->tr_seg_reserved - segments; @@ -804,6 +807,7 @@ } else GFS_ASSERT_SBD(tr->tr_seg_reserved == segments, sdp,); + /* de-alloc unneeded log buffer descriptors */ GFS_ASSERT_SBD(tr->tr_num_free_bufs >= num_bufs, sdp,); while (tr->tr_num_free_bufs > num_bufs) { lb = list_entry(tr->tr_free_bufs.next, @@ -813,6 +817,7 @@ tr->tr_num_free_bufs--; } + /* de-alloc unneeded log buffers */ GFS_ASSERT_SBD(tr->tr_num_free_bmem >= num_bmem, sdp,); while (tr->tr_num_free_bmem > num_bmem) { bmem = tr->tr_free_bmem.next; @@ -973,7 +978,7 @@ * @new_tr: the transaction to commit * * Add the transaction @new_tr to the end of the incore commit list. - * Pull up and merge an previously commited transactions that share + * Pull up and merge any previously committed transactions that share * locks. Also pull up any rename transactions that need it. 
*/ @@ -1033,6 +1038,7 @@ LO_INCORE_COMMIT(sdp, trans, le); } + /* If we successfully combined transactions, new_tr should be empty */ if (trans != new_tr) { GFS_ASSERT_SBD(!new_tr->tr_num_free_bufs, sdp,); GFS_ASSERT_SBD(!new_tr->tr_num_free_bmem, sdp,); @@ -1040,6 +1046,9 @@ kfree(new_tr); } + /* If we successfully combined transactions, we might have some log + * segments that we reserved, and log buffers and buffer descriptors + * that we allocated, but now don't need */ log_refund(sdp, trans); list_add(&trans->tr_list, &sdp->sd_log_incore); @@ -1061,6 +1070,7 @@ unsigned int num_mblks = 0, num_eblks = 0, num_bufs = 0, num_bmem = 0; unsigned int segments; + /* calculate actual log area needed for this trans */ LO_TRANS_SIZE(sdp, tr, &num_mblks, &num_eblks, &num_bufs, &num_bmem); GFS_ASSERT_SBD(num_mblks <= tr->tr_mblks_asked && @@ -1076,12 +1086,14 @@ num_bufs += segments + 1; num_bmem += segments + 1; + /* alloc log buffer descriptors */ while (num_bufs--) { lb = gmalloc(sizeof(struct gfs_log_buf)); memset(lb, 0, sizeof(struct gfs_log_buf)); list_add(&lb->lb_list, &tr->tr_free_bufs); tr->tr_num_free_bufs++; } + /* alloc log buffers */ while (num_bmem--) { bmem = gmalloc(sdp->sd_sb.sb_bsize); list_add(bmem, &tr->tr_free_bmem); @@ -1092,6 +1104,7 @@ incore_commit(sdp, tr); + /* flush log buffers to disk if we're over the threshold */ if (sdp->sd_log_buffers > sdp->sd_tune.gt_incore_log_blocks) { gfs_log_unlock(sdp); gfs_log_flush(sdp); diff -Naur cvs_102504/cluster/gfs-kernel/src/gfs/trans.c build_102504/cluster/gfs-kernel/src/gfs/trans.c --- cvs_102504/cluster/gfs-kernel/src/gfs/trans.c 2004-06-24 04:53:28.000000000 -0400 +++ build_102504/cluster/gfs-kernel/src/gfs/trans.c 2004-10-25 22:32:24.226765752 -0400 @@ -62,13 +62,17 @@ } /** - * gfs_trans_begin_i - Perpare to start a transaction + * gfs_trans_begin_i - Prepare to start a transaction * @sdp: The GFS superblock * @meta_blocks: Reserve this many metadata blocks in the log * @extra_blocks: Number of 
non-metadata blocks to reserve * - * Allocate the struct gfs_trans struct. Do in-place and - * log reservations. + * Allocate the struct gfs_trans struct. + * Grab a shared TRANSaction lock (protects this transaction from + * overlapping with unusual fs writes, e.g. journal replay, fs upgrade, + * while allowing simultaneous transaction writes throughout cluster). + * Reserve space in the log. @meta_blocks and @extra_blocks must indicate + * the worst case (maximum) size of the transaction. * * Returns: 0 on success, -EXXX on failure */ @@ -160,6 +164,7 @@ t_gh = tr->tr_t_gh; tr->tr_t_gh = NULL; + /* if no buffers were ever added to trans, forget it */ if (list_empty(&tr->tr_elements)) { gfs_log_release(sdp, tr->tr_seg_reserved); kfree(tr); @@ -170,6 +175,7 @@ return; } + /* do trans_end log-operation for each log element */ for (head = &tr->tr_elements, tmp = head->next; tmp != head; tmp = tmp->next) {