[Cluster-devel] GFS2 filesystem corruption when mkfs block size (4k) does not match blockdevice blocksize (512 bytes)

Wed Mar 20 15:54:27 UTC 2019

Hello,

The patch below to xfstests reliably reproduces the filesystem corruption on a 4.19 kernel.

In our lab we've found that if multiple iSCSI connection errors are
detected (without completely loosing the iSCSI connection) then the GFS2
filesystem becomes corrupt.
The connection errors occured very likely due to network congestion (the iSCSI target is running on a 10Gbps network,
but the test VM had only a 1Gbps network connection to it). Nevertheless networking issues shouldn't cause corruption.

Even after rebooting the host and remounting the filesystem has immediately withdrawn itself
when stating files. Sometimes fsck.gfs2 is able to fix it, but not always.

Tim Smith has found that using matching blocksizes for GFS2 and the underlying blockdevice avoids the problem
(running the testcase below for 30m in a loop does not reproduce the issue anymore).
This discovery was made after careful reading of the fsck.gfs2 manpage, which states:
"The first step to ensuring a healthy file system is the selec‐
tion of reliable hardware (i.e. storage systems that will write complete blocks - even in the event of power failure)."

By default GFS2 uses a blocksize of 4k, and typically the devices in our lab have a block size of 512 bytes, therefore
there are no atomicity guarantees on a 4k write: you may indeed get an incomplete write of 512, 1024, etc. bytes

We'd like your opinion on two questions:
1. We believe that mkfs.gfs2 in gfs2-utils should be updated to query the underlying blockdevice for its blocksize and use that by default instead of 4k,
and warn on a mismatch if you manually specify a blocksize. Also update the manpage of mkfs.gfs2 to state this.
Is this the right approach, i.e. would such a patch be acceptable upstream if we prepare it?

2. Can the code be made more robust to cope with the incomplete writes?
(e.g. can it throw away the bad journal entries, or replay the journal in such a way that it treats the underlying blockdevice's blocksize as the unit of atomicity
instead of the gfs2 block size?)
We're aware of your hard work on withdraw corruption https://www.redhat.com/archives/cluster-devel/2019-February/msg00077.html, would these patches help with this issue?

Ideally we'd like to fix both issues (use matching blocksizes for newly created filesystems, and fix the possibility of corruption for the benefit of those whom already
created their GFS2 filesystem with the default blocksize).


The initial withdraw looks like this:
Mar 20 15:10:29 localhost kernel: [13654.281665] Buffer I/O error on dev dm-4, logical block 431236, lost async page write
Mar 20 15:10:29 localhost kernel: [13654.281795] gfs2: fsid=dm-4.0: fatal: invalid metadata block
Mar 20 15:10:29 localhost kernel: [13654.281795]   bh = 431236 (magic number)
Mar 20 15:10:29 localhost kernel: [13654.281795]   function = gfs2_meta_indirect_buffer, file = fs/gfs2/meta_io.c, line = 419
Mar 20 15:10:29 localhost kernel: [13654.281834] gfs2: fsid=dm-4.0: about to withdraw this file system
Mar 20 15:10:29 localhost kernel: [13654.281884] gfs2: fsid=dm-4.0: withdrawn
Mar 20 15:10:29 localhost kernel: [13654.281894] CPU: 6 PID: 20880 Comm: xfs_io Tainted: G           O      4.19.0+1 #1
Mar 20 15:10:29 localhost kernel: [13654.281896] Hardware name: Dell Inc. PowerEdge R415/0GXH08, BIOS 1.2.5 03/16/2011
Mar 20 15:10:29 localhost kernel: [13654.281897] Call Trace:
Mar 20 15:10:29 localhost kernel: [13654.281916]  dump_stack+0x5a/0x73
Mar 20 15:10:29 localhost kernel: [13654.281941]  gfs2_lm_withdraw+0x125/0x1a0 [gfs2]
Mar 20 15:10:29 localhost kernel: [13654.281956]  ? gfs2_meta_read+0x22d/0x2d0 [gfs2]
Mar 20 15:10:29 localhost kernel: [13654.281969]  gfs2_meta_check_ii+0x2b/0x40 [gfs2]
Mar 20 15:10:29 localhost kernel: [13654.281981]  gfs2_meta_indirect_buffer+0x119/0x130 [gfs2]
Mar 20 15:10:29 localhost kernel: [13654.281993]  gfs2_dirty_inode+0x107/0x230 [gfs2]
Mar 20 15:10:29 localhost kernel: [13654.282006]  ? gfs2_dirty_inode+0x1bf/0x230 [gfs2]
Mar 20 15:10:29 localhost kernel: [13654.282012]  __mark_inode_dirty+0x17c/0x360
Mar 20 15:10:29 localhost kernel: [13654.282024]  gfs2_write_inode+0x11f/0x160 [gfs2]
Mar 20 15:10:29 localhost kernel: [13654.282028]  __writeback_singl
e_inode+0x26d/0x340
Mar 20 15:10:29 localhost kernel: [13654.282031]  writeback_single_inode+0xb7/0x130
Mar 20 15:10:29 localhost kernel: [13654.282033]  sync_inode_metadata+0x3d/0x60
Mar 20 15:10:29 localhost kernel: [13654.282045]  gfs2_fsync+0xad/0x110 [gfs2]
Mar 20 15:10:29 localhost kernel: [13654.282049]  do_fsync+0x38/0x60
Mar 20 15:10:29 localhost kernel: [13654.282053]  __x64_sys_fsync+0x10/0x20
Mar 20 15:10:29 localhost kernel: [13654.282057]  do_syscall_64+0x4e/0x100
Mar 20 15:10:29 localhost kernel: [13654.282062]  entry_SYSCALL_64_after_hwfr

After the remount:
Mar 20 15:10:30 localhost kernel: [13655.529526] gfs2: fsid=dm-4.0: first mount done, others may mount
Mar 20 15:10:30 localhost kernel: [13655.581457] gfs2: fsid=dm-4.0: fatal: invalid metadata block
Mar 20 15:10:30 localhost kernel: [13655.581457]   bh = 431236 (magic number)
Mar 20 15:10:30 localhost kernel: [13655.581457]   function = gfs2_meta_indirect_buffer, file = fs/gfs2/meta_io.c, line = 419
Mar 20 15:10:30 localhost kernel: [13655.581498] gfs2: fsid=dm-4.0: about to withdraw this file system
Mar 20 15:10:30 localhost kernel: [13655.581553] gfs2: fsid=dm-4.0: withdrawn
Mar 20 15:10:30 localhost kernel: [13655.581563] CPU: 1 PID: 21117 Comm: ls Tainted: G           O      4.19.0+1 #1
Mar 20 15:10:30 localhost kernel: [13655.581565] Hardware name: Dell Inc. PowerEdge R415/0GXH08, BIOS 1.2.5 03/16/2011
Mar 20 15:10:30 localhost kernel: [13655.581567] Call Trace:
Mar 20 15:10:30 localhost kernel: [13655.581586]  dump_stack+0x5a/0x73
Mar 20 15:10:30 localhost kernel: [13655.581611]  gfs2_lm_withdraw+0x125/0x1a0 [gfs2]
Mar 20 15:10:30 localhost kernel: [13655.581625]  ? gfs2_meta_read+0x22d/0x2d0 [gfs2]
Mar 20 15:10:30 localhost kernel: [13655.581638]  gfs2_meta_check_ii+0x2b/0x40 [gfs2]
Mar 20 15:10:30 localhost kernel: [13655.581650]  gfs2_meta_indirect_buffer+0x119/0x130 [gfs2]
Mar 20 15:10:30 localhost kernel: [13655.581661]  gfs2_inode_refresh+0x31/0x280 [gfs2]
Mar 20 15:10:30 localhost kernel: [13655.581673]  ? gfs2_inode_lookup+0x2a2/0x420 [gfs2]
Mar 20 15:10:30 localhost kernel: [13655.581683]  inode_go_lock+0x6b/0xf0 [gfs2]
Mar 20 15:10:30 localhost kernel: [13655.581693]  do_promote+0x158/0x290 [gfs2]
Mar 20 15:10:30 localhost kernel: [13655.581703]  finish_xmote+0x14f/0x350 [gfs2]
Mar 20 15:10:30 localhost kernel: [13655.581714]  do_xmote+0x1a0/0x200 [gfs2]
Mar 20 15:10:30 localhost kernel: [13655.581724]  gfs2_glock_nq+0x1ad/0x410 [gfs2]
Mar 20 15:10:30 localhost kernel: [13655.581736]  __gfs2_lookup+0x6e/0x130 [gfs2]
Mar 20 15:10:30 localhost kernel: [13655.581748]  ? __gfs2_lookup+0x66/0x130 [gfs2]
Mar 20 15:10:30 localhost kernel: [13655.581754]  __lookup_slow+0x94/0x140
Mar 20 15:10:30 localhost kernel: [13655.581758]  lookup_slow+0x35/0x50
Mar 20 15:10:30 localhost kernel: [13655.581761]  walk_component+0x15b/0x280
Mar 20 15:10:30 localhost kernel: [13655.581764]  path_lookupat+0x6d/0x200
Mar 20 15:10:30 localhost kernel: [13655.581767]  ? dput+0xa2/0xd0
Mar 20 15:10:30 localhost kernel: [13655.581773]  ? cap_inode_getsecurity+0x8e/0x210
Mar 20 15:10:30 localhost kernel: [13655.581776]  filename_lookup+0xab/0x140
Mar 20 15:10:30 localhost kernel: [13655.581781]  ? __check_object_size+0x15d/0x1b0
Mar 20 15:10:30 localhost kernel: [13655.581786]  ? strncpy_from_user+0x3e/0x150
Mar 20 15:10:30 localhost kernel: [13655.581788]  ? getname_flags+0x49/0x180
Mar 20 15:10:30 localhost kernel: [13655.581793]  vfs_statx+0x74/0xe0
Mar 20 15:10:30 localhost kernel: [13655.581797]  __do_sys_newlstat+0x39/0x70
Mar 20 15:10:30 localhost kernel: [13655.581801]  ? queued_spin_unlock+0x5/0x10
Mar 20 15:10:30 localhost kernel: [13655.581803]  ? dput+0x63/0xd0
Mar 20 15:10:30 localhost kernel: [13655.581807]  ? path_getxattr+0x6c/0xb0
Mar 20 15:10:30 localhost kernel: [13655.581812]  do_syscall_64+0x4e/0x100
Mar 20 15:10:30 localhost kernel: [13655.581817]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
Mar 20 15:10:30 localhost kernel: [13655.581821] RIP: 0033:0x7fe91ef14635


>From 407a254c7b147c377ac02d326b76456278e36cb9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= <edvin.torok at citrix.com>
Date: Wed, 20 Mar 2019 12:49:52 +0000
Subject: [PATCH] tests/generic/536: intermittent I/O errors must not corrupt a
 filesystem
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Based on tests/generic/347.
In our lab we've found that if multiple iSCSI connection errors are
detected (without completely loosing the iSCSI connection) then the GFS2
filesystem becomes corrupt.
Add a test that explicitly checks for this by simulating I/O errors
deterministically with dm-thin.

Tim Smith has spotted that the root cause of the corruption in GFS2
seems to be a difference in the default GFS2 block size (4k)
and iSCSI block size (512 bytes) which breaks the requirements stated in fsck.gfs2.

Although it is expected that a filesystem might report errors and
unmount itself when it detects I/O errors, when those I/O errors are
fixed and the filesystem is remounted it should not fail with a fatal
assertion again.

Signed-off-by: Edwin Török <edvin.torok at citrix.com>
---
 common/dmthin         |  6 ++++
 tests/generic/536     | 80 +++++++++++++++++++++++++++++++++++++++++++
 tests/generic/536.out |  2 ++
 tests/generic/group   |  1 +
 4 files changed, 89 insertions(+)
 create mode 100755 tests/generic/536
 create mode 100644 tests/generic/536.out

diff --git a/common/dmthin b/common/dmthin
index 7946e9a7..28f2ef9c 100644
--- a/common/dmthin
+++ b/common/dmthin
@@ -41,6 +41,12 @@ _dmthin_check_fs()
 	_check_scratch_fs $DMTHIN_VOL_DEV
 }
 
+_dmthin_cycle_mount()
+{
+	$UMOUNT_PROG $SCRATCH_MNT > /dev/null 2>&1
+	_dmthin_mount
+}
+
 # Set up a dm-thin device on $SCRATCH_DEV
 #
 # All arguments are optional, and in this order; defaults follows:
diff --git a/tests/generic/536 b/tests/generic/536
new file mode 100755
index 00000000..08b94a21
--- /dev/null
+++ b/tests/generic/536
@@ -0,0 +1,80 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2015 Red Hat, Inc.  All Rights Reserved.
+#
+# FS QA Test No. generic/536
+#
+# Test that intermittent IO errors during pwrite do not cause filesystem corruption
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+
+here=`pwd`
+tmp=/tmp/$$
+
+status=1	# failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+BACKING_SIZE=$((500 * 1024 * 1024 / 512))	# 500M
+VIRTUAL_SIZE=$((10 * $BACKING_SIZE))		# 5000M
+GROW_SIZE=$((100 * 1024 * 1024 / 512))		# 100M
+
+_cleanup()
+{
+	_dmthin_cleanup
+	rm -f $tmp.*
+}
+
+_setup_thin()
+{
+	_dmthin_init $BACKING_SIZE $VIRTUAL_SIZE
+	_dmthin_set_queue
+	_mkfs_dev -b 512 $DMTHIN_VOL_DEV
+	_dmthin_mount
+}
+
+_workout()
+{
+	# Overfill it by a bit
+	for I in `seq 1 500`; do
+		$XFS_IO_PROG -f -c "pwrite -W 0 1M" $SCRATCH_MNT/file$I &>/dev/null
+	done
+
+	sync
+
+	_dmthin_grow  $GROW_SIZE
+
+	# Write a little more, but don't fill
+	for I in `seq 501 510`; do
+		$XFS_IO_PROG -f -c "pwrite 0 1M" $SCRATCH_MNT/file$I &>/dev/null
+	done
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/dmthin
+
+_supported_fs generic
+_supported_os Linux
+_require_scratch_nocheck
+_require_dm_target thin-pool
+
+_setup_thin
+
+# trigger IO errors, the filesystem may be remounted RO or withdrawn, this is expected
+_workout
+
+# now remount the filesystem without triggering IO errors,
+# and check that the filesystem is not corrupt
+_dmthin_cycle_mount
+# ls --color makes ls stat each file, which finds the corruption
+ls --color=always $SCRATCH_MNT/ >/dev/null || _fail "Failed to list filesystem after remount"
+ls --color=always $SCRATCH_MNT/ >/dev/null || _fail "Failed to list filesystem after remount"
+ls --color=always $SCRATCH_MNT/ >/dev/null || _fail "Failed to list filesystem after remount"
+_dmthin_cleanup
+
+echo "=== completed"
+
+status=0
+exit
diff --git a/tests/generic/536.out b/tests/generic/536.out
new file mode 100644
index 00000000..5140d261
--- /dev/null
+++ b/tests/generic/536.out
@@ -0,0 +1,2 @@
+QA output created by 536
+=== completed
diff --git a/tests/generic/group b/tests/generic/group
index 78b9b45d..a346dfab 100644
--- a/tests/generic/group
+++ b/tests/generic/group
@@ -538,3 +538,4 @@
 533 auto quick attr
 534 auto quick log
 535 auto quick log
+536 auto quick rw thin
-- 
2.19.1