[dm-devel] [PATCH] dm raid: fix transient device failure processing
Heinz Mauelshagen
heinzm at redhat.com
Thu Jan 12 16:50:48 UTC 2017
It needs the prerequisite patches adding raid4/5/6 journal device
support, which raise the version to 1.10.0.
I will separate the minor cleanups out and resubmit.
Heinz
On 01/05/2017 02:58 AM, Brassow Jonathan wrote:
> I think this patch is not in line with upstream. (Version 1.10.0 is not yet available upstream - currently 1.9.1.)
>
> If you are resubmitting the patch, it’d be great if you could separate the clean-up (e.g. s/dev[0]/md/) from the necessary changes.
>
> brassow
>
>> On Dec 22, 2016, at 2:26 PM, Heinz Mauelshagen <heinzm at redhat.com> wrote:
>>
>> This patch addresses the following 3 failure scenarios:
>>
>> 1) If a (transiently) inaccessible metadata device is passed into the
>> constructor (e.g. a device tuple '254:4 254:5'), it is processed as if
>> '- -' had been given. This erroneously results in a status table line
>> containing '- -', which mistakenly differs from what was passed in. As a
>> result, userspace libdevmapper treats the device tuple separately from
>> the RAID device and thus fails to process the dependencies properly.
>>
>> 2) A false health status char 'A' instead of 'D' is emitted on the
>> status info line for the meta/data device tuple in this metadata device
>> failure case.
>>
>> 3) If the metadata device is accessible when passed into the constructor
>> but the data device (partially) isn't, that leg may be set faulty by the
>> raid personality on access to the (partially) unavailable leg. A restore
>> attempted in a second raid device resume on such a failed leg (status
>> char 'D') fails even after the (partially) failed leg has returned.
>>
>> Fixes for aforementioned failure scenarios:
>>
>> - don't release the passed-in devices in the constructor, thus allowing the
>> status table line to e.g. contain '254:4 254:5' rather than '- -'
>>
>> - emit device status char 'D' rather than 'A' for the device tuple
>> with the failed metadata device on the status info line
>>
>> - when attempting to restore faulty devices in a second resume, allow the
>> device hot remove function to succeed by setting the device to not in-sync
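>> The hot-remove ordering in the third fix can be sketched as a tiny
>> userspace simulation (the flag bits are hypothetical stand-ins for the
>> kernel's rdev flag enum, and remove_ok stands in for the result of
>> mddev->pers->hot_remove_disk()): In_sync must be cleared before the
>> removal attempt, and the saved flags restored if it fails.

```c
#include <assert.h>
#include <stdbool.h>

/* Hypothetical flag bits modelling the relevant rdev state. */
#define FLAG_FAULTY  (1u << 0)
#define FLAG_IN_SYNC (1u << 1)

/*
 * Sketch of the revive ordering: clear In_sync before the hot remove
 * (mandatory for it to succeed), and restore the saved flags when the
 * removal fails so the device is left exactly as it was found.
 */
static bool try_revive(unsigned int *flags, bool remove_ok)
{
	unsigned int saved = *flags;

	*flags &= ~FLAG_IN_SYNC;	/* mandatory for hot remove */
	if (!remove_ok) {
		*flags = saved;		/* failed: leave state untouched */
		return false;
	}
	*flags &= ~FLAG_FAULTY;		/* removed: clear failure state */
	return true;
}
```

>> A failed removal leaves the device in its original faulty state, so a
>> later resume can retry.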
>>
>> In case userspace intentionally passes '- -' into the constructor to avoid that
>> device tuple (e.g. to split off a raid1 leg temporarily for later re-addition),
>> the status table line will correctly show '- -' and the status info line will
>> provide a '-' device health character for the non-defined device tuple.
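>> The resulting per-device health-character rules can be sketched as a
>> small standalone C model (the struct and flag bits are hypothetical
>> stand-ins for the kernel's md_rdev; has_bdev == false models a tuple
>> given as '- -'):

```c
#include <assert.h>
#include <stdbool.h>

/* Hypothetical flag bits standing in for the kernel's rdev flag enum. */
#define FLAG_FAULTY  (1u << 0)
#define FLAG_JOURNAL (1u << 1)

/* Minimal stand-in for an rdev: a device may be absent (no bdev). */
struct fake_rdev {
	bool has_bdev;		/* false models '- -' in the table */
	unsigned int flags;	/* FLAG_* bits */
	bool in_sync;		/* component is in sync with the array */
};

/*
 * Sketch of the health-character rules described above:
 * '-' missing device, 'D' dead/failed, 'A' alive journal device or
 * alive in-sync component, 'a' alive but not (yet) in sync.
 */
static char raid_dev_status_char(const struct fake_rdev *r, bool array_in_sync)
{
	if (!r->has_bdev)
		return '-';
	if (r->flags & FLAG_FAULTY)
		return 'D';
	if (r->flags & FLAG_JOURNAL)
		return 'A';
	if (!array_in_sync || !r->in_sync)
		return 'a';
	return 'A';
}
```
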
>>
>> Cleanups:
>>
>> - use rs->md.dev_sectors throughout instead of rs->dev[0].rdev.sectors to be
>> prepared for userspace ever passing in '- -' for the first tuple
>>
>> - avoid sync_page_io() in favour of read_disk_sb()
>>
>> - consistently use mddev-> instead of rdev->mddev on bitmap setup
>>
>>
>> This patch is on top of
>> "[PATCH] dm raid: change raid4/5/6 journal device status health char to 'A' rather than 'J'"
>> dated 12/13/2016.
>>
>> Resolves: rhbz1404425
>>
>> Signed-off-by: Heinz Mauelshagen <heinzm at redhat.com>
>> ---
>> Documentation/device-mapper/dm-raid.txt | 4 ++
>> drivers/md/dm-raid.c | 121 +++++++++++++++-----------------
>> 2 files changed, 60 insertions(+), 65 deletions(-)
>>
>> diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt
>> index 8e6d910..f0f43a7 100644
>> --- a/Documentation/device-mapper/dm-raid.txt
>> +++ b/Documentation/device-mapper/dm-raid.txt
>> @@ -327,3 +327,7 @@ Version History
>> and set size reduction.
>> 1.9.1 Fix activation of existing RAID 4/10 mapped devices
>> 1.10.0 Add support for raid4/5/6 journal device
>> +1.10.1 Don't emit '- -' on the status table line in case the constructor
>> + fails reading a superblock. Correctly emit 'maj:min1 maj:min2' and
>> + 'D' on the status line. If '- -' is passed into the constructor, emit
>> + '- -' on the table line and '-' as the status line health character.
>> diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
>> index 25bb5ab..f29abb2 100644
>> --- a/drivers/md/dm-raid.c
>> +++ b/drivers/md/dm-raid.c
>> @@ -388,7 +388,7 @@ static bool rs_is_reshapable(struct raid_set *rs)
>> /* Return true, if raid set in @rs is recovering */
>> static bool rs_is_recovering(struct raid_set *rs)
>> {
>> - return rs->md.recovery_cp < rs->dev[0].rdev.sectors;
>> + return rs->md.recovery_cp < rs->md.dev_sectors;
>> }
>>
>> /* Return true, if raid set in @rs is reshaping */
>> @@ -1576,9 +1576,9 @@ static void rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors)
>> else if (dev_sectors == MaxSector)
>> /* Prevent recovery */
>> __rs_setup_recovery(rs, MaxSector);
>> - else if (rs->dev[0].rdev.sectors < dev_sectors)
>> + else if (rs->md.dev_sectors < dev_sectors)
>> /* Grown raid set */
>> - __rs_setup_recovery(rs, rs->dev[0].rdev.sectors);
>> + __rs_setup_recovery(rs, rs->md.dev_sectors);
>> else
>> __rs_setup_recovery(rs, MaxSector);
>> }
>> @@ -1917,18 +1917,21 @@ static int rs_check_reshape(struct raid_set *rs)
>> return -EPERM;
>> }
>>
>> -static int read_disk_sb(struct md_rdev *rdev, int size)
>> +static int read_disk_sb(struct md_rdev *rdev, int size, bool force_reload)
>> {
>> BUG_ON(!rdev->sb_page);
>>
>> - if (rdev->sb_loaded)
>> + if (rdev->sb_loaded && !force_reload)
>> return 0;
>>
>> + rdev->sb_loaded = 0;
>> +
>> if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, 0, true)) {
>> DMERR("Failed to read superblock of device at position %d",
>> rdev->raid_disk);
>> md_error(rdev->mddev, rdev);
>> - return -EINVAL;
>> + set_bit(Faulty, &rdev->flags);
>> + return -EIO;
>> }
>>
>> rdev->sb_loaded = 1;
>> @@ -2056,7 +2059,7 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
>> return -EINVAL;
>> }
>>
>> - r = read_disk_sb(rdev, rdev->sb_size);
>> + r = read_disk_sb(rdev, rdev->sb_size, false);
>> if (r)
>> return r;
>>
>> @@ -2323,7 +2326,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
>> struct mddev *mddev = &rs->md;
>> struct dm_raid_superblock *sb;
>>
>> - if (rs_is_raid0(rs) || !rdev->sb_page)
>> + if (rs_is_raid0(rs) || !rdev->sb_page || rdev->raid_disk < 0)
>> return 0;
>>
>> sb = page_address(rdev->sb_page);
>> @@ -2348,7 +2351,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
>>
>> /* Enable bitmap creation for RAID levels != 0 */
>> mddev->bitmap_info.offset = rt_is_raid0(rs->raid_type) ? 0 : to_sector(4096);
>> - rdev->mddev->bitmap_info.default_offset = mddev->bitmap_info.offset;
>> + mddev->bitmap_info.default_offset = mddev->bitmap_info.offset;
>>
>> if (!test_and_clear_bit(FirstUse, &rdev->flags)) {
>> /* Retrieve device size stored in superblock to be prepared for shrink */
>> @@ -2386,12 +2389,11 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
>> static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
>> {
>> int r;
>> - struct raid_dev *dev;
>> - struct md_rdev *rdev, *tmp, *freshest;
>> + struct md_rdev *rdev, *freshest;
>> struct mddev *mddev = &rs->md;
>>
>> freshest = NULL;
>> - rdev_for_each_safe(rdev, tmp, mddev) {
>> + rdev_for_each(rdev, mddev) {
>> if (test_bit(Journal, &rdev->flags))
>> continue;
>>
>> @@ -2401,9 +2403,8 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
>> * though it were new. This is the intended effect
>> * of the "sync" directive.
>> *
>> - * When reshaping capability is added, we must ensure
>> - * that the "sync" directive is disallowed during the
>> - * reshape.
>> + * With reshaping capability added, we must ensure that
>> + * the "sync" directive is disallowed during the reshape.
>> */
>> if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags))
>> continue;
>> @@ -2420,6 +2421,7 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
>> case 0:
>> break;
>> default:
>> + /* This is a failure to read the superblock off the metadata device. */
>> /*
>> * We have to keep any raid0 data/metadata device pairs or
>> * the MD raid0 personality will fail to start the array.
>> @@ -2427,33 +2429,17 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
>> if (rs_is_raid0(rs))
>> continue;
>>
>> - dev = container_of(rdev, struct raid_dev, rdev);
>> - if (dev->meta_dev)
>> - dm_put_device(ti, dev->meta_dev);
>> -
>> - dev->meta_dev = NULL;
>> - rdev->meta_bdev = NULL;
>> -
>> - if (rdev->sb_page)
>> - put_page(rdev->sb_page);
>> -
>> - rdev->sb_page = NULL;
>> -
>> - rdev->sb_loaded = 0;
>> -
>> /*
>> - * We might be able to salvage the data device
>> - * even though the meta device has failed. For
>> - * now, we behave as though '- -' had been
>> - * set for this device in the table.
>> + * We keep the dm_devs to be able to emit the device tuple
>> + * properly on the table line in raid_status() (rather than
>> + * mistakenly acting as if '- -' got passed into the constructor).
>> + *
>> + * The rdev has to stay on the same_set list to allow for
>> + * the attempt to restore faulty devices on second resume.
>> */
>> - if (dev->data_dev)
>> - dm_put_device(ti, dev->data_dev);
>> -
>> - dev->data_dev = NULL;
>> - rdev->bdev = NULL;
>> -
>> - list_del(&rdev->same_set);
>> + set_bit(Faulty, &rdev->flags);
>> + rdev->raid_disk = rdev->saved_raid_disk = -1;
>> + break;
>> }
>> }
>>
>> @@ -2924,7 +2910,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
>> if (r)
>> goto bad;
>>
>> - calculated_dev_sectors = rs->dev[0].rdev.sectors;
>> + calculated_dev_sectors = rs->md.dev_sectors;
>>
>> /*
>> * Backup any new raid set level, layout, ...
>> @@ -2937,7 +2923,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
>> if (r)
>> goto bad;
>>
>> - resize = calculated_dev_sectors != rs->dev[0].rdev.sectors;
>> + resize = calculated_dev_sectors != rs->md.dev_sectors;
>>
>> INIT_WORK(&rs->md.event_work, do_table_event);
>> ti->private = rs;
>> @@ -3014,7 +3000,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
>> * is an instant operation as oposed to an ongoing reshape.
>> */
>>
>> - /* We can't reshape a jorunaled radi4/5/6 */
>> + /* We can't reshape a journaled raid4/5/6 */
>> if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) {
>> ti->error = "Can't reshape a journaled raid4/5/6 set";
>> r = -EPERM;
>> @@ -3166,17 +3152,20 @@ static const char *decipher_sync_action(struct mddev *mddev)
>> }
>>
>> /*
>> - * Return status string @rdev
>> + * Return status string for @rdev
>> *
>> * Status characters:
>> *
>> * 'D' = Dead/Failed raid set component or raid4/5/6 journal device
>> * 'a' = Alive but not in-sync
>> * 'A' = Alive and in-sync raid set component or alive raid4/5/6 journal device
>> + * '-' = Non-existing device (i.e. userspace passed '- -' into the ctr)
>> */
>> static const char *__raid_dev_status(struct md_rdev *rdev, bool array_in_sync)
>> {
>> - if (test_bit(Faulty, &rdev->flags))
>> + if (!rdev->bdev)
>> + return "-";
>> + else if (test_bit(Faulty, &rdev->flags))
>> return "D";
>> else if (test_bit(Journal, &rdev->flags))
>> return "A";
>> @@ -3281,7 +3270,6 @@ static void raid_status(struct dm_target *ti, status_type_t type,
>> sector_t progress, resync_max_sectors, resync_mismatches;
>> const char *sync_action;
>> struct raid_type *rt;
>> - struct md_rdev *rdev;
>>
>> switch (type) {
>> case STATUSTYPE_INFO:
>> @@ -3302,10 +3290,9 @@ static void raid_status(struct dm_target *ti, status_type_t type,
>> atomic64_read(&mddev->resync_mismatches) : 0;
>> sync_action = decipher_sync_action(&rs->md);
>>
>> - /* HM FIXME: do we want another state char for raid0? It shows 'D' or 'A' now */
>> - rdev_for_each(rdev, mddev)
>> - if (!test_bit(Journal, &rdev->flags))
>> - DMEMIT(__raid_dev_status(rdev, array_in_sync));
>> + /* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */
>> + for (i = 0; i < rs->raid_disks; i++)
>> + DMEMIT(__raid_dev_status(&rs->dev[i].rdev, array_in_sync));
>>
>> /*
>> * In-sync/Reshape ratio:
>> @@ -3536,11 +3523,12 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
>>
>> memset(cleared_failed_devices, 0, sizeof(cleared_failed_devices));
>>
>> - for (i = 0; i < rs->md.raid_disks; i++) {
>> + for (i = 0; i < mddev->raid_disks; i++) {
>> r = &rs->dev[i].rdev;
>> - if (test_bit(Faulty, &r->flags) && r->sb_page &&
>> - sync_page_io(r, 0, r->sb_size, r->sb_page,
>> - REQ_OP_READ, 0, true)) {
>> + if (test_bit(Journal, &r->flags))
>> + continue;
>> +
>> + if (test_bit(Faulty, &r->flags) && !read_disk_sb(r, r->sb_size, true)) {
>> DMINFO("Faulty %s device #%d has readable super block."
>> " Attempting to revive it.",
>> rs->raid_type->name, i);
>> @@ -3554,22 +3542,25 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
>> * '>= 0' - meaning we must call this function
>> * ourselves.
>> */
>> - if ((r->raid_disk >= 0) &&
>> - (mddev->pers->hot_remove_disk(mddev, r) != 0))
>> - /* Failed to revive this device, try next */
>> - continue;
>> -
>> - r->raid_disk = i;
>> - r->saved_raid_disk = i;
>> flags = r->flags;
>> + clear_bit(In_sync, &r->flags); /* Mandatory for hot remove. */
>> + if (r->raid_disk >= 0) {
>> + if (mddev->pers->hot_remove_disk(mddev, r)) {
>> + /* Failed to revive this device, try next */
>> + r->flags = flags;
>> + continue;
>> + }
>> + } else
>> + r->raid_disk = r->saved_raid_disk = i;
>> +
>> clear_bit(Faulty, &r->flags);
>> clear_bit(WriteErrorSeen, &r->flags);
>> - clear_bit(In_sync, &r->flags);
>> +
>> if (mddev->pers->hot_add_disk(mddev, r)) {
>> - r->raid_disk = -1;
>> - r->saved_raid_disk = -1;
>> + r->raid_disk = r->saved_raid_disk = -1;
>> r->flags = flags;
>> } else {
>> + clear_bit(In_sync, &r->flags);
>> r->recovery_offset = 0;
>> set_bit(i, (void *) cleared_failed_devices);
>> cleared = true;
>> @@ -3769,7 +3760,7 @@ static void raid_resume(struct dm_target *ti)
>>
>> static struct target_type raid_target = {
>> .name = "raid",
>> - .version = {1, 10, 0},
>> + .version = {1, 10, 1},
>> .module = THIS_MODULE,
>> .ctr = raid_ctr,
>> .dtr = raid_dtr,
>> --
>> 2.9.3
>>
>> --
>> dm-devel mailing list
>> dm-devel at redhat.com
>> https://www.redhat.com/mailman/listinfo/dm-devel