[dm-devel] [PATCH] dm raid: fix transient device failure processing
Brassow Jonathan
jbrassow at redhat.com
Thu Jan 5 01:58:39 UTC 2017
I think this patch is not in line with upstream. (Version 1.10.0 is not yet available upstream - currently 1.9.1.)
If you are resubmitting the patch, it’d be great if you could separate the clean-up (e.g. s/dev[0]/md/) from the necessary changes.
brassow
> On Dec 22, 2016, at 2:26 PM, Heinz Mauelshagen <heinzm at redhat.com> wrote:
>
> This patch addresses the following 3 failure scenarios:
>
> 1) If a (transiently) inaccessible metadata device is being passed into the
> constructor (e.g. a device tuple '254:4 254:5'), it is processed as if
> '- -' was given. This erroneously results in a status table line containing
> '- -', which mistakenly differs from what has been passed in. As a result,
> userspace libdevmapper puts the device tuple separate from the RAID device
> thus not processing the dependencies properly.
>
> 2) False health status char 'A' instead of 'D' is emitted on the
> status info line for the meta/data device tuple in this metadata device
> failure case.
>
> 3) If the metadata device is accessible when passed into the constructor
> but the data device (partially) isn't, that leg may be set faulty by the
> raid personality on access to the (partially) unavailable leg. A restore
> attempted in a second raid device resume on such a failed leg (status char 'D')
> fails after the (partially) unavailable leg has returned.
>
> Fixes for aforementioned failure scenarios:
>
> - don't release passed in devices in the constructor thus allowing the
> status table line to e.g. contain '254:4 254:5' rather than '- -'
>
> - emit device status char 'D' rather than 'A' for the device tuple
> with the failed metadata device on the status info line
>
> - when attempting to restore faulty devices in a second resume, allow the
> device hot remove function to succeed by setting the device to not in-sync
>
> In case userspace intentionally passes '- -' into the constructor to avoid that
> device tuple (e.g. to split off a raid1 leg temporarily for later re-addition),
> the status table line will correctly show '- -' and the status info line will
> provide a '-' device health character for the non-defined device tuple.
>
> Cleanups:
>
> - use rs->md.dev_sectors throughout instead of rs->dev[0].rdev.sectors to be
> prepared for userspace ever passing in '- -' for the first tuple
>
> - avoid sync_page_io() in favour of read_disk_sb()
>
> - consistently use mddev-> instead of rdev->mddev on bitmap setup
>
>
> This patch is on top of
> "[PATCH] dm raid: change raid4/5/6 journal device status health char to 'A' rather than 'J'"
> dated 12/13/2016.
>
> Resolves: rhbz1404425
>
> Signed-off-by: Heinz Mauelshagen <heinzm at redhat.com>
> ---
> Documentation/device-mapper/dm-raid.txt | 4 ++
> drivers/md/dm-raid.c | 121 +++++++++++++++-----------------
> 2 files changed, 60 insertions(+), 65 deletions(-)
>
> diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt
> index 8e6d910..f0f43a7 100644
> --- a/Documentation/device-mapper/dm-raid.txt
> +++ b/Documentation/device-mapper/dm-raid.txt
> @@ -327,3 +327,7 @@ Version History
> and set size reduction.
> 1.9.1 Fix activation of existing RAID 4/10 mapped devices
> 1.10.0 Add support for raid4/5/6 journal device
> +1.10.1 Don't emit '- -' on the status table line in case the constructor
> + fails reading a superblock. Correctly emit 'maj:min1 maj:min2' and
> + 'D' on the status line. If '- -' is passed into the constructor, emit
> + '- -' on the table line and '-' as the status line health character.
> diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
> index 25bb5ab..f29abb2 100644
> --- a/drivers/md/dm-raid.c
> +++ b/drivers/md/dm-raid.c
> @@ -388,7 +388,7 @@ static bool rs_is_reshapable(struct raid_set *rs)
> /* Return true, if raid set in @rs is recovering */
> static bool rs_is_recovering(struct raid_set *rs)
> {
> - return rs->md.recovery_cp < rs->dev[0].rdev.sectors;
> + return rs->md.recovery_cp < rs->md.dev_sectors;
> }
>
> /* Return true, if raid set in @rs is reshaping */
> @@ -1576,9 +1576,9 @@ static void rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors)
> else if (dev_sectors == MaxSector)
> /* Prevent recovery */
> __rs_setup_recovery(rs, MaxSector);
> - else if (rs->dev[0].rdev.sectors < dev_sectors)
> + else if (rs->md.dev_sectors < dev_sectors)
> /* Grown raid set */
> - __rs_setup_recovery(rs, rs->dev[0].rdev.sectors);
> + __rs_setup_recovery(rs, rs->md.dev_sectors);
> else
> __rs_setup_recovery(rs, MaxSector);
> }
> @@ -1917,18 +1917,21 @@ static int rs_check_reshape(struct raid_set *rs)
> return -EPERM;
> }
>
> -static int read_disk_sb(struct md_rdev *rdev, int size)
> +static int read_disk_sb(struct md_rdev *rdev, int size, bool force_reload)
> {
> BUG_ON(!rdev->sb_page);
>
> - if (rdev->sb_loaded)
> + if (rdev->sb_loaded && !force_reload)
> return 0;
>
> + rdev->sb_loaded = 0;
> +
> if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, 0, true)) {
> DMERR("Failed to read superblock of device at position %d",
> rdev->raid_disk);
> md_error(rdev->mddev, rdev);
> - return -EINVAL;
> + set_bit(Faulty, &rdev->flags);
> + return -EIO;
> }
>
> rdev->sb_loaded = 1;
> @@ -2056,7 +2059,7 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
> return -EINVAL;
> }
>
> - r = read_disk_sb(rdev, rdev->sb_size);
> + r = read_disk_sb(rdev, rdev->sb_size, false);
> if (r)
> return r;
>
> @@ -2323,7 +2326,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
> struct mddev *mddev = &rs->md;
> struct dm_raid_superblock *sb;
>
> - if (rs_is_raid0(rs) || !rdev->sb_page)
> + if (rs_is_raid0(rs) || !rdev->sb_page || rdev->raid_disk < 0)
> return 0;
>
> sb = page_address(rdev->sb_page);
> @@ -2348,7 +2351,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
>
> /* Enable bitmap creation for RAID levels != 0 */
> mddev->bitmap_info.offset = rt_is_raid0(rs->raid_type) ? 0 : to_sector(4096);
> - rdev->mddev->bitmap_info.default_offset = mddev->bitmap_info.offset;
> + mddev->bitmap_info.default_offset = mddev->bitmap_info.offset;
>
> if (!test_and_clear_bit(FirstUse, &rdev->flags)) {
> /* Retrieve device size stored in superblock to be prepared for shrink */
> @@ -2386,12 +2389,11 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
> static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
> {
> int r;
> - struct raid_dev *dev;
> - struct md_rdev *rdev, *tmp, *freshest;
> + struct md_rdev *rdev, *freshest;
> struct mddev *mddev = &rs->md;
>
> freshest = NULL;
> - rdev_for_each_safe(rdev, tmp, mddev) {
> + rdev_for_each(rdev, mddev) {
> if (test_bit(Journal, &rdev->flags))
> continue;
>
> @@ -2401,9 +2403,8 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
> * though it were new. This is the intended effect
> * of the "sync" directive.
> *
> - * When reshaping capability is added, we must ensure
> - * that the "sync" directive is disallowed during the
> - * reshape.
> + * With reshaping capability added, we must ensure that
> + * that the "sync" directive is disallowed during the reshape.
> */
> if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags))
> continue;
> @@ -2420,6 +2421,7 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
> case 0:
> break;
> default:
> + /* This is a failure to read the superblock off the metadata device. */
> /*
> * We have to keep any raid0 data/metadata device pairs or
> * the MD raid0 personality will fail to start the array.
> @@ -2427,33 +2429,17 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
> if (rs_is_raid0(rs))
> continue;
>
> - dev = container_of(rdev, struct raid_dev, rdev);
> - if (dev->meta_dev)
> - dm_put_device(ti, dev->meta_dev);
> -
> - dev->meta_dev = NULL;
> - rdev->meta_bdev = NULL;
> -
> - if (rdev->sb_page)
> - put_page(rdev->sb_page);
> -
> - rdev->sb_page = NULL;
> -
> - rdev->sb_loaded = 0;
> -
> /*
> - * We might be able to salvage the data device
> - * even though the meta device has failed. For
> - * now, we behave as though '- -' had been
> - * set for this device in the table.
> + * We keep the dm_devs to be able to emit the device tuple
> + * properly on the table line in raid_status() (rather than
> + * mistakenly acting as if '- -' got passed into the constructor).
> + *
> + * The rdev has to stay on the same_set list to allow for
> + * the attempt to restore faulty devices on second resume.
> */
> - if (dev->data_dev)
> - dm_put_device(ti, dev->data_dev);
> -
> - dev->data_dev = NULL;
> - rdev->bdev = NULL;
> -
> - list_del(&rdev->same_set);
> + set_bit(Faulty, &rdev->flags);
> + rdev->raid_disk = rdev->saved_raid_disk = -1;
> + break;
> }
> }
>
> @@ -2924,7 +2910,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
> if (r)
> goto bad;
>
> - calculated_dev_sectors = rs->dev[0].rdev.sectors;
> + calculated_dev_sectors = rs->md.dev_sectors;
>
> /*
> * Backup any new raid set level, layout, ...
> @@ -2937,7 +2923,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
> if (r)
> goto bad;
>
> - resize = calculated_dev_sectors != rs->dev[0].rdev.sectors;
> + resize = calculated_dev_sectors != rs->md.dev_sectors;
>
> INIT_WORK(&rs->md.event_work, do_table_event);
> ti->private = rs;
> @@ -3014,7 +3000,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
> * is an instant operation as oposed to an ongoing reshape.
> */
>
> - /* We can't reshape a jorunaled radi4/5/6 */
> + /* We can't reshape a journaled radi4/5/6 */
> if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) {
> ti->error = "Can't reshape a journaled raid4/5/6 set";
> r = -EPERM;
> @@ -3166,17 +3152,20 @@ static const char *decipher_sync_action(struct mddev *mddev)
> }
>
> /*
> - * Return status string @rdev
> + * Return status string for @rdev
> *
> * Status characters:
> *
> * 'D' = Dead/Failed raid set component or raid4/5/6 journal device
> * 'a' = Alive but not in-sync
> * 'A' = Alive and in-sync raid set component or alive raid4/5/6 journal device
> + * '-' = Non-existing device (i.e. uspace passed '- -' into the ctr)
> */
> static const char *__raid_dev_status(struct md_rdev *rdev, bool array_in_sync)
> {
> - if (test_bit(Faulty, &rdev->flags))
> + if (!rdev->bdev)
> + return "-";
> + else if (test_bit(Faulty, &rdev->flags))
> return "D";
> else if (test_bit(Journal, &rdev->flags))
> return "A";
> @@ -3281,7 +3270,6 @@ static void raid_status(struct dm_target *ti, status_type_t type,
> sector_t progress, resync_max_sectors, resync_mismatches;
> const char *sync_action;
> struct raid_type *rt;
> - struct md_rdev *rdev;
>
> switch (type) {
> case STATUSTYPE_INFO:
> @@ -3302,10 +3290,9 @@ static void raid_status(struct dm_target *ti, status_type_t type,
> atomic64_read(&mddev->resync_mismatches) : 0;
> sync_action = decipher_sync_action(&rs->md);
>
> - /* HM FIXME: do we want another state char for raid0? It shows 'D' or 'A' now */
> - rdev_for_each(rdev, mddev)
> - if (!test_bit(Journal, &rdev->flags))
> - DMEMIT(__raid_dev_status(rdev, array_in_sync));
> + /* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */
> + for (i = 0; i < rs->raid_disks; i++)
> + DMEMIT(__raid_dev_status(&rs->dev[i].rdev, array_in_sync));
>
> /*
> * In-sync/Reshape ratio:
> @@ -3536,11 +3523,12 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
>
> memset(cleared_failed_devices, 0, sizeof(cleared_failed_devices));
>
> - for (i = 0; i < rs->md.raid_disks; i++) {
> + for (i = 0; i < mddev->raid_disks; i++) {
> r = &rs->dev[i].rdev;
> - if (test_bit(Faulty, &r->flags) && r->sb_page &&
> - sync_page_io(r, 0, r->sb_size, r->sb_page,
> - REQ_OP_READ, 0, true)) {
> + if (test_bit(Journal, &r->flags))
> + continue;
> +
> + if (test_bit(Faulty, &r->flags) && !read_disk_sb(r, r->sb_size, true)) {
> DMINFO("Faulty %s device #%d has readable super block."
> " Attempting to revive it.",
> rs->raid_type->name, i);
> @@ -3554,22 +3542,25 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
> * '>= 0' - meaning we must call this function
> * ourselves.
> */
> - if ((r->raid_disk >= 0) &&
> - (mddev->pers->hot_remove_disk(mddev, r) != 0))
> - /* Failed to revive this device, try next */
> - continue;
> -
> - r->raid_disk = i;
> - r->saved_raid_disk = i;
> flags = r->flags;
> + clear_bit(In_sync, &r->flags); /* Mandatory for hot remove. */
> + if (r->raid_disk >= 0) {
> + if (mddev->pers->hot_remove_disk(mddev, r)) {
> + /* Failed to revive this device, try next */
> + r->flags = flags;
> + continue;
> + }
> + } else
> + r->raid_disk = r->saved_raid_disk = i;
> +
> clear_bit(Faulty, &r->flags);
> clear_bit(WriteErrorSeen, &r->flags);
> - clear_bit(In_sync, &r->flags);
> +
> if (mddev->pers->hot_add_disk(mddev, r)) {
> - r->raid_disk = -1;
> - r->saved_raid_disk = -1;
> + r->raid_disk = r->saved_raid_disk = -1;
> r->flags = flags;
> } else {
> + clear_bit(In_sync, &r->flags);
> r->recovery_offset = 0;
> set_bit(i, (void *) cleared_failed_devices);
> cleared = true;
> @@ -3769,7 +3760,7 @@ static void raid_resume(struct dm_target *ti)
>
> static struct target_type raid_target = {
> .name = "raid",
> - .version = {1, 10, 0},
> + .version = {1, 10, 1},
> .module = THIS_MODULE,
> .ctr = raid_ctr,
> .dtr = raid_dtr,
> --
> 2.9.3
>
> --
> dm-devel mailing list
> dm-devel at redhat.com
> https://www.redhat.com/mailman/listinfo/dm-devel
More information about the dm-devel
mailing list