[dm-devel] [PATCH 3/3] dm-zoned: properly handle backing device failure
Damien Le Moal
Damien.LeMoal at wdc.com
Wed Aug 7 09:22:53 UTC 2019
On 2019/08/06 10:06, Dmitry Fomichev wrote:
> dm-zoned is observed to lock up or livelock in case of hardware
> failure or some misconfiguration of the backing zoned device.
>
> This patch adds a new function in the target code that checks the
> status of the backing device request queue. If it goes to dying state,
> the health check code marks dm-zoned DM target read-only and begins
...target read-only and dying and beignins...
Marking the target as read-only seems useless though since *all* incoming
requests are rejected with DM_MAPIO_KILL for a dying device, including read
requests. So we can remove the set_disk_ro() call I think.
> to reject any further incoming I/O. In order to timely detect backing
> device failure, this new function is called in the request mapping
> path, at the beginning of every reclaim run and before performing any
> metadata I/O.
>
> The proper way out of this situation is to do
>
> dmsetup remove <dm-zoned target>
>
> and recreate the target when the problem with the backing device
> is resolved.
>
> Fixes: 3b1a94c88b79 ("dm zoned: drive-managed zoned block device target")
> Cc: stable at vger.kernel.org
> Signed-off-by: Dmitry Fomichev <dmitry.fomichev at wdc.com>
> ---
> drivers/md/dm-zoned-metadata.c | 51 +++++++++++++++++++++++++++-------
> drivers/md/dm-zoned-reclaim.c | 18 ++++++++++--
> drivers/md/dm-zoned-target.c | 50 +++++++++++++++++++++++++++++++--
> drivers/md/dm-zoned.h | 10 +++++++
> 4 files changed, 115 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
> index 6b7fbbd735ef..2a5bc51fd6d5 100644
> --- a/drivers/md/dm-zoned-metadata.c
> +++ b/drivers/md/dm-zoned-metadata.c
> @@ -403,15 +403,18 @@ static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd,
> sector_t block = zmd->sb[zmd->mblk_primary].block + mblk_no;
> struct bio *bio;
>
> + if (dmz_bdev_is_dying(zmd->dev))
> + return ERR_PTR(-EIO);
> +
> /* Get a new block and a BIO to read it */
> mblk = dmz_alloc_mblock(zmd, mblk_no);
> if (!mblk)
> - return NULL;
> + return ERR_PTR(-ENOMEM);
>
> bio = bio_alloc(GFP_NOIO, 1);
> if (!bio) {
> dmz_free_mblock(zmd, mblk);
> - return NULL;
> + return ERR_PTR(-ENOMEM);
> }
>
> spin_lock(&zmd->mblk_lock);
> @@ -542,8 +545,8 @@ static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd,
> if (!mblk) {
> /* Cache miss: read the block from disk */
> mblk = dmz_get_mblock_slow(zmd, mblk_no);
> - if (!mblk)
> - return ERR_PTR(-ENOMEM);
> + if (IS_ERR(mblk))
> + return mblk;
> }
>
> /* Wait for on-going read I/O and check for error */
> @@ -571,16 +574,19 @@ static void dmz_dirty_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk)
> /*
> * Issue a metadata block write BIO.
> */
> -static void dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
> - unsigned int set)
> +static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
> + unsigned int set)
> {
> sector_t block = zmd->sb[set].block + mblk->no;
> struct bio *bio;
>
> + if (dmz_bdev_is_dying(zmd->dev))
> + return -EIO;
> +
> bio = bio_alloc(GFP_NOIO, 1);
> if (!bio) {
> set_bit(DMZ_META_ERROR, &mblk->state);
> - return;
> + return -ENOMEM;
> }
>
> set_bit(DMZ_META_WRITING, &mblk->state);
> @@ -592,6 +598,8 @@ static void dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
> bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META | REQ_PRIO);
> bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0);
> submit_bio(bio);
> +
> + return 0;
> }
>
> /*
> @@ -603,6 +611,9 @@ static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block,
> struct bio *bio;
> int ret;
>
> + if (dmz_bdev_is_dying(zmd->dev))
> + return -EIO;
> +
> bio = bio_alloc(GFP_NOIO, 1);
> if (!bio)
> return -ENOMEM;
> @@ -660,22 +671,29 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd,
> {
> struct dmz_mblock *mblk;
> struct blk_plug plug;
> - int ret = 0;
> + int ret = 0, nr_mblks_submitted = 0;
>
> /* Issue writes */
> blk_start_plug(&plug);
> - list_for_each_entry(mblk, write_list, link)
> - dmz_write_mblock(zmd, mblk, set);
> + list_for_each_entry(mblk, write_list, link) {
> + ret = dmz_write_mblock(zmd, mblk, set);
> + if (ret)
> + break;
> + nr_mblks_submitted++;
> + }
> blk_finish_plug(&plug);
>
> /* Wait for completion */
> list_for_each_entry(mblk, write_list, link) {
> + if (!nr_mblks_submitted)
> + break;
> wait_on_bit_io(&mblk->state, DMZ_META_WRITING,
> TASK_UNINTERRUPTIBLE);
> if (test_bit(DMZ_META_ERROR, &mblk->state)) {
> clear_bit(DMZ_META_ERROR, &mblk->state);
> ret = -EIO;
> }
> + nr_mblks_submitted--;
> }
>
> /* Flush drive cache (this will also sync data) */
> @@ -737,6 +755,11 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
> */
> dmz_lock_flush(zmd);
>
> + if (dmz_bdev_is_dying(zmd->dev)) {
> + ret = -EIO;
> + goto out;
> + }
> +
> /* Get dirty blocks */
> spin_lock(&zmd->mblk_lock);
> list_splice_init(&zmd->mblk_dirty_list, &write_list);
> @@ -1632,6 +1655,10 @@ struct dm_zone *dmz_get_chunk_mapping(struct dmz_metadata *zmd, unsigned int chu
> /* Allocate a random zone */
> dzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND);
> if (!dzone) {
> + if (dmz_bdev_is_dying(zmd->dev)) {
> + dzone = ERR_PTR(-EIO);
> + goto out;
> + }
> dmz_wait_for_free_zones(zmd);
> goto again;
> }
> @@ -1729,6 +1756,10 @@ struct dm_zone *dmz_get_chunk_buffer(struct dmz_metadata *zmd,
> /* Allocate a random zone */
> bzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND);
> if (!bzone) {
> + if (dmz_bdev_is_dying(zmd->dev)) {
> + bzone = ERR_PTR(-EIO);
> + goto out;
> + }
> dmz_wait_for_free_zones(zmd);
> goto again;
> }
> diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c
> index 26e34493a2db..d240d7ca8a8a 100644
> --- a/drivers/md/dm-zoned-reclaim.c
> +++ b/drivers/md/dm-zoned-reclaim.c
> @@ -38,7 +38,7 @@ enum {
> /*
> * Number of seconds of target BIO inactivity to consider the target idle.
> */
> -#define DMZ_IDLE_PERIOD (10UL * HZ)
> +#define DMZ_IDLE_PERIOD (10UL * HZ)
>
> /*
> * Percentage of unmapped (free) random zones below which reclaim starts
> @@ -135,6 +135,9 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc,
> set_bit(DM_KCOPYD_WRITE_SEQ, &flags);
>
> while (block < end_block) {
> + if (dev->flags & DMZ_BDEV_DYING)
> + return -EIO;
> +
> /* Get a valid region from the source zone */
> ret = dmz_first_valid_block(zmd, src_zone, &block);
> if (ret <= 0)
> @@ -452,6 +455,9 @@ static void dmz_reclaim_work(struct work_struct *work)
> unsigned int p_unmap_rnd;
> int ret;
>
> + if (dmz_bdev_is_dying(zrc->dev))
> + return;
> +
> if (!dmz_should_reclaim(zrc)) {
> mod_delayed_work(zrc->wq, &zrc->work, DMZ_IDLE_PERIOD);
> return;
> @@ -481,8 +487,16 @@ static void dmz_reclaim_work(struct work_struct *work)
> p_unmap_rnd, nr_unmap_rnd, nr_rnd);
>
> ret = dmz_do_reclaim(zrc);
> - if (ret)
> + if (ret) {
> dmz_dev_debug(zrc->dev, "Reclaim error %d\n", ret);
> + if (ret == -EIO)
> + /*
> + * LLD might be performing some error handling sequence
> + * at the underlying device. To not interfere, do not
> + * attempt to schedule the next reclaim run immediately.
> + */
> + return;
> + }
>
> dmz_schedule_reclaim(zrc);
> }
> diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
> index c1992034c099..c40c18f7fd64 100644
> --- a/drivers/md/dm-zoned-target.c
> +++ b/drivers/md/dm-zoned-target.c
> @@ -134,6 +134,8 @@ static int dmz_submit_bio(struct dmz_target *dmz, struct dm_zone *zone,
>
> refcount_inc(&bioctx->ref);
> generic_make_request(clone);
> + if (clone->bi_status == BLK_STS_IOERR)
> + return -EIO;
>
> if (bio_op(bio) == REQ_OP_WRITE && dmz_is_seq(zone))
> zone->wp_block += nr_blocks;
> @@ -278,8 +280,8 @@ static int dmz_handle_buffered_write(struct dmz_target *dmz,
>
> /* Get the buffer zone. One will be allocated if needed */
> bzone = dmz_get_chunk_buffer(zmd, zone);
> - if (!bzone)
> - return -ENOSPC;
> + if (IS_ERR(bzone))
> + return PTR_ERR(bzone);
>
> if (dmz_is_readonly(bzone))
> return -EROFS;
> @@ -390,6 +392,11 @@ static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw,
>
> dmz_lock_metadata(zmd);
>
> + if (dmz->dev->flags & DMZ_BDEV_DYING) {
> + ret = -EIO;
> + goto out;
> + }
> +
> /*
> * Get the data zone mapping the chunk. There may be no
> * mapping for read and discard. If a mapping is obtained,
> @@ -494,6 +501,8 @@ static void dmz_flush_work(struct work_struct *work)
>
> /* Flush dirty metadata blocks */
> ret = dmz_flush_metadata(dmz->metadata);
> + if (ret)
> + dmz_dev_debug(dmz->dev, "Metadata flush failed, rc=%d\n", ret);
>
> /* Process queued flush requests */
> while (1) {
> @@ -557,6 +566,37 @@ static int dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
> return ret;
> }
>
> +/*
> + * Check the backing device availability. If it's on the way out,
> + * start failing I/O. Reclaim and metadata components also call this
> + * function to cleanly abort operation in the event of such failure.
> + */
> +bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev)
> +{
> + struct gendisk *disk;
> + bool dying = dmz_dev->flags & DMZ_BDEV_DYING;
> +
> + if (!dying) {
> + disk = dmz_dev->bdev->bd_disk;
> + if (blk_queue_dying(bdev_get_queue(dmz_dev->bdev))) {
> + dmz_dev_warn(dmz_dev, "Backing device queue dying");
> + dying = true;
> + } else if (disk->fops->check_events) {
> + if (disk->fops->check_events(disk, 0) &
> + DISK_EVENT_MEDIA_CHANGE) {
> + dmz_dev_warn(dmz_dev, "Backing device offline");
> + dying = true;
> + }
> + }
> + if (dying) {
> + dmz_dev->flags |= DMZ_BDEV_DYING;
> + set_disk_ro(dmz_dev->bdev->bd_disk, 1);
> + }
> + }
> +
> + return dying;
> +}
> +
> /*
> * Process a new BIO.
> */
> @@ -570,6 +610,9 @@ static int dmz_map(struct dm_target *ti, struct bio *bio)
> sector_t chunk_sector;
> int ret;
>
> + if (dmz_bdev_is_dying(dmz->dev))
> + return DM_MAPIO_KILL;
> +
> dmz_dev_debug(dev, "BIO op %d sector %llu + %u => chunk %llu, block %llu, %u blocks",
> bio_op(bio), (unsigned long long)sector, nr_sectors,
> (unsigned long long)dmz_bio_chunk(dmz->dev, bio),
> @@ -866,6 +909,9 @@ static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
> {
> struct dmz_target *dmz = ti->private;
>
> + if (dmz_bdev_is_dying(dmz->dev))
> + return -ENODEV;
> +
> *bdev = dmz->dev->bdev;
>
> return 0;
> diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h
> index 1a3b06de2c00..d8e70b0ade35 100644
> --- a/drivers/md/dm-zoned.h
> +++ b/drivers/md/dm-zoned.h
> @@ -57,6 +57,8 @@ struct dmz_dev {
>
> unsigned int nr_zones;
>
> + unsigned int flags;
> +
> sector_t zone_nr_sectors;
> unsigned int zone_nr_sectors_shift;
>
> @@ -68,6 +70,9 @@ struct dmz_dev {
> (dev)->zone_nr_sectors_shift)
> #define dmz_chunk_block(dev, b) ((b) & ((dev)->zone_nr_blocks - 1))
>
> +/* Device flags. */
> +#define DMZ_BDEV_DYING (1 << 0)
> +
> /*
> * Zone descriptor.
> */
> @@ -246,4 +251,9 @@ void dmz_resume_reclaim(struct dmz_reclaim *zrc);
> void dmz_reclaim_bio_acc(struct dmz_reclaim *zrc);
> void dmz_schedule_reclaim(struct dmz_reclaim *zrc);
>
> +/*
> + * Functions defined in dm-zoned-target.c
> + */
> +bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev);
> +
> #endif /* DM_ZONED_H */
>
--
Damien Le Moal
Western Digital Research
More information about the dm-devel
mailing list