[dm-devel] [PATCH 3/3] dm-zoned: properly handle backing device failure

Damien Le Moal Damien.LeMoal at wdc.com
Wed Aug 7 09:22:53 UTC 2019


On 2019/08/06 10:06, Dmitry Fomichev wrote:
> dm-zoned is observed to lock up or livelock in case of hardware
> failure or some misconfiguration of the backing zoned device.
> 
> This patch adds a new function in the target code that checks the
> status of the backing device request queue. If it goes to dying state,
> the health check code marks dm-zoned DM target read-only and begins

...target read-only and dying and begins...

Marking the target as read-only seems useless though, since *all* incoming
requests, including reads, are rejected with DM_MAPIO_KILL once the device
is dying. So I think we can remove the set_disk_ro() call.
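
Something like this maybe (untested sketch, with set_disk_ro() dropped):

bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev)
{
	struct gendisk *disk = dmz_dev->bdev->bd_disk;

	/* Once dying, stay dying: no need to re-check the queue */
	if (dmz_dev->flags & DMZ_BDEV_DYING)
		return true;

	if (blk_queue_dying(bdev_get_queue(dmz_dev->bdev))) {
		dmz_dev_warn(dmz_dev, "Backing device queue dying");
		dmz_dev->flags |= DMZ_BDEV_DYING;
	} else if (disk->fops->check_events &&
		   (disk->fops->check_events(disk, 0) &
		    DISK_EVENT_MEDIA_CHANGE)) {
		dmz_dev_warn(dmz_dev, "Backing device offline");
		dmz_dev->flags |= DMZ_BDEV_DYING;
	}

	return dmz_dev->flags & DMZ_BDEV_DYING;
}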

> to reject any further incoming I/O. In order to timely detect backing
> device failure, this new function is called in the request mapping
> path, at the beginning of every reclaim run and before performing any
> metadata I/O.
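
FWIW, an easy way to exercise these checks on a SCSI-attached backing drive
is to delete the device from under the target (sdX illustrative):

  echo 1 > /sys/block/sdX/device/delete

which transitions the backing request queue to the dying state.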
> 
> The proper way out of this situation is to do
> 
> dmsetup remove <dm-zoned target>
> 
> and recreate the target when the problem with the backing device
> is resolved.
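
For reference, recreating the target once the drive is healthy again is the
usual create sequence from Documentation/device-mapper/dm-zoned.txt (device
name illustrative):

  echo "0 `blockdev --getsz /dev/sdX` zoned /dev/sdX" | dmsetup create dmz-sdX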
> 
> Fixes: 3b1a94c88b79 ("dm zoned: drive-managed zoned block device target")
> Cc: stable at vger.kernel.org
> Signed-off-by: Dmitry Fomichev <dmitry.fomichev at wdc.com>
> ---
>  drivers/md/dm-zoned-metadata.c | 51 +++++++++++++++++++++++++++-------
>  drivers/md/dm-zoned-reclaim.c  | 18 ++++++++++--
>  drivers/md/dm-zoned-target.c   | 50 +++++++++++++++++++++++++++++++--
>  drivers/md/dm-zoned.h          | 10 +++++++
>  4 files changed, 115 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
> index 6b7fbbd735ef..2a5bc51fd6d5 100644
> --- a/drivers/md/dm-zoned-metadata.c
> +++ b/drivers/md/dm-zoned-metadata.c
> @@ -403,15 +403,18 @@ static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd,
>  	sector_t block = zmd->sb[zmd->mblk_primary].block + mblk_no;
>  	struct bio *bio;
>  
> +	if (dmz_bdev_is_dying(zmd->dev))
> +		return ERR_PTR(-EIO);
> +
>  	/* Get a new block and a BIO to read it */
>  	mblk = dmz_alloc_mblock(zmd, mblk_no);
>  	if (!mblk)
> -		return NULL;
> +		return ERR_PTR(-ENOMEM);
>  
>  	bio = bio_alloc(GFP_NOIO, 1);
>  	if (!bio) {
>  		dmz_free_mblock(zmd, mblk);
> -		return NULL;
> +		return ERR_PTR(-ENOMEM);
>  	}
>  
>  	spin_lock(&zmd->mblk_lock);
> @@ -542,8 +545,8 @@ static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd,
>  	if (!mblk) {
>  		/* Cache miss: read the block from disk */
>  		mblk = dmz_get_mblock_slow(zmd, mblk_no);
> -		if (!mblk)
> -			return ERR_PTR(-ENOMEM);
> +		if (IS_ERR(mblk))
> +			return mblk;
>  	}
>  
>  	/* Wait for on-going read I/O and check for error */
> @@ -571,16 +574,19 @@ static void dmz_dirty_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk)
>  /*
>   * Issue a metadata block write BIO.
>   */
> -static void dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
> -			     unsigned int set)
> +static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
> +			    unsigned int set)
>  {
>  	sector_t block = zmd->sb[set].block + mblk->no;
>  	struct bio *bio;
>  
> +	if (dmz_bdev_is_dying(zmd->dev))
> +		return -EIO;
> +
>  	bio = bio_alloc(GFP_NOIO, 1);
>  	if (!bio) {
>  		set_bit(DMZ_META_ERROR, &mblk->state);
> -		return;
> +		return -ENOMEM;
>  	}
>  
>  	set_bit(DMZ_META_WRITING, &mblk->state);
> @@ -592,6 +598,8 @@ static void dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
>  	bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META | REQ_PRIO);
>  	bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0);
>  	submit_bio(bio);
> +
> +	return 0;
>  }
>  
>  /*
> @@ -603,6 +611,9 @@ static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block,
>  	struct bio *bio;
>  	int ret;
>  
> +	if (dmz_bdev_is_dying(zmd->dev))
> +		return -EIO;
> +
>  	bio = bio_alloc(GFP_NOIO, 1);
>  	if (!bio)
>  		return -ENOMEM;
> @@ -660,22 +671,29 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd,
>  {
>  	struct dmz_mblock *mblk;
>  	struct blk_plug plug;
> -	int ret = 0;
> +	int ret = 0, nr_mblks_submitted = 0;
>  
>  	/* Issue writes */
>  	blk_start_plug(&plug);
> -	list_for_each_entry(mblk, write_list, link)
> -		dmz_write_mblock(zmd, mblk, set);
> +	list_for_each_entry(mblk, write_list, link) {
> +		ret = dmz_write_mblock(zmd, mblk, set);
> +		if (ret)
> +			break;
> +		nr_mblks_submitted++;
> +	}
>  	blk_finish_plug(&plug);
>  
>  	/* Wait for completion */
>  	list_for_each_entry(mblk, write_list, link) {
> +		if (!nr_mblks_submitted)
> +			break;
>  		wait_on_bit_io(&mblk->state, DMZ_META_WRITING,
>  			       TASK_UNINTERRUPTIBLE);
>  		if (test_bit(DMZ_META_ERROR, &mblk->state)) {
>  			clear_bit(DMZ_META_ERROR, &mblk->state);
>  			ret = -EIO;
>  		}
> +		nr_mblks_submitted--;
>  	}
>  
>  	/* Flush drive cache (this will also sync data) */
> @@ -737,6 +755,11 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
>  	 */
>  	dmz_lock_flush(zmd);
>  
> +	if (dmz_bdev_is_dying(zmd->dev)) {
> +		ret = -EIO;
> +		goto out;
> +	}
> +
>  	/* Get dirty blocks */
>  	spin_lock(&zmd->mblk_lock);
>  	list_splice_init(&zmd->mblk_dirty_list, &write_list);
> @@ -1632,6 +1655,10 @@ struct dm_zone *dmz_get_chunk_mapping(struct dmz_metadata *zmd, unsigned int chu
>  		/* Allocate a random zone */
>  		dzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND);
>  		if (!dzone) {
> +			if (dmz_bdev_is_dying(zmd->dev)) {
> +				dzone = ERR_PTR(-EIO);
> +				goto out;
> +			}
>  			dmz_wait_for_free_zones(zmd);
>  			goto again;
>  		}
> @@ -1729,6 +1756,10 @@ struct dm_zone *dmz_get_chunk_buffer(struct dmz_metadata *zmd,
>  	/* Allocate a random zone */
>  	bzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND);
>  	if (!bzone) {
> +		if (dmz_bdev_is_dying(zmd->dev)) {
> +			bzone = ERR_PTR(-EIO);
> +			goto out;
> +		}
>  		dmz_wait_for_free_zones(zmd);
>  		goto again;
>  	}
> diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c
> index 26e34493a2db..d240d7ca8a8a 100644
> --- a/drivers/md/dm-zoned-reclaim.c
> +++ b/drivers/md/dm-zoned-reclaim.c
> @@ -38,7 +38,7 @@ enum {
>  /*
>   * Number of seconds of target BIO inactivity to consider the target idle.
>   */
> -#define DMZ_IDLE_PERIOD		(10UL * HZ)
> +#define DMZ_IDLE_PERIOD			(10UL * HZ)
>  
>  /*
>   * Percentage of unmapped (free) random zones below which reclaim starts
> @@ -135,6 +135,9 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc,
>  		set_bit(DM_KCOPYD_WRITE_SEQ, &flags);
>  
>  	while (block < end_block) {
> +		if (dev->flags & DMZ_BDEV_DYING)
> +			return -EIO;
> +
>  		/* Get a valid region from the source zone */
>  		ret = dmz_first_valid_block(zmd, src_zone, &block);
>  		if (ret <= 0)
> @@ -452,6 +455,9 @@ static void dmz_reclaim_work(struct work_struct *work)
>  	unsigned int p_unmap_rnd;
>  	int ret;
>  
> +	if (dmz_bdev_is_dying(zrc->dev))
> +		return;
> +
>  	if (!dmz_should_reclaim(zrc)) {
>  		mod_delayed_work(zrc->wq, &zrc->work, DMZ_IDLE_PERIOD);
>  		return;
> @@ -481,8 +487,16 @@ static void dmz_reclaim_work(struct work_struct *work)
>  		      p_unmap_rnd, nr_unmap_rnd, nr_rnd);
>  
>  	ret = dmz_do_reclaim(zrc);
> -	if (ret)
> +	if (ret) {
>  		dmz_dev_debug(zrc->dev, "Reclaim error %d\n", ret);
> +		if (ret == -EIO)
> +			/*
> +			 * LLD might be performing some error handling sequence
> +			 * at the underlying device. To not interfere, do not
> +			 * attempt to schedule the next reclaim run immediately.
> +			 */
> +			return;
> +	}
>  
>  	dmz_schedule_reclaim(zrc);
>  }
> diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
> index c1992034c099..c40c18f7fd64 100644
> --- a/drivers/md/dm-zoned-target.c
> +++ b/drivers/md/dm-zoned-target.c
> @@ -134,6 +134,8 @@ static int dmz_submit_bio(struct dmz_target *dmz, struct dm_zone *zone,
>  
>  	refcount_inc(&bioctx->ref);
>  	generic_make_request(clone);
> +	if (clone->bi_status == BLK_STS_IOERR)
> +		return -EIO;
>  
>  	if (bio_op(bio) == REQ_OP_WRITE && dmz_is_seq(zone))
>  		zone->wp_block += nr_blocks;
> @@ -278,8 +280,8 @@ static int dmz_handle_buffered_write(struct dmz_target *dmz,
>  
>  	/* Get the buffer zone. One will be allocated if needed */
>  	bzone = dmz_get_chunk_buffer(zmd, zone);
> -	if (!bzone)
> -		return -ENOSPC;
> +	if (IS_ERR(bzone))
> +		return PTR_ERR(bzone);
>  
>  	if (dmz_is_readonly(bzone))
>  		return -EROFS;
> @@ -390,6 +392,11 @@ static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw,
>  
>  	dmz_lock_metadata(zmd);
>  
> +	if (dmz->dev->flags & DMZ_BDEV_DYING) {
> +		ret = -EIO;
> +		goto out;
> +	}
> +
>  	/*
>  	 * Get the data zone mapping the chunk. There may be no
>  	 * mapping for read and discard. If a mapping is obtained,
> @@ -494,6 +501,8 @@ static void dmz_flush_work(struct work_struct *work)
>  
>  	/* Flush dirty metadata blocks */
>  	ret = dmz_flush_metadata(dmz->metadata);
> +	if (ret)
> +		dmz_dev_debug(dmz->dev, "Metadata flush failed, rc=%d\n", ret);
>  
>  	/* Process queued flush requests */
>  	while (1) {
> @@ -557,6 +566,37 @@ static int dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
>  	return ret;
>  }
>  
> +/*
> + * Check the backing device availability. If it's on the way out,
> + * start failing I/O. Reclaim and metadata components also call this
> + * function to cleanly abort operation in the event of such failure.
> + */
> +bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev)
> +{
> +	struct gendisk *disk;
> +	bool dying = dmz_dev->flags & DMZ_BDEV_DYING;
> +
> +	if (!dying) {
> +		disk = dmz_dev->bdev->bd_disk;
> +		if (blk_queue_dying(bdev_get_queue(dmz_dev->bdev))) {
> +			dmz_dev_warn(dmz_dev, "Backing device queue dying");
> +			dying = true;
> +		} else if (disk->fops->check_events) {
> +			if (disk->fops->check_events(disk, 0) &
> +					DISK_EVENT_MEDIA_CHANGE) {
> +				dmz_dev_warn(dmz_dev, "Backing device offline");
> +				dying = true;
> +			}
> +		}
> +		if (dying) {
> +			dmz_dev->flags |= DMZ_BDEV_DYING;
> +			set_disk_ro(dmz_dev->bdev->bd_disk, 1);
> +		}
> +	}
> +
> +	return dying;
> +}
> +
>  /*
>   * Process a new BIO.
>   */
> @@ -570,6 +610,9 @@ static int dmz_map(struct dm_target *ti, struct bio *bio)
>  	sector_t chunk_sector;
>  	int ret;
>  
> +	if (dmz_bdev_is_dying(dmz->dev))
> +		return DM_MAPIO_KILL;
> +
>  	dmz_dev_debug(dev, "BIO op %d sector %llu + %u => chunk %llu, block %llu, %u blocks",
>  		      bio_op(bio), (unsigned long long)sector, nr_sectors,
>  		      (unsigned long long)dmz_bio_chunk(dmz->dev, bio),
> @@ -866,6 +909,9 @@ static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
>  {
>  	struct dmz_target *dmz = ti->private;
>  
> +	if (dmz_bdev_is_dying(dmz->dev))
> +		return -ENODEV;
> +
>  	*bdev = dmz->dev->bdev;
>  
>  	return 0;
> diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h
> index 1a3b06de2c00..d8e70b0ade35 100644
> --- a/drivers/md/dm-zoned.h
> +++ b/drivers/md/dm-zoned.h
> @@ -57,6 +57,8 @@ struct dmz_dev {
>  
>  	unsigned int		nr_zones;
>  
> +	unsigned int		flags;
> +
>  	sector_t		zone_nr_sectors;
>  	unsigned int		zone_nr_sectors_shift;
>  
> @@ -68,6 +70,9 @@ struct dmz_dev {
>  				 (dev)->zone_nr_sectors_shift)
>  #define dmz_chunk_block(dev, b)	((b) & ((dev)->zone_nr_blocks - 1))
>  
> +/* Device flags. */
> +#define DMZ_BDEV_DYING		(1 << 0)
> +
>  /*
>   * Zone descriptor.
>   */
> @@ -246,4 +251,9 @@ void dmz_resume_reclaim(struct dmz_reclaim *zrc);
>  void dmz_reclaim_bio_acc(struct dmz_reclaim *zrc);
>  void dmz_schedule_reclaim(struct dmz_reclaim *zrc);
>  
> +/*
> + * Functions defined in dm-zoned-target.c
> + */
> +bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev);
> +
>  #endif /* DM_ZONED_H */
> 


-- 
Damien Le Moal
Western Digital Research