[dm-devel] [PATCH 1/2] dm-zoned: cache device for zones

Damien Le Moal Damien.LeMoal at wdc.com
Tue Mar 24 03:52:24 UTC 2020


+Bob, who proposed a similar change last month.

On 2020/03/24 0:04, Hannes Reinecke wrote:
> Implement 'cache' zones which reside on a different device.
> The device is logically split into zones, which then will be
> used as 'cache' zones, similar to the existing random write
> zones.

It does look like the new "cache" zones are really used exactly like the
conventional zones of the SMR drive. So I wonder: why even define this new zone
type? We could instead split the "cache" device into random (conventional)
zones and add them to a single pool of random zones. We would then only need to
add device awareness to the zone allocator so that, as much as possible, a
random zone is not taken from the same drive as the sequential zone it buffers.
That would avoid duplicating most of the code for cache & random zones.

Furthermore, this work is really great for supporting SMR drives with no
conventional zones at all (there is a lot of demand for these). And considering
that the new FORMAT WITH PRESET command is coming soon, a user will be able to
reformat an SMR drive with sequential zones only to maximize capacity. For such
drives, the cache device would need to hold all the random zones, at which
point the difference between cache and random goes away entirely.

> 
> Signed-off-by: Hannes Reinecke <hare at suse.de>
> ---
>  drivers/md/dm-zoned-metadata.c | 174 ++++++++++++++++++++++++++++-----
>  drivers/md/dm-zoned-reclaim.c  |  76 +++++++++++---
>  drivers/md/dm-zoned-target.c   | 109 ++++++++++++++++++---
>  drivers/md/dm-zoned.h          |  31 +++++-
>  4 files changed, 339 insertions(+), 51 deletions(-)
> 
> diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
> index 369de15c4e80..41cc3a29db0b 100644
> --- a/drivers/md/dm-zoned-metadata.c
> +++ b/drivers/md/dm-zoned-metadata.c
> @@ -132,6 +132,8 @@ struct dmz_sb {
>  struct dmz_metadata {
>  	struct dmz_dev		*dev;
>  
> +	struct dmz_cdev		*cdev;

Given the point above, we could generalize this into an array of devices, with
the first one meeting these constraints:
* It contains the metadata
* It has random/conventional zones, or is a regular device (with all of its
capacity used through emulated random zones)

I do not think that complicates your changes a lot. The reclaim part will need
some more love to be efficient I guess, but it may be as simple as defining one
work struct for each drive beside the first one, along the lines of the sketch
below.
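
For instance, something along these lines (very rough sketch, all names made
up):

	/*
	 * Sketch: describe all backing devices with an array, the first
	 * entry holding the metadata and the (emulated) random zones.
	 */
	struct dmz_metadata {
		struct dmz_dev		*dev;	/* array of nr_devs entries */
		unsigned int		nr_devs;
		/* ... all other existing fields unchanged ... */
	};

	/*
	 * Sketch: one reclaim context per device beside the first one,
	 * so that reclaim of the different drives can run in parallel.
	 */
	struct dmz_dev_reclaim {
		struct dmz_dev		*dev;
		struct delayed_work	work;
		struct workqueue_struct	*wq;
	};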

Thoughts ?

> +
>  	sector_t		zone_bitmap_size;
>  	unsigned int		zone_nr_bitmap_blocks;
>  	unsigned int		zone_bits_per_mblk;
> @@ -139,10 +141,12 @@ struct dmz_metadata {
>  	unsigned int		nr_bitmap_blocks;
>  	unsigned int		nr_map_blocks;
>  
> +	unsigned int		nr_zones;
>  	unsigned int		nr_useable_zones;
>  	unsigned int		nr_meta_blocks;
>  	unsigned int		nr_meta_zones;
>  	unsigned int		nr_data_zones;
> +	unsigned int		nr_cache_zones;
>  	unsigned int		nr_rnd_zones;
>  	unsigned int		nr_reserved_seq;
>  	unsigned int		nr_chunks;
> @@ -173,6 +177,11 @@ struct dmz_metadata {
>  	struct list_head	unmap_rnd_list;
>  	struct list_head	map_rnd_list;
>  
> +	unsigned int		nr_cache;
> +	atomic_t		unmap_nr_cache;
> +	struct list_head	unmap_cache_list;
> +	struct list_head	map_cache_list;
> +
>  	unsigned int		nr_seq;
>  	atomic_t		unmap_nr_seq;
>  	struct list_head	unmap_seq_list;
> @@ -189,17 +198,25 @@ struct dmz_metadata {
>   */
>  unsigned int dmz_id(struct dmz_metadata *zmd, struct dm_zone *zone)
>  {
> -	return ((unsigned int)(zone - zmd->zones));
> +	return zone->id;
>  }
>  
>  sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone)
>  {
> -	return (sector_t)dmz_id(zmd, zone) << zmd->dev->zone_nr_sectors_shift;
> +	sector_t zone_id = dmz_id(zmd, zone);
> +
> +	if (dmz_is_cache(zone))
> +		zone_id -= zmd->dev->nr_zones;
> +	return zone_id << zmd->dev->zone_nr_sectors_shift;
>  }
>  
>  sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone)
>  {
> -	return (sector_t)dmz_id(zmd, zone) << zmd->dev->zone_nr_blocks_shift;
> +	sector_t zone_id = dmz_id(zmd, zone);
> +
> +	if (dmz_is_cache(zone))
> +		zone_id -= zmd->dev->nr_zones;
> +	return zone_id << zmd->dev->zone_nr_blocks_shift;
>  }
>  
>  unsigned int dmz_nr_chunks(struct dmz_metadata *zmd)
> @@ -217,6 +234,16 @@ unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd)
>  	return atomic_read(&zmd->unmap_nr_rnd);
>  }
>  
> +unsigned int dmz_nr_cache_zones(struct dmz_metadata *zmd)
> +{
> +	return zmd->nr_cache;
> +}
> +
> +unsigned int dmz_nr_unmap_cache_zones(struct dmz_metadata *zmd)
> +{
> +	return atomic_read(&zmd->unmap_nr_cache);
> +}
> +
>  /*
>   * Lock/unlock mapping table.
>   * The map lock also protects all the zone lists.
> @@ -865,6 +892,10 @@ static int dmz_check_sb(struct dmz_metadata *zmd, struct dmz_super *sb)
>  		dmz_dev_err(dev, "Invalid number of metadata blocks");
>  		return -ENXIO;
>  	}
> +	if (zmd->nr_cache_zones && nr_meta_zones >= zmd->nr_cache_zones) {
> +		dmz_dev_err(dev, "Cache too small to hold metadata");
> +		return -ENXIO;
> +	}
>  
>  	if (!le32_to_cpu(sb->nr_reserved_seq) ||
>  	    le32_to_cpu(sb->nr_reserved_seq) >= (zmd->nr_useable_zones - nr_meta_zones)) {
> @@ -1104,6 +1135,7 @@ static int dmz_init_zone(struct blk_zone *blkz, unsigned int idx, void *data)
>  
>  	INIT_LIST_HEAD(&zone->link);
>  	atomic_set(&zone->refcount, 0);
> +	zone->id = idx;
>  	zone->chunk = DMZ_MAP_UNMAPPED;
>  
>  	switch (blkz->type) {
> @@ -1157,6 +1189,7 @@ static void dmz_drop_zones(struct dmz_metadata *zmd)
>  static int dmz_init_zones(struct dmz_metadata *zmd)
>  {
>  	struct dmz_dev *dev = zmd->dev;
> +	struct dmz_cdev *cdev = zmd->cdev;
>  	int ret;
>  
>  	/* Init */
> @@ -1167,12 +1200,16 @@ static int dmz_init_zones(struct dmz_metadata *zmd)
>  					DMZ_BLOCK_SIZE_BITS);
>  
>  	/* Allocate zone array */
> -	zmd->zones = kcalloc(dev->nr_zones, sizeof(struct dm_zone), GFP_KERNEL);
> +	zmd->nr_zones = dev->nr_zones;
> +	if (cdev)
> +		zmd->nr_zones += cdev->capacity / dev->zone_nr_sectors;
> +
> +	zmd->zones = kcalloc(zmd->nr_zones, sizeof(struct dm_zone), GFP_KERNEL);
>  	if (!zmd->zones)
>  		return -ENOMEM;
>  
>  	dmz_dev_info(dev, "Using %zu B for zone information",
> -		     sizeof(struct dm_zone) * dev->nr_zones);
> +		     sizeof(struct dm_zone) * zmd->nr_zones);
>  
>  	/*
>  	 * Get zone information and initialize zone descriptors.  At the same
> @@ -1185,7 +1222,26 @@ static int dmz_init_zones(struct dmz_metadata *zmd)
>  		dmz_drop_zones(zmd);
>  		return ret;
>  	}
> +	if (cdev) {
> +		int idx;
> +
> +		for (idx = dev->nr_zones; idx < zmd->nr_zones; idx++) {
> +			struct dm_zone *zone = &zmd->zones[idx];
>  
> +			INIT_LIST_HEAD(&zone->link);
> +			atomic_set(&zone->refcount, 0);
> +			zone->id = idx;
> +			zone->chunk = DMZ_MAP_UNMAPPED;
> +			set_bit(DMZ_CACHE, &zone->flags);
> +			zmd->nr_cache_zones++;
> +			zone->wp_block = 0;
> +			zmd->nr_useable_zones++;
> +			if (!zmd->sb_zone) {
> +				/* Super block zone */
> +				zmd->sb_zone = zone;
> +			}
> +		}
> +	}
>  	return 0;
>  }
>  
> @@ -1216,6 +1272,9 @@ static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
>  	unsigned int noio_flag;
>  	int ret;
>  
> +	if (dmz_is_cache(zone))
> +		return 0;
> +
>  	/*
>  	 * Get zone information from disk. Since blkdev_report_zones() uses
>  	 * GFP_KERNEL by default for memory allocations, set the per-task
> @@ -1283,7 +1342,8 @@ static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
>  	 */
>  	if (dmz_is_offline(zone) ||
>  	    dmz_is_readonly(zone) ||
> -	    dmz_is_rnd(zone))
> +	    dmz_is_rnd(zone) ||
> +	    dmz_is_cache(zone))
>  		return 0;
>  
>  	if (!dmz_is_empty(zone) || dmz_seq_write_err(zone)) {
> @@ -1345,7 +1405,7 @@ static int dmz_load_mapping(struct dmz_metadata *zmd)
>  		if (dzone_id == DMZ_MAP_UNMAPPED)
>  			goto next;
>  
> -		if (dzone_id >= dev->nr_zones) {
> +		if (dzone_id >= zmd->nr_zones) {
>  			dmz_dev_err(dev, "Chunk %u mapping: invalid data zone ID %u",
>  				    chunk, dzone_id);
>  			return -EIO;
> @@ -1358,6 +1418,8 @@ static int dmz_load_mapping(struct dmz_metadata *zmd)
>  
>  		if (dmz_is_rnd(dzone))
>  			list_add_tail(&dzone->link, &zmd->map_rnd_list);
> +		else if (dmz_is_cache(dzone))
> +			list_add_tail(&dzone->link, &zmd->map_cache_list);
>  		else
>  			list_add_tail(&dzone->link, &zmd->map_seq_list);
>  
> @@ -1366,14 +1428,14 @@ static int dmz_load_mapping(struct dmz_metadata *zmd)
>  		if (bzone_id == DMZ_MAP_UNMAPPED)
>  			goto next;
>  
> -		if (bzone_id >= dev->nr_zones) {
> +		if (bzone_id >= zmd->nr_zones) {
>  			dmz_dev_err(dev, "Chunk %u mapping: invalid buffer zone ID %u",
>  				    chunk, bzone_id);
>  			return -EIO;
>  		}
>  
>  		bzone = dmz_get(zmd, bzone_id);
> -		if (!dmz_is_rnd(bzone)) {
> +		if (!dmz_is_rnd(bzone) && !dmz_is_cache(bzone)) {
>  			dmz_dev_err(dev, "Chunk %u mapping: invalid buffer zone %u",
>  				    chunk, bzone_id);
>  			return -EIO;
> @@ -1385,7 +1447,10 @@ static int dmz_load_mapping(struct dmz_metadata *zmd)
>  		bzone->bzone = dzone;
>  		dzone->bzone = bzone;
>  		dmz_get_zone_weight(zmd, bzone);
> -		list_add_tail(&bzone->link, &zmd->map_rnd_list);
> +		if (dmz_is_cache(bzone))
> +			list_add_tail(&bzone->link, &zmd->map_cache_list);
> +		else
> +			list_add_tail(&bzone->link, &zmd->map_rnd_list);
>  next:
>  		chunk++;
>  		e++;
> @@ -1398,13 +1463,15 @@ static int dmz_load_mapping(struct dmz_metadata *zmd)
>  	 * fully initialized. All remaining zones are unmapped data
>  	 * zones. Finish initializing those here.
>  	 */
> -	for (i = 0; i < dev->nr_zones; i++) {
> +	for (i = 0; i < zmd->nr_zones; i++) {
>  		dzone = dmz_get(zmd, i);
>  		if (dmz_is_meta(dzone))
>  			continue;
>  
>  		if (dmz_is_rnd(dzone))
>  			zmd->nr_rnd++;
> +		else if (dmz_is_cache(dzone))
> +			zmd->nr_cache++;
>  		else
>  			zmd->nr_seq++;
>  
> @@ -1419,6 +1486,9 @@ static int dmz_load_mapping(struct dmz_metadata *zmd)
>  		if (dmz_is_rnd(dzone)) {
>  			list_add_tail(&dzone->link, &zmd->unmap_rnd_list);
>  			atomic_inc(&zmd->unmap_nr_rnd);
> +		} else if (dmz_is_cache(dzone)) {
> +			list_add_tail(&dzone->link, &zmd->unmap_cache_list);
> +			atomic_inc(&zmd->unmap_nr_cache);
>  		} else if (atomic_read(&zmd->nr_reserved_seq_zones) < zmd->nr_reserved_seq) {
>  			list_add_tail(&dzone->link, &zmd->reserved_seq_zones_list);
>  			atomic_inc(&zmd->nr_reserved_seq_zones);
> @@ -1460,6 +1530,9 @@ static void __dmz_lru_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
>  	if (dmz_is_seq(zone)) {
>  		/* LRU rotate sequential zone */
>  		list_add_tail(&zone->link, &zmd->map_seq_list);
> +	} else if (dmz_is_cache(zone)) {
> +		/* LRU rotate cache zone */
> +		list_add_tail(&zone->link, &zmd->map_cache_list);
>  	} else {
>  		/* LRU rotate random zone */
>  		list_add_tail(&zone->link, &zmd->map_rnd_list);
> @@ -1557,6 +1630,29 @@ static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd)
>  	return ERR_PTR(-EBUSY);
>  }
>  
> +/*
> + * Select a cache zone for reclaim.
> + */
> +static struct dm_zone *dmz_get_cache_zone_for_reclaim(struct dmz_metadata *zmd)
> +{
> +	struct dm_zone *dzone = NULL;
> +	struct dm_zone *zone;
> +
> +	if (list_empty(&zmd->map_cache_list))
> +		return ERR_PTR(-EBUSY);
> +
> +	list_for_each_entry(zone, &zmd->map_cache_list, link) {
> +		if (dmz_is_buf(zone))
> +			dzone = zone->bzone;
> +		else
> +			dzone = zone;
> +		if (dmz_lock_zone_reclaim(dzone))
> +			return dzone;
> +	}
> +
> +	return ERR_PTR(-EBUSY);
> +}
> +
>  /*
>   * Select a buffered sequential zone for reclaim.
>   */
> @@ -1590,13 +1686,17 @@ struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd)
>  	 *     cannot be reclaimed. So choose a sequential zone to reclaim so
>  	 *     that afterward a random zone can be reclaimed.
>  	 * (2) At least one free sequential zone is available, then choose
> -	 *     the oldest random zone (data or buffer) that can be locked.
> +	 *     either the oldest cache zone, or, failing that, the oldest
> +	 *     random zone (data or buffer) that can be locked.
>  	 */
>  	dmz_lock_map(zmd);
>  	if (list_empty(&zmd->reserved_seq_zones_list))
>  		zone = dmz_get_seq_zone_for_reclaim(zmd);
> -	else
> -		zone = dmz_get_rnd_zone_for_reclaim(zmd);
> +	else {
> +		zone = dmz_get_cache_zone_for_reclaim(zmd);
> +		if (!zone)
> +			zone = dmz_get_rnd_zone_for_reclaim(zmd);
> +	}
>  	dmz_unlock_map(zmd);
>  
>  	return zone;
> @@ -1629,8 +1729,12 @@ struct dm_zone *dmz_get_chunk_mapping(struct dmz_metadata *zmd, unsigned int chu
>  		if (op != REQ_OP_WRITE)
>  			goto out;
>  
> -		/* Allocate a random zone */
> -		dzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND);
> +		/* Try to allocate a cache zone first */
> +		dzone = dmz_alloc_zone(zmd, DMZ_ALLOC_CACHE);
> +		if (!dzone) {
> +			/* Allocate a random zone */
> +			dzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND);
> +		}
>  		if (!dzone) {
>  			if (dmz_bdev_is_dying(zmd->dev)) {
>  				dzone = ERR_PTR(-EIO);
> @@ -1730,8 +1834,12 @@ struct dm_zone *dmz_get_chunk_buffer(struct dmz_metadata *zmd,
>  	if (bzone)
>  		goto out;
>  
> -	/* Allocate a random zone */
> -	bzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND);
> +	/* Try to allocate a cache zone first */
> +	bzone = dmz_alloc_zone(zmd, DMZ_ALLOC_CACHE);
> +	if (!bzone) {
> +		/* Allocate a random zone */
> +		bzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND);
> +	}
>  	if (!bzone) {
>  		if (dmz_bdev_is_dying(zmd->dev)) {
>  			bzone = ERR_PTR(-EIO);
> @@ -1749,7 +1857,10 @@ struct dm_zone *dmz_get_chunk_buffer(struct dmz_metadata *zmd,
>  	bzone->chunk = dzone->chunk;
>  	bzone->bzone = dzone;
>  	dzone->bzone = bzone;
> -	list_add_tail(&bzone->link, &zmd->map_rnd_list);
> +	if (dmz_is_cache(bzone))
> +		list_add_tail(&bzone->link, &zmd->map_cache_list);
> +	else
> +		list_add_tail(&bzone->link, &zmd->map_rnd_list);
>  out:
>  	dmz_unlock_map(zmd);
>  
> @@ -1765,7 +1876,9 @@ struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, unsigned long flags)
>  	struct list_head *list;
>  	struct dm_zone *zone;
>  
> -	if (flags & DMZ_ALLOC_RND)
> +	if (flags & DMZ_ALLOC_CACHE)
> +		list = &zmd->unmap_cache_list;
> +	else if (flags & DMZ_ALLOC_RND)
>  		list = &zmd->unmap_rnd_list;
>  	else
>  		list = &zmd->unmap_seq_list;
> @@ -1791,6 +1904,8 @@ struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, unsigned long flags)
>  
>  	if (dmz_is_rnd(zone))
>  		atomic_dec(&zmd->unmap_nr_rnd);
> +	else if (dmz_is_cache(zone))
> +		atomic_dec(&zmd->unmap_nr_cache);
>  	else
>  		atomic_dec(&zmd->unmap_nr_seq);
>  
> @@ -1817,6 +1932,9 @@ void dmz_free_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
>  	if (dmz_is_rnd(zone)) {
>  		list_add_tail(&zone->link, &zmd->unmap_rnd_list);
>  		atomic_inc(&zmd->unmap_nr_rnd);
> +	} else if (dmz_is_cache(zone)) {
> +		list_add_tail(&zone->link, &zmd->unmap_cache_list);
> +		atomic_inc(&zmd->unmap_nr_cache);
>  	} else if (atomic_read(&zmd->nr_reserved_seq_zones) <
>  		   zmd->nr_reserved_seq) {
>  		list_add_tail(&zone->link, &zmd->reserved_seq_zones_list);
> @@ -1842,6 +1960,8 @@ void dmz_map_zone(struct dmz_metadata *zmd, struct dm_zone *dzone,
>  	dzone->chunk = chunk;
>  	if (dmz_is_rnd(dzone))
>  		list_add_tail(&dzone->link, &zmd->map_rnd_list);
> +	else if (dmz_is_cache(dzone))
> +		list_add_tail(&dzone->link, &zmd->map_cache_list);
>  	else
>  		list_add_tail(&dzone->link, &zmd->map_seq_list);
>  }
> @@ -2360,7 +2480,8 @@ static void dmz_cleanup_metadata(struct dmz_metadata *zmd)
>  /*
>   * Initialize the zoned metadata.
>   */
> -int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata)
> +int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_cdev *cdev,
> +		     struct dmz_metadata **metadata)
>  {
>  	struct dmz_metadata *zmd;
>  	unsigned int i, zid;
> @@ -2372,6 +2493,7 @@ int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata)
>  		return -ENOMEM;
>  
>  	zmd->dev = dev;
> +	zmd->cdev = cdev;
>  	zmd->mblk_rbtree = RB_ROOT;
>  	init_rwsem(&zmd->mblk_sem);
>  	mutex_init(&zmd->mblk_flush_lock);
> @@ -2384,6 +2506,10 @@ int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata)
>  	INIT_LIST_HEAD(&zmd->unmap_rnd_list);
>  	INIT_LIST_HEAD(&zmd->map_rnd_list);
>  
> +	atomic_set(&zmd->unmap_nr_cache, 0);
> +	INIT_LIST_HEAD(&zmd->unmap_cache_list);
> +	INIT_LIST_HEAD(&zmd->map_cache_list);
> +
>  	atomic_set(&zmd->unmap_nr_seq, 0);
>  	INIT_LIST_HEAD(&zmd->unmap_seq_list);
>  	INIT_LIST_HEAD(&zmd->map_seq_list);
> @@ -2407,7 +2533,7 @@ int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata)
>  	zid = dmz_id(zmd, zmd->sb_zone);
>  	for (i = 0; i < zmd->nr_meta_zones << 1; i++) {
>  		zone = dmz_get(zmd, zid + i);
> -		if (!dmz_is_rnd(zone))
> +		if (!dmz_is_rnd(zone) && !dmz_is_cache(zone))
>  			goto err;
>  		set_bit(DMZ_META, &zone->flags);
>  	}
> @@ -2449,6 +2575,8 @@ int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata)
>  		     zmd->nr_data_zones, zmd->nr_chunks);
>  	dmz_dev_info(dev, "    %u random zones (%u unmapped)",
>  		     zmd->nr_rnd, atomic_read(&zmd->unmap_nr_rnd));
> +	dmz_dev_info(dev, "    %u cache zones (%u unmapped)",
> +		     zmd->nr_cache, atomic_read(&zmd->unmap_nr_cache));
>  	dmz_dev_info(dev, "    %u sequential zones (%u unmapped)",
>  		     zmd->nr_seq, atomic_read(&zmd->unmap_nr_seq));
>  	dmz_dev_info(dev, "  %u reserved sequential data zones",
> @@ -2495,7 +2623,7 @@ int dmz_resume_metadata(struct dmz_metadata *zmd)
>  	int ret;
>  
>  	/* Check zones */
> -	for (i = 0; i < dev->nr_zones; i++) {
> +	for (i = 0; i < zmd->nr_zones; i++) {
>  		zone = dmz_get(zmd, i);
>  		if (!zone) {
>  			dmz_dev_err(dev, "Unable to get zone %u", i);
> diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c
> index e7ace908a9b7..c394ba19cf9b 100644
> --- a/drivers/md/dm-zoned-reclaim.c
> +++ b/drivers/md/dm-zoned-reclaim.c
> @@ -14,6 +14,7 @@
>  struct dmz_reclaim {
>  	struct dmz_metadata     *metadata;
>  	struct dmz_dev		*dev;
> +	struct dmz_cdev		*cdev;
>  
>  	struct delayed_work	work;
>  	struct workqueue_struct *wq;
> @@ -44,13 +45,15 @@ enum {
>   * Percentage of unmapped (free) random zones below which reclaim starts
>   * even if the target is busy.
>   */
> -#define DMZ_RECLAIM_LOW_UNMAP_RND	30
> +#define DMZ_RECLAIM_LOW_UNMAP_RND	25
> +#define DMZ_RECLAIM_LOW_UNMAP_CACHE	40
>  
>  /*
>   * Percentage of unmapped (free) random zones above which reclaim will
>   * stop if the target is busy.
>   */
>  #define DMZ_RECLAIM_HIGH_UNMAP_RND	50
> +#define DMZ_RECLAIM_HIGH_UNMAP_CACHE	60
>  
>  /*
>   * Align a sequential zone write pointer to chunk_block.
> @@ -117,6 +120,7 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc,
>  {
>  	struct dmz_metadata *zmd = zrc->metadata;
>  	struct dmz_dev *dev = zrc->dev;
> +	struct dmz_cdev *cdev = zrc->cdev;
>  	struct dm_io_region src, dst;
>  	sector_t block = 0, end_block;
>  	sector_t nr_blocks;
> @@ -156,11 +160,17 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc,
>  				return ret;
>  		}
>  
> -		src.bdev = dev->bdev;
> +		if (dmz_is_cache(src_zone))
> +			src.bdev = cdev->bdev;
> +		else
> +			src.bdev = dev->bdev;
>  		src.sector = dmz_blk2sect(src_zone_block + block);
>  		src.count = dmz_blk2sect(nr_blocks);
>  
> -		dst.bdev = dev->bdev;
> +		if (dmz_is_cache(dst_zone))
> +			dst.bdev = cdev->bdev;
> +		else
> +			dst.bdev = dev->bdev;
>  		dst.sector = dmz_blk2sect(dst_zone_block + block);
>  		dst.count = src.count;
>  
> @@ -354,7 +364,7 @@ static int dmz_do_reclaim(struct dmz_reclaim *zrc)
>  
>  	start = jiffies;
>  
> -	if (dmz_is_rnd(dzone)) {
> +	if (dmz_is_rnd(dzone) || dmz_is_cache(dzone)) {
>  		if (!dmz_weight(dzone)) {
>  			/* Empty zone */
>  			dmz_reclaim_empty(zrc, dzone);
> @@ -423,7 +433,7 @@ static inline int dmz_target_idle(struct dmz_reclaim *zrc)
>  /*
>   * Test if reclaim is necessary.
>   */
> -static bool dmz_should_reclaim(struct dmz_reclaim *zrc)
> +static bool dmz_should_rnd_reclaim(struct dmz_reclaim *zrc)
>  {
>  	struct dmz_metadata *zmd = zrc->metadata;
>  	unsigned int nr_rnd = dmz_nr_rnd_zones(zmd);
> @@ -445,6 +455,32 @@ static bool dmz_should_reclaim(struct dmz_reclaim *zrc)
>  	return p_unmap_rnd <= DMZ_RECLAIM_LOW_UNMAP_RND;
>  }
>  
> +static bool dmz_should_cache_reclaim(struct dmz_reclaim *zrc)
> +{
> +	struct dmz_metadata *zmd = zrc->metadata;
> +	unsigned int nr_cache = dmz_nr_cache_zones(zmd);
> +	unsigned int nr_unmap_cache = dmz_nr_unmap_cache_zones(zmd);
> +	unsigned int p_unmap_cache;
> +
> +	if (!nr_cache)
> +		return false;
> +
> +	/* Reclaim when idle */
> +	if (dmz_target_idle(zrc) && nr_unmap_cache < nr_cache)
> +		return true;
> +
> +	/* If there are still plenty of cache zones, do not reclaim */
> +	p_unmap_cache = nr_unmap_cache * 100 / nr_cache;
> +	if (p_unmap_cache >= DMZ_RECLAIM_HIGH_UNMAP_CACHE)
> +		return false;
> +
> +	/*
> +	 * If the percentage of unmapped cache zones is low,
> +	 * reclaim even if the target is busy.
> +	 */
> +	return p_unmap_cache <= DMZ_RECLAIM_LOW_UNMAP_CACHE;
> +}
> +
>  /*
>   * Reclaim work function.
>   */
> @@ -452,14 +488,14 @@ static void dmz_reclaim_work(struct work_struct *work)
>  {
>  	struct dmz_reclaim *zrc = container_of(work, struct dmz_reclaim, work.work);
>  	struct dmz_metadata *zmd = zrc->metadata;
> -	unsigned int nr_rnd, nr_unmap_rnd;
> -	unsigned int p_unmap_rnd;
> +	unsigned int nr_rnd, nr_unmap_rnd, nr_cache, nr_unmap_cache;
> +	unsigned int p_unmap_rnd, p_unmap_cache = 100;
>  	int ret;
>  
>  	if (dmz_bdev_is_dying(zrc->dev))
>  		return;
>  
> -	if (!dmz_should_reclaim(zrc)) {
> +	if (!dmz_should_cache_reclaim(zrc) && !dmz_should_rnd_reclaim(zrc)) {
>  		mod_delayed_work(zrc->wq, &zrc->work, DMZ_IDLE_PERIOD);
>  		return;
>  	}
> @@ -470,21 +506,33 @@ static void dmz_reclaim_work(struct work_struct *work)
>  	 * and slower if there are still some free random zones to avoid
>  	 * as much as possible to negatively impact the user workload.
>  	 */
> +	nr_cache = dmz_nr_cache_zones(zmd);
> +	if (nr_cache) {
> +		nr_unmap_cache = dmz_nr_unmap_cache_zones(zmd);
> +		p_unmap_cache = nr_unmap_cache * 100 / nr_cache;
> +	}
>  	nr_rnd = dmz_nr_rnd_zones(zmd);
>  	nr_unmap_rnd = dmz_nr_unmap_rnd_zones(zmd);
>  	p_unmap_rnd = nr_unmap_rnd * 100 / nr_rnd;
> -	if (dmz_target_idle(zrc) || p_unmap_rnd < DMZ_RECLAIM_LOW_UNMAP_RND / 2) {
> -		/* Idle or very low percentage: go fast */
> +	if (dmz_target_idle(zrc)) {
> +		/* Idle; go fast */
> +		zrc->kc_throttle.throttle = 100;
> +	} else if (p_unmap_rnd < DMZ_RECLAIM_LOW_UNMAP_CACHE / 2) {
> +		/* Low percentage of cache zones; go fast */
>  		zrc->kc_throttle.throttle = 100;
> +	} else if (p_unmap_rnd < DMZ_RECLAIM_LOW_UNMAP_RND / 2) {
> +		/* Busy, but low percentage of random zones; throttle */
> +		zrc->kc_throttle.throttle = min(75U, 100U - p_unmap_cache / 2);
>  	} else {
>  		/* Busy but we still have some random zone: throttle */
>  		zrc->kc_throttle.throttle = min(75U, 100U - p_unmap_rnd / 2);
>  	}
>  
>  	dmz_dev_debug(zrc->dev,
> -		      "Reclaim (%u): %s, %u%% free rnd zones (%u/%u)",
> +		      "Reclaim (%u): %s, free zones: cache %u%% (%u/%u) rnd %u%% (%u/%u)",
>  		      zrc->kc_throttle.throttle,
>  		      (dmz_target_idle(zrc) ? "Idle" : "Busy"),
> +		      p_unmap_cache, nr_unmap_cache, nr_cache,
>  		      p_unmap_rnd, nr_unmap_rnd, nr_rnd);
>  
>  	ret = dmz_do_reclaim(zrc);
> @@ -500,7 +548,8 @@ static void dmz_reclaim_work(struct work_struct *work)
>  /*
>   * Initialize reclaim.
>   */
> -int dmz_ctr_reclaim(struct dmz_dev *dev, struct dmz_metadata *zmd,
> +int dmz_ctr_reclaim(struct dmz_dev *dev, struct dmz_cdev *cdev,
> +		    struct dmz_metadata *zmd,
>  		    struct dmz_reclaim **reclaim)
>  {
>  	struct dmz_reclaim *zrc;
> @@ -511,6 +560,7 @@ int dmz_ctr_reclaim(struct dmz_dev *dev, struct dmz_metadata *zmd,
>  		return -ENOMEM;
>  
>  	zrc->dev = dev;
> +	zrc->cdev = cdev;
>  	zrc->metadata = zmd;
>  	zrc->atime = jiffies;
>  
> @@ -583,7 +633,7 @@ void dmz_reclaim_bio_acc(struct dmz_reclaim *zrc)
>   */
>  void dmz_schedule_reclaim(struct dmz_reclaim *zrc)
>  {
> -	if (dmz_should_reclaim(zrc))
> +	if (dmz_should_cache_reclaim(zrc) || dmz_should_rnd_reclaim(zrc))
>  		mod_delayed_work(zrc->wq, &zrc->work, 0);
>  }
>  
> diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
> index f4f83d39b3dc..3f25953672b9 100644
> --- a/drivers/md/dm-zoned-target.c
> +++ b/drivers/md/dm-zoned-target.c
> @@ -39,12 +39,16 @@ struct dm_chunk_work {
>   */
>  struct dmz_target {
>  	struct dm_dev		*ddev;
> +	struct dm_dev		*cddev;
>  
>  	unsigned long		flags;
>  
>  	/* Zoned block device information */
>  	struct dmz_dev		*dev;
>  
> +	/* Cache block device information */
> +	struct dmz_cdev		*cdev;
> +
>  	/* For metadata handling */
>  	struct dmz_metadata     *metadata;
>  
> @@ -125,7 +129,10 @@ static int dmz_submit_bio(struct dmz_target *dmz, struct dm_zone *zone,
>  	if (!clone)
>  		return -ENOMEM;
>  
> -	bio_set_dev(clone, dmz->dev->bdev);
> +	if (dmz_is_cache(zone))
> +		bio_set_dev(clone, dmz->cdev->bdev);
> +	else
> +		bio_set_dev(clone, dmz->dev->bdev);
>  	clone->bi_iter.bi_sector =
>  		dmz_start_sect(dmz->metadata, zone) + dmz_blk2sect(chunk_block);
>  	clone->bi_iter.bi_size = dmz_blk2sect(nr_blocks) << SECTOR_SHIFT;
> @@ -179,7 +186,8 @@ static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone,
>  
>  	dmz_dev_debug(dmz->dev, "READ chunk %llu -> %s zone %u, block %llu, %u blocks",
>  		      (unsigned long long)dmz_bio_chunk(dmz->dev, bio),
> -		      (dmz_is_rnd(zone) ? "RND" : "SEQ"),
> +		      (dmz_is_rnd(zone) ?
> +		       (dmz_is_cache(zone) ? "CACHE" : "RND") : "SEQ"),
>  		      dmz_id(dmz->metadata, zone),
>  		      (unsigned long long)chunk_block, nr_blocks);
>  
> @@ -187,7 +195,8 @@ static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone,
>  	bzone = zone->bzone;
>  	while (chunk_block < end_block) {
>  		nr_blocks = 0;
> -		if (dmz_is_rnd(zone) || chunk_block < zone->wp_block) {
> +		if (dmz_is_rnd(zone) || dmz_is_cache(zone) ||
> +		    chunk_block < zone->wp_block) {
>  			/* Test block validity in the data zone */
>  			ret = dmz_block_valid(dmz->metadata, zone, chunk_block);
>  			if (ret < 0)
> @@ -316,11 +325,13 @@ static int dmz_handle_write(struct dmz_target *dmz, struct dm_zone *zone,
>  
>  	dmz_dev_debug(dmz->dev, "WRITE chunk %llu -> %s zone %u, block %llu, %u blocks",
>  		      (unsigned long long)dmz_bio_chunk(dmz->dev, bio),
> -		      (dmz_is_rnd(zone) ? "RND" : "SEQ"),
> +		      (dmz_is_rnd(zone) ?
> +		       (dmz_is_cache(zone) ? "CACHE" : "RND") : "SEQ"),
>  		      dmz_id(dmz->metadata, zone),
>  		      (unsigned long long)chunk_block, nr_blocks);
>  
> -	if (dmz_is_rnd(zone) || chunk_block == zone->wp_block) {
> +	if (dmz_is_rnd(zone) || dmz_is_cache(zone) ||
> +	    chunk_block == zone->wp_block) {
>  		/*
>  		 * zone is a random zone or it is a sequential zone
>  		 * and the BIO is aligned to the zone write pointer:
> @@ -364,7 +375,8 @@ static int dmz_handle_discard(struct dmz_target *dmz, struct dm_zone *zone,
>  	 * Invalidate blocks in the data zone and its
>  	 * buffer zone if one is mapped.
>  	 */
> -	if (dmz_is_rnd(zone) || chunk_block < zone->wp_block)
> +	if (dmz_is_rnd(zone) || dmz_is_cache(zone) ||
> +	    chunk_block < zone->wp_block)
>  		ret = dmz_invalidate_blocks(zmd, zone, chunk_block, nr_blocks);
>  	if (ret == 0 && zone->bzone)
>  		ret = dmz_invalidate_blocks(zmd, zone->bzone,
> @@ -714,8 +726,7 @@ static int dmz_get_zoned_device(struct dm_target *ti, char *path)
>  	dev->capacity = i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
>  	aligned_capacity = dev->capacity &
>  				~((sector_t)blk_queue_zone_sectors(q) - 1);
> -	if (ti->begin ||
> -	    ((ti->len != dev->capacity) && (ti->len != aligned_capacity))) {
> +	if (ti->begin || ti->len < dev->capacity) {
>  		ti->error = "Partial mapping not supported";
>  		ret = -EINVAL;
>  		goto err;
> @@ -751,6 +762,64 @@ static void dmz_put_zoned_device(struct dm_target *ti)
>  	dmz->dev = NULL;
>  }
>  
> +/*
> + * Get cache device information.
> + */
> +static int dmz_get_cache_device(struct dm_target *ti, char *path)
> +{
> +	struct dmz_target *dmz = ti->private;
> +	struct dmz_cdev *cdev;
> +	int ret;
> +
> +	/* Get the target device */
> +	ret = dm_get_device(ti, path, dm_table_get_mode(ti->table),
> +			    &dmz->cddev);
> +	if (ret) {
> +		ti->error = "Get target device failed";
> +		dmz->cdev = NULL;
> +		return ret;
> +	}
> +
> +	cdev = kzalloc(sizeof(struct dmz_cdev), GFP_KERNEL);
> +	if (!cdev) {
> +		ret = -ENOMEM;
> +		goto err;
> +	}
> +
> +	cdev->bdev = dmz->cddev->bdev;
> +	(void)bdevname(cdev->bdev, cdev->name);
> +
> +	if (bdev_zoned_model(cdev->bdev) != BLK_ZONED_NONE) {
> +		ti->error = "Cache device must not be a zoned block device";
> +		ret = -EINVAL;
> +		goto err;
> +	}
> +
> +	cdev->capacity = i_size_read(cdev->bdev->bd_inode) >> SECTOR_SHIFT;
> +	dmz->cdev = cdev;
> +
> +	return 0;
> +err:
> +	dm_put_device(ti, dmz->cddev);
> +	kfree(cdev);
> +
> +	return ret;
> +}
> +
> +/*
> + * Cleanup cache device information.
> + */
> +static void dmz_put_cache_device(struct dm_target *ti)
> +{
> +	struct dmz_target *dmz = ti->private;
> +
> +	if (!dmz->cdev)
> +		return;
> +	dm_put_device(ti, dmz->cddev);
> +	kfree(dmz->cdev);
> +	dmz->cdev = NULL;
> +}
> +
>  /*
>   * Setup target.
>   */
> @@ -758,10 +827,11 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
>  {
>  	struct dmz_target *dmz;
>  	struct dmz_dev *dev;
> +	struct dmz_cdev *cdev;
>  	int ret;
>  
>  	/* Check arguments */
> -	if (argc != 1) {
> +	if (argc > 2) {
>  		ti->error = "Invalid argument count";
>  		return -EINVAL;
>  	}
> @@ -781,12 +851,23 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
>  		goto err;
>  	}
>  
> +	/* Get the cache device for random zones */
> +	if (argc == 2) {
> +		ret = dmz_get_cache_device(ti, argv[1]);
> +		if (ret) {
> +			dmz->cdev = NULL;
> +			ti->error = "Cache device failed";
> +			goto err_dev;
> +		}
> +	}
> +
>  	/* Initialize metadata */
>  	dev = dmz->dev;
> -	ret = dmz_ctr_metadata(dev, &dmz->metadata);
> +	cdev = dmz->cdev;
> +	ret = dmz_ctr_metadata(dev, cdev, &dmz->metadata);
>  	if (ret) {
>  		ti->error = "Metadata initialization failed";
> -		goto err_dev;
> +		goto err_cdev;
>  	}
>  
>  	/* Set target (no write same support) */
> @@ -833,7 +914,7 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
>  	mod_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);
>  
>  	/* Initialize reclaim */
> -	ret = dmz_ctr_reclaim(dev, dmz->metadata, &dmz->reclaim);
> +	ret = dmz_ctr_reclaim(dev, cdev, dmz->metadata, &dmz->reclaim);
>  	if (ret) {
>  		ti->error = "Zone reclaim initialization failed";
>  		goto err_fwq;
> @@ -853,6 +934,8 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
>  	bioset_exit(&dmz->bio_set);
>  err_meta:
>  	dmz_dtr_metadata(dmz->metadata);
> +err_cdev:
> +	dmz_put_cache_device(ti);
>  err_dev:
>  	dmz_put_zoned_device(ti);
>  err:
> @@ -882,6 +965,8 @@ static void dmz_dtr(struct dm_target *ti)
>  
>  	bioset_exit(&dmz->bio_set);
>  
> +	dmz_put_cache_device(ti);
> +
>  	dmz_put_zoned_device(ti);
>  
>  	mutex_destroy(&dmz->chunk_lock);
> diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h
> index 5b5e493d479c..c2b6a919681a 100644
> --- a/drivers/md/dm-zoned.h
> +++ b/drivers/md/dm-zoned.h
> @@ -66,6 +66,21 @@ struct dmz_dev {
>  	sector_t		zone_nr_blocks_shift;
>  };
>  
> +/*
> + * Cache block device information.
> + */
> +struct dmz_cdev {
> +	struct block_device	*bdev;
> +
> +	char			name[BDEVNAME_SIZE];
> +
> +	sector_t		capacity;
> +
> +	unsigned int		nr_zones;
> +
> +	unsigned int		flags;
> +};
> +
>  #define dmz_bio_chunk(dev, bio)	((bio)->bi_iter.bi_sector >> \
>  				 (dev)->zone_nr_sectors_shift)
>  #define dmz_chunk_block(dev, b)	((b) & ((dev)->zone_nr_blocks - 1))
> @@ -73,6 +88,7 @@ struct dmz_dev {
>  /* Device flags. */
>  #define DMZ_BDEV_DYING		(1 << 0)
>  #define DMZ_CHECK_BDEV		(2 << 0)
> +#define DMZ_BDEV_CACHE		(4 << 0)
>  
>  /*
>   * Zone descriptor.
> @@ -87,6 +103,9 @@ struct dm_zone {
>  	/* Zone activation reference count */
>  	atomic_t		refcount;
>  
> +	/* Zone id */
> +	unsigned int		id;
> +
>  	/* Zone write pointer block (relative to the zone start block) */
>  	unsigned int		wp_block;
>  
> @@ -111,6 +130,7 @@ enum {
>  	/* Zone write type */
>  	DMZ_RND,
>  	DMZ_SEQ,
> +	DMZ_CACHE,
>  
>  	/* Zone critical condition */
>  	DMZ_OFFLINE,
> @@ -131,6 +151,7 @@ enum {
>   */
>  #define dmz_is_rnd(z)		test_bit(DMZ_RND, &(z)->flags)
>  #define dmz_is_seq(z)		test_bit(DMZ_SEQ, &(z)->flags)
> +#define dmz_is_cache(z)		test_bit(DMZ_CACHE, &(z)->flags)
>  #define dmz_is_empty(z)		((z)->wp_block == 0)
>  #define dmz_is_offline(z)	test_bit(DMZ_OFFLINE, &(z)->flags)
>  #define dmz_is_readonly(z)	test_bit(DMZ_READ_ONLY, &(z)->flags)
> @@ -164,7 +185,8 @@ struct dmz_reclaim;
>  /*
>   * Functions defined in dm-zoned-metadata.c
>   */
> -int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **zmd);
> +int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_cdev *cdev,
> +		     struct dmz_metadata **zmd);
>  void dmz_dtr_metadata(struct dmz_metadata *zmd);
>  int dmz_resume_metadata(struct dmz_metadata *zmd);
>  
> @@ -183,6 +205,7 @@ unsigned int dmz_nr_chunks(struct dmz_metadata *zmd);
>  
>  #define DMZ_ALLOC_RND		0x01
>  #define DMZ_ALLOC_RECLAIM	0x02
> +#define DMZ_ALLOC_CACHE		0x04
>  
>  struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, unsigned long flags);
>  void dmz_free_zone(struct dmz_metadata *zmd, struct dm_zone *zone);
> @@ -192,6 +215,8 @@ void dmz_map_zone(struct dmz_metadata *zmd, struct dm_zone *zone,
>  void dmz_unmap_zone(struct dmz_metadata *zmd, struct dm_zone *zone);
>  unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd);
>  unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd);
> +unsigned int dmz_nr_cache_zones(struct dmz_metadata *zmd);
> +unsigned int dmz_nr_unmap_cache_zones(struct dmz_metadata *zmd);
>  
>  /*
>   * Activate a zone (increment its reference count).
> @@ -244,8 +269,8 @@ int dmz_merge_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone,
>  /*
>   * Functions defined in dm-zoned-reclaim.c
>   */
> -int dmz_ctr_reclaim(struct dmz_dev *dev, struct dmz_metadata *zmd,
> -		    struct dmz_reclaim **zrc);
> +int dmz_ctr_reclaim(struct dmz_dev *dev, struct dmz_cdev *cdev,
> +		    struct dmz_metadata *zmd, struct dmz_reclaim **zrc);
>  void dmz_dtr_reclaim(struct dmz_reclaim *zrc);
>  void dmz_suspend_reclaim(struct dmz_reclaim *zrc);
>  void dmz_resume_reclaim(struct dmz_reclaim *zrc);
> 


-- 
Damien Le Moal
Western Digital Research