[dm-devel] [PATCH 11/14] dm-zoned: support arbitrary number of devices

Damien Le Moal Damien.LeMoal at wdc.com
Sun May 31 23:54:56 UTC 2020


On 2020/05/31 22:06, Hannes Reinecke wrote:
> On 5/31/20 11:10 AM, Damien Le Moal wrote:
>> On Fri, 2020-05-29 at 19:39 +0200, Hannes Reinecke wrote:
>>> Remove the hard-coded limit of two devices and support an unlimited
>>> number of additional zoned devices.
>>> With that we need to increase the device-mapper version number to
>>> 3.0.0 as we've modified the interface.
>>>
>>> Signed-off-by: Hannes Reinecke <hare at suse.de>
>>> ---
>>>   drivers/md/dm-zoned-metadata.c |  15 +++++-
>>>   drivers/md/dm-zoned-target.c   | 106 ++++++++++++++++++++++++-----------------
>>>   2 files changed, 75 insertions(+), 46 deletions(-)
>>>
>>> diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
>>> index 044c152eb756..221163ae5f68 100644
>>> --- a/drivers/md/dm-zoned-metadata.c
>>> +++ b/drivers/md/dm-zoned-metadata.c
>>> @@ -1523,7 +1523,20 @@ static int dmz_init_zones(struct dmz_metadata *zmd)
>>>   		 */
>>>   		zmd->sb[0].zone = dmz_get(zmd, 0);
>>>   
>>> -		zoned_dev = &zmd->dev[1];
>>> +		for (i = 1; i < zmd->nr_devs; i++) {
>>> +			zoned_dev = &zmd->dev[i];
>>> +
>>> +			ret = blkdev_report_zones(zoned_dev->bdev, 0,
>>> +						  BLK_ALL_ZONES,
>>> +						  dmz_init_zone, zoned_dev);
>>> +			if (ret < 0) {
>>> +				DMDEBUG("(%s): Failed to report zones, error %d",
>>> +					zmd->devname, ret);
>>> +				dmz_drop_zones(zmd);
>>> +				return ret;
>>> +			}
>>> +		}
>>> +		return 0;
>>>   	}
>>>   
>>>   	/*
>>> diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
>>> index aa3d26d16441..4a51738d4b0d 100644
>>> --- a/drivers/md/dm-zoned-target.c
>>> +++ b/drivers/md/dm-zoned-target.c
>>> @@ -13,8 +13,6 @@
>>>   
>>>   #define DMZ_MIN_BIOS		8192
>>>   
>>> -#define DMZ_MAX_DEVS		2
>>> -
>>>   /*
>>>    * Zone BIO context.
>>>    */
>>> @@ -40,10 +38,10 @@ struct dm_chunk_work {
>>>    * Target descriptor.
>>>    */
>>>   struct dmz_target {
>>> -	struct dm_dev		*ddev[DMZ_MAX_DEVS];
>>> +	struct dm_dev		**ddev;
>>>   	unsigned int		nr_ddevs;
>>>   
>>> -	unsigned long		flags;
>>> +	unsigned int		flags;
>>>   
>>>   	/* Zoned block device information */
>>>   	struct dmz_dev		*dev;
>>> @@ -764,7 +762,7 @@ static void dmz_put_zoned_device(struct dm_target *ti)
>>>   	struct dmz_target *dmz = ti->private;
>>>   	int i;
>>>   
>>> -	for (i = 0; i < DMZ_MAX_DEVS; i++) {
>>> +	for (i = 0; i < dmz->nr_ddevs; i++) {
>>>   		if (dmz->ddev[i]) {
>>>   			dm_put_device(ti, dmz->ddev[i]);
>>>   			dmz->ddev[i] = NULL;
>>> @@ -777,21 +775,35 @@ static int dmz_fixup_devices(struct dm_target *ti)
>>>   	struct dmz_target *dmz = ti->private;
>>>   	struct dmz_dev *reg_dev, *zoned_dev;
>>>   	struct request_queue *q;
>>> +	sector_t zone_nr_sectors = 0;
>>> +	int i;
>>>   
>>>   	/*
>>> -	 * When we have two devices, the first one must be a regular block
>>> -	 * device and the second a zoned block device.
>>> +	 * When we have more than one device, the first one must be a
>>> +	 * regular block device and the others zoned block devices.
>>>   	 */
>>> -	if (dmz->ddev[0] && dmz->ddev[1]) {
>>> +	if (dmz->nr_ddevs > 1) {
>>>   		reg_dev = &dmz->dev[0];
>>>   		if (!(reg_dev->flags & DMZ_BDEV_REGULAR)) {
>>>   			ti->error = "Primary disk is not a regular device";
>>>   			return -EINVAL;
>>>   		}
>>> -		zoned_dev = &dmz->dev[1];
>>> -		if (zoned_dev->flags & DMZ_BDEV_REGULAR) {
>>> -			ti->error = "Secondary disk is not a zoned device";
>>> -			return -EINVAL;
>>> +		for (i = 1; i < dmz->nr_ddevs; i++) {
>>> +			zoned_dev = &dmz->dev[i];
>>> +			if (zoned_dev->flags & DMZ_BDEV_REGULAR) {
>>> +				ti->error = "Secondary disk is not a zoned device";
>>> +				return -EINVAL;
>>> +			}
>>> +			q = bdev_get_queue(zoned_dev->bdev);
>>> +			if (zone_nr_sectors &&
>>> +			    zone_nr_sectors != blk_queue_zone_sectors(q)) {
>>> +				ti->error = "Zone nr sectors mismatch";
>>> +				return -EINVAL;
>>> +			}
>>> +			zone_nr_sectors = blk_queue_zone_sectors(q);
>>> +			zoned_dev->zone_nr_sectors = zone_nr_sectors;
>>> +			zoned_dev->nr_zones =
>>> +				blkdev_nr_zones(zoned_dev->bdev->bd_disk);
>>>   		}
>>>   	} else {
>>>   		reg_dev = NULL;
>>> @@ -800,17 +812,24 @@ static int dmz_fixup_devices(struct dm_target *ti)
>>>   			ti->error = "Disk is not a zoned device";
>>>   			return -EINVAL;
>>>   		}
>>> +		q = bdev_get_queue(zoned_dev->bdev);
>>> +		zoned_dev->zone_nr_sectors = blk_queue_zone_sectors(q);
>>> +		zoned_dev->nr_zones = blkdev_nr_zones(zoned_dev->bdev->bd_disk);
>>>   	}
>>> -	q = bdev_get_queue(zoned_dev->bdev);
>>> -	zoned_dev->zone_nr_sectors = blk_queue_zone_sectors(q);
>>> -	zoned_dev->nr_zones = blkdev_nr_zones(zoned_dev->bdev->bd_disk);
>>>   
>>>   	if (reg_dev) {
>>> -		reg_dev->zone_nr_sectors = zoned_dev->zone_nr_sectors;
>>> +		sector_t zone_offset;
>>> +
>>> +		reg_dev->zone_nr_sectors = zone_nr_sectors;
>>>   		reg_dev->nr_zones =
>>>   			DIV_ROUND_UP_SECTOR_T(reg_dev->capacity,
>>>   					      reg_dev->zone_nr_sectors);
>>> -		zoned_dev->zone_offset = reg_dev->nr_zones;
>>> +		reg_dev->zone_offset = 0;
>>> +		zone_offset = reg_dev->nr_zones;
>>> +		for (i = 1; i < dmz->nr_ddevs; i++) {
>>> +			dmz->dev[i].zone_offset = zone_offset;
>>> +			zone_offset += dmz->dev[i].nr_zones;
>>> +		}
>>>   	}
>>>   	return 0;
>>>   }
>>> @@ -824,7 +843,7 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
>>>   	int ret, i;
>>>   
>>>   	/* Check arguments */
>>> -	if (argc < 1 || argc > 2) {
>>> +	if (argc < 1) {
>>>   		ti->error = "Invalid argument count";
>>>   		return -EINVAL;
>>>   	}
>>> @@ -835,32 +854,31 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
>>>   		ti->error = "Unable to allocate the zoned target descriptor";
>>>   		return -ENOMEM;
>>>   	}
>>> -	dmz->dev = kcalloc(2, sizeof(struct dmz_dev), GFP_KERNEL);
>>> +	dmz->dev = kcalloc(argc, sizeof(struct dmz_dev), GFP_KERNEL);
>>>   	if (!dmz->dev) {
>>>   		ti->error = "Unable to allocate the zoned device descriptors";
>>>   		kfree(dmz);
>>>   		return -ENOMEM;
>>>   	}
>>> +	dmz->ddev = kcalloc(argc, sizeof(struct dm_dev *), GFP_KERNEL);
>>> +	if (!dmz->ddev) {
>>> +		ti->error = "Unable to allocate the dm device descriptors";
>>> +		ret = -ENOMEM;
>>> +		goto err;
>>> +	}
>>>   	dmz->nr_ddevs = argc;
>>> +
>>>   	ti->private = dmz;
>>>   
>>>   	/* Get the target zoned block device */
>>> -	ret = dmz_get_zoned_device(ti, argv[0], 0, argc);
>>> -	if (ret)
>>> -		goto err;
>>> -
>>> -	if (argc == 2) {
>>> -		ret = dmz_get_zoned_device(ti, argv[1], 1, argc);
>>> -		if (ret) {
>>> -			dmz_put_zoned_device(ti);
>>> -			goto err;
>>> -		}
>>> +	for (i = 0; i < argc; i++) {
>>> +		ret = dmz_get_zoned_device(ti, argv[i], i, argc);
>>> +		if (ret)
>>> +			goto err_dev;
>>>   	}
>>>   	ret = dmz_fixup_devices(ti);
>>> -	if (ret) {
>>> -		dmz_put_zoned_device(ti);
>>> -		goto err;
>>> -	}
>>> +	if (ret)
>>> +		goto err_dev;
>>>   
>>>   	/* Initialize metadata */
>>>   	ret = dmz_ctr_metadata(dmz->dev, argc, &dmz->metadata,
>>> @@ -1056,13 +1074,13 @@ static int dmz_iterate_devices(struct dm_target *ti,
>>>   	struct dmz_target *dmz = ti->private;
>>>   	unsigned int zone_nr_sectors = dmz_zone_nr_sectors(dmz->metadata);
>>>   	sector_t capacity;
>>> -	int r;
>>> +	int i, r;
>>>   
>>> -	capacity = dmz->dev[0].capacity & ~(zone_nr_sectors - 1);
>>> -	r = fn(ti, dmz->ddev[0], 0, capacity, data);
>>> -	if (!r && dmz->ddev[1]) {
>>> -		capacity = dmz->dev[1].capacity & ~(zone_nr_sectors - 1);
>>> -		r = fn(ti, dmz->ddev[1], 0, capacity, data);
>>> +	for (i = 0; i < dmz->nr_ddevs; i++) {
>>> +		capacity = dmz->dev[i].capacity & ~(zone_nr_sectors - 1);
>>> +		r = fn(ti, dmz->ddev[i], 0, capacity, data);
>>> +		if (r)
>>> +			break;
>>>   	}
>>>   	return r;
>>>   }
>>> @@ -1083,9 +1101,7 @@ static void dmz_status(struct dm_target *ti, status_type_t type,
>>>   		       dmz_nr_zones(dmz->metadata),
>>>   		       dmz_nr_unmap_cache_zones(dmz->metadata),
>>>   		       dmz_nr_cache_zones(dmz->metadata));
>>> -		for (i = 0; i < DMZ_MAX_DEVS; i++) {
>>> -			if (!dmz->ddev[i])
>>> -				continue;
>>> +		for (i = 0; i < dmz->nr_ddevs; i++) {
>>>   			/*
>>>   			 * For a multi-device setup the first device
>>>   			 * contains only cache zones.
>>> @@ -1104,8 +1120,8 @@ static void dmz_status(struct dm_target *ti, status_type_t type,
>>>   		dev = &dmz->dev[0];
>>>   		format_dev_t(buf, dev->bdev->bd_dev);
>>>   		DMEMIT("%s", buf);
>>> -		if (dmz->dev[1].bdev) {
>>> -			dev = &dmz->dev[1];
>>> +		for (i = 1; i < dmz->nr_ddevs; i++) {
>>> +			dev = &dmz->dev[i];
>>>   			format_dev_t(buf, dev->bdev->bd_dev);
>>>   			DMEMIT(" %s", buf);
>>>   		}
>>> @@ -1133,7 +1149,7 @@ static int dmz_message(struct dm_target *ti, unsigned int argc, char **argv,
>>>   
>>>   static struct target_type dmz_type = {
>>>   	.name		 = "zoned",
>>> -	.version	 = {2, 0, 0},
>>> +	.version	 = {3, 0, 0},
>>>   	.features	 = DM_TARGET_SINGLETON | DM_TARGET_ZONED_HM,
>>>   	.module		 = THIS_MODULE,
>>>   	.ctr		 = dmz_ctr,
>>
>> Looks all good to me, but thinking more about it, don't we need to add
>> a device index in the super blocks? The reason is that if the drive
>> configuration changes between stop/start (drives removed, added or
>> changed slots), the drive names will change and while the userspace
>> will still be able to find the group of drives constituting the target
>> (using the UUID), there is no obvious way to find out what the original
>> drive order was. Since the kernel side relies on the drives being passed
>> to the ctr function in the order of the mapping, we need to preserve
>> that. Or change also the kernel side to use the index in the super
>> block to put each drive in its correct dmz->dev[] slot.
>>
> Already taken care of; here's where the tertiary superblocks come in.
> Each superblock carries its own position (in the 'sb_block' field).
> This is the _absolute_ position within the entire setup, not the
> relative per-device block number.
> And it also has the absolute number of blocks in the 'nr_chunks' field.
> 
> Hence we know exactly where this superblock (and, by implication, the
> zones following this superblock) should end up. And we know how large
> the entire setup will be. So we can insert the superblock at the right
> position and then check whether we have enough zones for the entire
> device.

I do not get it though. Where is that checked ? At least in this patch, drives
are initialized in the order of the ctr arguments, and this loop:

+		for (i = 1; i < dmz->nr_ddevs; i++) {
+			dmz->dev[i].zone_offset = zone_offset;
+			zone_offset += dmz->dev[i].nr_zones;
+		}

in dmz_fixup_devices() sets the zone offset for each device in the same order.
So for a given chunk mapped to a zone identified by its ID, if the device order
changes, the zone IDs will change and the chunk will no longer be mapped to the
correct zone. What am I missing here?


> 
> Not sure if dmzadm does it, though; but it should be easy enough to
> implement.
> 
> Cheers,
> 
> Hannes
> 


-- 
Damien Le Moal
Western Digital Research






More information about the dm-devel mailing list